diff --git a/FeatureRequest.md b/FeatureRequest.md
index d9157b90..a0c478e1 100644
--- a/FeatureRequest.md
+++ b/FeatureRequest.md
@@ -11,6 +11,22 @@
 ____
 
+#### #27 Use Homie Spec for Mqtt binding
+
+* Use the standardized Homie protocol for the Mqtt binding
+* https://homieiot.github.io/
+
+#### #26 Change behaviour for "N" replacement
+
+* in case the higher digit has already increased by a minimum of 1, don't set the "N" to the last value, but to "0"
+* https://github.com/jomjol/AI-on-the-edge-device/issues/792
+
+
+#### #25 Trigger Measurement via MQTT
+
+* https://github.com/jomjol/AI-on-the-edge-device/issues/727
+
+
 #### #24 Show Mqtt state directly in Webserver
 
 * Show MQTT log in Web page. E.g. connection established or failed to connect...
@@ -48,18 +64,15 @@
 ____
 
-#### #18 Document WLAN-strength in web page
+#### ~~#18 Document WLAN-strength in web page~~
 
-* https://github.com/jomjol/AI-on-the-edge-device/issues/563
+* ~~https://github.com/jomjol/AI-on-the-edge-device/issues/563~~
 
-#### #17 Direct InfluxDB connection
+#### ~~#17 Direct InfluxDB connection~~
 
-* https://github.com/jomjol/AI-on-the-edge-device/issues/534
-* Direct interface to a InfluxDB data base
-* Integrate InfluxDB interface in firmware
-* Adapt html web page for configuration
+* ~~Done in v10.6.0~~
 
 #### #16 Serial Communication
 
@@ -101,9 +114,9 @@
 ____
 
-#### #12 Less reboots due to memory leakage
+#### ~~#12 Less reboots due to memory leakage~~
 
-* Issue: #414 & #425 #430
+* ~~Issue: #414 & #425 #430~~
 
@@ -222,4 +235,4 @@
 ____
 
 * ~~Implementation of a software module for external light source (e.g. WS8132 LED controller, ...)~~
 * ~~Update of the camera module to use the external light instead of the internal flash light~~
-* ~~Adopt the configuration algorithm with a configurable light source~~
\ No newline at end of file
+* ~~Adopt the configuration algorithm with a configurable light source~~
diff --git a/README.md b/README.md
index 66953384..61343250 100644
--- a/README.md
+++ b/README.md
@@ -52,9 +52,31 @@
 In other cases you can contact the developer via email: ~/.ssh/id_rsa_base64
+  - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa
+  - chmod 600 ~/.ssh/id_rsa
+  - echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
+
+before_script:
+  # Add gitlab ssh key
+  - *add_ssh_key
+  # Set git config
+  - *set_git_config
+
+.build_esp32s3: &build_esp32s3
+  - idf.py set-target esp32s3 build
+
+.build_esp32: &build_esp32
+  - idf.py set-target esp32 build
+
+build_demo:
+  stage: build
+  image: $CI_DOCKER_REGISTRY/esp32-ci-env:esp-nn
+  tags:
+    - build
+  script:
+    # Clone IDF
+    - git clone --recursive --single-branch -b release/v4.4 --reference-if-able /local_references/gitlab/ https://gitlab-ci-token:${BOT_TOKEN}@gitlab.espressif.cn:6688/espressif/esp-idf.git
+    - cd esp-idf
+    - ./install.sh
+    - . ./export.sh
+    - cd ..
+    # Build examples now
+    - cd test_app
+    # Build esp32s3
+    - *build_esp32s3
+    # Build esp32
+    - *build_esp32
+    - cd -
diff --git a/code/components/esp-nn/CMakeLists.txt b/code/components/esp-nn/CMakeLists.txt
new file mode 100644
index 00000000..ba45866a
--- /dev/null
+++ b/code/components/esp-nn/CMakeLists.txt
@@ -0,0 +1,50 @@
+idf_build_get_property(idf_target IDF_TARGET)
+
+set(c_srcs
+    "src/activation_functions/esp_nn_relu_ansi.c"
+    "src/basic_math/esp_nn_add_ansi.c"
+    "src/basic_math/esp_nn_mul_ansi.c"
+    "src/convolution/esp_nn_conv_ansi.c"
+    "src/convolution/esp_nn_conv_opt.c"
+    "src/convolution/esp_nn_depthwise_conv_ansi.c"
+    "src/convolution/esp_nn_depthwise_conv_opt.c"
+    "src/fully_connected/esp_nn_fully_connected_ansi.c"
+    "src/softmax/esp_nn_softmax_ansi.c"
+    "src/softmax/esp_nn_softmax_opt.c"
+    "src/pooling/esp_nn_avg_pool_ansi.c"
+    "src/pooling/esp_nn_max_pool_ansi.c")
+
+if(CONFIG_IDF_TARGET_ESP32S3)
+    set(s3_srcs
+        "src/common/esp_nn_common_functions_esp32s3.S"
+        "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S"
+        "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S"
+        "src/activation_functions/esp_nn_relu_s8_esp32s3.S"
+        "src/basic_math/esp_nn_add_s8_esp32s3.S"
+        "src/basic_math/esp_nn_mul_s8_esp32s3.S"
+        "src/convolution/esp_nn_conv_esp32s3.c"
+        "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c"
+        "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S"
+        "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S"
+        "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S"
+        "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S"
+        "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S"
+        "src/pooling/esp_nn_max_pool_s8_esp32s3.S"
+        "src/pooling/esp_nn_avg_pool_s8_esp32s3.S")
+endif()
+
+idf_component_register(SRCS "${c_srcs}"
+                            "${s3_srcs}"
+                       INCLUDE_DIRS "include" "src/common")
+
+if(CONFIG_IDF_TARGET_ESP32S3)
+    target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function)
+else()
+    target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function)
+endif()
\ No newline at end of file
diff --git a/code/components/esp-nn/Kconfig.projbuild b/code/components/esp-nn/Kconfig.projbuild
new file mode 100644
index 00000000..a146305b
--- /dev/null
+++ b/code/components/esp-nn/Kconfig.projbuild
@@ -0,0 +1,29 @@
+menu "ESP-NN"
+
+choice NN_OPTIMIZATIONS
+    bool "Optimization for nn functions"
+    default NN_OPTIMIZED
+    help
+        Use ANSI-C versions for verification and debug purposes.
+        Optimisations are automatically picked up for a chipset.
+        For ESP32-S3, assembly optimisations are selected.
+        For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.
+
+config NN_ANSI_C
+    bool "ANSI C"
+    help
+        ANSI C versions for verification and debug purposes.
+config NN_OPTIMIZED
+    bool "Optimized versions"
+    help
+        Optimisations are automatically picked up for a chipset.
+        For ESP32-S3, assembly optimisations are selected.
+        For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.
+endchoice + +config NN_OPTIMIZATIONS + int + default 0 if NN_ANSI_C + default 1 if NN_OPTIMIZED + +endmenu diff --git a/code/components/esp-nn/LICENSE b/code/components/esp-nn/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/code/components/esp-nn/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. 
Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/code/components/esp-nn/README.md b/code/components/esp-nn/README.md
new file mode 100644
index 00000000..f70f4074
--- /dev/null
+++ b/code/components/esp-nn/README.md
@@ -0,0 +1,55 @@
+# ESP-NN
+
+The library contains optimised NN (Neural Network) functions for various Espressif chipsets.
+
+* Supported platforms:
+  * TensorFlow Lite Micro (TFLite Micro). Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples)
+
+* Supported ESP chipsets include:
+  * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3)
+  * ESP32 (Generic optimisations)
+  * ESP32-C3 (Generic optimisations)
+
+## Performance
+
+### Kernelwise performance for s8 versions:
+
+  * Kernelwise performance on ESP32-S3 chip
+  * Numbers are ticks taken for kernel to execute
+  * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB
+
+  | Function        | ANSI C  | ESP32-S3 Opt | Opt Ratio | Data info   | Memory   |
+  | ----------------| --------|---------|---------|-------------|-----------|
+  | elementwise_add | 320397  | 87119   | 3.68    | size = 1615 | External  |
+  | elementwise_mul | 125958  | 44239   | 2.85    | size = 1615 | External  |
+  | convolution     | 4663012 | 428675  | 10.88   | input(10,10), filter(64x1x1x64) | External |
+  | convolution     | 301014  | 32433   | 9.28    | input(8,8), filter(16x1x1x16) | External |
+  | convolution     | 2115418 | 1020923 | 2.07    | input(10,10), filter(64x3x3x3) | External |
+  | depthwise conv  | 1190062 | 203278  | 5.85    | input (18, 18), pad(0,0), stride(1,1) filter: 1x3x3x16 | External |
+  | depthwise conv  | 837072  | 182335  | 4.59    | input (12, 12), pad(1,1), stride(1,1) filter: 8x5x5x4 | External |
+  | max pool        | 485714  | 76747   | 6.33    | input(16,16), filter (1x3x3x16) | Internal |
+  | avg pool        | 541462  | 160580  | 3.37    | input(16,16), filter (1x3x3x16) | Internal |
+  | fully connected | 15853   | 9547    | 1.66    | len: 265, ch = 3 | Internal |
+  | prelu (relu6)   | 19472   | 2734    | 7.12    | size = 1615 | Internal |
+
+
+## Configuration
+
+  * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS`
+  * There are two options presented:
+    * Optimized versions
+    * ANSI C
+
+  * Default selection is for `Optimized versions`. For ESP32-S3, assembly versions are automatically selected, whereas for other chipsets (viz., ESP32, ESP32-C3), generic optimisations are selected.
+  * For debugging purposes, you may want to select `ANSI C` reference versions.
+
+
+## Contributing
+
+If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on GitHub.
+
+For general questions related to this library, please use the esp32.com forum.
+
+## Copyrights and License
+
+All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE.
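For context, a minimal sketch of how an application would consume the component added above (assumptions: esp-nn is registered as a component in an ESP-IDF project and `NN_OPTIMIZATIONS` has been chosen via menuconfig; `clamp_activations` is a hypothetical caller, not part of this patch):

```c
// Minimal usage sketch (illustrative only, not part of this patch).
#include <stdint.h>
#include "esp_nn.h"

// Hypothetical helper: clamps a quantized activation buffer in place.
void clamp_activations(int8_t *buf, uint16_t len)
{
    // esp_nn_relu6_s8 is a macro alias; it resolves at compile time to
    // esp_nn_relu6_s8_ansi or esp_nn_relu6_s8_esp32s3 depending on the
    // Kconfig selection and the target chip.
    esp_nn_relu6_s8(buf, len);
}
```

The generic `esp_nn_*` names are macro aliases; the headers added below resolve them to the ANSI, generic optimised, or ESP32-S3 assembly implementations.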
diff --git a/code/components/esp-nn/include/esp_nn.h b/code/components/esp-nn/include/esp_nn.h new file mode 100644 index 00000000..bd533119 --- /dev/null +++ b/code/components/esp-nn/include/esp_nn.h @@ -0,0 +1,46 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(CONFIG_NN_OPTIMIZED) +// select apt optimisations +#ifdef CONFIG_IDF_TARGET_ESP32S3 +#define ARCH_ESP32_S3 1 +#endif +#ifdef CONFIG_IDF_TARGET_ESP32 +#define ARCH_ESP32 1 +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* reference kernels included by default */ +#include "esp_nn_ansi_headers.h" + +#if defined(CONFIG_NN_OPTIMIZED) +#if defined(ARCH_ESP32_S3) +#include "esp_nn_esp32s3.h" +#else // for other platforms use generic optimisations +#include "esp_nn_generic_opt.h" +#endif // #if defined(ARCH_ESP32_S3) +#else +#include "esp_nn_ansi_c.h" +#endif + +#ifdef __cplusplus +} +#endif diff --git a/code/components/esp-nn/include/esp_nn_ansi_c.h b/code/components/esp-nn/include/esp_nn_ansi_c.h new file mode 100644 index 00000000..8279ebef --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_ansi_c.h @@ -0,0 +1,47 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file Header definitions to include for ANSI C versions. + * These are just typedefs to pick up ANSI versions. 
+ */ + +#pragma once + +#include "esp_nn_defs.h" +#include "esp_nn_ansi_headers.h" + +#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi +#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi + +#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi + +#define esp_nn_conv_s8 esp_nn_conv_s8_ansi + +#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi +#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi + +#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi +#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi + +#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi + +#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi +#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi + +#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi + +#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi +#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi +#define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi diff --git a/code/components/esp-nn/include/esp_nn_ansi_headers.h b/code/components/esp-nn/include/esp_nn_ansi_headers.h new file mode 100644 index 00000000..52ebb680 --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_ansi_headers.h @@ -0,0 +1,309 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +/** + * @file Header definitions to include for esp_nn reference functions + */ + +#include "esp_nn_defs.h" +/************************** Basic math functions ****************************/ + +/** + * @brief elementwise addition + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * shift values are expected to be <= 0 + */ +void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); +/** + * @brief elementwise multiplication + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * output shift is expected to be <= 0 + */ +void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + + +/************************** Convolution functions *****************************/ + +/** + * @brief depthwise convolution per channel + * + * @note inputs type: int8_t, output: int8_t + * Version used in tflite is per channel. + * This version follows the same footsprints. + * Meaning, it has per out_channel shift and multiplier for + * requantization + * + * optimization notes: Though input_offset is int32 type, + * offset values are contained in 8 bits [-128, 127] + */ +void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const dw_conv_params_t *conv_params, + const quant_data_t *quant_data); + +/** + * @brief 2d-convolution channelwise + * + * @note operation: result += (input + offset) * filter + * + * inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const conv_params_t *conv_params, + const quant_data_t *quant_data); + +int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const conv_params_t *conv_params); +void esp_nn_set_conv_scratch_buf_ansi(const void *buf); + +int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const dw_conv_params_t *conv_params); +void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf); + +/************************** Activation functions *****************************/ + +/** + * @brief relu6 + * + * @note inout: int8_t + */ +void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size); + +/************************** Pooling functions 
*****************************/ + + +/** + * @brief max_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_max_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + +/** + * @brief avg_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_avg_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + + +/************************** Fully connected functions ***********************/ + +/** + * @brief fully connected + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, + const int32_t input_offset, + const uint16_t row_len, + const int8_t *filter_data, + const int32_t filter_offset, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief Get scratch buffer size needed by softmax function + * + * @param width + * @param height + * @return size in bytes + * + * @note buffer must be 4 byte aligned + */ +int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height); + +/* ANSI C function to be hooked up when optimised version needed */ +int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height); + +/** + * @brief Set scratch buffer to be used by softmax function + * + * @param buffer this can be NULL if one needs to unset it + * must be aligned to 4 bytes + */ +void esp_nn_set_softmax_scratch_buf_ansi(void *buffer); + +/** + * @brief reference softmax function + * + * @note inputs type: int8_t, output: int8_t + */ +void esp_nn_softmax_s8_ansi(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data); + + +//////////////////////////// Generic optimisations ///////////////////////////// + +/************************** Convolution functions *****************************/ + +/** + * @brief 2d-convolution channelwise optimized version + * + * @note operation: result += (input + offset) * filter + * + * inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_conv_s8_opt(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const conv_params_t *conv_params, + const quant_data_t *quant_data); + +/** + * 
@brief depthwise convolution per channel optimized version + * + * @note inputs type: int8_t, output: int8_t + * Version used in tflite is per channel. + * This version follows the same footsprints. + * Meaning, it has per out_channel shift and multiplier for + * requantization + * + * optimization notes: Though input_offset is int32 type, + * offset values are contained in 8 bits [-128, 127] + */ +void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const dw_conv_params_t *conv_params, + const quant_data_t *quant_data); + +int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const conv_params_t *conv_params); +void esp_nn_set_conv_scratch_buf_opt(const void *buf); + +int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const dw_conv_params_t *conv_params); +void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf); + +/* ANSI C function to be hooked up when optimised version needed */ +void esp_nn_set_softmax_scratch_buf_opt(void *buffer); + +/** + * @brief optimised version of softmax function + * + * @note the function uses extra buffer (4 * width bytes) + * hence, scratch buffers must be set before calling this. + */ +void esp_nn_softmax_s8_opt(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data); diff --git a/code/components/esp-nn/include/esp_nn_defs.h b/code/components/esp-nn/include/esp_nn_defs.h new file mode 100644 index 00000000..756d8e6f --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_defs.h @@ -0,0 +1,83 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <stdint.h>
+
+/**
+ * @brief structure to club data dims
+ * this structure can be used for input, output and filter
+ */
+typedef struct data_dims {
+    int32_t width;
+    int32_t height;
+    int32_t channels;
+
+    int32_t extra; // can be used as batch or any other param
+} data_dims_t;
+
+/**
+ * @brief 2d data structure (width, height)
+ *
+ */
+typedef struct data_2d {
+    int32_t width;
+    int32_t height;
+} data_2d_t;
+
+/**
+ * @brief min/max activation
+ */
+typedef struct act_params {
+    int32_t min;
+    int32_t max;
+} act_params_t;
+
+/**
+ * @brief per channel quant data
+ *
+ * @note number of shift and mult elements are equal to output channels
+ */
+typedef struct quant_data {
+    int32_t *shift;
+    int32_t *mult;
+} quant_data_t;
+
+/**
+ * @brief params specific to convolution 2d
+ *
+ */
+typedef struct conv_params {
+    int32_t in_offset;
+    int32_t out_offset;
+    data_2d_t stride;
+    data_2d_t padding;
+    data_2d_t dilation;
+    act_params_t activation;
+} conv_params_t;
+
+/**
+ * @brief params specific to depthwise convolution 2d
+ *
+ */
+typedef struct dw_conv_params {
+    int32_t in_offset;
+    int32_t out_offset;
+    int32_t ch_mult; // channel multiplier. (in_ch * ch_mult = out_ch)
+    data_2d_t stride;
+    data_2d_t padding;
+    data_2d_t dilation;
+    act_params_t activation;
+} dw_conv_params_t;
diff --git a/code/components/esp-nn/include/esp_nn_esp32s3.h b/code/components/esp-nn/include/esp_nn_esp32s3.h
new file mode 100644
index 00000000..0f52c943
--- /dev/null
+++ b/code/components/esp-nn/include/esp_nn_esp32s3.h
@@ -0,0 +1,231 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +/** + * @file Header definitions to include for esp_nn optimized functions for + * the ESP32-S3 platform + */ + +#pragma once + +#include "esp_nn_defs.h" +#include "esp_nn_ansi_headers.h" + +/************************** Basic math functions *****************************/ + + +/** + * @brief elementwise addition + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * shift values are expected to be <= 0 + */ +void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + +/** + * @brief elementwise multiplication + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * output shift is expected to be <= 0 + */ +void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + + +/************************** Convolution functions *****************************/ + +/** + * @brief depthwise convolution per channel + * + * @note inputs type: int8_t, output: int8_t + * Version used in tflite is per channel. + * This version follows the same footsprints. 
+ * Meaning, it has per out_channel shift and multiplier for + * requantization + * + * optimization notes: Though input_offset is int32 type, + * offset values are contained in 8 bits [-128, 127] + */ +void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *output_data, + const dw_conv_params_t *conv_params, + const quant_data_t *quant_data); + +/** + * @brief 2d - convolution channelwise + * + * @note operation: result += (input + offset) * filter + * + * inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *output_data, + const conv_params_t *conv_params, + const quant_data_t *quant_data); + +int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const conv_params_t *conv_params); +void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf); + +int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const dw_conv_params_t *conv_params); +void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf); + +/************************** Pooling functions *****************************/ + +/** + * @brief max_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_max_pool_s8_esp32s3(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + +/** + * @brief avg_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + + +/************************** Fully connected functions *****************************/ + +/** + * @brief fully connected + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * Current version works only on aligned input. + * row_len and channels should both be multiple of 8. 
+ */
+void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,
+                                       const int32_t input_offset,
+                                       const uint16_t row_len,
+                                       const int8_t *filter_data,
+                                       const int32_t filter_offset,
+                                       const int32_t *bias,
+                                       int8_t *out_data,
+                                       const uint16_t out_channels,
+                                       const int32_t out_offset,
+                                       const int32_t out_shift,
+                                       const int32_t out_mult,
+                                       const int32_t activation_min,
+                                       const int32_t activation_max);
+
+/**
+ * @brief relu6
+ *
+ * @note inout: int8_t
+ */
+void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size);
+
+/********************** function defines ***************************/
+
+#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3
+#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3
+
+#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3
+
+#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3
+#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3
+
+#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3
+#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3
+
+#define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3
+
+#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3
+
+#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3
+#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3
+
+#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3
+
+#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt
+#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt
+#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt
diff --git a/code/components/esp-nn/include/esp_nn_generic_opt.h b/code/components/esp-nn/include/esp_nn_generic_opt.h
new file mode 100644
index 00000000..136cba5d
--- /dev/null
+++ b/code/components/esp-nn/include/esp_nn_generic_opt.h
@@ -0,0 +1,47 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file Header definitions to include for esp_nn generic optimisations
+ * For functions that do not have optimisations, the _ansi versions are picked.
+ */
+
+#pragma once
+
+#include "esp_nn_defs.h"
+#include "esp_nn_ansi_headers.h"
+
+#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi
+#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi
+
+#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt
+
+#define esp_nn_conv_s8 esp_nn_conv_s8_opt
+
+#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt
+#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt
+
+#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt
+#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt
+
+#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi
+
+#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi
+#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi
+
+#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi
+
+#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt
+#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt
+#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt
diff --git a/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c b/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c
new file mode 100644
index 00000000..1d4c3d11
--- /dev/null
+++ b/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c
@@ -0,0 +1,30 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <common_functions.h>
+
+void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)
+{
+    int32_t i;
+
+    for (i = 0; i < size; i++) {
+        int32_t ip = data[i];
+
+        ip = max(ip, 0);
+        data[i] = min(ip, 6);
+    }
+}
diff --git a/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c b/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c
new file mode 100644
index 00000000..617386cf
--- /dev/null
+++ b/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c
@@ -0,0 +1,97 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <common_functions.h>
+
+void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
+                                    const uint8_t *input2_data,
+                                    const int32_t input1_offset,
+                                    const int32_t input2_offset,
+                                    const int32_t input1_mult,
+                                    const int32_t input2_mult,
+                                    const int32_t input1_shift,
+                                    const int32_t input2_shift,
+                                    const int32_t left_shift,
+                                    uint8_t *output,
+                                    const int32_t out_offset,
+                                    const int32_t out_mult,
+                                    const int32_t out_shift,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const int32_t size)
+{
+    for (int i = 0; i < size; i++) {
+        int32_t tmp1 = input1_data[i] + input1_offset;
+        int32_t tmp2 = input2_data[i] + input2_offset;
+
+        tmp1 <<= left_shift;
+        tmp2 <<= left_shift;
+
+        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult);
+        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);
+
+        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);
+        tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift);
+
+        int32_t out = tmp1 + tmp2;
+        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);
+        out = esp_nn_div_by_power_of_two(out, -out_shift);
+        out = out + out_offset;
+
+        out = max(activation_min, min(out, activation_max));
+        output[i] = (uint8_t) out;
+    }
+}
+
+void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
+                                    const int8_t *input2_data,
+                                    const int32_t input1_offset,
+                                    const int32_t input2_offset,
+                                    const int32_t input1_mult,
+                                    const int32_t input2_mult,
+                                    const int32_t input1_shift,
+                                    const int32_t input2_shift,
+                                    const int32_t left_shift,
+                                    int8_t *output,
+                                    const int32_t out_offset,
+                                    const int32_t out_mult,
+                                    const int32_t out_shift,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const int32_t size)
+{
+    for (int i = 0; i < size; i++) {
+        int32_t tmp1 = input1_data[i] + input1_offset;
+        int32_t tmp2 = input2_data[i] + input2_offset;
+
+        tmp1 <<= left_shift;
+        tmp2 <<= left_shift;
+
+        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult);
+        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);
+
+        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);
+        tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift);
+
+        int32_t out = tmp1 + tmp2;
+        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);
+        out = esp_nn_div_by_power_of_two(out, -out_shift);
+        out = out + out_offset;
+
+        out = max(activation_min, min(out, activation_max));
+        output[i] = (int8_t) out;
+    }
+}
diff --git a/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c b/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c
new file mode 100644
index 00000000..db8e8cc0
--- /dev/null
+++ b/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c
@@ -0,0 +1,42 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <common_functions.h>
+
+void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
+                                    const int8_t *input2_data,
+                                    const int32_t input1_offset,
+                                    const int32_t input2_offset,
+                                    int8_t *output,
+                                    const int32_t out_offset,
+                                    const int32_t out_mult,
+                                    const int32_t out_shift,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const int32_t size)
+{
+    for (int i = 0; i < size; i++) {
+        int32_t tmp1 = input1_data[i] + input1_offset;
+        int32_t tmp2 = input2_data[i] + input2_offset;
+
+        int32_t out = tmp1 * tmp2;
+        out = esp_nn_multiply_by_quantized_mult(out, out_mult, out_shift);
+        out = out + out_offset;
+
+        out = max(activation_min, min(out, activation_max));
+        output[i] = (int8_t) out;
+    }
+}
diff --git a/code/components/esp-nn/src/common/common_functions.h b/code/components/esp-nn/src/common/common_functions.h
new file mode 100644
index 00000000..0a74eca4
--- /dev/null
+++ b/code/components/esp-nn/src/common/common_functions.h
@@ -0,0 +1,255 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+
+/**
+ * The C99 standard still doesn't guarantee inlining of functions;
+ * we need to use the attribute as well to force this.
+ */
+#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline
+
+/* min/max macros */
+#ifndef max
+#define max(a, b) ({ \
+    __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    _a > _b ? _a : _b; \
+})
+
+#define min(a, b) ({ \
+    __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    _a < _b ? _a : _b; \
+})
+#endif
+
+__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
+{
+#if CONFIG_IDF_TARGET_ARCH_XTENSA
+    __asm__ volatile("nsau %0, %0" : "+r" (in));
+    return in;
+#elif defined(__GNUC__)
+    return __builtin_clz(in);
+#else
+    int32_t count = 32;
+    uint32_t x = in, y = in >> 16;
+    if (y != 0) {
+        count -= 16;
+        x = y;
+    }
+    y = x >> 8;
+    if (y != 0) {
+        count -= 8;
+        x = y;
+    }
+    y = x >> 4;
+    if (y != 0) {
+        count -= 4;
+        x = y;
+    }
+    y = x >> 2;
+    if (y != 0) {
+        count -= 2;
+        x = y;
+    }
+    y = x >> 1;
+    if (y != 0) {
+        return count - 2;
+    }
+    return count - x;
+#endif
+}
+
+/**
+ * Signed saturate a 32 bit value to 8 bits keeping output in 32 bit variable.
+ */ +__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in) +{ +#if CONFIG_IDF_TARGET_ARCH_XTENSA + __asm__ volatile("clamps %0, %0, 7" : "+a"(in)); + return in; +#else + return max(INT8_MIN, min(in, INT8_MAX)); +#endif +} + +__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64) +{ + int32_t sign = (int32_t) (val64 >> 63); + int32_t to_add = sign & ((1ul << 31) - 1); + return (int32_t) ((int64_t) (val64 + to_add) >> 31); +} + +__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1) +{ + int32_t result; + int64_t in0_64 = (int64_t) in0; + bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN); + + /* Nudge value */ + int64_t nudge_val = 1 << 30; + if ((in0 < 0) ^ (in1 < 0)) { + nudge_val = 1 - nudge_val; + } + + /* Multiply and add nudge */ + int64_t mult = in0_64 * in1 + nudge_val; + + /* Round and pickup 32 bits */ + result = esp_nn_pick_sat_high32_of64(mult); + + return overflow ? INT32_MAX : result; +} + +/** + * fast version + * this will fail for values closer to INT32_MAX and INT32_MIN by `1 << (exponent - 1)`. + * We can afford to do this because we are at the very last stage of filter. + * Also it is pretty rare condition as our output is going to be 8 bit. + */ +__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent) +{ + int32_t to_add = (1 << (exponent - 1)) - (val < 0); + return (int32_t) ((val + to_add) >> exponent); +} + +__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent) +{ + int32_t result; + + const int32_t mask = (1 << exponent) - 1; + const int32_t remainder = val & mask; + + result = val >> exponent; + int32_t threshold = (mask >> 1) + (result < 0); + + if (remainder > threshold) { + result += 1; + } + return result; +} + +__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift) +{ + int32_t left_shift = shift > 0 ? shift : 0; + int32_t right_shift = shift > 0 ? 
0 : -shift; + int32_t result = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult); + return esp_nn_div_by_power_of_two(result, right_shift); +} + +__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift) +{ + int32_t left_shift = max(shift, 0); + int32_t right_shift = left_shift - shift; + + int64_t nudge_val = 1 << 30; + int64_t in0_64 = (int64_t) (x << left_shift); + + /* Multiply and add nudge */ + int64_t mult_64 = in0_64 * mult + nudge_val; + int32_t result = (int32_t) (mult_64 >> 31); + if (right_shift) { + result = esp_nn_div_by_power_of_two_fast(result, right_shift); + } + return result; +} + +static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t pad_val, + const uint16_t pad_wd, + const uint16_t pad_ht) +{ + /* memset with pad_val */ + memset(dst, pad_val, ((input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht)) * channels); + dst += (pad_wd + input_wd + pad_wd) * channels; + + for (int i = 0; i < input_ht; i++) { + dst += pad_wd * channels; + for (int j = 0; j < input_wd * channels; j++) { + *dst++ = *src++; + } + dst += pad_wd * channels; + } +} + +static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t pad_val, + const uint16_t pad_wd, + const uint16_t pad_ht) +{ + for (int i = 0; i < input_ht; i++) { + for (int j = 0; j < input_wd * channels; j++) { + *dst++ = *src++; + } + if (pad_wd) { + memset(dst, pad_val, pad_wd * channels); + dst += pad_wd * channels; + } + } + /* pad end `pad_ht` lines at end */ + if (pad_ht) { + memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels); + } +} + +/** + * @brief convert 8 bit input data to 16 bit + * + * @param src int8_t source data + * @param dst int16_t dst data + * @param size length of data + * @param offset offset to be added to src data. Range: [-128, 127] + */ +__NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst, + const int size, const int32_t offset) +{ + int i = 0; + for (; i < size; i += 2) { + dst[i + 0] = src[i + 0] + offset; + dst[i + 1] = src[i + 1] + offset; + } + if(i < size) { + dst[i] = src[i] + offset; + } +} + +/** + * @brief convert 8 bit input data to 16 bit + * + * @param src int8_t source data + * @param dst int16_t dst data + * @param size length of data + */ +__NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size) +{ + int i = 0; + for (; i < size; i += 2) { + dst[i + 0] = src[i + 0]; + dst[i + 1] = src[i + 1]; + } + if(i < size) { + dst[i] = src[i]; + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c new file mode 100644 index 00000000..677c0ad8 --- /dev/null +++ b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c @@ -0,0 +1,179 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c
new file mode 100644
index 00000000..677c0ad8
--- /dev/null
+++ b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c
@@ -0,0 +1,179 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <common_functions.h>
+
+int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,
+                                      const data_dims_t *filter_dims,
+                                      const data_dims_t *output_dims,
+                                      const conv_params_t *conv_params)
+{
+    return 0;
+}
+
+void esp_nn_set_conv_scratch_buf_ansi(const void *buf)
+{
+
+}
+
+/**
+ * Assumption 1: i/p channels == o/p channels
+ * Assumption 2: Pointers are valid
+ * Assumption 3: dilation width = 1
+ */
+void esp_nn_conv_u8_ansi(const uint8_t *input_data,
+                         const uint16_t input_wd,
+                         const uint16_t input_ht,
+                         const uint16_t in_channels,
+                         const int32_t input_offset,
+                         const uint16_t pad_wd,
+                         const uint16_t pad_ht,
+                         const uint16_t stride_wd,
+                         const uint16_t stride_ht,
+                         const uint8_t *filter_data,
+                         const uint16_t filter_wd,
+                         const uint16_t filter_ht,
+                         const int32_t filter_offset,
+                         const int32_t *bias,
+                         uint8_t *out_data,
+                         const uint16_t out_wd,
+                         const uint16_t out_ht,
+                         const uint16_t out_channels,
+                         const int32_t out_offset,
+                         const int32_t out_shift,
+                         const int32_t out_mult,
+                         const int32_t activation_min,
+                         const int32_t activation_max)
+{
+    for (int out_y = 0; out_y < out_ht; out_y++) { // height loop
+        const int16_t base_y = (out_y * stride_ht) - pad_ht;
+        for (int out_x = 0; out_x < out_wd; out_x++) { // width loop
+            const int16_t base_x = (out_x * stride_wd) - pad_wd;
+            for (int out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { // channel loop
+                int32_t result = 0;
+
+                /* Clip the filter window so no point lies outside the input block */
+                int filter_y_start = max(0, -base_y);
+                int filter_x_start = max(0, -base_x);
+                int filter_y_end = min(filter_ht, input_ht - base_y);
+                int filter_x_end = min(filter_wd, input_wd - base_x);
+
+                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
+                    const int32_t idx_y = base_y + filter_y_idx;
+                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
+                        const int32_t idx_x = base_x + filter_x_idx;
+                        for (int in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) {
+                            int32_t input_index = (idx_y * input_wd + idx_x) * in_channels + in_ch_idx;
+                            int32_t filter_index = ((out_ch_idx * filter_ht + filter_y_idx)
+                                                    * filter_wd + filter_x_idx) * in_channels
+                                                   + in_ch_idx;
+                            int32_t input_val = input_data[input_index] + input_offset;
+                            int32_t filter_val = filter_data[filter_index] + filter_offset;
+                            result += input_val * filter_val;
+                        }
+                    }
+                }
+                if (bias) {
+                    result += bias[out_ch_idx];
+                }
+                result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift);
+                result += out_offset;
+                result = max(result, activation_min);
+                result = min(result, activation_max);
+
+                int out_index = (out_y * out_wd + out_x) * out_channels + out_ch_idx;
+                out_data[out_index] = (uint8_t) result;
+            }
+        }
+    }
+}
+
+/**
+ * Assumption 1: i/p channels == o/p channels
+ * Assumption 2: Pointers are valid
+ * Assumption 3: dilation width = 1
+ */
+void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,
+                         const int8_t *input_data,
+                         const data_dims_t *filter_dims,
+                         const int8_t *filter_data,
+                         const int32_t *bias,
+                         const data_dims_t *output_dims,
+                         int8_t *out_data,
+                         const conv_params_t *conv_params,
+                         const quant_data_t *quant_data)
+{
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t input_ht = input_dims->height;
+    const uint16_t in_channels = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t pad_wd = conv_params->padding.width;
+    const uint16_t pad_ht = conv_params->padding.height;
+    const uint16_t stride_wd = conv_params->stride.width;
+    const uint16_t stride_ht = conv_params->stride.height;
+    const uint16_t filter_wd = filter_dims->width;
+    const uint16_t filter_ht = filter_dims->height;
+    const uint16_t out_wd = output_dims->width;
+    const uint16_t out_ht = output_dims->height;
+    const uint16_t out_channels = output_dims->channels;
+    const int32_t *out_shift = quant_data->shift;
+    const int32_t *out_mult = quant_data->mult;
+    const int32_t activation_min = conv_params->activation.min;
+    const int32_t activation_max = conv_params->activation.max;
+
+    int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx;
+
+    for (out_y = 0; out_y < out_ht; out_y++) {
+        for (out_x = 0; out_x < out_wd; out_x++) {
+            for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {
+                int32_t conv_out = 0;
+
+                const int32_t base_y = stride_ht * out_y - pad_ht;
+                const int32_t base_x = stride_wd * out_x - pad_wd;
+
+                const int32_t filter_y_start = max(0, -base_y);
+                const int32_t filter_x_start = max(0, -base_x);
+
+                const int32_t filter_y_end = min(filter_ht, input_ht - base_y);
+                const int32_t filter_x_end = min(filter_wd, input_wd - base_x);
+
+                for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
+                    for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
+                        const int32_t in_row = base_y + filter_y_idx;
+                        const int32_t in_col = base_x + filter_x_idx;
+                        int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels;
+                        int32_t filter_base_offset = out_ch_idx * in_channels * filter_ht * filter_wd +
+                                                     (filter_y_idx * filter_wd + filter_x_idx) * in_channels;
+                        for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) {
+                            conv_out +=
+                                (input_data[input_base_offset + in_ch_idx] + input_offset) *
+                                filter_data[filter_base_offset + in_ch_idx];
+                        }
+                    }
+                }
+                if (bias) {
+                    conv_out += bias[out_ch_idx];
+                }
+                conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]);
+                conv_out += out_offset;
+                conv_out = max(conv_out, activation_min);
+                conv_out = min(conv_out, activation_max);
+                *out_data++ = (int8_t) conv_out;
+            }
+        }
+    }
+}
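Both reference kernels above use the same window arithmetic: `base = out * stride - pad` locates the top-left input sample for an output pixel, and the clipped `filter_*_start/end` bounds skip the taps that would land in the padding. A worked example (ours, assuming the usual output geometry `out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1`):

/* input_wd = 28, filter_wd = 3, pad_wd = 1, stride_wd = 1
 *     -> out_wd = (28 + 2 - 3) / 1 + 1 = 28
 * out_x = 0:  base_x = -1, filter_x_start = max(0, 1) = 1
 *             -> the tap over the left padding column is skipped
 * out_x = 27: base_x = 26, filter_x_end = min(3, 28 - 26) = 2
 *             -> the tap past the right edge is skipped
 */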
diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c b/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c
new file mode 100644
index 00000000..e13129b2
--- /dev/null
+++ b/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c
@@ -0,0 +1,463 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <string.h>
+
+#include <common_functions.h>
+
+static int16_t *scratch_buffer = NULL;
+
+extern void esp_nn_conv_s8_mult8_1x1_esp32s3(const int8_t *input_data,
+                                             const uint16_t input_wd,
+                                             const uint16_t input_ht,
+                                             const uint16_t in_channels,
+                                             const int32_t input_offset,
+                                             const int8_t *filter_aligned,
+                                             const int32_t *bias,
+                                             int8_t *out_data,
+                                             const uint16_t out_wd,
+                                             const uint16_t out_ht,
+                                             const uint16_t out_channels,
+                                             const int32_t out_offset,
+                                             const int32_t *out_shift,
+                                             const int32_t *out_mult,
+                                             const int32_t activation_min,
+                                             const int32_t activation_max,
+                                             void *buffer /* scratch buffer */);
+
+extern void esp_nn_conv_s16_mult4_1x1_esp32s3(const int16_t *input_data,
+                                              const uint16_t input_wd,
+                                              const uint16_t input_ht,
+                                              const uint16_t in_channels,
+                                              const int16_t *filter_data,
+                                              const int32_t *bias,
+                                              int8_t *out_data,
+                                              const uint16_t out_wd,
+                                              const uint16_t out_ht,
+                                              const uint16_t out_channels,
+                                              const int32_t out_offset,
+                                              const int32_t *out_shift,
+                                              const int32_t *out_mult,
+                                              const int32_t activation_min,
+                                              const int32_t activation_max,
+                                              void *buffer /* scratch buffer */);
+
+extern void esp_nn_conv_s16_mult8_esp32s3(const int16_t *input_data,
+                                          const uint16_t input_wd,
+                                          const uint16_t input_ht,
+                                          const uint16_t in_channels,
+                                          const uint16_t pad_wd,
+                                          const uint16_t pad_ht,
+                                          const uint16_t stride_wd,
+                                          const uint16_t stride_ht,
+                                          const int16_t *filter_data,
+                                          const uint16_t filter_wd,
+                                          const uint16_t filter_ht,
+                                          const int32_t *bias,
+                                          int8_t *out_data,
+                                          const uint16_t out_wd,
+                                          const uint16_t out_ht,
+                                          const uint16_t out_channels,
+                                          const int32_t out_offset,
+                                          const int32_t *out_shift,
+                                          const int32_t *out_mult,
+                                          const int32_t activation_min,
+                                          const int32_t activation_max);
+
+extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst,
+                                                         const int size, const int32_t offset);
+
+extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size);
+
+static void esp_nn_conv_s8_unrolled(const data_dims_t *input_dims,
+                                    const int8_t *input_data,
+                                    const data_dims_t *filter_dims,
+                                    const int8_t *filter_data,
+                                    const int32_t *bias,
+                                    const data_dims_t *output_dims,
+                                    int8_t *out_data,
+                                    const conv_params_t *conv_params,
+                                    const quant_data_t *quant_data)
+{
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t input_ht = input_dims->height;
+    const uint16_t in_ch = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t pad_wd = conv_params->padding.width;
+    const uint16_t pad_ht = conv_params->padding.height;
+    const uint16_t stride_wd = conv_params->stride.width;
+    const uint16_t stride_ht = conv_params->stride.height;
+    const uint16_t filter_wd = filter_dims->width;
+    const uint16_t filter_ht = filter_dims->height;
+    const uint16_t out_wd = output_dims->width;
+    const uint16_t out_ht = output_dims->height;
+    const uint16_t out_ch = output_dims->channels;
+    const int32_t *out_shift = quant_data->shift;
+    const int32_t *out_mult = quant_data->mult;
+    const int32_t activation_min = conv_params->activation.min;
+    const int32_t activation_max = conv_params->activation.max;
+
+    int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx;
+
+    for (out_y = 0; out_y < out_ht; out_y++) {
+        for (out_x = 0; out_x < out_wd; out_x++) {
+            for (out_ch_idx = 0; out_ch_idx < out_ch; out_ch_idx++) {
+                int32_t conv_out = 0;
+
+                const int32_t base_y = stride_ht * out_y - pad_ht;
+                const int32_t base_x = stride_wd * out_x - pad_wd;
+
+ const int32_t filter_y_start = max(0, -base_y); + const int32_t filter_x_start = max(0, -base_x); + + const int32_t filter_y_end = min(filter_ht, input_ht - base_y); + const int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_ch; + int32_t filter_base_offset = out_ch_idx * in_ch * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_ch; + for (in_ch_idx = 0; in_ch_idx < in_ch; in_ch_idx++) { + conv_out += + (input_data[input_base_offset + in_ch_idx] + input_offset) * + filter_data[filter_base_offset + in_ch_idx]; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + + for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { + for (filter_x_idx = 0; filter_x_idx < filter_wd; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; + const int8_t *input_data_ptr = input_data + input_base_offset; + const int8_t *filter_data_ptr = filter_data + filter_base_offset; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += (*input_data_ptr++ + input_offset) * *filter_data_ptr++; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid_3x3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const 
uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + for (filter_y_idx = 0; filter_y_idx < 3; filter_y_idx++) { + for (filter_x_idx = 0; filter_x_idx < 3; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * 3 * 3 + + (filter_y_idx * 3 + filter_x_idx) * in_channels; + const int8_t *input_data_ptr = input_data + input_base_offset; + const int8_t *filter_data_ptr = filter_data + filter_base_offset; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += (*input_data_ptr++ + input_offset) * *filter_data_ptr++; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid_ch3_3x3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, filter_y_idx; + + /* use scratch_buffer to pre-compute offset factor */ + int16_t *filter_sum = (int16_t *) scratch_buffer; + const int8_t *filter_ptr = filter_data; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int16_t sum_val = 0; + for (int i = 0; i < 9; i++) { + sum_val += *filter_ptr++; + sum_val += *filter_ptr++; + sum_val += *filter_ptr++; + } + *filter_sum++ = sum_val; + } + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + const int8_t *filter_data_ptr = filter_data; + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + const int8_t *input_base_ptr = input_data + (base_y * input_wd + base_x) * 3; + int16_t *filter_sum = (int16_t *) scratch_buffer; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + for (filter_y_idx = 0; filter_y_idx < 3; filter_y_idx++) { + const int8_t *input_data_ptr = input_base_ptr + (filter_y_idx * input_wd) * 3; + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * 
(*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + } + + conv_out += *filter_sum++ * input_offset; + + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const conv_params_t *conv_params) +{ + const uint16_t input_wd = input_dims->width; + const uint16_t input_ht = input_dims->height; + const uint16_t in_ch = input_dims->channels; + const uint16_t filter_wd = filter_dims->width; + const uint16_t filter_ht = filter_dims->height; + const uint16_t out_ch = output_dims->channels; + const uint16_t pad_wd = conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + + int filter_size = filter_wd * filter_ht * in_ch * out_ch; + int input_size = input_wd * input_ht * in_ch; + + int transpose_buf_size = 2 * (8 * in_ch); /* to store intermediate data */ + if (input_wd * input_ht < 8) { + transpose_buf_size = 0; // not using this for leftover + } + int align_buf_size = 32; /* extra buffer for alignment */ + if (in_ch % 8 == 0 && filter_wd == 1 && filter_ht == 1 && + pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { + return filter_size + transpose_buf_size + align_buf_size; + } + return 2 * (filter_size + input_size) + transpose_buf_size + align_buf_size; +} + +void esp_nn_set_conv_scratch_buf_esp32s3(void *buf) +{ + scratch_buffer = (int16_t *) buf; +} + +void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, + const int8_t *input, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const conv_params_t *conv_params, + const quant_data_t *quant_data) +{ + const uint16_t input_wd = input_dims->width; + const uint16_t input_ht = input_dims->height; + const uint16_t channels = input_dims->channels; + const int32_t input_offset = conv_params->in_offset; + const int32_t out_offset = conv_params->out_offset; + const uint16_t pad_wd = conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + const uint16_t filter_wd = filter_dims->width; + const uint16_t filter_ht = filter_dims->height; + const uint16_t out_wd = output_dims->width; + const uint16_t out_ht = output_dims->height; + const uint16_t out_channels = output_dims->channels; + const int32_t *out_shift = quant_data->shift; + const int32_t *out_mult = quant_data->mult; + const int32_t activation_min = conv_params->activation.min; + const int32_t activation_max = conv_params->activation.max; + + int filter_size = filter_wd * filter_ht * channels * out_channels; + int input_size = input_wd * input_ht * channels; + int align_len = 16 - (filter_size & 15); + int16_t *filter_data16 = scratch_buffer; + int16_t *input_data16 = scratch_buffer + filter_size + align_len; + + if (scratch_buffer == NULL) { + printf("esp_nn_conv error! 
scratch_buffer not set!\n");
+        return;
+    }
+
+    if (channels % 8 == 0 && filter_wd == 1 && filter_ht == 1 &&
+            pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) {
+        int8_t *filter_aligned = (int8_t *) scratch_buffer;
+        int scratch_offset = (int) (filter_aligned + filter_size);
+        void *scratch_buf = (void *) (scratch_offset + 16 - (scratch_offset & 15));
+        memcpy(filter_aligned, filter_data, filter_size); // copy to aligned address
+        esp_nn_conv_s8_mult8_1x1_esp32s3(
+            input, input_wd, input_ht, channels, input_offset, filter_aligned,
+            bias, out_data, out_wd, out_ht, out_channels, out_offset,
+            out_shift, out_mult, activation_min, activation_max, scratch_buf);
+    } else if (channels % 4 == 0 && filter_wd == 1 && filter_ht == 1 &&
+            (input_wd * input_ht) % 4 == 0 && /* TODO: remove this check */
+            pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) {
+        int scratch_offset = (int) (input_data16 + input_size);
+        void *scratch_buf = (void *) (scratch_offset + 16 - (scratch_offset & 15));
+        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
+        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input, input_data16, input_size, input_offset);
+        esp_nn_conv_s16_mult4_1x1_esp32s3(
+            input_data16, input_wd, input_ht, channels, filter_data16,
+            bias, out_data, out_wd, out_ht, out_channels, out_offset,
+            out_shift, out_mult, activation_min, activation_max, scratch_buf);
+    } else if (channels % 8 == 0) {
+        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
+        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input, input_data16, input_size, input_offset);
+        esp_nn_conv_s16_mult8_esp32s3(
+            input_data16, input_wd, input_ht, channels, pad_wd, pad_ht,
+            stride_wd, stride_ht, filter_data16, filter_wd, filter_ht, bias,
+            out_data, out_wd, out_ht, out_channels, out_offset, out_shift,
+            out_mult, activation_min, activation_max);
+    } else if (pad_wd == 0 && pad_ht == 0) {
+        if (filter_wd == 3 && filter_ht == 3 && channels == 3) {
+            esp_nn_conv_s8_pad_valid_ch3_3x3(input, input_wd, input_ht, input_offset,
+                                             stride_wd, stride_ht, filter_data, bias,
+                                             out_data, out_wd, out_ht, out_channels, out_offset,
+                                             out_shift, out_mult, activation_min, activation_max);
+        } else {
+            esp_nn_conv_s8_pad_valid(input, input_wd, input_ht, channels, input_offset,
+                                     stride_wd, stride_ht, filter_data, filter_wd, filter_ht, bias,
+                                     out_data, out_wd, out_ht, out_channels, out_offset, out_shift,
+                                     out_mult, activation_min, activation_max);
+        }
+    } else {
+        /* Basic unrolled version */
+        esp_nn_conv_s8_unrolled(input_dims, input, filter_dims, filter_data,
+                                bias, output_dims, out_data, conv_params, quant_data);
+    }
+}
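The esp32s3 convolution above expects a caller-managed scratch arena: both the aligned filter copy and the s8-to-s16 widening stage data through `scratch_buffer`. A sketch of the intended calling sequence (ours; `heap_caps_malloc`/`heap_caps_free` are the ESP-IDF allocator, and error handling is elided):

#include <esp_heap_caps.h>

void conv_once(const data_dims_t *in_dims, const data_dims_t *filt_dims,
               const data_dims_t *out_dims, const conv_params_t *params,
               const quant_data_t *quant,
               const int8_t *input, const int8_t *filter,
               const int32_t *bias, int8_t *output)
{
    /* 1. ask how much scratch these shapes need */
    int size = esp_nn_get_conv_scratch_size_esp32s3(in_dims, filt_dims, out_dims, params);

    /* 2. allocate it once, ideally in internal RAM */
    void *scratch = heap_caps_malloc(size, MALLOC_CAP_8BIT);

    /* 3. register it before the first convolution */
    esp_nn_set_conv_scratch_buf_esp32s3(scratch);

    /* 4. run; the dispatcher picks the fastest specialization itself */
    esp_nn_conv_s8_esp32s3(in_dims, input, filt_dims, filter, bias,
                           out_dims, output, params, quant);

    heap_caps_free(scratch);
}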
diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_opt.c b/code/components/esp-nn/src/convolution/esp_nn_conv_opt.c
new file mode 100644
index 00000000..be96430e
--- /dev/null
+++ b/code/components/esp-nn/src/convolution/esp_nn_conv_opt.c
@@ -0,0 +1,179 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <common_functions.h>
+
+int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,
+                                     const data_dims_t *filter_dims,
+                                     const data_dims_t *output_dims,
+                                     const conv_params_t *conv_params)
+{
+    return 0;
+}
+
+void esp_nn_set_conv_scratch_buf_opt(const void *buf)
+{
+
+}
+
+__attribute__ ((noinline))
+static void esp_nn_conv_s8_1x1(const data_dims_t *input_dims,
+                               const int8_t *input_data,
+                               const int8_t *filter_data,
+                               const int32_t *bias,
+                               const data_dims_t *output_dims,
+                               int8_t *out_data,
+                               const conv_params_t *conv_params,
+                               const quant_data_t *quant_data)
+{
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t in_channels = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t stride_wd = conv_params->stride.width;
+    const uint16_t stride_ht = conv_params->stride.height;
+    const uint16_t out_wd = output_dims->width;
+    const uint16_t out_ht = output_dims->height;
+    const uint16_t out_channels = output_dims->channels;
+    const int32_t activation_min = conv_params->activation.min;
+    const int32_t activation_max = conv_params->activation.max;
+
+    for (int32_t in_row = 0; in_row < out_ht * stride_ht; in_row += stride_ht) {
+        for (int32_t in_col = 0; in_col < out_wd * stride_wd; in_col += stride_wd) {
+            const int32_t *out_mult = quant_data->mult;
+            const int32_t *out_shift = quant_data->shift;
+            const int8_t *filter_ptr = filter_data;
+            const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels;
+            int32_t out_ch_idx = 0;
+            for (; out_ch_idx < out_channels; out_ch_idx++) {
+                int32_t conv_out = 0;
+
+                const int8_t *input_ptr = input_base_ptr;
+
+                int32_t in_ch_idx = 0;
+                for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {
+                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;
+                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;
+                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;
+                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;
+                }
+                for (; in_ch_idx < in_channels; in_ch_idx++) {
+                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;
+                }
+                if (bias) {
+                    conv_out += bias[out_ch_idx];
+                }
+                conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, *out_mult++, *out_shift++);
+                conv_out += out_offset;
+                conv_out = max(conv_out, activation_min);
+                conv_out = min(conv_out, activation_max);
+                *out_data++ = (int8_t) conv_out;
+            }
+        }
+    }
+}
+
+/**
+ * Assumption 1: i/p channels == o/p channels
+ * Assumption 2: Pointers are valid
+ * Assumption 3: dilation width = 1
+ */
+void esp_nn_conv_s8_opt(const data_dims_t *input_dims,
+                        const int8_t *input_data,
+                        const data_dims_t *filter_dims,
+                        const int8_t *filter_data,
+                        const int32_t *bias,
+                        const data_dims_t *output_dims,
+                        int8_t *out_data,
+                        const conv_params_t *conv_params,
+                        const quant_data_t *quant_data)
+{
+    const uint16_t filter_wd = filter_dims->width;
+    const uint16_t filter_ht = filter_dims->height;
+
+    if (filter_wd == 1 && filter_ht == 1) {
+        esp_nn_conv_s8_1x1(input_dims, input_data, filter_data, bias,
+                           output_dims, out_data, conv_params, quant_data);
+        return;
+    }
+
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t input_ht = input_dims->height;
+    const uint16_t in_channels = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t pad_wd =
conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + const uint16_t out_wd = output_dims->width; + const uint16_t out_ht = output_dims->height; + const uint16_t out_channels = output_dims->channels; + const int32_t activation_min = conv_params->activation.min; + const int32_t activation_max = conv_params->activation.max; + + int32_t out_ch_idx, out_y, out_x, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + const int32_t *out_shift = quant_data->shift; + const int32_t *out_mult = quant_data->mult; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + const int32_t base_y = stride_ht * out_y - pad_ht; + const int32_t base_x = stride_wd * out_x - pad_wd; + + const int32_t filter_y_start = max(0, -base_y); + const int32_t filter_x_start = max(0, -base_x); + + const int32_t filter_y_end = min(filter_ht, input_ht - base_y); + const int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + + const int8_t *input_ptr = input_data + + (in_row * input_wd + in_col) * in_channels; + const int8_t *filter_ptr = filter_data + + out_ch_idx * in_channels * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; + int32_t in_ch_idx = 0; + for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { + conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; + conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; + conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; + conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; + } + for (; in_ch_idx < in_channels; in_ch_idx ++) { + conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, *out_mult++, *out_shift++); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c new file mode 100644 index 00000000..1cd02e0f --- /dev/null +++ b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c @@ -0,0 +1,100 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <stdint.h>
+#include <common_functions.h>
+
+int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims,
+                                                const data_dims_t *filter_dims,
+                                                const data_dims_t *output_dims,
+                                                const dw_conv_params_t *conv_params)
+{
+    return 0;
+}
+
+void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf)
+{
+
+}
+
+void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,
+                                   const int8_t *input_data,
+                                   const data_dims_t *filter_dims,
+                                   const int8_t *filter_data,
+                                   const int32_t *bias,
+                                   const data_dims_t *output_dims,
+                                   int8_t *out_data,
+                                   const dw_conv_params_t *conv_params,
+                                   const quant_data_t *quant_data)
+{
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t input_ht = input_dims->height;
+    const uint16_t channels = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t pad_wd = conv_params->padding.width;
+    const uint16_t pad_ht = conv_params->padding.height;
+    const uint16_t stride_wd = conv_params->stride.width;
+    const uint16_t stride_ht = conv_params->stride.height;
+    const uint16_t filter_wd = filter_dims->width;
+    const uint16_t filter_ht = filter_dims->height;
+    const uint16_t out_wd = output_dims->width;
+    const uint16_t out_ht = output_dims->height;
+    const int32_t *out_shift = quant_data->shift;
+    const int32_t *out_mult = quant_data->mult;
+    const int32_t activation_min = conv_params->activation.min;
+    const int32_t activation_max = conv_params->activation.max;
+    const uint16_t ch_mult = conv_params->ch_mult;
+
+    int out_idx = 0;
+    for (int out_y = 0; out_y < out_ht; out_y++) { // height loop
+        const int16_t base_y = (out_y * stride_ht) - pad_ht;
+        for (int out_x = 0; out_x < out_wd; out_x++) { // width loop
+            const int16_t base_x = (out_x * stride_wd) - pad_wd;
+            for (int ch_idx = 0; ch_idx < channels; ch_idx++) { // channel loop
+                for (int ch_mult_idx = 0; ch_mult_idx < ch_mult; ch_mult_idx++) {
+                    int32_t result = 0;
+                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;
+
+                    /* Clip the filter window so no point lies outside the input block */
+                    int filter_y_start = max(0, -base_y);
+                    int filter_x_start = max(0, -base_x);
+                    int filter_y_end = min(filter_ht, input_ht - base_y);
+                    int filter_x_end = min(filter_wd, input_wd - base_x);
+
+                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
+                        const int32_t idx_y = base_y + filter_y_idx;
+                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
+                            const int32_t idx_x = base_x + filter_x_idx;
+                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
+                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;
+                            int32_t input_val = input_data[input_index] + input_offset;
+                            int32_t filter_val = filter_data[filter_index];
+                            result += input_val * filter_val;
+                        }
+                    }
+                    if (bias) {
+                        result += bias[out_ch_idx];
+                    }
+                    result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]);
+                    result += out_offset;
+                    result = max(result, activation_min);
+                    result = min(result, activation_max);
+
+                    out_data[out_idx++] = result;
+                }
+            }
+        }
+    }
+}
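The indexing in the ANSI depthwise kernel spells out the data layout shared by all the depthwise variants that follow: each input channel `ch_idx` produces `ch_mult` adjacent outputs, and the filter tensor stores the output channel as its innermost dimension. A small worked example (ours):

/* channels = 2, ch_mult = 3  ->  out_channels = channels * ch_mult = 6.
 * Input channel 1 feeds output channels 1 * 3 + 0 .. 1 * 3 + 2, i.e. 3, 4, 5,
 * and the weight for tap (filter_y, filter_x) of output channel out_ch is at
 *     (filter_y * filter_wd + filter_x) * (channels * ch_mult) + out_ch
 */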
diff --git a/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_opt.c b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_opt.c
new file mode 100644
index 00000000..4afea3f3
--- /dev/null
+++ b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_opt.c
@@ -0,0 +1,291 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <common_functions.h>
+
+int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims,
+                                               const data_dims_t *filter_dims,
+                                               const data_dims_t *output_dims,
+                                               const dw_conv_params_t *conv_params)
+{
+    return 0;
+}
+
+void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf)
+{
+
+}
+
+/* common channel multiplier == 1 case */
+__attribute__ ((noinline))
+static void esp_nn_depthwise_conv_s8_ch_mult_1(const data_dims_t *input_dims,
+                                               const int8_t *input_data,
+                                               const data_dims_t *filter_dims,
+                                               const int8_t *filter_data,
+                                               const int32_t *bias,
+                                               const data_dims_t *output_dims,
+                                               int8_t *out_data,
+                                               const dw_conv_params_t *conv_params,
+                                               const quant_data_t *quant_data)
+{
+    const uint16_t input_wd = input_dims->width;
+    const uint16_t input_ht = input_dims->height;
+    const uint16_t channels = input_dims->channels;
+    const int32_t input_offset = conv_params->in_offset;
+    const int32_t out_offset = conv_params->out_offset;
+    const uint16_t pad_wd = conv_params->padding.width;
+    const uint16_t pad_ht = conv_params->padding.height;
+    const uint16_t stride_wd = conv_params->stride.width;
+    const uint16_t stride_ht = conv_params->stride.height;
+    const uint16_t filter_wd = filter_dims->width;
+    const uint16_t filter_ht = filter_dims->height;
+    const uint16_t out_wd = output_dims->width;
+    const uint16_t out_ht = output_dims->height;
+    const int32_t activation_min = conv_params->activation.min;
+    const int32_t activation_max = conv_params->activation.max;
+
+    int out_idx = 0;
+    for (int out_y = 0; out_y < out_ht; out_y++) { // height loop
+        const int16_t base_y = (out_y * stride_ht) - pad_ht;
+        for (int out_x = 0; out_x < out_wd; out_x++) { // width loop
+            const int16_t base_x = (out_x * stride_wd) - pad_wd;
+
+            const int32_t *out_shift = quant_data->shift;
+            const int32_t *out_mult = quant_data->mult;
+
+            /* Clip the filter window so no point lies outside the input block */
+            int filter_y_start = max(0, -base_y);
+            int filter_x_start = max(0, -base_x);
+            int filter_y_end = min(filter_ht, input_ht - base_y);
+            int filter_x_end = min(filter_wd, input_wd - base_x);
+
+            int ch_idx = 0;
+            for (; ch_idx < channels - 3; ch_idx += 4) { // channel loop
+                int32_t result0 = 0;
+                int32_t result1 = 0;
+                int32_t result2 = 0;
+                int32_t result3 = 0;
+
+                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
+                    const int32_t idx_y = base_y + filter_y_idx;
+                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
+                        const int32_t idx_x = base_x + filter_x_idx;
+                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
+                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx;
+                        int32_t input_val0 = input_data[input_index + 0] + input_offset;
+                        int32_t input_val1 = input_data[input_index + 1] + input_offset;
+                        int32_t input_val2 = input_data[input_index + 2] + input_offset;
+                        int32_t input_val3 = input_data[input_index + 3] + input_offset;
+                        int32_t
filter_val0 = filter_data[filter_index + 0]; + int32_t filter_val1 = filter_data[filter_index + 1]; + int32_t filter_val2 = filter_data[filter_index + 2]; + int32_t filter_val3 = filter_data[filter_index + 3]; + result0 += input_val0 * filter_val0; + result1 += input_val1 * filter_val1; + result2 += input_val2 * filter_val2; + result3 += input_val3 * filter_val3; + } + } + if (bias) { + result0 += bias[ch_idx + 0]; + result1 += bias[ch_idx + 1]; + result2 += bias[ch_idx + 2]; + result3 += bias[ch_idx + 3]; + } + result0 = esp_nn_multiply_by_quantized_mult_fast(result0, *out_mult++, *out_shift++); + result1 = esp_nn_multiply_by_quantized_mult_fast(result1, *out_mult++, *out_shift++); + result2 = esp_nn_multiply_by_quantized_mult_fast(result2, *out_mult++, *out_shift++); + result3 = esp_nn_multiply_by_quantized_mult_fast(result3, *out_mult++, *out_shift++); + + result0 += out_offset; + result1 += out_offset; + result2 += out_offset; + result3 += out_offset; + + result0 = max(result0, activation_min); + result1 = max(result1, activation_min); + result2 = max(result2, activation_min); + result3 = max(result3, activation_min); + + result0 = min(result0, activation_max); + result1 = min(result1, activation_max); + result2 = min(result2, activation_max); + result3 = min(result3, activation_max); + + out_data[out_idx++] = result0; + out_data[out_idx++] = result1; + out_data[out_idx++] = result2; + out_data[out_idx++] = result3; + } + for (; ch_idx < channels; ch_idx++) {//channel_loop + int32_t result = 0; + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult_fast(result, *out_mult++, *out_shift++); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } +} + +void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const dw_conv_params_t *conv_params, + const quant_data_t *quant_data) +{ + const uint16_t ch_mult = conv_params->ch_mult; + if (ch_mult == 1) { + esp_nn_depthwise_conv_s8_ch_mult_1(input_dims, input_data, filter_dims, filter_data, + bias, output_dims, out_data, conv_params, quant_data); + return; + } + const uint16_t input_wd = input_dims->width; + const uint16_t input_ht = input_dims->height; + const uint16_t channels = input_dims->channels; + const int32_t input_offset = conv_params->in_offset; + const int32_t out_offset = conv_params->out_offset; + const uint16_t pad_wd = conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + const uint16_t filter_wd = filter_dims->width; + const uint16_t filter_ht = filter_dims->height; + const uint16_t out_wd = 
output_dims->width; + const uint16_t out_ht = output_dims->height; + const int32_t activation_min = conv_params->activation.min; + const int32_t activation_max = conv_params->activation.max; + + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + + const int32_t *out_shift = quant_data->shift; + const int32_t *out_mult = quant_data->mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + int ch_mult_idx = 0; + for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) { + int32_t result0 = 0; + int32_t result1 = 0; + int32_t result2 = 0; + int32_t result3 = 0; + const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val0 = filter_data[filter_index + 0]; + int32_t filter_val1 = filter_data[filter_index + 1]; + int32_t filter_val2 = filter_data[filter_index + 2]; + int32_t filter_val3 = filter_data[filter_index + 3]; + result0 += input_val * filter_val0; + result1 += input_val * filter_val1; + result2 += input_val * filter_val2; + result3 += input_val * filter_val3; + } + } + if (bias) { + result0 += bias[out_ch_idx + 0]; + result1 += bias[out_ch_idx + 1]; + result2 += bias[out_ch_idx + 2]; + result3 += bias[out_ch_idx + 3]; + } + result0 = esp_nn_multiply_by_quantized_mult_fast(result0, *out_mult++, *out_shift++); + result1 = esp_nn_multiply_by_quantized_mult_fast(result1, *out_mult++, *out_shift++); + result2 = esp_nn_multiply_by_quantized_mult_fast(result2, *out_mult++, *out_shift++); + result3 = esp_nn_multiply_by_quantized_mult_fast(result3, *out_mult++, *out_shift++); + + result0 += out_offset; + result1 += out_offset; + result2 += out_offset; + result3 += out_offset; + + result0 = max(result0, activation_min); + result1 = max(result1, activation_min); + result2 = max(result2, activation_min); + result3 = max(result3, activation_min); + result0 = min(result0, activation_max); + result1 = min(result1, activation_max); + result2 = min(result2, activation_max); + result3 = min(result3, activation_max); + + out_data[out_idx++] = result0; + out_data[out_idx++] = result1; + out_data[out_idx++] = result2; + out_data[out_idx++] = result3; + } + for (; ch_mult_idx < ch_mult; ch_mult_idx++) { + int32_t result = 0; + const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + 
int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;
+                            int32_t input_val = input_data[input_index] + input_offset;
+                            int32_t filter_val = filter_data[filter_index];
+                            result += input_val * filter_val;
+                        }
+                    }
+                    if (bias) {
+                        result += bias[out_ch_idx];
+                    }
+                    result = esp_nn_multiply_by_quantized_mult_fast(result, *out_mult++, *out_shift++);
+                    result += out_offset;
+                    result = max(result, activation_min);
+                    result = min(result, activation_max);
+
+                    out_data[out_idx++] = result;
+                }
+            }
+        }
+    }
+}
diff --git a/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c
new file mode 100644
index 00000000..9167a43f
--- /dev/null
+++ b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c
@@ -0,0 +1,543 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <string.h>
+
+#include <common_functions.h>
+
+static int16_t *scratch_buffer = NULL;
+
+extern void esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(const int16_t *input_data,
+                                                        const uint16_t input_wd,
+                                                        const uint16_t input_ht,
+                                                        const uint16_t channels,
+                                                        const uint16_t pad_wd,
+                                                        const uint16_t pad_ht,
+                                                        const uint16_t stride_wd,
+                                                        const uint16_t stride_ht,
+                                                        const uint16_t ch_mult,
+                                                        const int16_t *filter_data,
+                                                        const int32_t *bias,
+                                                        int8_t *out_data,
+                                                        const uint16_t out_wd,
+                                                        const uint16_t out_ht,
+                                                        const int32_t out_offset,
+                                                        const int32_t *out_shift,
+                                                        const int32_t *out_mult,
+                                                        const int32_t activation_min,
+                                                        const int32_t activation_max);
+
+extern void esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(const int8_t *input_data,
+                                                              const uint16_t input_wd,
+                                                              const uint16_t input_ht,
+                                                              const uint16_t channels,
+                                                              const int32_t input_offset,
+                                                              const uint16_t stride_wd,
+                                                              const uint16_t stride_ht,
+                                                              const int8_t *filter_data,
+                                                              const int32_t *bias,
+                                                              int8_t *out_data,
+                                                              const uint16_t out_wd,
+                                                              const uint16_t out_ht,
+                                                              const int32_t out_offset,
+                                                              const int32_t *out_shift,
+                                                              const int32_t *out_mult,
+                                                              const int32_t activation_min,
+                                                              const int32_t activation_max);
+
+extern void esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3(const int16_t *input_data,
+                                                               const uint16_t input_wd,
+                                                               const uint16_t input_ht,
+                                                               const uint16_t channels,
+                                                               const uint16_t stride_wd,
+                                                               const uint16_t stride_ht,
+                                                               const int16_t *filter_data,
+                                                               const int32_t *bias,
+                                                               int8_t *out_data,
+                                                               const uint16_t out_wd,
+                                                               const uint16_t out_ht,
+                                                               const int32_t out_offset,
+                                                               const int32_t *out_shift,
+                                                               const int32_t *out_mult,
+                                                               const int32_t activation_min,
+                                                               const int32_t activation_max);
+
+extern void esp_nn_depthwise_conv_s16_mult8_esp32s3(const int16_t *input_data,
+                                                    const uint16_t input_wd,
+                                                    const uint16_t input_ht,
+                                                    const uint16_t channels,
+                                                    const uint16_t pad_wd,
+                                                    const uint16_t pad_ht,
+                                                    const uint16_t stride_wd,
+                                                    const uint16_t stride_ht,
+                                                    const uint16_t ch_mult,
+                                                    const int16_t *filter_data,
+                                                    const uint16_t filter_wd,
+                                                    const uint16_t filter_ht,
const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult4_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult1_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size); + +extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst, + const int size, const int32_t offset); + +static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + int ch_mult_idx = 0; + for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) { + int32_t result0 = 0, result1 = 0, result2 = 0, result3 = 0; + const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int 
filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val0 = filter_data[filter_index + 0]; + int32_t filter_val1 = filter_data[filter_index + 1]; + int32_t filter_val2 = filter_data[filter_index + 2]; + int32_t filter_val3 = filter_data[filter_index + 3]; + result0 += input_val * filter_val0; + result1 += input_val * filter_val1; + result2 += input_val * filter_val2; + result3 += input_val * filter_val3; + } + } + if (bias) { + result0 += bias[out_ch_idx + 0]; + result1 += bias[out_ch_idx + 1]; + result2 += bias[out_ch_idx + 2]; + result3 += bias[out_ch_idx + 3]; + } + result0 = esp_nn_multiply_by_quantized_mult(result0, + out_mult[out_ch_idx + 0], out_shift[out_ch_idx + 0]); + result1 = esp_nn_multiply_by_quantized_mult(result1, + out_mult[out_ch_idx + 1], out_shift[out_ch_idx + 1]); + result2 = esp_nn_multiply_by_quantized_mult(result2, + out_mult[out_ch_idx + 2], out_shift[out_ch_idx + 2]); + result3 = esp_nn_multiply_by_quantized_mult(result3, + out_mult[out_ch_idx + 3], out_shift[out_ch_idx + 3]); + + result0 += out_offset; + result1 += out_offset; + result2 += out_offset; + result3 += out_offset; + + result0 = max(result0, activation_min); + result1 = max(result1, activation_min); + result2 = max(result2, activation_min); + result3 = max(result3, activation_min); + + result0 = min(result0, activation_max); + result1 = min(result1, activation_max); + result2 = min(result2, activation_max); + result3 = min(result3, activation_max); + + out_data[out_idx++] = result0; + out_data[out_idx++] = result1; + out_data[out_idx++] = result2; + out_data[out_idx++] = result3; + } + + /* left-over */ + for (; ch_mult_idx < ch_mult; ch_mult_idx++) { + int32_t result = 0; + const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[out_ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } + } +} + +void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t 
*input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + int32_t result = 0; + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * channels + ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult[ch_idx], out_shift[ch_idx]); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } +} + +int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims, + const data_dims_t *filter_dims, + const data_dims_t *output_dims, + const dw_conv_params_t *conv_params) +{ + const uint16_t input_wd = input_dims->width; + const uint16_t input_ht = input_dims->height; + const uint16_t channels = input_dims->channels; + const uint16_t filter_wd = filter_dims->width; + const uint16_t filter_ht = filter_dims->height; + const uint16_t ch_mult = conv_params->ch_mult; + const uint16_t out_wd = output_dims->width; + const uint16_t out_ht = output_dims->height; + const uint16_t pad_wd = conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + + int filter_size = filter_wd * filter_ht * channels * ch_mult; + int pad_width = 0, pad_height = 0; + + if ((ch_mult == 1) && (channels % 8 == 0) && (filter_wd == 3) && (filter_ht == 3)) { + if (channels % 16 == 0) { + if (pad_wd || pad_ht) { + pad_width = pad_wd * 2; + pad_height = pad_ht * 2; + } else { + // check if we need to pad additionally + pad_width = (out_wd * stride_wd + filter_wd - 1) - input_wd; + pad_height = (out_ht * stride_ht + filter_ht - 1) - input_ht; + // printf("in(%d %d %d), out(%d %d), filter (%d %d) stride (%d %d), pad (%d %d)", + // input_wd, input_ht, channels, out_wd, out_ht, filter_wd, filter_ht, + // stride_wd, stride_ht, pad_wd, pad_ht); + } + if (pad_width || pad_height) 
{ + int input_size = (input_wd + pad_width) * (input_ht + pad_height) * channels; + // printf("ask1 %d\n", filter_size + input_size + 16); + return filter_size + input_size + 16; // 16 for alignment + } else { + // printf("ask2 %d\n", filter_size + 16); + return filter_size + 16; // 16 for alignment + } + } else { + int input_size = input_wd * input_ht * channels; + // printf("ask3 %d\n", 2 * (filter_size + input_size) + 16); + return 2 * (filter_size + input_size) + 16; // 16 for alignment + } + } else if (ch_mult % 4 == 0) { + int input_size = input_wd * input_ht * channels; + // printf("ask4 %d\n", 2 * (filter_size + input_size) + 16); + return 2 * (filter_size + input_size) + 16; // 16 for alignment + } + return 32; // just a few bytes +} + +void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf) +{ + scratch_buffer = (int16_t *) buf; +} + +/** + * Assumption 1: i/p channels == o/p channels + * Assumption 2: Pointers are valid + * Assumption 3: dilation width = 1 + */ + +void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, + const int8_t *input_data, + const data_dims_t *filter_dims, + const int8_t *filter_data, + const int32_t *bias, + const data_dims_t *output_dims, + int8_t *out_data, + const dw_conv_params_t *conv_params, + const quant_data_t *quant_data) +{ + const uint16_t input_wd = input_dims->width; + const uint16_t input_ht = input_dims->height; + const uint16_t channels = input_dims->channels; + const int32_t input_offset = conv_params->in_offset; + const int32_t out_offset = conv_params->out_offset; + const uint16_t pad_wd = conv_params->padding.width; + const uint16_t pad_ht = conv_params->padding.height; + const uint16_t stride_wd = conv_params->stride.width; + const uint16_t stride_ht = conv_params->stride.height; + const uint16_t filter_wd = filter_dims->width; + const uint16_t filter_ht = filter_dims->height; + const uint16_t out_wd = output_dims->width; + const uint16_t out_ht = output_dims->height; + const int32_t *out_shift = quant_data->shift; + const int32_t *out_mult = quant_data->mult; + const int32_t activation_min = conv_params->activation.min; + const int32_t activation_max = conv_params->activation.max; + const uint16_t ch_mult = conv_params->ch_mult; + + int filter_size = filter_wd * filter_ht * channels * ch_mult; + int align_len = 16 - (filter_size & 15); + int input_size = input_wd * input_ht * channels; + int16_t *filter_data16 = scratch_buffer; + int16_t *input_data16 = scratch_buffer + filter_size + align_len; + if (scratch_buffer == NULL) { + printf("esp_nn_depthwise_conv error! 
scratch_buffer not set!\n"); + return; + } + + if ((ch_mult == 1) && (channels % 8 == 0)) { + if ((filter_wd == 3) && (filter_ht == 3)) { + if ((channels % 16 == 0) && (pad_wd == 1) && (pad_ht == 1)) { + /* process in 8 bits */ + int8_t *filter_aligned = (int8_t *) scratch_buffer; + int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; + memcpy(filter_aligned, filter_data, filter_size); + esp_nn_aligned_s8_pad_with_value(input_data, input_padded, input_wd, input_ht, channels, + -input_offset, pad_wd, pad_ht); + esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + 2 * pad_wd, + input_ht + 2 * pad_ht, channels, input_offset, + stride_wd, stride_ht, filter_aligned, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else if ((channels % 16 == 0) && (pad_wd == 0) && (pad_ht == 0)) { + /* process in 8 bits */ + int8_t *filter_aligned = (int8_t *) scratch_buffer; + int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; + + // check if we need to pad additionally + int pad_right = (out_wd * stride_wd + filter_wd - 1) - input_wd; + int pad_bottom = (out_ht * stride_ht + filter_ht - 1) - input_ht; + if (pad_right || pad_bottom) { // pad right and bottom + esp_nn_aligned_s8_pad_end_with_value(input_data, input_padded, input_wd, input_ht, + channels, -input_offset, pad_right, pad_bottom); + } else { + input_padded = (int8_t *) input_data; + } + memcpy(filter_aligned, filter_data, filter_size); + esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + pad_right, + input_ht + pad_bottom, channels, input_offset, + stride_wd, stride_ht, filter_aligned, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { /* (channels % 8) == 0 */ + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); + esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else { // all other ch_mult == 1, `channels % 8 == 0` + esp_nn_depthwise_conv_s8_ch_mult1(input_data, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, + filter_data, filter_wd, filter_ht, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else if (ch_mult % 8 == 0) { + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); + if (filter_wd == 3 && filter_ht == 3) { + esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { + esp_nn_depthwise_conv_s16_mult8_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else if (ch_mult % 4 == 0) { + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, 
input_data16, input_size, input_offset); + esp_nn_depthwise_conv_s16_mult4_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { + esp_nn_depthwise_conv_s8_unrolled(input_data, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data, filter_wd, filter_ht, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } +} diff --git a/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c b/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c new file mode 100644 index 00000000..6d800bc5 --- /dev/null +++ b/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c @@ -0,0 +1,50 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdint.h> + +#include <common_functions.h> /* assumed: the angle-bracket include targets were lost in formatting; this header declares esp_nn_multiply_by_quantized_mult and the max/min helpers */ + +void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, + const int32_t input_offset, + const uint16_t row_len, + const int8_t *filter_data, + const int32_t filter_offset, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + for (int32_t out_c = 0; out_c < out_channels; ++out_c) { + int32_t result = 0; + for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { + int32_t filter_index = row_len * out_c + data_idx; + int32_t input_val = input_data[data_idx]; + int32_t filter_val = filter_data[filter_index]; + result += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias) { + result += bias[out_c]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + out_data[out_c] = (int8_t) result; + } +} diff --git a/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c b/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c new file mode 100644 index 00000000..03846aa0 --- /dev/null +++ b/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c @@ -0,0 +1,72 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
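For orientation before the pooling sources, a small self-contained sketch of calling `esp_nn_fully_connected_s8_ansi` as defined above. The quantization parameters are illustrative, not taken from the library: with `out_mult = 1 << 30` and `out_shift = -15` the accumulator is rescaled by roughly 2^-16 before the offset and clamping are applied.

```c
#include <stdint.h>
#include <stdio.h>

/* Prototype copied from the ANSI source above. */
void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, const int32_t input_offset,
                                    const uint16_t row_len, const int8_t *filter_data,
                                    const int32_t filter_offset, const int32_t *bias,
                                    int8_t *out_data, const uint16_t out_channels,
                                    const int32_t out_offset, const int32_t out_shift,
                                    const int32_t out_mult, const int32_t activation_min,
                                    const int32_t activation_max);

int main(void)
{
    const int8_t input[3]  = {10, -5, 3};
    const int8_t filter[6] = {1, 2, 3,   /* weights for output channel 0 */
                              -1, 0, 4}; /* weights for output channel 1 */
    const int32_t bias[2]  = {100, -50};
    int8_t out[2];

    /* mult = 1<<30 with shift = -15 scales the int32 accumulator by ~2^-16 */
    esp_nn_fully_connected_s8_ansi(input, /*input_offset=*/0, /*row_len=*/3,
                                   filter, /*filter_offset=*/0, bias, out,
                                   /*out_channels=*/2, /*out_offset=*/0,
                                   /*out_shift=*/-15, /*out_mult=*/1 << 30,
                                   /*activation_min=*/-128, /*activation_max=*/127);
    printf("%d %d\n", out[0], out[1]);
    return 0;
}
```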
+ +#include <stdint.h> + +#include <common_functions.h> /* assumed: include targets were lost in formatting; provides the max/min helpers used below */ + +void esp_nn_avg_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels) +{ + int32_t base_y = -pad_ht; + for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { + int32_t base_x = -pad_wd; + for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { + for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) { + int32_t result = 0; + int32_t filter_cnt = 0; + /* Make sure filter does not cross the input box */ + int32_t filter_y_start = max(0, -base_y); + int32_t filter_x_start = max(0, -base_x); + + int32_t filter_y_end = min(filter_ht, input_ht - base_y); + int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) { + for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) { + int32_t in_x_idx = base_x + filter_x; + int32_t in_y_idx = base_y + filter_y; + int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx; + result += input[input_index]; + filter_cnt++; + } + } + + /* Rounded average */ + result = result > 0 ? (result + filter_cnt / 2) / filter_cnt + : (result - filter_cnt / 2) / filter_cnt; + + /* Activation function */ + result = max(result, activation_min); + result = min(result, activation_max); + + int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx; + output[output_index] = (int8_t) result; + } + } + } +} diff --git a/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c b/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c new file mode 100644 index 00000000..4ca5c42d --- /dev/null +++ b/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c @@ -0,0 +1,66 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
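The rounded-average step in `esp_nn_avg_pool_s8_ansi` above divides the window sum with round-half-away-from-zero semantics rather than C's truncating division. A standalone check of the same idiom:

```c
#include <stdio.h>

/* Round-half-away-from-zero division, as used by the average-pool kernel. */
static int round_div(int sum, int cnt)
{
    return sum > 0 ? (sum + cnt / 2) / cnt : (sum - cnt / 2) / cnt;
}

int main(void)
{
    printf("%d\n", round_div(7, 4));  /*  1.75 rounds to  2 */
    printf("%d\n", round_div(-7, 4)); /* -1.75 rounds to -2 */
    printf("%d\n", round_div(6, 4));  /*  1.5  rounds to  2 */
    return 0;
}
```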
+ +#include <stdint.h> + +#include <common_functions.h> /* assumed: same helper header as the other ANSI kernels; original include targets were lost in formatting */ + +void esp_nn_max_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels) +{ + int32_t base_y = -pad_ht; + for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { + int32_t base_x = -pad_wd; + for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { + /* Make sure filter does not cross the input box */ + int32_t filter_y_start = max(0, -base_y); + int32_t filter_x_start = max(0, -base_x); + int32_t filter_y_end = min(filter_ht, input_ht - base_y); + int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) { + int8_t result = INT8_MIN; + + for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) { + for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) { + int32_t in_x_idx = base_x + filter_x; + int32_t in_y_idx = base_y + filter_y; + int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx; + result = max(input[input_index], result); + } + } + + /* Activation function */ + result = max(result, activation_min); + result = min(result, activation_max); + + int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx; + output[output_index] = result; + } + } + } +} diff --git a/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c b/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c new file mode 100644 index 00000000..d71a8616 --- /dev/null +++ b/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c @@ -0,0 +1,88 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "softmax_common.h" + +int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height) +{ + (void) width; + (void) height; + return 0; +} + +void esp_nn_set_softmax_scratch_buf_ansi(void *buffer) +{ + (void) buffer; + return; +} + +void esp_nn_softmax_s8_ansi(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data) +{ + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input mult, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is.
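+ // In this Q5.26 scheme, diff_min (supplied by the caller) is the most
+ // negative input difference whose rescaled exp() still contributes to the
+ // running sum; anything below it is treated as probability zero and is
+ // emitted as -128 in the second pass (inferred from the loops below).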
+#define ACCUM_BITS 12 +#define DIFF_BITS 5 + + const int32_t mask = (1 << shift); + int32_t col = 0; + const int8_t *in_ptr = input_data; + int8_t *out_ptr = output_data; + + for (int row_idx = 0; row_idx < height; row_idx++) { + int8_t max_in_row = in_ptr[0]; + for (col = 1; col < width; col++) { + max_in_row = max(max_in_row, in_ptr[col]); + } + + int32_t input_diff = 0; + int32_t sum_of_exps = 0; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); + } + } + + const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); + const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); + const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); + const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; + out_ptr[col] = (int8_t) esp_nn_saturate8(result); + } else { + out_ptr[col] = -128; + } + } + in_ptr += width; + out_ptr += width; + } +} diff --git a/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c b/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c new file mode 100644 index 00000000..93337d32 --- /dev/null +++ b/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c @@ -0,0 +1,108 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "softmax_common.h" +#include + +static int32_t *scratch_buf = NULL; + +/** + * @brief Get scratch buffer size needed by softmax function + * + * @param width + * @param height + * @return size in bytes + * + * @note buffer must be 4 byte aligned + */ +int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height) +{ + (void) height; + return width * 4; +} + +/** + * @brief Set scratch buffer to be used by softmax function + * + * @param buffer this can be NULL if one needs to unset it + * must be aligned to 4 bytes + */ +void esp_nn_set_softmax_scratch_buf_opt(void *buffer) +{ + scratch_buf = (int32_t *) buffer; +} + +void esp_nn_softmax_s8_opt(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data) +{ + if (scratch_buf == NULL) { + printf("%s error! scratch buffer not set\n", __FUNCTION__); + return; + } + // The representation chosen for the input to the exp() function is Q5.26. 
+ // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input mult, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. +#define ACCUM_BITS 12 +#define DIFF_BITS 5 + + const int32_t mask = (1 << shift); + int32_t col = 0; + const int8_t *in_ptr = input_data; + int8_t *out_ptr = output_data; + + for (int row_idx = 0; row_idx < height; row_idx++) { + int8_t max_in_row = in_ptr[0]; + for (col = 1; col < width; col++) { + max_in_row = max(max_in_row, in_ptr[col]); + } + + int32_t input_diff = 0; + int32_t sum_of_exps = 0; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + scratch_buf[col] = exp_raw; // store to avoid duplicate calculation later + sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); + } + } + + const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); + const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); + const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + int32_t exp_raw = scratch_buf[col]; + const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); + const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; + out_ptr[col] = (int8_t) esp_nn_saturate8(result); + } else { + out_ptr[col] = -128; + } + } + in_ptr += width; + out_ptr += width; + } +} diff --git a/code/components/esp-nn/src/softmax/softmax_common.h b/code/components/esp-nn/src/softmax/softmax_common.h new file mode 100644 index 00000000..254d6ace --- /dev/null +++ b/code/components/esp-nn/src/softmax/softmax_common.h @@ -0,0 +1,104 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0 +#define MASK_IF_NON_ZERO(x) (x) != 0 ? 
~0 : 0 +#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b)) +#define SAT_HIGH_MUL(x, y) esp_nn_sat_round_doubling_high_mul((x), (y)) +#define DIV_POW2(x,y) esp_nn_div_by_power_of_two((x), (y)) + +__NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp) +{ + const int32_t thresh = ((1 << (31 - exp)) - 1); + int32_t result = val << exp; + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), INT32_MAX, result); + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), INT32_MIN, result); + return result; +} + +/** + * @brief Calculate `1 / (1 + x)` for x in [0, 1] + * + * @param val input value to calculate `1/(1+x)` for + * @return `int32_t` result + * @note Newton-Raphson division + * + * https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division + * Refer to that page for the logic behind the 48/17 and 32/17 constants. + * Pseudocode: https://en.wikipedia.org/wiki/Division_algorithm#Pseudocode + */ +__NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val) +{ + const int64_t sum = (int64_t) val + INT32_MAX; + const int32_t half_denominator = (int32_t) ((sum + (sum >= 0 ? 1 : -1)) / 2L); + int32_t constant_48_over_17 = 1515870810; + int32_t constant_neg_32_over_17 = -1010580540; + int32_t x = constant_48_over_17 + SAT_HIGH_MUL(half_denominator, constant_neg_32_over_17); + const int32_t fixed_2_one = (1 << 29); + + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + + return mul_power_of_2(x, 1); +} + +#define ONE_OVER_ONE_X(x) esp_nn_one_over_one_plus_x_for_x_in_0_1((x)) + +/** + * @brief Return exp(x) for x < 0. 
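+ * Operates on a Q5.26 fixed-point argument and returns a Q0.31 result:
+ * a small polynomial covers x in [-1/4, 0), and each set bit of the
+ * remainder multiplies in one of the precomputed Q0.31 constants below
+ * (exp(-1/4), exp(-1/2), exp(-1), ..., exp(-16)).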
+ * + */ +__NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val) +{ + int32_t shift = 24; + + const int32_t one_quarter = (1 << shift); + int32_t mask = one_quarter - 1; + const int32_t val_mod_minus_quarter = (val & mask) - one_quarter; + const int32_t remainder = val_mod_minus_quarter - val; + + // calculate exponent for x in [-1/4, 0) in `result` + const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28); + const int32_t x2 = SAT_HIGH_MUL(x, x); + const int32_t x3 = SAT_HIGH_MUL(x2, x); + const int32_t x4 = SAT_HIGH_MUL(x2, x2); + const int32_t one_over_3 = 715827883; + const int32_t one_over_8 = 1895147668; + + const int32_t x4_over_4 = DIV_POW2(x4, 2); + const int32_t x4_over_4_plus_x3_over_6_plus_x2_over_2 = DIV_POW2(SAT_HIGH_MUL(x4_over_4 + x3, one_over_3) + x2, 1); + int32_t result = one_over_8 + SAT_HIGH_MUL(one_over_8, x + x4_over_4_plus_x3_over_6_plus_x2_over_2); + +#define SELECT_IF_NON_ZERO(x) { \ + mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \ + result = SELECT_USING_MASK(mask, SAT_HIGH_MUL(result, x), result); \ +} + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mask = MASK_IF_ZERO(val); + return SELECT_USING_MASK(mask, INT32_MAX, result); +} \ No newline at end of file diff --git a/code/components/esp-nn/test_app/CMakeLists.txt b/code/components/esp-nn/test_app/CMakeLists.txt new file mode 100644 index 00000000..8d332768 --- /dev/null +++ b/code/components/esp-nn/test_app/CMakeLists.txt @@ -0,0 +1,9 @@ +# The following lines of boilerplate have to be in your project's +# CMakeLists in this exact order for cmake to work correctly +cmake_minimum_required(VERSION 3.5) + +set(EXTRA_COMPONENT_DIRS "../" "../tests/") +set(IDF_EXCLUDE_COMPONENTS test test_app) + +include($ENV{IDF_PATH}/tools/cmake/project.cmake) +project(test_app) diff --git a/code/components/esp-nn/test_app/main/CMakeLists.txt b/code/components/esp-nn/test_app/main/CMakeLists.txt new file mode 100644 index 00000000..04161254 --- /dev/null +++ b/code/components/esp-nn/test_app/main/CMakeLists.txt @@ -0,0 +1,7 @@ + +set(COMPONENT_SRCS "main.c") +set(COMPONENT_ADD_INCLUDEDIRS "") + +set(COMPONENT_PRIV_REQUIRES tests) + +register_component() diff --git a/code/components/esp-nn/test_app/main/component.mk b/code/components/esp-nn/test_app/main/component.mk new file mode 100644 index 00000000..5d85ad38 --- /dev/null +++ b/code/components/esp-nn/test_app/main/component.mk @@ -0,0 +1,8 @@ +# +# Main component makefile. +# +# This Makefile can be left empty. By default, it will take the sources in the +# src/ directory, compile them and link them into lib(subdirectory_name).a +# in the build directory. This behaviour is entirely configurable, +# please read the ESP-IDF documents if you need to do this. +# diff --git a/code/components/esp-nn/test_app/main/main.c b/code/components/esp-nn/test_app/main/main.c new file mode 100644 index 00000000..267e35f2 --- /dev/null +++ b/code/components/esp-nn/test_app/main/main.c @@ -0,0 +1,87 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include +#include + +#include +#include + +static const char *TAG = "test_app"; +static uint32_t start_c, start_opt, total_c, total_opt; + +void profile_c_start() +{ + /* initiate profiling */ + start_c = esp_cpu_get_ccount(); +} + +void profile_c_end() +{ + /* record profile number */ + total_c = esp_cpu_get_ccount() - start_c; +} + +void profile_opt_start() +{ + /* initiate profiling */ + start_opt = esp_cpu_get_ccount(); +} + +void profile_opt_end() +{ + /* record profile number */ + total_opt = esp_cpu_get_ccount() - start_opt; +} + +void app_main() +{ + /* s8 tests */ + ESP_LOGI(TAG, "Running s8 tests..."); + esp_nn_add_elementwise_s8_test(); + printf("add, c %u opt %u\n", total_c, total_opt); + esp_nn_mul_elementwise_s8_test(); + printf("mul, c %u opt %u\n", total_c, total_opt); + esp_nn_depthwise_conv_s8_test(); + printf("depthwise, c %u opt %u\n", total_c, total_opt); + esp_nn_conv_s8_test(); + printf("conv2d, c %u opt %u\n", total_c, total_opt); + + esp_nn_relu6_s8_test(); + printf("relu, c %u opt %u\n", total_c, total_opt); + esp_nn_avg_pool_s8_test(); + printf("avg_pool, c %u opt %u\n", total_c, total_opt); + esp_nn_max_pool_s8_test(); + printf("max_pool, c %u opt %u\n", total_c, total_opt); + esp_nn_fully_connected_s8_test(); + printf("fully_connected, c %u opt %u\n", total_c, total_opt); + esp_nn_softmax_s8_test(); + printf("softmax, c %u opt %u\n", total_c, total_opt); + ESP_LOGI(TAG, "s8 tests done!\n"); + + /* u8 tests */ + //ESP_LOGI(TAG, "Running u8 tests..."); + //esp_nn_add_elementwise_u8_test(); + //esp_nn_depthwise_conv_u8_test(); + //esp_nn_conv_u8_test(); + //esp_nn_avg_pool_u8_test(); + //esp_nn_max_pool_u8_test(); + //esp_nn_fully_connected_u8_test(); + //ESP_LOGI(TAG, "u8 tests done!\n"); +} diff --git a/code/components/esp-nn/test_app/sdkconfig.defaults b/code/components/esp-nn/test_app/sdkconfig.defaults new file mode 100644 index 00000000..bb37aac5 --- /dev/null +++ b/code/components/esp-nn/test_app/sdkconfig.defaults @@ -0,0 +1,5 @@ + +# +# esp-nn +# +CONFIG_NN_ESP32=y diff --git a/code/components/esp-nn/test_app/sdkconfig.defaults.esp32s3 b/code/components/esp-nn/test_app/sdkconfig.defaults.esp32s3 new file mode 100644 index 00000000..1adc4b01 --- /dev/null +++ b/code/components/esp-nn/test_app/sdkconfig.defaults.esp32s3 @@ -0,0 +1,8 @@ +# Default configurations for ESP32-S3 + +CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y +CONFIG_ESP32S3_SPIRAM_SUPPORT=y + +CONFIG_ESP32S3_DATA_CACHE_64KB=y +CONFIG_ESP32S3_DATA_CACHE_8WAYS=y +CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y diff --git a/code/components/esp-nn/tests/CMakeLists.txt b/code/components/esp-nn/tests/CMakeLists.txt new file mode 100644 index 00000000..97ec946f --- /dev/null +++ b/code/components/esp-nn/tests/CMakeLists.txt @@ -0,0 +1,15 @@ + +set(COMPONENT_ADD_INCLUDEDIRS ./include/) +set(COMPONENT_SRCS "src/basic_math_test.c" + "src/convolution_test.c" + "src/fully_connected_test.c" + "src/pooling_test.c" + "src/relu_test.c" + "src/softmax_test.c") + +set(COMPONENT_REQUIRES ) +set(COMPONENT_PRIV_REQUIRES esp-nn) + +register_component() + 
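For reference on the profiling hooks in the test app above: `esp_cpu_get_ccount()` reads the core cycle counter, so each "c vs opt" figure is a plain difference of counts. A small sketch, assuming the 240 MHz core clock configured in the sdkconfig defaults above, for turning such a delta into microseconds:

```c
#include <stdint.h>
#include <stdio.h>

/* Convert a CPU cycle-count delta to microseconds at a 240 MHz core clock.
 * Unsigned subtraction stays correct across a single counter wrap-around. */
static inline uint32_t cycles_to_us(uint32_t start_cycles, uint32_t end_cycles)
{
    return (end_cycles - start_cycles) / 240u;
}

int main(void)
{
    printf("%lu us\n", (unsigned long) cycles_to_us(1000u, 241000u)); /* 1000 us */
    return 0;
}
```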
+target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function) diff --git a/code/components/esp-nn/tests/README.md b/code/components/esp-nn/tests/README.md new file mode 100644 index 00000000..41c94235 --- /dev/null +++ b/code/components/esp-nn/tests/README.md @@ -0,0 +1,4 @@ +# Tests for esp_nn library + +- Include these in your test framework and run the framework. +- For IDF test please refer `test_app` diff --git a/code/components/esp-nn/tests/component.mk b/code/components/esp-nn/tests/component.mk new file mode 100644 index 00000000..2860f3ff --- /dev/null +++ b/code/components/esp-nn/tests/component.mk @@ -0,0 +1,5 @@ +#FIXME + +COMPONENT_ADD_INCLUDEDIRS := include/ + +COMPONENT_SRCDIRS := src/ diff --git a/code/components/esp-nn/tests/include/test_functions.h b/code/components/esp-nn/tests/include/test_functions.h new file mode 100644 index 00000000..3e882efa --- /dev/null +++ b/code/components/esp-nn/tests/include/test_functions.h @@ -0,0 +1,48 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +/* int8_t ops tests */ +void esp_nn_add_elementwise_s8_test(); +void esp_nn_mul_elementwise_s8_test(); + +void esp_nn_depthwise_conv_s8_test(); +void esp_nn_conv_s8_test(); + +void esp_nn_avg_pool_s8_test(); +void esp_nn_max_pool_s8_test(); + +void esp_nn_fully_connected_s8_test(); + +void esp_nn_relu6_s8_test(); + +void esp_nn_softmax_s8_test(); + +/* uint8_t ops tests */ +void esp_nn_add_elementwise_u8_test(); + +void esp_nn_depthwise_conv_u8_test(); +void esp_nn_conv_u8_test(); + +void esp_nn_avg_pool_u8_test(); +void esp_nn_max_pool_u8_test(); + +void esp_nn_fully_connected_u8_test(); + +/* instructions test functions */ +void compare_instructions_test(); +void arith_instructions_test(); +void min_max_instructions_test(); +void bitwise_instructions_test(); +void load_store_instructions_test(); diff --git a/code/components/esp-nn/tests/include/test_utils.h b/code/components/esp-nn/tests/include/test_utils.h new file mode 100644 index 00000000..a152549b --- /dev/null +++ b/code/components/esp-nn/tests/include/test_utils.h @@ -0,0 +1,87 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
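The prototypes in test_functions.h above come in ANSI/optimized pairs, and every test below follows the same structure: run the reference kernel, run the optimized kernel on identical inputs, then compare the outputs byte for byte. A condensed illustration of that pattern (the relu stand-ins and `check_pair` are hypothetical, not library code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef void (*kernel_fn)(const int8_t *in, int8_t *out, int len);

/* Toy stand-ins for an ANSI reference kernel and its optimized twin. */
static void relu_ansi(const int8_t *in, int8_t *out, int len)
{
    for (int i = 0; i < len; i++) {
        out[i] = in[i] > 0 ? in[i] : 0;
    }
}

static void relu_opt(const int8_t *in, int8_t *out, int len)
{
    relu_ansi(in, out, len); /* pretend this is the hand-tuned version */
}

static bool check_pair(kernel_fn reference, kernel_fn optimized,
                       const int8_t *in, int8_t *out_c, int8_t *out_opt, int len)
{
    reference(in, out_c, len);   /* ground truth */
    optimized(in, out_opt, len); /* implementation under test */
    return memcmp(out_c, out_opt, len) == 0;
}

int main(void)
{
    const int8_t in[4] = {-3, 0, 5, -128};
    int8_t out_c[4], out_opt[4];
    printf(check_pair(relu_ansi, relu_opt, in, out_c, out_opt, 4) ? "passed\n"
                                                                  : "failed\n");
    return 0;
}
```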
+ +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> /* assumed: the original include targets were lost in formatting; these cover printf, the fixed-width types, bool, and the allocators used by the test sources */ + +/* mult value range */ +#define MULT_MAX INT32_MAX +#define MULT_MIN 0 + +/* shift value range */ +#define SHIFT_MIN -31 +#define SHIFT_MAX 30 + +/** + * @brief callback function to run before C function + */ +void profile_c_start(); + +/** + * @brief callback function to run after C function + */ +void profile_c_end(); + +/** + * @brief callback function to run before optimized function + */ +void profile_opt_start(); + +/** + * @brief callback function to run after optimized function + */ +void profile_opt_end(); + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" + +#define CHECK_EQUAL(ARRAY1, ARRAY2, size) ({ \ + bool res = true; \ + for (int _i = 0; _i < size; _i++) { \ + if (ARRAY1[_i] != ARRAY2[_i]) { \ + res = false; \ + break; \ + } \ + } \ + res; \ +}) + +#define PRINT_ARRAY_INT(ARRAY, width, height) ({ \ + int *_array = (int *) ARRAY; \ + for (int _j = 0; _j < height; _j++) { \ + for (int _i = 0; _i < width; _i++) { \ + printf("%d\t", _array[width * _j + _i]); \ + } \ + printf("\n"); \ + } \ + printf("\n"); \ +}) + +#define PRINT_ARRAY_HEX(ARRAY, width, height) ({ \ + uint8_t *_array = (uint8_t *) ARRAY; \ + for (int _j = 0; _j < height; _j++) { \ + for (int _i = 0; _i < width; _i++) { \ + printf("%02x\t", _array[width * _j + _i]); \ + } \ + printf("\n"); \ + } \ + printf("\n"); \ +}) diff --git a/code/components/esp-nn/tests/src/basic_math_test.c b/code/components/esp-nn/tests/src/basic_math_test.c new file mode 100644 index 00000000..715d7c78 --- /dev/null +++ b/code/components/esp-nn/tests/src/basic_math_test.c @@ -0,0 +1,355 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
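The allocation idiom repeated throughout the tests below requests `size + 32` bytes and then advances the returned pointer to the next 16-byte boundary, because the optimized kernels rely on aligned access; the original unaligned pointer is kept for `free()`. The same idiom in isolation:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Advance `raw` to the next 16-byte boundary (always moves 1..16 bytes),
 * mirroring the `16 + p - ((uint32_t) p & 0xf)` expression in the tests. */
static int8_t *align16(int8_t *raw)
{
    return raw + (16 - ((uintptr_t) raw & 0xf));
}

int main(void)
{
    int8_t *orig = malloc(100 + 32); /* keep `orig` around for free() */
    int8_t *buf = align16(orig);
    printf("aligned: %d\n", ((uintptr_t) buf & 0xf) == 0);
    free(orig);
    return 0;
}
```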
+ +#include +#include +#include +#include +#include + +#include +#include +#include "test_utils.h" + +#if CONFIG_IDF_CMAKE +#if (CONFIG_SPIRAM_SUPPORT && (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC)) +#define IDF_HEAP_CAPS 1 +#endif + +#if IDF_HEAP_CAPS +#include "esp_heap_caps.h" +#endif +#endif + +void esp_nn_add_elementwise_s8_test() +{ + /* prepare data */ + const int size = 1600 + 8 + 7; /* odd len to test leftover */ + int8_t *input1; + int8_t *input2; + int8_t *out_data_c; + int8_t *out_data_opt; + int8_t *input1_orig = NULL; + int8_t *input2_orig = NULL; + int8_t *out_c_orig = NULL; + int8_t *out_opt_orig = NULL; + int32_t input1_offset = 34; + int32_t input2_offset = 35; + int32_t output_offset = 36; + int32_t input1_shift = -8; // right_shift amt always <= 0 + int32_t input2_shift = -8; // right_shift amt always <= 0 + int32_t output_shift = -9; // right_shift amt always <= 0 + int32_t left_shift = 15; // always +ve + int32_t input1_mult = INT32_MAX; + int32_t input2_mult = INT32_MAX; + int32_t output_mult = INT32_MAX; + int32_t activation_min = -128; + int32_t activation_max = 127; + + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // all zeros + input1_offset = 0; + input2_offset = 0; + output_offset = 0; + input1_mult = 0; + input2_mult = 0; + output_mult = 0; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 0; + break; + case 1: // hit min + input1_offset = -127; + input2_offset = -127; + output_offset = -128; + input1_mult = MULT_MIN; + input2_mult = MULT_MIN; + output_mult = MULT_MIN; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 0; + break; + case 2: // hit max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + input1_mult = MULT_MAX; + input2_mult = MULT_MAX; + output_mult = MULT_MAX; + input1_shift = SHIFT_MIN; + input2_shift = SHIFT_MIN; + output_shift = SHIFT_MIN; + left_shift = 30 - 8; // since input is 8 bits + break; + case 3: // hit extreme max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + input1_mult = MULT_MAX; + input2_mult = MULT_MAX; + output_mult = MULT_MAX; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 30 - 8; // -8 since input is 8 bit + break; + default: // practical random input + input1_offset = rand() % 256 - 127; // range [-127, 128] + input2_offset = rand() % 256 - 127; // range [-127, 128] + output_offset = rand() % 256 - 128; // range [-128, 127] + input1_mult = MULT_MAX / 2 + rand() % INT16_MAX; + input2_mult = MULT_MAX / 2 + rand() % INT16_MAX; + output_mult = MULT_MAX / 2 + rand() % INT16_MAX; + input1_shift = -8 + rand() % 4; + input2_shift = -8 + rand() % 4; + output_shift = -8 + rand() % 4; + left_shift = rand() % 15; + } +#if IDF_HEAP_CAPS + input1_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + input2_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_c_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_opt_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input1 = 16 + input1_orig - ((uint32_t) input1_orig & 0xf); + input2 = 16 + input2_orig - ((uint32_t) input2_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input1 = memalign(16, size); + input2 = memalign(16, size); + out_data_c = memalign(16, size); + out_data_opt = 
memalign(16, size); + + input1_orig = input1; + input2_orig = input2; + out_c_orig = out_data_c; + out_opt_orig = out_data_opt; +#endif + if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || + out_opt_orig == NULL) { + printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); + goto elementwise_add_test_cleanup; + } + + for (int i = 0; i < size; ++i) { + input1[i] = rand() % 256 - 128; + input2[i] = rand() % 256 - 128; + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + /* C function */ + esp_nn_add_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, + input1_mult, input2_mult, input1_shift, input2_shift, + left_shift, out_data_c, output_offset, output_mult, + output_shift, activation_min, activation_max, size); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_add_elementwise_s8(input1, input2, input1_offset, input2_offset, + input1_mult, input2_mult, input1_shift, input2_shift, + left_shift, out_data_opt, output_offset, output_mult, + output_shift, activation_min, activation_max, size); + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, size, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, size, 1); + printf("Input1:\n"); + PRINT_ARRAY_HEX(input1, size, 1); + printf("Input2:\n"); + PRINT_ARRAY_HEX(input2, size, 1); + printf("in1_shift %d, in2_shift %d, left_shift %d, out_shift %d\n", + input1_shift, input2_shift, left_shift, output_shift); + printf("in1_mult %d, in2_mult %d, out_mult %d\n", input1_mult, input2_mult, output_mult); + goto elementwise_add_test_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + +elementwise_add_test_cleanup: + if (input1_orig) { + free(input1_orig); + } + if (input2_orig) { + free(input2_orig); + } + if (out_c_orig) { + free(out_c_orig); + } + if (out_opt_orig) { + free(out_opt_orig); + } + } +} + +void esp_nn_mul_elementwise_s8_test() +{ + /* prepare data */ + const int size = 1600 + 8 + 7; /* odd len to test leftover */ + int8_t *input1; + int8_t *input2; + int8_t *out_data_c; + int8_t *out_data_opt; + int32_t input1_offset = 34; + int32_t input2_offset = 35; + int32_t output_offset = 36; + int32_t output_shift = -7; + int32_t output_mult = MULT_MAX; // max out_mult + int32_t activation_min = -128; + int32_t activation_max = 127; + int8_t *input1_orig = NULL; + int8_t *input2_orig = NULL; + int8_t *out_c_orig = NULL; + int8_t *out_opt_orig = NULL; + + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // all zeros + input1_offset = 0; + input2_offset = 0; + output_offset = 0; + output_mult = 0; + output_shift = 0; + break; + case 1: // hit min + input1_offset = -127; + input2_offset = -127; + output_offset = -128; + output_mult = MULT_MIN; + output_shift = 0; + break; + case 2: // hit max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + output_mult = MULT_MAX; + output_shift = SHIFT_MIN; + break; + case 3: // hit extreme max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + output_mult = MULT_MAX; + output_shift = 0; + break; + default: // practical random input + input1_offset = rand() % 256 - 127; // range [-127, 128] + input2_offset = rand() % 256 - 127; // range [-127, 128] + 
output_offset = rand() % 256 - 128; // range [-128, 127] + output_mult = MULT_MAX / 2 + rand() % INT16_MAX; + output_shift = -8 + rand() % 4; + } + +#if IDF_HEAP_CAPS + input1_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + input2_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_c_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_opt_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input1 = 16 + input1_orig - ((uint32_t) input1_orig & 0xf); + input2 = 16 + input2_orig - ((uint32_t) input2_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input1 = memalign(16, size); + input2 = memalign(16, size); + out_data_c = memalign(16, size); + out_data_opt = memalign(16, size); + + input1_orig = input1; + input2_orig = input2; + out_c_orig = out_data_c; + out_opt_orig = out_data_opt; +#endif + if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || + out_opt_orig == NULL) { + printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); + goto elementwise_mult_test_cleanup; + } + + for (int i = 0; i < size; ++i) { + input1[i] = rand() % 256 - 128; + input2[i] = rand() % 256 - 128; + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + /* C function */ + esp_nn_mul_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, + out_data_c, output_offset, output_mult, output_shift, + activation_min, activation_max, size); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + /* Optimized function */ + esp_nn_mul_elementwise_s8(input1, input2, input1_offset, input2_offset, + out_data_opt, output_offset, output_mult, output_shift, + activation_min, activation_max, size); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, size, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, size, 1); + printf("Input1:\n"); + PRINT_ARRAY_HEX(input1, size, 1); + printf("Input2:\n"); + PRINT_ARRAY_HEX(input2, size, 1); + goto elementwise_mult_test_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + +elementwise_mult_test_cleanup: + if (input1_orig) { + free(input1_orig); + } + if (input2_orig) { + free(input2_orig); + } + if (out_c_orig) { + free(out_c_orig); + } + if (out_opt_orig) { + free(out_opt_orig); + } + } +} diff --git a/code/components/esp-nn/tests/src/convolution_test.c b/code/components/esp-nn/tests/src/convolution_test.c new file mode 100644 index 00000000..c86bdbab --- /dev/null +++ b/code/components/esp-nn/tests/src/convolution_test.c @@ -0,0 +1,605 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include "test_utils.h" + +#if CONFIG_IDF_CMAKE +#if (CONFIG_SPIRAM_SUPPORT && (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC)) +#define IDF_HEAP_CAPS 1 +#endif +#if IDF_HEAP_CAPS +#include "esp_heap_caps.h" +#endif +#endif + +void esp_nn_depthwise_conv_s8_test() +{ + int8_t *input = NULL, *filter_data = NULL, *out_data_c = NULL, *out_data_opt = NULL; + int32_t *bias = NULL; + int32_t input_offset = 5; /* some number in [-128, 127] */ + int32_t out_offset = 7; + int32_t activation_min = -125; + int32_t activation_max = 120; + void *scratch_buf = NULL; + + /* independent variables */ + int input_wd, input_ht, channels; + uint16_t filter_ht, filter_wd, ch_mult; + uint16_t pad_wd, pad_ht, stride_wd, stride_ht; + + // run for 15 iterations + for (int itr = 0; itr < 15; itr++) { + /* prepare data */ + switch (itr) { + case 0: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) + input_wd = 18; + input_ht = 18; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 1: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (1,1) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 2: // (ch_mult 1, (channels % 8) = 0), filter (3,3), pad (1,1) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 24; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 3: // other filter sizes (ch_mult 1, (channels % 8) = 0) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 24; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 4: // other filter sizes (ch_mult 8 = 0) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 8; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 5: // other filter sizes (ch_mult 8 = 0) + input_wd = 12; + input_ht = 12; + filter_ht = 5; + filter_wd = 5; + ch_mult = 8; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 6: // other filter sizes (ch_mult 4 = 0) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 4; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 7: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + case 8: // same as case 7, with large parameters + input_wd = 58; + input_ht = 58; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 128; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + case 9: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + default: + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + stride_wd = rand() % 2 + 1; + stride_ht = stride_wd; + pad_wd = stride_wd == 1 ? 
0 : rand() % 2; + pad_ht = pad_wd; + printf("stride(%d), pad (%d)\t", stride_wd, pad_wd); + break; + } + + uint16_t out_wd = (input_wd - filter_wd + 1) / stride_wd; + uint16_t out_ht = (input_ht - filter_ht + 1) / stride_ht; + if (itr == 9) { + // expect the function to handle this gracefully + out_wd += 1; + out_ht += 1; + } + int in_size = input_wd * input_ht * channels; + int out_size = out_wd * out_ht * channels * ch_mult; + int filter_size = filter_wd * filter_ht * channels * ch_mult + 4; + int bias_size = channels * ch_mult + 1; + int32_t out_shift[channels * ch_mult]; + int32_t out_mult[channels * ch_mult]; + +#if IDF_HEAP_CAPS + int8_t *input_orig = (int8_t *) heap_caps_malloc(in_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int8_t *out_c_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int8_t *out_opt_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + filter_data = (int8_t *) heap_caps_malloc(filter_size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + bias = (int32_t *) heap_caps_malloc(bias_size * 4, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input = 16 + input_orig - ((uint32_t) input_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input = memalign(16, in_size + 16); + filter_data = memalign(16, filter_size); + out_data_c = memalign(16, out_size + 16); + out_data_opt = memalign(16, out_size + 16); + bias = memalign(16, bias_size * 4); + int8_t *input_orig = input; + int8_t *out_c_orig = out_data_c; + int8_t *out_opt_orig = out_data_opt; +#endif + if (bias == NULL || input == NULL || filter_data == NULL || + out_data_c == NULL || out_data_opt == NULL || bias == NULL) { + printf(ANSI_COLOR_RED"%s[%d] allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + goto dc_s8_cleanup; + } + + /* Generate input data */ + for (int i = 0; i < in_size; ++i) { + input[i] = rand() % 128; + } + + /* Generate filter data */ + for (int i = 0; i < filter_size; ++i) { + filter_data[i] = rand() % 256 - 128; + } + + /* Generate bias data */ + for (int i = 0; i < channels * ch_mult; ++i) { + bias[i + 1] = rand() % INT16_MAX; //0th index left for unalignment + out_shift[i] = -8 + rand() % 3; + out_mult[i] = 0x7eb0e200 + rand() % 50; + } + + data_dims_t input_dims = {.width = input_wd, .height = input_ht, .channels = channels, 1}; + data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = channels * ch_mult, 1}; + data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, 0, 0}; + dw_conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset, .ch_mult = ch_mult, + .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht}, + .dilation = {0, 0}, .activation = {activation_min, activation_max}}; + quant_data_t quant_data = {.shift = out_shift, .mult = out_mult}; + + int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(&input_dims, &filter_dims, + &output_dims, &conv_params); + if (scratch_buf_size > 0) { +#if IDF_HEAP_CAPS + scratch_buf = heap_caps_malloc(scratch_buf_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); +#else + scratch_buf = memalign(16, scratch_buf_size); + int align_sz = 0; +#endif + if (scratch_buf == NULL) { + printf(ANSI_COLOR_RED"%s[%d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, + __FUNCTION__, itr, scratch_buf_size); + goto dc_s8_cleanup; + } + 
esp_nn_set_depthwise_conv_scratch_buf(scratch_buf + align_sz); + } + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + + /* C function */ + esp_nn_depthwise_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data + 4, + bias + 1, &output_dims, out_data_c, &conv_params, &quant_data); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_depthwise_conv_s8(&input_dims, input, &filter_dims, filter_data + 4, + bias + 1, &output_dims, out_data_opt, &conv_params, &quant_data); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, in_size / input_ht, input_ht); + printf("Filter data:\n"); + PRINT_ARRAY_HEX(filter_data + 4, (filter_size - 4) / filter_ht, filter_ht); + printf("bias data:\n"); + PRINT_ARRAY_INT(bias + 1, ch_mult * channels, 1); + goto dc_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + + dc_s8_cleanup: + if (input) { + free(input_orig); + } + if (filter_data) { + free(filter_data); + } + if (out_data_c) { + free(out_c_orig); + } + if (out_data_opt) { + free(out_opt_orig); + } + if (bias) { + free(bias); + } + if (scratch_buf) { + free(scratch_buf); + } + } +} + +void esp_nn_conv_s8_test() +{ + const int32_t input_offset = 5; /* some number in [-128, 127] */ + const int32_t activation_min = -125; + const int32_t activation_max = 122; + const int32_t out_offset = 3; + + void *scratch_buf = NULL; + int8_t *input_orig; + int8_t *out_c_orig; + int8_t *out_opt_orig; + int8_t *filter_data; + int32_t *bias; + + /* independent variable */ + int in_wd, in_ht, in_channels, out_channels; + uint16_t filter_ht, filter_wd; + uint16_t pad_wd, pad_ht, stride_wd, stride_ht; + + // run for 10 iterations + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // ch % 8 == 0 && filter (1,1), padding (0,0) + in_wd = 10; + in_ht = 10; + in_channels = 64; + out_channels = 64; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 1: // ch % 4 == 0 && (in_wd * in_ht) % 16 == 0 + in_wd = 4; + in_ht = 4; + in_channels = 20; + out_channels = 8; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 2: // ch, filter (3x3x3) + in_wd = 10; + in_ht = 10; + in_channels = 3; + out_channels = 64; + filter_ht = 3; + filter_wd = 3; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 3: // remaining pad (0, 0) + in_wd = 10; + in_ht = 10; + in_channels = 3; + out_channels = 64; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 4: // unopt case + in_wd = 10; + in_ht = 10; + in_channels = 12; + out_channels = 64; + filter_ht = 3; + filter_wd = 3; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 5: // ch % 8 == 0 & stride (2,2) + in_wd = 16; + in_ht = 16; + in_channels = 16; + out_channels = 16; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + case 6: // ch % 8 == 0 && filter (1,1), padding (0,0) + in_wd = 2; + in_ht = 2; + 
in_channels = 8;
+            out_channels = 8;
+            filter_ht = 1;
+            filter_wd = 1;
+            pad_wd = 0;
+            pad_ht = 0;
+            stride_wd = 1;
+            stride_ht = 1;
+            break;
+        default: // ch % 8 == 0
+            in_wd = 8;
+            in_ht = 8;
+            in_channels = 16;
+            out_channels = 16;
+            filter_ht = 1;
+            filter_wd = 1;
+            pad_wd = 0;
+            pad_ht = 0;
+            stride_wd = 1;
+            stride_ht = 1;
+            break;
+        }
+
+        /* prepare data */
+        uint16_t out_wd = (in_wd - filter_wd + 1) / stride_wd;
+        uint16_t out_ht = (in_ht - filter_ht + 1) / stride_ht;
+
+        int in_size = in_wd * in_ht * in_channels;
+        int filter_size = filter_wd * filter_ht * in_channels * out_channels + 2;
+        int out_size = out_wd * out_ht * out_channels;
+
+#if IDF_HEAP_CAPS
+        input_orig = (int8_t *) heap_caps_malloc(in_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+        out_c_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+        out_opt_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+        filter_data = (int8_t *) heap_caps_malloc(filter_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+        bias = (int32_t *) heap_caps_malloc(128 + sizeof (int32_t) * out_channels, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+
+        int8_t *input = 16 + input_orig - ((uint32_t) input_orig & 0xf);
+        int8_t *out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf);
+        int8_t *out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf);
+#else
+        int8_t *input = memalign(16, in_size);
+        int8_t *out_data_c = memalign(16, out_size);
+        int8_t *out_data_opt = memalign(16, out_size);
+        filter_data = memalign(16, filter_size);
+        bias = calloc(1, 128 + sizeof (int32_t) * out_channels);
+        input_orig = input;
+        out_c_orig = out_data_c;
+        out_opt_orig = out_data_opt;
+#endif
+        int32_t *out_shift = calloc(1, 128 + sizeof (int32_t) * out_channels);
+        int32_t *out_mult = calloc(1, 128 + sizeof (int32_t) * out_channels);
+
+        if (input == NULL || filter_data == NULL ||
+                out_data_c == NULL || out_data_opt == NULL) {
+            printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+            goto conv_s8_cleanup;
+        }
+
+        if (bias == NULL || out_shift == NULL || out_mult == NULL) {
+            printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+            goto conv_s8_cleanup;
+        }
+
+        /* Generate input data between -128 -> +127 */
+        for (int i = 0; i < in_size; ++i) {
+            input[i] = rand() % 256 - 128;
+        }
+
+        /* Generate filter data between -128 -> +127 */
+        for (int i = 0; i < filter_size; ++i) {
+            filter_data[i] = rand() % 256 - 128;
+        }
+
+        /* Generate bias data */
+        for (int i = 0; i < out_channels; ++i) {
+            bias[i] = (int32_t)rand() % UINT16_MAX + UINT8_MAX;
+        }
+
+        /* Shift and multiplier */
+        for (int i = 0; i < out_channels; ++i) {
+            out_shift[i] = -10 + rand() % 2;
+            out_mult[i] = 0x7f67f4f8 + rand() % 50;
+        }
+
+        data_dims_t input_dims = {.width = in_wd, .height = in_ht, .channels = in_channels, 1};
+        data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = out_channels, 1};
+        data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, 0, 0};
+        conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset,
+                                     .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht},
+                                     .dilation = {0, 0}, .activation = {activation_min, activation_max}};
+        quant_data_t quant_data = {.shift = out_shift, .mult = out_mult};
+
+        int scratch_buf_size = esp_nn_get_conv_scratch_size(&input_dims, &filter_dims,
+                                                            &output_dims, &conv_params);
+        if (scratch_buf_size > 0) {
+#if IDF_HEAP_CAPS
+            scratch_buf = heap_caps_malloc(scratch_buf_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+            int align_sz = 16 - (((int32_t) scratch_buf) & 0xf);
+#else
+            scratch_buf = memalign(16, scratch_buf_size);
+            int align_sz = 0;
+#endif
+            if (scratch_buf == NULL) {
+                printf(ANSI_COLOR_RED"%s scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, __FUNCTION__, scratch_buf_size);
+                goto conv_s8_cleanup;
+            }
+            esp_nn_set_conv_scratch_buf(scratch_buf + align_sz);
+        }
+
+        if (itr == 0) {
+            /* enable profiler */
+            profile_c_start();
+        }
+
+        /* C function */
+        esp_nn_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data + 2,
+                            bias, &output_dims, out_data_c, &conv_params, &quant_data);
+
+        if (itr == 0) {
+            profile_c_end();
+            profile_opt_start();
+        }
+
+        /* Optimized function */
+        esp_nn_conv_s8(&input_dims, input, &filter_dims, filter_data + 2,
+                       bias, &output_dims, out_data_opt, &conv_params, &quant_data);
+
+        if (itr == 0) {
+            /* disable profiler */
+            profile_opt_end();
+        }
+
+        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size);
+        if (ret == false) {
+            printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr);
+            printf("Output: \n");
+            PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht);
+            printf("Expected: \n");
+            PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht);
+            printf("Input:\n");
+            PRINT_ARRAY_HEX(input, in_size / in_ht, in_ht);
+            printf("Filter data:\n");
+            PRINT_ARRAY_HEX(filter_data + 2, (filter_size - 2) / filter_ht, filter_ht);
+            printf("bias data:\n");
+            PRINT_ARRAY_INT(bias, out_channels, 1);
+            goto conv_s8_cleanup;
+        }
+        printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr);
+
+    conv_s8_cleanup:
+        if (input) {
+            free(input_orig);
+        }
+        if (filter_data) {
+            free(filter_data);
+        }
+        if (out_data_c) {
+            free(out_c_orig);
+        }
+        if (out_data_opt) {
+            free(out_opt_orig);
+        }
+        if (bias) {
+            free(bias);
+        }
+        if (out_shift) {
+            free(out_shift);
+        }
+        if (out_mult) {
+            free(out_mult);
+        }
+        if (scratch_buf) {
+            free(scratch_buf);
+        }
+    }
+}
diff --git a/code/components/esp-nn/tests/src/fully_connected_test.c b/code/components/esp-nn/tests/src/fully_connected_test.c
new file mode 100644
index 00000000..d0210b46
--- /dev/null
+++ b/code/components/esp-nn/tests/src/fully_connected_test.c
@@ -0,0 +1,111 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
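+
+/*
+ * Test pattern used throughout this suite: fill random quantized buffers,
+ * run the portable *_ansi reference kernel, then the platform-optimized
+ * kernel, and require bit-exact agreement via CHECK_EQUAL. The profile_c
+ * and profile_opt hooks time the two paths on the first iteration so the
+ * speed-up of the optimized kernels can be read off the test log.
+ */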
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#include <esp_nn.h>
+#include "test_utils.h"
+
+
+void esp_nn_fully_connected_s8_test()
+{
+    /* prepare data */
+    static uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */
+    static uint16_t out_channels = 3;
+    int8_t input[row_len];
+    int8_t filter_data[row_len * out_channels];
+    int8_t output_c[out_channels], output_opt[out_channels];
+    static int32_t activation_min = -128;
+    static int32_t activation_max = 127;
+    static int32_t input_offset = 0;
+    static int32_t filter_offset = 0;
+    int32_t out_shift = -10;
+    static int32_t out_offset = 127;
+    int32_t out_mult = 0x59e492c4;
+    for (int itr = 0; itr < 5; itr++) {
+        out_mult = INT32_MAX / row_len + rand() % INT16_MAX;
+        switch (itr) {
+        case 0:
+            out_shift = -10;
+            break;
+        case 1:
+            out_shift = SHIFT_MIN;
+            break;
+        case 2:
+            out_shift = SHIFT_MAX;
+            break;
+        case 3:
+            out_shift = 0;
+            break;
+        default:
+            out_shift = -10 + rand() % 5;
+            break;
+        }
+        if (itr == 0) {
+            out_shift = SHIFT_MAX; /* the profiled first iteration runs with the largest shift */
+        }
+        /* Generate input and filter data */
+        for (int i = 0; i < row_len; ++i) {
+            input[i] = rand() % 256 - 128;
+        }
+        for (int i = 0; i < row_len * out_channels; ++i) {
+            filter_data[i] = rand() % 256 - 128;
+        }
+
+        if (itr == 0) {
+            /* enable profiler */
+            profile_c_start();
+        }
+
+        /* C function */
+        esp_nn_fully_connected_s8_ansi(input, input_offset, row_len, filter_data, filter_offset,
+                                       NULL, output_c, out_channels, out_offset, out_shift, out_mult,
+                                       activation_min, activation_max);
+
+        if (itr == 0) {
+            profile_c_end();
+            profile_opt_start();
+        }
+
+        /* Optimized function */
+        esp_nn_fully_connected_s8(input, input_offset, row_len, filter_data, filter_offset,
+                                  NULL, output_opt, out_channels, out_offset, out_shift, out_mult,
+                                  activation_min, activation_max);
+
+        if (itr == 0) {
+            /* disable profiler */
+            profile_opt_end();
+        }
+
+        bool ret = CHECK_EQUAL(output_c, output_opt, out_channels);
+        if (ret == false) {
+            printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr);
+            printf("Output: \n");
+            PRINT_ARRAY_HEX(output_opt, out_channels, 1);
+            printf("Expected: \n");
+            PRINT_ARRAY_HEX(output_c, out_channels, 1);
+            printf("Input:\n");
+            PRINT_ARRAY_HEX(input, row_len, 1);
+            printf("Filter data:\n");
+            PRINT_ARRAY_HEX(filter_data, row_len, out_channels);
+            printf("Out shift: %d\n", out_shift);
+            printf("Out mult: %x\n", out_mult);
+            return;
+        }
+        printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr);
+    }
+}
diff --git a/code/components/esp-nn/tests/src/pooling_test.c b/code/components/esp-nn/tests/src/pooling_test.c
new file mode 100644
index 00000000..c1c889e1
--- /dev/null
+++ b/code/components/esp-nn/tests/src/pooling_test.c
@@ -0,0 +1,184 @@
+// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
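+
+/*
+ * Geometry note for both pooling tests below: with stride 1, a 3x3 filter
+ * and 1-pixel padding, the output keeps the input width/height, which is
+ * why out_wd/out_ht are simply the input dimensions divided by the stride.
+ */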
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <malloc.h>
+
+#include <esp_nn.h>
+#include "test_utils.h"
+
+
+void esp_nn_avg_pool_s8_test()
+{
+    /* prepare data */
+    const uint16_t input_wd = 16;
+    const uint16_t input_ht = 16;
+    const uint16_t channels = 16; /* with TFLite examples this can be as large as 256 */
+    const int size = input_wd * input_ht * channels;
+    int8_t *input, *output_c, *output_opt;
+    const int32_t activation_min = -128;
+    const int32_t activation_max = 127;
+    const uint16_t pad_wd = 1;
+    const uint16_t pad_ht = 1;
+    const uint16_t stride_wd = 1;
+    const uint16_t stride_ht = 1;
+    const uint16_t filter_ht = 3;
+    const uint16_t filter_wd = 3;
+    const uint16_t out_wd = input_wd / stride_wd;
+    const uint16_t out_ht = input_ht / stride_ht;
+    const int out_size = out_wd * out_ht * channels;
+
+    input = memalign(16, size);
+    output_c = memalign(16, out_size);
+    output_opt = memalign(16, out_size);
+
+    if (input == NULL || output_c == NULL || output_opt == NULL) {
+        printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        goto avg_pool_s8_cleanup;
+    }
+    /**
+     * The width/height and channel values may look suspicious, but they are
+     * realistic. They depend on where in the model the pooling layer sits:
+     * towards the end, width/height tend to be smaller and the depth larger.
+     */
+
+    for (int i = 0; i < size; ++i) {
+        input[i] = rand() % 256 - 128;
+    }
+
+    /* enable profiler */
+    profile_c_start();
+
+    /* C function */
+    esp_nn_avg_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht,
+                            stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht,
+                            activation_min, activation_max, channels);
+
+    profile_c_end();
+    profile_opt_start();
+
+    /* Optimized function */
+    esp_nn_avg_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht,
+                       stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht,
+                       activation_min, activation_max, channels);
+
+    /* disable profiler */
+    profile_opt_end();
+
+
+    bool ret = CHECK_EQUAL(output_c, output_opt, out_size);
+    if (ret == false) {
+        printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        printf("Output: \n");
+        PRINT_ARRAY_HEX(output_opt, out_wd * channels, out_ht);
+        printf("Expected: \n");
+        PRINT_ARRAY_HEX(output_c, out_wd * channels, out_ht);
+        printf("Input:\n");
+        PRINT_ARRAY_HEX(input, input_wd * channels, input_ht);
+        goto avg_pool_s8_cleanup;
+    }
+    printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__);
+
+avg_pool_s8_cleanup:
+    if (input) {
+        free(input);
+    }
+    if (output_c) {
+        free(output_c);
+    }
+    if (output_opt) {
+        free(output_opt);
+    }
+}
+
+void esp_nn_max_pool_s8_test()
+{
+    /* prepare data */
+    const uint16_t input_wd = 16;
+    const uint16_t input_ht = 16;
+    const uint16_t channels = 16; /* with TFLite examples this can be as large as 256 */
+    int8_t *input, *output_c, *output_opt;
+    const int size = input_wd * input_ht * channels;
+    const int32_t activation_min = -128;
+    const int32_t activation_max = 127;
+    const uint16_t pad_wd = 1;
+    const uint16_t pad_ht = 1;
+    const uint16_t stride_wd = 1;
+    const uint16_t stride_ht = 1;
+    const uint16_t filter_ht = 3;
+    const uint16_t filter_wd = 3;
+    const uint16_t out_wd = input_wd / stride_wd;
+    const uint16_t out_ht = input_ht / stride_ht;
+    const int out_size = out_wd * out_ht * channels;
+
+    input = memalign(16, size);
+    output_c = memalign(16, out_size);
+    output_opt = memalign(16, out_size);
+
+    if (input == NULL || output_c == NULL || output_opt == NULL) {
+        printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        goto max_pool_s8_cleanup;
+    }
+
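+    /* Identical geometry and full-range random s8 input as the
+       average-pool test above, so the two kernels see the same windows. */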
+ for (int i = 0; i < size; ++i) { + input[i] = rand() % 256 - 128; + } + + /* enable profiler */ + profile_c_start(); + + /* C function */ + esp_nn_max_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + profile_c_end(); + profile_opt_start(); + + /* Optimized function */ + esp_nn_max_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + /* disable profiler */ + profile_opt_end(); + + + bool ret = CHECK_EQUAL(output_c, output_opt, out_wd * out_ht * channels); + if (ret == false) { + printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__); + printf("Output: \n"); + PRINT_ARRAY_HEX(output_opt, out_wd * out_ht * channels, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(output_c, out_wd * out_ht * channels, 1); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, 8, size / 8); + goto max_pool_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__); + +max_pool_s8_cleanup: + if (input) { + free(input); + } + if (output_c) { + free(output_c); + } + if (output_opt) { + free(output_opt); + } +} diff --git a/code/components/esp-nn/tests/src/relu_test.c b/code/components/esp-nn/tests/src/relu_test.c new file mode 100644 index 00000000..ce6f13f1 --- /dev/null +++ b/code/components/esp-nn/tests/src/relu_test.c @@ -0,0 +1,83 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
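+
+/*
+ * esp_nn_relu6_s8 works in place, so the test keeps a pristine copy of the
+ * random input in a separate buffer and feeds identical copies to the ansi
+ * and optimized variants before comparing them element-wise.
+ */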
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <malloc.h>
+
+#include <esp_nn.h>
+#include "test_utils.h"
+
+void esp_nn_relu6_s8_test()
+{
+    const int size = 1600 + 8 + 7;
+    int8_t *input, *inout_ansi, *inout_opt;
+
+    input = memalign(16, size);
+    inout_ansi = memalign(16, size);
+    inout_opt = memalign(16, size);
+
+    if (input == NULL || inout_ansi == NULL || inout_opt == NULL) {
+        printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        goto relu6_s8_cleanup;
+    }
+    /* Generate input data covering -128 -> +127 */
+    for (int i = 0; i < size; ++i) {
+        input[i] = rand() % 256 - 128;
+        inout_ansi[i] = input[i];
+        inout_opt[i] = input[i];
+    }
+
+    /* enable profiler */
+    profile_c_start();
+
+    /* C function */
+    esp_nn_relu6_s8_ansi(inout_ansi, size);
+
+    profile_c_end();
+    profile_opt_start();
+
+    /* Optimized function */
+    esp_nn_relu6_s8(inout_opt, size);
+
+    /* disable profiler */
+    profile_opt_end();
+
+    bool ret = CHECK_EQUAL(inout_ansi, inout_opt, size);
+    if (ret == false) {
+        printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        printf("Output: \n");
+        PRINT_ARRAY_HEX(inout_opt, size, 1);
+        printf("Expected: \n");
+        PRINT_ARRAY_HEX(inout_ansi, size, 1);
+        printf("Input:\n");
+        PRINT_ARRAY_HEX(input, size, 1);
+        goto relu6_s8_cleanup;
+    }
+    printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__);
+
+relu6_s8_cleanup:
+    if (input) {
+        free (input);
+    }
+    if (inout_ansi) {
+        free (inout_ansi);
+    }
+    if (inout_opt) {
+        free (inout_opt);
+    }
+
+}
diff --git a/code/components/esp-nn/tests/src/softmax_test.c b/code/components/esp-nn/tests/src/softmax_test.c
new file mode 100644
index 00000000..f7c734cd
--- /dev/null
+++ b/code/components/esp-nn/tests/src/softmax_test.c
@@ -0,0 +1,101 @@
+// Copyright 2022 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
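+
+/*
+ * The mult/shift pair below follows the TFLite-Micro convention of
+ * expressing the softmax input scale as a fixed-point multiplier plus a
+ * shift, and diff_min is the cutoff below which (input - max) terms are
+ * treated as negligible; the constants are arbitrary but valid test
+ * parameters.
+ */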
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <malloc.h>
+
+#include <esp_nn.h>
+#include "test_utils.h"
+
+void esp_nn_softmax_s8_test()
+{
+    const int32_t height = 8;
+    const int32_t width = 32;
+    const int32_t diff_min = -128;
+    const int32_t mult = INT32_MAX / 2;
+    const int32_t shift = 7;
+    void *scratch_buf = NULL;
+    const int size = width * height;
+    int8_t *input, *out_ansi, *out_opt;
+
+    input = memalign(16, size);
+    out_ansi = memalign(16, size);
+    out_opt = memalign(16, size);
+
+    if (input == NULL || out_ansi == NULL || out_opt == NULL) {
+        printf(ANSI_COLOR_RED"%s buffer allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        goto softmax_s8_cleanup;
+    }
+
+    /* Generate input data between -128 -> +127 */
+    for (int i = 0; i < size; ++i) {
+        input[i] = rand() % 256 - 128;
+    }
+
+    /* enable profiler */
+    profile_c_start();
+
+    /* C function */
+    esp_nn_softmax_s8_ansi(input, height, width, mult, shift, diff_min, out_ansi);
+
+    profile_c_end();
+
+    int32_t scratch_buf_size = esp_nn_get_softmax_scratch_size(width, height);
+    if (scratch_buf_size) {
+        scratch_buf = memalign(4, scratch_buf_size);
+        if (scratch_buf == NULL) {
+            printf(ANSI_COLOR_RED"%s scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, __FUNCTION__, scratch_buf_size);
+            goto softmax_s8_cleanup;
+        }
+        esp_nn_set_softmax_scratch_buf(scratch_buf);
+    }
+
+    profile_opt_start();
+
+    /* Optimized function */
+    esp_nn_softmax_s8(input, height, width, mult, shift, diff_min, out_opt);
+
+    /* disable profiler */
+    profile_opt_end();
+
+    bool ret = CHECK_EQUAL(out_ansi, out_opt, size);
+    if (ret == false) {
+        printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__);
+        printf("Output: \n");
+        PRINT_ARRAY_HEX(out_opt, width, height);
+        printf("Expected: \n");
+        PRINT_ARRAY_HEX(out_ansi, width, height);
+        printf("Input:\n");
+        PRINT_ARRAY_HEX(input, width, height);
+        goto softmax_s8_cleanup;
+    }
+    printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__);
+
+softmax_s8_cleanup:
+    if (input) {
+        free (input);
+    }
+    if (out_ansi) {
+        free (out_ansi);
+    }
+    if (out_opt) {
+        free (out_opt);
+    }
+    if (scratch_buf) {
+        free (scratch_buf);
+    }
+}
diff --git a/code/components/esp-nn_20220716.zip b/code/components/esp-nn_20220716.zip
new file mode 100644
index 00000000..53c7bef2
Binary files /dev/null and b/code/components/esp-nn_20220716.zip differ
diff --git a/code/components/esp32-camera-master_neu_20220121.zip b/code/components/esp32-camera-master.zip
similarity index 77%
rename from code/components/esp32-camera-master_neu_20220121.zip
rename to code/components/esp32-camera-master.zip
index 3acbcf1a..8706b3d8 100644
Binary files a/code/components/esp32-camera-master_neu_20220121.zip and b/code/components/esp32-camera-master.zip differ
diff --git a/code/components/esp32-camera-master/.github/workflows/build.yml b/code/components/esp32-camera-master/.github/workflows/build.yml
index 85762b65..08f10dae 100644
--- a/code/components/esp32-camera-master/.github/workflows/build.yml
+++ b/code/components/esp32-camera-master/.github/workflows/build.yml
@@ -8,26 +8,37 @@ on:
 jobs:
   build-master:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        idf_target: ["esp32", "esp32s2", "esp32s3"]
     steps:
       - name: Checkout repo
        uses: actions/checkout@v2
        with:
          submodules: 'recursive'
      - name: esp-idf build
-       uses: espressif/esp-idf-ci-action@latest
+       uses: espressif/esp-idf-ci-action@main
        with:
+         target: ${{ matrix.idf_target }}
          path: 'examples'
 
-  build-release-v4_0:
+  build-release-v4_4:
+    name: Build for ${{ matrix.idf_target }} on ${{ matrix.idf_ver }}
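+    # one job per (IDF version, chip target) combination from the matrix below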
runs-on: ubuntu-latest + strategy: + matrix: + idf_ver: ["v4.4"] + idf_target: ["esp32", "esp32s2", "esp32s3"] steps: - name: Checkout repo uses: actions/checkout@v2 with: submodules: 'recursive' - name: esp-idf build - uses: espressif/esp-idf-ci-action@release-v4.0 + uses: espressif/esp-idf-ci-action@main with: + esp_idf_version: ${{ matrix.idf_ver }} + target: ${{ matrix.idf_target }} path: 'examples' build-release-v4_1: @@ -65,15 +76,3 @@ jobs: uses: espressif/esp-idf-ci-action@release-v4.3 with: path: 'examples' - - build-release-v3_3: - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v2 - with: - submodules: 'recursive' - - name: esp-idf build - uses: espressif/esp-idf-ci-action@release-v3.3 - with: - path: 'examples' diff --git a/code/components/esp32-camera-master/.github/workflows/upload_component.yml b/code/components/esp32-camera-master/.github/workflows/upload_component.yml index 0fb12cc3..f550e696 100644 --- a/code/components/esp32-camera-master/.github/workflows/upload_component.yml +++ b/code/components/esp32-camera-master/.github/workflows/upload_component.yml @@ -10,12 +10,10 @@ jobs: - uses: actions/checkout@master with: submodules: "recursive" - - name: Upload component to the component registry uses: espressif/github-actions/upload_components@master with: name: "esp32-camera" - version: "git" namespace: "espressif" - service_url: ${{ secrets.IDF_COMPONENT_API_URL }} + version: ${{ github.ref_name }} api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }} diff --git a/code/components/esp32-camera-master/CMakeLists.txt b/code/components/esp32-camera-master/CMakeLists.txt index 8090f326..7c07b0bf 100644 --- a/code/components/esp32-camera-master/CMakeLists.txt +++ b/code/components/esp32-camera-master/CMakeLists.txt @@ -1,5 +1,29 @@ +# get IDF version for comparison +set(idf_version "${IDF_VERSION_MAJOR}.${IDF_VERSION_MINOR}") + +# set conversion sources +set(COMPONENT_SRCS + conversions/yuv.c + conversions/to_jpg.cpp + conversions/to_bmp.c + conversions/jpge.cpp + conversions/esp_jpg_decode.c + ) + +set(COMPONENT_PRIV_INCLUDEDIRS + conversions/private_include + ) + +set(COMPONENT_ADD_INCLUDEDIRS + driver/include + conversions/include + ) + +set(COMPONENT_REQUIRES driver) + +# set driver sources only for supported platforms if(IDF_TARGET STREQUAL "esp32" OR IDF_TARGET STREQUAL "esp32s2" OR IDF_TARGET STREQUAL "esp32s3") - set(COMPONENT_SRCS + list(APPEND COMPONENT_SRCS driver/esp_camera.c driver/cam_hal.c driver/sccb.c @@ -14,22 +38,14 @@ if(IDF_TARGET STREQUAL "esp32" OR IDF_TARGET STREQUAL "esp32s2" OR IDF_TARGET ST sensors/gc2145.c sensors/gc032a.c sensors/bf3005.c - conversions/yuv.c - conversions/to_jpg.cpp - conversions/to_bmp.c - conversions/jpge.cpp - conversions/esp_jpg_decode.c + sensors/bf20a6.c + sensors/sc101iot.c + sensors/sc030iot.c ) - set(COMPONENT_ADD_INCLUDEDIRS - driver/include - conversions/include - ) - - set(COMPONENT_PRIV_INCLUDEDIRS + list(APPEND COMPONENT_PRIV_INCLUDEDIRS driver/private_include sensors/private_include - conversions/private_include target/private_include ) @@ -58,8 +74,13 @@ if(IDF_TARGET STREQUAL "esp32" OR IDF_TARGET STREQUAL "esp32s2" OR IDF_TARGET ST ) endif() - set(COMPONENT_REQUIRES driver) set(COMPONENT_PRIV_REQUIRES freertos nvs_flash) - register_component() + set(min_version_for_esp_timer "4.2") + if (idf_version VERSION_GREATER_EQUAL min_version_for_esp_timer) + list(APPEND COMPONENT_PRIV_REQUIRES esp_timer) + endif() + endif() + +register_component() diff --git 
a/code/components/esp32-camera-master/Kconfig b/code/components/esp32-camera-master/Kconfig
index dbf67089..66253d0a 100644
--- a/code/components/esp32-camera-master/Kconfig
+++ b/code/components/esp32-camera-master/Kconfig
@@ -69,6 +69,45 @@ menu "Camera configuration"
         help
             Enable this option if you want to use the BF3005.
             Disable this option to save memory.
+
+    config BF20A6_SUPPORT
+        bool "Support BF20A6(BYD20A6) VGA"
+        default y
+        help
+            Enable this option if you want to use the BF20A6.
+            Disable this option to save memory.
+
+    config SC101IOT_SUPPORT
+        bool "Support SC101IOT HD"
+        default n
+        help
+            Enable this option if you want to use the SC101IOT.
+            Disable this option to save memory.
+
+    choice SC101_REGS_SELECT
+        prompt "SC101iot default regs"
+        default SC101IOT_720P_15FPS_ENABLED
+        depends on SC101IOT_SUPPORT
+        help
+            Currently the SC101IOT has several register sets available.
+            Select the one that matches your needs.
+
+        config SC101IOT_720P_15FPS_ENABLED
+            bool "xclk20M_720p_15fps"
+            help
+                Selecting this option means that when xclk is 20M, the frame rate is 15fps at 720p resolution.
+        config SC101IOT_VGA_25FPS_ENABLED
+            bool "xclk20M_VGA_25fps"
+            help
+                Selecting this option means that when xclk is 20M, the frame rate is 25fps at VGA resolution.
+    endchoice
+
+    config SC030IOT_SUPPORT
+        bool "Support SC030IOT VGA"
+        default y
+        help
+            Enable this option if you want to use the SC030IOT.
+            Disable this option to save memory.
 
     choice SCCB_HARDWARE_I2C_PORT
         bool "I2C peripheral to use for SCCB"
@@ -125,5 +164,24 @@ menu "Camera configuration"
         help
             Maximum value of DMA buffer
             Larger values may fail to allocate due to insufficient contiguous memory blocks, and smaller value may cause DMA interrupt to be too frequent
+
+    config CAMERA_CONVERTER_ENABLED
+        bool "Enable camera RGB/YUV converter"
+        depends on IDF_TARGET_ESP32S3
+        default n
+        help
+            Enable this option if you want to use RGB565/YUV422/YUV420/YUV411 format conversion.
+
+    choice CAMERA_CONV_PROTOCOL
+        bool "Camera converter protocol"
+        depends on CAMERA_CONVERTER_ENABLED
+        default LCD_CAM_CONV_BT601_ENABLED
+        help
+            Supports format conversion under both BT601 and BT709 standards.
+
+        config LCD_CAM_CONV_BT601_ENABLED
+            bool "BT601"
+        config LCD_CAM_CONV_BT709_ENABLED
+            bool "BT709"
+    endchoice
 endmenu
diff --git a/code/components/esp32-camera-master/README.md b/code/components/esp32-camera-master/README.md
index de61a5bb..9b5282e4 100644
--- a/code/components/esp32-camera-master/README.md
+++ b/code/components/esp32-camera-master/README.md
@@ -25,6 +25,9 @@ This repository hosts ESP32 series Soc compatible driver for image sensors. Addi
 | GC0308 | 640 x 480 | color | YUV/YCbCr422<br>RAW Bayer<br>RGB565 | 1/6.5" |
 | GC2145 | 1600 x 1200 | color | YUV/YCbCr422<br>RAW Bayer<br>RGB565 | 1/5" |
 | BF3005 | 640 x 480 | color | YUV/YCbCr422<br>RAW Bayer<br>RGB565 | 1/4" |
+| BF20A6 | 640 x 480 | color | YUV/YCbCr422<br>RAW Bayer | 1/10" |
+| SC101IOT| 1280 x 720 | color | YUV/YCbCr422<br>Raw RGB | 1/4.2" |
+| SC030IOT| 640 x 480 | color | YUV/YCbCr422<br>RAW Bayer | 1/6.5" |
 
 ## Important to Remember
diff --git a/code/components/esp32-camera-master/conversions/esp_jpg_decode.c b/code/components/esp32-camera-master/conversions/esp_jpg_decode.c
index a9615e36..52833a73 100644
--- a/code/components/esp32-camera-master/conversions/esp_jpg_decode.c
+++ b/code/components/esp32-camera-master/conversions/esp_jpg_decode.c
@@ -21,6 +21,10 @@
 #include "tjpgd.h"
 #elif CONFIG_IDF_TARGET_ESP32S3
 #include "esp32s3/rom/tjpgd.h"
+#elif CONFIG_IDF_TARGET_ESP32C3
+#include "esp32c3/rom/tjpgd.h"
+#elif CONFIG_IDF_TARGET_ESP32H2
+#include "esp32h2/rom/tjpgd.h"
 #else
 #error Target CONFIG_IDF_TARGET is not supported
 #endif
@@ -57,7 +61,7 @@ static const char * jd_errors[] = {
     "Not supported JPEG standard"
 };
 
-static uint32_t _jpg_write(JDEC *decoder, void *bitmap, JRECT *rect)
+static unsigned int _jpg_write(JDEC *decoder, void *bitmap, JRECT *rect)
 {
     uint16_t x = rect->left;
     uint16_t y = rect->top;
@@ -73,7 +77,7 @@ static uint32_t _jpg_write(JDEC *decoder, void *bitmap, JRECT *rect)
     return 0;
 }
 
-static uint32_t _jpg_read(JDEC *decoder, uint8_t *buf, uint32_t len)
+static unsigned int _jpg_read(JDEC *decoder, uint8_t *buf, unsigned int len)
 {
     esp_jpg_decoder_t * jpeg = (esp_jpg_decoder_t *)decoder->device;
     if (jpeg->len && len > (jpeg->len - jpeg->index)) {
diff --git a/code/components/esp32-camera-master/conversions/jpge.cpp b/code/components/esp32-camera-master/conversions/jpge.cpp
index a8ab93e0..dd6790e6 100644
--- a/code/components/esp32-camera-master/conversions/jpge.cpp
+++ b/code/components/esp32-camera-master/conversions/jpge.cpp
@@ -29,7 +29,12 @@ namespace jpge {
     if(b){
       return b;
     }
+    // check if SPIRAM is enabled and allocate on SPIRAM if allocatable
+#if (CONFIG_SPIRAM_SUPPORT && (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC))
     return heap_caps_malloc(nSize, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+#else
+    return NULL;
+#endif
 }
 
 static inline void jpge_free(void *p) { free(p); }
diff --git a/code/components/esp32-camera-master/conversions/to_bmp.c b/code/components/esp32-camera-master/conversions/to_bmp.c
index 5a54bdba..e267c78f 100644
--- a/code/components/esp32-camera-master/conversions/to_bmp.c
+++ b/code/components/esp32-camera-master/conversions/to_bmp.c
@@ -21,19 +21,6 @@
 #include "esp_jpg_decode.h"
 #include "esp_system.h"
 
-#if ESP_IDF_VERSION_MAJOR >= 4 // IDF 4+
-#if CONFIG_IDF_TARGET_ESP32 // ESP32/PICO-D4
-#include "esp32/spiram.h"
-#elif CONFIG_IDF_TARGET_ESP32S2
-#include "esp32s2/spiram.h"
-#elif CONFIG_IDF_TARGET_ESP32S3
-#include "esp32s3/spiram.h"
-#else
-#error Target CONFIG_IDF_TARGET is not supported
-#endif
-#else // ESP32 Before IDF 4.0
-#include "esp_spiram.h"
-#endif
 
 #if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG)
 #include "esp32-hal-log.h"
@@ -72,7 +59,12 @@ typedef struct {
 
 static void *_malloc(size_t size)
 {
+    // check if SPIRAM is enabled and allocate on SPIRAM if allocatable
+#if (CONFIG_SPIRAM_SUPPORT && (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC))
     return heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
+#endif
+    // try allocating in internal memory
+    return malloc(size);
 }
 
 //output buffer and image width
@@ -168,7 +160,7 @@ static bool _rgb565_write(void * arg, uint16_t x, uint16_t y, uint16_t w, uint16
 }
 
 //input buffer
-static uint32_t _jpg_read(void * arg, size_t index, uint8_t *buf, size_t len)
+static unsigned int _jpg_read(void * arg, size_t index, uint8_t *buf, size_t len)
 {
     rgb_jpg_decoder * jpeg = (rgb_jpg_decoder *)arg;
     if(buf) {
diff --git
a/code/components/esp32-camera-master/conversions/to_jpg.cpp b/code/components/esp32-camera-master/conversions/to_jpg.cpp index 9b8905a7..24cc2989 100644 --- a/code/components/esp32-camera-master/conversions/to_jpg.cpp +++ b/code/components/esp32-camera-master/conversions/to_jpg.cpp @@ -21,21 +21,6 @@ #include "jpge.h" #include "yuv.h" -#include "esp_system.h" -#if ESP_IDF_VERSION_MAJOR >= 4 // IDF 4+ -#if CONFIG_IDF_TARGET_ESP32 // ESP32/PICO-D4 -#include "esp32/spiram.h" -#elif CONFIG_IDF_TARGET_ESP32S2 -#include "esp32s2/spiram.h" -#elif CONFIG_IDF_TARGET_ESP32S3 -#include "esp32s3/spiram.h" -#else -#error Target CONFIG_IDF_TARGET is not supported -#endif -#else // ESP32 Before IDF 4.0 -#include "esp_spiram.h" -#endif - #if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG) #include "esp32-hal-log.h" #define TAG "" @@ -50,7 +35,12 @@ static void *_malloc(size_t size) if(res) { return res; } + + // check if SPIRAM is enabled and is allocatable +#if (CONFIG_SPIRAM_SUPPORT && (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC)) return heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); +#endif + return NULL; } static IRAM_ATTR void convert_line_format(uint8_t * src, pixformat_t format, uint8_t * dst, size_t width, size_t in_channels, size_t line) diff --git a/code/components/esp32-camera-master/driver/cam_hal.c b/code/components/esp32-camera-master/driver/cam_hal.c index 9b7e12b5..1604f8ac 100644 --- a/code/components/esp32-camera-master/driver/cam_hal.c +++ b/code/components/esp32-camera-master/driver/cam_hal.c @@ -18,8 +18,21 @@ #include "ll_cam.h" #include "cam_hal.h" -static const char *TAG = "cam_hal"; +#if (ESP_IDF_VERSION_MAJOR == 3) && (ESP_IDF_VERSION_MINOR == 3) +#include "rom/ets_sys.h" +#else +#include "esp_timer.h" +#if CONFIG_IDF_TARGET_ESP32 +#include "esp32/rom/ets_sys.h" // will be removed in idf v5.0 +#elif CONFIG_IDF_TARGET_ESP32S2 +#include "esp32s2/rom/ets_sys.h" +#elif CONFIG_IDF_TARGET_ESP32S3 +#include "esp32s3/rom/ets_sys.h" +#endif +#endif // ESP_IDF_VERSION_MAJOR +#define ESP_CAMERA_ETS_PRINTF ets_printf +static const char *TAG = "cam_hal"; static cam_obj_t *cam_obj = NULL; static const uint32_t JPEG_SOI_MARKER = 0xFFD8FF; // written in little-endian for esp32 @@ -93,7 +106,7 @@ void IRAM_ATTR ll_cam_send_event(cam_obj_t *cam, cam_event_t cam_event, BaseType if (xQueueSendFromISR(cam->event_queue, (void *)&cam_event, HPTaskAwoken) != pdTRUE) { ll_cam_stop(cam); cam->state = CAM_STATE_IDLE; - ESP_EARLY_LOGE(TAG, "EV-%s-OVF", cam_event==CAM_IN_SUC_EOF_EVENT ? "EOF" : "VSYNC"); + ESP_CAMERA_ETS_PRINTF(DRAM_STR("cam_hal: EV-%s-OVF\r\n"), cam_event==CAM_IN_SUC_EOF_EVENT ? 
DRAM_STR("EOF") : DRAM_STR("VSYNC")); } } diff --git a/code/components/esp32-camera-master/driver/esp_camera.c b/code/components/esp32-camera-master/driver/esp_camera.c index 5b671c0e..8327445c 100644 --- a/code/components/esp32-camera-master/driver/esp_camera.c +++ b/code/components/esp32-camera-master/driver/esp_camera.c @@ -57,6 +57,15 @@ #if CONFIG_BF3005_SUPPORT #include "bf3005.h" #endif +#if CONFIG_BF20A6_SUPPORT +#include "bf20a6.h" +#endif +#if CONFIG_SC101IOT_SUPPORT +#include "sc101iot.h" +#endif +#if CONFIG_SC030IOT_SUPPORT +#include "sc030iot.h" +#endif #if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG) #include "esp32-hal-log.h" @@ -119,6 +128,15 @@ static const sensor_func_t g_sensors[] = { #if CONFIG_BF3005_SUPPORT {bf3005_detect, bf3005_init}, #endif +#if CONFIG_BF20A6_SUPPORT + {bf20a6_detect, bf20a6_init}, +#endif +#if CONFIG_SC101IOT_SUPPORT + {sc101iot_detect, sc101iot_init}, +#endif +#if CONFIG_SC030IOT_SUPPORT + {sc030iot_detect, sc030iot_init}, +#endif }; static esp_err_t camera_probe(const camera_config_t *config, camera_model_t *out_camera_model) @@ -218,6 +236,23 @@ static esp_err_t camera_probe(const camera_config_t *config, camera_model_t *out return ESP_OK; } +#if CONFIG_CAMERA_CONVERTER_ENABLED +static pixformat_t get_output_data_format(camera_conv_mode_t conv_mode) +{ + pixformat_t format = PIXFORMAT_RGB565; + switch (conv_mode) { + case YUV422_TO_YUV420: + format = PIXFORMAT_YUV420; + break; + case YUV422_TO_RGB565: // default format is RGB565 + default: + break; + } + ESP_LOGD(TAG, "Convert to %d format enabled", format); + return format; +} +#endif + esp_err_t esp_camera_init(const camera_config_t *config) { esp_err_t err; @@ -256,6 +291,7 @@ esp_err_t esp_camera_init(const camera_config_t *config) s_state->sensor.status.framesize = frame_size; s_state->sensor.pixformat = pix_format; + ESP_LOGD(TAG, "Setting frame size to %dx%d", resolution[frame_size].width, resolution[frame_size].height); if (s_state->sensor.set_framesize(&s_state->sensor, frame_size) != 0) { ESP_LOGE(TAG, "Failed to set frame size"); @@ -263,6 +299,11 @@ esp_err_t esp_camera_init(const camera_config_t *config) goto fail; } s_state->sensor.set_pixformat(&s_state->sensor, pix_format); +#if CONFIG_CAMERA_CONVERTER_ENABLED + if(config->conv_mode) { + s_state->sensor.pixformat = get_output_data_format(config->conv_mode); // If conversion enabled, change the out data format by conversion mode + } +#endif if (s_state->sensor.id.PID == OV2640_PID) { s_state->sensor.set_gainceiling(&s_state->sensor, GAINCEILING_2X); diff --git a/code/components/esp32-camera-master/driver/include/esp_camera.h b/code/components/esp32-camera-master/driver/include/esp_camera.h index b6047d31..2025bb40 100644 --- a/code/components/esp32-camera-master/driver/include/esp_camera.h +++ b/code/components/esp32-camera-master/driver/include/esp_camera.h @@ -70,6 +70,7 @@ #include "driver/ledc.h" #include "sensor.h" #include "sys/time.h" +#include "sdkconfig.h" #ifdef __cplusplus extern "C" { @@ -91,6 +92,19 @@ typedef enum { CAMERA_FB_IN_DRAM /*!< Frame buffer is placed in internal DRAM */ } camera_fb_location_t; +#if CONFIG_CAMERA_CONVERTER_ENABLED +/** + * @brief Camera RGB\YUV conversion mode + */ +typedef enum { + CONV_DISABLE, + RGB565_TO_YUV422, + + YUV422_TO_RGB565, + YUV422_TO_YUV420 +} camera_conv_mode_t; +#endif + /** * @brief Configuration structure for camera initialization */ @@ -124,6 +138,9 @@ typedef struct { size_t fb_count; /*!< Number of frame buffers to be allocated. 
If more than one, then each frame will be acquired (double speed) */ camera_fb_location_t fb_location; /*!< The location where the frame buffer will be allocated */ camera_grab_mode_t grab_mode; /*!< When buffers should be filled */ +#if CONFIG_CAMERA_CONVERTER_ENABLED + camera_conv_mode_t conv_mode; /*!< RGB<->YUV Conversion mode */ +#endif } camera_config_t; /** diff --git a/code/components/esp32-camera-master/driver/include/sensor.h b/code/components/esp32-camera-master/driver/include/sensor.h index b2bf55f1..d5ec7463 100644 --- a/code/components/esp32-camera-master/driver/include/sensor.h +++ b/code/components/esp32-camera-master/driver/include/sensor.h @@ -27,6 +27,9 @@ typedef enum { GC032A_PID = 0x232a, GC0308_PID = 0x9b, BF3005_PID = 0x30, + BF20A6_PID = 0x20a6, + SC101IOT_PID = 0xda4a, + SC030IOT_PID = 0x9a46, } camera_pid_t; typedef enum { @@ -40,6 +43,9 @@ typedef enum { CAMERA_GC032A, CAMERA_GC0308, CAMERA_BF3005, + CAMERA_BF20A6, + CAMERA_SC101IOT, + CAMERA_SC030IOT, CAMERA_MODEL_MAX, CAMERA_NONE, } camera_model_t; @@ -55,11 +61,15 @@ typedef enum { GC032A_SCCB_ADDR = 0x21,// 0x42 >> 1 GC0308_SCCB_ADDR = 0x21,// 0x42 >> 1 BF3005_SCCB_ADDR = 0x6E, + BF20A6_SCCB_ADDR = 0x6E, + SC101IOT_SCCB_ADDR = 0x68,// 0xd0 >> 1 + SC030IOT_SCCB_ADDR = 0x68,// 0xd0 >> 1 } camera_sccb_addr_t; typedef enum { PIXFORMAT_RGB565, // 2BPP/RGB565 PIXFORMAT_YUV422, // 2BPP/YUV422 + PIXFORMAT_YUV420, // 1.5BPP/YUV420 PIXFORMAT_GRAYSCALE, // 1BPP/GRAYSCALE PIXFORMAT_JPEG, // JPEG/COMPRESSED PIXFORMAT_RGB888, // 3BPP/RGB888 diff --git a/code/components/esp32-camera-master/driver/sccb.c b/code/components/esp32-camera-master/driver/sccb.c index 314dd982..edc417f8 100644 --- a/code/components/esp32-camera-master/driver/sccb.c +++ b/code/components/esp32-camera-master/driver/sccb.c @@ -25,6 +25,11 @@ static const char* TAG = "sccb"; #include "driver/i2c.h" +// support IDF 5.x +#ifndef portTICK_RATE_MS +#define portTICK_RATE_MS portTICK_PERIOD_MS +#endif + #define SCCB_FREQ CONFIG_SCCB_CLK_FREQ /*!< I2C master frequency*/ #define WRITE_BIT I2C_MASTER_WRITE /*!< I2C master write */ #define READ_BIT I2C_MASTER_READ /*!< I2C master read */ diff --git a/code/components/esp32-camera-master/driver/sensor.c b/code/components/esp32-camera-master/driver/sensor.c index bf6d313f..2f4c9711 100644 --- a/code/components/esp32-camera-master/driver/sensor.c +++ b/code/components/esp32-camera-master/driver/sensor.c @@ -13,6 +13,9 @@ const camera_sensor_info_t camera_sensor[CAMERA_MODEL_MAX] = { {CAMERA_GC032A, "GC032A", GC032A_SCCB_ADDR, GC032A_PID, FRAMESIZE_VGA, false}, {CAMERA_GC0308, "GC0308", GC0308_SCCB_ADDR, GC0308_PID, FRAMESIZE_VGA, false}, {CAMERA_BF3005, "BF3005", BF3005_SCCB_ADDR, BF3005_PID, FRAMESIZE_VGA, false}, + {CAMERA_BF20A6, "BF20A6", BF20A6_SCCB_ADDR, BF20A6_PID, FRAMESIZE_VGA, false}, + {CAMERA_SC101IOT, "SC101IOT", SC101IOT_SCCB_ADDR, SC101IOT_PID, FRAMESIZE_HD, false}, + {CAMERA_SC030IOT, "SC030IOT", SC030IOT_SCCB_ADDR, SC030IOT_PID, FRAMESIZE_VGA, false}, }; const resolution_info_t resolution[FRAMESIZE_INVALID] = { diff --git a/code/components/esp32-camera-master/examples/main/take_picture.c b/code/components/esp32-camera-master/examples/main/take_picture.c index 1cbad908..1fb1039d 100644 --- a/code/components/esp32-camera-master/examples/main/take_picture.c +++ b/code/components/esp32-camera-master/examples/main/take_picture.c @@ -38,6 +38,11 @@ #include "freertos/FreeRTOS.h" #include "freertos/task.h" +// support IDF 5.x +#ifndef portTICK_RATE_MS +#define portTICK_RATE_MS portTICK_PERIOD_MS +#endif + 
 #include "esp_camera.h"
 
 #define BOARD_WROVER_KIT 1
diff --git a/code/components/esp32-camera-master/idf_component.yml b/code/components/esp32-camera-master/idf_component.yml
index 848e1cd8..2b98f8d0 100644
--- a/code/components/esp32-camera-master/idf_component.yml
+++ b/code/components/esp32-camera-master/idf_component.yml
@@ -1,5 +1,2 @@
 description: ESP32 compatible driver for OV2640, OV3660, OV5640, OV7670 and OV7725 image sensors.
-targets:
-  - esp32
-  - esp32s2
-  - esp32s3
+url: https://github.com/espressif/esp32-camera
diff --git a/code/components/esp32-camera-master/sensors/bf20a6.c b/code/components/esp32-camera-master/sensors/bf20a6.c
new file mode 100644
index 00000000..b1179c30
--- /dev/null
+++ b/code/components/esp32-camera-master/sensors/bf20a6.c
@@ -0,0 +1,404 @@
+// Copyright 2015-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "sccb.h"
+#include "bf20a6.h"
+#include "bf20a6_regs.h"
+#include "bf20a6_settings.h"
+
+#if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG)
+#include "esp32-hal-log.h"
+#else
+#include "esp_log.h"
+static const char *TAG = "bf20a6";
+#endif
+
+#define H8(v) ((v)>>8)
+#define L8(v) ((v)&0xff)
+
+//#define REG_DEBUG_ON
+
+static int read_reg(uint8_t slv_addr, const uint16_t reg)
+{
+    int ret = SCCB_Read(slv_addr, reg);
+    // ESP_LOGI(TAG, "READ Register 0x%02x VALUE: 0x%02x", reg, ret);
+#ifdef REG_DEBUG_ON
+    if (ret < 0) {
+        ESP_LOGE(TAG, "READ REG 0x%04x FAILED: %d", reg, ret);
+    }
+#endif
+    return ret;
+}
+
+static int write_reg(uint8_t slv_addr, const uint16_t reg, uint8_t value)
+{
+    int ret = SCCB_Write(slv_addr, reg, value);
+#ifdef REG_DEBUG_ON
+    if (ret < 0) {
+        ESP_LOGE(TAG, "WRITE REG 0x%04x FAILED: %d", reg, ret);
+    }
+#endif
+    return ret;
+}
+
+#ifdef DEBUG_PRINT_REG
+static int check_reg_mask(uint8_t slv_addr, uint16_t reg, uint8_t mask)
+{
+    return (read_reg(slv_addr, reg) & mask) == mask;
+}
+
+static void print_regs(uint8_t slv_addr)
+{
+    vTaskDelay(pdMS_TO_TICKS(100));
+    ESP_LOGI(TAG, "REG list look ======================");
+    for (size_t i = 0xf0; i <= 0xfe; i++) {
+        ESP_LOGI(TAG, "reg[0x%02x] = 0x%02x", i, read_reg(slv_addr, i));
+    }
+    ESP_LOGI(TAG, "\npage 0 ===");
+    write_reg(slv_addr, 0xfe, 0x00); // page 0
+    for (size_t i = 0x03; i <= 0x24; i++) {
+        ESP_LOGI(TAG, "p0 reg[0x%02x] = 0x%02x", i, read_reg(slv_addr, i));
+    }
+    for (size_t i = 0x40; i <= 0x95; i++) {
+        ESP_LOGI(TAG, "p0 reg[0x%02x] = 0x%02x", i, read_reg(slv_addr, i));
+    }
+    ESP_LOGI(TAG, "\npage 3 ===");
+    write_reg(slv_addr, 0xfe, 0x03); // page 3
+    for (size_t i = 0x01; i <= 0x43; i++) {
+        ESP_LOGI(TAG, "p3 reg[0x%02x] = 0x%02x", i, read_reg(slv_addr, i));
+    }
+}
+
+static int read_regs(uint8_t slv_addr, const uint16_t(*regs)[2])
+{
+    int i = 0, ret = 0;
+    while (regs[i][0] != REGLIST_TAIL) {
+        if (regs[i][0] == REG_DLY) {
+            vTaskDelay(regs[i][1] / portTICK_PERIOD_MS);
+        } else {
+            ret = read_reg(slv_addr, regs[i][0]);
+        }
+        i++;
+    }
+    return ret;
+}
+#endif
+
+static int set_reg_bits(sensor_t *sensor, uint8_t reg, uint8_t offset, uint8_t length, uint8_t value)
+{
+    int ret = 0;
+
+    ret = SCCB_Read(sensor->slv_addr, reg);
+    if (ret < 0) {
+        return ret;
+    }
+    uint8_t mask = ((1 << length) - 1) << offset;
+    value = (ret & ~mask) | ((value << offset) & mask);
+    ret = SCCB_Write(sensor->slv_addr, reg & 0xFF, value);
+    return ret;
+}
+
+static int write_regs(uint8_t slv_addr, const uint16_t(*regs)[2])
+{
+    int i = 0, ret = 0;
+    while (!ret && regs[i][0] != REGLIST_TAIL) {
+        if (regs[i][0] == REG_DLY) {
+            vTaskDelay(regs[i][1] / portTICK_PERIOD_MS);
+        } else {
+            ret = write_reg(slv_addr, regs[i][0], regs[i][1]);
+        }
+        i++;
+    }
+    return ret;
+}
+
+static int reset(sensor_t *sensor)
+{
+    int ret;
+    // Software Reset: clear all registers and reset them to their default values
+    ret = write_reg(sensor->slv_addr, RESET_RELATED, 0x01);
+    if (ret) {
+        ESP_LOGE(TAG, "Software Reset FAILED!");
+        return ret;
+    }
+    vTaskDelay(100 / portTICK_PERIOD_MS);
+
+    ret = write_regs(sensor->slv_addr, bf20a6_default_init_regs);
+    if (ret == 0) {
+        ESP_LOGD(TAG, "Camera defaults loaded");
+        vTaskDelay(100 / portTICK_PERIOD_MS);
+    }
+
+    // int test_value = read_regs(sensor->slv_addr, bf20a6_default_init_regs);
+
+    return ret;
+}
+
+static int set_pixformat(sensor_t *sensor, pixformat_t pixformat)
+{
+    int ret = 0;
+    switch (pixformat) {
+    case PIXFORMAT_YUV422:
+        set_reg_bits(sensor, 0x12, 0, 1, 0);
+        break;
+    case PIXFORMAT_RAW:
+        set_reg_bits(sensor, 0x12, 0, 1, 0x1);
+        break;
+    default:
+        ESP_LOGW(TAG, "set_pixformat: unsupported format");
+        ret = -1;
+        break;
+    }
+    if (ret == 0) {
+        sensor->pixformat = pixformat;
+        ESP_LOGD(TAG, "Set pixformat to: %u", pixformat);
+    }
+
+    return ret;
+}
+
+static int set_framesize(sensor_t *sensor, framesize_t framesize)
+{
+    int ret = 0;
+    if (framesize > FRAMESIZE_VGA) {
+        return -1;
+    }
+    uint16_t w = resolution[framesize].width;
+    uint16_t h = resolution[framesize].height;
+
+    sensor->status.framesize = framesize;
+
+    // Write MSBs
+    ret |= SCCB_Write(sensor->slv_addr, 0x17, 0);
+    ret |= SCCB_Write(sensor->slv_addr, 0x18, w >> 2);
+
+    ret |= SCCB_Write(sensor->slv_addr, 0x19, 0);
+    ret |= SCCB_Write(sensor->slv_addr, 0x1a, h >> 2);
+
+    // Write LSBs
+    ret |= SCCB_Write(sensor->slv_addr, 0x1b, 0);
+
+    if ((w <= 320) && (h <= 240)) {
+        ret |= SCCB_Write(sensor->slv_addr, 0x17, (80 - w / 4));
+        ret |= SCCB_Write(sensor->slv_addr, 0x18, (80 + w / 4));
+
+        ret |= SCCB_Write(sensor->slv_addr, 0x19, (60 - h / 4));
+
+        ret |= SCCB_Write(sensor->slv_addr, 0x1a, (60 + h / 4));
+
+    } else if ((w <= 640) && (h <= 480)) {
+        ret |= SCCB_Write(sensor->slv_addr, 0x17, (80 - w / 8));
+        ret |= SCCB_Write(sensor->slv_addr, 0x18, (80 + w / 8));
+
+        ret |= SCCB_Write(sensor->slv_addr, 0x19, (60 - h / 8));
+
+        ret |= SCCB_Write(sensor->slv_addr, 0x1a, (60 + h / 8));
+    }
+
+    // Delay
+    vTaskDelay(30 / portTICK_PERIOD_MS);
+
+    return ret;
+}
+
+static int set_hmirror(sensor_t *sensor, int enable)
+{
+    int ret = 0;
+    sensor->status.hmirror = enable;
+    //ret = write_reg(sensor->slv_addr, 0xfe, 0x00);
+    ret |= set_reg_bits(sensor, 0x4a, 3, 0x01, enable);
+    if (ret == 0) {
+        ESP_LOGD(TAG, "Set h-mirror to: %d", enable);
+    }
+    return ret;
+}
+
+static int set_vflip(sensor_t *sensor, int enable)
+{
+    int ret = 0;
+    sensor->status.vflip = enable;
+    //ret = write_reg(sensor->slv_addr, 0xfe, 0x00);
+    ret |= set_reg_bits(sensor, 0x4a, 2, 0x01, enable);
+    if (ret == 0) {
+        ESP_LOGD(TAG, "Set v-flip to: 
%d", enable); + } + return ret; +} + +static int set_colorbar(sensor_t *sensor, int value) +{ + int ret = 0; + ret = write_reg(sensor->slv_addr, 0xb6, value); + if (ret == 0) { + sensor->status.colorbar = value; + ESP_LOGD(TAG, "Set colorbar to: %d", value); + } + return ret; +} + +static int set_sharpness(sensor_t *sensor, int level) +{ + int ret = 0; + ret = SCCB_Write(sensor->slv_addr, 0x70, level); + if (ret == 0) { + ESP_LOGD(TAG, "Set sharpness to: %d", level); + sensor->status.sharpness = level; + } + return ret; +} + +static int get_reg(sensor_t *sensor, int reg, int mask) +{ + int ret = 0; + if (mask > 0xFF) { + ESP_LOGE(TAG, "mask should not more than 0xff"); + } else { + ret = read_reg(sensor->slv_addr, reg); + } + if (ret > 0) { + ret &= mask; + } + return ret; +} + +static int set_reg(sensor_t *sensor, int reg, int mask, int value) +{ + int ret = 0; + if (mask > 0xFF) { + ESP_LOGE(TAG, "mask should not more than 0xff"); + } else { + ret = read_reg(sensor->slv_addr, reg); + } + if (ret < 0) { + return ret; + } + value = (ret & ~mask) | (value & mask); + + if (mask > 0xFF) { + + } else { + ret = write_reg(sensor->slv_addr, reg, value); + } + return ret; +} + +static int init_status(sensor_t *sensor) +{ + // write_reg(sensor->slv_addr, 0xfe, 0x00); + sensor->status.brightness = SCCB_Read(sensor->slv_addr, 0x6f); + sensor->status.contrast = SCCB_Read(sensor->slv_addr, 0xd6); + sensor->status.saturation = 0; + sensor->status.sharpness = SCCB_Read(sensor->slv_addr, 0x70); + sensor->status.denoise = 0; + sensor->status.ae_level = 0; + sensor->status.gainceiling = SCCB_Read(sensor->slv_addr, 0x13); + sensor->status.awb = 0; + sensor->status.dcw = 0; + sensor->status.agc = 0; + sensor->status.aec = 0; + sensor->status.hmirror = 0;// check_reg_mask(sensor->slv_addr, P0_CISCTL_MODE1, 0x01); + sensor->status.vflip = 0;// check_reg_mask(sensor->slv_addr, P0_CISCTL_MODE1, 0x02); + sensor->status.colorbar = 0; + sensor->status.bpc = 0; + sensor->status.wpc = 0; + sensor->status.raw_gma = 0; + sensor->status.lenc = 0; + sensor->status.quality = 0; + sensor->status.special_effect = 0; + sensor->status.wb_mode = 0; + sensor->status.awb_gain = 0; + sensor->status.agc_gain = 0; + sensor->status.aec_value = 0; + sensor->status.aec2 = 0; + return 0; +} + +static int set_dummy(sensor_t *sensor, int val) +{ + ESP_LOGW(TAG, "dummy Unsupported"); + return -1; +} +static int set_gainceiling_dummy(sensor_t *sensor, gainceiling_t val) +{ + ESP_LOGW(TAG, "gainceiling Unsupported"); + return -1; +} + +int bf20a6_detect(int slv_addr, sensor_id_t *id) +{ + if (BF20A6_SCCB_ADDR == slv_addr) { + uint8_t MIDL = SCCB_Read(slv_addr, SENSOR_ID_LOW); + uint8_t MIDH = SCCB_Read(slv_addr, SENSOR_ID_HIGH); + uint16_t PID = MIDH << 8 | MIDL; + if (BF20A6_PID == PID) { + id->PID = PID; + return PID; + } else { + ESP_LOGI(TAG, "Mismatch PID=0x%x", PID); + } + } + return 0; +} + +int bf20a6_init(sensor_t *sensor) +{ + sensor->init_status = init_status; + sensor->reset = reset; + sensor->set_pixformat = set_pixformat; + sensor->set_framesize = set_framesize; + sensor->set_contrast = set_dummy; + sensor->set_brightness = set_dummy; + sensor->set_saturation = set_dummy; + sensor->set_sharpness = set_sharpness; + sensor->set_denoise = set_dummy; + sensor->set_gainceiling = set_gainceiling_dummy; + sensor->set_quality = set_dummy; + sensor->set_colorbar = set_colorbar; + sensor->set_whitebal = set_dummy; + sensor->set_gain_ctrl = set_dummy; + sensor->set_exposure_ctrl = set_dummy; + sensor->set_hmirror = set_hmirror; // 
set_hmirror; + sensor->set_vflip = set_vflip; // set_vflip; + + sensor->set_aec2 = set_dummy; + sensor->set_awb_gain = set_dummy; + sensor->set_agc_gain = set_dummy; + sensor->set_aec_value = set_dummy; + + sensor->set_special_effect = set_dummy; + sensor->set_wb_mode = set_dummy; + sensor->set_ae_level = set_dummy; + + sensor->set_dcw = set_dummy; + sensor->set_bpc = set_dummy; + sensor->set_wpc = set_dummy; + + sensor->set_raw_gma = set_dummy; + sensor->set_lenc = set_dummy; + + sensor->get_reg = get_reg; + sensor->set_reg = set_reg; + sensor->set_res_raw = NULL; + sensor->set_pll = NULL; + sensor->set_xclk = NULL; + + ESP_LOGD(TAG, "BF20A6 Attached"); + return 0; +} diff --git a/code/components/esp32-camera-master/sensors/gc0308.c b/code/components/esp32-camera-master/sensors/gc0308.c index 8b106a3a..f19025eb 100644 --- a/code/components/esp32-camera-master/sensors/gc0308.c +++ b/code/components/esp32-camera-master/sensors/gc0308.c @@ -88,10 +88,10 @@ static int set_reg_bits(uint8_t slv_addr, uint16_t reg, uint8_t offset, uint8_t return ret; } -static int write_regs(uint8_t slv_addr, const uint16_t (*regs)[2]) +static int write_regs(uint8_t slv_addr, const uint8_t (*regs)[2], size_t regs_size) { int i = 0, ret = 0; - while (!ret && regs[i][0] != REGLIST_TAIL) { + while (!ret && (i < regs_size)) { if (regs[i][0] == REG_DLY) { vTaskDelay(regs[i][1] / portTICK_PERIOD_MS); } else { @@ -132,11 +132,12 @@ static int reset(sensor_t *sensor) ESP_LOGE(TAG, "Software Reset FAILED!"); return ret; } - vTaskDelay(100 / portTICK_PERIOD_MS); - ret = write_regs(sensor->slv_addr, gc0308_sensor_default_regs); + + vTaskDelay(80 / portTICK_PERIOD_MS); + ret = write_regs(sensor->slv_addr, gc0308_sensor_default_regs, sizeof(gc0308_sensor_default_regs)/(sizeof(uint8_t) * 2)); if (ret == 0) { ESP_LOGD(TAG, "Camera defaults loaded"); - vTaskDelay(100 / portTICK_PERIOD_MS); + vTaskDelay(80 / portTICK_PERIOD_MS); write_reg(sensor->slv_addr, 0xfe, 0x00); #ifdef CONFIG_IDF_TARGET_ESP32 set_reg_bits(sensor->slv_addr, 0x28, 4, 0x07, 1); //frequency division for esp32, ensure pclk <= 15MHz diff --git a/code/components/esp32-camera-master/sensors/private_include/bf20a6.h b/code/components/esp32-camera-master/sensors/private_include/bf20a6.h new file mode 100644 index 00000000..8c925eb5 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/bf20a6.h @@ -0,0 +1,27 @@ + +#ifndef __BF20A6_H__ +#define __BF20A6_H__ + +#include "sensor.h" + +/** + * @brief Detect sensor pid + * + * @param slv_addr SCCB address + * @param id Detection result + * @return + * 0: Can't detect this sensor + * Nonzero: This sensor has been detected + */ +int bf20a6_detect(int slv_addr, sensor_id_t *id); + +/** + * @brief initialize sensor function pointers + * + * @param sensor pointer of sensor + * @return + * Always 0 + */ +int bf20a6_init(sensor_t *sensor); + +#endif // __BF20A6_H__ diff --git a/code/components/esp32-camera-master/sensors/private_include/bf20a6_regs.h b/code/components/esp32-camera-master/sensors/private_include/bf20a6_regs.h new file mode 100644 index 00000000..ab1ff69e --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/bf20a6_regs.h @@ -0,0 +1,12 @@ +/* + * BF20A6 register definitions. 
+ */ +#ifndef __BF20A6_REG_REGS_H__ +#define __BF20A6_REG_REGS_H__ + +#define SENSOR_ID_HIGH 0XFC +#define SENSOR_ID_LOW 0XFD +#define RESET_RELATED 0XF2 + + +#endif //__BF20A6_REG_REGS_H__ diff --git a/code/components/esp32-camera-master/sensors/private_include/bf20a6_settings.h b/code/components/esp32-camera-master/sensors/private_include/bf20a6_settings.h new file mode 100644 index 00000000..0414bbac --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/bf20a6_settings.h @@ -0,0 +1,158 @@ + +#include + +#define REG_DLY 0xffff +#define REGLIST_TAIL 0xffff /* Array end token */ + +static const uint16_t bf20a6_default_init_regs[][2] = { + {0xf2,0x01}, + {0x12,0x20}, + {0x3a,0x00}, + {0xe1,0x92}, + {0xe3,0x12},// PLL Control, important for framerate(choice: 0x02\0x12\0x22\0x32\0x82) + {0xe0,0x00}, + {0x2a,0x98}, + {0xcd,0x17}, + {0xc0,0x10}, + {0xc6,0x1d}, + {0x10,0x35}, + {0xe2,0x09}, + {0xe4,0x72}, + {0xe5,0x22}, + {0xe6,0x24}, + {0xe7,0x64}, + {0xe8,0xa2}, // DVP:a2}, SPI:f2 VDDIO=1.8V,E8[2]=1},VDDIO=2.8V,E8[2]=0}, + {0x4a,0x00}, + {0x00,0x03}, + {0x1f,0x02}, + {0x22,0x02}, + {0x0c,0x31}, + + {0x00,0x00}, + {0x60,0x81}, + {0x61,0x81}, + + {0xa0,0x08}, + {0x01,0x1a}, + // {0x01,0x1a}, + // {0x01,0x1a}, + // {0x02,0x15}, + // {0x02,0x15}, + {0x02,0x15}, + {0x13,0x08}, + {0x8a,0x96}, + {0x8b,0x06}, + {0x87,0x18}, + + + {0x34,0x48}, // lens + {0x35,0x40}, + {0x36,0x40}, + + {0x71,0x44}, + {0x72,0x48}, + {0x74,0xa2}, + {0x75,0xa9}, + {0x78,0x12}, + {0x79,0xa0}, + {0x7a,0x94}, + {0x7c,0x97}, + {0x40,0x30}, + {0x41,0x30}, + {0x42,0x28}, + {0x43,0x1f}, + {0x44,0x1c}, + {0x45,0x16}, + {0x46,0x13}, + {0x47,0x10}, + {0x48,0x0D}, + {0x49,0x0C}, + {0x4B,0x0A}, + {0x4C,0x0B}, + {0x4E,0x09}, + {0x4F,0x08}, + {0x50,0x08}, + + + {0x5f,0x29}, + {0x23,0x33}, + {0xa1,0x10}, // AWB + {0xa2,0x0d}, + {0xa3,0x30}, + {0xa4,0x06}, + {0xa5,0x22}, + {0xa6,0x56}, + {0xa7,0x18}, + {0xa8,0x1a}, + {0xa9,0x12}, + {0xaa,0x12}, + {0xab,0x16}, + {0xac,0xb1}, + {0xba,0x12}, + {0xbb,0x12}, + {0xad,0x12}, + {0xae,0x56}, + {0xaf,0x0a}, + {0x3b,0x30}, + {0x3c,0x12}, + {0x3d,0x22}, + {0x3e,0x3f}, + {0x3f,0x28}, + {0xb8,0xc3}, + {0xb9,0xa3}, + {0x39,0x47}, // pure color threshold + {0x26,0x13}, + {0x27,0x16}, + {0x28,0x14}, + {0x29,0x18}, + {0xee,0x0d}, + + + {0x13,0x05}, + {0x24,0x3C}, + {0x81,0x20}, + {0x82,0x40}, + {0x83,0x30}, + {0x84,0x58}, + {0x85,0x30}, + {0x92,0x08}, + {0x86,0x80}, + {0x8a,0x96}, + {0x91,0xff}, + {0x94,0x62}, + {0x9a,0x18}, // outdoor threshold + {0xf0,0x45}, // integral time control, important for framerate(choice: 0x46\0x45\0x44..) 
+ {0x51,0x17}, // color normal + {0x52,0x03}, + {0x53,0x5F}, + {0x54,0x47}, + {0x55,0x66}, + {0x56,0x0F}, + {0x7e,0x14}, + {0x57,0x36}, // color + {0x58,0x2A}, + {0x59,0xAA}, + {0x5a,0xA8}, + {0x5b,0x43}, + {0x5c,0x10}, + {0x5d,0x00}, + {0x7d,0x36}, + {0x5e,0x10}, + + {0xd6,0x88}, // contrast + {0xd5,0x20}, // bright + {0xb0,0x84}, // low light ctrl in gray section + {0xb5,0x08}, // the threshold of GLB_GAIN + {0xb1,0xc8}, // saturation + {0xb2,0xc0}, + {0xb3,0xd0}, + {0xb4,0xB0}, + + {0x32,0x10}, + // {0x8a,0x00}, + // {0x8b,0x10}, + {0xa0,0x09}, + {0x00,0x03}, + {0x0b,0x02}, + {REGLIST_TAIL, 0x00}, +}; diff --git a/code/components/esp32-camera-master/sensors/private_include/gc0308_settings.h b/code/components/esp32-camera-master/sensors/private_include/gc0308_settings.h index 32ef3816..adf5f28d 100644 --- a/code/components/esp32-camera-master/sensors/private_include/gc0308_settings.h +++ b/code/components/esp32-camera-master/sensors/private_include/gc0308_settings.h @@ -3,10 +3,9 @@ #include -#define REG_DLY 0xffff -#define REGLIST_TAIL 0x0000 /* Array end token */ +#define REG_DLY 0xff -static const uint16_t gc0308_sensor_default_regs[][2] = { +static const uint8_t gc0308_sensor_default_regs[][2] = { {0xfe, 0x00}, {0xec, 0x20}, {0x05, 0x00}, @@ -239,7 +238,21 @@ static const uint16_t gc0308_sensor_default_regs[][2] = { {0x65, 0xd3}, {0x66, 0x60}, {0xfe, 0x00}, - {REGLIST_TAIL, 0x00}, + + {0x01, 0x32}, //frame setting + {0x02, 0x0c}, + {0x0f, 0x01}, + {0xe2, 0x00}, + {0xe3, 0x78}, + {0xe4, 0x00}, + {0xe5, 0xfe}, + {0xe6, 0x01}, + {0xe7, 0xe0}, + {0xe8, 0x01}, + {0xe9, 0xe0}, + {0xea, 0x01}, + {0xeb, 0xe0}, + {0xfe, 0x00}, }; #endif diff --git a/code/components/esp32-camera-master/sensors/private_include/ov5640_settings.h b/code/components/esp32-camera-master/sensors/private_include/ov5640_settings.h index fec7d679..f52572fa 100644 --- a/code/components/esp32-camera-master/sensors/private_include/ov5640_settings.h +++ b/code/components/esp32-camera-master/sensors/private_include/ov5640_settings.h @@ -42,7 +42,8 @@ static const DRAM_ATTR uint16_t sensor_default_regs[][2] = { {ISP_CONTROL_01, 0x83}, // turn color matrix, awb and SDE //sys reset - {0x3000, 0x00}, + {0x3000, 0x20}, // reset MCU + {REG_DLY, 10}, // delay 10ms {0x3002, 0x1c}, //clock enable diff --git a/code/components/esp32-camera-master/sensors/private_include/sc030iot.h b/code/components/esp32-camera-master/sensors/private_include/sc030iot.h new file mode 100644 index 00000000..19298b76 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/sc030iot.h @@ -0,0 +1,31 @@ +/* + * + * SC030IOT DVP driver. 
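+ * Control runs over SCCB; register access uses the paged addressing scheme
+ * described in sc030iot.c (page select via 0xf0, then 8-bit offsets).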
+ * + */ +#ifndef __SC030IOT_H__ +#define __SC030IOT_H__ + +#include "sensor.h" + +/** + * @brief Detect sensor pid + * + * @param slv_addr SCCB address + * @param id Detection result + * @return + * 0: Can't detect this sensor + * Nonzero: This sensor has been detected + */ +int sc030iot_detect(int slv_addr, sensor_id_t *id); + +/** + * @brief initialize sensor function pointers + * + * @param sensor pointer of sensor + * @return + * Always 0 + */ +int sc030iot_init(sensor_t *sensor); + +#endif // __SC030IOT_H__ diff --git a/code/components/esp32-camera-master/sensors/private_include/sc030iot_settings.h b/code/components/esp32-camera-master/sensors/private_include/sc030iot_settings.h new file mode 100644 index 00000000..56f5654c --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/sc030iot_settings.h @@ -0,0 +1,491 @@ +//version: V01P00_20220303 +//Preview Type:0:DVP Raw 10 bit// 1:Raw 8 bit// 2:YUV422// 3:RAW16 +//Preview Type:4:RGB565// 5:Pixart SPI// 6:MIPI 10bit// 7:MIPI 12bit// 8: MTK SPI +//port 0:MIPI// 1:Parallel// 2:MTK// 3:SPI// 4:TEST// 5: HISPI// 6 : Z2P/Z4P +//I2C Mode :0:Normal 8Addr,8Data// 1:Samsung 8 Addr,8Data// 2:Micron 8 Addr,16Data +//I2C Mode :3:Stmicro 16Addr,8Data//4:Micron2 16 Addr,16Data +//Out Format :0:YCbYCr/RG_GB// 1:YCrYCb/GR_BG// 2:CbYCrY/GB_RG// 3:CrYCbY/BG_GR +//MCLK Speed :0:6M//1:8M//2:10M//3:11.4M//4:12M//5:12.5M//6:13.5M//7:15M//8:18M//9:24M +//pin :BIT0 pwdn// BIT1:reset +//avdd 0:3.3V// 1:2.5V// 2:1.8V +//dovdd 0:2.8V// 1:2.5V// 2:1.8V +//dvdd 0:1.8V// 1:1.5V// 2:1.2V + +/* +[DataBase] +DBName=Dothinkey + +[Vendor] +VendorName=SmartSens + +[Sensor] +SensorName=SC031IOT +width=640 +height=480 +port=1 +type=2 +pin=3 +SlaveID=0xd0 +mode=0 +FlagReg=0xf7 +FlagMask=0xff +FlagData=0xfa +FlagReg1=0xf8 +FlagMask1=0xff +FlagData1=0x46 +outformat=0 +mclk=20 +avdd=2.80000 +dovdd=2.800000 +dvdd=1.5 + +Ext0=0 +Ext1=0 +Ext2=0 +AFVCC=0.0000 +VPP=0.000000 +*/ +#include + +static const uint8_t sc030iot_default_init_regs[][2] = { + {0xf0, 0x30}, + {0x01, 0xff}, + {0x02, 0xff}, + {0x22, 0x07}, + {0x19, 0xff}, + {0x3f, 0x82}, + {0x30, 0x02}, + {0xf0, 0x01}, + {0x70, 0x00}, + {0x71, 0x80}, + {0x72, 0x20}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x75, 0x10}, + {0x76, 0x81}, + {0x77, 0x88}, + {0x78, 0xe1}, + {0x79, 0x01}, + {0xf5, 0x01}, + {0xf4, 0x0a}, + {0xf0, 0x36}, + {0x37, 0x79}, + {0x31, 0x82}, + {0x3e, 0x60}, + {0x30, 0xf0}, + {0x33, 0x33}, + {0xf0, 0x32}, + {0x48, 0x02}, + {0xf0, 0x33}, + {0x02, 0x12}, + {0x7c, 0x02}, + {0x7d, 0x0e}, + {0xa2, 0x04}, + {0x5e, 0x06}, + {0x5f, 0x0a}, + {0x0b, 0x58}, + {0x06, 0x38}, + {0xf0, 0x32}, + {0x48, 0x02}, + {0xf0, 0x39}, + {0x02, 0x70}, + {0xf0, 0x45}, + {0x09, 0x1c}, + {0xf0, 0x37}, + {0x22, 0x0d}, + {0xf0, 0x33}, + {0x33, 0x10}, + {0xb1, 0x80}, + {0x34, 0x40}, + {0x0b, 0x54}, + {0xb2, 0x78}, + {0xf0, 0x36}, + {0x11, 0x80}, + {0xf0, 0x30}, + {0x38, 0x44}, + {0xf0, 0x33}, + {0xb3, 0x51}, + {0x01, 0x10}, + {0x0b, 0x6c}, + {0x06, 0x24}, + {0xf0, 0x36}, + {0x31, 0x82}, + {0x3e, 0x60}, + {0x30, 0xf0}, + {0x33, 0x33}, + {0xf0, 0x34}, + {0x9f, 0x02}, + {0xa6, 0x40}, + {0xa7, 0x47}, + {0xe8, 0x5f}, + {0xa8, 0x51}, + {0xa9, 0x44}, + {0xe9, 0x36}, + {0xf0, 0x33}, + {0xb3, 0x51}, + {0x64, 0x17}, + {0x90, 0x01}, + {0x91, 0x03}, + {0x92, 0x07}, + {0x01, 0x10}, + {0x93, 0x10}, + {0x94, 0x10}, + {0x95, 0x10}, + {0x96, 0x01}, + {0x97, 0x07}, + {0x98, 0x1f}, + {0x99, 0x10}, + {0x9a, 0x20}, + {0x9b, 0x28}, + {0x9c, 0x28}, + {0xf0, 0x36}, + {0x70, 0x54}, + {0xb6, 0x40}, + {0xb7, 0x41}, + {0xb8, 0x43}, + {0xb9, 0x47}, + 
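+    // The recurring {0xf0, NN} pairs in this table select register page 0xNN; the
+    // entries that follow are 8-bit offsets within that page (see the paging note
+    // in sc030iot.c).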
{0xba, 0x4f}, + {0xb0, 0x8b}, + {0xb1, 0x8b}, + {0xb2, 0x8b}, + {0xb3, 0x9b}, + {0xb4, 0xb8}, + {0xb5, 0xf0}, + {0x7e, 0x41}, + {0x7f, 0x47}, + {0x77, 0x80}, + {0x78, 0x84}, + {0x79, 0x8a}, + {0xa0, 0x47}, + {0xa1, 0x5f}, + {0x96, 0x43}, + {0x97, 0x44}, + {0x98, 0x54}, + {0xf0, 0x00}, + {0xf0, 0x01}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x70, 0x00}, + {0x71, 0x80}, + {0xf0, 0x36}, + {0x37, 0x74}, + {0xf0, 0x3f}, + {0x03, 0xa1}, + {0xf0, 0x36},//cvbs_off + {0x11, 0x80}, + {0xf0, 0x01}, + {0x79, 0xc1}, + {0xf0, 0x37}, + {0x24, 0x21}, + {0xf0, 0x36}, + {0x41, 0x00}, + {0xea, 0x09}, + {0xeb, 0x03}, + {0xec, 0x19}, + {0xed, 0x38}, + {0xe9, 0x30}, + {0xf0, 0x33}, + {0x33, 0x00}, + {0x34, 0x00}, + {0xb1, 0x00}, + {0xf0, 0x00}, + {0xe0, 0x04}, + {0xf0, 0x01}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x70, 0x00}, + {0x71, 0x80}, + {0xf0, 0x36}, + {0x32, 0x44}, + {0xf0, 0x36}, + {0x3e, 0xe0}, + {0x70, 0x56}, + {0x7c, 0x43}, + {0x7d, 0x47}, + {0x74, 0x00}, + {0x75, 0x00}, + {0x76, 0x00}, + {0xa0, 0x47}, + {0xa1, 0x5f}, + {0x96, 0x22}, + {0x97, 0x22}, + {0x98, 0x22}, + {0xf0, 0x00}, + {0x72, 0x38}, + {0x7a, 0x80}, + {0x85, 0x18}, + {0x9b, 0x35}, + {0x9e, 0x20}, + {0xd0, 0x66}, + {0xd1, 0x34}, + {0Xd3, 0x44}, + {0xd6, 0x44}, + {0xb0, 0x41}, + {0xb2, 0x48}, + {0xb3, 0xf4}, + {0xb4, 0x0b}, + {0xb5, 0x78}, + {0xba, 0xff}, + {0xbb, 0xc0}, + {0xbc, 0x90}, + {0xbd, 0x3a}, + {0xc1, 0x67}, + {0xf0, 0x01}, + {0x20, 0x11}, + {0x23, 0x90}, + {0x24, 0x15}, + {0x25, 0x87}, + {0xbc, 0x9f}, + {0xbd, 0x3a}, + {0x48, 0xe6}, + {0x49, 0xc0}, + {0x4a, 0xd0}, + {0x4b, 0x48}, + + // [cvbs_on] + {0xf0, 0x36}, + {0x11, 0x00}, + {0xf0, 0x01}, + {0x79, 0xf1}, + + // [cvbs_off] + {0xf0, 0x36}, + {0x11, 0x80}, + {0xf0, 0x01}, + {0x79, 0xc1}, +}; + +/* +[Sensor] +SensorName=SC031IOT +width=640 +height=480 +port=1 +type=2 +pin=3 +SlaveID=0xd0 +mode=0 +FlagReg=0xf7 +FlagMask=0xff +FlagData=0xfa +FlagReg1=0xf8 +FlagMask1=0xff +FlagData1=0x46 +outformat=0 +mclk=27 +avdd=2.80000 +dovdd=2.800000 +dvdd=1.5 + +Ext0=0 +Ext1=0 +Ext2=0 +AFVCC=0.0000 +VPP=0.000000 +*/ +/* 27M MCLK, 30fps +static const uint8_t sc030iot_default_init_regs[][2] = { + {0xf0, 0x30}, + {0x01, 0xff}, + {0x02, 0xff}, + {0x22, 0x07}, + {0x19, 0xff}, + {0x3f, 0x82}, + {0x30, 0x02}, + {0xf0, 0x01}, + {0x70, 0x00}, + {0x71, 0x80}, + {0x72, 0x20}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x75, 0x10}, + {0x76, 0x81}, + {0x77, 0x88}, + {0x78, 0xe1}, + {0x79, 0x01}, + {0xf5, 0x01}, + {0xf4, 0x0a}, + {0xf0, 0x36}, + {0x37, 0x79}, + {0x31, 0x82}, + {0x3e, 0x60}, + {0x30, 0xf0}, + {0x33, 0x33}, + {0xf0, 0x32}, + {0x48, 0x02}, + {0xf0, 0x33}, + {0x02, 0x12}, + {0x7c, 0x02}, + {0x7d, 0x0e}, + {0xa2, 0x04}, + {0x5e, 0x06}, + {0x5f, 0x0a}, + {0x0b, 0x58}, + {0x06, 0x38}, + {0xf0, 0x32}, + {0x48, 0x02}, + {0xf0, 0x39}, + {0x02, 0x70}, + {0xf0, 0x45}, + {0x09, 0x1c}, + {0xf0, 0x37}, + {0x22, 0x0d}, + {0xf0, 0x33}, + {0x33, 0x10}, + {0xb1, 0x80}, + {0x34, 0x40}, + {0x0b, 0x54}, + {0xb2, 0x78}, + {0xf0, 0x36}, + {0x11, 0x80}, + {0xf0, 0x30}, + {0x38, 0x44}, + {0xf0, 0x33}, + {0xb3, 0x51}, + {0x01, 0x10}, + {0x0b, 0x6c}, + {0x06, 0x24}, + {0xf0, 0x36}, + {0x31, 0x82}, + {0x3e, 0x60}, + {0x30, 0xf0}, + {0x33, 0x33}, + {0xf0, 0x34}, + {0x9f, 0x02}, + {0xa6, 0x40}, + {0xa7, 0x47}, + {0xe8, 0x5f}, + {0xa8, 0x51}, + {0xa9, 0x44}, + {0xe9, 0x36}, + {0xf0, 0x33}, + {0xb3, 0x51}, + {0x64, 0x17}, + {0x90, 0x01}, + {0x91, 0x03}, + {0x92, 0x07}, + {0x01, 0x10}, + {0x93, 0x10}, + {0x94, 0x10}, + {0x95, 0x10}, + {0x96, 0x01}, + {0x97, 0x07}, + {0x98, 0x1f}, + {0x99, 0x10}, + {0x9a, 0x20}, + {0x9b, 0x28}, + {0x9c, 
0x28}, + {0xf0, 0x36}, + {0x70, 0x54}, + {0xb6, 0x40}, + {0xb7, 0x41}, + {0xb8, 0x43}, + {0xb9, 0x47}, + {0xba, 0x4f}, + {0xb0, 0x8b}, + {0xb1, 0x8b}, + {0xb2, 0x8b}, + {0xb3, 0x9b}, + {0xb4, 0xb8}, + {0xb5, 0xf0}, + {0x7e, 0x41}, + {0x7f, 0x47}, + {0x77, 0x80}, + {0x78, 0x84}, + {0x79, 0x8a}, + {0xa0, 0x47}, + {0xa1, 0x5f}, + {0x96, 0x43}, + {0x97, 0x44}, + {0x98, 0x54}, + {0xf0, 0x00}, + {0xf0, 0x01}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x70, 0x00}, + {0x71, 0x80}, + {0xf0, 0x36}, + {0x37, 0x74}, + {0xf0, 0x3f}, + {0x03, 0x93}, + {0xf0, 0x36},//cvbs_off + {0x11, 0x80}, + {0xf0, 0x01}, + {0x79, 0xc1}, + {0xf0, 0x37}, + {0x24, 0x21}, + {0xf0, 0x36}, + {0x41, 0x00}, + {0xe9, 0x2c}, + {0xf0, 0x33}, + {0x33, 0x00}, + {0x34, 0x00}, + {0xb1, 0x00}, + {0xf0, 0x00}, + {0xe0, 0x04}, + {0xf0, 0x01}, + {0x73, 0x00}, + {0x74, 0xe0}, + {0x70, 0x00}, + {0x71, 0x80}, + {0xf0, 0x36}, + {0x32, 0x44}, + {0xf0, 0x36}, + {0x3e, 0xe0}, + {0x70, 0x56}, + {0x7c, 0x43}, + {0x7d, 0x47}, + {0x74, 0x00}, + {0x75, 0x00}, + {0x76, 0x00}, + {0xa0, 0x47}, + {0xa1, 0x5f}, + {0x96, 0x22}, + {0x97, 0x22}, + {0x98, 0x22}, + {0xf0, 0x00}, + {0x72, 0x38}, + {0x7a, 0x80}, + {0x85, 0x18}, + {0x9b, 0x35}, + {0x9e, 0x20}, + {0xd0, 0x66}, + {0xd1, 0x34}, + {0Xd3, 0x44}, + {0xd6, 0x44}, + {0xb0, 0x41}, + {0xb2, 0x48}, + {0xb3, 0xf4}, + {0xb4, 0x0b}, + {0xb5, 0x78}, + {0xba, 0xff}, + {0xbb, 0xc0}, + {0xbc, 0x90}, + {0xbd, 0x3a}, + {0xc1, 0x67}, + {0xf0, 0x01}, + {0x20, 0x11}, + {0x23, 0x90}, + {0x24, 0x15}, + {0x25, 0x87}, + {0xbc, 0x9f}, + {0xbd, 0x3a}, + {0x48, 0xe6}, + {0x49, 0xc0}, + {0x4a, 0xd0}, + {0x4b, 0x48}, + + // [cvbs_on] + {0xf0, 0x36}, + {0x11, 0x00}, + {0xf0, 0x01}, + {0x79, 0xf1}, + + // [cvbs_off] + {0xf0, 0x36}, + {0x11, 0x80}, + {0xf0, 0x01}, + {0x79, 0xc1}, +}; + +*/ \ No newline at end of file diff --git a/code/components/esp32-camera-master/sensors/private_include/sc101iot.h b/code/components/esp32-camera-master/sensors/private_include/sc101iot.h new file mode 100644 index 00000000..85858498 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/sc101iot.h @@ -0,0 +1,31 @@ +/* + * + * SC101IOT DVP driver. 
+ * + */ +#ifndef __SC101IOT_H__ +#define __SC101IOT_H__ + +#include "sensor.h" + +/** + * @brief Detect sensor pid + * + * @param slv_addr SCCB address + * @param id Detection result + * @return + * 0: Can't detect this sensor + * Nonzero: This sensor has been detected + */ +int sc101iot_detect(int slv_addr, sensor_id_t *id); + +/** + * @brief initialize sensor function pointers + * + * @param sensor pointer of sensor + * @return + * Always 0 + */ +int sc101iot_init(sensor_t *sensor); + +#endif // __SC101IOT_H__ diff --git a/code/components/esp32-camera-master/sensors/private_include/sc101iot_settings.h b/code/components/esp32-camera-master/sensors/private_include/sc101iot_settings.h new file mode 100644 index 00000000..2eb14398 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/private_include/sc101iot_settings.h @@ -0,0 +1,257 @@ +//Preview Type:0:DVP Raw 10 bit// 1:Raw 8 bit// 2:YUV422// 3:RAW16 +//Preview Type:4:RGB565// 5:Pixart SPI// 6:MIPI 10bit// 7:MIPI 12bit// 8: MTK SPI +//port 0:MIPI// 1:Parallel// 2:MTK// 3:SPI// 4:TEST// 5: HISPI// 6 : Z2P/Z4P +//I2C Mode :0:Normal 8Addr,8Data// 1:Samsung 8 Addr,8Data// 2:Micron 8 Addr,16Data +//I2C Mode :3:Stmicro 16Addr,8Data//4:Micron2 16 Addr,16Data +//Out Format :0:YCbYCr/RG_GB// 1:YCrYCb/GR_BG// 2:CbYCrY/GB_RG// 3:CrYCbY/BG_GR +//MCLK Speed :0:6M//1:8M//2:10M//3:11.4M//4:12M//5:12.5M//6:13.5M//7:15M//8:18M//9:24M +//pin :BIT0 pwdn// BIT1:reset +//avdd 0:2.8V// 1:2.5V// 2:1.8V +//dovdd 0:2.8V// 1:2.5V// 2:1.8V +//dvdd 0:1.8V// 1:1.5V// 2:1.2V +/* +[DataBase] +DBName=DemoSens + +[Vendor] +VendorName=SmartSens +I2C_CRC=0 + +[Sensor] +SensorName=SC101AP_raw +width=1280 +height=720 +port=1 +type=2 +pin=3 +SlaveID=0xd0 +mode=0 +FlagReg=0xf7 +FlagMask=0xff +FlagData=0xda +FlagReg1=0xf8 +FlagMask1=0xff +FlagData1=0x4a +outformat=0 +mclk=20 +avdd=2.800000 +dovdd=2.800000 +dvdd=1.200000 + +Ext0=0 +Ext1=0 +Ext2=0 +AFVCC=0.00 +VPP=0.000000 +*/ +#include + +static const uint8_t sc101iot_default_init_regs[][2] = { +#if CONFIG_SC101IOT_720P_15FPS_ENABLED // 720P+YUV422+15FPS sensor default regs +/* Here are some test results: +# size xclk fps pic pclk +# ------- ------- ------ --------- ------- --- --- --- --- --- +# 720p 4 3 err +# 720p 8 5 normal 15 +# 720p 10 7.8 normal 19 +# 720p 20 15 warning 37.5 +# VGA 8 6 normal +# VGA 20 16 normal + +*/ + {0xf0, 0x30}, + {0x01, 0xff}, + {0x02, 0xe0}, + {0x30, 0x10}, + {0x3f, 0x81}, + {0xf0, 0x00}, + {0x70, 0x6b}, + {0x72, 0x30}, + {0x84, 0xb4}, + {0x8b, 0x00}, + {0x8c, 0x20}, + {0x8d, 0x02}, + {0x8e, 0xec}, + {0x9e, 0x10}, + {0xb0, 0xc1}, + {0xc8, 0x10}, + {0xc9, 0x10}, + {0xc6, 0x00}, + {0xe0, 0x0f}, + {0xb5, 0xf0}, + {0xde, 0x80}, + {0xb5, 0xf0}, + {0xde, 0x80}, + {0xb2, 0x50}, + {0xb3, 0xfc}, + {0xb4, 0x40}, + {0xb5, 0xc0}, + {0xb6, 0x50}, + {0xb7, 0xfc}, + {0xb8, 0x40}, + {0xb9, 0xc0}, + {0xba, 0xff}, + {0xbb, 0xcc}, + {0xbc, 0xa9}, + {0xbd, 0x7d}, + {0xc1, 0x77}, + {0xf0, 0x01}, + {0x70, 0x02}, + {0x71, 0x02}, + {0x72, 0x50}, + {0x73, 0x02}, + {0x74, 0xd2}, + {0x75, 0x20}, + {0x76, 0x81}, + {0x77, 0x8c}, + {0x78, 0x81}, + {0xf4, 0x01}, + {0xf5, 0x00}, + {0xf6, 0x00}, + {0xf0, 0x36}, + {0x40, 0x03}, + {0x41, 0x01}, + {0xf0, 0x39}, + {0x02, 0x70}, + {0xf0, 0x32}, + {0x41, 0x00}, + {0x43, 0x01}, + {0x48, 0x02}, + {0xf0, 0x45}, + {0x09, 0x20}, + {0xf0, 0x33}, + {0x33, 0x10}, + {0xf0, 0x30}, + {0x38, 0x44}, + {0xf0, 0x39}, + {0x07, 0x00}, + {0x08, 0x19}, + {0x47, 0x00}, + {0x48, 0x00}, + {0xf0, 0x37}, + {0x24, 0x31}, + {0xf0, 0x34}, + {0x9f, 0x02}, + {0xa6, 0x51}, + {0xa7, 0x57}, + {0xe8, 0x5f}, 
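+    // As in sc030iot_settings.h, the {0xf0, NN} rows switch the active register
+    // page before the low-byte writes that follow.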
+ {0xa8, 0x50}, + {0xa9, 0x50}, + {0xe9, 0x50}, + {0xf0, 0x33}, + {0xb3, 0x58}, + {0xb2, 0x78}, + {0xf0, 0x34}, + {0x9f, 0x03}, + {0xa6, 0x51}, + {0xa7, 0x57}, + {0xaa, 0x01}, + {0xab, 0x28}, + {0xac, 0x01}, + {0xad, 0x38}, + {0xf0, 0x33}, + {0x0a, 0x01}, + {0x0b, 0x28}, + {0xf0, 0x33}, + {0x64, 0x0f}, + {0xec, 0x51}, + {0xed, 0x57}, + {0x06, 0x58}, + {0xe9, 0x58}, + {0xeb, 0x68}, + {0xf0, 0x33}, + {0x64, 0x0f}, + {0xf0, 0x36}, + {0x70, 0xdf}, + {0xb6, 0x40}, + {0xb7, 0x51}, + {0xb8, 0x53}, + {0xb9, 0x57}, + {0xba, 0x5f}, + {0xb0, 0x84}, + {0xb1, 0x82}, + {0xb2, 0x84}, + {0xb3, 0x88}, + {0xb4, 0x90}, + {0xb5, 0x90}, + {0xf0, 0x36}, + {0x7e, 0x50}, + {0x7f, 0x51}, + {0x77, 0x81}, + {0x78, 0x86}, + {0x79, 0x89}, + {0xf0, 0x36}, + {0x70, 0xdf}, + {0x9c, 0x51}, + {0x9d, 0x57}, + {0x90, 0x54}, + {0x91, 0x54}, + {0x92, 0x56}, + {0xf0, 0x36}, + {0xa0, 0x51}, + {0xa1, 0x57}, + {0x96, 0x33}, + {0x97, 0x43}, + {0x98, 0x43}, + {0xf0, 0x36}, + {0x70, 0xdf}, + {0x7c, 0x40}, + {0x7d, 0x53}, + {0x74, 0xd0}, + {0x75, 0xf0}, + {0x76, 0xf0}, + {0xf0, 0x37}, + {0x0f, 0xd5}, + {0x7a, 0x40}, + {0x7b, 0x57}, + {0x71, 0x09}, + {0x72, 0x09}, + {0x73, 0x05}, + {0xf0, 0x33}, + {0x01, 0x44}, + {0xf0, 0x36}, + {0x37, 0xfb}, + {0xf0, 0x36}, + {0x3c, 0x0d}, + {0xf0, 0x33}, + {0x14, 0x95}, + {0xf0, 0x33}, + {0x8f, 0x80}, + {0xf0, 0x37}, + {0x27, 0x14}, + {0x28, 0x03}, + {0xf0, 0x36}, + {0x37, 0xf4}, + {0xf0, 0x33}, + {0x01, 0x44}, + {0xf0, 0x36}, + {0x79, 0x89}, + {0xf0, 0x34}, + {0xac, 0x01}, + {0xad, 0x40}, + {0xf0, 0x33}, + {0xeb, 0x70}, + {0xf0, 0x34}, + {0xa8, 0x50}, + {0xa9, 0x50}, + {0xf0, 0x33}, + {0xb3, 0x58}, + {0xf0, 0x36}, + {0x11, 0x80}, + {0xf0, 0x36}, + {0x41, 0x51}, + {0xf0, 0x3f}, + {0x03, 0x09}, + {0xf0, 0x32}, + {0x0c, 0x06}, + {0x0d, 0x82}, + {0x0e, 0x02}, + {0x0f, 0xee}, + {0xf0, 0x36}, + {0xea, 0x09}, + {0xeb, 0xf5}, + {0xec, 0x11}, + {0xed, 0x27}, + {0xe9, 0x20}, +#endif +}; diff --git a/code/components/esp32-camera-master/sensors/sc030iot.c b/code/components/esp32-camera-master/sensors/sc030iot.c new file mode 100644 index 00000000..86f525f3 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/sc030iot.c @@ -0,0 +1,335 @@ +/* + * SC030IOT driver. + * + * Copyright 2020-2022 Espressif Systems (Shanghai) PTE LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include +#include +#include +#include +#include "sccb.h" +#include "xclk.h" +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" + +#include "sc030iot.h" +#include "sc030iot_settings.h" + +#if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG) +#include "esp32-hal-log.h" +#else +#include "esp_log.h" +static const char* TAG = "sc030"; +#endif + +#define SC030_SENSOR_ID_HIGH_REG 0XF7 +#define SC030_SENSOR_ID_LOW_REG 0XF8 +#define SC030_MAX_FRAME_WIDTH (640) +#define SC030_MAX_FRAME_HIGH (480) + +// sc030 use "i2c paging mode", so the high byte of the register needs to be written to the 0xf0 reg. +// For more information please refer to the Technical Reference Manual. 
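+// Sketch of the resulting two-step SCCB transaction for a 16-bit address such
+// as 0x3221 (the mirror/flip register used below; illustrative only):
+//     SCCB_Write(slv_addr, 0xf0, 0x32);   // select page 0x32
+//     SCCB_Read(slv_addr, 0x21);          // then access offset 0x21 in that page
+// get_reg()/set_reg() below wrap exactly this sequence.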
+static int get_reg(sensor_t *sensor, int reg, int reg_value_mask) +{ + int ret = 0; + uint8_t reg_high = (reg>>8) & 0xFF; + uint8_t reg_low = reg & 0xFF; + + if(SCCB_Write(sensor->slv_addr, 0xf0, reg_high)) { + return -1; + } + + ret = SCCB_Read(sensor->slv_addr, reg_low); + if(ret > 0){ + ret &= reg_value_mask; + } + return ret; +} + +// sc030 use "i2c paging mode", so the high byte of the register needs to be written to the 0xf0 reg. +// For more information please refer to the Technical Reference Manual. +static int set_reg(sensor_t *sensor, int reg, int mask, int value) +{ + int ret = 0; + uint8_t reg_high = (reg>>8) & 0xFF; + uint8_t reg_low = reg & 0xFF; + + if(SCCB_Write(sensor->slv_addr, 0xf0, reg_high)) { + return -1; + } + + ret = SCCB_Write(sensor->slv_addr, reg_low, value & 0xFF); + return ret; +} + +static int set_regs(sensor_t *sensor, const uint8_t (*regs)[2], uint32_t regs_entry_len) +{ + int i=0, res = 0; + while (islv_addr, regs[i][0], regs[i][1]); + if (res) { + return res; + } + i++; + } + return res; +} + +static int set_reg_bits(sensor_t *sensor, int reg, uint8_t offset, uint8_t length, uint8_t value) +{ + int ret = 0; + ret = get_reg(sensor, reg, 0xff); + if(ret < 0){ + return ret; + } + uint8_t mask = ((1 << length) - 1) << offset; + value = (ret & ~mask) | ((value << offset) & mask); + ret = set_reg(sensor, reg & 0xFFFF, 0xFFFF, value); + return ret; +} + +#define WRITE_REGS_OR_RETURN(regs, regs_entry_len) ret = set_regs(sensor, regs, regs_entry_len); if(ret){return ret;} +#define WRITE_REG_OR_RETURN(reg, val) ret = set_reg(sensor, reg, 0xFF, val); if(ret){return ret;} +#define SET_REG_BITS_OR_RETURN(reg, offset, length, val) ret = set_reg_bits(sensor, reg, offset, length, val); if(ret){return ret;} + +static int set_hmirror(sensor_t *sensor, int enable) +{ + int ret = 0; + if(enable) { + SET_REG_BITS_OR_RETURN(0x3221, 1, 2, 0x3); // mirror on + } else { + SET_REG_BITS_OR_RETURN(0x3221, 1, 2, 0x0); // mirror off + } + + return ret; +} + +static int set_vflip(sensor_t *sensor, int enable) +{ + int ret = 0; + if(enable) { + SET_REG_BITS_OR_RETURN(0x3221, 5, 2, 0x3); // flip on + } else { + SET_REG_BITS_OR_RETURN(0x3221, 5, 2, 0x0); // flip off + } + + return ret; +} + +static int set_colorbar(sensor_t *sensor, int enable) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0100, 7, 1, enable & 0xff); // enable test pattern mode + + return ret; +} + +static int set_sharpness(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00e0, 1, 1, 1); // enable edge enhancement + WRITE_REG_OR_RETURN(0x00d0, level & 0xFF); // base value + WRITE_REG_OR_RETURN(0x00d2, (level >> 8) & 0xFF); // limit + + return ret; +} + +static int set_agc_gain(sensor_t *sensor, int gain) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0070, 1, 1, 1); // enable auto agc control + WRITE_REG_OR_RETURN(0x0068, gain & 0xFF); // Window weight setting1 + WRITE_REG_OR_RETURN(0x0069, (gain >> 8) & 0xFF); // Window weight setting2 + WRITE_REG_OR_RETURN(0x006a, (gain >> 16) & 0xFF); // Window weight setting3 + WRITE_REG_OR_RETURN(0x006b, (gain >> 24) & 0xFF); // Window weight setting4 + + return ret; +} + +static int set_aec_value(sensor_t *sensor, int value) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0070, 0, 1, 1); // enable auto aec control + WRITE_REG_OR_RETURN(0x0072, value & 0xFF); // AE target + + return ret; +} + +static int set_awb_gain(sensor_t *sensor, int value) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00b0, 0, 1, 1); // enable awb control + WRITE_REG_OR_RETURN(0x00c8, value & 
0xFF); // blue gain + WRITE_REG_OR_RETURN(0x00c9, (value>>8) & 0XFF); // red gain + return ret; +} + +static int set_saturation(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00f5, 5, 1, 0); // enable saturation control + WRITE_REG_OR_RETURN(0x0149, level & 0xFF); // blue saturation gain (/128) + WRITE_REG_OR_RETURN(0x014a, (level>>8) & 0XFF); // red saturation gain (/128) + return ret; +} + +static int set_contrast(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00f5, 6, 1, 0); // enable contrast control + WRITE_REG_OR_RETURN(0x014b, level); // contrast coefficient(/64) + return ret; +} + +static int reset(sensor_t *sensor) +{ + int ret = set_regs(sensor, sc030iot_default_init_regs, sizeof(sc030iot_default_init_regs)/(sizeof(uint8_t) * 2)); + + // Delay + vTaskDelay(50 / portTICK_PERIOD_MS); + + // ESP_LOGI(TAG, "set_reg=%0x", set_reg(sensor, 0x0100, 0xffff, 0x00)); // write 0x80 to enter test mode if you want to test the sensor + // ESP_LOGI(TAG, "0x0100=%0x", get_reg(sensor, 0x0100, 0xffff)); + if (ret) { + ESP_LOGE(TAG, "reset fail"); + } + return ret; +} + +static int set_window(sensor_t *sensor, int offset_x, int offset_y, int w, int h) +{ + int ret = 0; + //sc:H_start={0x0172[1:0],0x0170},H_end={0x0172[5:4],0x0171}, + WRITE_REG_OR_RETURN(0x0170, offset_x & 0xff); + WRITE_REG_OR_RETURN(0x0171, (offset_x+w) & 0xff); + WRITE_REG_OR_RETURN(0x0172, ((offset_x>>8) & 0x03) | (((offset_x+w)>>4)&0x30)); + + //sc:V_start={0x0175[1:0],0x0173},H_end={0x0175[5:4],0x0174}, + WRITE_REG_OR_RETURN(0x0173, offset_y & 0xff); + WRITE_REG_OR_RETURN(0x0174, (offset_y+h) & 0xff); + WRITE_REG_OR_RETURN(0x0175, ((offset_y>>8) & 0x03) | (((offset_y+h)>>4)&0x30)); + + vTaskDelay(10 / portTICK_PERIOD_MS); + + return ret; +} + +static int set_framesize(sensor_t *sensor, framesize_t framesize) +{ + uint16_t w = resolution[framesize].width; + uint16_t h = resolution[framesize].height; + if(w>SC030_MAX_FRAME_WIDTH || h > SC030_MAX_FRAME_HIGH) { + goto err; + } + + uint16_t offset_x = (640-w) /2; + uint16_t offset_y = (480-h) /2; + + if(set_window(sensor, offset_x, offset_y, w, h)) { + goto err; + } + + sensor->status.framesize = framesize; + return 0; +err: + ESP_LOGE(TAG, "frame size err"); + return -1; +} + +static int set_pixformat(sensor_t *sensor, pixformat_t pixformat) +{ + int ret=0; + sensor->pixformat = pixformat; + + switch (pixformat) { + case PIXFORMAT_RGB565: + case PIXFORMAT_RAW: + case PIXFORMAT_GRAYSCALE: + ESP_LOGE(TAG, "Not support"); + break; + case PIXFORMAT_YUV422: // For now, sc030/sc031 sensor only support YUV422. 
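+            // YUV422 is what the vendor init table already configures (type=2,
+            // i.e. YUV422, in the header comment of sc030iot_settings.h), so no
+            // register write is required here.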
+ break; + default: + return -1; + } + + return ret; +} + +static int init_status(sensor_t *sensor) +{ + return 0; +} + +static int set_dummy(sensor_t *sensor, int val){ return -1; } + +static int set_xclk(sensor_t *sensor, int timer, int xclk) +{ + int ret = 0; + sensor->xclk_freq_hz = xclk * 1000000U; + ret = xclk_timer_conf(timer, sensor->xclk_freq_hz); + return ret; +} + +int sc030iot_detect(int slv_addr, sensor_id_t *id) +{ + if (SC030IOT_SCCB_ADDR == slv_addr) { + uint8_t MIDL = SCCB_Read(slv_addr, SC030_SENSOR_ID_LOW_REG); + uint8_t MIDH = SCCB_Read(slv_addr, SC030_SENSOR_ID_HIGH_REG); + uint16_t PID = MIDH << 8 | MIDL; + if (SC030IOT_PID == PID) { + id->PID = PID; + return PID; + } else { + ESP_LOGI(TAG, "Mismatch PID=0x%x", PID); + } + } + return 0; +} + +int sc030iot_init(sensor_t *sensor) +{ + // Set function pointers + sensor->reset = reset; + sensor->init_status = init_status; + sensor->set_pixformat = set_pixformat; + sensor->set_framesize = set_framesize; + + sensor->set_saturation= set_saturation; + sensor->set_colorbar = set_colorbar; + sensor->set_hmirror = set_hmirror; + sensor->set_vflip = set_vflip; + sensor->set_sharpness = set_sharpness; + sensor->set_agc_gain = set_agc_gain; + sensor->set_aec_value = set_aec_value; + sensor->set_awb_gain = set_awb_gain; + sensor->set_contrast = set_contrast; + //not supported + sensor->set_denoise = set_dummy; + sensor->set_quality = set_dummy; + sensor->set_special_effect = set_dummy; + sensor->set_wb_mode = set_dummy; + sensor->set_ae_level = set_dummy; + + + sensor->get_reg = get_reg; + sensor->set_reg = set_reg; + sensor->set_xclk = set_xclk; + + ESP_LOGD(TAG, "sc030iot Attached"); + + return 0; +} \ No newline at end of file diff --git a/code/components/esp32-camera-master/sensors/sc101iot.c b/code/components/esp32-camera-master/sensors/sc101iot.c new file mode 100644 index 00000000..310a0476 --- /dev/null +++ b/code/components/esp32-camera-master/sensors/sc101iot.c @@ -0,0 +1,342 @@ +/* + * SC101IOT driver. + * + * Copyright 2020-2022 Espressif Systems (Shanghai) PTE LTD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#include +#include +#include +#include +#include "sccb.h" +#include "xclk.h" +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" + +#include "sc101iot.h" +#include "sc101iot_settings.h" + +#if defined(ARDUINO_ARCH_ESP32) && defined(CONFIG_ARDUHAL_ESP_LOG) +#include "esp32-hal-log.h" +#else +#include "esp_log.h" +static const char* TAG = "sc101"; +#endif + +#define SC101_SENSOR_ID_HIGH_REG 0XF7 +#define SC101_SENSOR_ID_LOW_REG 0XF8 +#define SC101_MAX_FRAME_WIDTH (1280) +#define SC101_MAX_FRAME_HIGH (720) + +// sc101 use "i2c paging mode", so the high byte of the register needs to be written to the 0xf0 reg. +// For more information please refer to the Technical Reference Manual. 
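+// set_reg_bits() below adds a read-modify-write on top of this paged access:
+// read the byte, clear the field mask ((1 << length) - 1) << offset, OR in the
+// shifted value and write it back, leaving neighbouring bit fields untouched.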
+static int get_reg(sensor_t *sensor, int reg, int mask) +{ + int ret = 0; + uint8_t reg_high = (reg>>8) & 0xFF; + uint8_t reg_low = reg & 0xFF; + + if(SCCB_Write(sensor->slv_addr, 0xf0, reg_high)) { + return -1; + } + + ret = SCCB_Read(sensor->slv_addr, reg_low); + if(ret > 0){ + ret &= mask; + } + return ret; +} + +// sc101 use "i2c paging mode", so the high byte of the register needs to be written to the 0xf0 reg. +// For more information please refer to the Technical Reference Manual. +static int set_reg(sensor_t *sensor, int reg, int mask, int value) +{ + int ret = 0; + uint8_t reg_high = (reg>>8) & 0xFF; + uint8_t reg_low = reg & 0xFF; + + if(SCCB_Write(sensor->slv_addr, 0xf0, reg_high)) { + return -1; + } + + ret = SCCB_Write(sensor->slv_addr, reg_low, value & 0xFF); + return ret; +} + +static int set_regs(sensor_t *sensor, const uint8_t (*regs)[2], uint32_t regs_entry_len) +{ + int i=0, res = 0; + while (islv_addr, regs[i][0], regs[i][1]); + if (res) { + return res; + } + i++; + } + return res; +} + +static int set_reg_bits(sensor_t *sensor, int reg, uint8_t offset, uint8_t length, uint8_t value) +{ + int ret = 0; + ret = get_reg(sensor, reg, 0xff); + if(ret < 0){ + return ret; + } + uint8_t mask = ((1 << length) - 1) << offset; + value = (ret & ~mask) | ((value << offset) & mask); + ret = set_reg(sensor, reg & 0xFFFF, 0xFFFF, value); + return ret; +} + +#define WRITE_REGS_OR_RETURN(regs, regs_entry_len) ret = set_regs(sensor, regs, regs_entry_len); if(ret){return ret;} +#define WRITE_REG_OR_RETURN(reg, val) ret = set_reg(sensor, reg, 0xFF, val); if(ret){return ret;} +#define SET_REG_BITS_OR_RETURN(reg, offset, length, val) ret = set_reg_bits(sensor, reg, offset, length, val); if(ret){return ret;} + +static int set_hmirror(sensor_t *sensor, int enable) +{ + int ret = 0; + if(enable) { + SET_REG_BITS_OR_RETURN(0x3221, 1, 2, 0x3); // enable mirror + } else { + SET_REG_BITS_OR_RETURN(0x3221, 1, 2, 0x0); // disable mirror + } + + return ret; +} + +static int set_vflip(sensor_t *sensor, int enable) +{ + int ret = 0; + if(enable) { + SET_REG_BITS_OR_RETURN(0x3221, 5, 2, 0x3); // flip on + } else { + SET_REG_BITS_OR_RETURN(0x3221, 5, 2, 0x0); // flip off + } + + return ret; +} + +static int set_colorbar(sensor_t *sensor, int enable) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0100, 7, 1, enable & 0xff); // enable colorbar mode + return ret; +} + +static int set_raw_gma(sensor_t *sensor, int enable) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00f5, 1, 1, enable & 0xff); // enable gamma compensation + + return ret; +} + +static int set_sharpness(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00e0, 1, 1, 1); // enable edge enhancement + WRITE_REG_OR_RETURN(0x00d0, level & 0xFF); // base value + WRITE_REG_OR_RETURN(0x00d2, (level >> 8) & 0xFF); // limit + + return ret; +} + +static int set_agc_gain(sensor_t *sensor, int gain) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0070, 1, 1, 1); // enable auto agc control + WRITE_REG_OR_RETURN(0x0068, gain & 0xFF); // Window weight setting1 + WRITE_REG_OR_RETURN(0x0069, (gain >> 8) & 0xFF); // Window weight setting2 + WRITE_REG_OR_RETURN(0x006a, (gain >> 16) & 0xFF); // Window weight setting3 + WRITE_REG_OR_RETURN(0x006b, (gain >> 24) & 0xFF); // Window weight setting4 + + return ret; +} + +static int set_aec_value(sensor_t *sensor, int value) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x0070, 0, 1, 1); // enable auto aec control + WRITE_REG_OR_RETURN(0x0072, value & 0xFF); // AE target + + return ret; +} + +static int 
set_awb_gain(sensor_t *sensor, int value) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00b0, 0, 1, 1); // enable awb control + WRITE_REG_OR_RETURN(0x00c8, value & 0xFF); // blue gain + WRITE_REG_OR_RETURN(0x00c9, (value>>8) & 0XFF); // red gain + return ret; +} + +static int set_saturation(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00f5, 5, 1, 0); // enable saturation control + WRITE_REG_OR_RETURN(0x0149, level & 0xFF); // blue saturation gain (/128) + WRITE_REG_OR_RETURN(0x014a, (level>>8) & 0XFF); // red saturation gain (/128) + return ret; +} + +static int set_contrast(sensor_t *sensor, int level) +{ + int ret = 0; + SET_REG_BITS_OR_RETURN(0x00f5, 6, 1, 0); // enable contrast control + WRITE_REG_OR_RETURN(0x014b, level); // contrast coefficient(/64) + return ret; +} + +static int reset(sensor_t *sensor) +{ + int ret = set_regs(sensor, sc101iot_default_init_regs, sizeof(sc101iot_default_init_regs)/(sizeof(uint8_t) * 2)); + + // Delay + vTaskDelay(50 / portTICK_PERIOD_MS); + + // ESP_LOGI(TAG, "set_reg=%0x", set_reg(sensor, 0x0100, 0xffff, 0x00)); // write 0x80 to enter test mode if you want to test the sensor + // ESP_LOGI(TAG, "0x0100=%0x", get_reg(sensor, 0x0100, 0xffff)); + if (ret) { + ESP_LOGE(TAG, "reset fail"); + } + return ret; +} + +static int set_window(sensor_t *sensor, int offset_x, int offset_y, int w, int h) +{ + int ret = 0; + //sc:H_start={0x0172[3:0],0x0170},H_end={0x0172[7:4],0x0171}, + WRITE_REG_OR_RETURN(0x0170, offset_x & 0xff); + WRITE_REG_OR_RETURN(0x0171, (offset_x+w) & 0xff); + WRITE_REG_OR_RETURN(0x0172, ((offset_x>>8) & 0x0f) | (((offset_x+w)>>4)&0xf0)); + + //sc:V_start={0x0175[3:0],0x0173},H_end={0x0175[7:4],0x0174}, + WRITE_REG_OR_RETURN(0x0173, offset_y & 0xff); + WRITE_REG_OR_RETURN(0x0174, (offset_y+h) & 0xff); + WRITE_REG_OR_RETURN(0x0175, ((offset_y>>8) & 0x0f) | (((offset_y+h)>>4)&0xf0)); + + vTaskDelay(10 / portTICK_PERIOD_MS); + + return ret; +} + +static int set_framesize(sensor_t *sensor, framesize_t framesize) +{ + uint16_t w = resolution[framesize].width; + uint16_t h = resolution[framesize].height; + if(w>SC101_MAX_FRAME_WIDTH || h > SC101_MAX_FRAME_HIGH) { + goto err; + } + + uint16_t offset_x = (SC101_MAX_FRAME_WIDTH-w) /2; + uint16_t offset_y = (SC101_MAX_FRAME_HIGH-h) /2; + + if(set_window(sensor, offset_x, offset_y, w, h)) { + goto err; + } + + sensor->status.framesize = framesize; + return 0; +err: + ESP_LOGE(TAG, "frame size err"); + return -1; +} + +static int set_pixformat(sensor_t *sensor, pixformat_t pixformat) +{ + int ret=0; + sensor->pixformat = pixformat; + + switch (pixformat) { + case PIXFORMAT_RGB565: + case PIXFORMAT_RAW: + case PIXFORMAT_GRAYSCALE: + ESP_LOGE(TAG, "Not support"); + break; + case PIXFORMAT_YUV422: // For now, sc101 sensor only support YUV422. 
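+            // As with sc030iot, the init table already outputs YUV422, so this
+            // format is accepted without touching the sensor registers.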
+ break; + default: + ret = -1; + } + + return ret; +} + +static int init_status(sensor_t *sensor) +{ + return 0; +} + +static int set_dummy(sensor_t *sensor, int val){ return -1; } + +static int set_xclk(sensor_t *sensor, int timer, int xclk) +{ + int ret = 0; + sensor->xclk_freq_hz = xclk * 1000000U; + ret = xclk_timer_conf(timer, sensor->xclk_freq_hz); + return ret; +} + +int sc101iot_detect(int slv_addr, sensor_id_t *id) +{ + if (SC101IOT_SCCB_ADDR == slv_addr) { + uint8_t MIDL = SCCB_Read(slv_addr, SC101_SENSOR_ID_LOW_REG); + uint8_t MIDH = SCCB_Read(slv_addr, SC101_SENSOR_ID_HIGH_REG); + uint16_t PID = MIDH << 8 | MIDL; + if (SC101IOT_PID == PID) { + id->PID = PID; + return PID; + } else { + ESP_LOGI(TAG, "Mismatch PID=0x%x", PID); + } + } + return 0; +} + +int sc101iot_init(sensor_t *sensor) +{ + // Set function pointers + sensor->reset = reset; + sensor->init_status = init_status; + sensor->set_pixformat = set_pixformat; + sensor->set_framesize = set_framesize; + sensor->set_hmirror = set_hmirror; + sensor->set_vflip = set_vflip; + sensor->set_colorbar = set_colorbar; + sensor->set_raw_gma = set_raw_gma; + sensor->set_sharpness = set_sharpness; + sensor->set_agc_gain = set_agc_gain; + sensor->set_aec_value = set_aec_value; + sensor->set_awb_gain = set_awb_gain; + sensor->set_saturation= set_saturation; + sensor->set_contrast = set_contrast; + + sensor->set_denoise = set_dummy; + sensor->set_quality = set_dummy; + sensor->set_special_effect = set_dummy; + sensor->set_wb_mode = set_dummy; + sensor->set_ae_level = set_dummy; + + + sensor->get_reg = get_reg; + sensor->set_reg = set_reg; + sensor->set_xclk = set_xclk; + + ESP_LOGD(TAG, "sc101iot Attached"); + + return 0; +} \ No newline at end of file diff --git a/code/components/esp32-camera-master/target/esp32/ll_cam.c b/code/components/esp32-camera-master/target/esp32/ll_cam.c index d0f0c862..1e3def87 100644 --- a/code/components/esp32-camera-master/target/esp32/ll_cam.c +++ b/code/components/esp32-camera-master/target/esp32/ll_cam.c @@ -34,10 +34,14 @@ static inline int gpio_ll_get_level(gpio_dev_t *hw, int gpio_num) #include "xclk.h" #include "cam_hal.h" +#if (ESP_IDF_VERSION_MAJOR >= 4) && (ESP_IDF_VERSION_MINOR >= 3) +#include "esp_rom_gpio.h" +#endif + #if (ESP_IDF_VERSION_MAJOR >= 5) #define GPIO_PIN_INTR_POSEDGE GPIO_INTR_POSEDGE #define GPIO_PIN_INTR_NEGEDGE GPIO_INTR_NEGEDGE -#define gpio_matrix_in(a,b,c) gpio_iomux_in(a,b) +#define gpio_matrix_in(a,b,c) esp_rom_gpio_connect_in_signal(a,b,c) #endif static const char *TAG = "esp32 ll_cam"; @@ -233,7 +237,7 @@ static void IRAM_ATTR ll_cam_dma_isr(void *arg) //DBG_PIN_SET(0); } -bool ll_cam_stop(cam_obj_t *cam) +bool IRAM_ATTR ll_cam_stop(cam_obj_t *cam) { I2S0.conf.rx_start = 0; I2S_ISR_DISABLE(in_suc_eof); diff --git a/code/components/esp32-camera-master/target/esp32s2/ll_cam.c b/code/components/esp32-camera-master/target/esp32s2/ll_cam.c index e54b81f0..54764329 100644 --- a/code/components/esp32-camera-master/target/esp32s2/ll_cam.c +++ b/code/components/esp32-camera-master/target/esp32s2/ll_cam.c @@ -21,10 +21,15 @@ #include "xclk.h" #include "cam_hal.h" +#if (ESP_IDF_VERSION_MAJOR >= 4) && (ESP_IDF_VERSION_MINOR >= 3) +#include "esp_rom_gpio.h" +#endif + #if (ESP_IDF_VERSION_MAJOR >= 5) #define GPIO_PIN_INTR_POSEDGE GPIO_INTR_POSEDGE #define GPIO_PIN_INTR_NEGEDGE GPIO_INTR_NEGEDGE -#define gpio_matrix_in(a,b,c) gpio_iomux_in(a,b) +#define gpio_matrix_in(a,b,c) esp_rom_gpio_connect_in_signal(a,b,c) +#define ets_delay_us(a) esp_rom_delay_us(a) #endif static const char *TAG 
= "s2 ll_cam"; @@ -70,7 +75,7 @@ static void IRAM_ATTR ll_cam_dma_isr(void *arg) } } -bool ll_cam_stop(cam_obj_t *cam) +bool IRAM_ATTR ll_cam_stop(cam_obj_t *cam) { I2S0.conf.rx_start = 0; diff --git a/code/components/esp32-camera-master/target/esp32s3/ll_cam.c b/code/components/esp32-camera-master/target/esp32s3/ll_cam.c index ce405d16..2211a0ed 100644 --- a/code/components/esp32-camera-master/target/esp32s3/ll_cam.c +++ b/code/components/esp32-camera-master/target/esp32s3/ll_cam.c @@ -22,10 +22,15 @@ #include "soc/gdma_reg.h" #include "ll_cam.h" #include "cam_hal.h" +#include "esp_rom_gpio.h" #if (ESP_IDF_VERSION_MAJOR >= 5) -#define gpio_matrix_in(a,b,c) gpio_iomux_in(a,b) -#define gpio_matrix_out(a,b,c,d) gpio_iomux_out(a,b,c) +#include "soc/gpio_sig_map.h" +#include "soc/gpio_periph.h" +#include "soc/io_mux_reg.h" +#define gpio_matrix_in(a,b,c) esp_rom_gpio_connect_in_signal(a,b,c) +#define gpio_matrix_out(a,b,c,d) esp_rom_gpio_connect_out_signal(a,b,c,d) +#define ets_delay_us(a) esp_rom_delay_us(a) #endif static const char *TAG = "s3 ll_cam"; @@ -74,7 +79,7 @@ static void IRAM_ATTR ll_cam_dma_isr(void *arg) } } -bool ll_cam_stop(cam_obj_t *cam) +bool IRAM_ATTR ll_cam_stop(cam_obj_t *cam) { if (cam->jpeg_mode || !cam->psram_mode) { GDMA.channel[cam->dma_num].in.int_ena.in_suc_eof = 0; @@ -170,6 +175,7 @@ static esp_err_t ll_cam_dma_init(cam_obj_t *cam) } GDMA.channel[cam->dma_num].in.conf1.in_check_owner = 0; + // GDMA.channel[cam->dma_num].in.conf1.in_ext_mem_bk_size = 2; GDMA.channel[cam->dma_num].in.peri_sel.sel = 5; //GDMA.channel[cam->dma_num].in.pri.rx_pri = 1;//rx prio 0-15 @@ -178,8 +184,52 @@ static esp_err_t ll_cam_dma_init(cam_obj_t *cam) return ESP_OK; } +#if CONFIG_CAMERA_CONVERTER_ENABLED +static esp_err_t ll_cam_converter_config(cam_obj_t *cam, const camera_config_t *config) +{ + esp_err_t ret = ESP_OK; + + switch (config->conv_mode) { + case YUV422_TO_YUV420: + if (config->pixel_format != PIXFORMAT_YUV422) { + ret = ESP_FAIL; + } else { + ESP_LOGI(TAG, "YUV422 to YUV420 mode"); + LCD_CAM.cam_rgb_yuv.cam_conv_yuv2yuv_mode = 1; + LCD_CAM.cam_rgb_yuv.cam_conv_yuv_mode = 0; + LCD_CAM.cam_rgb_yuv.cam_conv_trans_mode = 1; + } + break; + case YUV422_TO_RGB565: + if (config->pixel_format != PIXFORMAT_YUV422) { + ret = ESP_FAIL; + } else { + ESP_LOGI(TAG, "YUV422 to RGB565 mode"); + LCD_CAM.cam_rgb_yuv.cam_conv_yuv2yuv_mode = 3; + LCD_CAM.cam_rgb_yuv.cam_conv_yuv_mode = 0; + LCD_CAM.cam_rgb_yuv.cam_conv_trans_mode = 0; + } + break; + default: + break; + } +#if CONFIG_LCD_CAM_CONV_BT709_ENABLED + LCD_CAM.cam_rgb_yuv.cam_conv_protocol_mode = 1; +#else + LCD_CAM.cam_rgb_yuv.cam_conv_protocol_mode = 0; +#endif + LCD_CAM.cam_rgb_yuv.cam_conv_data_out_mode = 0; + LCD_CAM.cam_rgb_yuv.cam_conv_data_in_mode = 0; + LCD_CAM.cam_rgb_yuv.cam_conv_mode_8bits_on = 1; + LCD_CAM.cam_rgb_yuv.cam_conv_bypass = 1; + cam->conv_mode = config->conv_mode; + return ret; +} +#endif + esp_err_t ll_cam_config(cam_obj_t *cam, const camera_config_t *config) { + esp_err_t ret = ESP_OK; if (REG_GET_BIT(SYSTEM_PERIP_CLK_EN1_REG, SYSTEM_LCD_CAM_CLK_EN) == 0) { REG_CLR_BIT(SYSTEM_PERIP_CLK_EN1_REG, SYSTEM_LCD_CAM_CLK_EN); REG_SET_BIT(SYSTEM_PERIP_CLK_EN1_REG, SYSTEM_LCD_CAM_CLK_EN); @@ -215,15 +265,21 @@ esp_err_t ll_cam_config(cam_obj_t *cam, const camera_config_t *config) LCD_CAM.cam_rgb_yuv.val = 0; +#if CONFIG_CAMERA_CONVERTER_ENABLED + if (config->conv_mode) { + ret = ll_cam_converter_config(cam, config); + if(ret != ESP_OK) { + return ret; + } + } +#endif + LCD_CAM.cam_ctrl.cam_update = 1; 
LCD_CAM.cam_ctrl1.cam_start = 1; - esp_err_t err = ll_cam_dma_init(cam); - if(err != ESP_OK) { - return err; - } + ret = ll_cam_dma_init(cam); - return ESP_OK; + return ret; } void ll_cam_vsync_intr_enable(cam_obj_t *cam, bool en) @@ -417,6 +473,7 @@ size_t IRAM_ATTR ll_cam_memcpy(cam_obj_t *cam, uint8_t *out, const uint8_t *in, } return len / 2; } + // just memcpy memcpy(out, in, len); @@ -433,8 +490,22 @@ esp_err_t ll_cam_set_sample_mode(cam_obj_t *cam, pixformat_t pix_format, uint32_ } cam->fb_bytes_per_pixel = 1; // frame buffer stores Y8 } else if (pix_format == PIXFORMAT_YUV422 || pix_format == PIXFORMAT_RGB565) { - cam->in_bytes_per_pixel = 2; // camera sends YU/YV +#if CONFIG_CAMERA_CONVERTER_ENABLED + switch (cam->conv_mode) { + case YUV422_TO_YUV420: + cam->in_bytes_per_pixel = 1.5; // for DMA receive + cam->fb_bytes_per_pixel = 1.5; // frame buffer stores YUV420 + break; + case YUV422_TO_RGB565: + default: + cam->in_bytes_per_pixel = 2; // for DMA receive cam->fb_bytes_per_pixel = 2; // frame buffer stores YU/YV/RGB565 + break; + } +#else + cam->in_bytes_per_pixel = 2; // for DMA receive + cam->fb_bytes_per_pixel = 2; // frame buffer stores YU/YV/RGB565 +#endif } else if (pix_format == PIXFORMAT_JPEG) { cam->in_bytes_per_pixel = 1; cam->fb_bytes_per_pixel = 1; diff --git a/code/components/esp32-camera-master/target/private_include/ll_cam.h b/code/components/esp32-camera-master/target/private_include/ll_cam.h index 7d30c370..c27db0c4 100644 --- a/code/components/esp32-camera-master/target/private_include/ll_cam.h +++ b/code/components/esp32-camera-master/target/private_include/ll_cam.h @@ -116,8 +116,14 @@ typedef struct { //for RGB/YUV modes uint16_t width; uint16_t height; +#if CONFIG_CAMERA_CONVERTER_ENABLED + float in_bytes_per_pixel; + float fb_bytes_per_pixel; + camera_conv_mode_t conv_mode; +#else uint8_t in_bytes_per_pixel; uint8_t fb_bytes_per_pixel; +#endif uint32_t fb_size; cam_state_t state; diff --git a/code/components/esp32-camera-master_old_version.zip b/code/components/esp32-camera-master_old_version.zip deleted file mode 100644 index c0c60f8f..00000000 Binary files a/code/components/esp32-camera-master_old_version.zip and /dev/null differ diff --git a/code/components/jomjol_flowcontroll/CMakeLists.txt b/code/components/jomjol_flowcontroll/CMakeLists.txt index 8a066910..6ee66829 100644 --- a/code/components/jomjol_flowcontroll/CMakeLists.txt +++ b/code/components/jomjol_flowcontroll/CMakeLists.txt @@ -2,6 +2,6 @@ FILE(GLOB_RECURSE app_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.*) idf_component_register(SRCS ${app_sources} INCLUDE_DIRS "." 
- REQUIRES jomjol_tfliteclass jomjol_helper jomjol_controlcamera jomjol_mqtt jomjol_fileserver_ota jomjol_image_proc jomjol_wlan) + REQUIRES jomjol_tfliteclass jomjol_helper jomjol_controlcamera jomjol_mqtt jomjol_influxdb jomjol_fileserver_ota jomjol_image_proc jomjol_wlan) diff --git a/code/components/jomjol_flowcontroll/ClassFlow.cpp b/code/components/jomjol_flowcontroll/ClassFlow.cpp index ff14c1b2..f15844d5 100644 --- a/code/components/jomjol_flowcontroll/ClassFlow.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlow.cpp @@ -19,7 +19,6 @@ void ClassFlow::SetInitialParameter(void) std::vector ClassFlow::ZerlegeZeile(std::string input, std::string delimiter) { std::vector Output; -// std::string delimiter = " =,"; input = trim(input, delimiter); size_t pos = findDelimiterPos(input, delimiter); diff --git a/code/components/jomjol_flowcontroll/ClassFlow.h b/code/components/jomjol_flowcontroll/ClassFlow.h index 4df4777c..92184d32 100644 --- a/code/components/jomjol_flowcontroll/ClassFlow.h +++ b/code/components/jomjol_flowcontroll/ClassFlow.h @@ -26,7 +26,6 @@ struct HTMLInfo class ClassFlow { protected: -// std::vector ZerlegeZeile(string input); std::vector ZerlegeZeile(string input, string delimiter = " =, \t"); bool isNewParagraph(string input); bool GetNextParagraph(FILE* pfile, string& aktparamgraph); diff --git a/code/components/jomjol_flowcontroll/ClassFlowAlignment.cpp b/code/components/jomjol_flowcontroll/ClassFlowAlignment.cpp index 164d83cf..8e76c720 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowAlignment.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowAlignment.cpp @@ -19,6 +19,7 @@ void ClassFlowAlignment::SetInitialParameter(void) initalrotate = 0; anz_ref = 0; initialmirror = false; + use_antialiasing = false; initialflip = false; SaveAllFiles = false; namerawimage = "/sdcard/img_tmp/raw.jpg"; @@ -94,7 +95,12 @@ bool ClassFlowAlignment::ReadParameter(FILE* pfile, string& aktparamgraph) if ((toUpper(zerlegt[0]) == "SEARCHFIELDY") && (zerlegt.size() > 1)) { suchey = std::stod(zerlegt[1]); - } + } + if ((toUpper(zerlegt[0]) == "ANTIALIASING") && (zerlegt.size() > 1)) + { + if (toUpper(zerlegt[1]) == "TRUE") + use_antialiasing = true; + } if ((zerlegt.size() == 3) && (anz_ref < 2)) { References[anz_ref].image_file = FormatFileName("/sdcard" + zerlegt[0]); @@ -175,7 +181,10 @@ bool ClassFlowAlignment::doFlow(string time) if ((initalrotate != 0) || initialflip) { - rt.Rotate(initalrotate); + if (use_antialiasing) + rt.RotateAntiAliasing(initalrotate); + else + rt.Rotate(initalrotate); if (SaveAllFiles) AlignAndCutImage->SaveToFile(FormatFileName("/sdcard/img_tmp/rot.jpg")); } diff --git a/code/components/jomjol_flowcontroll/ClassFlowAlignment.h b/code/components/jomjol_flowcontroll/ClassFlowAlignment.h index 180dc7f6..7e1efef7 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowAlignment.h +++ b/code/components/jomjol_flowcontroll/ClassFlowAlignment.h @@ -16,6 +16,7 @@ protected: float initalrotate; bool initialmirror; bool initialflip; + bool use_antialiasing; RefInfo References[2]; int anz_ref; string namerawimage; diff --git a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp index 6f0f32da..a3d47753 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp @@ -17,6 +17,7 @@ ClassFlowCNNGeneral::ClassFlowCNNGeneral(ClassFlowAlignment *_flowalign, t_CNNTy string cnnmodelfile = ""; modelxsize = 1; 
modelysize = 1;
+    CNNGoodThreshold = 0.0;
     ListFlowControll = NULL;
     previousElement = NULL;
     SaveAllFiles = false;
@@ -27,20 +28,21 @@ ClassFlowCNNGeneral::ClassFlowCNNGeneral(ClassFlowAlignment *_flowalign, t_CNNTy
     flowpostalignment = _flowalign;
 }

-string ClassFlowCNNGeneral::getReadout(int _analog = 0, bool _extendedResolution = false)
+string ClassFlowCNNGeneral::getReadout(int _analog, bool _extendedResolution, int prev)
 {
     string result = "";

+    if (GENERAL[_analog]->ROI.size() == 0) return result;
+    if (debugdetailgeneral) LogFile.WriteToFile("ClassFlowCNNGeneral::getReadout _analog=" + std::to_string(_analog) + ", _extendedResolution=" + std::to_string(_extendedResolution) + ", prev=" + std::to_string(prev));

-    if (CNNType == Analogue)
+    if (CNNType == Analogue || CNNType == Analogue100)
     {
         float zahl = GENERAL[_analog]->ROI[GENERAL[_analog]->ROI.size() - 1]->result_float;
         int ergebnis_nachkomma = ((int) floor(zahl * 10) + 10) % 10;
-
-        int prev = -1;
-
+        prev = ZeigerEval(GENERAL[_analog]->ROI[GENERAL[_analog]->ROI.size() - 1]->result_float, prev);
+
+        if (debugdetailgeneral) LogFile.WriteToFile("ClassFlowCNNGeneral::getReadout(analog) zahl=" + std::to_string(zahl) + ", ergebnis_nachkomma=" + std::to_string(ergebnis_nachkomma) + ", prev=" + std::to_string(prev));
         result = std::to_string(prev);

         if (_extendedResolution && (CNNType != Digital))
@@ -66,7 +68,58 @@ string ClassFlowCNNGeneral::getReadout(int _analog = 0, bool _extendedResolution
         return result;
     }

-    if (CNNType == DigitalHyprid)
+    if ((CNNType == DoubleHyprid10) || (CNNType == Digital100))
+    {
+
+        float zahl = GENERAL[_analog]->ROI[GENERAL[_analog]->ROI.size() - 1]->result_float;
+        if (zahl >= 0) // NaN?
+        {
+            if (_extendedResolution) // only set if this is the first digit (no analog ROI before it!)
+            {
+                int ergebnis_nachkomma = ((int) floor(zahl * 10)) % 10;
+                int ergebnis_vorkomma = ((int) floor(zahl)) % 10;
+
+                result = std::to_string(ergebnis_vorkomma) + std::to_string(ergebnis_nachkomma);
+                prev = ergebnis_vorkomma;
+                if (debugdetailgeneral) LogFile.WriteToFile("ClassFlowCNNGeneral::getReadout(dig100-ext) ergebnis_vorkomma=" + std::to_string(ergebnis_vorkomma) + ", ergebnis_nachkomma=" + std::to_string(ergebnis_nachkomma) + ", prev=" + std::to_string(prev));
+
+            }
+            else
+            {
+//                prev = ZeigerEval(GENERAL[_analog]->ROI[GENERAL[_analog]->ROI.size() - 1]->result_float, prev);
+                prev = ZeigerEvalHybrid(GENERAL[_analog]->ROI[GENERAL[_analog]->ROI.size() - 1]->result_float, prev, prev);
+                result = std::to_string(prev);
+                if (debugdetailgeneral) LogFile.WriteToFile("ClassFlowCNNGeneral::getReadout(dig100) prev=" + std::to_string(prev));
+
+            }
+        }
+        else
+        {
+            result = "N";
+            if (_extendedResolution && (CNNType != Digital))
+                result = "NN";
+        }
+
+        for (int i = GENERAL[_analog]->ROI.size() - 2; i >= 0; --i)
+        {
+            if (GENERAL[_analog]->ROI[i]->result_float >= 0)
+            {
+                prev = ZeigerEvalHybrid(GENERAL[_analog]->ROI[i]->result_float, GENERAL[_analog]->ROI[i+1]->result_float, prev);
+                result = std::to_string(prev) + result;
+            }
+            else
+            {
+                prev = -1;
+                result = "N" + result;
+            }
+        }
+        return result;
+    }
+
+/*
+    if (CNNType == Digital100)
     {
         int zif_akt = -1;
@@ -109,6 +162,7 @@ string ClassFlowCNNGeneral::getReadout(int _analog = 0, bool _extendedResolution
         }
         return result;
     }
+*/

     return result;
 }
@@ -116,8 +170,10 @@ string ClassFlowCNNGeneral::getReadout(int _analog = 0, bool _extendedResolution
 int ClassFlowCNNGeneral::ZeigerEvalHybrid(float zahl, float zahl_vorgaenger, int eval_vorgaenger)
 {
     int ergebnis_nachkomma = ((int) floor(zahl * 10)) % 10;
+    int ergebnis_vorkomma = ((int) floor(zahl) + 10) % 10;

-    if (zahl_vorgaenger < 0) // keine Vorzahl vorhanden !!! --> Runde die Zahl
+
+    if (eval_vorgaenger < 0) // no previous digit available !!! --> just round the number
     {
         if ((ergebnis_nachkomma <= 2) || (ergebnis_nachkomma >= 8)) // Band um die Ziffer --> Runden, da Ziffer im Rahmen Ungenauigkeit erreicht
             return ((int) round(zahl) + 10) % 10;
@@ -125,6 +181,34 @@ int ClassFlowCNNGeneral::ZeigerEvalHybrid(float zahl, float zahl_vorgaenger, int
             return ((int) trunc(zahl) + 10) % 10;
     }

+    if ((zahl_vorgaenger >= 0.5 ) && (zahl_vorgaenger < 9.5))
+    {
+        // no digit transition, since the previous digit is far enough away from the rollover (0 +/- 0.5) --> round the number
+        return ((int) round(zahl) + 10) % 10;
+    }
+    else
+    {
+        if (eval_vorgaenger <= 1) // zero crossing has already happened (judged via the evaluated previous digit, not the raw value!) --> round up here (2.8 --> 3, but also 3.1 --> 3)
+        {
+            if (ergebnis_nachkomma > 5)
+                return (ergebnis_vorkomma + 1) % 10;
+            else
+                return ergebnis_vorkomma;
+        }
+        else // only >= 9.5 remains --> no zero crossing yet --> 2.8 --> 2, and 3.1 --> 2
+        {
+            // Reduced to 4 here because the transition only starts once the predecessor reaches 9.
+            // With a predecessor of 9.5 the current digit can still read x.4 - x.5.
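+            // Worked example (illustrative values): zahl = 2.8, zahl_vorgaenger = 9.6,
+            // eval_vorgaenger = 9 lands here: ergebnis_nachkomma = 8 >= 4, so the
+            // digit stays at ergebnis_vorkomma = 2. Had the predecessor already
+            // crossed zero (eval_vorgaenger <= 1), 2.8 would round up to 3 instead.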
+            if (ergebnis_nachkomma >= 4)
+                return ergebnis_vorkomma;
+            else
+                return (ergebnis_vorkomma - 1 + 10) % 10;
+        }
+    }
+
+    return -1;
+
+/*
     if (zahl_vorgaenger > 9.2) // Ziffernwechsel beginnt
     {
         if (eval_vorgaenger == 0) // Wechsel hat schon stattgefunden
@@ -151,20 +235,30 @@ int ClassFlowCNNGeneral::ZeigerEvalHybrid(float zahl, float zahl_vorgaenger, int
         return ((int) round(zahl) + 10) % 10;

     return ((int) trunc(zahl) + 10) % 10;
+*/
 }

+
+
 int ClassFlowCNNGeneral::ZeigerEval(float zahl, int ziffer_vorgaenger)
-{
+{
     int ergebnis_nachkomma = ((int) floor(zahl * 10) + 10) % 10;
     int ergebnis_vorkomma = ((int) floor(zahl) + 10) % 10;
-    int ergebnis, ergebnis_rating;
+    int ergebnis;
+    float ergebnis_rating;
+
+    if (debugdetailgeneral) LogFile.WriteToFile("ClassFlowCNNGeneral::ZeigerEval erg_v=" + std::to_string(ergebnis_vorkomma) + ", erg_n=" + std::to_string(ergebnis_nachkomma) + ", ziff_v=" + std::to_string(ziffer_vorgaenger));

     if (ziffer_vorgaenger == -1)
         return ergebnis_vorkomma % 10;

+    // Has the current digit already rolled over while the previous one has not?
+    // Current: 2.1, previous digit = 0.9 => should read 1.9
+    // The problem is cascaded rounding,
+    // e.g. zahl=4.5, predecessor= 9.6 (ziffer_vorgaenger=0)
+    // Only occurs at the transition from analog to digit ROIs
     ergebnis_rating = ergebnis_nachkomma - ziffer_vorgaenger;
     if (ergebnis_nachkomma >= 5)
-        ergebnis_rating-=5;
+        ergebnis_rating-=5.1;
     else
         ergebnis_rating+=5;
     ergebnis = (int) round(zahl);
@@ -172,7 +266,7 @@ int ClassFlowCNNGeneral::ZeigerEval(float zahl, int ziffer_vorgaenger)
         ergebnis-=1;
     if (ergebnis == -1)
         ergebnis+=10;
-
+    ergebnis = (ergebnis + 10) % 10;
     return ergebnis;
 }
@@ -206,12 +300,12 @@ bool ClassFlowCNNGeneral::ReadParameter(FILE* pfile, string& aktparamgraph)
     while (this->getNextLine(pfile, &aktparamgraph) && !this->isNewParagraph(aktparamgraph))
     {
         zerlegt = this->ZerlegeZeile(aktparamgraph);
-        if ((zerlegt[0] == "LogImageLocation") && (zerlegt.size() > 1))
+        if ((toUpper(zerlegt[0]) == "LOGIMAGELOCATION") && (zerlegt.size() > 1))
         {
             this->LogImageLocation = "/sdcard" + zerlegt[1];
             this->isLogImage = true;
         }
-        if ((zerlegt[0] == "LogImageSelect") && (zerlegt.size() > 1))
+        if ((toUpper(zerlegt[0]) == "LOGIMAGESELECT") && (zerlegt.size() > 1))
         {
             LogImageSelect = zerlegt[1];
             isLogImageSelect = true;
@@ -221,20 +315,20 @@ bool ClassFlowCNNGeneral::ReadParameter(FILE* pfile, string& aktparamgraph)
         {
             this->logfileRetentionInDays = std::stoi(zerlegt[1]);
         }
-        if ((toUpper(zerlegt[0]) == "MODELTYPE") && (zerlegt.size() > 1))
-        {
-            if (toUpper(zerlegt[1]) == "DIGITHYPRID")
-                CNNType = DigitalHyprid;
-        }
+//        if ((toUpper(zerlegt[0]) == "MODELTYPE") && (zerlegt.size() > 1))
+//        {
+//            if (toUpper(zerlegt[1]) == "DIGITHYPRID")
+//                CNNType = DigitalHyprid;
+//        }

-        if ((zerlegt[0] == "Model") && (zerlegt.size() > 1))
+        if ((toUpper(zerlegt[0]) == "MODEL") && (zerlegt.size() > 1))
         {
             this->cnnmodelfile = zerlegt[1];
         }
-        if ((zerlegt[0] == "ModelInputSize") && (zerlegt.size() > 2))
+
+        if ((toUpper(zerlegt[0]) == "CNNGOODTHRESHOLD") && (zerlegt.size() > 1))
         {
-            this->modelxsize = std::stoi(zerlegt[1]);
-            this->modelysize = std::stoi(zerlegt[2]);
+            CNNGoodThreshold = std::stof(zerlegt[1]);
         }
         if (zerlegt.size() >= 5)
         {
@@ -256,11 +350,14 @@ bool ClassFlowCNNGeneral::ReadParameter(FILE* pfile, string& aktparamgraph)
         }
     }

+    if (!getNetworkParameter())
+        return false;

-    for (int _ana = 0; _ana < GENERAL.size(); ++_ana)
+
+    for (int _ana = 0; _ana < GENERAL.size(); ++_ana)
         for (int i = 0; i < GENERAL[_ana]->ROI.size(); ++i)
         {
-            GENERAL[_ana]->ROI[i]->image = new
CImageBasis(modelxsize, modelysize, 3); + GENERAL[_ana]->ROI[i]->image = new CImageBasis(modelxsize, modelysize, modelchannel); GENERAL[_ana]->ROI[i]->image_org = new CImageBasis(GENERAL[_ana]->ROI[i]->deltax, GENERAL[_ana]->ROI[i]->deltay, 3); } @@ -398,7 +495,7 @@ bool ClassFlowCNNGeneral::doAlignAndCut(string time) void ClassFlowCNNGeneral::DrawROI(CImageBasis *_zw) { - if (CNNType == Analogue) + if (CNNType == Analogue || CNNType == Analogue100) { int r = 0; int g = 255; @@ -408,7 +505,6 @@ void ClassFlowCNNGeneral::DrawROI(CImageBasis *_zw) for (int i = 0; i < GENERAL[_ana]->ROI.size(); ++i) { _zw->drawRect(GENERAL[_ana]->ROI[i]->posx, GENERAL[_ana]->ROI[i]->posy, GENERAL[_ana]->ROI[i]->deltax, GENERAL[_ana]->ROI[i]->deltay, r, g, b, 1); -// _zw->drawCircle((int) (GENERAL[_ana]->ROI[i]->posx + GENERAL[_ana]->ROI[i]->deltax/2), (int) (GENERAL[_ana]->ROI[i]->posy + GENERAL[_ana]->ROI[i]->deltay/2), (int) (GENERAL[_ana]->ROI[i]->deltax/2), r, g, b, 2); _zw->drawEllipse( (int) (GENERAL[_ana]->ROI[i]->posx + GENERAL[_ana]->ROI[i]->deltax/2), (int) (GENERAL[_ana]->ROI[i]->posy + GENERAL[_ana]->ROI[i]->deltay/2), (int) (GENERAL[_ana]->ROI[i]->deltax/2), (int) (GENERAL[_ana]->ROI[i]->deltay/2), r, g, b, 2); _zw->drawLine((int) (GENERAL[_ana]->ROI[i]->posx + GENERAL[_ana]->ROI[i]->deltax/2), (int) GENERAL[_ana]->ROI[i]->posy, (int) (GENERAL[_ana]->ROI[i]->posx + GENERAL[_ana]->ROI[i]->deltax/2), (int) (GENERAL[_ana]->ROI[i]->posy + GENERAL[_ana]->ROI[i]->deltay), r, g, b, 2); _zw->drawLine((int) GENERAL[_ana]->ROI[i]->posx, (int) (GENERAL[_ana]->ROI[i]->posy + GENERAL[_ana]->ROI[i]->deltay/2), (int) GENERAL[_ana]->ROI[i]->posx + GENERAL[_ana]->ROI[i]->deltax, (int) (GENERAL[_ana]->ROI[i]->posy + GENERAL[_ana]->ROI[i]->deltay/2), r, g, b, 2); @@ -422,6 +518,71 @@ void ClassFlowCNNGeneral::DrawROI(CImageBasis *_zw) } } +bool ClassFlowCNNGeneral::getNetworkParameter() +{ + if (disabled) + return true; + + CTfLiteClass *tflite = new CTfLiteClass; + string zwcnn = "/sdcard" + cnnmodelfile; + zwcnn = FormatFileName(zwcnn); + printf(zwcnn.c_str());printf("\n"); + if (!tflite->LoadModel(zwcnn)) { + printf("Can't read model file /sdcard%s\n", cnnmodelfile.c_str()); + LogFile.WriteToFile("Cannot load model"); + delete tflite; + return false; + } + tflite->MakeAllocate(); + + if (CNNType == AutoDetect) + { + tflite->GetInputDimension(false); + modelxsize = tflite->ReadInputDimenstion(0); + modelysize = tflite->ReadInputDimenstion(1); + modelchannel = tflite->ReadInputDimenstion(2); + + int _anzoutputdimensions = tflite->GetAnzOutPut(); + switch (_anzoutputdimensions) + { + case 2: + CNNType = Analogue; + printf("TFlite-Type set to Analogue\n"); + break; + case 10: + CNNType = DoubleHyprid10; + printf("TFlite-Type set to DoubleHyprid10\n"); + break; + case 11: + CNNType = Digital; + printf("TFlite-Type set to Digital\n"); + break; + case 20: + CNNType = DigitalHyprid10; + printf("TFlite-Type set to DigitalHyprid10\n"); + break; +// case 22: +// CNNType = DigitalHyprid; +// printf("TFlite-Type set to DigitalHyprid\n"); +// break; + case 100: + if (modelxsize==32 && modelysize == 32) { + CNNType = Analogue100; + printf("TFlite-Type set to Analogue100\n"); + } else { + CNNType = Digital100; + printf("TFlite-Type set to Digital\n"); + } + break; + default: + printf("ERROR ERROR ERROR - tflite passt nicht zur Firmware - ERROR ERROR ERROR\n"); + } + } + + delete tflite; + return true; +} + bool ClassFlowCNNGeneral::doNeuralNetwork(string time) { if (disabled) @@ -442,32 +603,6 @@ bool 
ClassFlowCNNGeneral::doNeuralNetwork(string time) } tflite->MakeAllocate(); - if (CNNType == AutoDetect) - { - int _anzoutputdimensions = tflite->GetAnzOutPut(); - switch (_anzoutputdimensions) - { - case 2: - CNNType = Analogue; - printf("TFlite-Type set to Analogue\n"); - break; - case 11: - CNNType = Digital; - printf("TFlite-Type set to Digital\n"); - break; - case 20: - CNNType = DigitalHyprid10; - printf("TFlite-Type set to DigitalHyprid10\n"); - break; - case 22: - CNNType = DigitalHyprid; - printf("TFlite-Type set to DigitalHyprid\n"); - break; - default: - printf("ERROR ERROR ERROR - tflite passt nicht zur Firmware - ERROR ERROR ERROR\n"); - } - } - for (int _ana = 0; _ana < GENERAL.size(); ++_ana) { for (int i = 0; i < GENERAL[_ana]->ROI.size(); ++i) @@ -492,6 +627,7 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (isLogImage) LogImage(logPath, GENERAL[_ana]->ROI[i]->name, &GENERAL[_ana]->ROI[i]->result_float, NULL, time, GENERAL[_ana]->ROI[i]->image_org); } break; + case Digital: { GENERAL[_ana]->ROI[i]->result_klasse = 0; @@ -500,17 +636,19 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (isLogImage) { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; if (isLogImageSelect) { if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); } else { - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); } } } break; +/* case DigitalHyprid: { int _num, _nachkomma; @@ -536,8 +674,20 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (debugdetailgeneral) LogFile.WriteToFile(_zwres); if (isLogImage) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, &GENERAL[_ana]->ROI[i]->result_float, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + } } break; +*/ case DigitalHyprid10: { int _num, _nachkomma; @@ -560,8 +710,149 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (debugdetailgeneral) LogFile.WriteToFile(_zwres); if (isLogImage) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, &GENERAL[_ana]->ROI[i]->result_float, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + } } break; + + case DoubleHyprid10: + { + int _num, _numplus, _numminus; + float _val, _valplus, _valminus; + float _fit; + float 
_result_save_file; + + tflite->LoadInputImageBasis(GENERAL[_ana]->ROI[i]->image); + tflite->Invoke(); + if (debugdetailgeneral) LogFile.WriteToFile("After Invoke"); + + _num = tflite->GetOutClassification(0, 9); + _numplus = (_num + 1) % 10; + _numminus = (_num - 1 + 10) % 10; + + _val = tflite->GetOutputValue(_num); + _valplus = tflite->GetOutputValue(_numplus); + _valminus = tflite->GetOutputValue(_numminus); + + float result = _num; + + if (_valplus > _valminus) + { + result = result + _valplus / (_valplus + _val); + _fit = _val + _valplus; + } + else + { + result = result - _valminus / (_val + _valminus); + _fit = _val + _valminus; + + } + if (result >= 10) + result = result - 10; + if (result < 0) + result = result + 10; + + string zw = "_num (p, m): " + to_string(_num) + " " + to_string(_numplus) + " " + to_string(_numminus); + zw = zw + " _val (p, m): " + to_string(_val) + " " + to_string(_valplus) + " " + to_string(_valminus); + zw = zw + " result: " + to_string(result) + " _fit: " + to_string(_fit); + printf("details cnn: %s\n", zw.c_str()); + LogFile.WriteToFile(zw); + + + _result_save_file = result; + + if (_fit < CNNGoodThreshold) + { + GENERAL[_ana]->ROI[i]->isReject = true; + result = -1; + _result_save_file+= 100; // if the fit is not sufficient, the result is still stored, flagged as "-10x.y" + string zw = "Value Rejected due to Threshold (Fit: " + to_string(_fit) + ", Threshold: " + to_string(CNNGoodThreshold) + ")"; + printf("Value Rejected due to Threshold (Fit: %f, Threshold: %f)\n", _fit, CNNGoodThreshold); + LogFile.WriteToFile(zw); + } + else + { + GENERAL[_ana]->ROI[i]->isReject = false; + } + + + GENERAL[_ana]->ROI[i]->result_float = result; + printf("Result General(Analog)%i: %f\n", i, GENERAL[_ana]->ROI[i]->result_float); + + if (isLogImage) + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + } + } + break; + case Digital100: + case Analogue100: + { + int _num; + float _fit; + float _result_save_file; + + tflite->LoadInputImageBasis(GENERAL[_ana]->ROI[i]->image); + tflite->Invoke(); + + _num = tflite->GetOutClassification(); + _fit = tflite->GetOutputValue(_num); + + GENERAL[_ana]->ROI[i]->result_float = (float)_num / 10.0; + + + _result_save_file = GENERAL[_ana]->ROI[i]->result_float; + + if (_fit < CNNGoodThreshold) + { + GENERAL[_ana]->ROI[i]->isReject = true; + GENERAL[_ana]->ROI[i]->result_float = -1; + _result_save_file+= 100; // if the fit is not sufficient, the result is still stored, flagged as "-10x.y" + string zw = "Value Rejected due to Threshold (Fit: " + to_string(_fit) + ", Threshold: " + to_string(CNNGoodThreshold) + ")"; + printf("Value Rejected due to Threshold (Fit: %f, Threshold: %f)\n", _fit, CNNGoodThreshold); + LogFile.WriteToFile(zw); + } + else + { + GENERAL[_ana]->ROI[i]->isReject = false; + } + + printf("Result General(Analog)%i: %f\n", i, GENERAL[_ana]->ROI[i]->result_float); + + if (isLogImage) + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + } + + } break; + default: break; }
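The DoubleHyprid10 branch above interpolates between the winning class and its stronger neighbour, and rejects readings whose combined confidence ("fit") stays below CNNGoodThreshold. Below is a standalone sketch of that arithmetic; interpolateDigit() and the example probabilities are illustrative, not part of the patch.

#include <cstdio>
#include <vector>

// probs: softmax output of a 10-class digit model (indices 0..9).
// Returns the interpolated reading in [0,10), or -1 if the combined
// confidence of the two classes used stays below threshold.
static float interpolateDigit(const std::vector<float>& probs, float threshold)
{
    int num = 0;
    for (int i = 1; i < 10; ++i)
        if (probs[i] > probs[num]) num = i;

    int numplus  = (num + 1) % 10;
    int numminus = (num - 1 + 10) % 10;

    float val = probs[num], valplus = probs[numplus], valminus = probs[numminus];

    float result, fit;
    if (valplus > valminus) {           // pointer sits between num and num+1
        result = num + valplus / (valplus + val);
        fit = val + valplus;
    } else {                            // pointer sits between num-1 and num
        result = num - valminus / (val + valminus);
        fit = val + valminus;
    }
    if (result >= 10) result -= 10;
    if (result < 0)   result += 10;

    return (fit < threshold) ? -1.0f : result;
}

int main()
{
    // 70% "4", 20% "5": the reading is a bit past 4 -> about 4.22
    std::vector<float> probs = {0, 0, 0, 0.05f, 0.70f, 0.20f, 0.05f, 0, 0, 0};
    printf("%.2f\n", interpolateDigit(probs, 0.5f));
    return 0;
}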
diff --git a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h index ef0a6cd1..66aa56d5 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h +++ b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h @@ -8,9 +8,12 @@ enum t_CNNType { AutoDetect, Analogue, + Analogue100, Digital, - DigitalHyprid, +// DigitalHyprid, DigitalHyprid10, + DoubleHyprid10, + Digital100, None }; @@ -20,9 +23,10 @@ class ClassFlowCNNGeneral : protected: t_CNNType CNNType; std::vector GENERAL; + float CNNGoodThreshold; string cnnmodelfile; - int modelxsize, modelysize; + int modelxsize, modelysize, modelchannel; bool isLogImageSelect; string LogImageSelect; ClassFlowAlignment* flowpostalignment; @@ -37,6 +41,8 @@ protected: bool doNeuralNetwork(string time); bool doAlignAndCut(string time); + bool getNetworkParameter(); + public: ClassFlowCNNGeneral(ClassFlowAlignment *_flowalign, t_CNNType _cnntype = AutoDetect); @@ -44,7 +50,7 @@ public: bool doFlow(string time); string getHTMLSingleStep(string host); - string getReadout(int _analog, bool _extendedResolution); + string getReadout(int _analog, bool _extendedResolution = false, int prev = -1); void DrawROI(CImageBasis *_zw); diff --git a/code/components/jomjol_flowcontroll/ClassFlowControll.cpp b/code/components/jomjol_flowcontroll/ClassFlowControll.cpp index 9a0758ca..751d25af 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowControll.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowControll.cpp @@ -49,6 +49,9 @@ std::string ClassFlowControll::doSingleStep(std::string _stepname, std::string _ if ((_stepname.compare("[MQTT]") == 0) || (_stepname.compare(";[MQTT]") == 0)){ _classname = "ClassFlowMQTT"; } + if ((_stepname.compare("[InfluxDB]") == 0) || (_stepname.compare(";[InfluxDB]") == 0)){ + _classname = "ClassFlowInfluxDB"; + } for (int i = 0; i < FlowControll.size(); ++i) if (FlowControll[i]->name().compare(_classname) == 0){ @@ -67,14 +70,16 @@ std::string ClassFlowControll::TranslateAktstatus(std::string _input) return ("Take Image"); if (_input.compare("ClassFlowAlignment") == 0) return ("Aligning"); - //if (_input.compare("ClassFlowAnalog") == 0) - // return ("Analog ROIs"); if (_input.compare("ClassFlowCNNGeneral") == 0) return ("Digitalization of ROIs"); if (_input.compare("ClassFlowMQTT") == 0) return ("Sending MQTT"); + if (_input.compare("ClassFlowInfluxDB") == 0) + return ("Sending InfluxDB"); if (_input.compare("ClassFlowPostProcessing") == 0) return ("Processing"); + if (_input.compare("ClassFlowWriteList") == 0) + return ("Processing"); return "Unknown Status"; }
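The controller wires modules together purely by the string each one returns from name(), both in doSingleStep() and in TranslateAktstatus() above. A reduced sketch of that lookup pattern follows; Flow, FlowInfluxDB and translateStatus are simplified stand-ins, not the project's actual ClassFlow classes.

#include <cstdio>
#include <string>
#include <vector>

// Reduced stand-in for the project's ClassFlow base class.
struct Flow {
    virtual bool doFlow(std::string time) = 0;
    virtual std::string name() = 0;
    virtual ~Flow() {}
};

struct FlowInfluxDB : Flow {
    bool doFlow(std::string) override { /* publish values here */ return true; }
    std::string name() override { return "ClassFlowInfluxDB"; }
};

// Mirrors TranslateAktstatus(): map the class name to a UI status string.
static std::string translateStatus(const std::string& n)
{
    if (n == "ClassFlowInfluxDB") return "Sending InfluxDB";
    return "Unknown Status";
}

int main()
{
    std::vector<Flow*> flows = { new FlowInfluxDB() };
    for (Flow* f : flows)
        printf("%s -> %s\n", f->name().c_str(), translateStatus(f->name()).c_str());
    for (Flow* f : flows) delete f;
    return 0;
}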
@@ -180,7 +185,13 @@ ClassFlow* ClassFlowControll::CreateClassFlow(std::string _type) } if (toUpper(_type).compare("[MQTT]") == 0) cfc = new ClassFlowMQTT(&FlowControll); + + if (toUpper(_type).compare("[INFLUXDB]") == 0) + cfc = new ClassFlowInfluxDB(&FlowControll); + if (toUpper(_type).compare("[WRITELIST]") == 0) + cfc = new ClassFlowWriteList(&FlowControll); + if (toUpper(_type).compare("[POSTPROCESSING]") == 0) { cfc = new ClassFlowPostProcessing(&FlowControll, flowanalog, flowdigit); @@ -632,35 +643,7 @@ esp_err_t ClassFlowControll::GetJPGStream(std::string _fn, httpd_req_t *req) return result; } - -string ClassFlowControll::getJSON() +string ClassFlowControll::getJSON(std::string _id, std::string _mac) { - std::vector* NUMBERS = flowpostprocessing->GetNumbers(); - - std::string json="{\n"; - - for (int i = 0; i < (*NUMBERS).size(); ++i) - { - json += "\"" + (*NUMBERS)[i]->name + "\":\n"; - json += " {\n"; - if ((*NUMBERS)[i]->ReturnValue.length() > 0) - json += " \"value\": " + (*NUMBERS)[i]->ReturnValue + ",\n"; - else - json += " \"value\": \"\",\n"; - json += " \"raw\": \"" + (*NUMBERS)[i]->ReturnRawValue + "\",\n"; - json += " \"error\": \"" + (*NUMBERS)[i]->ErrorMessageText + "\",\n"; - if ((*NUMBERS)[i]->ReturnRateValue.length() > 0) - json += " \"rate\": " + (*NUMBERS)[i]->ReturnRateValue + ",\n"; - else - json += " \"rate\": \"\",\n"; - - json += " \"timestamp\": \"" + (*NUMBERS)[i]->timeStamp + "\"\n"; - if ((i+1) < (*NUMBERS).size()) - json += " },\n"; - else - json += " }\n"; - } - json += "}"; - - return json; + return flowpostprocessing->GetJSON(_id, _mac); } diff --git a/code/components/jomjol_flowcontroll/ClassFlowControll.h b/code/components/jomjol_flowcontroll/ClassFlowControll.h index 3f568b26..a8f92dc3 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowControll.h +++ b/code/components/jomjol_flowcontroll/ClassFlowControll.h @@ -9,7 +9,9 @@ #include "ClassFlowCNNGeneral.h" #include "ClassFlowPostProcessing.h" #include "ClassFlowMQTT.h" +#include "ClassFlowInfluxDB.h" #include "ClassFlowCNNGeneral.h" +#include "ClassFlowWriteList.h" #define READOUT_TYPE_VALUE 0 @@ -48,7 +50,7 @@ public: string UpdatePrevalue(std::string _newvalue, std::string _numbers, bool _extern); string GetPrevalue(std::string _number = ""); bool ReadParameter(FILE* pfile, string& aktparamgraph); - string getJSON(); + string getJSON(std::string _id = "", std::string _mac = ""); string TranslateAktstatus(std::string _input); diff --git a/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h b/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h index 7d9b2bec..98432886 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h +++ b/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h @@ -7,6 +7,7 @@ struct roi { int posx, posy, deltax, deltay; float result_float; int result_klasse; + bool isReject; string name; CImageBasis *image, *image_org; }; @@ -36,9 +37,10 @@ struct NumberPost { float PreValue; // letzter Wert, der gut ausgelesen wurde float Value; // letzer ausgelesener Wert, inkl. Korrekturen string ReturnRateValue; // RückgabewertRate + string ReturnChangeAbsolute; // return value: absolute change since the last reading string ReturnRawValue; // Rohwert (mit N & führenden 0) string ReturnValue; // korrigierter Rückgabewert, ggf.
mit Fehlermeldung - string ReturnPreValue; // korrigierter Rückgabewert ohne Fehlermeldung + string ReturnPreValue; // korrigierter Rückgabewert ohne Fehlermeldung string ErrorMessageText; // Fehlermeldung bei Consistency Check int AnzahlAnalog; int AnzahlDigital; diff --git a/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.cpp b/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.cpp new file mode 100644 index 00000000..55b1f9ff --- /dev/null +++ b/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.cpp @@ -0,0 +1,161 @@ +#include +#include "ClassFlowInfluxDB.h" +#include "Helper.h" +#include "connect_wlan.h" + +#include "time_sntp.h" +#include "interface_influxdb.h" +#include "ClassFlowPostProcessing.h" + +#include + +void ClassFlowInfluxDB::SetInitialParameter(void) +{ + uri = ""; + database = ""; + measurement = ""; + + OldValue = ""; + flowpostprocessing = NULL; + user = ""; + password = ""; + previousElement = NULL; + ListFlowControll = NULL; + disabled = false; + InfluxDBenable = false; +} + +ClassFlowInfluxDB::ClassFlowInfluxDB() +{ + SetInitialParameter(); +} + +ClassFlowInfluxDB::ClassFlowInfluxDB(std::vector* lfc) +{ + SetInitialParameter(); + + ListFlowControll = lfc; + for (int i = 0; i < ListFlowControll->size(); ++i) + { + if (((*ListFlowControll)[i])->name().compare("ClassFlowPostProcessing") == 0) + { + flowpostprocessing = (ClassFlowPostProcessing*) (*ListFlowControll)[i]; + } + } +} + +ClassFlowInfluxDB::ClassFlowInfluxDB(std::vector* lfc, ClassFlow *_prev) +{ + SetInitialParameter(); + + previousElement = _prev; + ListFlowControll = lfc; + + for (int i = 0; i < ListFlowControll->size(); ++i) + { + if (((*ListFlowControll)[i])->name().compare("ClassFlowPostProcessing") == 0) + { + flowpostprocessing = (ClassFlowPostProcessing*) (*ListFlowControll)[i]; + } + } +} + + +bool ClassFlowInfluxDB::ReadParameter(FILE* pfile, string& aktparamgraph) +{ + std::vector zerlegt; + + aktparamgraph = trim(aktparamgraph); + + if (aktparamgraph.size() == 0) + if (!this->GetNextParagraph(pfile, aktparamgraph)) + return false; + + if (toUpper(aktparamgraph).compare("[INFLUXDB]") != 0) + return false; + + while (this->getNextLine(pfile, &aktparamgraph) && !this->isNewParagraph(aktparamgraph)) + { + printf("while loop reading line: %s\n", aktparamgraph.c_str()); + zerlegt = this->ZerlegeZeile(aktparamgraph); + if ((toUpper(zerlegt[0]) == "USER") && (zerlegt.size() > 1)) + { + this->user = zerlegt[1]; + } + if ((toUpper(zerlegt[0]) == "PASSWORD") && (zerlegt.size() > 1)) + { + this->password = zerlegt[1]; + } + if ((toUpper(zerlegt[0]) == "URI") && (zerlegt.size() > 1)) + { + this->uri = zerlegt[1]; + } + if (((toUpper(zerlegt[0]) == "MEASUREMENT")) && (zerlegt.size() > 1)) + { + this->measurement = zerlegt[1]; + } + if (((toUpper(zerlegt[0]) == "DATABASE")) && (zerlegt.size() > 1)) + { + this->database = zerlegt[1]; + } + } + + if ((uri.length() > 0) && (database.length() > 0) && (measurement.length() > 0)) + { + printf("Init InfluxDB with uri: %s, measurement: %s, user: %s, password: %s\n", uri.c_str(), measurement.c_str(), user.c_str(), password.c_str()); + InfluxDBInit(uri, database, measurement, user, password); + InfluxDBenable = true; + } else { + printf("InfluxDB init skipped as we are missing some parameters"); + } + + return true; +} + + +string ClassFlowInfluxDB::GetInfluxDBMeasurement() +{ + return measurement; +} + + +bool ClassFlowInfluxDB::doFlow(string zwtime) +{ + if (!InfluxDBenable) + return true; + + std::string result; + std::string resulterror = ""; + 
std::string resultraw = ""; + std::string resultrate = ""; + std::string resulttimestamp = ""; + string zw = ""; + string namenumber = ""; + + if (flowpostprocessing) + { + std::vector* NUMBERS = flowpostprocessing->GetNumbers(); + + for (int i = 0; i < (*NUMBERS).size(); ++i) + { + result = (*NUMBERS)[i]->ReturnValue; + resultraw = (*NUMBERS)[i]->ReturnRawValue; + resulterror = (*NUMBERS)[i]->ErrorMessageText; + resultrate = (*NUMBERS)[i]->ReturnRateValue; + resulttimestamp = (*NUMBERS)[i]->timeStamp; + + namenumber = (*NUMBERS)[i]->name; + if (namenumber == "default") + namenumber = "value"; + else + namenumber = namenumber + "/value"; + + if (result.length() > 0 && resulttimestamp.length() > 0) + InfluxDBPublish(namenumber, result, resulttimestamp); + } + } + + OldValue = result; + + return true; +} diff --git a/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.h b/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.h new file mode 100644 index 00000000..b7e25362 --- /dev/null +++ b/code/components/jomjol_flowcontroll/ClassFlowInfluxDB.h @@ -0,0 +1,31 @@ +#pragma once +#include "ClassFlow.h" + +#include "ClassFlowPostProcessing.h" + +#include + +class ClassFlowInfluxDB : + public ClassFlow +{ +protected: + std::string uri, database, measurement; + std::string OldValue; + ClassFlowPostProcessing* flowpostprocessing; + std::string user, password; + bool InfluxDBenable; + + void SetInitialParameter(void); + +public: + ClassFlowInfluxDB(); + ClassFlowInfluxDB(std::vector* lfc); + ClassFlowInfluxDB(std::vector* lfc, ClassFlow *_prev); + + string GetInfluxDBMeasurement(); + + bool ReadParameter(FILE* pfile, string& aktparamgraph); + bool doFlow(string time); + string name(){return "ClassFlowInfluxDB";}; +}; + diff --git a/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp b/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp index d78399cb..f4e014e9 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp @@ -25,7 +25,8 @@ void ClassFlowMQTT::SetInitialParameter(void) OldValue = ""; flowpostprocessing = NULL; user = ""; - password = ""; + password = ""; + SetRetainFlag = 0; previousElement = NULL; ListFlowControll = NULL; disabled = false; @@ -99,6 +100,12 @@ bool ClassFlowMQTT::ReadParameter(FILE* pfile, string& aktparamgraph) { this->uri = zerlegt[1]; } + if ((toUpper(zerlegt[0]) == "SETRETAINFLAG") && (zerlegt.size() > 1)) + { + if (toUpper(zerlegt[1]) == "TRUE") + SetRetainFlag = 1; + } + if ((toUpper(zerlegt[0]) == "CLIENTID") && (zerlegt.size() > 1)) { @@ -118,7 +125,7 @@ bool ClassFlowMQTT::ReadParameter(FILE* pfile, string& aktparamgraph) mainerrortopic = maintopic + "/connection"; printf("Init MQTT with uri: %s, clientname: %s, user: %s, password: %s, maintopic: %s\n", uri.c_str(), clientname.c_str(), user.c_str(), password.c_str(), mainerrortopic.c_str()); MQTTInit(uri, clientname, user, password, mainerrortopic, 60); - MQTTPublish(mainerrortopic, "connected"); + MQTTPublish(mainerrortopic, "connected", SetRetainFlag); MQTTenable = true; } @@ -142,6 +149,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) std::string resultraw = ""; std::string resultrate = ""; std::string resulttimestamp = ""; + std::string resultchangabs = ""; string zw = ""; string namenumber = ""; @@ -150,17 +158,17 @@ bool ClassFlowMQTT::doFlow(string zwtime) zw = maintopic + "/" + "uptime"; char uptimeStr[11]; sprintf(uptimeStr, "%ld", (long)getUpTime()); - MQTTPublish(zw, uptimeStr); + MQTTPublish(zw, uptimeStr, SetRetainFlag); zw = 
maintopic + "/" + "freeMem"; char freeheapmem[11]; sprintf(freeheapmem, "%zu", esp_get_free_heap_size()); - MQTTPublish(zw, freeheapmem); + MQTTPublish(zw, freeheapmem, SetRetainFlag); zw = maintopic + "/" + "wifiRSSI"; char rssi[11]; sprintf(rssi, "%d", get_WIFI_RSSI()); - MQTTPublish(zw, rssi); + MQTTPublish(zw, rssi, SetRetainFlag); if (flowpostprocessing) @@ -173,6 +181,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) resultraw = (*NUMBERS)[i]->ReturnRawValue; resulterror = (*NUMBERS)[i]->ErrorMessageText; resultrate = (*NUMBERS)[i]->ReturnRateValue; + resultchangabs = (*NUMBERS)[i]->ReturnChangeAbsolute; resulttimestamp = (*NUMBERS)[i]->timeStamp; namenumber = (*NUMBERS)[i]->name; @@ -183,23 +192,27 @@ bool ClassFlowMQTT::doFlow(string zwtime) zw = namenumber + "value"; if (result.length() > 0) - MQTTPublish(zw, result); + MQTTPublish(zw, result, SetRetainFlag); zw = namenumber + "error"; if (resulterror.length() > 0) - MQTTPublish(zw, resulterror, 1); + MQTTPublish(zw, resulterror, SetRetainFlag); zw = namenumber + "rate"; if (resultrate.length() > 0) - MQTTPublish(zw, resultrate); + MQTTPublish(zw, resultrate, SetRetainFlag); + + zw = namenumber + "changeabsolut"; + if (resultchangabs.length() > 0) + MQTTPublish(zw, resultchangabs, SetRetainFlag); zw = namenumber + "raw"; if (resultraw.length() > 0) - MQTTPublish(zw, resultraw); + MQTTPublish(zw, resultraw, SetRetainFlag); zw = namenumber + "timestamp"; if (resulttimestamp.length() > 0) - MQTTPublish(zw, resulttimestamp); + MQTTPublish(zw, resulttimestamp, SetRetainFlag); std::string json = ""; @@ -218,7 +231,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) json += ",\"timestamp\":\""+resulttimestamp+"\"}"; zw = namenumber + "json"; - MQTTPublish(zw, json); + MQTTPublish(zw, json, SetRetainFlag); } } else @@ -234,7 +247,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) result = result + "\t" + zw; } } - MQTTPublish(topic, result); + MQTTPublish(topic, result, SetRetainFlag); } OldValue = result; diff --git a/code/components/jomjol_flowcontroll/ClassFlowMQTT.h b/code/components/jomjol_flowcontroll/ClassFlowMQTT.h index 19cc9fdf..816389b1 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowMQTT.h +++ b/code/components/jomjol_flowcontroll/ClassFlowMQTT.h @@ -13,6 +13,7 @@ protected: std::string OldValue; ClassFlowPostProcessing* flowpostprocessing; std::string user, password; + int SetRetainFlag; bool MQTTenable; std::string maintopic, mainerrortopic; diff --git a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp index f37d44a5..306b6b5f 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp @@ -15,6 +15,42 @@ #define PREVALUE_TIME_FORMAT_INPUT "%d-%d-%dT%d:%d:%d" +std::string ClassFlowPostProcessing::GetJSON(std::string _id, std::string _mac, std::string _lineend) +{ + std::string json="{" + _lineend; + + for (int i = 0; i < NUMBERS.size(); ++i) + { + json += "\"" + NUMBERS[i]->name + "\":" + _lineend; + json += " {" + _lineend; + + if (_id.length() > 0) + json += " \"ID\": \"" + _id + "\"," + _lineend; + if (_mac.length() > 0) + json += " \"MAC\": \"" + _mac + "\"," + _lineend; + + if (NUMBERS[i]->ReturnValue.length() > 0) + json += " \"value\": \"" + NUMBERS[i]->ReturnValue + "\"," + _lineend; + else + json += " \"value\": \"\"," + _lineend; + json += " \"raw\": \"" + NUMBERS[i]->ReturnRawValue + "\"," + _lineend; + json += " \"error\": \"" + 
NUMBERS[i]->ErrorMessageText + "\"," + _lineend; + if (NUMBERS[i]->ReturnRateValue.length() > 0) + json += " \"rate\": " + NUMBERS[i]->ReturnRateValue + "," + _lineend; + else + json += " \"rate\": \"\"," + _lineend; + + json += " \"timestamp\": \"" + NUMBERS[i]->timeStamp + "\"" + _lineend; + if ((i+1) < NUMBERS.size()) + json += " }," + _lineend; + else + json += " }" + _lineend; + } + json += "}"; + + return json; +} + string ClassFlowPostProcessing::GetPreValue(std::string _number) { std::string result; @@ -41,6 +77,8 @@ void ClassFlowPostProcessing::SetPreValue(float zw, string _numbers, bool _exter if (NUMBERS[j]->name == _numbers) { NUMBERS[j]->PreValue = zw; + NUMBERS[j]->ReturnPreValue = std::to_string(zw); + NUMBERS[j]->PreValueOkay = true; if (_extern) { time(&(NUMBERS[j]->lastvalue)); @@ -505,7 +543,6 @@ void ClassFlowPostProcessing::InitNUMBERS() _number->ReturnRawValue = ""; // Rohwert (mit N & führenden 0) _number->ReturnValue = ""; // korrigierter Rückgabewert, ggf. mit Fehlermeldung -// _number->ReturnValueNoError = ""; // korrigierter Rückgabewert ohne Fehlermeldung _number->ErrorMessageText = ""; // Fehlermeldung bei Consistency Check _number->ReturnPreValue = ""; _number->PreValueOkay = false; @@ -524,7 +561,6 @@ void ClassFlowPostProcessing::InitNUMBERS() _number->Value = 0; // letzer ausgelesener Wert, inkl. Korrekturen _number->ReturnRawValue = ""; // Rohwert (mit N & führenden 0) _number->ReturnValue = ""; // korrigierter Rückgabewert, ggf. mit Fehlermeldung -// _number->ReturnValueNoError = ""; // korrigierter Rückgabewert ohne Fehlermeldung _number->ErrorMessageText = ""; // Fehlermeldung bei Consistency Check _number->Nachkomma = _number->AnzahlAnalog; @@ -612,18 +648,29 @@ bool ClassFlowPostProcessing::doFlow(string zwtime) UpdateNachkommaDecimalShift(); + int previous_value = -1; + + if (NUMBERS[j]->analog_roi) + { + NUMBERS[j]->ReturnRawValue = flowAnalog->getReadout(j, NUMBERS[j]->isExtendedResolution); + if (NUMBERS[j]->ReturnRawValue.length() > 0) + { + char zw = NUMBERS[j]->ReturnRawValue[0]; + if (zw >= 48 && zw <=57) + previous_value = zw - 48; + } + } + + if (NUMBERS[j]->digit_roi && NUMBERS[j]->analog_roi) + NUMBERS[j]->ReturnRawValue = "." 
+ NUMBERS[j]->ReturnRawValue; + if (NUMBERS[j]->digit_roi) { if (NUMBERS[j]->analog_roi) - NUMBERS[j]->ReturnRawValue = flowDigit->getReadout(j, false); + NUMBERS[j]->ReturnRawValue = flowDigit->getReadout(j, false, previous_value) + NUMBERS[j]->ReturnRawValue; else - NUMBERS[j]->ReturnRawValue = flowDigit->getReadout(j, NUMBERS[j]->isExtendedResolution); // Extended Resolution nur falls es keine analogen Ziffern gibt + NUMBERS[j]->ReturnRawValue = flowDigit->getReadout(j, NUMBERS[j]->isExtendedResolution, previous_value); // extended resolution only if there are no analog digits } - if (NUMBERS[j]->digit_roi && NUMBERS[j]->analog_roi) - NUMBERS[j]->ReturnRawValue = NUMBERS[j]->ReturnRawValue + "."; - - if (NUMBERS[j]->analog_roi) - NUMBERS[j]->ReturnRawValue = NUMBERS[j]->ReturnRawValue + flowAnalog->getReadout(j, NUMBERS[j]->isExtendedResolution); NUMBERS[j]->ReturnRawValue = ShiftDecimal(NUMBERS[j]->ReturnRawValue, NUMBERS[j]->DecimalShift); @@ -675,7 +722,7 @@ bool ClassFlowPostProcessing::doFlow(string zwtime) if (NUMBERS[j]->useMaxRateValue && PreValueUse && NUMBERS[j]->PreValueOkay) { - float _ratedifference; + float _ratedifference; if (NUMBERS[j]->RateType == RateChange) _ratedifference = NUMBERS[j]->FlowRateAct; else @@ -691,6 +738,7 @@ } } + NUMBERS[j]->ReturnChangeAbsolute = RundeOutput(NUMBERS[j]->Value - NUMBERS[j]->PreValue, NUMBERS[j]->Nachkomma); NUMBERS[j]->lastvalue = imagetime; NUMBERS[j]->PreValue = NUMBERS[j]->Value; NUMBERS[j]->PreValueOkay = true; diff --git a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.h b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.h index aa50a85c..34b2309c 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.h +++ b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.h @@ -60,6 +60,8 @@ public: string GetPreValue(std::string _number = ""); void SetPreValue(float zw, string _numbers, bool _extern = false); + std::string GetJSON(std::string _id = "", std::string _mac = "", std::string _lineend = "\n"); + void UpdateNachkommaDecimalShift(); std::vector* GetNumbers(){return &NUMBERS;}; diff --git a/code/components/jomjol_flowcontroll/ClassFlowWriteList.cpp b/code/components/jomjol_flowcontroll/ClassFlowWriteList.cpp new file mode 100644 index 00000000..4d406728 --- /dev/null +++ b/code/components/jomjol_flowcontroll/ClassFlowWriteList.cpp @@ -0,0 +1,97 @@ +#include +#include "ClassFlowWriteList.h" +#include "Helper.h" + +#include "time_sntp.h" + + +#include + +void ClassFlowWriteList::SetInitialParameter(void) +{ + flowpostprocessing = NULL; + previousElement = NULL; + ListFlowControll = NULL; + disabled = false; +} + +ClassFlowWriteList::ClassFlowWriteList() +{ + SetInitialParameter(); +} + +ClassFlowWriteList::ClassFlowWriteList(std::vector* lfc) +{ + SetInitialParameter(); + + ListFlowControll = lfc; + for (int i = 0; i < ListFlowControll->size(); ++i) + { + if (((*ListFlowControll)[i])->name().compare("ClassFlowPostProcessing") == 0) + { + flowpostprocessing = (ClassFlowPostProcessing*) (*ListFlowControll)[i]; + } + } +} + + +bool ClassFlowWriteList::ReadParameter(FILE* pfile, string& aktparamgraph) +{ + std::vector zerlegt; + + aktparamgraph = trim(aktparamgraph); + + if (aktparamgraph.size() == 0) + if (!this->GetNextParagraph(pfile, aktparamgraph)) + return false; + + if (toUpper(aktparamgraph).compare("[WRITELIST]") != 0) // the paragraph tag must match [WriteList] + return false; + + while (this->getNextLine(pfile, &aktparamgraph) && 
!this->isNewParagraph(aktparamgraph)) + { + zerlegt = this->ZerlegeZeile(aktparamgraph); +/* + if ((toUpper(zerlegt[0]) == "USER") && (zerlegt.size() > 1)) + { + this->user = zerlegt[1]; + } +*/ + } + + return true; +} + + + +bool ClassFlowWriteList::doFlow(string zwtime) +{ + std::string line = ""; + + std::string result; + std::string resulterror = ""; + std::string resultraw = ""; + std::string resultrate = ""; + std::string resulttimestamp = ""; + string zw = ""; + string namenumber = ""; + + if (flowpostprocessing) + { + std::vector* NUMBERS = flowpostprocessing->GetNumbers(); + + for (int i = 0; i < (*NUMBERS).size(); ++i) + { + result = (*NUMBERS)[i]->ReturnValue; + resultraw = (*NUMBERS)[i]->ReturnRawValue; + resulterror = (*NUMBERS)[i]->ErrorMessageText; + resultrate = (*NUMBERS)[i]->ReturnRateValue; + resulttimestamp = (*NUMBERS)[i]->timeStamp; + + line = line + resulttimestamp + "\t" + resultraw + "\t" + result + "\t" + resultraw + "\t" + resultrate + "\t" + resulttimestamp + "\t"; + + } + } + + return true; +} diff --git a/code/components/jomjol_flowcontroll/ClassFlowWriteList.h b/code/components/jomjol_flowcontroll/ClassFlowWriteList.h new file mode 100644 index 00000000..49078f19 --- /dev/null +++ b/code/components/jomjol_flowcontroll/ClassFlowWriteList.h @@ -0,0 +1,22 @@ +#pragma once +#include "ClassFlow.h" +#include "ClassFlowPostProcessing.h" + +#include + +class ClassFlowWriteList : + public ClassFlow +{ +protected: + ClassFlowPostProcessing* flowpostprocessing; + void SetInitialParameter(void); + +public: + ClassFlowWriteList(); + ClassFlowWriteList(std::vector* lfc); + + bool ReadParameter(FILE* pfile, string& aktparamgraph); + bool doFlow(string time); + string name(){return "ClassFlowWriteList";}; +}; + diff --git a/code/components/jomjol_image_proc/CRotateImage.cpp b/code/components/jomjol_image_proc/CRotateImage.cpp index b6b2321b..96eec7a0 100644 --- a/code/components/jomjol_image_proc/CRotateImage.cpp +++ b/code/components/jomjol_image_proc/CRotateImage.cpp @@ -156,12 +156,140 @@ void CRotateImage::Rotate(float _angle, int _centerx, int _centery) RGBImageRelease(); } + + +void CRotateImage::RotateAntiAliasing(float _angle, int _centerx, int _centery) +{ + int org_width, org_height; + float m[2][3]; + + float x_center = _centerx; + float y_center = _centery; + _angle = _angle / 180 * M_PI; + + if (doflip) + { + org_width = width; + org_height = height; + height = org_width; + width = org_height; + x_center = x_center - (org_width/2) + (org_height/2); + y_center = y_center + (org_width/2) - (org_height/2); + if (ImageOrg) + { + ImageOrg->height = height; + ImageOrg->width = width; + } + } + else + { + org_width = width; + org_height = height; + } + + m[0][0] = cos(_angle); + m[0][1] = sin(_angle); + m[0][2] = (1 - m[0][0]) * x_center - m[0][1] * y_center; + + m[1][0] = -m[0][1]; + m[1][1] = m[0][0]; + m[1][2] = m[0][1] * x_center + (1 - m[0][0]) * y_center; + + if (doflip) + { + m[0][2] = m[0][2] + (org_width/2) - (org_height/2); + m[1][2] = m[1][2] - (org_width/2) + (org_height/2); + } + + int memsize = width * height * channels; + uint8_t* odata; + if (ImageTMP) + { + odata = ImageTMP->RGBImageLock(); + } + else + { + odata = (unsigned char*)GET_MEMORY(memsize); + } + + + int x_source_1, y_source_1, x_source_2, y_source_2; + float x_source, y_source; + float quad_ul, quad_ur, quad_ol, quad_or; + stbi_uc* p_target; + stbi_uc *p_source_ul, *p_source_ur, *p_source_ol, *p_source_or; + + RGBImageLock(); + + for (int x = 0; x < width; ++x) + for (int y = 0; y < height; 
++y) + { + p_target = odata + (channels * (y * width + x)); + + x_source = (m[0][0] * x + m[0][1] * y); + y_source = (m[1][0] * x + m[1][1] * y); + + x_source += (m[0][2]); + y_source += (m[1][2]); + + x_source_1 = (int)x_source; + x_source_2 = x_source_1 + 1; + y_source_1 = (int)y_source; + y_source_2 = y_source_1 + 1; + + quad_ul = (x_source_2 - x_source) * (y_source_2 - y_source); + quad_ur = (1- (x_source_2 - x_source)) * (y_source_2 - y_source); + quad_or = (x_source_2 - x_source) * (1-(y_source_2 - y_source)); + quad_ol = (1- (x_source_2 - x_source)) * (1-(y_source_2 - y_source)); + + + if ((x_source_1 >= 0) && (x_source_2 < org_width) && (y_source_1 >= 0) && (y_source_2 < org_height)) + { + p_source_ul = rgb_image + (channels * (y_source_1 * org_width + x_source_1)); + p_source_ur = rgb_image + (channels * (y_source_1 * org_width + x_source_2)); + p_source_or = rgb_image + (channels * (y_source_2 * org_width + x_source_1)); + p_source_ol = rgb_image + (channels * (y_source_2 * org_width + x_source_2)); + for (int _channels = 0; _channels < channels; ++_channels) + { + p_target[_channels] = (int)((float)p_source_ul[_channels] * quad_ul + + (float)p_source_ur[_channels] * quad_ur + + (float)p_source_or[_channels] * quad_or + + (float)p_source_ol[_channels] * quad_ol); + } + } + else + { + for (int _channels = 0; _channels < channels; ++_channels) + p_target[_channels] = 255; + } + } + + // memcpy(rgb_image, odata, memsize); + memCopy(odata, rgb_image, memsize); + + if (!ImageTMP) + { + stbi_image_free(odata); + } + if (ImageTMP) + ImageTMP->RGBImageRelease(); + + RGBImageRelease(); +} + + void CRotateImage::Rotate(float _angle) { // printf("width %d, height %d\n", width, height); Rotate(_angle, width / 2, height / 2); } +void CRotateImage::RotateAntiAliasing(float _angle) +{ +// printf("width %d, height %d\n", width, height); + RotateAntiAliasing(_angle, width / 2, height / 2); +} + void CRotateImage::Translate(int _dx, int _dy) { int memsize = width * height * channels; diff --git a/code/components/jomjol_image_proc/CRotateImage.h b/code/components/jomjol_image_proc/CRotateImage.h index 90ad7d71..4dec78ef 100644 --- a/code/components/jomjol_image_proc/CRotateImage.h +++ b/code/components/jomjol_image_proc/CRotateImage.h @@ -11,7 +11,11 @@ class CRotateImage: public CImageBasis CRotateImage(CImageBasis *_org, CImageBasis *_temp, bool _flip = false); void Rotate(float _angle); + void RotateAntiAliasing(float _angle); + void Rotate(float _angle, int _centerx, int _centery); + void RotateAntiAliasing(float _angle, int _centerx, int _centery); + void Translate(int _dx, int _dy); void Mirror(); };
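RotateAntiAliasing() above back-rotates every target pixel into the source image and weights the four surrounding source pixels by the opposite-corner areas (quad_ul .. quad_ol). The same bilinear weighting in a minimal standalone form; bilinear() and the grayscale toy image are illustrative helpers, not project code.

#include <cstdio>

// Bilinear sampling: the source position (sx, sy) falls between four
// pixels; each contributes in proportion to the opposite-corner area.
static float bilinear(const float img[2][2], float sx, float sy)
{
    int x1 = (int) sx, y1 = (int) sy;       // upper-left integer pixel
    float fx = sx - x1, fy = sy - y1;       // fractional offsets

    return img[y1][x1]         * (1 - fx) * (1 - fy)
         + img[y1][x1 + 1]     * fx       * (1 - fy)
         + img[y1 + 1][x1]     * (1 - fx) * fy
         + img[y1 + 1][x1 + 1] * fx       * fy;
}

int main()
{
    float img[2][2] = { {0, 100}, {100, 200} };
    // Exactly between all four pixels -> plain average 100
    printf("%.1f\n", bilinear(img, 0.5f, 0.5f));
    // Closer to the top-left pixel -> result 50.0, pulled towards 0
    printf("%.1f\n", bilinear(img, 0.25f, 0.25f));
    return 0;
}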
diff --git a/code/components/jomjol_influxdb/CMakeLists.txt b/code/components/jomjol_influxdb/CMakeLists.txt new file mode 100644 index 00000000..47330bd5 --- /dev/null +++ b/code/components/jomjol_influxdb/CMakeLists.txt @@ -0,0 +1,7 @@ +FILE(GLOB_RECURSE app_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.*) + +idf_component_register(SRCS ${app_sources} + INCLUDE_DIRS "." + REQUIRES tflite-lib esp_http_client jomjol_logfile) + + diff --git a/code/components/jomjol_influxdb/interface_influxdb.cpp b/code/components/jomjol_influxdb/interface_influxdb.cpp new file mode 100644 index 00000000..2089d98e --- /dev/null +++ b/code/components/jomjol_influxdb/interface_influxdb.cpp @@ -0,0 +1,114 @@ +#include "interface_influxdb.h" + +//#define LOG_LOCAL_LEVEL ESP_LOG_DEBUG +#include "esp_log.h" +#include +#include "ClassLogFile.h" +#include "esp_http_client.h" + +#define MAX_HTTP_OUTPUT_BUFFER 2048 + +static const char *TAG_INTERFACEINFLUXDB = "interface_influxdb"; + +std::string _influxDBURI; +std::string _influxDBDatabase; +std::string _influxDBMeasurement; +std::string _influxDBUser; +std::string _influxDBPassword; + +static esp_err_t http_event_handler(esp_http_client_event_t *evt) +{ + switch(evt->event_id) + { + case HTTP_EVENT_ERROR: + ESP_LOGE(TAG_INTERFACEINFLUXDB, "HTTP Client Error encountered"); + break; + case HTTP_EVENT_ON_CONNECTED: + ESP_LOGI(TAG_INTERFACEINFLUXDB, "HTTP Client Connected"); + break; + case HTTP_EVENT_HEADERS_SENT: + ESP_LOGV(TAG_INTERFACEINFLUXDB, "HTTP Client sent all request headers"); + break; + case HTTP_EVENT_ON_HEADER: + ESP_LOGV(TAG_INTERFACEINFLUXDB, "Header: key=%s, value=%s", evt->header_key, evt->header_value); + break; + case HTTP_EVENT_ON_DATA: + ESP_LOGV(TAG_INTERFACEINFLUXDB, "HTTP Client data received: len=%d", evt->data_len); + break; + case HTTP_EVENT_ON_FINISH: + ESP_LOGI(TAG_INTERFACEINFLUXDB, "HTTP Client finished"); + break; + case HTTP_EVENT_DISCONNECTED: + ESP_LOGI(TAG_INTERFACEINFLUXDB, "HTTP Client Disconnected"); + break; + } + return ESP_OK; +} + +void InfluxDBPublish(std::string _key, std::string _content, std::string _timestamp) { + char response_buffer[MAX_HTTP_OUTPUT_BUFFER] = {0}; + esp_http_client_config_t http_config = { + .user_agent = "ESP32 Meter reader", + .method = HTTP_METHOD_POST, + .event_handler = http_event_handler, + .buffer_size = MAX_HTTP_OUTPUT_BUFFER, + .user_data = response_buffer + }; + + if (_influxDBUser.length() && _influxDBPassword.length()){ + http_config.username = _influxDBUser.c_str(); + http_config.password = _influxDBPassword.c_str(); + http_config.auth_type = HTTP_AUTH_TYPE_BASIC; + } + + // generate timestamp (TODO: parse result timestamp passed as string and convert it to POSIX timestamp?) + time_t now = time(NULL); + char nowTimestamp[21]; + // pad with zeroes to get nanoseconds + sprintf(nowTimestamp,"%jd000000000", (intmax_t)now); + + std::string payload = _influxDBMeasurement + " " + _key + "=" + _content + " " + nowTimestamp; + payload.shrink_to_fit(); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "sending line to influxdb: %s\n", payload.c_str()); + + // use the default retention policy of the database + std::string apiURI = _influxDBURI + "/api/v2/write?bucket=" + _influxDBDatabase + "/"; + apiURI.shrink_to_fit(); + http_config.url = apiURI.c_str(); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "API URI: %s", apiURI.c_str()); + + esp_http_client_handle_t http_client = esp_http_client_init(&http_config); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "client is initialized%s\n", ""); + + esp_http_client_set_header(http_client, "Content-Type", "text/plain"); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "header is set%s\n", ""); + + ESP_ERROR_CHECK(esp_http_client_set_post_field(http_client, payload.c_str(), payload.length())); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "post payload is set%s\n", ""); + + esp_err_t err = ESP_ERROR_CHECK_WITHOUT_ABORT(esp_http_client_perform(http_client)); + + if( err == ESP_OK ) { + ESP_LOGI(TAG_INTERFACEINFLUXDB, "HTTP request was performed%s\n", ""); + int status_code = esp_http_client_get_status_code(http_client); + ESP_LOGI(TAG_INTERFACEINFLUXDB, "HTTP status code %d\n", status_code); + } else { + ESP_LOGW(TAG_INTERFACEINFLUXDB, "HTTP request failed%s\n", ""); + } + esp_http_client_cleanup(http_client); +} + + +void InfluxDBInit(std::string _uri, std::string _database, std::string _measurement, std::string _user, std::string _password){ + _influxDBURI = _uri; + _influxDBDatabase = _database; + _influxDBMeasurement = _measurement; + _influxDBUser = _user; + _influxDBPassword = _password; + +} + +void InfluxDBdestroy() { +} + +
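InfluxDBPublish() above posts a single line-protocol record of the form "<measurement> <key>=<value> <timestamp in ns>", deriving the timestamp from time(NULL) until the timestamp string passed in is actually parsed. A standalone sketch of just the payload construction; buildLine() and the measurement/field names are placeholders, not the project's API.

#include <cstdint>
#include <cstdio>
#include <ctime>
#include <string>

// Builds the same line-protocol payload InfluxDBPublish() sends:
// "<measurement> <field>=<value> <timestamp in nanoseconds>".
static std::string buildLine(const std::string& measurement,
                             const std::string& field,
                             const std::string& value)
{
    time_t now = time(nullptr);
    char ts[32];
    // seconds -> nanoseconds by appending nine zeroes, as in the code above
    snprintf(ts, sizeof(ts), "%jd000000000", (intmax_t) now);
    return measurement + " " + field + "=" + value + " " + ts;
}

int main()
{
    // prints e.g. "watermeter main/value=123.456 1700000000000000000"
    printf("%s\n", buildLine("watermeter", "main/value", "123.456").c_str());
    return 0;
}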
diff --git a/code/components/jomjol_influxdb/interface_influxdb.h b/code/components/jomjol_influxdb/interface_influxdb.h new file mode 100644 index 00000000..33ae0564 --- /dev/null +++ b/code/components/jomjol_influxdb/interface_influxdb.h @@ -0,0 +1,13 @@ +#ifndef INTERFACE_INFLUXDB_H +#define INTERFACE_INFLUXDB_H + +#include +#include +#include + +void InfluxDBInit(std::string _influxDBURI, std::string _database, std::string _measurement, std::string _user, std::string _password); +void InfluxDBdestroy(); + +void InfluxDBPublish(std::string _key, std::string _content, std::string _timestamp); + +#endif //INTERFACE_INFLUXDB_H diff --git a/code/components/jomjol_mqtt/interface_mqtt.h b/code/components/jomjol_mqtt/interface_mqtt.h index 50990e6f..397d1787 100644 --- a/code/components/jomjol_mqtt/interface_mqtt.h +++ b/code/components/jomjol_mqtt/interface_mqtt.h @@ -10,7 +10,7 @@ void MQTTdestroy(); //void MQTTInit(std::string _mqttURI, std::string _clientid, std::string _user = "", std::string _password = ""); -void MQTTPublish(std::string _key, std::string _content, int retained_flag = 0); +void MQTTPublish(std::string _key, std::string _content, int retained_flag = 1); // retain flag enabled by default bool MQTTisConnected(); diff --git a/code/components/jomjol_tfliteclass/CTfLiteClass.cpp b/code/components/jomjol_tfliteclass/CTfLiteClass.cpp index df008a1b..15affbc0 100644 --- a/code/components/jomjol_tfliteclass/CTfLiteClass.cpp +++ b/code/components/jomjol_tfliteclass/CTfLiteClass.cpp @@ -87,6 +87,19 @@ void CTfLiteClass::GetInputDimension(bool silent = false) } } +int CTfLiteClass::ReadInputDimenstion(int _dim) +{ + if (_dim == 0) + return im_width; + if (_dim == 1)
+ return im_height; + if (_dim == 2) + return im_channel; + + return -1; +} + + int CTfLiteClass::GetAnzOutPut(bool silent) { diff --git a/code/components/jomjol_tfliteclass/CTfLiteClass.h b/code/components/jomjol_tfliteclass/CTfLiteClass.h index d6e7aed4..ef98c1fa 100644 --- a/code/components/jomjol_tfliteclass/CTfLiteClass.h +++ b/code/components/jomjol_tfliteclass/CTfLiteClass.h @@ -9,7 +9,6 @@ #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/schema/schema_generated.h" -//#include "tensorflow/lite/version.h" #include "tensorflow/lite/micro/kernels/micro_ops.h" #include "esp_err.h" #include "esp_log.h" @@ -65,8 +64,6 @@ class CTfLiteClass bool LoadInputImageBasis(CImageBasis *rs); void Invoke(); int GetAnzOutPut(bool silent = true); -// void GetOutPut(); -// int GetOutClassification(); int GetOutClassification(int _von = -1, int _bis = -1); int GetClassFromImageBasis(CImageBasis *rs); @@ -74,5 +71,6 @@ class CTfLiteClass float GetOutputValue(int nr); void GetInputDimension(bool silent); + int ReadInputDimenstion(int _dim); }; diff --git a/code/components/jomjol_wlan/connect_wlan.cpp b/code/components/jomjol_wlan/connect_wlan.cpp index ebcdf4be..3b2c1a1a 100644 --- a/code/components/jomjol_wlan/connect_wlan.cpp +++ b/code/components/jomjol_wlan/connect_wlan.cpp @@ -151,8 +151,24 @@ void wifi_init_sta(const char *_ssid, const char *_password, const char *_hostna if ((_ipadr != NULL) && (_gw != NULL) && (_netmask != NULL)) { + /* + tcpip_adapter_dhcpc_stop(TCPIP_ADAPTER_IF_STA); + tcpip_adapter_ip_info_t ip_info; + int a, b, c, d; + strinttoip4(_ipadr, a, b, c, d); + IP4_ADDR(&ip_info.ip, a, b, c, d); + strinttoip4(_gw, a, b, c, d); + IP4_ADDR(&ip_info.gw, a, b, c, d); + strinttoip4(_netmask, a, b, c, d); + IP4_ADDR(&ip_info.netmask, a, b, c, d); + + tcpip_adapter_set_ip_info(TCPIP_ADAPTER_IF_STA, &ip_info); + */ + + ESP_LOGI(TAG, "set IP %s, GW %s, Netmask %s manual", _ipadr, _gw, _netmask); esp_netif_dhcpc_stop(my_sta); + esp_netif_ip_info_t ip_info; int a, b, c, d; strinttoip4(_ipadr, a, b, c, d); @@ -168,6 +184,22 @@ void wifi_init_sta(const char *_ssid, const char *_password, const char *_hostna wifi_init_config_t cfg = WIFI_INIT_CONFIG_DEFAULT(); ESP_ERROR_CHECK(esp_wifi_init(&cfg)); + if ((_ipadr != NULL) && (_gw != NULL) && (_netmask != NULL)) + { + if (_dns == NULL) + _dns = _gw; + + ESP_LOGI(TAG, "set DNS manual"); + esp_netif_dns_info_t dns_info; + ip4_addr_t ip; + ip.addr = esp_ip4addr_aton(_dns); + ip_addr_set_ip4_u32(&dns_info.ip, ip.addr); + ESP_ERROR_CHECK(esp_netif_set_dns_info(my_sta, ESP_NETIF_DNS_MAIN, &dns_info)); + } + + + + esp_event_handler_instance_t instance_any_id; esp_event_handler_instance_t instance_got_ip; ESP_ERROR_CHECK(esp_event_handler_instance_register(WIFI_EVENT, diff --git a/code/components/tflite-lib/CMakeLists.txt b/code/components/tflite-lib/CMakeLists.txt index fab7027a..ab666ce0 100644 --- a/code/components/tflite-lib/CMakeLists.txt +++ b/code/components/tflite-lib/CMakeLists.txt @@ -1,3 +1,5 @@ +## TODO: GLOB is not a good way to collect files. 
Use explicit file list instead + cmake_minimum_required(VERSION 3.5) set(tflite_dir "${CMAKE_CURRENT_SOURCE_DIR}/tensorflow/lite") @@ -16,14 +18,30 @@ file(GLOB srcs_kernels "${tfmicro_kernels_dir}/*.c" "${tfmicro_kernels_dir}/*.cc") +# remove sources which will be provided by esp_nn +list(REMOVE_ITEM srcs_kernels + "${tfmicro_kernels_dir}/add.cc" + "${tfmicro_kernels_dir}/conv.cc" + "${tfmicro_kernels_dir}/depthwise_conv.cc" + "${tfmicro_kernels_dir}/fully_connected.cc" + "${tfmicro_kernels_dir}/mul.cc" + "${tfmicro_kernels_dir}/pooling.cc" + "${tfmicro_kernels_dir}/softmax.cc") + +FILE(GLOB esp_nn_kernels + "${tfmicro_kernels_dir}/esp_nn/*.cc") + set(lib_srcs "${srcs_micro}" "${srcs_kernels}" + "${esp_nn_kernels}" "${src_micro_frontend}" "${tflite_dir}/kernels/kernel_util.cc" "${tflite_dir}/micro/memory_planner/greedy_memory_planner.cc" "${tflite_dir}/micro/memory_planner/linear_memory_planner.cc" - "${tflite_dir}/c/common.c" + "${tflite_dir}/micro/arena_allocator/recording_simple_memory_allocator.cc" + "${tflite_dir}/micro/arena_allocator/simple_memory_allocator.cc" + "${tflite_dir}/c/common.cc" "${tflite_dir}/core/api/error_reporter.cc" "${tflite_dir}/core/api/flatbuffer_conversions.cc" "${tflite_dir}/core/api/op_resolver.cc" @@ -36,15 +54,17 @@ idf_component_register( INCLUDE_DIRS "." "third_party/gemmlowp" "third_party/flatbuffers/include" "third_party/ruy" - "third_party/kissfft") + "third_party/kissfft" + REQUIRES "esp-nn") # Reduce the level of paranoia to be able to compile TF sources target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-maybe-uninitialized -Wno-missing-field-initializers + -DESP_NN # enables ESP-NN optimizations by Espressif -Wno-type-limits) -target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-nonnull -Wno-nonnull -Wno-nonnull) -target_compile_options(${COMPONENT_LIB} PRIVATE $<$: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-return-type -Wno-strict-aliasing -std=gnu++14 -Wno-return-type -Wno-strict-aliasing -std=gnu++14 -Wno-return-type -Wno-strict-aliasing -std=gnu++14 >) +target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-nonnull) +target_compile_options(${COMPONENT_LIB} PRIVATE $<$: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-return-type -Wno-strict-aliasing 
-std=gnu++14 >) target_compile_options(${COMPONENT_LIB} INTERFACE $<$>:-DTF_LITE_STATIC_MEMORY>) target_link_libraries(${COMPONENT_LIB} PRIVATE -lm) diff --git a/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h b/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h new file mode 100644 index 00000000..b9d42845 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h @@ -0,0 +1,22 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Compatibility shim for new location of interface definitions. + +#ifndef TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ +#define TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" + +#endif // TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/builtin_ops.h b/code/components/tflite-lib/tensorflow/lite/builtin_ops.h new file mode 100644 index 00000000..67014928 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/builtin_ops.h @@ -0,0 +1,189 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_BUILTIN_OPS_H_ +#define TENSORFLOW_LITE_BUILTIN_OPS_H_ + +// DO NOT EDIT MANUALLY: This file is automatically generated by +// `schema/builtin_ops_header/generator.cc`. + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// The enum for builtin operators. +// Note: CUSTOM, DELEGATE, and PLACEHOLDER_FOR_GREATER_OP_CODES are 3 special +// ops which are not real built-in ops. 
+typedef enum { + kTfLiteBuiltinAdd = 0, + kTfLiteBuiltinAveragePool2d = 1, + kTfLiteBuiltinConcatenation = 2, + kTfLiteBuiltinConv2d = 3, + kTfLiteBuiltinDepthwiseConv2d = 4, + kTfLiteBuiltinDepthToSpace = 5, + kTfLiteBuiltinDequantize = 6, + kTfLiteBuiltinEmbeddingLookup = 7, + kTfLiteBuiltinFloor = 8, + kTfLiteBuiltinFullyConnected = 9, + kTfLiteBuiltinHashtableLookup = 10, + kTfLiteBuiltinL2Normalization = 11, + kTfLiteBuiltinL2Pool2d = 12, + kTfLiteBuiltinLocalResponseNormalization = 13, + kTfLiteBuiltinLogistic = 14, + kTfLiteBuiltinLshProjection = 15, + kTfLiteBuiltinLstm = 16, + kTfLiteBuiltinMaxPool2d = 17, + kTfLiteBuiltinMul = 18, + kTfLiteBuiltinRelu = 19, + kTfLiteBuiltinReluN1To1 = 20, + kTfLiteBuiltinRelu6 = 21, + kTfLiteBuiltinReshape = 22, + kTfLiteBuiltinResizeBilinear = 23, + kTfLiteBuiltinRnn = 24, + kTfLiteBuiltinSoftmax = 25, + kTfLiteBuiltinSpaceToDepth = 26, + kTfLiteBuiltinSvdf = 27, + kTfLiteBuiltinTanh = 28, + kTfLiteBuiltinConcatEmbeddings = 29, + kTfLiteBuiltinSkipGram = 30, + kTfLiteBuiltinCall = 31, + kTfLiteBuiltinCustom = 32, + kTfLiteBuiltinEmbeddingLookupSparse = 33, + kTfLiteBuiltinPad = 34, + kTfLiteBuiltinUnidirectionalSequenceRnn = 35, + kTfLiteBuiltinGather = 36, + kTfLiteBuiltinBatchToSpaceNd = 37, + kTfLiteBuiltinSpaceToBatchNd = 38, + kTfLiteBuiltinTranspose = 39, + kTfLiteBuiltinMean = 40, + kTfLiteBuiltinSub = 41, + kTfLiteBuiltinDiv = 42, + kTfLiteBuiltinSqueeze = 43, + kTfLiteBuiltinUnidirectionalSequenceLstm = 44, + kTfLiteBuiltinStridedSlice = 45, + kTfLiteBuiltinBidirectionalSequenceRnn = 46, + kTfLiteBuiltinExp = 47, + kTfLiteBuiltinTopkV2 = 48, + kTfLiteBuiltinSplit = 49, + kTfLiteBuiltinLogSoftmax = 50, + kTfLiteBuiltinDelegate = 51, + kTfLiteBuiltinBidirectionalSequenceLstm = 52, + kTfLiteBuiltinCast = 53, + kTfLiteBuiltinPrelu = 54, + kTfLiteBuiltinMaximum = 55, + kTfLiteBuiltinArgMax = 56, + kTfLiteBuiltinMinimum = 57, + kTfLiteBuiltinLess = 58, + kTfLiteBuiltinNeg = 59, + kTfLiteBuiltinPadv2 = 60, + kTfLiteBuiltinGreater = 61, + kTfLiteBuiltinGreaterEqual = 62, + kTfLiteBuiltinLessEqual = 63, + kTfLiteBuiltinSelect = 64, + kTfLiteBuiltinSlice = 65, + kTfLiteBuiltinSin = 66, + kTfLiteBuiltinTransposeConv = 67, + kTfLiteBuiltinSparseToDense = 68, + kTfLiteBuiltinTile = 69, + kTfLiteBuiltinExpandDims = 70, + kTfLiteBuiltinEqual = 71, + kTfLiteBuiltinNotEqual = 72, + kTfLiteBuiltinLog = 73, + kTfLiteBuiltinSum = 74, + kTfLiteBuiltinSqrt = 75, + kTfLiteBuiltinRsqrt = 76, + kTfLiteBuiltinShape = 77, + kTfLiteBuiltinPow = 78, + kTfLiteBuiltinArgMin = 79, + kTfLiteBuiltinFakeQuant = 80, + kTfLiteBuiltinReduceProd = 81, + kTfLiteBuiltinReduceMax = 82, + kTfLiteBuiltinPack = 83, + kTfLiteBuiltinLogicalOr = 84, + kTfLiteBuiltinOneHot = 85, + kTfLiteBuiltinLogicalAnd = 86, + kTfLiteBuiltinLogicalNot = 87, + kTfLiteBuiltinUnpack = 88, + kTfLiteBuiltinReduceMin = 89, + kTfLiteBuiltinFloorDiv = 90, + kTfLiteBuiltinReduceAny = 91, + kTfLiteBuiltinSquare = 92, + kTfLiteBuiltinZerosLike = 93, + kTfLiteBuiltinFill = 94, + kTfLiteBuiltinFloorMod = 95, + kTfLiteBuiltinRange = 96, + kTfLiteBuiltinResizeNearestNeighbor = 97, + kTfLiteBuiltinLeakyRelu = 98, + kTfLiteBuiltinSquaredDifference = 99, + kTfLiteBuiltinMirrorPad = 100, + kTfLiteBuiltinAbs = 101, + kTfLiteBuiltinSplitV = 102, + kTfLiteBuiltinUnique = 103, + kTfLiteBuiltinCeil = 104, + kTfLiteBuiltinReverseV2 = 105, + kTfLiteBuiltinAddN = 106, + kTfLiteBuiltinGatherNd = 107, + kTfLiteBuiltinCos = 108, + kTfLiteBuiltinWhere = 109, + kTfLiteBuiltinRank = 110, + kTfLiteBuiltinElu = 111, + 
kTfLiteBuiltinReverseSequence = 112, + kTfLiteBuiltinMatrixDiag = 113, + kTfLiteBuiltinQuantize = 114, + kTfLiteBuiltinMatrixSetDiag = 115, + kTfLiteBuiltinRound = 116, + kTfLiteBuiltinHardSwish = 117, + kTfLiteBuiltinIf = 118, + kTfLiteBuiltinWhile = 119, + kTfLiteBuiltinNonMaxSuppressionV4 = 120, + kTfLiteBuiltinNonMaxSuppressionV5 = 121, + kTfLiteBuiltinScatterNd = 122, + kTfLiteBuiltinSelectV2 = 123, + kTfLiteBuiltinDensify = 124, + kTfLiteBuiltinSegmentSum = 125, + kTfLiteBuiltinBatchMatmul = 126, + kTfLiteBuiltinPlaceholderForGreaterOpCodes = 127, + kTfLiteBuiltinCumsum = 128, + kTfLiteBuiltinCallOnce = 129, + kTfLiteBuiltinBroadcastTo = 130, + kTfLiteBuiltinRfft2d = 131, + kTfLiteBuiltinConv3d = 132, + kTfLiteBuiltinImag = 133, + kTfLiteBuiltinReal = 134, + kTfLiteBuiltinComplexAbs = 135, + kTfLiteBuiltinHashtable = 136, + kTfLiteBuiltinHashtableFind = 137, + kTfLiteBuiltinHashtableImport = 138, + kTfLiteBuiltinHashtableSize = 139, + kTfLiteBuiltinReduceAll = 140, + kTfLiteBuiltinConv3dTranspose = 141, + kTfLiteBuiltinVarHandle = 142, + kTfLiteBuiltinReadVariable = 143, + kTfLiteBuiltinAssignVariable = 144, + kTfLiteBuiltinBroadcastArgs = 145, + kTfLiteBuiltinRandomStandardNormal = 146, + kTfLiteBuiltinBucketize = 147, + kTfLiteBuiltinRandomUniform = 148, + kTfLiteBuiltinMultinomial = 149, + kTfLiteBuiltinGelu = 150, + kTfLiteBuiltinDynamicUpdateSlice = 151, + kTfLiteBuiltinRelu0To1 = 152, + kTfLiteBuiltinUnsortedSegmentProd = 153, +} TfLiteBuiltinOperator; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_LITE_BUILTIN_OPS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/c/builtin_op_data.h b/code/components/tflite-lib/tensorflow/lite/c/builtin_op_data.h index 7f160972..b8fdb7d1 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/builtin_op_data.h +++ b/code/components/tflite-lib/tensorflow/lite/c/builtin_op_data.h @@ -518,6 +518,9 @@ typedef struct { bool approximate; } TfLiteGeluParams; +typedef struct { + int num_segments; +} TfLiteUnsortedSegmentProdParams; #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h b/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h index 678dfae6..d947213b 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h +++ b/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h @@ -98,6 +98,7 @@ typedef enum { kTfLiteResource = 14, kTfLiteVariant = 15, kTfLiteUInt32 = 16, + kTfLiteUInt16 = 17, } TfLiteType; // Legacy. Will be deprecated in favor of TfLiteAffineQuantization. @@ -111,6 +112,18 @@ typedef struct TfLiteQuantizationParams { int32_t zero_point; } TfLiteQuantizationParams; +// -------------------------------------------------------------------------- +// Opaque types used by c_api.h, c_api_opaque.h and common.h. 
+ +// TfLiteOpaqueContext is an opaque version of TfLiteContext; +typedef struct TfLiteOpaqueContext TfLiteOpaqueContext; + +// TfLiteOpaqueNode is an opaque version of TfLiteNode; +typedef struct TfLiteOpaqueNode TfLiteOpaqueNode; + +// TfLiteOpaqueTensor is an opaque version of TfLiteTensor; +typedef struct TfLiteOpaqueTensor TfLiteOpaqueTensor; + #ifdef __cplusplus } // extern C #endif diff --git a/code/components/tflite-lib/tensorflow/lite/c/common.c b/code/components/tflite-lib/tensorflow/lite/c/common.cc similarity index 75% rename from code/components/tflite-lib/tensorflow/lite/c/common.c rename to code/components/tflite-lib/tensorflow/lite/c/common.cc index d149d22c..8548424d 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/common.c +++ b/code/components/tflite-lib/tensorflow/lite/c/common.cc @@ -14,13 +14,35 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/c/common.h" + #include "tensorflow/lite/c/c_api_types.h" +#ifdef TF_LITE_TENSORFLOW_PROFILER +#include <cstddef> + +#include "tensorflow/lite/core/macros.h" +#include "tensorflow/lite/tensorflow_profiler_logger.h" +#endif #ifndef TF_LITE_STATIC_MEMORY #include <stdlib.h> #include <string.h> #endif // TF_LITE_STATIC_MEMORY +#ifdef TF_LITE_TENSORFLOW_PROFILER +namespace tflite { +// Use weak symbols here (even though they are guarded by macros) to avoid +// build breakage when a benchmark build requires TFLite to run. The main +// benchmark library should carry the tensor_profiler_logger dependency. +TFLITE_ATTRIBUTE_WEAK void OnTfLiteTensorAlloc(TfLiteTensor* tensor, + size_t num_bytes); + +TFLITE_ATTRIBUTE_WEAK void OnTfLiteTensorDealloc(TfLiteTensor* tensor); +} // namespace tflite + +#endif // TF_LITE_TENSORFLOW_PROFILER + +extern "C" { + size_t TfLiteIntArrayGetSizeInBytes(int size) { static TfLiteIntArray dummy; @@ -34,13 +56,13 @@ size_t TfLiteIntArrayGetSizeInBytes(int size) { int TfLiteIntArrayEqual(const TfLiteIntArray* a, const TfLiteIntArray* b) { if (a == b) return 1; - if (a == NULL || b == NULL) return 0; + if (a == nullptr || b == nullptr) return 0; return TfLiteIntArrayEqualsArray(a, b->size, b->data); } int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size, const int b_data[]) { - if (a == NULL) return (b_size == 0); + if (a == nullptr) return (b_size == 0); if (a->size != b_size) return 0; int i = 0; for (; i < a->size; i++) @@ -52,7 +74,7 @@ int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size, TfLiteIntArray* TfLiteIntArrayCreate(int size) { size_t alloc_size = TfLiteIntArrayGetSizeInBytes(size); - if (alloc_size <= 0) return NULL; + if (alloc_size <= 0) return nullptr; TfLiteIntArray* ret = (TfLiteIntArray*)malloc(alloc_size); if (!ret) return ret; ret->size = size; @@ -60,7 +82,7 @@ TfLiteIntArray* TfLiteIntArrayCreate(int size) { } TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) { - if (!src) return NULL; + if (!src) return nullptr; TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size); if (ret) { memcpy(ret->data, src->data, src->size * sizeof(int)); @@ -97,9 +119,14 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); } void TfLiteTensorDataFree(TfLiteTensor* t) { if (t->allocation_type == kTfLiteDynamic || t->allocation_type == kTfLitePersistentRo) { - free(t->data.raw); + if (t->data.raw) { +#ifdef TF_LITE_TENSORFLOW_PROFILER + tflite::OnTfLiteTensorDealloc(t); +#endif + free(t->data.raw); + } } - t->data.raw = NULL; + t->data.raw = nullptr; } void TfLiteQuantizationFree(TfLiteQuantization*
quantization) { @@ -108,31 +135,31 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { (TfLiteAffineQuantization*)(quantization->params); if (q_params->scale) { TfLiteFloatArrayFree(q_params->scale); - q_params->scale = NULL; + q_params->scale = nullptr; } if (q_params->zero_point) { TfLiteIntArrayFree(q_params->zero_point); - q_params->zero_point = NULL; + q_params->zero_point = nullptr; } free(q_params); } - quantization->params = NULL; + quantization->params = nullptr; quantization->type = kTfLiteNoQuantization; } void TfLiteSparsityFree(TfLiteSparsity* sparsity) { - if (sparsity == NULL) { + if (sparsity == nullptr) { return; } if (sparsity->traversal_order) { TfLiteIntArrayFree(sparsity->traversal_order); - sparsity->traversal_order = NULL; + sparsity->traversal_order = nullptr; } if (sparsity->block_map) { TfLiteIntArrayFree(sparsity->block_map); - sparsity->block_map = NULL; + sparsity->block_map = nullptr; } if (sparsity->dim_metadata) { @@ -141,13 +168,13 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i]; if (metadata.format == kTfLiteDimSparseCSR) { TfLiteIntArrayFree(metadata.array_segments); - metadata.array_segments = NULL; + metadata.array_segments = nullptr; TfLiteIntArrayFree(metadata.array_indices); - metadata.array_indices = NULL; + metadata.array_indices = nullptr; } } free(sparsity->dim_metadata); - sparsity->dim_metadata = NULL; + sparsity->dim_metadata = nullptr; } free(sparsity); @@ -156,16 +183,16 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { void TfLiteTensorFree(TfLiteTensor* t) { TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); - t->dims = NULL; + t->dims = nullptr; if (t->dims_signature) { - TfLiteIntArrayFree((TfLiteIntArray *) t->dims_signature); + TfLiteIntArrayFree((TfLiteIntArray*)t->dims_signature); } - t->dims_signature = NULL; + t->dims_signature = nullptr; TfLiteQuantizationFree(&t->quantization); TfLiteSparsityFree(t->sparsity); - t->sparsity = NULL; + t->sparsity = nullptr; } void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, @@ -185,20 +212,16 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, tensor->is_variable = is_variable; tensor->quantization.type = kTfLiteNoQuantization; - tensor->quantization.params = NULL; + tensor->quantization.params = nullptr; } TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) { - if (!src || !dst) - return kTfLiteOk; - if (src->bytes != dst->bytes) - return kTfLiteError; - if (src == dst) - return kTfLiteOk; + if (!src || !dst) return kTfLiteOk; + if (src->bytes != dst->bytes) return kTfLiteError; + if (src == dst) return kTfLiteOk; dst->type = src->type; - if (dst->dims) - TfLiteIntArrayFree(dst->dims); + if (dst->dims) TfLiteIntArrayFree(dst->dims); dst->dims = TfLiteIntArrayCopy(src->dims); memcpy(dst->data.raw, src->data.raw, src->bytes); dst->buffer_handle = src->buffer_handle; @@ -216,8 +239,17 @@ void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) { // TODO(b/145340303): Tensor data should be aligned. 
if (!tensor->data.raw) { tensor->data.raw = (char*)malloc(num_bytes); +#ifdef TF_LITE_TENSORFLOW_PROFILER + tflite::OnTfLiteTensorAlloc(tensor, num_bytes); +#endif } else if (num_bytes > tensor->bytes) { +#ifdef TF_LITE_TENSORFLOW_PROFILER + tflite::OnTfLiteTensorDealloc(tensor); +#endif tensor->data.raw = (char*)realloc(tensor->data.raw, num_bytes); +#ifdef TF_LITE_TENSORFLOW_PROFILER + tflite::OnTfLiteTensorAlloc(tensor, num_bytes); +#endif } tensor->bytes = num_bytes; } @@ -229,6 +261,8 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "NOTYPE"; case kTfLiteFloat32: return "FLOAT32"; + case kTfLiteUInt16: + return "UINT16"; case kTfLiteInt16: return "INT16"; case kTfLiteInt32: @@ -263,14 +297,6 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "Unknown type"; } -TfLiteDelegate TfLiteDelegateCreate(void) { - TfLiteDelegate d = { - .data_ = NULL, - .Prepare = NULL, - .CopyFromBufferHandle = NULL, - .CopyToBufferHandle = NULL, - .FreeBufferHandle = NULL, - .flags = kTfLiteDelegateFlagsNone, - }; - return d; -} +TfLiteDelegate TfLiteDelegateCreate() { return TfLiteDelegate{}; } + +} // extern "C" diff --git a/code/components/tflite-lib/tensorflow/lite/c/common.h b/code/components/tflite-lib/tensorflow/lite/c/common.h index 7056d1e2..8b8ffbe8 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/common.h +++ b/code/components/tflite-lib/tensorflow/lite/c/common.h @@ -173,8 +173,9 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); } \ } while (false) #else // TF_LITE_STRIP_ERROR_STRINGS -#define TF_LITE_KERNEL_LOG(context, ...) -#define TF_LITE_MAYBE_KERNEL_LOG(context, ...) +#define ARGS_UNUSED(...) (void)sizeof(#__VA_ARGS__) +#define TF_LITE_KERNEL_LOG(context, ...) ARGS_UNUSED(__VA_ARGS__) +#define TF_LITE_MAYBE_KERNEL_LOG(context, ...) ARGS_UNUSED(__VA_ARGS__) #endif // TF_LITE_STRIP_ERROR_STRINGS // Check whether value is true, and if not return kTfLiteError from @@ -316,6 +317,7 @@ typedef union TfLitePtrUnion { uint8_t* uint8; bool* b; int16_t* i16; + uint16_t* ui16; TfLiteComplex64* c64; TfLiteComplex128* c128; int8_t* int8; @@ -459,7 +461,8 @@ typedef struct TfLiteTensor { // Optional. Encodes shapes with unknown dimensions with -1. This field is // only populated when unknown dimensions exist in a read-write tensor (i.e. // an input or output tensor). (e.g. `dims` contains [1, 1, 1, 3] and - // `dims_signature` contains [1, -1, -1, 3]). + // `dims_signature` contains [1, -1, -1, 3]). Note that this field only + // exists when TF_LITE_STATIC_MEMORY is not defined. const TfLiteIntArray* dims_signature; } TfLiteTensor; @@ -839,6 +842,32 @@ typedef struct TfLiteContext { size_t* bytes); } TfLiteContext; +// `TfLiteRegistrationExternal` is an external version of `TfLiteRegistration` +// for the C API, which doesn't use internal types (such as `TfLiteContext`) +// but only uses stable API types (such as `TfLiteOpaqueContext`). The purpose +// of each field is exactly the same as with `TfLiteRegistration`. +typedef struct TfLiteRegistrationExternal { + // Custom op name. + const char* custom_name; + + // The version of the op. The version should be higher than 0. + const int version; + + // Initializes the op from serialized data. + void* (*init)(TfLiteOpaqueContext* context, const char* buffer, + size_t length); + + // The pointer `buffer` is the data previously returned by an init invocation. + void (*free)(TfLiteOpaqueContext* context, void* buffer); + + // Called when the inputs that this node depends on have been resized.
+ TfLiteStatus (*prepare)(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node); + + // Called when the node is executed. (should read node->inputs and output to + // node->outputs). + TfLiteStatus (*invoke)(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node); +} TfLiteRegistrationExternal; + typedef struct TfLiteRegistration { // Initializes the op from serialized data. // Called only *once* for the lifetime of the op, so any one-time allocations @@ -900,8 +929,31 @@ typedef struct TfLiteRegistration { // Note: It is the responsibility of the registration binder to set this // properly. int version; + + // The external version of `TfLiteRegistration`. Since we can't use internal + // types (such as `TfLiteContext`) in the C API while maintaining ABI + // stability, C API users provide a `TfLiteRegistrationExternal` to implement + // custom ops. We keep it inside of `TfLiteRegistration` and use it to route + // callbacks properly. + TfLiteRegistrationExternal* registration_external; } TfLiteRegistration; +// Old version of `TfLiteRegistration` to maintain binary backward +// compatibility. +// WARNING: This structure is deprecated / not an official part of the API. +// It should be only used for binary backward compatibility. +typedef struct TfLiteRegistration_V1 { + void* (*init)(TfLiteContext* context, const char* buffer, size_t length); + void (*free)(TfLiteContext* context, void* buffer); + TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node); + TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node); + const char* (*profiling_string)(const TfLiteContext* context, + const TfLiteNode* node); + int32_t builtin_code; + const char* custom_name; + int version; +} TfLiteRegistration_V1; + // The flags used in `TfLiteDelegate`. Note that this is a bitmask, so the // values should be 1, 2, 4, 8, ...etc. typedef enum TfLiteDelegateFlags { diff --git a/code/components/tflite-lib/tensorflow/lite/context_util.h b/code/components/tflite-lib/tensorflow/lite/context_util.h new file mode 100644 index 00000000..7c8a5abd --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/context_util.h @@ -0,0 +1,51 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This provides a few C++ helpers that are useful for manipulating C structures +// in C++. +#ifndef TENSORFLOW_LITE_CONTEXT_UTIL_H_ +#define TENSORFLOW_LITE_CONTEXT_UTIL_H_ + +#include <stddef.h> + +#include "tensorflow/lite/c/common.h" + +namespace tflite { + +// Provide a range iterable wrapper for TfLiteIntArray* (C lists that the +// TfLite C API uses). Can't use the google array_view, since we can't depend +// on even absl for embedded device reasons. +class TfLiteIntArrayView { + public: + // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null + // and this view does not take ownership of it.
+ explicit TfLiteIntArrayView(const TfLiteIntArray* int_array) + : int_array_(int_array) {} + + TfLiteIntArrayView(const TfLiteIntArrayView&) = default; + TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default; + + typedef const int* const_iterator; + const_iterator begin() const { return int_array_->data; } + const_iterator end() const { return &int_array_->data[int_array_->size]; } + size_t size() const { return end() - begin(); } + int operator[](size_t pos) const { return int_array_->data[pos]; } + + private: + const TfLiteIntArray* int_array_; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_CONTEXT_UTIL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc index dfa0ccfd..5175d903 100644 --- a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -208,6 +208,14 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseBatchToSpaceNd(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_BROADCAST_ARGS: { + return ParseBroadcastArgs(op, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_BROADCAST_TO: { + return ParseBroadcastTo(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_CALL_ONCE: { return ParseCallOnce(op, error_reporter, allocator, builtin_data); } @@ -336,6 +344,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseLogSoftmax(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_LSTM: { + return ParseLSTM(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_MAXIMUM: { return ParseMaximum(op, error_reporter, allocator, builtin_data); } @@ -605,37 +617,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_LSTM: { - auto params = safe_allocator.Allocate<TfLiteLSTMParams>(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) { - params->activation = - ConvertActivation(lstm_params->fused_activation_function()); - params->cell_clip = lstm_params->cell_clip(); - params->proj_clip = lstm_params->proj_clip(); - switch (lstm_params->kernel_type()) { - case LSTMKernelType_FULL: - params->kernel_type = kTfLiteLSTMFullKernel; - break; - case LSTMKernelType_BASIC: - params->kernel_type = kTfLiteLSTMBasicKernel; - break; - default: - TF_LITE_REPORT_ERROR(error_reporter, - "Unhandled LSTM kernel type: %d", - lstm_params->kernel_type()); - return kTfLiteError; - } - params->asymmetric_quantize_inputs = - lstm_params->asymmetric_quantize_inputs(); - } else { - TF_LITE_REPORT_ERROR(error_reporter, - "No valid LSTM builtin options exist"); - return kTfLiteError; - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: { return ParseUnidirectionalSequenceLSTM(op, error_reporter, allocator, builtin_data); @@ -855,6 +836,16 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } + case BuiltinOperator_UNSORTED_SEGMENT_PROD: { + auto params = safe_allocator.Allocate<TfLiteUnsortedSegmentProdParams>(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + if (const auto* unsorted_segment_prod_params = +
op->builtin_options_as_UnsortedSegmentProdOptions()) { + params->num_segments = unsorted_segment_prod_params->num_segments(); + } + *builtin_data = params.release(); + return kTfLiteOk; + } // Below are the ops with no builtin_data structure. // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are // ok for now, since there is no call implementation either. @@ -867,6 +858,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_MATRIX_DIAG: case BuiltinOperator_MATRIX_SET_DIAG: case BuiltinOperator_RELU_N1_TO_1: + case BuiltinOperator_RELU_0_TO_1: case BuiltinOperator_SELECT: case BuiltinOperator_SELECT_V2: case BuiltinOperator_SLICE: @@ -883,7 +875,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_SCATTER_ND: case BuiltinOperator_DENSIFY: case BuiltinOperator_SEGMENT_SUM: - case BuiltinOperator_BROADCAST_TO: case BuiltinOperator_RFFT2D: case BuiltinOperator_IMAG: case BuiltinOperator_REAL: @@ -891,7 +882,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_HASHTABLE_FIND: case BuiltinOperator_HASHTABLE_IMPORT: case BuiltinOperator_HASHTABLE_SIZE: - case BuiltinOperator_BROADCAST_ARGS: + case BuiltinOperator_DYNAMIC_UPDATE_SLICE: return kTfLiteOk; case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES: return kTfLiteError; @@ -916,6 +907,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, case TensorType_INT16: *type = kTfLiteInt16; return kTfLiteOk; + case TensorType_UINT16: + *type = kTfLiteUInt16; + return kTfLiteOk; case TensorType_INT32: *type = kTfLiteInt32; return kTfLiteOk; @@ -1085,6 +1079,22 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator*, ErrorReporter*, return kTfLiteOk; } +// We have this parse function instead of directly returning kTfLiteOk from the +// switch-case in ParseOpData because this function is used as part of the +// selective registration for the OpResolver implementation in micro. +TfLiteStatus ParseBroadcastArgs(const Operator*, ErrorReporter*, + BuiltinDataAllocator*, void**) { + return kTfLiteOk; +} + +// We have this parse function instead of directly returning kTfLiteOk from the +// switch-case in ParseOpData because this function is used as part of the +// selective registration for the OpResolver implementation in micro. 
+TfLiteStatus ParseBroadcastTo(const Operator*, ErrorReporter*, + BuiltinDataAllocator*, void**) { + return kTfLiteOk; +} + TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data) { @@ -1605,6 +1615,40 @@ TfLiteStatus ParseLogSoftmax(const Operator*, ErrorReporter*, return kTfLiteOk; } +TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + auto params = safe_allocator.Allocate<TfLiteLSTMParams>(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) { + params->activation = + ConvertActivation(lstm_params->fused_activation_function()); + params->cell_clip = lstm_params->cell_clip(); + params->proj_clip = lstm_params->proj_clip(); + switch (lstm_params->kernel_type()) { + case LSTMKernelType_FULL: + params->kernel_type = kTfLiteLSTMFullKernel; + break; + case LSTMKernelType_BASIC: + params->kernel_type = kTfLiteLSTMBasicKernel; + break; + default: + TF_LITE_REPORT_ERROR(error_reporter, "Unhandled LSTM kernel type: %d", + lstm_params->kernel_type()); + return kTfLiteError; + } + params->asymmetric_quantize_inputs = + lstm_params->asymmetric_quantize_inputs(); + } else { + TF_LITE_REPORT_ERROR(error_reporter, "No valid LSTM builtin options exist"); + return kTfLiteError; + } + *builtin_data = params.release(); + return kTfLiteOk; +} + // We have this parse function instead of directly returning kTfLiteOk from the // switch-case in ParseOpData because this function is used as part of the // selective registration for the OpResolver implementation in micro. @@ -2337,6 +2381,31 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter, return kTfLiteOk; } +TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + std::unique_ptr<TfLiteWhileParams, SafeBuiltinDataAllocator::BuiltinDataDeleter> + params = safe_allocator.Allocate<TfLiteWhileParams>(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + + const WhileOptions* schema_params = op->builtin_options_as_WhileOptions(); + + if (schema_params != nullptr) { + params->cond_subgraph_index = schema_params->cond_subgraph_index(); + params->body_subgraph_index = schema_params->body_subgraph_index(); + } else { + // TODO(b/157480169): We should either return kTfLiteError or fill in some + // reasonable defaults in the params struct. We are not doing so until we + // better understand the ramifications of changing the legacy behavior. + } + + *builtin_data = params.release(); + return kTfLiteOk; +} + // We have this parse function instead of directly returning kTfLiteOk from the // switch-case in ParseOpData because this function is used as part of the // selective registration for the OpResolver implementation in micro.
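The refactor above moves the inline LSTM and While option handling into standalone ParseLSTM / ParseWhile functions so that micro's selective op registration can reference each parser individually. A minimal sketch of how a caller drives one of these parse functions; the wrapper name here is illustrative and not part of the patch:

    #include "tensorflow/lite/core/api/flatbuffer_conversions.h"

    // Hypothetical wrapper: resolve the LSTM options of one flatbuffer op.
    // ParseLSTM allocates a TfLiteLSTMParams through `allocator` and returns
    // it via `builtin_data`; the kernel later reads it as node->builtin_data.
    TfLiteStatus ResolveLstmOptions(const tflite::Operator* op,
                                    tflite::ErrorReporter* reporter,
                                    tflite::BuiltinDataAllocator* allocator,
                                    void** builtin_data) {
      return tflite::ParseLSTM(op, reporter, allocator, builtin_data);
    }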
diff --git a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h index 8cf889d8..cd6637bc 100644 --- a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -98,6 +98,15 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator* op, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseBroadcastArgs(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + +TfLiteStatus ParseBroadcastTo(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); @@ -232,6 +241,9 @@ TfLiteStatus ParseLogSoftmax(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseMaximum(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); @@ -379,6 +391,9 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseZerosLike(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); diff --git a/code/components/tflite-lib/tensorflow/lite/core/api/op_resolver.h b/code/components/tflite-lib/tensorflow/lite/core/api/op_resolver.h index 49ac778e..cec1f2dd 100644 --- a/code/components/tflite-lib/tensorflow/lite/core/api/op_resolver.h +++ b/code/components/tflite-lib/tensorflow/lite/core/api/op_resolver.h @@ -23,6 +23,16 @@ limitations under the License. #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/schema/schema_generated.h" +// Opaque type similar to TfLiteDelegate / TfLiteOpaqueDelegate. +// This is used for cases (e.g. when using "TF Lite with Google Play Services") +// where the TF Lite runtime might be built using a newer (or older) +// version of the TF Lite sources than the app, and hence might have a +// different definition of the TfLiteDelegate type. TF Lite APIs use +// TfLiteOpaqueDelegate rather than TfLiteDelegate when they want to +// refer to a delegate defined with that potentially different version +// of the TfLiteDelegate type. +struct TfLiteOpaqueDelegateStruct; + namespace tflite { /// Abstract interface that returns TfLiteRegistrations given op codes or custom /// op names. @@ -37,8 +47,10 @@ class OpResolver { virtual const TfLiteRegistration* FindOp(const char* op, int version) const = 0; + // Represents a sequence of delegates. using TfLiteDelegatePtrVector = std::vector<std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>>; + // Returns optional delegates for resolving and handling ops in the flatbuffer // model. This may be used in addition to the standard TfLiteRegistration // lookup for graph resolution. @@ -47,16 +59,55 @@ class OpResolver { return {}; } - // Represent a function that creates a TfLite delegate instance. + // Represents a function that creates a TfLite delegate instance.
using TfLiteDelegateCreator = std::function<std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>( int /*num_threads*/)>; + + // Represents a sequence of delegate creator functions. using TfLiteDelegateCreators = std::vector<TfLiteDelegateCreator>; + // Returns a vector of delegate creators to create optional delegates for // resolving and handling ops in the flatbuffer model. This may be used in // addition to the standard TfLiteRegistration lookup for graph resolution. + // + // Note that this method is not used (will not be called) if you are using + // TF Lite in Google Play Services; the GetOpaqueDelegateCreators method + // (see below) is used for that case. virtual TfLiteDelegateCreators GetDelegateCreators() const { return {}; } + // TODO(b/202712825): it would be nice if we could avoid the need for separate + // "opaque" types & methods for use only with TF Lite in Google Play Services. + + // Represents an opaque delegate instance. + // WARNING: Experimental interface, subject to change. + using TfLiteOpaqueDelegatePtr = + std::unique_ptr<TfLiteOpaqueDelegateStruct, void (*)(TfLiteOpaqueDelegateStruct*)>; + + // Represents a function that creates an opaque delegate instance. + // WARNING: Experimental interface, subject to change. + using TfLiteOpaqueDelegateCreator = + std::function<TfLiteOpaqueDelegatePtr(int /*num_threads*/)>; + + // Represents a sequence of opaque delegate creator functions. + // WARNING: Experimental interface, subject to change. + using TfLiteOpaqueDelegateCreators = std::vector<TfLiteOpaqueDelegateCreator>; + + // Returns a vector of opaque delegate creators to create optional opaque + // delegates for resolving and handling ops in the flatbuffer model. This may + // be used in addition to the standard TfLiteRegistration lookup for graph + // resolution. + // + // Note that this method will be called only if you are using TF Lite in + // Google Play Services; if you are using regular TF Lite, GetDelegateCreators + // (see above) is used instead. + // + // WARNING: Experimental interface, subject to change. + virtual TfLiteOpaqueDelegateCreators GetOpaqueDelegateCreators() const { + return {}; + } + virtual ~OpResolver() {} private: diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h index 0671ce73..ab0c8f96 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h @@ -60,9 +60,8 @@ void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch, // Cwise product of two vectors. template <typename T> -inline void VectorVectorCwiseProduct(const T* __restrict__ vector1, - const T* __restrict__ vector2, int v_size, - T* __restrict__ result) { +inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2, + int v_size, T* result) { for (int v = 0; v < v_size; v++) { *result++ = *vector1++ * *vector2++; } @@ -117,6 +116,367 @@ void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch, } } +// Checks if all entries of vector are zero for float. +bool IsZeroVector(const float* vector, int v_size); + +// Checks if all entries of vector are zero for int8. +bool IsZeroVector(const int8_t* vector, int v_size); + +// Quantizes a buffer of floating point values using a symmetric quantization +// (i.e. linear quantization without an offset) to 8-bit signed integers. +// It also outputs the range (min, max) of the floating point buffer, and the +// scaling factor used to quantize the values.
+void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor); + +// Quantizes a buffer of floating point values using a symmetric quantization +// (i.e. linear quantization without an offset) to 8-bit signed integers. +// It uses the range (min, max) provided to the function to calculate the +// appropriate scaling factor to quantize the values. +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor); + +void AsymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* scaling_factor, + int32_t* offset); + +// Helper function to quantize floats. +// float_data_ptr input float vectors +// n_batch number of input vectors +// n_data size of a single input vector +// quantized_data_ptr (out) vector with quantized data +// scaling_factors (out) scaling factors (one per vector) +// zero_points (out) zero points (one per vector) +// do_asymmetric controls if the quantization should be asymmetric. +inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch, + int n_data, int8_t* quantized_data_ptr, + float* scaling_factors, int32_t* zero_points, + bool do_asymmetric) { + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_data; + if (do_asymmetric) { + tensor_utils::AsymmetricQuantizeFloats( + float_data_ptr + offset, n_data, quantized_data_ptr + offset, + &scaling_factors[b], &zero_points[b]); + } else { + float unused_min, unused_max; + tensor_utils::SymmetricQuantizeFloats( + float_data_ptr + offset, n_data, quantized_data_ptr + offset, + &unused_min, &unused_max, &scaling_factors[b]); + } + } +} + +// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch +// dimension composed by input vectors independent from each other). The result +// of the multiplication is accumulated to the passed result buffer. +// More specifically, for a matrix M of shape [n, i] and a batched-vector +// of shape [i, batch] it will first compute the product of shape [n, batch]. +// This product will be accumulated to the result buffer. +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result); + +// Same as the function above, but the matrix is a sparse tensor with block +// pattern 1x4. +// This function assumes that m_cols is a multiple of the block size (4 in this +// case) so that there's no incomplete block. +void SparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result); + +// Same as the function above, but the matrix is stored in block compressed +// sparse row format with block pattern 1x16 which consists of two arrays: +// 1. A matrix array stores non-zero blocks of the matrix in row major. +// 2. A ledger array stores nrows groups, one group per row. Each group starts +// with an integer representing the number of non-zero blocks for the +// corresponding row and follows with column indexes of the first element +// of each non-zero block. +// This function assumes that +// 1. m_cols is a multiple of 16 so that all blocks are full blocks. +// 2. m_cols < 254 * 16 so that block index can be represented by uint8. 
+void SparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result); + +// Same as the function above, but for values quantized using symmetric +// quantization (e.g. by calling SymmetricQuantizeFloats). +// The passed scaling factors are a buffer of the quantization scaling factors +// that will be used to dequantize the products into the final result buffer. +// These scaling factors are the multiplication of the matrix scaling factor +// by the vector's scaling factor, one per batch (i.e. this allows quantizing +// each batch in the batch-vector matrix independently). +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result); + +// Same as the function above except that vector values +// are quantized with asymmetric quantization per-batch and the matrix +// is quantized per row. +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result, const float* __restrict__ per_channel_scale, + const int32_t* __restrict__ input_offset); + +// Same as the function above, but the matrix is a sparse tensor with block +// pattern 1x16. +// This function assumes that m_cols is a multiple of the block size (16 in this +// case) so that there's no incomplete block. Also, it assumes all offsets of +// input, output and filter are zero. +void SparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result); + +// Same as the function above, but the matrix is stored in block compressed +// sparse row format with block pattern 1x16 which consists of two arrays: +// 1. A matrix array stores non-zero blocks of the matrix in row major. +// 2. A ledger array stores nrows groups, one group per row. Each group starts +// with an integer representing the number of non-zero blocks for the +// corresponding row followed by column index of the first element of +// each non-zero block. +// This function assumes that +// 1. m_cols is a multiple of 16 so that all blocks are full blocks. +// 2. m_cols < 254 * 16 so that block index can be represented by uint8. +void SparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger, + const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result); + +// Same as the above 8, 8, 8 integer matmul, except that it applies a zero +// point and does not accumulate into the result. +// TODO(b/148688698): remove this function by folding zero point calculation in +// prepare() function.
+void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp); + +// Same as above but has 16 bit and 8 bit input and 8 bit output. +// Used in projection when hidden is 16bit. +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output); + +// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a quantized +// vector. +// Parameters: +// - input: batch vector of size n_batch * n_input; 16 bit. +// - layer_norm_weights: the quantized layer normalization weights. +// - bias: the bias for the layer normalization. +// - layer_norm_scale_a: multiplier for scale factor. +// - layer_norm_scale_b: shift for scale factor. +// - variance_limit: the guard to make sure the inverse does not overflow. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, + const int32_t* bias, int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, int32_t variance_limit, + int n_batch, int n_input, int16_t* output); + +// Same as above but the internal calculation is done in float. +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output); + +// Apply Sigmoid to a quantized vector. +// Parameters: +// - input: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +// The input is in Q3.12 format and the output is in Q0.15 format. +void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output); + +// Same as above but the internal calculation is float. +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output); + +// Apply Tanh to a quantized vector. +// Parameters: +// - integer_bits: the integer bits of the input. +// Currently supports 0, 1, 2, 3, 4, 5, 6. +// - input: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +// The input is in Qm.15-m format and the output is in Q0.15 format. +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); + +// Apply Tanh to a quantized vector. The internal calculation is in float. +// - Input has 2^(integer_bits) as scale. +// - Output has Q0.15 as scale. +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output); + +// Element-wise multiplication of two quantized vectors. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - shift: the shift needed to produce the output. +// - output: the 16 bit output of size n_batch * n_input.
+// Output does not need to be initialized. +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int16_t* output); + +// Element-wise multiplication of two quantized vectors. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - shift: the shift needed to produce the output. +// - output: the 8 bit output of size n_batch * n_input. +// Output does not need to be initialized. +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int8_t* output); + +// Element-wise multiplication of two quantized vectors with rescaling. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - multiplier: the multiplier part of scale. +// - shift: the shift part of scale. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 8 bit output of size n_batch * n_input. +// - output_zp: the zero point of output. +// Output does not need to be initialized. +// Multiplier ("m") and shift ("s") are connected to the scale by +// scale = m * 2^(s - 31). +void CwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output); + +// Element-wise saturating addition of two quantized vectors without rescaling. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output of size n_batch * n_input. +// Output does not need to be initialized. +void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int16_t* output); + +// Element-wise in-place clipping of a vector. Overloaded for float, int16_t, +// int8_t. Parameters: +// - vector: vector of size v_size. +// - v_size: the size of the vector. +// - clipping_value: the value used for clipping. +void CwiseClipping(float* vector, const int v_size, const float clipping_value); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value); +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value); + +// Dot product of two vectors. +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size); + +// Dot product of two batch vectors of size n_batch * v_size: +// vector1 = [x_1_1, x_1_2, ..., x_1_vsize, +// x_2_1, x_2_2, ..., x_2_vsize, +// ... +// x_nbatch_1,..., x_nbatch_vsize] +// vector2 = [y_1_1, y_1_2, ..., y_1_vsize, +// y_2_1, y_2_2, ..., y_2_vsize, +// ... +// y_nbatch_1,..., y_nbatch_vsize] +// Then result will be a vector of n_batch size starting from 'result': +// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize, +// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize, +// ... +// x_nbatch_1 * y_nbatch_1 + ...
+ x_nbatch_vsize * y_nbatch_vsize] +template <typename T> +inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2, + int v_size, int n_batch, + T* result) { + for (int b = 0; b < n_batch; b++) { + result[b] = VectorVectorDotProduct(vector1, vector2, v_size); + vector1 += v_size; + vector2 += v_size; + } +} + +// Same as above but input is 16bit and output is 32bit. +void BatchVectorBatchVectorDotProduct(const int16_t* vector1, + const int16_t* vector2, int v_size, + int n_batch, int32_t* result); + +// Same as above, but inputs are 16bit integer and output is 16bit integer. +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void Sub1Vector(const float* vector, int v_size, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG) for int16 input. +// "vector" has range [0, 32767] because it is the output of sigmoid function. +void Sub1Vector(const int16_t* vector, int v_size, int16_t* result); + +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// output_vector: float pointer to output vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void ReductionSumVector(const float* input_vector, float* output_vector, + int output_size, int reduction_size); + +// Same as above but input/output is 32 bit integer. +void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + +// Same as above but input is 8 bit integer. +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + +// Layer norm for each batch. +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch); + +// Saturating add with rescale on both inputs. +void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output); + } // namespace tensor_utils } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 5fe01da2..767ad6ab 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/kernels/internal/tensor_utils_common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h new file mode 100644 index 00000000..d93c316d --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +template +void BroadcastArgs(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Gets data at the backward index i of the shape tensor. Returns 1 if the + // index is out of range. + auto get_shape_data = [](const RuntimeShape& shape, const T* data, + int backward_idx) -> T { + int forward_idx = shape.FlatSize() - 1 - backward_idx; + if (forward_idx < 0) return 1; + return data[forward_idx]; + }; + + int output_num_elements = output_shape.FlatSize(); + for (int i = 0; i < output_num_elements; ++i) { + int backward_i = output_num_elements - 1 - i; + int shape1_i = get_shape_data(input1_shape, input1_data, i); + int shape2_i = get_shape_data(input2_shape, input2_data, i); + if (shape1_i == 1) { + output_data[backward_i] = shape2_i; + } else if (shape2_i == 1) { + output_data[backward_i] = shape1_i; + } else { + TFLITE_CHECK_EQ(shape1_i, shape2_i); + output_data[backward_i] = shape1_i; + } + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h new file mode 100644 index 00000000..f106b2b5 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace reference_ops { +template <int N> +void BroadcastImpl(const NdArrayDesc<N>& input_desc, const char* input_data, + const NdArrayDesc<N>& output_desc, char* output_data, + int indexes[N], int dim, const int last_broadcasting_dim, + const int type_size) { + // Copy data from input to output. + if (dim == last_broadcasting_dim) { + int copy_size = output_desc.strides[dim] * type_size; + const char* data_src = + input_data + SubscriptToIndex(input_desc, indexes) * type_size; + char* data_dst = + output_data + SubscriptToIndex(output_desc, indexes) * type_size; + for (int i = 0; i < output_desc.extents[dim]; ++i, data_dst += copy_size) { + memcpy(data_dst, data_src, copy_size); + } + return; + } + + // Recursive call to find the next broadcasting. + for (indexes[dim] = 0; indexes[dim] < input_desc.extents[dim]; + ++indexes[dim]) { + BroadcastImpl(input_desc, input_data, output_desc, output_data, indexes, + dim + 1, last_broadcasting_dim, type_size); + } + + // Duplicate data in output tensor. + indexes[dim] = 0; + if (input_desc.extents[dim] != output_desc.extents[dim]) { + int copy_size = output_desc.strides[dim] * type_size; + char* data_src = + output_data + SubscriptToIndex(output_desc, indexes) * type_size; + char* data_dst = data_src + copy_size; + for (int i = 1; i < output_desc.extents[dim]; ++i, data_dst += copy_size) { + memcpy(data_dst, data_src, copy_size); + } + } +} + +template <int N = 5> +inline void BroadcastTo(const RuntimeShape& unextended_input_shape, + const char* input_data, + const RuntimeShape& unextended_output_shape, + char* output_data, TfLiteType data_type) { + NdArrayDesc<N> input_desc; + NdArrayDesc<N> output_desc; + CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape), + &input_desc); + CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), + &output_desc); + + // Get the last dimension that has broadcasting. At this dimension, the data + // is copied from input tensor to output tensor. + int last_broadcast_dim = -1; + for (int i = N - 1; i >= 0; --i) { + if (input_desc.extents[i] != output_desc.extents[i]) { + last_broadcast_dim = i; + break; + } + } + + // If non-broadcasting, just copy data from input to output tensor. + if (last_broadcast_dim == -1) { + memcpy(output_data, input_data, + unextended_input_shape.FlatSize() * TfLiteTypeGetSize(data_type)); + return; + } + + // Broadcasting using memcpy.
+ int indexes[N] = {0}; + BroadcastImpl(input_desc, input_data, output_desc, output_data, indexes, 0, + last_broadcast_dim, TfLiteTypeGetSize(data_type)); +} +} // namespace reference_ops +} // namespace tflite +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h index 5a6369d8..ac5f04f6 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h @@ -43,7 +43,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, (void)im2col_data; // only used in optimized code. (void)im2col_shape; // only used in optimized code. const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -52,14 +52,20 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; float total = 0.f; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -74,10 +80,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, if (!is_point_inside_image) { continue; } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - float input_value = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + float input_value = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; float filter_value = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; total += (input_value * filter_value); @@ -126,7 +133,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -135,6 +142,10 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const int 
input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -143,6 +154,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -158,9 +170,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; acc += @@ -206,7 +220,7 @@ inline void HybridConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -215,18 +229,24 @@ inline void HybridConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { const int in_x = in_x_origin + dilation_width_factor * filter_x; const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -235,7 +255,8 @@ inline void HybridConvPerChannel( if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { int32_t input_val = input_data[Offset( - input_shape, 
batch, in_y, in_x, in_channel)]; + input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h index cda1b5cf..b1204cc5 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/hard_swish.h @@ -23,9 +23,9 @@ namespace tflite { namespace reference_ops { inline int16_t SaturatingLeftShift(int16_t value, int amount) { - int32_t result = static_cast<int32_t>(value) * (1 << amount); - result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max()); - result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min()); + int64_t result = static_cast<int64_t>(value) * (1 << amount); + result = std::min<int64_t>(result, std::numeric_limits<int16_t>::max()); + result = std::max<int64_t>(result, std::numeric_limits<int16_t>::min()); return result; } diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h index 3a4164d3..3f869a3a 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h @@ -48,7 +48,7 @@ inline void ConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -59,6 +59,10 @@ inline void ConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -67,6 +71,7 @@ inline void ConvPerChannel( for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -82,9 +87,11 @@ inline void ConvPerChannel( continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 32 bits accumulator.
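To make the hard_swish fix above concrete: the old code computed `value * (1 << amount)` in a 32-bit accumulator, which can wrap for legal int16 inputs before the clamp ever runs. Below is a minimal, self-contained sketch of the corrected behaviour; the `main` driver and its example values are ours for illustration, not part of the library.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// Widen to 64 bits *before* multiplying, then clamp to the int16_t range.
// With a 32-bit accumulator, 32767 << 20 (~3.4e10) exceeds INT32_MAX and
// wraps, so the clamp could saturate to the wrong side.
static int16_t SaturatingLeftShift(int16_t value, int amount) {
  int64_t result = static_cast<int64_t>(value) * (1 << amount);
  result = std::min<int64_t>(result, std::numeric_limits<int16_t>::max());
  result = std::max<int64_t>(result, std::numeric_limits<int16_t>::min());
  return static_cast<int16_t>(result);
}

int main() {
  printf("%d\n", SaturatingLeftShift(32767, 20));   // 32767: saturates high
  printf("%d\n", SaturatingLeftShift(-32768, 20));  // -32768: saturates low
  printf("%d\n", SaturatingLeftShift(3, 2));        // 12: in range, exact
}
```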
@@ -126,12 +133,13 @@ inline void ConvPerChannel( // Fixed-point per-channel-quantization convolution reference kernel. // 16-bit data and 8-bit filter +template <typename AccumScalar> inline void ConvPerChannel( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int16_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, - const std::int64_t* bias_data, const RuntimeShape& output_shape, + const AccumScalar* bias_data, const RuntimeShape& output_shape, int16_t* output_data) { // Get parameters. const int stride_width = params.stride_width; @@ -151,7 +159,7 @@ inline void ConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -162,6 +170,10 @@ inline void ConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -170,7 +182,8 @@ inline void ConvPerChannel( for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - std::int64_t acc = 0; + auto group = out_channel / filters_per_group; + AccumScalar acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -185,9 +198,11 @@ inline void ConvPerChannel( continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 64 bits accumulator.
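All of the conv.h hunks above add the same bookkeeping (`filter_input_depth`, `groups`, `filters_per_group`, and the `in_channel + group * filter_input_depth` offset), which generalizes the reference kernels from dense to grouped convolution. The sketch below isolates just that channel mapping; the concrete dimensions are made up for illustration.

```cpp
#include <cassert>
#include <cstdio>

// Each output channel reads only a filter_input_depth-wide slice of the
// input channels, selected by its group (names mirror the diff above).
int main() {
  const int input_depth = 8;         // input_shape.Dims(3)
  const int filter_input_depth = 4;  // filter_shape.Dims(3)
  const int output_depth = 6;        // filter_shape.Dims(0)

  assert(input_depth % filter_input_depth == 0);
  const int groups = input_depth / filter_input_depth;  // 2
  const int filters_per_group = output_depth / groups;  // 3

  for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
    const int group = out_channel / filters_per_group;
    const int first = group * filter_input_depth;  // first input channel read
    const int last = first + filter_input_depth - 1;
    printf("out_channel %d -> group %d, input channels [%d..%d]\n",
           out_channel, group, first, last);
  }
  // With groups == 1 (filter_input_depth == input_depth) this reduces to the
  // old dense loop, which is why the MatchingDim(input, 3, filter, 3) check
  // was replaced by Dims(3) plus a divisibility DCHECK.
}
```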
diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h index 1a469fa9..42920d16 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h @@ -34,12 +34,13 @@ inline void FullyConnected( const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int filter_dim_count = filter_shape.DimensionsCount(); - const int batches = output_shape.Dims(0); - const int output_depth = output_shape.Dims(1); + const int output_dim_count = output_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = output_shape.Dims(output_dim_count - 1); TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2)); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -62,11 +63,12 @@ inline void FullyConnected( } } +template <typename AccumScalar> inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const int16_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, - const int64_t* bias_data, const RuntimeShape& output_shape, + const AccumScalar* bias_data, const RuntimeShape& output_shape, int16_t* output_data) { const int32_t filter_offset = params.weights_offset; const int32_t output_multiplier = params.output_multiplier; @@ -85,7 +87,7 @@ inline void FullyConnected( const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { for (int out_c = 0; out_c < output_depth; ++out_c) { - int64_t acc = 0; + AccumScalar acc = 0; for (int d = 0; d < accum_depth; ++d) { int32_t input_val = input_data[b * accum_depth + d]; int32_t filter_val = filter_data[out_c * accum_depth + d]; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h index 284c0f21..3397f869 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h @@ -119,15 +119,16 @@ inline void TransposeConv( } } -// int16_t input (zero_point=0), int8_t filter, int64 accumulator +// int16_t input (zero_point=0), int8_t filter, int32 or int64 accumulator +template <typename Scalar> inline void TransposeConv( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int16_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, - const std::int64_t* bias_data, const RuntimeShape& output_shape, + const Scalar* bias_data, const RuntimeShape& output_shape, int16_t* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data, - std::int64_t* scratch_buffer) { + Scalar* scratch_buffer) { const int
stride_width = params.stride_width; const int stride_height = params.stride_height; const int pad_width = params.padding_values.width; @@ -157,7 +158,7 @@ inline void TransposeConv( const int num_elements = output_shape.FlatSize(); // We need to initialize scratch_buffer to all 0s, as we apply the same // 'scatter' based trick as in float version. - memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t)); + memset(scratch_buffer, 0, num_elements * sizeof(Scalar)); // Loop through input elements one at a time. for (int batch = 0; batch < batches; ++batch) { @@ -198,8 +199,8 @@ inline void TransposeConv( for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y, - out_x, out_channel)]; + Scalar acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x, + out_channel)]; if (bias_data) { acc += bias_data[out_channel]; } diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h new file mode 100644 index 00000000..17b113eb --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h @@ -0,0 +1,422 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ + +#include +#include +#include + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/concatenation.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const float* input_data, const RuntimeShape& unextended_prev_activ_shape, + const float* prev_activ_data, const RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& unextended_bias_shape, + const float* bias_data, const RuntimeShape& unextended_prev_state_shape, + const float* prev_state_data, + const RuntimeShape& unextended_output_state_shape, float* output_state_data, + const RuntimeShape& unextended_output_activ_shape, float* output_activ_data, + const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data, + const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + + const int weights_dim_count = weights_shape.DimensionsCount(); + const int batches = + MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0, + output_state_shape, 0, output_activ_shape, 0); + const int height = + MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1, + output_state_shape, 1, output_activ_shape, 1); + const int width = + MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2, + output_state_shape, 2, output_activ_shape, 2); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); + const int total_input_depth = prev_activ_depth + input_depth; + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + const int intern_activ_depth = + 
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + + // Concatenate prev_activ and input data together + float const* concat_input_arrays_data[2] = {input_data, prev_activ_data}; + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, concat_temp_data); + + // Fully connected + tflite::FullyConnectedParams fc_params; + fc_params.float_activation_min = std::numeric_limits<float>::lowest(); + fc_params.float_activation_max = std::numeric_limits<float>::max(); + FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, + weights_data, bias_shape, bias_data, activ_temp_shape, + activ_temp_data); + + // Memory state update (the LSTM "guts") + for (int b = 0; b < batches; ++b) { + for (int w = 0; w < width; ++w) { + for (int h = 0; h < height; ++h) { + for (int c = 0; c < output_depth; ++c) { + const float input_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 0 * output_depth + c)])); + const float new_input = std::tanh(activ_temp_data[Offset( + activ_temp_shape, b, h, w, 1 * output_depth + c)]); + const float forget_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 2 * output_depth + c)])); + const float output_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 3 * output_depth + c)])); + const float new_state = + input_gate * new_input + + forget_gate * + prev_state_data[Offset(prev_state_shape, b, h, w, c)]; + output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state; + output_activ_data[Offset(output_activ_shape, b, h, w, c)] = + output_gate * std::tanh(new_state); + } + } + } + } +} + +// Quantized LSTM cell implementation. +// The quantization of the input, output arrays is as follows: +// - The input activations are quantized as uint8 on the interval +// [-1, 127/128]. +// The rationale for that is that it is the natural interval for output +// activations (see next point) and these need to be concatenated together. +// We could accommodate different ranges by re-scaling, but we empirically +// found that setting the input activations range to be [-1, 127/128] in the +// first place, removing the need for re-scaling, greatly improves accuracy. +// - The output activations are quantized as uint8 on the interval +// [-1, 127/128]. +// The rationale for that is that the definition of a LSTM cell makes them +// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128] +// makes for simpler, more accurate fixed-point arithmetic. +// - The output-at-previous-timestep state array is obviously quantized as +// the output activations. +// - The internal LSTM memory (not the output-at-previous-timestep, the other +// internal state array) is int16-quantized and may use any power-of-two, +// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call +// StateIntegerBits below, see the below discussion of that template +// parameter ("The StateIntegerBits template parameter").
+// - The output of the internal fully-connected node is int16-quantized +// on the interval [-8, 8 * 32767/32768], the rationale for which is +// explained just below ("Why [-8, 8] for fully-connected output?"). +// +// +// === The StateIntegerBits template parameter === +// +// The StateIntegerBits template parameter controls the fixed-point format used +// to represent the internal memory of the LSTM cell (not the +// output-at-previous-timestep, the other internal state array). It's currently +// a template parameter so that the model can control that. The most typical +// value for StateIntegerBits is 4. Other plausible values are anywhere between +// 3 and 5. We might eventually standardize on a single supported value, e.g. 4, +// and drop that template parameter. The reason why it can't be a runtime +// parameter is that this controls the fixed-point format used, i.e. we need to +// generate actually different code based on it. In particular, we generate code +// for a fixed-point tanh() implementation for that format, which internally +// uses a fixed-point exp() implementation, which internally uses a +// barrel-shifter with a number of steps that depends on StateIntegerBits. +// Another consequence of that is that a higher value of StateIntegerBits +// results in a more expensive implementation (more barrel shifter steps +// needed). +// +// +// === Why [-8, 8] for fully-connected output? === +// +// This array is only fed to Logistic and Tanh functions, for which +// the quantized implementation will want to use fixed-point arithmetic, +// requiring a power-of-two representation interval. Thus, we should right +// away quantize this array to a power-of-two interval; otherwise, +// implementation will need to rescale that, losing any benefit that a tighter +// representation interval might otherwise yield, while introducing some +// numerical error and computational overhead. +// +// Now, Logistic and Tanh +// are nearly constant (nearly equal to their horizontal asymptotes) +// outside of a small bounded interval around 0: +// +// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4 +// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7 +// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14 +// +// From this, we see that clamping to [-4, 4] would be too inaccurate +// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision) +// while clamping to [-16, 16] would make no difference even in float32. +// However, for a fixed-point implementation in 16-bit integers, using 5 +// integer bits to represent the [-16, 16] range would leave only 11 +// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive +// representable values. Notice that this is higher than the +// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic. +// Using [-8, 8] thus seems like the better compromise overall, enjoying +// an increment of 2.4e-4 between representable values and a worst-case +// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with +// [-16, 16]. +// +// Moreover, all other things being equal, it is nice to choose the narrower +// representation range, as that makes the implementation of fixed-point +// math functions a little cheaper (each integer bit requires an additional +// barrel-shifter step in the implementation of exp(-x)). That is further +// reason to prefer [-8, 8] over [-16, 16].
The choice of [-16, 16] would make +// sense for 32-bit float or 32-bit fixed-point quantization, but we are +// aiming for 16-bit fixed-point quantization of these internal nodes here. +// +template <int StateIntegerBits> +inline void LstmCell(const LstmCellParams& params, + const RuntimeShape& unextended_input_shape, + const uint8_t* input_data_uint8, + const RuntimeShape& unextended_prev_activ_shape, + const uint8_t* prev_activ_data_uint8, + const RuntimeShape& weights_shape, + const uint8_t* weights_data_uint8, + const RuntimeShape& unextended_bias_shape, + const int32_t* bias_data_int32, + const RuntimeShape& unextended_prev_state_shape, + const int16_t* prev_state_data_int16, + const RuntimeShape& unextended_output_state_shape, + int16_t* output_state_data_int16, + const RuntimeShape& unextended_output_activ_shape, + uint8_t* output_activ_data_uint8, + const RuntimeShape& unextended_concat_temp_shape, + uint8_t* concat_temp_data_uint8, + const RuntimeShape& unextended_activ_temp_shape, + int16_t* activ_temp_data_int16, void* gemmlowp_context) { + (void)gemmlowp_context; // only used in optimized code. + int32_t weights_zero_point = params.weights_zero_point; + int32_t accum_multiplier = params.accum_multiplier; + int accum_shift = params.accum_shift; + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + + // Gather dimensions information, and perform consistency checks.
+ const int weights_dim_count = weights_shape.DimensionsCount(); + const int outer_size = MatchingFlatSizeSkipDim( + input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape, + output_activ_shape); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); + const int total_input_depth = prev_activ_depth + input_depth; + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + const int intern_activ_depth = + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3); + const int fc_output_depth = + MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3); + const int fc_accum_depth = total_input_depth; + TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); + + // Depth-concatenate prev_activ and input data together. + uint8_t const* concat_input_arrays_data[2] = {input_data_uint8, + prev_activ_data_uint8}; + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, + concat_temp_data_uint8); + + // Implementation of the fully connected node inside the LSTM cell. + // The operands are 8-bit integers, the accumulators are internally 32bit + // integers, and the output is 16-bit fixed-point with 3 integer bits so + // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that + // is explained in the function comment above. + for (int b = 0; b < fc_batches; ++b) { + for (int out_c = 0; out_c < fc_output_depth; ++out_c) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32_t accum = bias_data_int32[out_c]; + // Accumulation loop. + for (int d = 0; d < fc_accum_depth; ++d) { + int16_t input_val = + concat_temp_data_uint8[b * fc_accum_depth + d] - 128; + int16_t weights_val = + weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point; + accum += input_val * weights_val; + } + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, using 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + accum = + MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift); + // Saturate, cast to int16, and store to the temporary activations array. + accum = std::max(-32768, std::min(32767, accum)); + activ_temp_data_int16[out_c + fc_output_depth * b] = accum; + } + } + + // Rest of the LSTM cell: tanh and logistic math functions, and some adds + // and muls, all done in 16-bit fixed-point. + for (int b = 0; b < outer_size; ++b) { + for (int c = 0; c < output_depth; ++c) { + // Define the fixed-point data types that we will use here. All use + // int16 as the underlying integer type i.e. all are 16-bit fixed-point. + // They only differ by the number of integral vs. fractional bits, + // determining the range of values that they can represent. 
+ // + // F0 uses 0 integer bits, range [-1, 1]. + // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; + // F3 uses 3 integer bits, range [-8, 8]. + // This is the range of the previous fully-connected node's output, + // which is our input here. + using F3 = gemmlowp::FixedPoint<std::int16_t, 3>; + // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits, + // 2^StateIntegerBits]. It's used to represent the internal state, whose + // number of integer bits is currently dictated by the model. See comment + // on the StateIntegerBits template parameter above. + using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>; + // Implementation of input gate, using fixed-point logistic function. + F3 input_gate_input = F3::FromRaw( + activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]); + F0 input_gate_output = gemmlowp::logistic(input_gate_input); + // Implementation of input modulation gate, using fixed-point tanh + // function. + F3 input_modulation_gate_input = F3::FromRaw( + activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]); + F0 input_modulation_gate_output = + gemmlowp::tanh(input_modulation_gate_input); + // Implementation of forget gate, using fixed-point logistic function. + F3 forget_gate_input = F3::FromRaw( + activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]); + F0 forget_gate_output = gemmlowp::logistic(forget_gate_input); + // Implementation of output gate, using fixed-point logistic function. + F3 output_gate_input = F3::FromRaw( + activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]); + F0 output_gate_output = gemmlowp::logistic(output_gate_input); + // Implementation of internal multiplication nodes, still in fixed-point. + F0 input_times_input_modulation = + input_gate_output * input_modulation_gate_output; + FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]); + FS prev_state_times_forget_state = forget_gate_output * prev_state; + // Implementation of internal addition node, saturating. + FS new_state = gemmlowp::SaturatingAdd( + gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation), + prev_state_times_forget_state); + // Implementation of last internal Tanh node, still in fixed-point. + // Since a Tanh fixed-point implementation is specialized for a given + // number of integer bits, and each specialization can have a substantial + // code size, and we already used above a Tanh on an input with 3 integer + // bits, and per the table in the above function comment there is no + // significant accuracy to be lost by clamping to [-8, +8] for a + // 3-integer-bits representation, let us just do that. This helps people + // porting this to targets where code footprint must be minimized. + F3 new_state_f3 = gemmlowp::Rescale<3>(new_state); + F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3); + // Store the new internal state back to memory, as 16-bit integers. + // Note: here we store the original value with StateIntegerBits, not + // the rescaled 3-integer-bits value fed to tanh. + output_state_data_int16[b * output_depth + c] = new_state.raw(); + // Down-scale the output activations to 8-bit integers, saturating, + // and store back to memory.
+ int16_t rescaled_output_activ = + gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8); + int16_t clamped_output_activ = std::max<int16_t>( + -128, std::min<int16_t>(127, rescaled_output_activ)); + output_activ_data_uint8[b * output_depth + c] = + 128 + clamped_output_activ; + } + } +} + +} // namespace reference_ops +} // namespace tflite +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 4cc51cb4..4684be64 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -227,6 +227,41 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( } } +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result) { + const int kBlockSize = 16; + TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0); + for (int batch = 0; batch < n_batch; ++batch) { + const int8_t* matrix_ptr = matrix; + for (int row = 0; row < m_rows; ++row) { + int32_t dot_prod = 0; + const int8_t* vector_in_batch = vector + batch * m_cols; + for (int i = segments[row]; i < segments[row + 1]; ++i) { + const int block_start_index = indices[i] * kBlockSize; + const int8_t* vector_block_in_batch_ptr = + vector_in_batch + block_start_index; + for (int c = 0; c < kBlockSize; c++) { + dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++; + dot_prod += *matrix_ptr++ * input_offset; + } + } + const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0; + dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value, + output_multiplier, output_shift); + dot_prod += output_offset; + result[batch * m_rows + row] = + static_cast<int8_t>(ActivationFunctionWithMinMax( + dot_prod, output_activation_min, output_activation_max)); + } + } +} + void PortableSparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h new file mode 100644 index 00000000..0416db09 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -0,0 +1,333 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ + +#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h" + +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif + +namespace tflite { +namespace tensor_utils { + +// Check if all entries of a vector are zero for float. +bool IsZeroVector(const float* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +// Check if all entries of a vector are zero for int8_t. +bool IsZeroVector(const int8_t* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min, float* max, + float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max, + scaling_factor); +} + +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value, + max_value, scaling_factor); +} + +void AsymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* scaling_factor, + int32_t* offset) { + PortableAsymmetricQuantizeFloats(values, size, quantized_values, + scaling_factor, offset); +} + +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + n_batch, result); +} + +void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t* __restrict__ vector, + const float* scaling_factors, + int n_batch, + float* __restrict__ result) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + scaling_factors, n_batch, result); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, scratch, row_sums, compute_row_sums, + context); +} + +void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t* __restrict__ vector, + const float* scaling_factors, + int n_batch, int32_t* scratch, + float* __restrict__ result, + CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + scaling_factors, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( + matrix, segments, 
indices, m_rows, m_cols, vector, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vector, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + + int8_t* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch, + input_offset, output_multiplier, output_shift, output_offset, + output_activation_min, output_activation_max, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, + const int m_cols, const int8_t* __restrict__ vectors, + const float* scaling_factors, int n_batch, float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, + result); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int16_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int8_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, + int32_t n_row, int32_t n_col, + int32_t* output) { + PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output); +} + +void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp) { + PortableMatrixBatchVectorMultiply( + input, input_zeropoint, input_to_gate_weights, + input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch, + n_input, n_cell, gate_output, gate_output_zp); +} + +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + 
PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + +void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, + const int32_t* bias, int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, int32_t variance_limit, + int n_batch, int n_input, int16_t* output) { + PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a, + layer_norm_scale_b, variance_limit, n_batch, n_input, + output); +} + +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output) { + PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a, + layer_norm_scale_b, bias, n_batch, n_input, + output); +} + +void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoid(input, n_batch, n_input, output); +} + +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoidFloat(input, n_batch, n_input, output); +} + +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + PortableApplyTanh(integer_bits, input, n_batch, n_input, output); +} + +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output) { + PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output); +} + +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int16_t* output) { + PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output); +} + +void CwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output) { + PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input, + output_zp, output); +} + +void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int16_t* output) { + PortableCwiseAdd(input_1, input_2, n_batch, n_input, output); +} + +void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size) { + return PortableVectorVectorDotProduct(vector1, vector2, v_size); +} + +void BatchVectorBatchVectorDotProduct(const int16_t* vector1, + const int16_t* vector2, int v_size, + int n_batch, int32_t* result) { + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result); +} + +void Sub1Vector(const float* vector, int v_size, float* result) { + PortableSub1Vector(vector, v_size, result); +} + +void 
Sub1Vector(const int16_t* vector, int v_size, int16_t* result) { + PortableSub1Vector(vector, v_size, result); +} + +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result) { + PortableVectorScalarMultiply(vector, v_size, scale, result); +} + +void ReductionSumVector(const float* input_vector, float* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch) { + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output) { + PortableTwoGateSaturatingAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + +} // namespace tensor_utils +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 1e411e16..6c404d5e 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -87,6 +87,15 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate( int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, float* __restrict__ result); +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result); + void PortableSparseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h index 3fa43ce9..d0ebc95a 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h @@ -273,6 +273,9 @@ void 
BroadcastQuantSubSlow(const ArithmeticParams& params, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T"); + TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N); + TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N); + TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N); NdArrayDesc<N> desc1; NdArrayDesc<N> desc2; NdArrayDesc<N> output_desc; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/runtime_shape.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/runtime_shape.h index 13693643..c2678b57 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/runtime_shape.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/runtime_shape.h @@ -27,6 +27,11 @@ class RuntimeShape { public: RuntimeShape& operator=(RuntimeShape const&) = delete; + // RuntimeShape in TFLM supports up to 5 dimensions. + // The name kMaxSmallSize comes from the same file in the upstream + // tensorflow lite repo and needs to be kept the same for max reuse. + static constexpr int kMaxSmallSize = 5; + RuntimeShape() : size_(0) {} explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {} @@ -104,11 +109,9 @@ class RuntimeShape { sizeof(int32_t) * shape.DimensionsCount()); } - // A maximum of 4 dimensions are supported on TFLM. - static constexpr int kMaxSize = 5; int32_t size_; union { - int32_t dims_[kMaxSize]; + int32_t dims_[kMaxSmallSize]; }; }; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h index 77644bc0..c44ba48e 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/types.h @@ -974,11 +974,11 @@ struct StridedSliceParams { int8_t strides_count; int32_t strides[5]; - int16_t begin_mask; - int16_t ellipsis_mask; - int16_t end_mask; - int16_t new_axis_mask; - int16_t shrink_axis_mask; + uint16_t begin_mask; + uint16_t ellipsis_mask; + uint16_t end_mask; + uint16_t new_axis_mask; + uint16_t shrink_axis_mask; }; struct TanhParams { diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc index 75529296..10b37ed3 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc +++ b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/context_util.h" #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -466,10 +467,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1); const int d2 = i >= dims2 ?
1 : SizeOfDimension(input2, dims2 - i - 1); if (!(d1 == d2 || d1 == 1 || d2 == 1)) { - context->ReportError(context, - "Given shapes, %s and %s, are not broadcastable.", - GetShapeDebugString(input1->dims).c_str(), - GetShapeDebugString(input2->dims).c_str()); + TF_LITE_KERNEL_LOG(context, + "Given shapes, %s and %s, are not broadcastable.", + GetShapeDebugString(input1->dims).c_str(), + GetShapeDebugString(input2->dims).c_str()); return kTfLiteError; } @@ -504,11 +505,11 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, if (min_value == 0) max_value = 0; if (!(d1 == 1 || d1 == max_value) || !(d2 == 1 || d2 == max_value) || !(d3 == 1 || d3 == max_value)) { - context->ReportError( - context, "Given shapes, %s, %s and %s, are not broadcastable.", - GetShapeDebugString(input1->dims).c_str(), - GetShapeDebugString(input2->dims).c_str(), - GetShapeDebugString(input3->dims).c_str()); + TF_LITE_KERNEL_LOG(context, + "Given shapes, %s, %s and %s, are not broadcastable.", + GetShapeDebugString(input1->dims).c_str(), + GetShapeDebugString(input2->dims).c_str(), + GetShapeDebugString(input3->dims).c_str()); return kTfLiteError; } shape->data[out_dims - i - 1] = max_value; @@ -529,6 +530,9 @@ int TfLiteTypeGetSize(TfLiteType type) { return 1; case kTfLiteBool: return sizeof(bool); + case kTfLiteUInt16: + static_assert(sizeof(uint16_t) == 2, ""); + return 2; case kTfLiteInt16: static_assert(sizeof(int16_t) == 2, ""); return 2; @@ -575,4 +579,15 @@ bool IsMobilePlatform() { return false; } +bool HasUnspecifiedDimension(const TfLiteTensor* tensor) { +#ifndef TF_LITE_STATIC_MEMORY + if (tensor->dims_signature) { + for (int i : TfLiteIntArrayView(tensor->dims_signature)) { + if (i == -1) return true; + } + } +#endif // TF_LITE_STATIC_MEMORY + return false; +} + } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h index d082e7b0..ed3a566f 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h @@ -308,12 +308,15 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, const TfLiteTensor* input3, TfLiteIntArray** output_shape); -// Return the size of given type in bytes. Return 0 in in case of string. +// Return the size of the given type in bytes. Return 0 in case of string. int TfLiteTypeGetSize(TfLiteType type); // Whether the current platform is mobile (Android or iOS). bool IsMobilePlatform(); +// Returns whether there is an unspecified dimension in the tensor's dim signature.
+bool HasUnspecifiedDimension(const TfLiteTensor* tensor); + } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc b/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc index 8777cd28..6fa1b31b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc @@ -29,8 +29,12 @@ AllOpsResolver::AllOpsResolver() { AddAssignVariable(); AddAveragePool2D(); AddBatchToSpaceNd(); + AddBroadcastArgs(); + AddBroadcastTo(); AddCallOnce(); + AddCast(); AddCeil(); + AddCircularBuffer(); AddConcatenation(); AddConv2D(); AddCos(); @@ -49,9 +53,12 @@ AllOpsResolver::AllOpsResolver() { AddFloorDiv(); AddFloorMod(); AddFullyConnected(); + AddGather(); + AddGatherNd(); AddGreater(); AddGreaterEqual(); AddHardSwish(); + AddIf(); AddL2Normalization(); AddL2Pool2D(); AddLeakyRelu(); @@ -66,6 +73,7 @@ AllOpsResolver::AllOpsResolver() { AddMaximum(); AddMean(); AddMinimum(); + AddMirrorPad(); AddMul(); AddNeg(); AddNotEqual(); @@ -85,6 +93,7 @@ AllOpsResolver::AllOpsResolver() { AddRsqrt(); AddShape(); AddSin(); + AddSlice(); AddSoftmax(); AddSpaceToBatchNd(); AddSpaceToDepth(); @@ -101,6 +110,8 @@ AllOpsResolver::AllOpsResolver() { AddTransposeConv(); AddUnpack(); AddVarHandle(); + AddWhile(); + AddZerosLike(); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h new file mode 100644 index 00000000..b92d6b2d --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h @@ -0,0 +1,100 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_IBUFFER_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_IBUFFER_ALLOCATOR_H_ + +#include +#include + +#include "tensorflow/lite/c/c_api_types.h" + +namespace tflite { +// Interface classes that the TFLM framework relies on to get buffers it needs. +// There are two types of buffers that the TFLM framework requires: persistent +// and non-persistent. Persistent buffers, once allocated, are never freed by +// the TFLM framework. Non-persistent buffers can be allocated and deallocated by +// the TFLM framework. This file defines two interface classes that the TFLM +// framework will rely on to manage these buffers. + +// Interface class for managing persistent buffers. +class IPersistentBufferAllocator { + public: + IPersistentBufferAllocator() {} + virtual ~IPersistentBufferAllocator() {} + + // Allocates persistent memory. The persistent buffer is never freed. + virtual uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) = 0; + + // Returns the size of all persistent allocations in bytes.
+  virtual size_t GetPersistentUsedBytes() const = 0;
+};
+
+// Interface class for managing non-persistent buffers.
+// The default non-persistent buffers are temp buffers that are not resizable.
+// Support of at least one resizable buffer is required.
+class INonPersistentBufferAllocator {
+ public:
+  INonPersistentBufferAllocator() {}
+  virtual ~INonPersistentBufferAllocator() {}
+
+  // Allocates a temporary buffer. This buffer is not resizable.
+  virtual uint8_t* AllocateTemp(size_t size, size_t alignment) = 0;
+
+  // Signals that a temporary buffer is no longer needed.
+  virtual void DeallocateTemp(uint8_t* buf) = 0;
+
+  // Returns true if all temporary buffers are already deallocated.
+  virtual bool IsAllTempDeallocated() = 0;
+
+  // Signals that all temporary allocations can be reclaimed. TFLM calls this
+  // API when it knows that all temporary buffers that it requested have been
+  // deallocated. The goal of this API is to let implementations of
+  // INonPersistentBufferAllocator reuse buffers with reasonable complexity.
+  virtual TfLiteStatus ResetTempAllocations() = 0;
+
+  // Returns a buffer that is resizable via ResizeBuffer().
+  virtual uint8_t* AllocateResizableBuffer(size_t size, size_t alignment) = 0;
+
+  // Resizes a buffer that was previously returned by
+  // AllocateResizableBuffer().
+  virtual TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size,
+                                    size_t alignment) = 0;
+
+  // Frees up the memory occupied by the resizable buffer.
+  virtual TfLiteStatus DeallocateResizableBuffer(uint8_t* resizable_buf) = 0;
+
+  // Returns a pointer to the start of the overlay memory, which is used for
+  // activation tensors and scratch buffers by kernels at the Invoke stage.
+  virtual uint8_t* GetOverlayMemoryAddress() const = 0;
+
+  // Reserves the size of the overlay memory. This region is reserved for the
+  // kernels at the Invoke stage. It is referred to as the overlay because,
+  // before the Invoke stage, the same memory can be used for temp buffers.
+  // The layout of the memory is planned by the memory planner separately at
+  // the Invoke stage.
+  virtual TfLiteStatus ReserveNonPersistentOverlayMemory(size_t size,
+                                                         size_t alignment) = 0;
+
+  // Returns the size of the non-persistent buffer in use.
+  virtual size_t GetNonPersistentUsedBytes() const = 0;
+
+  // Returns the number of bytes available with a given alignment. This number
+  // takes into account any temporary allocations.
+  virtual size_t GetAvailableMemory(size_t alignment) const = 0;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_IBUFFER_ALLOCATOR_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.cc
new file mode 100644
index 00000000..0f75d286
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.cc
@@ -0,0 +1,165 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.h"
+
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+
+NonPersistentArenaBufferAllocator::NonPersistentArenaBufferAllocator(
+    uint8_t* buffer, size_t buffer_size)
+    : buffer_head_(buffer),
+      buffer_tail_(buffer + buffer_size),
+      head_temp_(buffer),
+      next_temp_(buffer) {}
+
+NonPersistentArenaBufferAllocator::~NonPersistentArenaBufferAllocator() {}
+
+// Allocates a temporary buffer. This buffer is not resizable.
+uint8_t* NonPersistentArenaBufferAllocator::AllocateTemp(size_t size,
+                                                         size_t alignment) {
+  uint8_t* const aligned_result = AlignPointerUp(next_temp_, alignment);
+  const size_t available_memory = buffer_tail_ - aligned_result;
+  if (available_memory < size) {
+    MicroPrintf(
+        "Failed to allocate temp memory. Requested: %u, "
+        "available %u, missing: %u",
+        size, available_memory, size - available_memory);
+    return nullptr;
+  }
+  next_temp_ = aligned_result + size;
+  temp_buffer_ptr_check_sum_ ^= reinterpret_cast<intptr_t>(aligned_result);
+  temp_buffer_count_++;
+  return aligned_result;
+}
+
+// Signals that a temporary buffer is no longer needed.
+void NonPersistentArenaBufferAllocator::DeallocateTemp(uint8_t* temp_buf) {
+  temp_buffer_ptr_check_sum_ ^= reinterpret_cast<intptr_t>(temp_buf);
+  temp_buffer_count_--;
+}
+
+// Returns true if all temporary buffers are already deallocated.
+bool NonPersistentArenaBufferAllocator::IsAllTempDeallocated() {
+  if (temp_buffer_count_ != 0 || temp_buffer_ptr_check_sum_ != 0) {
+    MicroPrintf(
+        "Number of allocated temp buffers: %d. Checksum passing status: %d",
+        temp_buffer_count_, !temp_buffer_ptr_check_sum_);
+    return false;
+  }
+  return true;
+}
+
+// Signals that all temporary allocations can be reclaimed. TFLM calls this
+// API when it knows that all temporary buffers that it requested have been
+// deallocated. The goal of this API is to let implementations of
+// INonPersistentBufferAllocator reuse buffers with reasonable complexity.
+TfLiteStatus NonPersistentArenaBufferAllocator::ResetTempAllocations() {
+  if (!IsAllTempDeallocated()) {
+    MicroPrintf(
+        "All temp buffers must be freed before calling ResetTempAllocations()");
+    return kTfLiteError;
+  }
+  next_temp_ = head_temp_;
+  return kTfLiteOk;
+}
+
+// Returns a buffer that is resizable via ResizeBuffer().
+uint8_t* NonPersistentArenaBufferAllocator::AllocateResizableBuffer(
+    size_t size, size_t alignment) {
+  // Only supports one resizable buffer, which starts at the buffer head.
+  uint8_t* expected_resizable_buf = AlignPointerUp(buffer_head_, alignment);
+
+  if (head_temp_ != expected_resizable_buf) {
+    MicroPrintf(
+        "Cannot allocate a new resizable buffer when one is already allocated");
+    return nullptr;
+  }
+
+  if (ResizeBuffer(expected_resizable_buf, size, alignment) == kTfLiteOk) {
+    return expected_resizable_buf;
+  }
+  return nullptr;
+}
+
+// Resizes a buffer that was previously returned by AllocateResizableBuffer().
+// Note that ResizeBuffer(old_resizable_buf, 0, 1) effectively deallocates
+// a previously allocated resizable buffer.
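+//
+// Illustrative lifecycle of the single resizable buffer (a sketch, not part
+// of the upstream sources; `arena` is a hypothetical caller-owned byte
+// array):
+//
+//   NonPersistentArenaBufferAllocator allocator(arena, sizeof(arena));
+//   uint8_t* buf = allocator.AllocateResizableBuffer(512, 16);
+//   allocator.ResizeBuffer(buf, 256, 16);      // shrink, keeping the buffer
+//   allocator.DeallocateResizableBuffer(buf);  // same as ResizeBuffer(buf, 0, 1)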
+TfLiteStatus NonPersistentArenaBufferAllocator::ResizeBuffer( + uint8_t* resizable_buf, size_t size, size_t alignment) { + // Only supports one resizable buffer, which starts at the buffer head. + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + if (resizable_buf != expect_resizable_buf) { + MicroPrintf("Internal error: buffer is not resizable"); + return kTfLiteError; + } + if (head_temp_ != next_temp_) { + MicroPrintf("ResetTempAllocations() is not called before ResizeBuffer()."); + return kTfLiteError; + } + + const size_t available_memory = buffer_tail_ - expect_resizable_buf; + if (available_memory < size) { + MicroPrintf( + "Failed to resize buffer. Requested: %u, available %u, missing: %u", + size, available_memory, size - available_memory); + return kTfLiteError; + } + head_temp_ = expect_resizable_buf + size; + next_temp_ = head_temp_; + + return kTfLiteOk; +} + +// Frees up the memory occupied by the resizable buffer. +TfLiteStatus NonPersistentArenaBufferAllocator::DeallocateResizableBuffer( + uint8_t* resizable_buf) { + return ResizeBuffer(resizable_buf, 0, 1); +} + +// Returns a pointer pointing to the start of the overlay memory, which is +// used for activation tensors and scratch buffers by kernels at Invoke stage. +uint8_t* NonPersistentArenaBufferAllocator::GetOverlayMemoryAddress() const { + return buffer_head_; +} + +// Reserves the size of the overlay memory. This overlay is reserved for the +// kernels at Invoke stage. This is referred to as the overlay because before +// Invoket state, the same memory can be used for temp buffers. The layout of +// the memory is planned by the memory planner separately at Invoke stage. +TfLiteStatus +NonPersistentArenaBufferAllocator::ReserveNonPersistentOverlayMemory( + size_t size, size_t alignment) { + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + return ResizeBuffer(expect_resizable_buf, size, alignment); +} + +// Returns the size of non-persistent buffer in use. +size_t NonPersistentArenaBufferAllocator::GetNonPersistentUsedBytes() const { + return (next_temp_ - buffer_head_); +} + +// Returns the number of bytes available with a given alignment. This number +// takes in account any temporary allocations. +size_t NonPersistentArenaBufferAllocator::GetAvailableMemory( + size_t alignment) const { + uint8_t* const aligned_temp = AlignPointerUp(next_temp_, alignment); + uint8_t* const aligned_tail = AlignPointerDown(buffer_tail_, alignment); + return aligned_tail - aligned_temp; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.h new file mode 100644 index 00000000..aad41d3f --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator.h @@ -0,0 +1,104 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_NON_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_NON_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_ + +#include +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h" +#include "tensorflow/lite/micro/compatibility.h" + +namespace tflite { + +// Implement INonPersistentBufferAllocator on an arena that is dedicated for +// non-persistent buffers. +class NonPersistentArenaBufferAllocator : public INonPersistentBufferAllocator { + public: + NonPersistentArenaBufferAllocator(uint8_t* buffer, size_t buffer_size); + virtual ~NonPersistentArenaBufferAllocator(); + + // Allocates a temporary buffer. This buffer is not resizable. + uint8_t* AllocateTemp(size_t size, size_t alignment) override; + + // Signals that a temporary buffer is no longer needed. + void DeallocateTemp(uint8_t* buf) override; + + // Returns true if all temporary buffers are already deallocated. + bool IsAllTempDeallocated() override; + + // Signals that all temporary allocations can be reclaimed. TFLM calls this + // API when it knows that all temporary buffers that it requested has been + // deallocated. + TfLiteStatus ResetTempAllocations() override; + + // Returns a buffer that is resizable viable ResizeBuffer(). + uint8_t* AllocateResizableBuffer(size_t size, size_t alignment) override; + + // Resizes a buffer that is previously returned by the + // AllocateResizableBuffer. + TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size, + size_t alignment) override; + + // Frees up the memory occupied by the resizable buffer. + TfLiteStatus DeallocateResizableBuffer(uint8_t* resizable_buf) override; + + // Returns a pointer pointing to the start of the overlay memory, which is + // used for activation tensors and scratch buffers by kernels at Invoke stage. + uint8_t* GetOverlayMemoryAddress() const override; + + // Reserves the size of the overlay memory. This overlay is reserved for the + // kernels at Invoke stage. This is referred to as the overlay because before + // Invoket state, the same memory can be used for temp buffers. The layout of + // the memory is planned by the memory planner separately at Invoke stage. + TfLiteStatus ReserveNonPersistentOverlayMemory(size_t size, + size_t alignment) override; + + // Returns the size of non-persistent buffer in use. + size_t GetNonPersistentUsedBytes() const override; + + // Returns the number of bytes available with a given alignment. This number + // takes in account any temporary allocations. + size_t GetAvailableMemory(size_t alignment) const override; + + TF_LITE_REMOVE_VIRTUAL_DELETE + + private: + // The memory arena that this allocator manages. + uint8_t* const buffer_head_; + uint8_t* const buffer_tail_; + + // The whole region is split into two parts: + // buffer_head_ to head_temp_ - 1 belongs to the only resizable buffer. + // head_temp_ to buffer_tail_ can be used for (non-resizable) temp buffers. + uint8_t* head_temp_; + + // next_temp_ points to the next available temp buffer allocation address and + // its range is between head_temp_ and buffer_tail_ + uint8_t* next_temp_; + + // XOR Check sum for outstanding temp buffers. 
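+  // (Each temp pointer is XOR-ed into the checksum by AllocateTemp() and
+  // XOR-ed out again by DeallocateTemp(), so matched pairs cancel:
+  // p1 ^ p2 ^ p1 ^ p2 == 0.)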
+ // If all temp buffers are deallocated OR no temp buffers are allocated, + // temp_buffer_ptr_check_sum_ == nullptr. + intptr_t temp_buffer_ptr_check_sum_ = 0; + // Count of outstanding temp buffers. + int temp_buffer_count_ = 0; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_NON_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.cc new file mode 100644 index 00000000..0ccc8fb1 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.cc @@ -0,0 +1,52 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.h" + +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +namespace tflite { + +PersistentArenaBufferAllocator::PersistentArenaBufferAllocator( + uint8_t* buffer, size_t buffer_size) + : buffer_head_(buffer), + buffer_tail_(buffer + buffer_size), + tail_temp_(buffer_tail_) {} + +PersistentArenaBufferAllocator::~PersistentArenaBufferAllocator() {} + +uint8_t* PersistentArenaBufferAllocator::AllocatePersistentBuffer( + size_t size, size_t alignment) { + uint8_t* const aligned_result = + AlignPointerDown(tail_temp_ - size, alignment); + if (aligned_result < buffer_head_) { +#ifndef TF_LITE_STRIP_ERROR_STRINGS + const size_t missing_memory = buffer_head_ - aligned_result; + MicroPrintf( + "Failed to allocate tail memory. Requested: %u, " + "available %u, missing: %u", + size, size - missing_memory, missing_memory); +#endif + return nullptr; + } + tail_temp_ = aligned_result; + return aligned_result; +} + +size_t PersistentArenaBufferAllocator::GetPersistentUsedBytes() const { + return buffer_tail_ - tail_temp_; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.h new file mode 100644 index 00000000..10145d72 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator.h @@ -0,0 +1,59 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_
+#define TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h"
+#include "tensorflow/lite/micro/compatibility.h"
+
+namespace tflite {
+
+// PersistentArenaBufferAllocator is an implementation of the
+// IPersistentBufferAllocator interface on an arena that is dedicated to
+// persistent buffers.
+class PersistentArenaBufferAllocator : public IPersistentBufferAllocator {
+ public:
+  PersistentArenaBufferAllocator(uint8_t* buffer, size_t buffer_size);
+  virtual ~PersistentArenaBufferAllocator();
+
+  // Allocates persistent memory. The persistent buffer is never freed.
+  // Returns nullptr if an error occurred.
+  uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) override;
+
+  // Returns the size of all persistent allocations in bytes.
+  size_t GetPersistentUsedBytes() const override;
+
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+ private:
+  // The memory arena that this allocator manages.
+  uint8_t* const buffer_head_;
+  uint8_t* const buffer_tail_;
+
+  // The whole region is split into two parts:
+  // tail_temp_ to buffer_tail_ contains allocated buffers;
+  // buffer_head_ to tail_temp_ - 1 is still-available space.
+  // So in essence, the allocated region grows downward from the top and
+  // emulates SimpleMemoryAllocator's persistent part.
+  uint8_t* tail_temp_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_PERSISTENT_ARENA_BUFFER_ALLOCATOR_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.cc
similarity index 80%
rename from code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc
rename to code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.cc
index ef30aca4..0efb6512 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/micro/recording_simple_memory_allocator.h"
+#include "tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h"
 
 #include <new>
 
@@ -39,8 +39,8 @@ RecordingSimpleMemoryAllocator* RecordingSimpleMemoryAllocator::Create(
       RecordingSimpleMemoryAllocator(error_reporter, buffer_head, buffer_size);
   uint8_t* allocator_buffer =
-      tmp.AllocateFromTail(sizeof(RecordingSimpleMemoryAllocator),
-                           alignof(RecordingSimpleMemoryAllocator));
+      tmp.AllocatePersistentBuffer(sizeof(RecordingSimpleMemoryAllocator),
+                                   alignof(RecordingSimpleMemoryAllocator));
   // Use the default copy constructor to populate internal states.
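+  // (Placement new constructs the allocator inside memory carved from its own
+  // arena via AllocatePersistentBuffer above, so no heap allocation is
+  // needed.)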
return new (allocator_buffer) RecordingSimpleMemoryAllocator(tmp); } @@ -57,11 +57,11 @@ size_t RecordingSimpleMemoryAllocator::GetAllocatedCount() const { return alloc_count_; } -TfLiteStatus RecordingSimpleMemoryAllocator::SetHeadBufferSize( - size_t size, size_t alignment) { +TfLiteStatus RecordingSimpleMemoryAllocator::ResizeBuffer( + uint8_t* resizable_buf, size_t size, size_t alignment) { const uint8_t* previous_head = head(); TfLiteStatus status = - SimpleMemoryAllocator::SetHeadBufferSize(size, alignment); + SimpleMemoryAllocator::ResizeBuffer(resizable_buf, size, alignment); if (status == kTfLiteOk) { used_bytes_ += head() - previous_head; requested_head_bytes_ = size; @@ -69,10 +69,11 @@ TfLiteStatus RecordingSimpleMemoryAllocator::SetHeadBufferSize( return status; } -uint8_t* RecordingSimpleMemoryAllocator::AllocateFromTail(size_t size, - size_t alignment) { +uint8_t* RecordingSimpleMemoryAllocator::AllocatePersistentBuffer( + size_t size, size_t alignment) { const uint8_t* previous_tail = tail(); - uint8_t* result = SimpleMemoryAllocator::AllocateFromTail(size, alignment); + uint8_t* result = + SimpleMemoryAllocator::AllocatePersistentBuffer(size, alignment); if (result != nullptr) { used_bytes_ += previous_tail - tail(); requested_tail_bytes_ += size; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h similarity index 79% rename from code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h rename to code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h index 3526716e..1abe43dd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ -#define TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ +#ifndef TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ +#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h" #include "tensorflow/lite/micro/compatibility.h" -#include "tensorflow/lite/micro/simple_memory_allocator.h" namespace tflite { @@ -47,8 +47,9 @@ class RecordingSimpleMemoryAllocator : public SimpleMemoryAllocator { // Returns the number of alloc calls from the head or tail. 
size_t GetAllocatedCount() const; - TfLiteStatus SetHeadBufferSize(size_t size, size_t alignment) override; - uint8_t* AllocateFromTail(size_t size, size_t alignment) override; + TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size, + size_t alignment) override; + uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) override; private: size_t requested_head_bytes_; @@ -61,4 +62,4 @@ class RecordingSimpleMemoryAllocator : public SimpleMemoryAllocator { } // namespace tflite -#endif // TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ +#endif // TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.cc similarity index 59% rename from code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc rename to code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.cc index 08b6789e..3e3ea4bd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.cc @@ -13,16 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/simple_memory_allocator.h" +#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h" #include #include #include +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" namespace tflite { @@ -52,7 +55,7 @@ SimpleMemoryAllocator* SimpleMemoryAllocator::Create( // Allocate enough bytes from the buffer to create a SimpleMemoryAllocator. // The new instance will use the current adjusted tail buffer from the tmp // allocator instance. - uint8_t* allocator_buffer = tmp.AllocateFromTail( + uint8_t* allocator_buffer = tmp.AllocatePersistentBuffer( sizeof(SimpleMemoryAllocator), alignof(SimpleMemoryAllocator)); // Use the default copy constructor to populate internal states. return new (allocator_buffer) SimpleMemoryAllocator(tmp); @@ -60,13 +63,37 @@ SimpleMemoryAllocator* SimpleMemoryAllocator::Create( SimpleMemoryAllocator::~SimpleMemoryAllocator() {} -TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, - size_t alignment) { - if (head_ != temp_) { +uint8_t* SimpleMemoryAllocator::AllocateResizableBuffer(size_t size, + size_t alignment) { + // Only supports one resizable buffer, which starts at the buffer head. 
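+  // (AlignPointerUp rounds buffer_head_ up to the requested alignment, so the
+  // resizable buffer always begins at the aligned head of the arena.)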
+ uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + if (ResizeBuffer(expect_resizable_buf, size, alignment) == kTfLiteOk) { + return expect_resizable_buf; + } + return nullptr; +} + +TfLiteStatus SimpleMemoryAllocator::DeallocateResizableBuffer( + uint8_t* resizable_buf) { + return ResizeBuffer(resizable_buf, 0, 1); +} + +TfLiteStatus SimpleMemoryAllocator::ReserveNonPersistentOverlayMemory( + size_t size, size_t alignment) { + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + return ResizeBuffer(expect_resizable_buf, size, alignment); +} + +TfLiteStatus SimpleMemoryAllocator::ResizeBuffer(uint8_t* resizable_buf, + size_t size, + size_t alignment) { + // Only supports one resizable buffer, which starts at the buffer head. + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + if (head_ != temp_ || resizable_buf != expect_resizable_buf) { TF_LITE_REPORT_ERROR( error_reporter_, - "Internal error: SetHeadBufferSize() needs to be called " - "after ResetTempAllocations()."); + "Internal error: either buffer is not resizable or " + "ResetTempAllocations() is not called before ResizeBuffer()."); return kTfLiteError; } @@ -75,7 +102,7 @@ TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, if (available_memory < size) { TF_LITE_REPORT_ERROR( error_reporter_, - "Failed to set head size. Requested: %u, available %u, missing: %u", + "Failed to resize buffer. Requested: %u, available %u, missing: %u", size, available_memory, size - available_memory); return kTfLiteError; } @@ -85,8 +112,8 @@ TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, return kTfLiteOk; } -uint8_t* SimpleMemoryAllocator::AllocateFromTail(size_t size, - size_t alignment) { +uint8_t* SimpleMemoryAllocator::AllocatePersistentBuffer(size_t size, + size_t alignment) { uint8_t* const aligned_result = AlignPointerDown(tail_ - size, alignment); if (aligned_result < head_) { #ifndef TF_LITE_STRIP_ERROR_STRINGS @@ -113,18 +140,47 @@ uint8_t* SimpleMemoryAllocator::AllocateTemp(size_t size, size_t alignment) { return nullptr; } temp_ = aligned_result + size; + temp_buffer_ptr_check_sum_ ^= (reinterpret_cast(aligned_result)); + temp_buffer_count_++; return aligned_result; } -void SimpleMemoryAllocator::ResetTempAllocations() { temp_ = head_; } - -uint8_t* SimpleMemoryAllocator::GetHeadBuffer() const { return buffer_head_; } - -size_t SimpleMemoryAllocator::GetHeadUsedBytes() const { - return head_ - buffer_head_; +void SimpleMemoryAllocator::DeallocateTemp(uint8_t* temp_buf) { + temp_buffer_ptr_check_sum_ ^= (reinterpret_cast(temp_buf)); + temp_buffer_count_--; } -size_t SimpleMemoryAllocator::GetTailUsedBytes() const { +bool SimpleMemoryAllocator::IsAllTempDeallocated() { + if (temp_buffer_count_ != 0 || temp_buffer_ptr_check_sum_ != 0) { + MicroPrintf( + "Number of allocated temp buffers: %d. 
Checksum passing status: %d", + temp_buffer_count_, !temp_buffer_ptr_check_sum_); + return false; + } + return true; +} + +TfLiteStatus SimpleMemoryAllocator::ResetTempAllocations() { + // TODO(b/209453859): enable error check based on IsAllTempDeallocated after + // all AllocateTemp have been paird with DeallocateTemp + if (!IsAllTempDeallocated()) { + MicroPrintf( + "All temp buffers must be freed before calling ResetTempAllocations()"); + return kTfLiteError; + } + temp_ = head_; + return kTfLiteOk; +} + +uint8_t* SimpleMemoryAllocator::GetOverlayMemoryAddress() const { + return buffer_head_; +} + +size_t SimpleMemoryAllocator::GetNonPersistentUsedBytes() const { + return std::max(head_ - buffer_head_, temp_ - buffer_head_); +} + +size_t SimpleMemoryAllocator::GetPersistentUsedBytes() const { return buffer_tail_ - tail_; } @@ -135,7 +191,7 @@ size_t SimpleMemoryAllocator::GetAvailableMemory(size_t alignment) const { } size_t SimpleMemoryAllocator::GetUsedBytes() const { - return GetBufferSize() - (tail_ - temp_); + return GetPersistentUsedBytes() + GetNonPersistentUsedBytes(); } size_t SimpleMemoryAllocator::GetBufferSize() const { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h similarity index 51% rename from code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h rename to code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h index 36ab80b3..92d0e425 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ -#define TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ +#ifndef TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_SIMPLE_MEMORY_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_SIMPLE_MEMORY_ALLOCATOR_H_ #include #include #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/micro/arena_allocator/ibuffer_allocator.h" #include "tensorflow/lite/micro/compatibility.h" namespace tflite { @@ -28,7 +29,8 @@ namespace tflite { // TODO(petewarden): This allocator never frees up or reuses any memory, even // though we have enough information about lifetimes of the tensors to do so. // This makes it pretty wasteful, so we should use a more intelligent method. -class SimpleMemoryAllocator { +class SimpleMemoryAllocator : public INonPersistentBufferAllocator, + public IPersistentBufferAllocator { public: // TODO(b/157615197): Cleanup constructors/destructor and use factory // functions. @@ -43,17 +45,33 @@ class SimpleMemoryAllocator { uint8_t* buffer_head, size_t buffer_size); - // Adjusts the head (lowest address and moving upwards) memory allocation to a - // given size. Calls to this method will also invalidate all temporary - // allocation values (it sets the location of temp space at the end of the - // head section). This call will fail if a chain of allocations through - // AllocateTemp() have not been cleaned up with a call to - // ResetTempAllocations(). 
- virtual TfLiteStatus SetHeadBufferSize(size_t size, size_t alignment); + // Resizes a buffer that is previously returned by the + // AllocateResizableBuffer. In current implementation, it Adjusts the head + // (lowest address and moving upwards) memory allocation to a given size. + // Calls to this method will also invalidate all temporary allocation values + // (it sets the location of temp space at the end of the head section). This + // call will fail if a chain of allocations through AllocateTemp() have not + // been cleaned up with a call to ResetTempAllocations(). + virtual TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size, + size_t alignment) override; - // Allocates memory starting at the tail of the arena (highest address and - // moving downwards). - virtual uint8_t* AllocateFromTail(size_t size, size_t alignment); + // Returns a buffer that is resizable viable ResizeBuffer(). Only one + // resizable buffer is currently supported. + virtual uint8_t* AllocateResizableBuffer(size_t size, + size_t alignment) override; + + // Frees up the memory occupied by the resizable buffer + virtual TfLiteStatus DeallocateResizableBuffer( + uint8_t* resizable_buf) override; + + // Reserves the non-persistent memory that is planned by the memory planner. + virtual TfLiteStatus ReserveNonPersistentOverlayMemory( + size_t size, size_t alignment) override; + + // Allocates persistent memory starting at the tail of the arena (highest + // address and moving downwards). + virtual uint8_t* AllocatePersistentBuffer(size_t size, + size_t alignment) override; // Allocates a temporary buffer from the head of the arena (lowest address and // moving upwards) but does not update the actual head allocation size or @@ -63,25 +81,34 @@ class SimpleMemoryAllocator { // calls to AllocateTemp() must end with a call to ResetTempAllocations(). If // AllocateFromHead() is called before a call to ResetTempAllocations(), it // will fail with an error message. - virtual uint8_t* AllocateTemp(size_t size, size_t alignment); + virtual uint8_t* AllocateTemp(size_t size, size_t alignment) override; + + // Signals that a temporary buffer is no longer needed. This is currently for + // book-keeping purpose and the memory region are not immediately available + // for re-use. The deallocated memory region are only reclaimed after + // ResetTempAllocations is called as it is right now. + virtual void DeallocateTemp(uint8_t* buf) override; + + // Returns true if all temporary buffers are already deallocated. + virtual bool IsAllTempDeallocated() override; // Resets a chain of temporary allocations back to the current head of the // arena (lowest address). - virtual void ResetTempAllocations(); + virtual TfLiteStatus ResetTempAllocations() override; // Returns a pointer to the buffer currently assigned to the head section. // This buffer is set by calling SetHeadSize(). - uint8_t* GetHeadBuffer() const; + uint8_t* GetOverlayMemoryAddress() const override; // Returns the size of the head section in bytes. - size_t GetHeadUsedBytes() const; + size_t GetNonPersistentUsedBytes() const override; // Returns the size of all allocations in the tail section in bytes. - size_t GetTailUsedBytes() const; + size_t GetPersistentUsedBytes() const override; // Returns the number of bytes available with a given alignment. This number // takes in account any temporary allocations. 
-  size_t GetAvailableMemory(size_t alignment) const;
+  size_t GetAvailableMemory(size_t alignment) const override;
 
   // Returns the number of used bytes in the allocator. This number takes
   // into account any temporary allocations.
@@ -105,8 +132,19 @@ class SimpleMemoryAllocator {
   uint8_t* head_;
   uint8_t* tail_;
   uint8_t* temp_;
+
+  // The combination of the checksum of outstanding temporary buffer pointers
+  // AND the count of outstanding temporary buffers provides a low-cost
+  // mechanism to audit temporary buffers' allocation and deallocation.
+  //
+  // XOR checksum for outstanding temp buffers.
+  // If all temp buffers are deallocated OR no temp buffers are allocated,
+  // temp_buffer_ptr_check_sum_ == 0.
+  intptr_t temp_buffer_ptr_check_sum_ = 0;
+  // Count of outstanding temp buffers.
+  int temp_buffer_count_ = 0;
 };
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_
+#endif  // TENSORFLOW_LITE_MICRO_ARENA_ALLOCATOR_SIMPLE_MEMORY_ALLOCATOR_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc
new file mode 100644
index 00000000..36dd062a
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc
@@ -0,0 +1,107 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/fake_micro_context.h"
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h"
+#include "tensorflow/lite/micro/micro_allocator.h"
+#include "tensorflow/lite/micro/micro_arena_constants.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+namespace {
+// Dummy static variables to allow creation of a dummy MicroAllocator.
+// All tests are guaranteed to run serially.
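+// (Serial execution lets every test share the single static arena below
+// without synchronization.)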
+static constexpr int KDummyTensorArenaSize = 256; +static uint8_t dummy_tensor_arena[KDummyTensorArenaSize]; +} // namespace + +FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, + SimpleMemoryAllocator* allocator, + MicroGraph* micro_graph) + : MicroContext( + MicroAllocator::Create(dummy_tensor_arena, KDummyTensorArenaSize, + GetMicroErrorReporter()), + nullptr, micro_graph), + tensors_(tensors), + allocator_(allocator) {} + +TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { + allocated_tensor_count_++; + return &tensors_[tensor_index]; +} + +void FakeMicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { + allocated_tensor_count_--; +} + +bool FakeMicroContext::IsAllTempTfLiteTensorDeallocated() { + return !allocated_tensor_count_; +} + +TfLiteEvalTensor* FakeMicroContext::GetEvalTensor(int tensor_index) { + TfLiteEvalTensor* eval_tensor = + reinterpret_cast(allocator_->AllocateTemp( + sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor))); + TFLITE_DCHECK(eval_tensor != nullptr); + + // In unit tests, the TfLiteTensor pointer contains the source of truth for + // buffers and values: + eval_tensor->data = tensors_[tensor_index].data; + eval_tensor->dims = tensors_[tensor_index].dims; + eval_tensor->type = tensors_[tensor_index].type; + return eval_tensor; +} + +void* FakeMicroContext::AllocatePersistentBuffer(size_t bytes) { + // FakeMicroContext use SimpleMemoryAllocator, which does not automatically + // apply the buffer alignment like MicroAllocator. + // The buffer alignment is potentially wasteful but allows the + // fake_micro_context to work correctly with optimized kernels. + return allocator_->AllocatePersistentBuffer(bytes, + MicroArenaBufferAlignment()); +} + +TfLiteStatus FakeMicroContext::RequestScratchBufferInArena(size_t bytes, + int* buffer_index) { + TFLITE_DCHECK(buffer_index != nullptr); + + if (scratch_buffer_count_ == kNumScratchBuffers_) { + MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).", + kNumScratchBuffers_); + return kTfLiteError; + } + + // For tests, we allocate scratch buffers from the tail and keep them around + // for the lifetime of model. This means that the arena size in the tests will + // be more than what we would have if the scratch buffers could share memory. + scratch_buffers_[scratch_buffer_count_] = + allocator_->AllocatePersistentBuffer(bytes, MicroArenaBufferAlignment()); + TFLITE_DCHECK(scratch_buffers_[scratch_buffer_count_] != nullptr); + + *buffer_index = scratch_buffer_count_++; + return kTfLiteOk; +} + +void* FakeMicroContext::GetScratchBuffer(int buffer_index) { + TFLITE_DCHECK(scratch_buffer_count_ <= kNumScratchBuffers_); + if (buffer_index >= scratch_buffer_count_) { + return nullptr; + } + return scratch_buffers_[buffer_index]; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h new file mode 100644 index 00000000..99933c19 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ +#define TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ + +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_graph.h" + +namespace tflite { +// A fake of MicroContext for kernel util tests. +class FakeMicroContext : public MicroContext { + public: + FakeMicroContext(TfLiteTensor* tensors, SimpleMemoryAllocator* allocator, + MicroGraph* micro_graph); + + void* AllocatePersistentBuffer(size_t bytes) override; + TfLiteStatus RequestScratchBufferInArena(size_t bytes, + int* buffer_index) override; + void* GetScratchBuffer(int buffer_index) override; + + TfLiteTensor* AllocateTempTfLiteTensor(int tensor_index) override; + void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) override; + bool IsAllTempTfLiteTensorDeallocated(); + + TfLiteEvalTensor* GetEvalTensor(int tensor_index) override; + + private: + static constexpr int kNumScratchBuffers_ = 12; + + int scratch_buffer_count_ = 0; + uint8_t* scratch_buffers_[kNumScratchBuffers_]; + + TfLiteTensor* tensors_; + int allocated_tensor_count_ = 0; + + SimpleMemoryAllocator* allocator_; + + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations.cc index c556ac64..e0b79631 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_utils.h" namespace tflite { @@ -60,8 +61,8 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } default: { - TF_LITE_KERNEL_LOG(context, "Only float32 is supported currently, got %s", - TfLiteTypeGetName(input->type)); + MicroPrintf("Only float32 is supported currently, got %s", + TfLiteTypeGetName(input->type)); return kTfLiteError; } } @@ -99,8 +100,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } default: { - TF_LITE_KERNEL_LOG(context, "Only float32 is supported currently, got %s", - TfLiteTypeGetName(input->type)); + MicroPrintf("Only float32 is supported currently, got %s", + TfLiteTypeGetName(input->type)); return kTfLiteError; } } @@ -109,25 +110,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_RELU() { - return {/*init=*/ReluInit, - /*free=*/nullptr, - /*prepare=*/ReluPrepare, - /*invoke=*/ReluEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(ReluInit, ReluPrepare, ReluEval); } TfLiteRegistration Register_RELU6() { - return {/*init=*/Relu6Init, - /*free=*/nullptr, - /*prepare=*/Relu6Prepare, - /*invoke=*/Relu6Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Relu6Init, Relu6Prepare, Relu6Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc index 90afe832..2ec3a1bf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc @@ -117,15 +117,21 @@ TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); ReluOpData* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kActivationsInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kActivationsOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kActivationsOutputTensor); TF_LITE_ENSURE(context, output != nullptr); if (input->type == kTfLiteInt8) { CalculateReluOpData(input, output, data); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -133,7 +139,9 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); Relu6OpData* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kActivationsInputTensor); TF_LITE_ENSURE(context, input != nullptr); if (input->type == kTfLiteInt8) { @@ -142,6 +150,8 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, 
TfLiteNode* node) { data->zero_int8 = input->params.zero_point; } + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add.cc index 75523d14..f75db4e5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add.cc @@ -159,14 +159,7 @@ TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration Register_ADD() { - return {/*init=*/AddInit, - /*free=*/nullptr, - /*prepare=*/AddPrepare, - /*invoke=*/AddEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(AddInit, AddPrepare, AddEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc index 3d0c841e..b285b800 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc @@ -80,11 +80,15 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); TFLITE_DCHECK(node->builtin_data != nullptr); - const TfLiteTensor* input1 = GetInput(context, node, kAddInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kAddInputTensor1); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kAddInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kAddInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); - TfLiteTensor* output = GetOutput(context, node, kAddOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kAddOutputTensor); TF_LITE_ENSURE(context, output != nullptr); OpDataAdd* data = static_cast(node->user_data); @@ -93,6 +97,9 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpDataAdd(context, params, input1, input2, output, data)); + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc index b57a2ae6..ce064687 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc @@ -50,18 +50,19 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, num_inputs >= 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input_tensor_first; - TF_LITE_ENSURE_OK( - context, GetInputSafe(context, node, kInputTensor0, &input_tensor_first)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input_tensor_first = + micro_context->AllocateTempInputTensor(node, kInputTensor0); + TF_LITE_ENSURE(context, input_tensor_first != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + 
TF_LITE_ENSURE(context, output != nullptr); // Check that all tensors have the same shape and type. TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_tensor_first->type); for (int i = kInputTensor0 + 1; i < num_inputs; ++i) { - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, i, &input)); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i); + TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE(context, HaveSameShapes(input_tensor_first, input)); TF_LITE_ENSURE_TYPES_EQ(context, input_tensor_first->type, input->type); @@ -72,6 +73,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, input_tensor_first->params.scale == input->params.scale); } + + micro_context->DeallocateTempTfLiteTensor(input); } if (output->type == kTfLiteFloat32) { @@ -123,6 +126,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(input_tensor_first); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -202,14 +208,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_ADD_N() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/arg_min_max.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/arg_min_max.cc index 8217a4a0..a8aa5a48 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/arg_min_max.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/arg_min_max.cc @@ -104,25 +104,11 @@ TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace arg_min_max TfLiteRegistration Register_ARG_MAX() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/arg_min_max::ArgMaxEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, nullptr, arg_min_max::ArgMaxEval); } TfLiteRegistration Register_ARG_MIN() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/arg_min_max::ArgMinEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, nullptr, arg_min_max::ArgMinEval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc index a583a067..a770d0aa 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc @@ -52,21 +52,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { input_resource_id_tensor->type == kTfLiteInt32)); TF_LITE_ENSURE_EQ(context, NumElements(input_resource_id_tensor->dims), 1); - const TfLiteTensor* input_value = GetInput(context, node, kInputValue); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + TfLiteTensor* input_value = + micro_context->AllocateTempInputTensor(node, kInputValue); TFLITE_DCHECK(input_value != nullptr); - // Casting to TfliteIntArray is 
required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); TF_LITE_ENSURE_OK(context, resources->Allocate(input_resource_id_tensor->data.i32[0], context, input_value)); + micro_context->DeallocateTempTfLiteTensor(input_value); return kTfLiteOk; } @@ -79,14 +77,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalInput(context, node, kInputValue); TFLITE_DCHECK(input_value != nullptr); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); if (resources == nullptr) { MicroPrintf( "ASSIGN_VARIABLE requires resource variables. Please create " @@ -101,14 +95,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. TfLiteRegistration Register_ASSIGN_VARIABLE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc index a6fa0462..be82d942 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc @@ -41,8 +41,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, input != nullptr && output != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum); @@ -51,6 +55,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -98,14 +105,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
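+// (Note: tflite::micro::RegisterOp, used throughout these kernel diffs,
+// builds a TfLiteRegistration from just the init/prepare/invoke entry points
+// and defaults the remaining fields (profiling_string, builtin_code,
+// custom_name, version) that the removed brace initializers spelled out by
+// hand.)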
TfLiteRegistration Register_BATCH_TO_SPACE_ND() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc new file mode 100644 index 00000000..be2672ec --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc @@ -0,0 +1,91 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/broadcast_args.h" + +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { +namespace { +constexpr int kShape1Tensor = 0; +constexpr int kShape2Tensor = 1; +constexpr int kOutputTensor = 0; + +TfLiteStatus BroadcastArgsPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, NumInputs(node) == 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* shape1 = + micro_context->AllocateTempInputTensor(node, kShape1Tensor); + TfLiteTensor* shape2 = + micro_context->AllocateTempInputTensor(node, kShape2Tensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + + TF_LITE_ENSURE(context, + shape1->type == kTfLiteInt32 || shape1->type == kTfLiteInt64); + TF_LITE_ENSURE_EQ(context, shape1->type, shape2->type); + TF_LITE_ENSURE_EQ(context, shape1->type, output->type); + + // Ensures the shapes are 1D tensor. 
+ TF_LITE_ENSURE_EQ(context, NumDimensions(shape1), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(shape2), 1); + + // Ensure the shape of the output tensor is compatible. + TF_LITE_ENSURE_EQ(context, NumDimensions(output), 1); + + micro_context->DeallocateTempTfLiteTensor(shape1); + micro_context->DeallocateTempTfLiteTensor(shape2); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus BroadcastArgsEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* shape1 = + micro::GetEvalInput(context, node, kShape1Tensor); + const TfLiteEvalTensor* shape2 = + micro::GetEvalInput(context, node, kShape2Tensor); + TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor); + + if (output->type == kTfLiteInt32) { + reference_ops::BroadcastArgs( + micro::GetTensorShape(shape1), micro::GetTensorData<int32_t>(shape1), + micro::GetTensorShape(shape2), micro::GetTensorData<int32_t>(shape2), + micro::GetTensorShape(output), micro::GetTensorData<int32_t>(output)); + } else { + reference_ops::BroadcastArgs( + micro::GetTensorShape(shape1), micro::GetTensorData<int64_t>(shape1), + micro::GetTensorShape(shape2), micro::GetTensorData<int64_t>(shape2), + micro::GetTensorShape(output), micro::GetTensorData<int64_t>(output)); + } + + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_BROADCAST_ARGS() { + return tflite::micro::RegisterOp(nullptr, BroadcastArgsPrepare, + BroadcastArgsEval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc new file mode 100644 index 00000000..63a14db2 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc @@ -0,0 +1,123 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/broadcast_to.h" + +#include <cstdint> + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { + +namespace { +constexpr int kInputTensor = 0; +constexpr int kShapeTensor = 1; +constexpr int kOutputTensor = 0; +// Support a maximum of 5 dimensions in TFLM. +constexpr int kMaxDims = 5; + +TfLiteStatus ValidateOutputTensor(TfLiteContext* context, TfLiteTensor* input, + TfLiteTensor* shape, TfLiteTensor* output) { + // Ensure the shape is a 1D tensor. + TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1); + + // Ensure output dims are not fewer than input dims.
+ int input_num_dims = NumDimensions(input); + int output_num_dims = NumDimensions(output); + int shape_num_dims = SizeOfDimension(shape, 0); + TF_LITE_ENSURE_MSG(context, output_num_dims == shape_num_dims, + "Output must match with the expected shape dimension."); + TF_LITE_ENSURE_MSG(context, input_num_dims <= output_num_dims, + "Output shape must be broadcastable from input shape."); + TF_LITE_ENSURE_MSG(context, output_num_dims <= kMaxDims, + "BroadcastTo only supports 1-5D tensor."); + + // Check if output shape is broadcastable from input shape. + auto get_shape_data = [shape](int i) -> int32_t { + if (shape->type == kTfLiteInt32) { + return GetTensorData<int32_t>(shape)[i]; + } else { + return GetTensorData<int64_t>(shape)[i]; + } + }; + + int extending_dims = output_num_dims - input_num_dims; + for (int idx = 0; idx < input_num_dims; ++idx) { + TF_LITE_ENSURE_MSG( + context, + (SizeOfDimension(input, idx) == 1 || + SizeOfDimension(input, idx) == get_shape_data(extending_dims + idx)), + "Output shape must be broadcastable from input shape."); + } + + // Validate the shape of the output tensor. + tflite::RuntimeShape output_shape = tflite::GetTensorShape(output); + for (int idx = 0; idx < output_num_dims; ++idx) { + TF_LITE_ENSURE(context, output_shape.Dims(idx) == get_shape_data(idx)); + } + return kTfLiteOk; +} + +TfLiteStatus BroadcastToPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, NumInputs(node) == 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* shape = + micro_context->AllocateTempInputTensor(node, kShapeTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + + TF_LITE_ENSURE_MSG(context, (NumDimensions(input) <= kMaxDims), + "BroadcastTo only supports 1-5D tensor."); + + TF_LITE_ENSURE(context, + shape->type == kTfLiteInt32 || shape->type == kTfLiteInt64); + TF_LITE_ENSURE_EQ(context, input->type, output->type); + + // Does not support String type due to its variable size. This limitation is + // the same as in TFLite. + TF_LITE_ENSURE(context, input->type != kTfLiteString); + + TF_LITE_ENSURE_STATUS(ValidateOutputTensor(context, input, shape, output)); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(shape); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus BroadcastToEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kInputTensor); + TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor); + + // The BroadcastTo op supports up to 5 dims, unlike the 8 dims supported in TFLite.
+ reference_ops::BroadcastTo( + micro::GetTensorShape(input), input->data.raw, + micro::GetTensorShape(output), output->data.raw, input->type); + return kTfLiteOk; +} +} // namespace + +TfLiteRegistration Register_BROADCAST_TO() { + return tflite::micro::RegisterOp(nullptr, BroadcastToPrepare, + BroadcastToEval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc index 97fded0c..200242b2 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_graph.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -50,16 +51,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumInputs(node) == 0); TF_LITE_ENSURE(context, NumOutputs(node) == 0); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); TF_LITE_ENSURE(context, - op_data->init_subgraph_index < graph_info->NumSubgraphs()); + op_data->init_subgraph_index < graph_info.NumSubgraphs()); return kTfLiteOk; } @@ -72,16 +68,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); TF_LITE_ENSURE_OK(context, - graph_info->InvokeSubgraph(op_data->init_subgraph_index)); + graph_info.InvokeSubgraph(op_data->init_subgraph_index)); op_data->has_run = true; @@ -91,14 +82,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. 
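The ASSIGN_VARIABLE and CALL_ONCE hunks above also retire the old trick of smuggling a MicroGraph pointer out of TfLiteContext::GetExecutionPlan via a reinterpret_cast; kernels now ask the MicroContext for the graph directly. A minimal sketch of the new access pattern, assuming a kernel that keeps a subgraph index in its op data (InvokeStoredSubgraph is an illustrative name, not from this diff):

// Illustrative only: fetch the MicroGraph through MicroContext and run a
// subgraph, mirroring the calls made in the CALL_ONCE hunks above.
TfLiteStatus InvokeStoredSubgraph(TfLiteContext* context, int subgraph_index) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  tflite::MicroGraph& graph_info = micro_context->graph();
  // Validate the index before dispatching, as the Prepare hunk does.
  TF_LITE_ENSURE(context, subgraph_index < graph_info.NumSubgraphs());
  return graph_info.InvokeSubgraph(subgraph_index);
}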
TfLiteRegistration Register_CALL_ONCE() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc index 0314e523..a1f4516b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc @@ -28,11 +28,19 @@ constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -83,6 +91,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt32: return copyToTensor(context, tflite::micro::GetTensorData(input), output, num_elements); + case kTfLiteUInt32: + return copyToTensor(context, + tflite::micro::GetTensorData(input), output, + num_elements); case kTfLiteFloat32: return copyToTensor(context, tflite::micro::GetTensorData(input), output, num_elements); @@ -96,14 +108,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_CAST() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc index f929ce62..a390a735 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc @@ -29,9 +29,13 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -42,6 +46,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + micro_context->DeallocateTempTfLiteTensor(input); + 
micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -61,14 +67,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace ceil TfLiteRegistration Register_CEIL() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ceil::Prepare, - /*invoke=*/ceil::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, ceil::Prepare, ceil::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer.cc index bda3e66a..a66a61c5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer.cc @@ -108,14 +108,7 @@ TfLiteStatus CircularBufferEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration* Register_CIRCULAR_BUFFER() { - static TfLiteRegistration r = {/*init=*/CircularBufferInit, - /*free=*/nullptr, - /*prepare=*/CircularBufferPrepare, - /*invoke=*/CircularBufferEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + static TfLiteRegistration r = tflite::micro::RegisterOp(CircularBufferInit, CircularBufferPrepare, CircularBufferEval); return &r; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc index 0bb4d476..682efb43 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc @@ -39,9 +39,13 @@ const int kCircularBufferCyclesMaxIndex = 0; // 'cycles_max' const TfLiteStatus kTfLiteAbort = static_cast(-9); TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = - GetInput(context, node, kCircularBufferInputTensor); - TfLiteTensor* output = GetOutput(context, node, kCircularBufferOutputTensor); + + MicroContext * micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context-> AllocateTempInputTensor(node, kCircularBufferInputTensor); + TfLiteTensor* output = + micro_context-> AllocateTempOutputTensor(node, kCircularBufferOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); OpDataCircularBuffer* op_data = @@ -85,6 +89,9 @@ TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) { op_data->cycles_until_run = op_data->cycles_max; node->user_data = op_data; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc index eb39d9ea..cff15e4d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc @@ -540,9 +540,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); - const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); 
TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); if (input1->type == kTfLiteInt8) { @@ -570,75 +574,42 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->params.input2_shift = input2_shift; } + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + return kTfLiteOk; } } // namespace comparisons TfLiteRegistration Register_EQUAL() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::EqualEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::EqualEval); } TfLiteRegistration Register_NOT_EQUAL() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::NotEqualEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::NotEqualEval); } TfLiteRegistration Register_GREATER() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::GreaterEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::GreaterEval); } TfLiteRegistration Register_GREATER_EQUAL() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::GreaterEqualEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::GreaterEqualEval); } TfLiteRegistration Register_LESS() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::LessEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::LessEval); } TfLiteRegistration Register_LESS_EQUAL() { - return {/*init=*/comparisons::Init, - /*free=*/nullptr, - /*prepare=*/comparisons::Prepare, - /*invoke=*/comparisons::LessEqualEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(comparisons::Init, comparisons::Prepare, + comparisons::LessEqualEval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc index 8f45ac6a..34622c22 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc @@ -115,13 +115,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteConcatenationParams* params = reinterpret_cast(node->builtin_data); - const TfLiteTensor* input_tensor = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input_tensor = 
micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input_tensor != nullptr); TfLiteType input_type = input_tensor->type; - const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output_tensor = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output_tensor != nullptr); TfLiteType output_type = output_tensor->type; + micro_context->DeallocateTempTfLiteTensor(input_tensor); + micro_context->DeallocateTempTfLiteTensor(output_tensor); + // Check activation and input type TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); TF_LITE_ENSURE(context, @@ -138,25 +144,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Shapes with dimensions >4 are not yet supported with static allocation. for (int i = 0; i < num_inputs; ++i) { - const TfLiteTensor* input = GetInput(context, node, i); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i); TF_LITE_ENSURE(context, input != nullptr); int num_dimensions = NumDimensions(input); - if (num_dimensions > 4) { + if (num_dimensions > RuntimeShape::kMaxSmallSize) { TF_LITE_KERNEL_LOG( context, - "Op Concatenation does not currently support num dimensions >4 " + "Op Concatenation does not currently support num dimensions > %d " "Tensor has %d dimensions.", - num_dimensions); + RuntimeShape::kMaxSmallSize, num_dimensions); return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(input); } // Calculate OpData. TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); switch (output_type) { // Already know in/outtypes are same. @@ -183,10 +191,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Allocate persistent scale and zeropoint buffers. // Store input scale and zero point values in OpParams: for (int i = 0; i < node->inputs->size; ++i) { - const TfLiteTensor* t = GetInput(context, node, i); + TfLiteTensor* t = micro_context->AllocateTempInputTensor(node, i); TF_LITE_ENSURE(context, t != nullptr); input_scales[i] = t->params.scale; input_zero_points[i] = t->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(t); } data->params.input_scale = input_scales; @@ -202,6 +211,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -241,14 +252,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace concatenation TfLiteRegistration Register_CONCATENATION() { - return {/*init=*/concatenation::Init, - /*free=*/nullptr, - /*prepare=*/concatenation::Prepare, - /*invoke=*/concatenation::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(concatenation::Init, concatenation::Prepare, + concatenation::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.cc index 0fed1223..87ea92e6 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" namespace tflite { namespace { @@ -67,23 +68,47 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); break; } case kTfLiteInt16: { - reference_integer_ops::ConvPerChannel( - ConvParamsQuantized(params, data), data.per_channel_output_multiplier, - data.per_channel_output_shift, tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(filter), - tflite::micro::GetTensorData(filter), - tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); + switch (bias->type) { + case kTfLiteInt32: { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + case kTfLiteInt64: { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data), + data.per_channel_output_multiplier, data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: + MicroPrintf("Bias type %s (%d) not supported.", + TfLiteTypeGetName(bias->type), bias->type); + return kTfLiteError; + } break; } case kTfLiteInt8: { @@ -94,14 +119,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; } default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(input->type), input->type); + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(input->type), + input->type); return kTfLiteError; } return kTfLiteOk; @@ -110,14 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_CONV_2D() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/ConvPrepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, ConvPrepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h index 4089a965..06b35e1e 100644 
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -79,7 +79,8 @@ TfLiteRegistration Register_CONV_2D(); #if defined(XTENSA) // Returns a TfLiteRegistration struct for kernel variant that only supports -// int8 inputs and outputs. +// int8 activations and int8 weights and always calls the reference +// implementation. TfLiteRegistration Register_CONV_2D_INT8REF(); #else inline TfLiteRegistration Register_CONV_2D_INT8REF() { @@ -87,6 +88,25 @@ inline TfLiteRegistration Register_CONV_2D_INT8REF() { } #endif +#if defined(CMSIS_NN) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration Register_CONV_2D_INT8(); + +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int16 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration Register_CONV_2D_INT16(); + +#else +inline TfLiteRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); } + +inline TfLiteRegistration Register_CONV_2D_INT16() { + return Register_CONV_2D(); +} +#endif + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc index 6887e423..7115f7ba 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc @@ -93,13 +93,18 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, params.dilation_width_factor, height, width, filter_height, filter_width, padding, &out_height, &out_width); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kConvBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); // Note that quantized inference requires that all tensors have their @@ -119,6 +124,11 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, data->filter_zero_point = filter->params.zero_point; data->output_zero_point = output->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(bias); + return kTfLiteOk; } @@ -129,12 +139,16 @@ TfLiteStatus 
ConvPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataConv* data = static_cast(node->user_data); const auto& params = *(static_cast(node->builtin_data)); + MicroContext* micro_context = GetMicroContext(context); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); const int input_width = input->dims->data[2]; @@ -174,6 +188,10 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) { context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, data)); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_test.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_test.h index 38b69525..47ba8ac4 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_test.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_test.h @@ -97,6 +97,16 @@ TfLiteStatus TestConvQuantizedPerChannel( float output_scale, int output_zero_point, TfLiteConvParams* conv_params, TfLiteRegistration registration, int16_t* output_data); +TfLiteStatus TestConvQuantizedPerChannel( + int* input_dims_data, const float* input_data, int16_t* input_quantized, + float input_scale, int input_zero_point, int* filter_dims_data, + const float* filter_data, int8_t* filter_data_quantized, + int* bias_dims_data, const float* bias_data, int32_t* bias_data_quantized, + float* bias_scales, int* bias_zero_points, int* output_dims_data, + const float* expected_output_data, int16_t* expected_output_data_quantized, + float output_scale, int output_zero_point, TfLiteConvParams* conv_params, + TfLiteRegistration registration, int16_t* output_data); + } // namespace testing } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc index 2dc9f98f..eedc61fd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc @@ -47,8 +47,12 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* axis = GetInput(context, node, kAxisTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* axis = + micro_context->AllocateTempInputTensor(node, kAxisTensor); TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 || input->type == kTfLiteInt8); @@ -58,7 +62,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { 
TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE_EQ(context, input->type, output->type); TF_LITE_ENSURE(context, HaveSameShapes(input, output)); @@ -91,6 +96,10 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { &data->output_activation_max)); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -160,14 +169,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_CUMSUM() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc index ae42ee1b..ec000540 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc @@ -40,11 +40,14 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); @@ -83,6 +86,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { output->dims->data[kWidthRank] = output_width; output->dims->data[kDepthRank] = output_channels; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -130,14 +136,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_DEPTH_TO_SPACE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.cc index 8a58433a..d2468ff9 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.cc @@ -62,7 +62,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), 
tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -76,7 +76,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -92,14 +92,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_DEPTHWISE_CONV_2D() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/DepthwiseConvPrepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, DepthwiseConvPrepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.h index 7a7eb0ba..562438d7 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -49,6 +49,32 @@ TfLiteStatus CalculateOpDataDepthwiseConv( TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node); +// This is the most generic TfLiteRegistration. The actual supported types may +// still be target dependent. The only requirement is that every implementation +// (reference or optimized) must define this function. +TfLiteRegistration Register_DEPTHWISE_CONV_2D(); + +#if defined(CMSIS_NN) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8(); + +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int16 activations and int8 weights and uses the latency optimized +// implementations. 
+TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16(); + +#else +inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT8() { + return Register_DEPTHWISE_CONV_2D(); +} + +inline TfLiteRegistration Register_DEPTHWISE_CONV_2D_INT16() { + return Register_DEPTHWISE_CONV_2D(); +} +#endif + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_DEPTHWISE_CONV_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc index 49167f38..3bf07274 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc @@ -94,13 +94,18 @@ TfLiteStatus CalculateOpDataDepthwiseConv( params.dilation_width_factor, height, width, filter_height, filter_width, padding, &out_height, &out_width); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kConvBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); // Note that quantized inference requires that all tensors have their @@ -120,6 +125,11 @@ TfLiteStatus CalculateOpDataDepthwiseConv( data->filter_zero_point = filter->params.zero_point; data->output_zero_point = output->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(bias); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -130,14 +140,16 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataConv* data = static_cast(node->user_data); const auto& params = *(static_cast(node->builtin_data)); + MicroContext* micro_context = GetMicroContext(context); - TfLiteTensor* output = GetOutput(context, node, kDepthwiseConvOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kDepthwiseConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = - GetInput(context, node, kDepthwiseConvInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kDepthwiseConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = - GetInput(context, node, kDepthwiseConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kDepthwiseConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); const int input_width = input->dims->data[2]; @@ -180,6 +192,10 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, data)); + micro_context->DeallocateTempTfLiteTensor(output); + 
micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize.cc index 4438ea33..1cf7f133 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize.cc @@ -57,6 +57,13 @@ TfLiteStatus DequantizeEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; + case kTfLiteUInt8: + reference_ops::Dequantize(data->quantization_params, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; default: MicroPrintf("Input %s, output %s not supported.", TfLiteTypeGetName(input->type), @@ -74,14 +81,8 @@ TfLiteStatus DequantizeEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration Register_DEQUANTIZE() { - return {/*init=*/DequantizeInit, - /*free=*/nullptr, - /*prepare=*/DequantizePrepare, - /*invoke=*/DequantizeEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(DequantizeInit, DequantizePrepare, + DequantizeEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc index 00b47f57..438f9cda 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc @@ -33,14 +33,17 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + MicroContext* micro_context = GetMicroContext(context); + // TODO(b/140515557): Add cached dequant to improve hybrid model performance. 
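The hunk continues below with the same temp-tensor conversion applied throughout this diff: GetInput/GetOutput calls become MicroContext::AllocateTempInputTensor / AllocateTempOutputTensor, and every tensor allocated that way must be released again with DeallocateTempTfLiteTensor before Prepare returns. Distilled to a sketch (PrepareSketch and the index values are placeholders, not names from this diff):

// Illustrative shape of the allocate/inspect/deallocate lifecycle in Prepare.
TfLiteStatus PrepareSketch(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, /*input_index=*/0);
  TfLiteTensor* output =
      micro_context->AllocateTempOutputTensor(node, /*output_index=*/0);
  TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
  // ... shape/type checks and op-data capture go here ...
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
  // Temp tensors come from scratch memory; release them before returning.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}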
- const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); - TF_LITE_ENSURE(context, - input->type == kTfLiteInt8 || input->type == kTfLiteInt16); + TF_LITE_ENSURE(context, input->type == kTfLiteInt8 || + input->type == kTfLiteInt16 || + input->type == kTfLiteUInt8); TF_LITE_ENSURE(context, output->type == kTfLiteFloat32); if (output->type == kTfLiteInt32) { @@ -54,6 +57,10 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) { data->quantization_params.zero_point = input->params.zero_point; data->quantization_params.scale = static_cast<double>(input->params.scale); data->output_zero_point = output->params.zero_point; + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc index 5ac343cf..326d87b5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <algorithm> #include <numeric> +#include <tuple> #include "flatbuffers/flexbuffers.h" #include "tensorflow/lite/c/builtin_op_data.h" @@ -147,19 +149,20 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { return op_data; } -void Free(TfLiteContext* context, void* buffer) {} - TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { auto* op_data = static_cast<OpData*>(node->user_data); + MicroContext* micro_context = GetMicroContext(context); + // Inputs: box_encodings, scores, anchors TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); - const TfLiteTensor* input_box_encodings = - GetInput(context, node, kInputTensorBoxEncodings); - const TfLiteTensor* input_class_predictions = - GetInput(context, node, kInputTensorClassPredictions); - const TfLiteTensor* input_anchors = - GetInput(context, node, kInputTensorAnchors); + TfLiteTensor* input_box_encodings = + micro_context->AllocateTempInputTensor(node, kInputTensorBoxEncodings); + TfLiteTensor* input_class_predictions = + micro_context->AllocateTempInputTensor(node, + kInputTensorClassPredictions); + TfLiteTensor* input_anchors = + micro_context->AllocateTempInputTensor(node, kInputTensorAnchors); TF_LITE_ENSURE_EQ(context, NumDimensions(input_box_encodings), 3); TF_LITE_ENSURE_EQ(context, NumDimensions(input_class_predictions), 3); TF_LITE_ENSURE_EQ(context, NumDimensions(input_anchors), 2); @@ -217,6 +220,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // num_detections TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4); + micro_context->DeallocateTempTfLiteTensor(input_box_encodings); + micro_context->DeallocateTempTfLiteTensor(input_class_predictions); + micro_context->DeallocateTempTfLiteTensor(input_anchors); + return kTfLiteOk; } @@ -313,9 +320,10 @@ TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node, void DecreasingPartialArgSort(const float* values, int num_values, int
num_to_sort, int* indices) { std::iota(indices, indices + num_values, 0); - std::partial_sort( - indices, indices + num_to_sort, indices + num_values, - [&values](const int i, const int j) { return values[i] > values[j]; }); + std::partial_sort(indices, indices + num_to_sort, indices + num_values, + [&values](const int i, const int j) { + return std::tie(values[i], j) > std::tie(values[j], i); + }); } template @@ -792,14 +800,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration* Register_DETECTION_POSTPROCESS() { - static TfLiteRegistration r = {/*init=*/Init, - /*free=*/Free, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + static TfLiteRegistration r = tflite::micro::RegisterOp(Init, Prepare, Eval); return &r; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc index 581e532b..b1cb1dcb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ limitations under the License. #include #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" @@ -27,6 +29,22 @@ namespace micro { namespace elementwise { namespace { +constexpr int kAbsNameId = 0; +constexpr int kRsrqtNameId = 1; + +const int kElementwiseInputTensor = 0; +const int kElementwiseOutputTensor = 0; + +struct OpDataAbsRsqrt { + int32_t multiplier; + int shift; + int input_offset; + int output_offset; + bool needs_rescale; + TfLiteQuantizationType input_quantization_type; + TfLiteType input_type; +}; + bool IsNumericSupportedType(const TfLiteType type) { return type == kTfLiteFloat32; } @@ -35,14 +53,40 @@ bool IsLogicalSupportedType(const TfLiteType type) { return type == kTfLiteBool; } +bool IsAbsSupportedType(const TfLiteType type) { + return type == kTfLiteFloat32 || type == kTfLiteInt8 || type == kTfLiteInt16; +} + +bool IsRsqrtSupportedType(const TfLiteType type) { + return type == kTfLiteFloat32 || type == kTfLiteInt8; +} + +inline void SetAbsOutputMultiplier(const float input_scale, + const float output_scale, + int32_t* multiplier, int* shift) { + QuantizeMultiplier(static_cast(input_scale / output_scale), + multiplier, shift); +} + +inline void SetRsqrtOutputMultiplier(const float input_scale, + const float output_scale, + int32_t* multiplier, int* shift) { + const double scale = + 1. 
/ static_cast((std::sqrt(input_scale) * output_scale)); + QuantizeMultiplier(scale, multiplier, shift); +} + typedef bool (*IsSupportedType)(TfLiteType); template TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kElementwiseInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kElementwiseOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (!IsSupportedType(input->type)) { @@ -50,12 +94,85 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTypeGetName(input->type), input->type); return kTfLiteError; } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +typedef bool (*IsSupportedType)(TfLiteType); +template +TfLiteStatus PrepareAbsRsqrt(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + TF_LITE_ENSURE(context, output != nullptr); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + if (!IsSupportedType(input->type)) { + TF_LITE_KERNEL_LOG(context, "Input data type %s (%d) is not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + + auto* op_data = static_cast(node->user_data); + op_data->input_type = input->type; + + // For int16 type input, we support both quantized and non-quantized + // evaluation. 
+ if (op_nameid == kAbsNameId) { + op_data->input_quantization_type = input->quantization.type; + } + + if (input->type == kTfLiteInt8 || + (input->type == kTfLiteInt16 && + input->quantization.type != kTfLiteNoQuantization)) { + TF_LITE_ENSURE_EQ(context, input->quantization.type, + kTfLiteAffineQuantization); + TF_LITE_ENSURE_EQ(context, output->quantization.type, + kTfLiteAffineQuantization); + const auto* input_params = + reinterpret_cast(input->quantization.params); + const auto* output_params = reinterpret_cast( + output->quantization.params); + TF_LITE_ENSURE(context, input_params != nullptr); + TF_LITE_ENSURE(context, input_params->scale != nullptr); + TF_LITE_ENSURE(context, input_params->scale->size > 0); + TF_LITE_ENSURE(context, input_params->zero_point->size > 0); + TF_LITE_ENSURE(context, output_params != nullptr); + TF_LITE_ENSURE(context, output_params->scale != nullptr); + TF_LITE_ENSURE(context, output_params->scale->size > 0); + TF_LITE_ENSURE(context, output_params->zero_point->size > 0); + op_data->input_offset = input_params->zero_point->data[0]; + op_data->output_offset = output_params->zero_point->data[0]; + if (input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, op_data->input_offset, 0); + TF_LITE_ENSURE_EQ(context, op_data->output_offset, 0); + } + const float input_scale = input_params->scale->data[0]; + const float output_scale = output_params->scale->data[0]; + op_data->needs_rescale = input_scale != output_scale; + if (op_nameid == kAbsNameId && op_data->needs_rescale) { + SetAbsOutputMultiplier(input_scale, output_scale, &op_data->multiplier, + &op_data->shift); + } else if (op_nameid == kRsrqtNameId) { + SetRsqrtOutputMultiplier(input_scale, output_scale, &op_data->multiplier, + &op_data->shift); + } + } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } template -inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, - T func(T), TfLiteType expected_type) { +inline TfLiteStatus EvalImplQuantized( + TfLiteContext* context, TfLiteNode* node, + T func(TfLiteContext*, TfLiteNode*, T), + TfLiteStatus validate_input_func(TfLiteContext*, TfLiteNode*, T), + TfLiteType expected_type) { const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type); @@ -63,6 +180,34 @@ inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, const T* in_data = tflite::micro::GetTensorData(input); T* out_data = tflite::micro::GetTensorData(output); for (size_t i = 0; i < num_elements; ++i) { + if (validate_input_func) { + TF_LITE_ENSURE_OK(context, + validate_input_func(context, node, in_data[i])); + } + out_data[i] = func(context, node, in_data[i]); + } + return kTfLiteOk; +} + +template +inline T AbsHelper(T i) { + return std::abs(i); +} + +template +inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, + T func(T), TfLiteStatus validate_input_func(T), + TfLiteType expected_type) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type); + const size_t num_elements = ElementCount(*input->dims); + const T* in_data = tflite::micro::GetTensorData(input); + T* out_data = tflite::micro::GetTensorData(output); + for (size_t i = 0; i < num_elements; 
++i) { + if (validate_input_func) { + TF_LITE_ENSURE_OK(context, validate_input_func(in_data[i])); + } out_data[i] = func(in_data[i]); } return kTfLiteOk; @@ -70,16 +215,114 @@ inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, inline TfLiteStatus EvalNumeric(TfLiteContext* context, TfLiteNode* node, float float_func(float)) { - return EvalImpl(context, node, float_func, kTfLiteFloat32); + return EvalImpl(context, node, float_func, + /*validate_input_func=*/nullptr, kTfLiteFloat32); } inline TfLiteStatus EvalLogical(TfLiteContext* context, TfLiteNode* node, + bool bool_func(bool)) { - return EvalImpl(context, node, bool_func, kTfLiteBool); + return EvalImpl(context, node, bool_func, + /*validate_input_func=*/nullptr, kTfLiteBool); +} + +void* ElementWiseAbsRsqrtInit(TfLiteContext* context, const char* buffer, + size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataAbsRsqrt)); +} + +template +inline T AbsEvalQuantized(TfLiteContext* context, TfLiteNode* node, T i) { + const auto* op_data = static_cast(node->user_data); + const int kMin = std::numeric_limits::min(); + const int kMax = std::numeric_limits::max(); + + const int32_t value = std::abs(i - op_data->input_offset); + if (!op_data->needs_rescale) { + return static_cast( + std::min(std::max(static_cast(value + op_data->output_offset), + static_cast(kMin)), + static_cast(kMax))); + } + + const int32_t output = tflite::MultiplyByQuantizedMultiplier( + value, op_data->multiplier, op_data->shift) + + op_data->output_offset; + return static_cast(std::min( + std::max(static_cast(output), static_cast(kMin)), + static_cast(kMax))); +} + +template +inline T RsqrtEvalQuantized(TfLiteContext* context, TfLiteNode* node, T i) { + const auto* op_data = static_cast(node->user_data); + const int kMin = std::numeric_limits::min(); + const int kMax = std::numeric_limits::max(); + + const int32_t value = (i - op_data->input_offset); + const int32_t kShift = 20; // Shift to keep value integer. + if (value == 0) { + // Assume that any value close to 0 represents the max output value. 
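The quantized Abs and Rsqrt paths above rest on two fixed-point primitives: QuantizeMultiplier, which the Set*OutputMultiplier helpers use to encode a real-valued scale as a Q31 multiplier plus a power-of-two shift, and tflite::MultiplyByQuantizedMultiplier, which applies that pair. Minimal sketches of both follow; they are illustrative approximations, not the library implementations (the real ones live in tensorflow/lite/kernels/internal/quantization_util.cc and common.h).

```
#include <cmath>
#include <cstdint>

// Sketch: split a real scale into a Q31 multiplier and a power-of-two shift.
void QuantizeMultiplierSketch(double real_scale, int32_t* multiplier, int* shift) {
  if (real_scale == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  const double fraction = std::frexp(real_scale, shift);  // fraction in [0.5, 1)
  int64_t q = static_cast<int64_t>(std::round(fraction * (1LL << 31)));
  if (q == (1LL << 31)) {  // rounding can push the fraction up to exactly 1.0
    q /= 2;
    ++(*shift);
  }
  *multiplier = static_cast<int32_t>(q);
}

// Sketch: apply the (multiplier, shift) pair; the real version uses gemmlowp's
// saturating rounding doubling high-mul and a sign-aware rounding shift.
int32_t MultiplyByQuantizedMultiplierSketch(int32_t x, int32_t multiplier,
                                            int shift) {
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  const int64_t prod =
      static_cast<int64_t>(x) * (int64_t{1} << left_shift) * multiplier;
  int32_t high = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
  if (right_shift > 0) {
    high = (high + (1 << (right_shift - 1))) >> right_shift;  // round to nearest
  }
  return high;
}
```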
+ return static_cast(kMax); + } + int32_t inv_sqrt_multiplier; + int inv_sqrt_shift; + GetInvSqrtQuantizedMultiplierExp(value, kReverseShift, &inv_sqrt_multiplier, + &inv_sqrt_shift); + const int32_t data = tflite::MultiplyByQuantizedMultiplier( + static_cast(1), inv_sqrt_multiplier, inv_sqrt_shift + kShift); + const int32_t output = + tflite::MultiplyByQuantizedMultiplier(data, op_data->multiplier, + op_data->shift - kShift) + + op_data->output_offset; + return static_cast(std::min( + std::max(static_cast(output), static_cast(kMin)), + static_cast(kMax))); +} + +template +TfLiteStatus validate_input_func(TfLiteContext* context, TfLiteNode* node, + T i) { + const auto* op_data = static_cast(node->user_data); + + TF_LITE_ENSURE_MSG(context, i >= op_data->input_offset, + "Rsqrt is only defined for positive values"); + return static_cast(kTfLiteOk); } TfLiteStatus AbsEval(TfLiteContext* context, TfLiteNode* node) { - return EvalNumeric(context, node, std::abs); + OpDataAbsRsqrt* op_data = reinterpret_cast(node->user_data); + TfLiteType type = op_data->input_type; + TfLiteQuantizationType input_quantization_type = + op_data->input_quantization_type; + TfLiteStatus eval_result; + + switch (type) { + case kTfLiteFloat32: + eval_result = EvalNumeric(context, node, std::abs); + break; + case kTfLiteInt8: + eval_result = + EvalImplQuantized(context, node, AbsEvalQuantized, + /*validate_input_func=*/nullptr, type); + break; + case kTfLiteInt16: + eval_result = + input_quantization_type == kTfLiteNoQuantization + ? EvalImpl(context, node, AbsHelper, + /*validate_input_func=*/nullptr, type) + : EvalImplQuantized(context, node, AbsEvalQuantized, + /*validate_input_func=*/nullptr, + type); + break; + default: + TF_LITE_KERNEL_LOG(context, "Current data type %s is not supported.", + TfLiteTypeGetName(type)); + return kTfLiteError; + break; + } + return eval_result; } TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) { @@ -99,7 +342,23 @@ TfLiteStatus SqrtEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus RsqrtEval(TfLiteContext* context, TfLiteNode* node) { - return EvalNumeric(context, node, [](float f) { return 1.f / std::sqrt(f); }); + const auto* op_data = static_cast(node->user_data); + TfLiteType type = op_data->input_type; + switch (type) { + case kTfLiteFloat32: + return EvalImpl( + context, node, [](float f) { return 1.f / std::sqrt(f); }, + /*validate_input_func=*/nullptr, type); + case kTfLiteInt8: + return EvalImplQuantized(context, node, + elementwise::RsqrtEvalQuantized, + elementwise::validate_input_func, type); + + default: + TF_LITE_KERNEL_LOG(context, "Current data type %s is not supported.", + TfLiteTypeGetName(type)); + return kTfLiteError; + } } TfLiteStatus SquareEval(TfLiteContext* context, TfLiteNode* node) { @@ -114,101 +373,57 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) { } // namespace elementwise TfLiteRegistration Register_ABS() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::AbsEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + elementwise::ElementWiseAbsRsqrtInit, + elementwise::PrepareAbsRsqrt, + elementwise::AbsEval); } TfLiteRegistration Register_SIN() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::SinEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - 
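The registration rewrites that follow are pure boilerplate removal: each hand-written TfLiteRegistration brace-initializer collapses into the tflite::micro::RegisterOp helper. Its definition is outside this diff; a plausible shape, sketched:

```
#include "tensorflow/lite/c/common.h"

// Sketch: zero-initialize the registration and set only the three callbacks;
// profiling_string, builtin_code, custom_name and version keep their zero
// defaults, exactly as the verbose initializers being deleted set them.
TfLiteRegistration RegisterOpSketch(
    void* (*init)(TfLiteContext*, const char*, size_t),
    TfLiteStatus (*prepare)(TfLiteContext*, TfLiteNode*),
    TfLiteStatus (*invoke)(TfLiteContext*, TfLiteNode*)) {
  TfLiteRegistration r = {};
  r.init = init;
  r.prepare = prepare;
  r.invoke = invoke;
  return r;
}
```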
/*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::SinEval); } TfLiteRegistration Register_COS() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::CosEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::CosEval); } TfLiteRegistration Register_LOG() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::LogEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::LogEval); } TfLiteRegistration Register_SQRT() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::SqrtEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::SqrtEval); } TfLiteRegistration Register_RSQRT() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::RsqrtEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + elementwise::ElementWiseAbsRsqrtInit, + elementwise::PrepareAbsRsqrt, + elementwise::RsqrtEval); } TfLiteRegistration Register_SQUARE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::SquareEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::SquareEval); } TfLiteRegistration Register_LOGICAL_NOT() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::LogicalNotEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp( + nullptr, elementwise::GenericPrepare, + elementwise::LogicalNotEval); } } // namespace micro } // namespace ops -} // namespace tflite +} // namespace tflite \ No newline at end of file diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc index 7e785f2f..0b64e89d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc @@ -80,13 +80,16 @@ void EvalUsingLookupTable(const OpData* data, const TfLiteEvalTensor* input, } TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = 
+ micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); // Use LUT to handle quantized elu path. @@ -97,7 +100,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { }; PopulateLookupTable(input, output, transform, data); } - + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -142,14 +146,7 @@ TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_ELU() { - return {/*init=*/EluInit, - /*free=*/nullptr, - /*prepare=*/EluPrepare, - /*invoke=*/EluEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(EluInit, EluPrepare, EluEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md new file mode 100644 index 00000000..b0c215fb --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md @@ -0,0 +1,11 @@ +# Info + +These are the Espressif chipset-specific replacement kernels. +The kernels call optimized or reference routines, depending on the selected optimization option. + +By default, optimizations are selected if available. +To change this behaviour, please make the appropriate `ESP-NN` menu selection after running: + +``` +idf.py menuconfig +``` diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc new file mode 100644 index 00000000..2f1ac58d --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc @@ -0,0 +1,202 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/add.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/add.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +#include + +#if ESP_NN +#include +#endif + +long long add_total_time = 0; + +namespace tflite { + +void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params, + const OpDataAdd* data, const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params; + SetActivationParams(data->output_activation_min_f32, + data->output_activation_max_f32, &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} + +TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteAddParams* params, const OpDataAdd* data, + const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, + TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params; + op_params.left_shift = data->left_shift; + op_params.input1_offset = data->input1_offset; + op_params.input1_multiplier = data->input1_multiplier; + op_params.input1_shift = data->input1_shift; + op_params.input2_offset = data->input2_offset; + op_params.input2_multiplier = data->input2_multiplier; + op_params.input2_shift = data->input2_shift; + op_params.output_offset = data->output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = data->output_shift; + SetActivationParams(data->output_activation_min, data->output_activation_max, + &op_params); + bool need_broadcast = reference_ops::ProcessBroadcastShapes( + tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), &op_params); + + switch (output->type) { + case kTfLiteInt8: { + if (need_broadcast) { + reference_integer_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { +#if ESP_NN + const int8_t *input1_data = tflite::micro::GetTensorData(input1); + const int8_t *input2_data = tflite::micro::GetTensorData(input2); + int8_t *out_data = tflite::micro::GetTensorData(output); + + esp_nn_add_elementwise_s8(input1_data, + input2_data, + 
data->input1_offset, + data->input2_offset, + data->input1_multiplier, + data->input2_multiplier, + data->input1_shift, + data->input2_shift, + data->left_shift, + out_data, + data->output_offset, + data->output_multiplier, + data->output_shift, + data->output_activation_min, + data->output_activation_max, + MatchingElementsSize(tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorShape(output)) + ); +#else + reference_integer_ops::Add( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + } + break; + } + case kTfLiteInt16: { + if (need_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + false); + } + break; + } + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(output->type), output->type); + return kTfLiteError; + } + + return kTfLiteOk; +} + +void* AddInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataAdd)); +} + +TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataAdd* data = static_cast(node->user_data); + + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, kAddInputTensor1); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, kAddInputTensor2); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kAddOutputTensor); + + long long start_time = esp_timer_get_time(); + + if (output->type == kTfLiteFloat32) { + EvalAdd(context, node, params, data, input1, input2, output); + } else if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { + TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data, + input1, input2, output)); + } else { + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(output->type), + output->type); + return kTfLiteError; + } + add_total_time += esp_timer_get_time() - start_time; + + return kTfLiteOk; +} + +TfLiteRegistration Register_ADD() { + return tflite::micro::RegisterOp(AddInit, AddPrepare, AddEval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc new file mode 100644 index 00000000..919dd006 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc @@ -0,0 +1,344 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
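add.cc also introduces the timing pattern every esp_nn kernel in this diff repeats: snapshot esp_timer_get_time() around the eval body and accumulate the delta into a file-scope counter (add_total_time here; conv_total_time, dc_total_time, and friends below). A hypothetical reporting helper over those globals; the helper itself is not part of the diff:

```
#include <cstdio>

// Counters defined across the esp_nn kernel sources in this diff.
extern long long add_total_time, conv_total_time, dc_total_time,
    fc_total_time, mul_total_time, pooling_total_time;

// Hypothetical helper: dump cumulative per-op time after an inference run.
void PrintKernelProfile() {
  printf("ADD             %10lld us\n", add_total_time);
  printf("CONV_2D         %10lld us\n", conv_total_time);
  printf("DEPTHWISE_CONV  %10lld us\n", dc_total_time);
  printf("FULLY_CONNECTED %10lld us\n", fc_total_time);
  printf("MUL             %10lld us\n", mul_total_time);
  printf("POOLING         %10lld us\n", pooling_total_time);
}
```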
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/conv.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#include "freertos/FreeRTOS.h" +#include + +#if ESP_NN +#include +#endif + + +long long conv_total_time = 0; + +namespace tflite { +namespace { + +struct NodeData { + OpDataConv op_data; +#if ESP_NN + int buffer_idx; +#endif +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(NodeData)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + NodeData* data = static_cast(node->user_data); + const auto& params = + *(static_cast(node->builtin_data)); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + const int input_width = input->dims->data[2]; + const int input_height = input->dims->data[1]; + const int filter_width = filter->dims->data[2]; + const int filter_height = filter->dims->data[1]; + const int output_width = output->dims->data[2]; + const int output_height = output->dims->data[1]; + + // Dynamically allocate per-channel quantization parameters. + const int num_channels = filter->dims->data[kConvQuantizedDimension]; + data->op_data.per_channel_output_multiplier = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + data->op_data.per_channel_output_shift = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + + // All per-channel quantized tensors need valid zero point and scale arrays. 
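The two persistent arrays allocated in Prepare hold one (multiplier, shift) pair per output channel, derived from input_scale * filter_scale[c] / output_scale. The real population happens inside CalculateOpDataConv; a sketch of the arithmetic, reusing the QuantizeMultiplierSketch from earlier:

```
#include <cstdint>

void QuantizeMultiplierSketch(double real_scale, int32_t* multiplier, int* shift);

// Sketch of per-channel requantization parameters; not the library routine.
void PopulatePerChannelSketch(const float* filter_scales, int num_channels,
                              float input_scale, float output_scale,
                              int32_t* per_channel_mult,
                              int32_t* per_channel_shift) {
  for (int c = 0; c < num_channels; ++c) {
    const double effective_scale =
        static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    int32_t mult;
    int shift;
    QuantizeMultiplierSketch(effective_scale, &mult, &shift);
    per_channel_mult[c] = mult;
    per_channel_shift[c] = shift;  // stored as int32_t, as in the arrays above
  }
}
```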
+ if (input->type == kTfLiteInt8) { + TF_LITE_ENSURE_EQ(context, filter->quantization.type, + kTfLiteAffineQuantization); + + const auto* affine_quantization = + static_cast(filter->quantization.params); + TFLITE_DCHECK(affine_quantization != nullptr); + TFLITE_DCHECK(affine_quantization->scale != nullptr); + TFLITE_DCHECK(affine_quantization->zero_point != nullptr); + + TF_LITE_ENSURE(context, + affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, + affine_quantization->zero_point->size); + } + + TF_LITE_ENSURE_STATUS(CalculateOpDataConv( + context, node, params, input_width, input_height, filter_width, + filter_height, output_width, output_height, input->type, &data->op_data)); + +#if ESP_NN + if (input->type == kTfLiteInt8) { + data_dims_t input_dims = { + .width = input_width, .height = input_height, + .channels = input->dims->data[3], 1 + }; + data_dims_t output_dims = { + .width = output_width, .height = output_height, + .channels = output->dims->data[3], 1 + }; + data_dims_t filter_dims = {.width = filter_width, .height = filter_height, 0, 0}; + conv_params_t conv_params = { + .in_offset = 0, .out_offset = 0, + .stride = {params.stride_width, params.stride_height}, + .padding = {data->op_data.padding.width, data->op_data.padding.height}, + .dilation = {0, 0}, .activation = {-128, 127} + }; + + int scratch_buf_size = esp_nn_get_conv_scratch_size( + &input_dims, &filter_dims, &output_dims, &conv_params); + if (scratch_buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, scratch_buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; + } + } +#endif + + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + + return kTfLiteOk; +} + +#if ESP_NN +// Fixed-point per-channel-quantization convolution Int8 function wrapper. +inline void EvalQuantizedPerChannel( + TfLiteContext* context, TfLiteNode* node, const TfLiteConvParams& params, + const NodeData& data, const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + + if (dilation_width_factor == 1 && dilation_height_factor == 1) { + // Get parameters. 
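The buffer_idx handling above is the standard TFLite Micro scratch-buffer handshake: size the buffer with esp_nn_get_conv_scratch_size and reserve it in the arena during Prepare, then resolve the index to a pointer during Eval. Reduced to its skeleton (NodeDataSketch mirrors the NodeData struct in this file):

```
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

struct NodeDataSketch {
  int buffer_idx;  // arena index, or -1 when no scratch is needed
};

// Prepare-side: reserve scratch space and remember its arena index.
TfLiteStatus RequestScratchSketch(TfLiteContext* context, NodeDataSketch* data,
                                  int scratch_buf_size) {
  if (scratch_buf_size > 0) {
    TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
        context, scratch_buf_size, &data->buffer_idx));
  } else {
    data->buffer_idx = -1;
  }
  return kTfLiteOk;
}

// Eval-side: resolve the index to a pointer (nullptr when unused).
void* GetScratchSketch(TfLiteContext* context, const NodeDataSketch& data) {
  return (data.buffer_idx > -1)
             ? context->GetScratchBuffer(context, data.buffer_idx)
             : nullptr;
}
```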
+ RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter); + RuntimeShape input_shape = tflite::micro::GetTensorShape(input); + RuntimeShape output_shape = tflite::micro::GetTensorShape(output); + RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int32_t input_offset = -data.op_data.input_zero_point; + const int32_t output_offset = data.op_data.output_zero_point; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = data.op_data.padding.width; + const int pad_height = data.op_data.padding.height; + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Set min and max value of the output. + const int32_t activation_min = data.op_data.output_activation_min; + const int32_t activation_max = data.op_data.output_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(activation_min, activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + + if (tflite::micro::GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + void *scratch_buf = NULL; + if (data.buffer_idx > -1) { + scratch_buf = context->GetScratchBuffer(context, data.buffer_idx); + } + esp_nn_set_conv_scratch_buf(scratch_buf); + + const int input_size = input_width * input_height * input_depth; + const int output_size = output_width * output_height * output_depth; + + data_dims_t input_dims = { + .width = input_width, .height = input_height, + .channels = input_depth, 1 + }; + data_dims_t output_dims = { + .width = output_width, .height = output_height, + .channels = output_depth, 1 + }; + data_dims_t filter_dims = {.width = filter_width, .height = filter_height, 0, 0}; + conv_params_t conv_params = { + .in_offset = input_offset, .out_offset = output_offset, + .stride = {stride_width, stride_height}, + .padding = {pad_width, pad_height}, + .dilation = {0, 0}, + .activation = {activation_min, activation_max} + }; + quant_data_t quant_data = { + .shift = data.op_data.per_channel_output_shift, + .mult = data.op_data.per_channel_output_multiplier + }; + + for (int i_batch = 0; i_batch < batch_size; i_batch++) { + esp_nn_conv_s8(&input_dims, input_data + i_batch * input_size, + &filter_dims, tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorData(bias), + &output_dims, output_data + i_batch * output_size, + &conv_params, &quant_data); + } + } else { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + 
tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} +#endif + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor) + : nullptr; + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); + + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto& params = + *(reinterpret_cast(node->builtin_data)); + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = *(static_cast(node->user_data)); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + long long start_time = esp_timer_get_time(); + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: { + tflite::reference_ops::Conv( + ConvParamsFloat(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + tflite::micro::GetTensorShape(nullptr), nullptr); + break; + } + case kTfLiteInt8: { +#if ESP_NN + EvalQuantizedPerChannel(context, node, params, data, input, filter, + bias, output); +#else + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + } + case kTfLiteUInt8: { + //EvalQuantized + reference_ops::Conv(ConvParamsQuantized(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + tflite::micro::GetTensorShape(nullptr), nullptr, + nullptr); + break; + } + default: + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + long long time_this_instance = esp_timer_get_time() - start_time; + conv_total_time += time_this_instance; + //printf("time this instance: %llu\n", time_this_instance / 1000); + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_CONV_2D() { + return tflite::micro::RegisterOp(Init, Prepare, Eval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc new file mode 100644 index 00000000..a2460248 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc @@ -0,0 +1,346 @@ +/* 
Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/depthwise_conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#include "freertos/FreeRTOS.h" +#include + +#if ESP_NN +#include +#endif + +long long dc_total_time = 0; + +namespace tflite { +namespace { + +struct NodeData { + OpDataConv op_data; +#if ESP_NN + int buffer_idx; +#endif +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(NodeData)); +} + +#if ESP_NN +inline void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + const TfLiteDepthwiseConvParams& params, + const NodeData& data, + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, + const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + + if (dilation_width_factor == 1 && dilation_height_factor == 1) { + // Get parameters. 
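As in conv.cc, the fast path is fenced off behind dilation_width_factor == 1 && dilation_height_factor == 1: the esp-nn kernels do not implement dilation (the {0, 0} they receive marks the field unused), so dilated graphs fall through to the reference per-channel kernel. For intuition, dilation only changes how far apart the filter taps land, enlarging the effective window:

```
// A 3x3 filter with dilation 2 samples a 5x5 window: taps sit at
// in_x = out_x * stride - pad + dilation * filter_x.
int EffectiveFilterSize(int filter_size, int dilation) {
  return dilation * (filter_size - 1) + 1;  // e.g. 2 * (3 - 1) + 1 == 5
}
```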
+ RuntimeShape input_shape = tflite::micro::GetTensorShape(input); + RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter); + RuntimeShape output_shape = tflite::micro::GetTensorShape(output); + RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias); + + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = -data.op_data.input_zero_point; + const int32_t output_offset = data.op_data.output_zero_point; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = data.op_data.padding.width; + const int pad_height = data.op_data.padding.height; + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Set min and max value of the output. + const int32_t activation_min = data.op_data.output_activation_min; + const int32_t activation_max = data.op_data.output_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(activation_min, activation_max); + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + if (tflite::micro::GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + const int input_size = input_width * input_height * input_depth; + const int output_size = output_width * output_height * output_depth; + void *scratch_buf = NULL; + if (data.buffer_idx > -1) { + scratch_buf = context->GetScratchBuffer(context, data.buffer_idx); + } + + esp_nn_set_depthwise_conv_scratch_buf(scratch_buf); + + data_dims_t input_dims = { + .width = input_width, .height = input_height, + .channels = input_depth, 1 + }; + data_dims_t output_dims = { + .width = output_width, .height = output_height, + .channels = output_depth, 1 + }; + data_dims_t filter_dims = {.width = filter_width, .height = filter_height, 0, 0}; + dw_conv_params_t conv_params = { + .in_offset = input_offset, .out_offset = output_offset, + .ch_mult = depth_multiplier, + .stride = {stride_width, stride_height}, + .padding = {pad_width, pad_height}, .dilation = {0, 0}, + .activation = {activation_min, activation_max} + }; + quant_data_t quant_data = { + .shift = data.op_data.per_channel_output_shift, + .mult = data.op_data.per_channel_output_multiplier + }; + + for (int i_batch = 0; i_batch < batch_size; i_batch++) { + esp_nn_depthwise_conv_s8(&input_dims, input_data + i_batch * input_size, + &filter_dims, tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorData(bias), + &output_dims, output_data + i_batch * output_size, + &conv_params, &quant_data); + } + } else { + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + 
tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} +#endif + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + NodeData* data = static_cast(node->user_data); + const TfLiteDepthwiseConvParams& params = + *(static_cast(node->builtin_data)); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + const int input_width = input->dims->data[2]; + const int input_height = input->dims->data[1]; + const int filter_width = filter->dims->data[2]; + const int filter_height = filter->dims->data[1]; + const int output_width = output->dims->data[2]; + const int output_height = output->dims->data[1]; + + // Dynamically allocate per-channel quantization parameters. + const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; + data->op_data.per_channel_output_multiplier = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + data->op_data.per_channel_output_shift = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + + // All per-channel quantized tensors need valid zero point and scale arrays. 
+ if (input->type == kTfLiteInt8) { + TF_LITE_ENSURE_EQ(context, filter->quantization.type, + kTfLiteAffineQuantization); + + const auto* affine_quantization = + static_cast(filter->quantization.params); + TFLITE_DCHECK(affine_quantization != nullptr); + TFLITE_DCHECK(affine_quantization->scale != nullptr); + TFLITE_DCHECK(affine_quantization->zero_point != nullptr); + + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]); + + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, + affine_quantization->zero_point->size); + } + + TF_LITE_ENSURE_STATUS(CalculateOpDataDepthwiseConv( + context, node, params, input_width, input_height, filter_width, + filter_height, output_width, output_height, input->type, &data->op_data)); + +#if ESP_NN + if (input->type == kTfLiteInt8) { + data_dims_t input_dims = { + .width = input_width, .height = input_height, + .channels = input->dims->data[3], 1 + }; + data_dims_t output_dims = { + .width = output_width, .height = output_height, + .channels = output->dims->data[3], 1 + }; + data_dims_t filter_dims = {.width = filter_width, .height = filter_height, 0, 0}; + dw_conv_params_t conv_params = { + .in_offset = 0, .out_offset = 0, + .ch_mult = params.depth_multiplier, + .stride = {params.stride_width, params.stride_height}, + .padding = {data->op_data.padding.width, data->op_data.padding.height}, + .dilation = {0, 0}, .activation = {-128, 127} + }; + + int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size( + &input_dims, &filter_dims, &output_dims, &conv_params); + if (scratch_buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, scratch_buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; + } + } +#endif + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(bias); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto& params = + *(reinterpret_cast(node->builtin_data)); + const NodeData& data = *(static_cast(node->user_data)); + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor) + : nullptr; + + long long start_time = esp_timer_get_time(); + switch (input->type) { // Already know in/out types are same. 
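Note the sign convention used throughout these kernels: Prepare stores in_offset = -input_zero_point so the inner loops can add the offset instead of subtracting the zero point. The underlying affine mapping, round-tripped here with illustrative arithmetic rather than values from a real model:

```
#include <cmath>
#include <cstdint>

// real = scale * (q - zero_point); the kernels pass in_offset = -zero_point
// and compute (q + in_offset). Sketch of both directions for int8:
float DequantizeSketch(int8_t q, float scale, int32_t zero_point) {
  return scale * static_cast<float>(static_cast<int32_t>(q) - zero_point);
}

int8_t QuantizeSketch(float x, float scale, int32_t zero_point) {
  int32_t q = static_cast<int32_t>(std::lround(x / scale)) + zero_point;
  if (q < -128) q = -128;  // clamp to the int8 range
  if (q > 127) q = 127;
  return static_cast<int8_t>(q);
}
```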
+ case kTfLiteFloat32: + tflite::reference_ops::DepthwiseConv( + DepthwiseConvParamsFloat(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + case kTfLiteInt8: +#if ESP_NN + EvalQuantizedPerChannel(context, node, params, data, input, filter, bias, + output); +#else + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + case kTfLiteUInt8: + //EvalQuantized(context, node, params, &data, input, filter, bias, output); + reference_ops::DepthwiseConv( + DepthwiseConvParamsQuantized(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + default: + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + long long time_this_instance = esp_timer_get_time() - start_time; + dc_total_time += time_this_instance; + // printf("time this instance: %llu\n", time_this_instance / 1000); + + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_DEPTHWISE_CONV_2D() { + return tflite::micro::RegisterOp(Init, Prepare, Eval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc new file mode 100644 index 00000000..484cffb6 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc @@ -0,0 +1,191 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/fully_connected.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#if ESP_NN +#include +#endif + +#include + +long long fc_total_time = 0; + +namespace tflite { +namespace { + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, + sizeof(OpDataFullyConnected)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto* data = static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = micro_context->AllocateTempInputTensor( + node, kFullyConnectedWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor( + node, kFullyConnectedOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected( + context, params->activation, input->type, + input, filter, bias, output, data)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + if (bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(bias); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto* params = + static_cast(node->builtin_data); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); + const TfLiteEvalTensor* bias = + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); + + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = + *(static_cast(node->user_data)); + + long long start_time = esp_timer_get_time(); + // Checks in Prepare ensure input, output and filter types are all the same. 
+ switch (input->type) { + case kTfLiteFloat32: { + tflite::reference_ops::FullyConnected( + FullyConnectedParamsFloat(params->activation), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + + case kTfLiteInt8: { + const int32_t* bias_data = + nullptr != bias ? tflite::micro::GetTensorData(bias) + : nullptr; +#if ESP_NN + const RuntimeShape& filter_shape = tflite::micro::GetTensorShape(filter); + const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = output_shape.Dims(0); + const int output_depth = output_shape.Dims(1); + TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2)); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + const int8_t *filter_data = tflite::micro::GetTensorData(filter); + + for (int b = 0; b < batches; ++b) { + esp_nn_fully_connected_s8(input_data, -data.input_zero_point, + accum_depth, + filter_data, -data.filter_zero_point, + bias_data, output_data, output_depth, + data.output_zero_point, + data.output_shift, data.output_multiplier, + data.output_activation_min, + data.output_activation_max); + input_data += accum_depth; + output_data += output_depth; + } +#else + tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), bias_data, + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + } + + case kTfLiteUInt8: { + tflite::reference_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: { + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + } + fc_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_FULLY_CONNECTED() { + return tflite::micro::RegisterOp(Init, Prepare, Eval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc new file mode 100644 index 00000000..02413f5c --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc @@ -0,0 +1,124 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
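The int8 branch above walks the batch dimension with flat pointers: each input row is accum_depth elements, each output row output_depth elements, against an [output_depth, accum_depth] weight matrix. The same computation in plain unquantized form, for orientation only (a sketch, not the library kernel):

```
// Reference fully connected: output[b, o] = bias[o] + sum_a input[b, a] * w[o, a].
void FullyConnectedRefSketch(const float* input, const float* weights,
                             const float* bias, float* output, int batches,
                             int accum_depth, int output_depth) {
  for (int b = 0; b < batches; ++b) {
    for (int o = 0; o < output_depth; ++o) {
      float acc = bias ? bias[o] : 0.0f;
      for (int a = 0; a < accum_depth; ++a) {
        acc += input[b * accum_depth + a] * weights[o * accum_depth + a];
      }
      output[b * output_depth + o] = acc;
    }
  }
}
```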
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/mul.h" + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h" +#include "tensorflow/lite/kernels/internal/reference/mul.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +#if ESP_NN +#include +#endif + +#include + +long long mul_total_time = 0; + +namespace tflite { +#if ESP_NN +void MulEvalQuantized(TfLiteContext* context, TfLiteNode* node, + const OpDataMul* data, const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, + TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params = {}; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + op_params.float_activation_max = data->output_activation_max_f32; + op_params.input1_offset = -data->input1_zero_point; + op_params.input2_offset = -data->input2_zero_point; + op_params.output_offset = data->output_zero_point; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = data->output_shift; + + bool need_broadcast = reference_ops::ProcessBroadcastShapes( + tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), &op_params); + + if (need_broadcast) { + reference_integer_ops::BroadcastMul4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + const int8_t *input1_data = tflite::micro::GetTensorData(input1); + const int8_t *input2_data = tflite::micro::GetTensorData(input2); + int8_t *out_data = tflite::micro::GetTensorData(output); + + esp_nn_mul_elementwise_s8(input1_data, input2_data, op_params.input1_offset, + op_params.input2_offset, out_data, op_params.output_offset, + op_params.output_multiplier, op_params.output_shift, + op_params.quantized_activation_min, op_params.quantized_activation_max, + MatchingElementsSize(tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorShape(output))); + } +} +#endif + +TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataMul* data = static_cast(node->user_data); + + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, kMulInput1Tensor); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, kMulInput2Tensor); + 
TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kMulOutputTensor); + + long long start_time = esp_timer_get_time(); + switch (input1->type) { + case kTfLiteInt8: +#if ESP_NN + MulEvalQuantized(context, node, data, input1, input2, output); +#else + EvalMulQuantizedReference(context, node, data, input1, input2, output); +#endif + break; + case kTfLiteInt32: + EvalMulQuantizedReference(context, node, data, input1, input2, output); + break; + case kTfLiteFloat32: + EvalMulFloatReference(context, node, params, data, input1, input2, + output); + break; + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(input1->type), input1->type); + return kTfLiteError; + } + mul_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +TfLiteRegistration Register_MUL() { + return tflite::micro::RegisterOp(MulInit, MulPrepare, MulEval); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc new file mode 100644 index 00000000..b450929e --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc @@ -0,0 +1,231 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
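For the non-broadcast int8 case, esp_nn_mul_elementwise_s8 (used in MulEvalQuantized above) multiplies offset-corrected values and requantizes once per element. Spelled out as a plain loop; a sketch reusing the multiplier helper sketched earlier, not the optimized routine:

```
#include <algorithm>
#include <cstdint>

int32_t MultiplyByQuantizedMultiplierSketch(int32_t x, int32_t multiplier, int shift);

void MulElementwiseRefSketch(const int8_t* in1, const int8_t* in2,
                             int32_t in1_offset, int32_t in2_offset, int8_t* out,
                             int32_t out_offset, int32_t out_mult,
                             int32_t out_shift, int32_t act_min, int32_t act_max,
                             int size) {
  for (int i = 0; i < size; ++i) {
    const int32_t raw = (in1[i] + in1_offset) * (in2[i] + in2_offset);
    int32_t res =
        MultiplyByQuantizedMultiplierSketch(raw, out_mult, out_shift) + out_offset;
    res = std::min(std::max(res, act_min), act_max);
    out[i] = static_cast<int8_t>(res);
  }
}
```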
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc
new file mode 100644
index 00000000..b450929e
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc
@@ -0,0 +1,231 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/pooling.h"
+
+#if ESP_NN
+#include <esp_nn.h>
+#endif
+
+#include <esp_timer.h>
+
+long long pooling_total_time = 0;
+
+namespace tflite {
+
+namespace {
+#if ESP_NN
+void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node,
+                          const TfLitePoolParams* params, const OpDataPooling* data,
+                          const TfLiteEvalTensor* input,
+                          TfLiteEvalTensor* output) {
+
+  const int stride_height = params->stride_height;
+  const int stride_width = params->stride_width;
+  const int filter_height = params->filter_height;
+  const int filter_width = params->filter_width;
+  const int activation_min = data->activation_min;
+  const int activation_max = data->activation_max;
+  const int pad_height = data->padding.height;
+  const int pad_width = data->padding.width;
+
+  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
+  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
+  TFLITE_DCHECK_LE(activation_min, activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
+  int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
+
+  const int input_size = input_width * input_height * depth;
+  const int output_size = output_width * output_height * depth;
+
+  if (depth % 4 == 0) { // S3 version only supports channels multiple of 4
+    for (int batch = 0; batch < batches; ++batch) {
+      esp_nn_avg_pool_s8(input_data, input_width, input_height,
+                         output_data, output_width, output_height,
+                         stride_width, stride_height,
+                         filter_width, filter_height,
+                         pad_width, pad_height,
+                         activation_min, activation_max, depth);
+      input_data += input_size;
+      output_data += output_size;
+    }
+  } else {
+    for (int batch = 0; batch < batches; ++batch) {
+      esp_nn_avg_pool_s8_ansi(input_data, input_width, input_height,
+                              output_data, output_width, output_height,
+                              stride_width, stride_height,
+                              filter_width, filter_height,
+                              pad_width, pad_height,
+                              activation_min, activation_max, depth);
+      input_data += input_size;
+      output_data += output_size;
+    }
+  }
+}
+
+void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                      TfLitePoolParams* params, const OpDataPooling* data,
+                      const TfLiteEvalTensor* input, TfLiteEvalTensor* output) {
+
+  const int stride_height = params->stride_height;
+  const int stride_width = params->stride_width;
+  const int filter_height = params->filter_height;
+  const int filter_width = params->filter_width;
+  const int activation_min = data->activation_min;
+  const int activation_max = data->activation_max;
+  const int pad_height = data->padding.height;
+  const int pad_width = data->padding.width;
+
+  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
+  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
+  TFLITE_DCHECK_LE(activation_min, activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  const int8_t *input_data = tflite::micro::GetTensorData<int8_t>(input);
+  int8_t *output_data = tflite::micro::GetTensorData<int8_t>(output);
+
+  const int input_size = input_width * input_height * depth;
+  const int output_size = output_width * output_height * depth;
+  if (depth % 4 == 0) { // S3 version only supports channels multiple of 4
+    for (int batch = 0; batch < batches; ++batch) {
+      esp_nn_max_pool_s8(input_data, input_width, input_height,
+                         output_data, output_width, output_height,
+                         stride_width, stride_height,
+                         filter_width, filter_height,
+                         pad_width, pad_height,
+                         activation_min, activation_max, depth);
+      input_data += input_size;
+      output_data += output_size;
+    }
+  } else {
+    for (int batch = 0; batch < batches; ++batch) {
+      esp_nn_max_pool_s8_ansi(input_data, input_width, input_height,
+                              output_data, output_width, output_height,
+                              stride_width, stride_height,
+                              filter_width, filter_height,
+                              pad_width, pad_height,
+                              activation_min, activation_max, depth);
+      input_data += input_size;
+      output_data += output_size;
+    }
+  }
+}
+#endif
+
+TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataPooling* data =
+      static_cast<const OpDataPooling*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      micro::GetEvalInput(context, node, kPoolingInputTensor);
+  TfLiteEvalTensor* output =
+      micro::GetEvalOutput(context, node, kPoolingOutputTensor);
+
+  long long start_time = esp_timer_get_time();
+  // Inputs and outputs share the same type, guaranteed by the converter.
+  switch (input->type) {
+    case kTfLiteFloat32:
+      AveragePoolingEvalFloat(context, node, params, data, input, output);
+      break;
+    case kTfLiteInt8:
+#if ESP_NN
+      AverageEvalQuantized(context, node, params, data, input, output);
+#else
+      AveragePoolingEvalQuantized(context, node, params, data, input, output);
+#endif
+      break;
+    default:
+      TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
+                         TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  pooling_total_time += esp_timer_get_time() - start_time;
+  return kTfLiteOk;
+}
+
+TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataPooling* data =
+      static_cast<const OpDataPooling*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      micro::GetEvalInput(context, node, kPoolingInputTensor);
+  TfLiteEvalTensor* output =
+      micro::GetEvalOutput(context, node, kPoolingOutputTensor);
+
+  long long start_time = esp_timer_get_time();
+  switch (input->type) {
+    case kTfLiteFloat32:
+      MaxPoolingEvalFloat(context, node, params, data, input, output);
+      break;
+    case kTfLiteInt8:
+#if ESP_NN
+      MaxEvalQuantized(context, node, params, data, input, output);
+#else
+      MaxPoolingEvalQuantized(context, node, params, data, input, output);
+#endif
+      break;
+    default:
+      TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
+                         TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  pooling_total_time += esp_timer_get_time() - start_time;
+  return kTfLiteOk;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataPooling));
+}
+
+}  // namespace
+
+TfLiteRegistration Register_AVERAGE_POOL_2D() {
+  return tflite::micro::RegisterOp(Init, PoolingPrepare, AverageEval);
+}
+
+TfLiteRegistration Register_MAX_POOL_2D() {
+  return tflite::micro::RegisterOp(Init, PoolingPrepare, MaxEval);
+}
+
+}  // namespace tflite
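Editor's note: both pooling paths above hand ESP-NN one batch at a time and pick the ESP32-S3 assembly kernel only when the channel count is a multiple of 4. A rough sketch of what esp_nn_avg_pool_s8 computes per output element, assuming NHWC layout and the round-to-nearest, padding-excluding average used by the TFLM reference kernel (all names are hypothetical):

```cpp
#include <algorithm>
#include <cstdint>

// Average over one pooling window for a single channel; cells that fall in
// the padding region are skipped and excluded from the divisor.
int8_t AvgPoolOneOutput(const int8_t* input, int in_w, int in_h, int depth,
                        int out_x, int out_y, int ch, int stride, int filter,
                        int pad, int act_min, int act_max) {
  int32_t sum = 0;
  int count = 0;
  for (int fy = 0; fy < filter; ++fy) {
    for (int fx = 0; fx < filter; ++fx) {
      const int in_x = out_x * stride - pad + fx;
      const int in_y = out_y * stride - pad + fy;
      if (in_x < 0 || in_x >= in_w || in_y < 0 || in_y >= in_h) continue;
      sum += input[(in_y * in_w + in_x) * depth + ch];
      ++count;
    }
  }
  // Round to nearest, away from zero, then clamp to the activation range.
  int32_t avg = (count > 0)
                    ? (sum > 0 ? (sum + count / 2) / count
                               : (sum - count / 2) / count)
                    : 0;
  avg = std::min(std::max(avg, act_min), act_max);
  return static_cast<int8_t>(avg);
}
```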
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/softmax.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/softmax.cc
new file mode 100644
index 00000000..9a967839
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/softmax.cc
@@ -0,0 +1,208 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/kernels/softmax.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+#include "freertos/FreeRTOS.h"
+#include <esp_timer.h>
+
+#if ESP_NN
+#include <esp_nn.h>
+#endif
+
+long long softmax_total_time = 0;
+
+namespace tflite {
+namespace {
+// Softmax parameter data that persists in user_data
+const int kInt16LUTArraySize = 513;
+
+struct NodeData {
+  SoftmaxParams op_data;
+#if ESP_NN
+  int buffer_idx;
+#endif
+};
+
+static void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(NodeData));
+}
+
+void SoftmaxQuantized(TfLiteContext* context, const TfLiteEvalTensor* input,
+                      TfLiteEvalTensor* output, const NodeData* data) {
+  if (input->type == kTfLiteInt8) {
+    if (output->type == kTfLiteInt16) {
+      tflite::reference_ops::Softmax(
+          data->op_data, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+    } else {
+#if ESP_NN
+      const int32_t input_beta_multiplier = data->op_data.input_multiplier;
+      const int32_t input_beta_left_shift = data->op_data.input_left_shift;
+      const int diff_min = data->op_data.diff_min;
+      const RuntimeShape input_shape = tflite::micro::GetTensorShape(input);
+      const RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
+      const int trailing_dim = input_shape.DimensionsCount() - 1;
+      const int outer_size =
+          MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+      const int depth =
+          MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+      const int8_t *in_ptr = tflite::micro::GetTensorData<int8_t>(input);
+      int8_t *out_ptr = tflite::micro::GetTensorData<int8_t>(output);
+      void *scratch_buf = NULL;
+      if (data->buffer_idx > -1) {
+        scratch_buf = context->GetScratchBuffer(context, data->buffer_idx);
+      }
+      esp_nn_set_softmax_scratch_buf(scratch_buf);
+      esp_nn_softmax_s8(in_ptr, outer_size, depth, input_beta_multiplier,
+                        input_beta_left_shift, diff_min, out_ptr);
+#else
+      tflite::reference_ops::Softmax(
+          data->op_data, tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int8_t>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+#endif
+    }
+  } else {
+    tflite::reference_ops::SoftmaxInt16(
+        data->op_data, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int16_t>(input),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int16_t>(output));
+  }
+}
+
+static TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
+  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  NodeData data = *static_cast<NodeData*>(node->user_data);
+
+  long long start_time = esp_timer_get_time();
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      tflite::reference_ops::Softmax(
+          data.op_data,
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
+    }
+    break;
+    case kTfLiteInt8:
+    case kTfLiteInt16: {
+      SoftmaxQuantized(context, input, output, &data);
+    }
+    break;
+    default:
+      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                         TfLiteTypeGetName(input->type), input->type);
+      return kTfLiteError;
+  }
+  softmax_total_time += esp_timer_get_time() - start_time;
+  return kTfLiteOk;
+}
+
+static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  NodeData* data = static_cast<NodeData*>(node->user_data);
+  // Only allocate LUTs for KTfLiteInt16 data type
+  if (input->type == kTfLiteInt16) {
+    void* raw_exp_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
+    data->op_data.exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
+    void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
+    data->op_data.one_over_one_plus_x_lut =
+        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
+  }
+
+  if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context,
+                   input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  }
+
+  // Populate LUT if required
+  if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    // exp LUT only used on negative values
+    // we consider exp(-10.0) is insignificant to accumulation
+    gen_lut<float, int16_t, int16_t>(
+        [](float value) { return std::exp(value); }, -10.0f, 0.0f, -1.0f, 1.0f,
+        data->op_data.exp_lut);
+    gen_lut<float, int16_t, int16_t>(
+        [](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f, -1.0f,
+        1.0f, data->op_data.one_over_one_plus_x_lut);
+    data->op_data.zero_point = output->params.zero_point;
+    data->op_data.scale = output->params.scale;
+  }
+
+  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  auto ret_val =
+      CalculateSoftmaxParams(context, input, output, params, &data->op_data);
+
+#if ESP_NN
+  if (output->type == kTfLiteInt8 && input->type == kTfLiteInt8) {
+    const int32_t input_width = input->dims->data[1];
+    const int32_t input_height = input->dims->data[2];
+    int scratch_buf_size = esp_nn_get_softmax_scratch_size(input_width,
+                                                           input_height);
+    if (scratch_buf_size > 0) {
+      TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
+          context, scratch_buf_size, &data->buffer_idx));
+    }
+  }
+#endif
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return ret_val;
+}
+
+}  // namespace
+
+TfLiteRegistration Register_SOFTMAX() {
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc
index 253769a3..ae26f636 100644
---
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc @@ -27,11 +27,15 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); @@ -40,6 +44,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -65,14 +72,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_EXP() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc index bea3ca7e..4b105bf6 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc @@ -84,22 +84,31 @@ TfLiteStatus VerifyTensorDim(TfLiteContext* context, const TfLiteTensor* input, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - const TfLiteTensor* axis; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kAxisTensor, &axis)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* axis = + micro_context->AllocateTempInputTensor(node, kAxisTensor); + TF_LITE_ENSURE(context, axis != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); output->type = input->type; if (IsDynamicTensor(axis)) { TF_LITE_KERNEL_LOG(context, "DynamicTensor is not yet supported by Expand_Dims."); return kTfLiteError; } - return VerifyTensorDim(context, input, axis, output); + TF_LITE_ENSURE_OK(context, VerifyTensorDim(context, input, axis, output)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(output); + 
return kTfLiteOk; } template @@ -137,14 +146,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_EXPAND_DIMS() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc index 18de3458..9f438b89 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc @@ -65,14 +65,18 @@ constexpr int kValueTensor = 1; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + // Ensure inputs and outputs exist. - const TfLiteTensor* dims; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kDimsTensor, &dims)); - const TfLiteTensor* value; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kValueTensor, &value)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* dims = + micro_context->AllocateTempInputTensor(node, kDimsTensor); + TF_LITE_ENSURE(context, dims != nullptr); + TfLiteTensor* value = + micro_context->AllocateTempInputTensor(node, kValueTensor); + TF_LITE_ENSURE(context, value != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); // The value tensor must be a scalar. TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0); @@ -90,6 +94,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, EnsureEq(context, output->dims, dims)); } + micro_context->DeallocateTempTfLiteTensor(dims); + micro_context->DeallocateTempTfLiteTensor(value); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -128,14 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_FILL() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor.cc index b8be1cf0..6b2a4cc2 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor.cc @@ -42,14 +42,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace floor TfLiteRegistration Register_FLOOR() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/floor::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, nullptr, floor::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc index 006296a9..333a1eba 100644 --- 
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc @@ -31,22 +31,28 @@ constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor1, &input1)); - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor2, &input2)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); + TF_LITE_ENSURE(context, input1 != nullptr); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); + TF_LITE_ENSURE(context, input2 != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -117,14 +123,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_FLOOR_DIV() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc index 42f2236c..9bb49497 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc @@ -36,22 +36,28 @@ constexpr int kOutputTensor = 0; // OLD-TODO(b/117912880): Support quantization. 
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor1, &input1)); - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor2, &input2)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); + TF_LITE_ENSURE(context, input1 != nullptr); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); + TF_LITE_ENSURE(context, input2 != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -115,14 +121,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_FLOOR_MOD() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc index a9f35dba..a083edd7 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -35,6 +35,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TFLITE_DCHECK(node->user_data != nullptr);
   TFLITE_DCHECK(node->builtin_data != nullptr);
@@ -42,23 +44,30 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const auto params =
       static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
-  const TfLiteTensor* input =
-      GetInput(context, node, kFullyConnectedInputTensor);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor);
   TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* filter =
-      GetInput(context, node, kFullyConnectedWeightsTensor);
+  TfLiteTensor* filter = micro_context->AllocateTempInputTensor(
+      node, kFullyConnectedWeightsTensor);
   TF_LITE_ENSURE(context, filter != nullptr);
-  const TfLiteTensor* bias =
-      GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor);
+  TfLiteTensor* bias =
+      micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor);
+  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(
+      node, kFullyConnectedOutputTensor);
   TF_LITE_ENSURE(context, output != nullptr);
-  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
-  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
-                     "Hybrid models are not supported on TFLite Micro.");
-  return CalculateOpDataFullyConnected(context, params->activation, input->type,
-                                       input, filter, bias, output, data);
+  TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected(
+                                 context, params->activation, input->type,
+                                 input, filter, bias, output, data));
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(filter);
+  if (bias != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(bias);
+  }
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return kTfLiteOk;
 }
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
@@ -114,6 +123,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
+    case kTfLiteInt16: {
+      const int64_t* bias_data =
+          nullptr != bias ?
+              tflite::micro::GetTensorData<int64_t>(bias)
+              : nullptr;
+
+      tflite::reference_integer_ops::FullyConnected(
+          FullyConnectedParamsQuantized(data),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<int16_t>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<int8_t>(filter),
+          tflite::micro::GetTensorShape(bias), bias_data,
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+      break;
+    }
+
     default: {
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                          TfLiteTypeGetName(input->type), input->type);
@@ -126,14 +152,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 TfLiteRegistration Register_FULLY_CONNECTED() {
-  return {/*init=*/Init,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
 }
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.h
index e1215da6..93026cd5 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.h
@@ -1,4 +1,4 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -81,6 +81,24 @@ inline TfLiteRegistration Register_FULLY_CONNECTED_INT8() {
 }
 #endif
+
+#if defined(CMSIS_NN)
+// Returns a TfLiteRegistration struct for kernel variant that only supports
+// int16.
+TfLiteRegistration Register_FULLY_CONNECTED_INT16();
+
+#else
+// Note that while this block gets used for both reference and optimized
+// kernels that do not have any specialized implementations, the only goal
+// here is to define a fallback implementation that allows reference kernels
+// to still be used from applications that call a more specific kernel variant.
+
+inline TfLiteRegistration Register_FULLY_CONNECTED_INT16() {
+  return Register_FULLY_CONNECTED();
+}
+
+#endif
+
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_MICRO_KERNELS_FULLY_CONNECTED_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc
index db050626..6035efa7 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc
@@ -97,19 +97,23 @@ TfLiteStatus Gather(const TfLiteGatherParams* params,
 }
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   const auto* params =
       reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
-  const TfLiteTensor* input;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
-  const TfLiteTensor* coords;
-  TF_LITE_ENSURE_OK(context,
-                    GetInputSafe(context, node, kInputPositions, &coords));
-  TfLiteTensor* output;
-  TF_LITE_ENSURE_OK(context,
-                    GetOutputSafe(context, node, kOutputTensor, &output));
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* coords =
+      micro_context->AllocateTempInputTensor(node, kInputPositions);
+  TF_LITE_ENSURE(context, coords != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
   switch (coords->type) {
     case kTfLiteInt32:
       break;
@@ -176,6 +180,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   for (int i = axis + 1; i < input->dims->size; ++i) {
     output_shape->data[output_index++] = input->dims->data[i];
   }
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(coords);
+  micro_context->DeallocateTempTfLiteTensor(output);
+
   return kTfLiteOk;
 }
@@ -209,14 +218,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 TfLiteRegistration Register_GATHER() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
 }
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc
index 393b931f..eaa1abca 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc
@@ -28,16 +28,19 @@ constexpr int kOutputTensor = 0;
 constexpr int MAX_INDICES_ND = 5;
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* params;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kParams, &params));
-  const TfLiteTensor* indices;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kIndices, &indices));
-  TfLiteTensor* output;
-  TF_LITE_ENSURE_OK(context,
-                    GetOutputSafe(context, node, kOutputTensor, &output));
+  TfLiteTensor* params = micro_context->AllocateTempInputTensor(node, kParams);
+  TF_LITE_ENSURE(context, params != nullptr);
+  TfLiteTensor* indices =
+      micro_context->AllocateTempInputTensor(node, kIndices);
+  TF_LITE_ENSURE(context, indices != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
   switch (params->type) {
     case kTfLiteFloat32:
@@ -98,6 +101,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_shape->data[output_index++] = params->dims->data[i];
   }
   output_shape->size = output_index;
+
+  micro_context->DeallocateTempTfLiteTensor(params);
+  micro_context->DeallocateTempTfLiteTensor(indices);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
@@ -188,14 +195,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 TfLiteRegistration Register_GATHER_ND() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
 }
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish.cc
index 060dfc14..055e12e6 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish.cc
@@ -68,14 +68,8 @@ TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 TfLiteRegistration Register_HARD_SWISH() {
-  return {/*init=*/HardSwishInit,
-          /*free=*/nullptr,
-          /*prepare=*/tflite::HardSwishPrepare,
-          /*invoke=*/HardSwishEval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(HardSwishInit, tflite::HardSwishPrepare,
+                                   HardSwishEval);
 }
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc
index ee32e0dc..8f846522 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc
@@ -32,13 +32,17 @@
 const int kHardSwishInputTensor = 0;
 const int kHardSwishOutputTensor = 0;
 TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TFLITE_DCHECK(node->user_data != nullptr);
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input = GetInput(context, node, kHardSwishInputTensor);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kHardSwishInputTensor);
   TF_LITE_ENSURE(context, input != nullptr);
-  TfLiteTensor* output = GetOutput(context, node, kHardSwishOutputTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kHardSwishOutputTensor);
   TF_LITE_ENSURE(context, output != nullptr);
   if (input->type == kTfLiteInt8) {
@@ -73,6 +77,9 @@ TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
         &params->reluish_multiplier_fixedpoint_int16);
   }
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
+
   return kTfLiteOk;
 }
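Editor's note: every Prepare() hunk in this diff follows the same temp-tensor lifecycle, so a condensed sketch of the pattern may help when reviewing the remaining kernels. This assumes the TFLM micro headers used above; the op and its tensor indices are hypothetical.

```cpp
TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  MicroContext* micro_context = GetMicroContext(context);

  // Full TfLiteTensor structs are now temporary allocations owned by the
  // MicroContext instead of persistent lookups via GetInput()/GetOutput().
  TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
  TF_LITE_ENSURE(context, output != nullptr);

  // ... shape/type checks and op-data calculation go here ...
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

  // Every AllocateTemp* call must be paired with a deallocation before
  // Prepare() returns; KernelRunner::ValidateTempBufferDeallocated()
  // (added further below) enforces this in kernel tests.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}
```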
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc
index 2b98f117..39eca8b4 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_context.h"
 #include "tensorflow/lite/micro/micro_graph.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -50,36 +51,33 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, node->inputs->size > 0);
   // The first input is the condition.
-  const TfLiteTensor* cond;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond));
+  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
+  TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0);
+
+  TF_LITE_ENSURE(context, cond != nullptr);
   TF_LITE_ENSURE_EQ(context, cond->type, kTfLiteBool);
   TF_LITE_ENSURE_EQ(context, NumElements(cond), 1);
+  micro_context->DeallocateTempTfLiteTensor(cond);
+
   // The first input of the node is the condition. The rest of inputs are
   // passed to the branch subgraphs. Therefore, the number of subgraph inputs
   // will be the number of node inputs - 1.
   size_t num_inputs = node->inputs->size - 1;
   size_t num_outputs = node->outputs->size;
-  // Casting to TfliteIntArray is required since we are re-using
-  // GetExecutionPlan from TfLiteContext. On TFLM this method returns a
-  // MicroGraph.
-  // TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
-  MicroGraph* graph_info;
-  context->GetExecutionPlan(context,
-                            reinterpret_cast<TfLiteIntArray**>(&graph_info));
+  MicroGraph& graph_info = micro_context->graph();
   TF_LITE_ENSURE(context,
-                 op_data->then_subgraph_index < graph_info->NumSubgraphs());
+                 op_data->then_subgraph_index < graph_info.NumSubgraphs());
   TF_LITE_ENSURE(context,
-                 op_data->else_subgraph_index < graph_info->NumSubgraphs());
+                 op_data->else_subgraph_index < graph_info.NumSubgraphs());
-  TF_LITE_ENSURE_EQ(
-      context, num_inputs,
-      graph_info->NumSubgraphInputs(op_data->then_subgraph_index));
+  TF_LITE_ENSURE_EQ(context, num_inputs,
+                    graph_info.NumSubgraphInputs(op_data->then_subgraph_index));
   TF_LITE_ENSURE_EQ(
       context, num_outputs,
-      graph_info->NumSubgraphOutputs(op_data->then_subgraph_index));
+      graph_info.NumSubgraphOutputs(op_data->then_subgraph_index));
   return kTfLiteOk;
 }
@@ -87,80 +85,37 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-  const TfLiteTensor* cond;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond));
+  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
+  TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0);
+
+  TF_LITE_ENSURE(context, cond != nullptr);
   bool cond_value = cond->data.b[0];
+  micro_context->DeallocateTempTfLiteTensor(cond);
-  // Casting to TfliteIntArray is required since we are re-using
-  // GetExecutionPlan from TfLiteContext. On TFLM this method returns a
-  // MicroGraph.
-  // TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
-  MicroGraph* graph_info;
-  context->GetExecutionPlan(context,
-                            reinterpret_cast<TfLiteIntArray**>(&graph_info));
-
-  // Currently we copy the input / output between the subgraphs. This isn't
-  // optimized yet.
+  MicroGraph* graph_info = &micro_context->graph();
+  // Currently we copy the input / output between the subgraphs.
   int active_branch_subgraph_index =
       cond_value ? op_data->then_subgraph_index : op_data->else_subgraph_index;
-  for (size_t i = 0;
-       i < graph_info->NumSubgraphInputs(active_branch_subgraph_index); ++i) {
-    const TfLiteEvalTensor* input =
-        tflite::micro::GetEvalInput(context, node, i + 1);
-
-    TfLiteEvalTensor* subgraph_input =
-        graph_info->GetSubgraphInput(active_branch_subgraph_index, i);
-
-    // These checks must occur in Eval since TfLiteEvalTensors are not available
-    // during Prepare.
-    size_t input_bytes;
-    size_t subgraph_input_bytes;
-    TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(input, &input_bytes));
-    TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(
-                                   subgraph_input, &subgraph_input_bytes));
-    TF_LITE_ENSURE_TYPES_EQ(context, input->type, subgraph_input->type);
-    TF_LITE_ENSURE_EQ(context, input_bytes, subgraph_input_bytes);
-    memcpy(subgraph_input->data.raw, input->data.raw, input_bytes);
-  }
+  TF_LITE_ENSURE_OK(context,
+                    tflite::micro::CopyOpInputsToSubgraphInputs(
+                        context, node, graph_info, active_branch_subgraph_index,
+                        /*first_tensor_idx=*/1));
   TF_LITE_ENSURE_OK(context,
                     graph_info->InvokeSubgraph(active_branch_subgraph_index));
-  for (size_t i = 0;
-       i < graph_info->NumSubgraphOutputs(active_branch_subgraph_index); ++i) {
-    const TfLiteEvalTensor* output =
-        tflite::micro::GetEvalOutput(context, node, i);
+  TF_LITE_ENSURE_OK(
+      context, tflite::micro::CopySubgraphOutputsToOpOutputs(
+                   context, node, graph_info, active_branch_subgraph_index));
-    TfLiteEvalTensor* subgraph_output =
-        graph_info->GetSubgraphOutput(active_branch_subgraph_index, i);
-
-    // These checks must occur in Eval since TfLiteEvalTensors are not available
-    // during Prepare.
-    size_t output_bytes;
-    size_t subgraph_output_bytes;
-    TF_LITE_ENSURE_OK(context,
-                      TfLiteEvalTensorByteLength(output, &output_bytes));
-    TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(
-                                   subgraph_output, &subgraph_output_bytes));
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, subgraph_output->type);
-    TF_LITE_ENSURE_EQ(context, output_bytes, subgraph_output_bytes);
-    memcpy(output->data.raw, subgraph_output->data.raw, output_bytes);
-  }
   return kTfLiteOk;
 }
 }  // namespace.
 TfLiteRegistration Register_IF() {
-  return {/*init=*/Init,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
 }
 }  // namespace tflite
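Editor's note: the IF rewrite above replaces two hand-rolled memcpy loops with the shared helpers added to kernel_util (see the kernel_util.cc hunk below). Pulled out of context, the resulting control-flow recipe, which other subgraph-invoking ops can reuse, looks like this (graph_info and subgraph_idx are assumed to be in scope):

```cpp
// 1. Copy the op's inputs into the chosen subgraph, skipping tensor 0
//    (the condition), which is consumed by the IF op itself.
TF_LITE_ENSURE_OK(context,
                  tflite::micro::CopyOpInputsToSubgraphInputs(
                      context, node, graph_info, subgraph_idx,
                      /*first_tensor_idx=*/1));
// 2. Run the selected branch.
TF_LITE_ENSURE_OK(context, graph_info->InvokeSubgraph(subgraph_idx));
// 3. Copy the subgraph's outputs back into the op's outputs.
TF_LITE_ENSURE_OK(context,
                  tflite::micro::CopySubgraphOutputsToOpOutputs(
                      context, node, graph_info, subgraph_idx));
```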
#include "tensorflow/lite/micro/kernels/kernel_runner.h" +#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h" #include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_error_reporter.h" -#include "tensorflow/lite/micro/simple_memory_allocator.h" #include "tensorflow/lite/micro/test_helpers.h" namespace tflite { namespace micro { // TODO(b/161841696): Consider moving away from global arena buffers: -constexpr int KernelRunner::kNumScratchBuffers_; constexpr int KernelRunner::kKernelRunnerBufferSize_; uint8_t KernelRunner::kKernelRunnerBuffer_[]; KernelRunner::KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, - void* builtin_data) - : allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(), + void* builtin_data, TfLiteIntArray* intermediates) + : registration_(registration), + allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(), kKernelRunnerBuffer_, kKernelRunnerBufferSize_)), - registration_(registration), - tensors_(tensors), - mock_micro_graph_(allocator_) { + mock_micro_graph_(allocator_), + fake_micro_context_(tensors, allocator_, &mock_micro_graph_) { // Prepare TfLiteContext: - context_.impl_ = static_cast(this); - context_.ReportError = ReportOpError; + context_.impl_ = static_cast(&fake_micro_context_); + context_.ReportError = MicroContextReportOpError; context_.recommended_num_threads = 1; - context_.GetTensor = GetTensor; - context_.GetEvalTensor = GetEvalTensor; - context_.AllocatePersistentBuffer = AllocatePersistentBuffer; - context_.RequestScratchBufferInArena = RequestScratchBufferInArena; - context_.GetScratchBuffer = GetScratchBuffer; - context_.GetExecutionPlan = GetGraph; + context_.GetTensor = MicroContextGetTensor; + context_.GetEvalTensor = MicroContextGetEvalTensor; + context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; + context_.RequestScratchBufferInArena = + MicroContextRequestScratchBufferInArena; + context_.GetScratchBuffer = MicroContextGetScratchBuffer; + context_.recommended_num_threads = 0; // Prepare TfLiteNode: node_.inputs = inputs; node_.outputs = outputs; node_.builtin_data = builtin_data; + node_.intermediates = intermediates; +} + +bool KernelRunner::ValidateTempBufferDeallocated() { + return fake_micro_context_.IsAllTempTfLiteTensorDeallocated(); } TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data, @@ -61,9 +66,15 @@ TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data, if (registration_.init) { node_.user_data = registration_.init(&context_, init_data, length); } + + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); + if (registration_.prepare) { TF_LITE_ENSURE_STATUS(registration_.prepare(&context_, &node_)); } + + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); + return kTfLiteOk; } @@ -72,101 +83,11 @@ TfLiteStatus KernelRunner::Invoke() { MicroPrintf("TfLiteRegistration missing invoke function pointer!"); return kTfLiteError; } - return registration_.invoke(&context_, &node_); -} -TfLiteTensor* KernelRunner::GetTensor(const struct TfLiteContext* context, - int tensor_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); + TF_LITE_ENSURE_STATUS(registration_.invoke(&context_, &node_)); - return &runner->tensors_[tensor_index]; -} + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); -TfLiteEvalTensor* 
KernelRunner::GetEvalTensor( - const struct TfLiteContext* context, int tensor_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - TfLiteEvalTensor* eval_tensor = - reinterpret_cast(runner->allocator_->AllocateTemp( - sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor))); - TFLITE_DCHECK(eval_tensor != nullptr); - - // In unit tests, the TfLiteTensor pointer contains the source of truth for - // buffers and values: - eval_tensor->data = runner->tensors_[tensor_index].data; - eval_tensor->dims = runner->tensors_[tensor_index].dims; - eval_tensor->type = runner->tensors_[tensor_index].type; - return eval_tensor; -} - -void* KernelRunner::AllocatePersistentBuffer(TfLiteContext* context, - size_t bytes) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - return runner->allocator_->AllocateFromTail(bytes, - MicroArenaBufferAlignment()); -} - -TfLiteStatus KernelRunner::RequestScratchBufferInArena(TfLiteContext* context, - size_t bytes, - int* buffer_index) { - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(buffer_index != nullptr); - - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - if (runner->scratch_buffer_count_ == kNumScratchBuffers_) { - MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).", - kNumScratchBuffers_); - return kTfLiteError; - } - - // For tests, we allocate scratch buffers from the tail and keep them around - // for the lifetime of model. This means that the arena size in the tests will - // be more than what we would have if the scratch buffers could share memory. - runner->scratch_buffers_[runner->scratch_buffer_count_] = - runner->allocator_->AllocateFromTail(bytes, MicroArenaBufferAlignment()); - TFLITE_DCHECK(runner->scratch_buffers_[runner->scratch_buffer_count_] != - nullptr); - - *buffer_index = runner->scratch_buffer_count_++; - return kTfLiteOk; -} - -void* KernelRunner::GetScratchBuffer(TfLiteContext* context, int buffer_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - TFLITE_DCHECK(runner->scratch_buffer_count_ <= kNumScratchBuffers_); - if (buffer_index >= runner->scratch_buffer_count_) { - return nullptr; - } - return runner->scratch_buffers_[buffer_index]; -} - -void KernelRunner::ReportOpError(struct TfLiteContext* context, - const char* format, ...) { - va_list args; - va_start(args, format); - GetMicroErrorReporter()->Report(format, args); - va_end(args); -} - -TfLiteStatus KernelRunner::GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - *args = reinterpret_cast(runner->GetMockGraph()); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h index 03919f85..68722edb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h @@ -18,8 +18,9 @@ limitations under the License. 
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h" +#include "tensorflow/lite/micro/fake_micro_context.h" #include "tensorflow/lite/micro/mock_micro_graph.h" -#include "tensorflow/lite/micro/simple_memory_allocator.h" namespace tflite { namespace micro { @@ -34,7 +35,8 @@ class KernelRunner { public: KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, - TfLiteIntArray* outputs, void* builtin_data); + TfLiteIntArray* outputs, void* builtin_data, + TfLiteIntArray* intermediates = nullptr); // Calls init and prepare on the kernel (i.e. TfLiteRegistration) struct. Any // exceptions will be DebugLog'd and returned as a status code. @@ -50,40 +52,22 @@ class KernelRunner { // to stub out MicroGraph methods and track invocations on each subgraph. MockMicroGraph* GetMockGraph() { return &mock_micro_graph_; } - protected: - static TfLiteTensor* GetTensor(const struct TfLiteContext* context, - int tensor_index); - static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context, - int tensor_index); - static void* AllocatePersistentBuffer(TfLiteContext* context, size_t bytes); - static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* context, - size_t bytes, - int* buffer_index); - static void* GetScratchBuffer(TfLiteContext* context, int buffer_index); - static void ReportOpError(struct TfLiteContext* context, const char* format, - ...); - // This method matches GetExecutionPlan from TfLiteContext since TFLM reuses - // this method to get the MicroGraph from an operator context. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - static TfLiteStatus GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args); + // Returns true if all temp buffer in tests are deallocated. + // TODO(b/209453859): move this function to private after deallocation checks + // are enabled for all kernel tests. + bool ValidateTempBufferDeallocated(); private: - static constexpr int kNumScratchBuffers_ = 12; - static constexpr int kKernelRunnerBufferSize_ = 10000; static uint8_t kKernelRunnerBuffer_[kKernelRunnerBufferSize_]; - SimpleMemoryAllocator* allocator_ = nullptr; - const TfLiteRegistration& registration_; - TfLiteTensor* tensors_ = nullptr; - MockMicroGraph mock_micro_graph_; - TfLiteContext context_ = {}; TfLiteNode node_ = {}; + const TfLiteRegistration& registration_; - int scratch_buffer_count_ = 0; - uint8_t* scratch_buffers_[kNumScratchBuffers_]; + SimpleMemoryAllocator* allocator_; + MockMicroGraph mock_micro_graph_; + FakeMicroContext fake_micro_context_; }; } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc index 73069064..91c0bc91 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace micro { @@ -35,6 +36,21 @@ int ValidateTensorIndexing(const TfLiteContext* context, int index, } // namespace +TfLiteRegistration RegisterOp( + void* (*init)(TfLiteContext* context, const char* buffer, size_t length), + TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), + TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node)) { + return {/*init=*/init, + /*free=*/nullptr, + /*prepare=*/prepare, + /*invoke=*/invoke, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0, + /*registration_external=*/nullptr}; +} + // Returns a mutable tensor for a given input index. is_variable must be checked // during prepare when the full TfLiteTensor is available. TfLiteEvalTensor* GetMutableEvalInput(const TfLiteContext* context, @@ -119,13 +135,83 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context, return kTfLiteOk; } -// Returns a blob of payload data. The payload is subjected to interpretation by -// the OP. This is the recommended API for an OP to get an external context. OP -// should use this instead of directly calling GetExternalContext function in -// context. -void* GetExternalContext(TfLiteContext* context) { - return reinterpret_cast( - context->GetExternalContext(context, kTfLiteMaxExternalContexts)); +// Verify that both tensors have the same type and size, then return the size +// of both tensors in bytes if they are the same, or -1 if they are different. +size_t ValidateAndGetTensorSizes(const TfLiteEvalTensor* tensor1, + const TfLiteEvalTensor* tensor2) { + TFLITE_DCHECK(tensor1->type == tensor2->type); + size_t tensor1_size = 0; + size_t tensor2_size = 0; + TfLiteEvalTensorByteLength(tensor1, &tensor1_size); + TfLiteEvalTensorByteLength(tensor2, &tensor2_size); + return (tensor1_size == tensor2_size) ? 
tensor1_size : -1; +} + +TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, node->inputs->size == node->outputs->size); + for (int i = 0; i < node->inputs->size; i++) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + int bytes = ValidateAndGetTensorSizes(input, output); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(output->data.raw, input->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx, + int first_tensor_idx) { + TF_LITE_ENSURE(context, + static_cast<size_t>(node->inputs->size - first_tensor_idx) == + graph_info->NumSubgraphInputs(subgraph_idx)); + for (int i = 0; i < node->inputs->size - first_tensor_idx; i++) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i + first_tensor_idx); + TfLiteEvalTensor* subgraph_input = + graph_info->GetSubgraphInput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(input, subgraph_input); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(subgraph_input->data.raw, input->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx) { + TF_LITE_ENSURE(context, static_cast<size_t>(node->outputs->size) == + graph_info->NumSubgraphInputs(subgraph_idx)); + for (int i = 0; i < node->outputs->size; i++) { + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + TfLiteEvalTensor* subgraph_input = + graph_info->GetSubgraphInput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(output, subgraph_input); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(subgraph_input->data.raw, output->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx) { + TF_LITE_ENSURE(context, static_cast<size_t>(node->outputs->size) == + graph_info->NumSubgraphOutputs(subgraph_idx)); + for (int i = 0; i < node->outputs->size; i++) { + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + TfLiteEvalTensor* subgraph_output = + graph_info->GetSubgraphOutput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(output, subgraph_output); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(output->data.raw, subgraph_output->data.raw, bytes); + } + return kTfLiteOk; } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h index 1bd266d1..d6f20c72 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h @@ -22,10 +22,16 @@ limitations under the License. 
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/micro_context.h" namespace tflite { namespace micro { +TfLiteRegistration RegisterOp( + void* (*init)(TfLiteContext* context, const char* buffer, size_t length), + TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node), + TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node)); + // Returns a mutable tensor for a given input index. is_variable must be checked // during prepare when the full TfLiteTensor is available. TfLiteEvalTensor* GetMutableEvalInput(const TfLiteContext* context, @@ -39,19 +45,33 @@ const TfLiteEvalTensor* GetEvalInput(const TfLiteContext* context, TfLiteEvalTensor* GetEvalOutput(const TfLiteContext* context, const TfLiteNode* node, int index); -// Returns data for a TfLiteEvalTensor struct. +// Returns data for a TfLiteEvalTensor struct that are expected to exist. template T* GetTensorData(TfLiteEvalTensor* tensor) { - return tensor != nullptr ? reinterpret_cast(tensor->data.raw) : nullptr; + TFLITE_DCHECK(tensor != nullptr); + return reinterpret_cast(tensor->data.raw); } -// Returns const data for a TfLiteEvalTensor struct. +// Returns const data for a TfLiteEvalTensor struct that are expected to exist. template const T* GetTensorData(const TfLiteEvalTensor* tensor) { TFLITE_DCHECK(tensor != nullptr); return reinterpret_cast(tensor->data.raw); } +// Returns data for a TfLiteEvalTensor struct that could be null. +template +T* GetOptionalTensorData(TfLiteEvalTensor* tensor) { + return tensor == nullptr ? nullptr : reinterpret_cast(tensor->data.raw); +} + +// Returns const data for a TfLiteEvalTensor struct that could be null. +template +const T* GetOptionalTensorData(const TfLiteEvalTensor* tensor) { + return tensor == nullptr ? nullptr + : reinterpret_cast(tensor->data.raw); +} + // Returns the shape of a TfLiteEvalTensor struct. const RuntimeShape GetTensorShape(const TfLiteEvalTensor* tensor); @@ -69,23 +89,33 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context, TfLiteTensor* tensor, TfLiteEvalTensor* eval_tensor); -// Returns a blob of payload data. The payload is subjected to interpretation by -// the OP. This is the recommended API for an OP to get an external context. OP -// should use this instead of directly calling GetExternalContext function in -// context. Example usage: -// -// An application can set an external context through interpreter as below -// interpreter->SetMicroExternalContext(pointer_to_your_payload); -// -// Inside an OP that needs this payload, it get the payload pointer by: -// Prepare(TfliteContext * context) { -// ... -// payload_ptr = -// reinterpret_cast(GetMicroExternalContext(context)) -// ... -// } -// -void* GetMicroExternalContext(TfLiteContext* context); +// Copy all op input tensors to op output tensors. Requires all op input tensor +// shapes and types to be identical to op output tensor shapes and types. +TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node); + +// Copy all op input tensors to subgraph input tensors. Requires all op input +// tensor shapes and types to be identical to subgraph input tensor shapes and +// types. +TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx, + int first_tensor_idx); + +// Copy all op output tensors to subgraph input tensors. 
Requires all op output +// tensor shapes and types to be identical to subgraph input tensor shapes and +// types. +TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx); + +// Copy all subgraph output tensors to op outputs. Requires all subgraph output +// tensor shapes and types to be identical to op output tensor shapes and types. +TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx); } // namespace micro } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc index 926f1ffb..2b2a27bf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc @@ -36,15 +36,18 @@ constexpr int kTensorShapeRank = 4; enum { kBatchRank = 0, kHeightRank, kWidthRank, kChannelRank }; TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + auto* params = static_cast(node->builtin_data); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), kTensorShapeRank); TF_LITE_ENSURE_EQ(context, NumDimensions(output), kTensorShapeRank); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -82,6 +85,9 @@ TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) { output->dims->data[kWidthRank] = out_width; output->dims->data[kChannelRank] = channels_out; + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; } @@ -130,14 +136,7 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_L2_POOL_2D() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/L2Prepare, - /*invoke=*/L2Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, L2Prepare, L2Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc index 930710d4..45858e78 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc @@ -49,11 +49,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* 
input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) <= 4); TF_LITE_ENSURE(context, @@ -69,6 +72,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Our implementations don't currently support activations. TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -132,14 +137,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace l2norm TfLiteRegistration Register_L2NORM_REF() { - return {/*init=*/l2norm::Init, - /*free=*/nullptr, - /*prepare=*/l2norm::Prepare, - /*invoke=*/l2norm::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(l2norm::Init, l2norm::Prepare, l2norm::Eval); } TfLiteRegistration Register_L2_NORMALIZATION() { return Register_L2NORM_REF(); } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu.cc index 70ee3856..96c1b1b1 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu.cc @@ -88,14 +88,8 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration Register_LEAKY_RELU() { - return {/*init=*/LeakyReluInit, - /*free=*/nullptr, - /*prepare=*/LeakyReluPrepare, - /*invoke=*/LeakyReluEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(LeakyReluInit, LeakyReluPrepare, + LeakyReluEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc index 21cc99fc..3d1ffebb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc @@ -30,13 +30,16 @@ const int kOutputTensor = 0; TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { @@ -62,6 +65,9 @@ TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context, data->output_shift_identity = static_cast(output_shift_identity); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git 
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc index 5443c914..5fd87612 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc @@ -43,13 +43,16 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE(context, HaveSameShapes(input, output)); @@ -89,6 +92,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { data->depth = static_cast(input_shape.Dims(trailing_dim)); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -137,14 +142,7 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_LOG_SOFTMAX() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/LogSoftmaxPrepare, - /*invoke=*/LogSoftmaxEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, LogSoftmaxPrepare, LogSoftmaxEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logical.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logical.cc index e2d2b5f8..c85e0c5b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logical.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logical.cc @@ -34,29 +34,11 @@ TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_LOGICAL_OR() { - // Init, Free, Prepare, Eval are satisfying the Interface required by - // TfLiteRegistration. - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/LogicalOrEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, nullptr, LogicalOrEval); } TfLiteRegistration Register_LOGICAL_AND() { - // Init, Free, Prepare, Eval are satisfying the Interface required by - // TfLiteRegistration. 
- return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/nullptr, - /*invoke=*/LogicalAndEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, nullptr, LogicalAndEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic.cc index 77f94ec0..f8ac1c23 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic.cc @@ -106,13 +106,6 @@ TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_LOGISTIC() { - return {/*init=*/LogisticInit, - /*free=*/nullptr, - /*prepare=*/LogisticPrepare, - /*invoke=*/LogisticEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(LogisticInit, LogisticPrepare, LogisticEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc index 05765ad4..a79fd6bb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc @@ -32,9 +32,13 @@ const int kLogisticOutputTensor = 0; TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context, TfLiteNode* node, OpDataLogistic* data) { - const TfLiteTensor* input = GetInput(context, node, kLogisticInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kLogisticInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kLogisticOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kLogisticOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -55,6 +59,53 @@ TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context, data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31); } + + if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // See comments in TanhPrepare about requiring zero_point==0 + // and a power-of-two ("POT") scale. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + bool param_scale_pot = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + param_scale_pot &= (data->input_left_shift == 0); + + if (param_scale_pot) { + data->input_multiplier = 0; + } else { + // Calculate multiplier to change input scale to 1/(3*4096) + // as required by the table lookup. 
+ // In this scaling +/-2^17 represents +/-10.7 + double multiplier = + static_cast(input->params.scale) * 4096.0 * 3.0; + + data->input_left_shift = 0; + + while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) { + data->input_left_shift++; + multiplier = multiplier * 2.0; + } + + data->input_multiplier = static_cast(multiplier); + } + + int output_scale_log2_rounded; + TF_LITE_ENSURE( + context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); + TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, + -kOutputFractionalBits); + } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.cc new file mode 100644 index 00000000..f157a8d0 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.cc @@ -0,0 +1,2955 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/micro/kernels/lstm_eval.h" + +#include +#include +#include +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/micro_tensor_utils.h" +namespace tflite { +namespace { + +void ComputeRowSums( + int32_t* input_to_input_row_sums, int32_t* input_to_forget_row_sums, + int32_t* input_to_cell_row_sums, int32_t* input_to_output_row_sums, + int32_t* aux_input_to_input_row_sums, int32_t* aux_input_to_forget_row_sums, + int32_t* aux_input_to_cell_row_sums, int32_t* aux_input_to_output_row_sums, + int32_t* recurrent_to_input_row_sums, int32_t* recurrent_to_forget_row_sums, + int32_t* recurrent_to_cell_row_sums, int32_t* recurrent_to_output_row_sums, + int32_t* projection_weights_row_sums, int32_t* row_sums, int n_cell, + int n_input, int n_aux_input, int n_output, + const int8_t* input_to_input_weights_ptr, + const int8_t* input_to_forget_weights_ptr, + const int8_t* input_to_cell_weights_ptr, + const int8_t* input_to_output_weights_ptr, + const int8_t* aux_input_to_input_weights_ptr, + const int8_t* aux_input_to_forget_weights_ptr, + const int8_t* aux_input_to_cell_weights_ptr, + const int8_t* aux_input_to_output_weights_ptr, + const int8_t* recurrent_to_input_weights_ptr, + const int8_t* recurrent_to_forget_weights_ptr, + const int8_t* recurrent_to_cell_weights_ptr, + const int8_t* recurrent_to_output_weights_ptr, + const int8_t* projection_weights_ptr, bool use_cifg, + const float* aux_input_ptr) { + // Compute the row sums for dequantization + if (!use_cifg) { + micro_tensor_utils::ReductionSumVector( + 
input_to_input_weights_ptr, input_to_input_row_sums, n_cell, n_input); + } + micro_tensor_utils::ReductionSumVector( + input_to_forget_weights_ptr, input_to_forget_row_sums, n_cell, n_input); + micro_tensor_utils::ReductionSumVector( + input_to_cell_weights_ptr, input_to_cell_row_sums, n_cell, n_input); + micro_tensor_utils::ReductionSumVector( + input_to_output_weights_ptr, input_to_output_row_sums, n_cell, n_input); + + if (aux_input_ptr) { + if (!use_cifg) { + micro_tensor_utils::ReductionSumVector(aux_input_to_input_weights_ptr, + aux_input_to_input_row_sums, + n_cell, n_aux_input); + } + micro_tensor_utils::ReductionSumVector(aux_input_to_forget_weights_ptr, + aux_input_to_forget_row_sums, n_cell, + n_aux_input); + micro_tensor_utils::ReductionSumVector(aux_input_to_cell_weights_ptr, + aux_input_to_cell_row_sums, n_cell, + n_aux_input); + micro_tensor_utils::ReductionSumVector(aux_input_to_output_weights_ptr, + aux_input_to_output_row_sums, n_cell, + n_aux_input); + } + if (!use_cifg) { + micro_tensor_utils::ReductionSumVector(recurrent_to_input_weights_ptr, + recurrent_to_input_row_sums, n_cell, + n_output); + } + micro_tensor_utils::ReductionSumVector(recurrent_to_forget_weights_ptr, + recurrent_to_forget_row_sums, n_cell, + n_output); + micro_tensor_utils::ReductionSumVector(recurrent_to_cell_weights_ptr, + recurrent_to_cell_row_sums, n_cell, + n_output); + micro_tensor_utils::ReductionSumVector(recurrent_to_output_weights_ptr, + recurrent_to_output_row_sums, n_cell, + n_output); + + if (projection_weights_ptr != nullptr) { + micro_tensor_utils::ReductionSumVector( + projection_weights_ptr, projection_weights_row_sums, n_output, n_cell); + } +} + +// Calculates a single LSTM gate. +// +// Implements the following formula: (* is matrix multiply) +// gate = activate(W_input * input + W_aux * aux_input + +// W_peephole * cell + W_recurrent * prev_output + bias) +// with layer norm: +// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside +// +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +// +// Parameters: +// Input vectors (to LSTM): | Size: | Optional? +// input | n_input | +// aux_input | n_aux_input | y (bidir LSTM) +// Input vectors (persistent states): +// output_state | n_output | +// cell_state | n_cell | +// 'Constant' inputs: +// input_to_gate_weights | n_cell * n_input | +// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM) +// recurrent_to_gate_weights | n_cell * n_output | +// cell_to_gate_weights | n_cell | y (peephole) +// gate_bias | n_cell | +// layer_norm_coefficients | n_cell | y (layer norm) +// Output vector: +// gate | n_cell | +// Scalar parameters: +// n_batch - batch size / number of vectors +// n_input, n_aux_input, n_output, n_cell - size of vectors. +// activation - activation to use. +// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero. +// use_layer_norm - if doing layer norm LSTM. 
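// [Editor's illustration -- not part of the upstream patch.] A minimal,
// self-contained sketch of the gate formula documented above, restricted to
// n_batch == 1 with no aux input, no peephole and no layer norm. All names
// here are hypothetical; the real implementation is CalculateLstmGateFloat,
// which follows.
#include <cmath>
// gate = sigmoid(W_input * input + W_recurrent * prev_output + bias)
inline void SketchLstmGateFloat(const float* input, const float* input_weights,
                                const float* prev_output,
                                const float* recurrent_weights,
                                const float* bias, int n_input, int n_output,
                                int n_cell, float* gate) {
  for (int c = 0; c < n_cell; ++c) {
    float acc = bias[c];  // start each cell's accumulator from the gate bias
    for (int i = 0; i < n_input; ++i)
      acc += input_weights[c * n_input + i] * input[i];
    for (int o = 0; o < n_output; ++o)
      acc += recurrent_weights[c * n_output + o] * prev_output[o];
    gate[c] = 1.0f / (1.0f + std::exp(-acc));  // sigmoid activation
  }
}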
+inline void CalculateLstmGateFloat( + const float* input, const float* input_to_gate_weights, + const float* aux_input, const float* aux_input_to_gate_weights, + const float* output_state, const float* recurrent_to_gate_weights, + const float* cell_state, const float* cell_to_gate_weights, + const float* layer_norm_coefficients, const float* gate_bias, + const int n_batch, const int n_input, const int n_aux_input, + const int n_output, const int n_cell, + const TfLiteFusedActivation activation, float* gate, + const bool is_input_all_zeros, const bool is_aux_input_all_zeros) { + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with bias for regular lstm or initialize with + // zero for layer norm lstm. + if (use_layer_norm) { + memset(gate, 0, n_cell * n_batch * sizeof(float)); + } else { + micro_tensor_utils::VectorBatchVectorAssign(gate_bias, n_cell, n_batch, + gate); + } + // For each batch and cell: compute input_weight * input. + // Skip if input is all zeros. + if (!is_input_all_zeros) { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_gate_weights, n_cell, n_input, input, n_batch, gate); + } + // For each batch and cell: compute aux_input_weight * aux_input. + // Skip if auxiliary input is not available or all zeros. + if (!is_aux_input_all_zeros) { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_gate_weights, n_cell, n_aux_input, aux_input, n_batch, + gate); + } + // For each batch and cell: compute recurrent_weight * output_state. + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_gate_weights, n_cell, n_output, output_state, n_batch, gate); + // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM) + if (use_peephole) { + micro_tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_gate_weights, n_cell, cell_state, n_batch, gate); + } + // Do layer normalization (if layer norm LSTM) + if (use_layer_norm) { + micro_tensor_utils::MeanStddevNormalization(gate, gate, n_cell, n_batch); + micro_tensor_utils::VectorBatchVectorCwiseProduct( + layer_norm_coefficients, n_cell, gate, n_batch, gate); + micro_tensor_utils::VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate); + } + // Apply activation + micro_tensor_utils::ApplyActivationToVector(gate, n_batch * n_cell, + activation, gate); +} + +// Updates the LSTM cell state, used by both float and hybrid LSTM versions. +// +// Implements the following formula: +// cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate) +// +// With CIFG LSTM, input gate is replaced by (1-forget_gate). +// +// Parameters: +// - n_batch, n_cell: sizes of vectors +// - cell_state: input/output vector, size n_batch*n_cell +// - input_gate: input vector, size n_batch*n_cell. +// - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG +// - cell_gate: input vector, size n_batch*n_cell. +// - use_cifg: use 1-forget_gate instead of input_gate. +// - clip: if > 0, clip the resulting cell state to [-clip, +clip]. +void UpdateLstmCellFloat(int n_batch, int n_cell, float* cell_state, + const float* input_gate, float* forget_gate, + const float* cell_gate, bool use_cifg, float clip) { + micro_tensor_utils::VectorVectorCwiseProduct(forget_gate, cell_state, + n_batch * n_cell, cell_state); + + if (use_cifg) { + // With CIFG, input_gate = 1-forget_gate. 
Use the forget_gate array as + scratch, as input_gate array is not allocated in this case. (Be careful + not to write to the scratch before reading the forget gate data.) + float* scratch = forget_gate; + micro_tensor_utils::Sub1Vector(forget_gate, n_batch * n_cell, scratch); + micro_tensor_utils::VectorVectorCwiseProductAccumulate( + cell_gate, scratch, n_batch * n_cell, cell_state); + } else { + micro_tensor_utils::VectorVectorCwiseProductAccumulate( + cell_gate, input_gate, n_batch * n_cell, cell_state); + } + if (clip > 0.0f) { + micro_tensor_utils::CwiseClipping(cell_state, n_batch * n_cell, clip); + } +} + +// Calculates the output state tensor of an LSTM step. +// +// Implements the following formula: +// output_no_projection = output_gate .* activate(cell_state) +// (elementwise vector product) +// If no projection is used: +// output = output_state = output_no_projection +// With projection: +// output = output_state = clip(W*output_no_projection + bias) +// +// Output might have a different 'stride' than n_batch, so we need to copy. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - projection_weights, projection_weights_scale, projection_bias: +// constant inputs, describing projection matrix and bias. +// - proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contiguous. +// - scratch: scratch area, size n_batch*n_cell. +void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, + const float* cell_state, const float* output_gate, + TfLiteFusedActivation activation, + const float* projection_weights, + const float* projection_bias, + const float proj_clip, float* output_state, + float* scratch) { + micro_tensor_utils::ApplyActivationToVector(cell_state, n_batch * n_cell, + activation, scratch); + micro_tensor_utils::VectorVectorCwiseProduct(output_gate, scratch, + n_batch * n_cell, scratch); + + const bool use_projection = (projection_weights != nullptr); + const bool use_projection_bias = (projection_bias != nullptr); + + if (use_projection) { + if (use_projection_bias) { + micro_tensor_utils::VectorBatchVectorAssign(projection_bias, n_output, + n_batch, output_state); + } else { + memset(output_state, 0, n_batch * n_output * sizeof(float)); + } + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights, n_output, n_cell, scratch, n_batch, output_state); + if (proj_clip > 0.0f) { + micro_tensor_utils::CwiseClipping(output_state, n_batch * n_output, + proj_clip); + } + } else { + std::memcpy(output_state, scratch, n_batch * n_output * sizeof(float)); + } +} + +// Calculates a single LSTM gate, hybrid version. +// Implements the same functionality as CalculateLstmGateFloat. 
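// [Editor's illustration -- not part of the upstream patch.] The core of the
// "hybrid" scheme used by CalculateLstmGateHybrid below: int8 weights times
// int8-quantized inputs, accumulated exactly in int32, then rescaled to float
// by the product of the two quantization scales. Assumes symmetric
// quantization (zero points handled separately); the name is hypothetical.
#include <cstdint>
inline float SketchHybridDot(const int8_t* weights,
                             const int8_t* quantized_input, int n,
                             float weight_scale, float input_scale) {
  int32_t acc = 0;  // exact integer accumulation
  for (int i = 0; i < n; ++i)
    acc += static_cast<int32_t>(weights[i]) * quantized_input[i];
  return static_cast<float>(acc) * weight_scale * input_scale;
}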
+void CalculateLstmGateHybrid( + // Input and weights + const int8_t* input, const float* input_sf, const int32_t* input_zp, + const int8_t* input_to_gate_weights, + const uint8_t* input_to_gate_weights_ledger, + const float input_to_gate_weights_scale, int32_t* input_to_gate_row_sums, + // Aux input and weights + const int8_t* aux_input, const float* aux_input_sf, + const int32_t* aux_input_zp, const int8_t* aux_input_to_gate_weights, + const float aux_input_to_gate_weights_scale, + int32_t* aux_input_to_gate_row_sums, + // Output state and weights + const int8_t* output_state, const float* output_state_sf, + const int32_t* output_state_zp, const int8_t* recurrent_to_gate_weights, + const uint8_t* recurrent_to_gate_weights_ledger, + const float recurrent_to_gate_weights_scale, + int32_t* recurrent_to_gate_row_sums, + // Cell state and weights (peephole LSTM) + const float* cell_state, const int8_t* cell_to_gate_weights, + const float cell_to_gate_weights_scale, + // Layer normalization coefficients (layer norm LSTM) + gate bias + const float* layer_norm_coefficients, const float* gate_bias, + // Array sizes + const int n_batch, const int n_input, const int n_aux_input, + const int n_output, const int n_cell, + const TfLiteFusedActivation activation, + // Output + float* gate, + // Parameters for performance optimizations + const bool is_input_all_zeros, const bool is_aux_input_all_zeros, + const bool is_output_state_all_zeros, bool* compute_row_sums, + // Scratch arrays + float* scratch0, // size: n_batch + float* scratch1, // size: n_cell, only used if peephole LSTM + float* scales, // size: n_batch + int32_t* accum_scratch // For MatrixBatchVectorMultiplyAccumulate +) { + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with bias for regular lstm or initialize with + // zero for layer norm lstm. + if (use_layer_norm) { + memset(gate, 0, n_cell * n_batch * sizeof(float)); + } else { + micro_tensor_utils::VectorBatchVectorAssign(gate_bias, n_cell, n_batch, + gate); + } + // For each batch and cell: compute input_weight * input. + // Skip if input is all zeros. + if (!is_input_all_zeros) { + if (input_to_gate_weights_ledger != nullptr) { + for (int i = 0; i < n_batch; i++) { + scales[i] = input_to_gate_weights_scale * input_sf[i]; + } + micro_tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate( + input_to_gate_weights, input_to_gate_weights_ledger, n_cell, n_input, + input, scales, n_batch, gate); + + } else { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_gate_weights, n_cell, n_input, input, + input_to_gate_weights_scale, input_sf, n_batch, gate, + /*per_channel_scale=*/nullptr, input_zp, accum_scratch, + input_to_gate_row_sums, compute_row_sums, scratch0, nullptr); + } + } + // For each batch and cell: compute aux_input_weight * aux_input. + // Skip if auxiliary input is not available or all zeros. + if (!is_aux_input_all_zeros) { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_gate_weights, n_cell, n_aux_input, aux_input, + aux_input_to_gate_weights_scale, aux_input_sf, n_batch, gate, + /*per_channel_scale=*/nullptr, aux_input_zp, accum_scratch, + aux_input_to_gate_row_sums, compute_row_sums, scratch0, nullptr); + } + // For each batch and cell: compute recurrent_weight * output_state. + // Skip if output state is all zeros. 
+ if (!is_output_state_all_zeros) { + if (recurrent_to_gate_weights_ledger != nullptr) { + for (int i = 0; i < n_batch; i++) { + scales[i] = recurrent_to_gate_weights_scale * input_sf[i]; + } + micro_tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate( + recurrent_to_gate_weights, recurrent_to_gate_weights_ledger, n_cell, + n_output, output_state, scales, n_batch, gate); + } else { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_gate_weights, n_cell, n_output, output_state, + recurrent_to_gate_weights_scale, output_state_sf, n_batch, gate, + /*per_channel_scale=*/nullptr, output_state_zp, accum_scratch, + recurrent_to_gate_row_sums, compute_row_sums, scratch0, nullptr); + } + } + // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM) + if (use_peephole) { + float* recovered_cell_weights = scratch1; + micro_tensor_utils::VectorScalarMultiply(cell_to_gate_weights, n_cell, + cell_to_gate_weights_scale, + recovered_cell_weights); + micro_tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_cell_weights, n_cell, cell_state, n_batch, gate); + } + // Do layer normalization (if layer norm LSTM) + if (use_layer_norm) { + micro_tensor_utils::MeanStddevNormalization(gate, gate, n_cell, n_batch); + micro_tensor_utils::VectorBatchVectorCwiseProduct( + layer_norm_coefficients, n_cell, gate, n_batch, gate); + micro_tensor_utils::VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate); + } + // Apply activation + micro_tensor_utils::ApplyActivationToVector(gate, n_cell * n_batch, + activation, gate); +} + +// Calculates the output state tensor of an LSTM step. See Float version too. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - projection_weights, projection_weights_scale, projection_bias: +// constant inputs, describing projection matrix and bias. +// - proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contigous. +// - asymmetric_quantize_inputs: parameter to control quantization. +// - projection_weights_row_sums, compute_row_sums: Data for optimized +// MatrixBatchVectorMultiplyAccumulate. 
+// - scratch0: scratch area of size n_batch*n_cell +// - scratch1: scratch area of size n_batch*n_cell +// - scratch2: scratch area of size n_batch +// - scratch3: scratch area of size n_batch +// - scratch4: scratch area used by MatrixBatchVectorMultiplyAccumulate +// - scales: scratch area of size n_batch +void CalculateLstmOutputHybrid( + int n_batch, int n_cell, int n_output, const float* cell_state, + const float* output_gate, TfLiteFusedActivation activation, + const int8_t* projection_weights, const uint8_t* projection_weights_ledger, + float projection_weights_scale, const float* projection_bias, + const float proj_clip, float* output_state, bool asymmetric_quantize_inputs, + int32_t* projection_weights_row_sums, bool* compute_row_sums, + float* scratch0, int8_t* scratch1, float* scratch2, int32_t* scratch3, + int32_t* scratch4, float* scales) { + micro_tensor_utils::ApplyActivationToVector(cell_state, n_batch * n_cell, + activation, scratch0); + micro_tensor_utils::VectorVectorCwiseProduct(output_gate, scratch0, + n_batch * n_cell, scratch0); + + const bool use_projection = (projection_weights != nullptr); + const bool use_projection_bias = (projection_bias != nullptr); + + if (use_projection) { + if (use_projection_bias) { + micro_tensor_utils::VectorBatchVectorAssign(projection_bias, n_output, + n_batch, output_state); + } else { + memset(output_state, 0, n_batch * n_output * sizeof(float)); + } + if (!micro_tensor_utils::IsZeroVector(scratch0, n_batch * n_cell)) { + // Save quantization and matmul computation for all zero output. + micro_tensor_utils::BatchQuantizeFloats(scratch0, n_batch, n_cell, + scratch1, scratch2, scratch3, + asymmetric_quantize_inputs); + if (projection_weights_ledger != nullptr) { + for (int i = 0; i < n_batch; i++) { + scales[i] = projection_weights_scale * scratch2[i]; + } + micro_tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate( + projection_weights, projection_weights_ledger, n_output, n_cell, + scratch1, scales, n_batch, output_state); + } else { + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights, n_output, n_cell, scratch1, + projection_weights_scale, scratch2, n_batch, output_state, + /*per_channel_scale=*/nullptr, scratch3, scratch4, + projection_weights_row_sums, compute_row_sums, scratch2, nullptr); + } + } + if (proj_clip > 0.0f) { + micro_tensor_utils::CwiseClipping(output_state, n_batch * n_output, + proj_clip); + } + } else { + std::memcpy(output_state, scratch0, n_batch * n_output * sizeof(float)); + } +} + +// Calculates a single LSTM gate, int8x8_16 version. +// Implements the same functionality as CalculateLstmGateFloat. 
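// [Editor's note -- not part of the upstream patch.] The *_scale_a / *_scale_b
// pairs taken by the integer kernels below are assumed to follow the usual
// TFLite convention: an effective real scale factored into a Q31 fixed-point
// multiplier (scale_a, representing a value in [0.5, 1)) and a power-of-two
// exponent (scale_b). A double-precision reference of that rescaling step, for
// intuition only; the real kernels do this with integer-only arithmetic.
#include <cmath>
#include <cstdint>
inline int32_t SketchRescale(int32_t acc, int32_t scale_a, int32_t scale_b) {
  // real_scale = (scale_a / 2^31) * 2^scale_b
  const double real_scale =
      (static_cast<double>(scale_a) / (1ll << 31)) * std::pow(2.0, scale_b);
  return static_cast<int32_t>(std::lround(acc * real_scale));
}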
+void CalculateLstmGateInteger8x8_16( + // Input and weights + const int8_t* input, const int8_t* input_to_gate_weights, + const int32_t* input_to_gate_bias, const int32_t input_to_gate_scale_a, + const int32_t input_to_gate_scale_b, + // Output state and weights + const int8_t* output_state, const int8_t* recurrent_to_gate_weights, + const int32_t* recurrent_to_gate_bias, + const int32_t recurrent_to_gate_scale_a, + const int32_t recurrent_to_gate_scale_b, + // Cell state and weights + const int16_t* cell_state, const int16_t* cell_to_gate_weights, + const int32_t cell_to_gate_scale_a, const int32_t cell_to_gate_scale_b, + // Layer normalization parameters (layer norm LSTM) + const int16_t* layer_norm_coefficients, const int32_t* layer_norm_bias, + const int32_t layer_norm_input_scale_a, + const int32_t layer_norm_input_scale_b, + const int32_t layer_norm_variance_guard, + // Array sizes + const int n_batch, const int n_input, const int n_output, const int n_cell, + const TfLiteFusedActivation activation, + // Output + int16_t* gate, + // Parameters for performance optimizations + // Scratch arrays + int32_t* scratch5) { + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with zeros. Note that unlike float and hybrid + // versions, bias is only used in layer normalization. + memset(gate, 0, n_batch * n_cell * sizeof(int16_t)); + // For each batch and cell: compute input_weight * input. + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input, input_to_gate_bias, input_to_gate_weights, input_to_gate_scale_a, + input_to_gate_scale_b, n_batch, n_input, n_cell, 0, scratch5, gate, + nullptr); + // Note: no aux_input. + + // For each batch and cell: compute recurrent_weight * output_state. + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + output_state, recurrent_to_gate_bias, recurrent_to_gate_weights, + recurrent_to_gate_scale_a, recurrent_to_gate_scale_b, n_batch, n_output, + n_cell, 0, scratch5, gate, nullptr); + // For each batch and cell: compute cell_weight * cell_state (peephole LSTM) + if (use_peephole) { + micro_tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_gate_weights, n_output, cell_state, n_batch, + cell_to_gate_scale_a, cell_to_gate_scale_b, gate); + } + // Do layer normalization (if layer norm LSTM) + if (use_layer_norm) { + micro_tensor_utils::ApplyLayerNorm( + gate, layer_norm_coefficients, layer_norm_bias, + layer_norm_input_scale_a, layer_norm_input_scale_b, + layer_norm_variance_guard, n_batch, n_cell, gate); + } + // Apply activation + switch (activation) { + case kTfLiteActSigmoid: + micro_tensor_utils::ApplySigmoid(gate, n_batch, n_cell, gate); + break; + case kTfLiteActTanh: + micro_tensor_utils::ApplyTanh(3, gate, n_batch, n_cell, gate); + break; + default: + // Only Sigmoid or Tanh is used. + TFLITE_ASSERT_FALSE; + } +} + +// Updates the LSTM cell state, used by both integer LSTM versions. +// Also see UpdateLstmCellFloat. +// +// Parameters: +// - n_batch, n_cell: sizes of vectors +// - cell_state: input/output vector, size n_batch*n_cell +// - cell_state_scale: scaling factor of cell state. +// - input_gate: input vector, size n_batch*n_cell. +// - forget_gate: input/scratch vector, size n_batch*n_cell, always modified. +// - cell_gate: input vector, size n_batch*n_cell. +// - use_cifg: use 1-forget_gate instead of input_gate. +// - clip: if > 0, clip the resulting cell state to [-clip, +clip]. 
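// [Editor's note -- not part of the upstream patch.] On the shift amounts used
// below: the int16 gates are in Q0.15, so a product of two gates carries 30
// fractional bits. CwiseMul(..., 15, ...) drops the 15 extra bits when one
// operand is the cell state (keeping the cell state's own Q format), while
// CwiseMul(..., 30 + cell_state_scale, ...) maps a gate*gate product into the
// cell state's power-of-two scale; cell_state_scale is the (negative) log2 of
// that scale, e.g. -11 for a Q4.11 cell state.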
+void UpdateLstmCellInteger(int n_batch, int n_cell, int16_t* cell_state, + int32_t cell_state_scale, const int16_t* input_gate, + int16_t* forget_gate, const int16_t* cell_gate, + bool use_cifg, int16_t clip) { + // Use the forget_gate array as scratch, as input_gate array is not allocated + // in CIFG case. (Be careful not to write to the scratch before reading the + // forget gate data.) + int16_t* scratch = forget_gate; + + micro_tensor_utils::CwiseMul(forget_gate, cell_state, n_batch, n_cell, 15, + cell_state); + if (use_cifg) { + micro_tensor_utils::Sub1Vector(forget_gate, n_batch * n_cell, scratch); + micro_tensor_utils::CwiseMul(scratch, cell_gate, n_batch, n_cell, + 30 + cell_state_scale, scratch); + } else { + micro_tensor_utils::CwiseMul(input_gate, cell_gate, n_batch, n_cell, + 30 + cell_state_scale, scratch); + } + micro_tensor_utils::CwiseAdd(cell_state, scratch, n_batch, n_cell, + cell_state); + + if (clip > 0) { + micro_tensor_utils::CwiseClipping(cell_state, n_batch * n_cell, clip); + } +} + +// Calculates the output state tensor of an LSTM step. See Float and hybrid +// versions as well. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - cell_state_scale: scaling of cell_state. +// - hidden_scale_[a|b]: effective scale of cell_state.*output_gate +// - hidden_zp: zero_point for cell_state.*output_gate +// - projection_weights, proj_scale_[a|b], projection_bias: +// constant inputs, describing projection matrix and bias. +// - output_state_zp: zero point of output_state. (Input, calibrated value.) +// - quantized_proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contigous. +// - scratch0: scratch area of size n_batch*n_cell +// - scratch1: scratch area of size n_batch*n_cell +// - scratch2: scratch area used by MatrixBatchVectorMultiplyAccumulate +void CalculateLstmOutputInteger8x8_16( + int n_batch, int n_cell, int n_output, const int16_t* cell_state, + int32_t cell_state_scale, const int16_t* output_gate, + int32_t hidden_scale_a, int32_t hidden_scale_b, int32_t hidden_zp, + const int8_t* projection_weights, int32_t proj_scale_a, + int32_t proj_scale_b, const int32_t* projection_bias, + int32_t output_state_zp, int8_t quantized_proj_clip, int8_t* output_state, + int16_t* scratch0, int8_t* scratch1, int32_t* scratch2) { + // Note: unlike float/hybrid, the activation is always Tanh. + micro_tensor_utils::ApplyTanh(15 + cell_state_scale, cell_state, n_batch, + n_cell, scratch0); + micro_tensor_utils::CwiseMul(output_gate, scratch0, hidden_scale_a, + hidden_scale_b, n_batch, n_cell, hidden_zp, + scratch1); + + const bool use_projection = (projection_weights != nullptr); + + if (use_projection) { + // Note: no bias like in float/hybrid + memset(output_state, 0, n_batch * n_output * sizeof(int8_t)); + micro_tensor_utils::MatrixBatchVectorMultiplyAccumulate( + scratch1, projection_bias, projection_weights, proj_scale_a, + proj_scale_b, n_batch, n_cell, n_output, output_state_zp, scratch2, + output_state, nullptr); + if (quantized_proj_clip > 0) { + micro_tensor_utils::CwiseClipping(output_state, n_batch * n_output, + quantized_proj_clip); + } + } else { + std::memcpy(output_state, scratch1, n_batch * n_output * sizeof(int8_t)); + } +} + +// Calculates a single LSTM gate, int8x8_8 version. +// Implements the same functionality as CalculateLstmGateFloat. 
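// [Editor's illustration -- not part of the upstream patch.] The 8x8_8 gate
// below sums two independently rescaled matmul results with int16 saturation
// (see the TwoGateSaturatingAdd call); this is the bare saturating add it
// relies on, widened to int64 so the intermediate sum cannot overflow.
#include <algorithm>
#include <cstdint>
inline int16_t SketchSaturatingAdd(int32_t a, int32_t b) {
  const int64_t sum = static_cast<int64_t>(a) + b;
  return static_cast<int16_t>(
      std::min<int64_t>(32767, std::max<int64_t>(-32768, sum)));
}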
+void CalculateLstmGateInteger8x8_8( + // Inputs and weights + const int8_t* input, int32_t input_zp, const int8_t* input_to_gate_weight, + const int32_t input_to_gate_scale_a, const int32_t input_to_gate_scale_b, + const int32_t input_times_weights_scale_a, + const int32_t input_times_weights_scale_b, + const int32_t input_times_weights_zp, + // Output state and weights + const int8_t* output_state, const int32_t output_state_zp, + const int8_t* recurrent_to_gate_weight, + const int32_t recurrent_to_gate_scale_a, + const int32_t recurrent_to_gate_scale_b, + const int32_t output_state_times_weights_scale_a, + const int32_t output_state_times_weights_scale_b, + const int32_t output_state_times_weights_zp, + // Layer normalization parameters (layer norm LSTM) + const int16_t* layer_norm_gate_weight, + const int32_t layer_norm_gate_scale_a, + const int32_t layer_norm_gate_scale_b, const int32_t* gate_bias, + // Array sizes + const int n_batch, const int n_input, const int n_output, const int n_cell, + const TfLiteFusedActivation activation, + // Output + int16_t* gate, + // Scratch arrays, both sized n_batch*n_cell + int8_t* scratch0, int8_t* scratch1) { + // Multiply input * input_weights => scratch0 + micro_tensor_utils::MatrixBatchVectorMultiply( + input, input_zp, input_to_gate_weight, input_to_gate_scale_a, + input_to_gate_scale_b, n_batch, n_input, n_cell, scratch0, + input_times_weights_zp); + // Multiply output_state * recurrent_weights => scratch1 + micro_tensor_utils::MatrixBatchVectorMultiply( + output_state, output_state_zp, recurrent_to_gate_weight, + recurrent_to_gate_scale_a, recurrent_to_gate_scale_b, n_batch, n_output, + n_cell, scratch1, output_state_times_weights_zp); + // Add scratch0 + scratch1 => gate + micro_tensor_utils::TwoGateSaturatingAdd( + scratch0, input_times_weights_zp, scratch1, output_state_times_weights_zp, + input_times_weights_scale_a, input_times_weights_scale_b, + output_state_times_weights_scale_a, output_state_times_weights_scale_b, + n_batch, n_cell, gate); + // Apply layer normalization. + micro_tensor_utils::ApplyLayerNormFloat( + gate, layer_norm_gate_weight, layer_norm_gate_scale_a, + layer_norm_gate_scale_b, gate_bias, n_batch, n_cell, gate); + // Apply activation. + switch (activation) { + case kTfLiteActSigmoid: + micro_tensor_utils::ApplySigmoidFloat(gate, n_batch, n_cell, gate); + break; + case kTfLiteActTanh: + micro_tensor_utils::ApplyTanhFloat(gate, n_batch, n_cell, -12, gate); + break; + default: + // Only Sigmoid or Tanh is used. + TFLITE_ASSERT_FALSE; + } +} + +// Calculates the output state tensor of an LSTM step. See Float and hybrid +// versions as well. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - projection_weights, proj_scale_[a|b], projection_bias: +// constant inputs, describing projection matrix and bias. +// - output_state_zp: zero point of the output state. +// - quantized_proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contigous. 
+// - scratch: scratch area of size n_batch*n_cell +void CalculateLstmOutputInteger8x8_8( + int n_batch, int n_cell, int n_output, const int16_t* cell_state, + const int16_t* output_gate, const int8_t* projection_weights, + int32_t proj_scale_a, int32_t proj_scale_b, const int32_t* projection_bias, + int32_t output_state_zp, int32_t quantized_proj_clip, int8_t* output_state, + int16_t* scratch) { + // Note: unlike float/hybrid, the activation is always Tanh. + micro_tensor_utils::ApplyTanhFloat(cell_state, n_batch, n_cell, -15, scratch); + micro_tensor_utils::CwiseMul(output_gate, scratch, n_batch, n_cell, + 15 + 15 - 15, scratch); + // Note: no bias like in float/hybrid + micro_tensor_utils::MatrixBatchVectorMultiply( + scratch, projection_weights, proj_scale_a, proj_scale_b, projection_bias, + n_batch, n_cell, n_output, output_state_zp, output_state); + if (quantized_proj_clip > 0) { + micro_tensor_utils::CwiseClipping(output_state, n_batch * n_output, + quantized_proj_clip); + } +} + +// Performs an LSTM batch inference step for input specified by input_ptr. +// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and +// biases (*_bias_ptr), and buffers (*_scratch), along with additional +// parameters: +// - params: various LSTM params including activation, clipping, etc., +// - n_batch: size of batch, +// - n_cell: number of cells (or units), +// - n_input: the input size, +// - n_aux_input: the auxiliary input size. +// - n_output: the output size. +// - output_batch_leading_dim: the leading dimension of the output buffer. +// +// Input of size 'n_batch * n_input': +// input_ptr +// Input of size 'n_batch * n_aux_input': +// aux_input_ptr - optional (can be nullptr) +// +// LSTM weights: +// Input weights of size 'n_cell * n_input': +// input_to_input_weights - optional +// input_to_forget_weights +// input_to_cell_weights +// input_to_output_weights +// Auxiliary input weights of size 'n_cell * n_aux_input': +// aux_input_to_input_weights - optional +// aux_input_to_forget_weights - optional +// aux_input_to_cell_weights - optional +// aux_input_to_output_weights - optional +// Recurrent weights of size 'n_cell * n_output': +// recurrent_to_input_weights - optional +// recurrent_to_forget_weights +// recurrent_to_cell_weights +// recurrent_to_output_weights +// Peephole weights of size 'n_cell', representing diagonal matrices. +// cell_to_input_weights - optional +// cell_to_forget_weights - optional +// cell_to_output_weights - optional +// Projection weights of size 'n_output * n_cell' +// projection_weights_ptr - optional +// Gate biases of size 'n_cell': +// input_gate_bias_ptr - optional +// forget_gate_bias_ptr +// cell_gate_bias_ptr +// output_gate_bias_ptr +// +// Layer norm coefficients of size 'n_cell', representing diagonal matrices. +// input_layer_norm_coefficients_ptr - optional +// forget_layer_norm_coefficients_ptr - optional +// cell_layer_norm_coefficients_ptr - optional +// output_layer_norm_coefficients_ptr - optional +// +// The pointers to the cell and output state and the output are updated. +// +// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned +// in batch_major order, and each step processes batch_size many inputs from +// input_ptr, and updates batch_size many cell and output states. +// +// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the + output tensor, and in most cases will be equal to n_output. 
It is usually not +// when we want to store the LSTM output into a slice of the output tensor, e.g. +// for bidirectional LSTMs with merge_outputs. In this case, the batched +// operations cannot be used since they assume that the batched outputs are +// contiguous, and we manually loop over the batched outputs. +inline void LstmStepFloat( + const float* input_ptr, const float* input_to_input_weights_ptr, + const float* input_to_forget_weights_ptr, + const float* input_to_cell_weights_ptr, + const float* input_to_output_weights_ptr, const float* aux_input_ptr, + const float* aux_input_to_input_weights_ptr, + const float* aux_input_to_forget_weights_ptr, + const float* aux_input_to_cell_weights_ptr, + const float* aux_input_to_output_weights_ptr, + const float* recurrent_to_input_weights_ptr, + const float* recurrent_to_forget_weights_ptr, + const float* recurrent_to_cell_weights_ptr, + const float* recurrent_to_output_weights_ptr, + const float* cell_to_input_weights_ptr, + const float* cell_to_forget_weights_ptr, + const float* cell_to_output_weights_ptr, + const float* input_layer_norm_coefficients_ptr, + const float* forget_layer_norm_coefficients_ptr, + const float* cell_layer_norm_coefficients_ptr, + const float* output_layer_norm_coefficients_ptr, + const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr, + const float* cell_gate_bias_ptr, const float* output_gate_bias_ptr, + const float* projection_weights_ptr, const float* projection_bias_ptr, + const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, + float* output_state_ptr, float* cell_state_ptr, float* scratch0, + float* scratch1, float* scratch2, float* scratch3, float* output_ptr) { + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + + // Make named scratch buffers. + float* input_gate_scratch = scratch0; + float* forget_gate_scratch = scratch1; + float* cell_gate_scratch = scratch2; + float* output_gate_scratch = scratch3; + + // Check if inputs are all zeros so we can skip some computations. + const bool is_input_all_zeros = + micro_tensor_utils::IsZeroVector(input_ptr, n_batch * n_input); + const bool is_aux_input_all_zeros = + (aux_input_ptr == nullptr || + micro_tensor_utils::IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + if (!use_cifg) { + // Calculate the input gate. (If not CIFG.) + CalculateLstmGateFloat( + input_ptr, input_to_input_weights_ptr, aux_input_ptr, + aux_input_to_input_weights_ptr, output_state_ptr, + recurrent_to_input_weights_ptr, cell_state_ptr, + cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr, + input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=*/kTfLiteActSigmoid, input_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); + } + // Calculate the forget gate. + CalculateLstmGateFloat( + input_ptr, input_to_forget_weights_ptr, aux_input_ptr, + aux_input_to_forget_weights_ptr, output_state_ptr, + recurrent_to_forget_weights_ptr, cell_state_ptr, + cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr, + forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=*/kTfLiteActSigmoid, forget_gate_scratch, is_input_all_zeros, + is_aux_input_all_zeros); + // Calculate the cell update gate. 
+  // Calculate the cell update gate.
+  CalculateLstmGateFloat(input_ptr, input_to_cell_weights_ptr, aux_input_ptr,
+                         aux_input_to_cell_weights_ptr, output_state_ptr,
+                         recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr,
+                         /*cell_to_gate_weights=*/nullptr,
+                         cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr,
+                         n_batch, n_input, n_aux_input, n_output, n_cell,
+                         params->activation, cell_gate_scratch,
+                         is_input_all_zeros, is_aux_input_all_zeros);
+  // Update the cell state.
+  UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch,
+                      forget_gate_scratch, cell_gate_scratch, use_cifg,
+                      params->cell_clip);
+  // Calculate the output gate.
+  CalculateLstmGateFloat(
+      input_ptr, input_to_output_weights_ptr, aux_input_ptr,
+      aux_input_to_output_weights_ptr, output_state_ptr,
+      recurrent_to_output_weights_ptr, cell_state_ptr,
+      cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr,
+      output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+      /*activation=*/kTfLiteActSigmoid, output_gate_scratch,
+      is_input_all_zeros, is_aux_input_all_zeros);
+  // Update the output state.
+  CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr,
+                           output_gate_scratch, params->activation,
+                           projection_weights_ptr, projection_bias_ptr,
+                           params->proj_clip, output_state_ptr, scratch2);
+  // Copy output state to the output. Note that the output's rows may not be
+  // contiguous (output_batch_leading_dim != n_output).
+  for (int b = 0; b < n_batch; b++) {
+    std::memcpy(output_ptr + b * output_batch_leading_dim,
+                output_state_ptr + b * n_output, n_output * sizeof(float));
+  }
+}
+
+// Same as above but with quantized weight matrices. In detail:
+// Input of size 'n_batch * n_input':
+//   input_ptr
+// Input of size 'n_batch * n_aux_input':
+//   aux_input_ptr                      - optional (can be nullptr)
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weights             - optional
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_output_weights
+// Quantized auxiliary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights         - optional
+//   aux_input_to_forget_weights        - optional
+//   aux_input_to_cell_weights          - optional
+//   aux_input_to_output_weights        - optional
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights         - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_output_weights
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights              - optional
+//   cell_to_forget_weights             - optional
+//   cell_to_output_weights             - optional
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr             - optional
+// Weight scales (scalars) for each of the weights above.
+// input_to_input_weights_scale - optional +// input_to_forget_weights_scale +// input_to_cell_weights_scale +// input_to_output_weights_scale +// aux_input_to_input_weights_scale - optional +// aux_input_to_forget_weights_scale - optional +// aux_input_to_cell_weights_scale - optional +// aux_input_to_output_weights_scale - optional +// recurrent_to_input_weights_scale - optional +// recurrent_to_forget_weights_scale +// recurrent_to_cell_weights_scale +// recurrent_to_output_weights_scale +// cell_to_input_weights_scale, +// cell_to_forget_weights_scale, +// cell_to_output_weights_scale, +// projection_weights_scale - optional +// Gate biases of size 'n_cell': +// input_gate_bias_ptr - optional +// forget_gate_bias_ptr +// cell_gate_bias_ptr +// output_gate_bias_ptr +// +// Layer norm coefficients of size 'n_cell', representing diagonal matrices. +// input_layer_norm_coefficients_ptr - optional +// forget_layer_norm_coefficients_ptr - optional +// cell_layer_norm_coefficients_ptr - optional +// output_layer_norm_coefficients_ptr - optional +// +// Temporary pre-allocated storage for quantized values: +// quantized_input_ptr (same size as input_ptr) +// quantized_output_state_ptr (same size as output_state_ptr) +// quantized_output_scratch (same size as cell_state_ptr) +// Temporary pre-allocated storage for recovered values: +// recovered_cell_weights (same size as cell_to_*_weights) +// +// Outputs: +// output_state_ptr - size 'n_batch * n_output' +// cell_state_ptr - size 'n_batch * n_cell' +// output_ptr - size 'n_batch * output_batch_leading_dim' +inline void LstmStepHybrid( + const float* input_ptr, const int8_t* input_to_input_weights_ptr, + const uint8_t* input_to_input_weights_ledger_ptr, + float input_to_input_weights_scale, + const int8_t* input_to_forget_weights_ptr, + const uint8_t* input_to_forget_weights_ledger_ptr, + float input_to_forget_weights_scale, + const int8_t* input_to_cell_weights_ptr, + const uint8_t* input_to_cell_weights_ledger_ptr, + float input_to_cell_weights_scale, + const int8_t* input_to_output_weights_ptr, + const uint8_t* input_to_output_weights_ledger_ptr, + float input_to_output_weights_scale, const float* aux_input_ptr, + const int8_t* aux_input_to_input_weights_ptr, + float aux_input_to_input_weights_scale, + const int8_t* aux_input_to_forget_weights_ptr, + float aux_input_to_forget_weights_scale, + const int8_t* aux_input_to_cell_weights_ptr, + float aux_input_to_cell_weights_scale, + const int8_t* aux_input_to_output_weights_ptr, + float aux_input_to_output_weights_scale, + const int8_t* recurrent_to_input_weights_ptr, + const uint8_t* recurrent_to_input_weights_ledger_ptr, + float recurrent_to_input_weights_scale, + const int8_t* recurrent_to_forget_weights_ptr, + const uint8_t* recurrent_to_forget_weights_ledger_ptr, + float recurrent_to_forget_weights_scale, + const int8_t* recurrent_to_cell_weights_ptr, + const uint8_t* recurrent_to_cell_weights_ledger_ptr, + float recurrent_to_cell_weights_scale, + const int8_t* recurrent_to_output_weights_ptr, + const uint8_t* recurrent_to_output_weights_ledger_ptr, + float recurrent_to_output_weights_scale, + const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale, + const int8_t* cell_to_forget_weights_ptr, + float cell_to_forget_weights_scale, + const int8_t* cell_to_output_weights_ptr, + float cell_to_output_weights_scale, + const float* input_layer_norm_coefficients_ptr, + const float* forget_layer_norm_coefficients_ptr, + const float* cell_layer_norm_coefficients_ptr, + const 
float* output_layer_norm_coefficients_ptr,
+    const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr,
+    const float* cell_gate_bias_ptr, const float* output_gate_bias_ptr,
+    const int8_t* projection_weights_ptr,
+    const uint8_t* projection_weights_ledger_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_aux_input, int n_output, int output_batch_leading_dim,
+    float* scratch0, float* scratch1, float* scratch2, float* scratch3,
+    float* scales, float* input_sf, float* aux_input_sf,
+    float* output_state_sf, float* scaling_factors_scratch,
+    float* recovered_cell_weights, int8_t* quantized_input_ptr,
+    int8_t* quantized_aux_input_ptr, int8_t* quantized_output_state_ptr,
+    int8_t* quantized_output_scratch, float* output_state_ptr,
+    float* cell_state_ptr, int32_t* accum_scratch_ptr, float* output_ptr,
+    int32_t* input_zp, int32_t* aux_input_zp, int32_t* output_state_zp,
+    int32_t* row_sums, int row_sums_size, bool* compute_row_sums,
+    bool asymmetric_quantize_inputs) {
+  // Since we have already checked that weights are all there or none, we
+  // can check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  // Make named scratch buffers for the different gates.
+  float* input_gate_scratch = scratch0;
+  float* forget_gate_scratch = scratch1;
+  float* cell_gate_scratch = scratch2;
+  float* output_gate_scratch = scratch3;
+
+  int32_t* input_to_input_row_sums = nullptr;
+  int32_t* input_to_forget_row_sums = nullptr;
+  int32_t* input_to_cell_row_sums = nullptr;
+  int32_t* input_to_output_row_sums = nullptr;
+  int32_t* aux_input_to_input_row_sums = nullptr;
+  int32_t* aux_input_to_forget_row_sums = nullptr;
+  int32_t* aux_input_to_cell_row_sums = nullptr;
+  int32_t* aux_input_to_output_row_sums = nullptr;
+  int32_t* recurrent_to_input_row_sums = nullptr;
+  int32_t* recurrent_to_forget_row_sums = nullptr;
+  int32_t* recurrent_to_cell_row_sums = nullptr;
+  int32_t* recurrent_to_output_row_sums = nullptr;
+  int32_t* projection_weights_row_sums = nullptr;
+
+  if (asymmetric_quantize_inputs) {
+    int num_row_sums = use_cifg ? 6 : 8;
+    if (aux_input_ptr != nullptr) {
+      num_row_sums += use_cifg ? 3 : 4;
+    }
+    if (projection_weights_ptr != nullptr) {
+      num_row_sums += ceil(static_cast<float>(n_output) / n_cell);
+    }
+    TFLITE_DCHECK(row_sums_size == num_row_sums);
+    input_to_input_row_sums = row_sums;
+    input_to_forget_row_sums =
+        use_cifg ? input_to_input_row_sums : input_to_input_row_sums + n_cell;
+    input_to_cell_row_sums = input_to_forget_row_sums + n_cell;
+    input_to_output_row_sums = input_to_cell_row_sums + n_cell;
+    if (aux_input_ptr != nullptr) {
+      aux_input_to_input_row_sums = input_to_output_row_sums + n_cell;
+      aux_input_to_forget_row_sums =
+          use_cifg ? aux_input_to_input_row_sums
+                   : aux_input_to_input_row_sums + n_cell;
+      aux_input_to_cell_row_sums = aux_input_to_forget_row_sums + n_cell;
+      aux_input_to_output_row_sums = aux_input_to_cell_row_sums + n_cell;
+    }
+    recurrent_to_input_row_sums = aux_input_ptr
+                                      ? aux_input_to_output_row_sums + n_cell
+                                      : input_to_output_row_sums + n_cell;
+    recurrent_to_forget_row_sums = use_cifg
+                                       ? 
recurrent_to_input_row_sums + : recurrent_to_input_row_sums + n_cell; + recurrent_to_cell_row_sums = recurrent_to_forget_row_sums + n_cell; + recurrent_to_output_row_sums = recurrent_to_cell_row_sums + n_cell; + if (projection_weights_ptr != nullptr) { + projection_weights_row_sums = recurrent_to_output_row_sums + n_cell; + } + if (*compute_row_sums) { + ComputeRowSums( + input_to_input_row_sums, input_to_forget_row_sums, + input_to_cell_row_sums, input_to_output_row_sums, + aux_input_to_input_row_sums, aux_input_to_forget_row_sums, + aux_input_to_cell_row_sums, aux_input_to_output_row_sums, + recurrent_to_input_row_sums, recurrent_to_forget_row_sums, + recurrent_to_cell_row_sums, recurrent_to_output_row_sums, + projection_weights_row_sums, row_sums, n_cell, n_input, n_aux_input, + n_output, input_to_input_weights_ptr, input_to_forget_weights_ptr, + input_to_cell_weights_ptr, input_to_output_weights_ptr, + aux_input_to_input_weights_ptr, aux_input_to_forget_weights_ptr, + aux_input_to_cell_weights_ptr, aux_input_to_output_weights_ptr, + recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr, + recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr, + projection_weights_ptr, use_cifg, aux_input_ptr); + *compute_row_sums = false; + } + } + + // Check if inputs are all zeros so we can skip some computations. + const bool is_input_all_zeros = + micro_tensor_utils::IsZeroVector(input_ptr, n_batch * n_input); + const bool is_aux_input_all_zeros = + (aux_input_ptr == nullptr || + micro_tensor_utils::IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + const bool is_output_state_all_zeros = + micro_tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output); + // Quantize inputs. + if (!is_input_all_zeros) { + micro_tensor_utils::BatchQuantizeFloats( + input_ptr, n_batch, n_input, quantized_input_ptr, input_sf, input_zp, + asymmetric_quantize_inputs); + } + if (!is_aux_input_all_zeros) { + micro_tensor_utils::BatchQuantizeFloats( + aux_input_ptr, n_batch, n_aux_input, quantized_aux_input_ptr, + aux_input_sf, aux_input_zp, asymmetric_quantize_inputs); + } + if (!is_output_state_all_zeros) { + micro_tensor_utils::BatchQuantizeFloats( + output_state_ptr, n_batch, n_output, quantized_output_state_ptr, + output_state_sf, output_state_zp, asymmetric_quantize_inputs); + } + if (!use_cifg) { + // Calculate the input gate. (If not CIFG.) + CalculateLstmGateHybrid( + quantized_input_ptr, input_sf, input_zp, input_to_input_weights_ptr, + input_to_input_weights_ledger_ptr, input_to_input_weights_scale, + input_to_input_row_sums, quantized_aux_input_ptr, aux_input_sf, + aux_input_zp, aux_input_to_input_weights_ptr, + aux_input_to_input_weights_scale, aux_input_to_input_row_sums, + quantized_output_state_ptr, output_state_sf, output_state_zp, + recurrent_to_input_weights_ptr, recurrent_to_input_weights_ledger_ptr, + recurrent_to_input_weights_scale, recurrent_to_input_row_sums, + cell_state_ptr, cell_to_input_weights_ptr, cell_to_input_weights_scale, + input_layer_norm_coefficients_ptr, input_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, kTfLiteActSigmoid, + input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros, + is_output_state_all_zeros, compute_row_sums, scaling_factors_scratch, + recovered_cell_weights, scales, accum_scratch_ptr); + } + // Calculate the forget gate. 
+ CalculateLstmGateHybrid( + quantized_input_ptr, input_sf, input_zp, input_to_forget_weights_ptr, + input_to_forget_weights_ledger_ptr, input_to_forget_weights_scale, + input_to_forget_row_sums, quantized_aux_input_ptr, aux_input_sf, + aux_input_zp, aux_input_to_forget_weights_ptr, + aux_input_to_forget_weights_scale, aux_input_to_forget_row_sums, + quantized_output_state_ptr, output_state_sf, output_state_zp, + recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_ledger_ptr, + recurrent_to_forget_weights_scale, recurrent_to_forget_row_sums, + cell_state_ptr, cell_to_forget_weights_ptr, cell_to_forget_weights_scale, + forget_layer_norm_coefficients_ptr, forget_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, kTfLiteActSigmoid, + forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros, + is_output_state_all_zeros, compute_row_sums, scaling_factors_scratch, + recovered_cell_weights, scales, accum_scratch_ptr); + // Calculate the cell update gate. + CalculateLstmGateHybrid( + quantized_input_ptr, input_sf, input_zp, input_to_cell_weights_ptr, + input_to_cell_weights_ledger_ptr, input_to_cell_weights_scale, + input_to_cell_row_sums, quantized_aux_input_ptr, aux_input_sf, + aux_input_zp, aux_input_to_cell_weights_ptr, + aux_input_to_cell_weights_scale, aux_input_to_cell_row_sums, + quantized_output_state_ptr, output_state_sf, output_state_zp, + recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_ledger_ptr, + recurrent_to_cell_weights_scale, recurrent_to_cell_row_sums, + /*cell_state=*/nullptr, /*cell_to_gate_weights=*/nullptr, + /*cell_to_gate_weights_scale=*/0.0f, cell_layer_norm_coefficients_ptr, + cell_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + params->activation, cell_gate_scratch, is_input_all_zeros, + is_aux_input_all_zeros, is_output_state_all_zeros, compute_row_sums, + scaling_factors_scratch, recovered_cell_weights, scales, + accum_scratch_ptr); + // Update the cell state. + UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, + forget_gate_scratch, cell_gate_scratch, use_cifg, + params->cell_clip); + // Calculate the output gate. + CalculateLstmGateHybrid( + quantized_input_ptr, input_sf, input_zp, input_to_output_weights_ptr, + input_to_output_weights_ledger_ptr, input_to_output_weights_scale, + input_to_output_row_sums, quantized_aux_input_ptr, aux_input_sf, + aux_input_zp, aux_input_to_output_weights_ptr, + aux_input_to_output_weights_scale, aux_input_to_output_row_sums, + quantized_output_state_ptr, output_state_sf, output_state_zp, + recurrent_to_output_weights_ptr, recurrent_to_output_weights_ledger_ptr, + recurrent_to_output_weights_scale, recurrent_to_output_row_sums, + cell_state_ptr, cell_to_output_weights_ptr, cell_to_output_weights_scale, + output_layer_norm_coefficients_ptr, output_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, kTfLiteActSigmoid, + output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros, + is_output_state_all_zeros, compute_row_sums, scaling_factors_scratch, + recovered_cell_weights, scales, accum_scratch_ptr); + // Update the output state. 
+  CalculateLstmOutputHybrid(
+      n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+      params->activation, projection_weights_ptr,
+      projection_weights_ledger_ptr, projection_weights_scale,
+      projection_bias_ptr, params->proj_clip, output_state_ptr,
+      asymmetric_quantize_inputs, projection_weights_row_sums,
+      compute_row_sums, scratch2, quantized_output_scratch, input_sf, input_zp,
+      accum_scratch_ptr, scales);
+  // Copy output state to the output. Note that the output's rows may not be
+  // contiguous (output_batch_leading_dim != n_output).
+  for (int b = 0; b < n_batch; b++) {
+    std::memcpy(output_ptr + b * output_batch_leading_dim,
+                output_state_ptr + b * n_output, n_output * sizeof(float));
+  }
+}
+
+// Fully quantized LSTM kernel for 16 bit gate matmul output.
+//
+// Input tensor of size n_batch * n_input:
+//   input_ptr
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weight_ptr             - optional
+//   input_to_forget_weight_ptr            - optional
+//   input_to_cell_weight_ptr              - optional
+//   input_to_output_weight_ptr            - optional
+//
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weight_ptr         - optional
+//   recurrent_to_forget_weights_ptr
+//   recurrent_to_cell_weights_ptr
+//   recurrent_to_output_weights_ptr
+//
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights                 - optional
+//   cell_to_forget_weights                - optional
+//   cell_to_output_weights                - optional
+//
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weight_ptr                 - optional
+//
+// Weight scales (scalars) for each of the weights above.
+//   effective_input_to_input_scale_a      - optional
+//   effective_input_to_input_scale_b      - optional
+//   effective_input_to_forget_scale_a
+//   effective_input_to_forget_scale_b
+//   effective_input_to_cell_scale_a
+//   effective_input_to_cell_scale_b
+//   effective_input_to_output_scale_a
+//   effective_input_to_output_scale_b
+//   effective_recurrent_to_input_scale_a  - optional
+//   effective_recurrent_to_input_scale_b  - optional
+//   effective_recurrent_to_forget_scale_a
+//   effective_recurrent_to_forget_scale_b
+//   effective_recurrent_to_cell_scale_a
+//   effective_recurrent_to_cell_scale_b
+//   effective_recurrent_to_output_scale_a
+//   effective_recurrent_to_output_scale_b
+//   effective_proj_scale_a                - optional
+//   effective_proj_scale_b                - optional
+//
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr                   - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   layer_norm_input_weight_ptr           - optional
+//   layer_norm_forget_weight_ptr          - optional
+//   layer_norm_cell_weight_ptr            - optional
+//   layer_norm_output_weight_ptr          - optional
+//
+// Layer norm scales of size 'n_cell'.
+//   layer_norm_input_scale_a              - optional
+//   layer_norm_input_scale_b              - optional
+//   layer_norm_forget_scale_a             - optional
+//   layer_norm_forget_scale_b             - optional
+//   layer_norm_cell_scale_a               - optional
+//   layer_norm_cell_scale_b               - optional
+//   layer_norm_output_scale_a             - optional
+//   layer_norm_output_scale_b             - optional
+//
+// Scalar values:
+//   quantized_cell_clip: quantized clip value for cell.
+//   quantized_proj_clip: quantized clip value for projection.
+//   cell_state_scale: the power of two scale for cell state.
+//
+// Zero points:
+//   output_state_zp: zero point of output state
+//   hidden_zp: zero point for hidden state.
+// +// Temporary pre-allocated storage for the calculation. Each is of size n_cell * +// n_batch. +// scratch0 +// scratch1 +// scratch2 +// scratch3 +// scratch4 +// scratch5: this scratch buffer is created purely for optimizing the +// MatrixBatchVectorMultiplyAccumulate. +// +// Outputs: +// output_state_ptr - size 'n_batch * n_output' +// cell_state_ptr - size 'n_batch * n_cell' +// output_ptr - size 'n_batch * n_output' +// TODO(b/159947023): scratch0 is not used if (!cifg). Don't allocate then. +inline void LstmStepInteger8x8_16( + const int8_t* input_ptr, const int8_t* input_to_input_weight_ptr, + int32_t effective_input_to_input_scale_a, + int32_t effective_input_to_input_scale_b, + const int8_t* input_to_forget_weight_ptr, + int32_t effective_input_to_forget_scale_a, + int32_t effective_input_to_forget_scale_b, + const int8_t* input_to_cell_weight_ptr, + int32_t effective_input_to_cell_scale_a, + int32_t effective_input_to_cell_scale_b, + const int8_t* input_to_output_weight_ptr, + int32_t effective_input_to_output_scale_a, + int32_t effective_input_to_output_scale_b, + const int8_t* recurrent_to_input_weight_ptr, + int32_t effective_recurrent_to_input_scale_a, + int32_t effective_recurrent_to_input_scale_b, + const int8_t* recurrent_to_forget_weight_ptr, + int32_t effective_recurrent_to_forget_scale_a, + int32_t effective_recurrent_to_forget_scale_b, + const int8_t* recurrent_to_cell_weight_ptr, + int32_t effective_recurrent_to_cell_scale_a, + int32_t effective_recurrent_to_cell_scale_b, + const int8_t* recurrent_to_output_weight_ptr, + int32_t effective_recurrent_to_output_scale_a, + int32_t effective_recurrent_to_output_scale_b, + const int16_t* cell_to_input_weight_ptr, + int32_t effective_cell_to_input_scale_a, + int32_t effective_cell_to_input_scale_b, + const int16_t* cell_to_forget_weight_ptr, + int32_t effective_cell_to_forget_scale_a, + int32_t effective_cell_to_forget_scale_b, + const int16_t* cell_to_output_weight_ptr, + int32_t effective_cell_to_output_scale_a, + int32_t effective_cell_to_output_scale_b, + const int8_t* projection_weight_ptr, int32_t effective_proj_scale_a, + int32_t effective_proj_scale_b, int32_t hidden_zp, + int32_t effective_hidden_scale_a, int32_t effective_hidden_scale_b, + const int16_t* layer_norm_input_weight_ptr, + int32_t layer_norm_input_scale_a, int32_t layer_norm_input_scale_b, + const int16_t* layer_norm_forget_weight_ptr, + int32_t layer_norm_forget_scale_a, int32_t layer_norm_forget_scale_b, + const int16_t* layer_norm_cell_weight_ptr, int32_t layer_norm_cell_scale_a, + int32_t layer_norm_cell_scale_b, + const int16_t* layer_norm_output_weight_ptr, + int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b, + const int32_t* input_gate_bias_ptr, const int32_t* forget_gate_bias_ptr, + const int32_t* cell_gate_bias_ptr, const int32_t* output_gate_bias_ptr, + int16_t quantized_cell_clip, int8_t quantized_proj_clip, + int32_t cell_state_scale, int32_t input_variance_guard, + int32_t forget_variance_guard, int32_t cell_variance_guard, + int32_t output_variance_guard, + const int32_t* input_to_forget_effective_bias, + const int32_t* recurrent_to_forget_effective_bias, + const int32_t* input_to_cell_effective_bias, + const int32_t* recurrent_to_cell_effective_bias, + const int32_t* input_to_output_effective_bias, + const int32_t* recurrent_to_output_effective_bias, + const int32_t* input_to_input_effective_bias, + const int32_t* recurrent_to_input_effective_bias, + const int32_t* projection_effective_bias, int n_batch, int 
n_cell,
+    int n_input, int n_output, int8_t* output_state_ptr,
+    int32_t output_state_zp, int16_t* cell_state_ptr, int8_t* output_ptr,
+    int16_t* scratch0, int16_t* scratch1, int16_t* scratch2, int16_t* scratch3,
+    int8_t* scratch4, int32_t* scratch5) {
+  // Make named scratch buffers for the different gates.
+  int16_t* input_gate_scratch = scratch0;
+  int16_t* forget_gate_scratch = scratch1;
+  int16_t* cell_gate_scratch = scratch2;
+  int16_t* output_gate_scratch = scratch3;
+
+  // Since we have already checked that weights are all there or none, we
+  // can check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weight_ptr == nullptr);
+
+  // Check for nullptrs.
+  TFLITE_DCHECK(input_to_forget_effective_bias);
+  TFLITE_DCHECK(recurrent_to_forget_effective_bias);
+  TFLITE_DCHECK(input_to_cell_effective_bias);
+  TFLITE_DCHECK(recurrent_to_cell_effective_bias);
+  TFLITE_DCHECK(input_to_output_effective_bias);
+  TFLITE_DCHECK(recurrent_to_output_effective_bias);
+  if (!use_cifg) {
+    TFLITE_DCHECK(input_to_input_effective_bias);
+    TFLITE_DCHECK(recurrent_to_input_effective_bias);
+  }
+  const bool use_projection = (projection_weight_ptr != nullptr);
+  if (use_projection) {
+    TFLITE_DCHECK(projection_effective_bias);
+  }
+  if (!use_cifg) {
+    // Calculate the input gate. (If not CIFG.)
+    CalculateLstmGateInteger8x8_16(
+        input_ptr, input_to_input_weight_ptr, input_to_input_effective_bias,
+        effective_input_to_input_scale_a, effective_input_to_input_scale_b,
+        output_state_ptr, recurrent_to_input_weight_ptr,
+        recurrent_to_input_effective_bias,
+        effective_recurrent_to_input_scale_a,
+        effective_recurrent_to_input_scale_b, cell_state_ptr,
+        cell_to_input_weight_ptr, effective_cell_to_input_scale_a,
+        effective_cell_to_input_scale_b, layer_norm_input_weight_ptr,
+        input_gate_bias_ptr, layer_norm_input_scale_a,
+        layer_norm_input_scale_b, input_variance_guard, n_batch, n_input,
+        n_output, n_cell, kTfLiteActSigmoid, input_gate_scratch, scratch5);
+  }
+  // Calculate the forget gate.
+  CalculateLstmGateInteger8x8_16(
+      input_ptr, input_to_forget_weight_ptr, input_to_forget_effective_bias,
+      effective_input_to_forget_scale_a, effective_input_to_forget_scale_b,
+      output_state_ptr, recurrent_to_forget_weight_ptr,
+      recurrent_to_forget_effective_bias,
+      effective_recurrent_to_forget_scale_a,
+      effective_recurrent_to_forget_scale_b, cell_state_ptr,
+      cell_to_forget_weight_ptr, effective_cell_to_forget_scale_a,
+      effective_cell_to_forget_scale_b, layer_norm_forget_weight_ptr,
+      forget_gate_bias_ptr, layer_norm_forget_scale_a,
+      layer_norm_forget_scale_b, forget_variance_guard, n_batch, n_input,
+      n_output, n_cell, kTfLiteActSigmoid, forget_gate_scratch, scratch5);
+  // Calculate the cell update gate.
+  CalculateLstmGateInteger8x8_16(
+      input_ptr, input_to_cell_weight_ptr, input_to_cell_effective_bias,
+      effective_input_to_cell_scale_a, effective_input_to_cell_scale_b,
+      output_state_ptr, recurrent_to_cell_weight_ptr,
+      recurrent_to_cell_effective_bias, effective_recurrent_to_cell_scale_a,
+      effective_recurrent_to_cell_scale_b, cell_state_ptr,
+      /*cell_to_gate_weights=*/nullptr, /*cell_to_gate_scale_a=*/0,
+      /*cell_to_gate_scale_b=*/0, layer_norm_cell_weight_ptr,
+      cell_gate_bias_ptr, layer_norm_cell_scale_a, layer_norm_cell_scale_b,
+      cell_variance_guard, n_batch, n_input, n_output, n_cell, kTfLiteActTanh,
+      cell_gate_scratch, scratch5);
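+  // For reference: UpdateLstmCellInteger below performs, entirely in
+  // fixed-point arithmetic,
+  //   cell_state = clip(forget_gate (.) cell_state + input_gate (.) cell_gate)
+  // where cell_state is int16 with the power-of-two scale 2^cell_state_scale,
+  // clipping applies only when quantized_cell_clip > 0, and under CIFG the
+  // input gate is derived as (1 - forget_gate).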
+  // Update the cell state.
+  UpdateLstmCellInteger(n_batch, n_cell, cell_state_ptr, cell_state_scale,
+                        input_gate_scratch, forget_gate_scratch,
+                        cell_gate_scratch, use_cifg, quantized_cell_clip);
+  // Calculate the output gate.
+  CalculateLstmGateInteger8x8_16(
+      input_ptr, input_to_output_weight_ptr, input_to_output_effective_bias,
+      effective_input_to_output_scale_a, effective_input_to_output_scale_b,
+      output_state_ptr, recurrent_to_output_weight_ptr,
+      recurrent_to_output_effective_bias,
+      effective_recurrent_to_output_scale_a,
+      effective_recurrent_to_output_scale_b, cell_state_ptr,
+      cell_to_output_weight_ptr, effective_cell_to_output_scale_a,
+      effective_cell_to_output_scale_b, layer_norm_output_weight_ptr,
+      output_gate_bias_ptr, layer_norm_output_scale_a,
+      layer_norm_output_scale_b, output_variance_guard, n_batch, n_input,
+      n_output, n_cell, kTfLiteActSigmoid, output_gate_scratch, scratch5);
+  // Update the output state.
+  CalculateLstmOutputInteger8x8_16(
+      n_batch, n_cell, n_output, cell_state_ptr, cell_state_scale,
+      output_gate_scratch, effective_hidden_scale_a, effective_hidden_scale_b,
+      hidden_zp, projection_weight_ptr, effective_proj_scale_a,
+      effective_proj_scale_b, projection_effective_bias, output_state_zp,
+      quantized_proj_clip, output_state_ptr, scratch0, scratch4, scratch5);
+  // Copy output state to the output. Note that unlike float or hybrid, output
+  // is always contiguous.
+  std::memcpy(output_ptr, output_state_ptr,
+              n_batch * n_output * sizeof(int8_t));
+}
+
+// Fully quantized LSTM kernel for 8 bit gate matmul output.
+//
+// Input tensor of size n_batch * n_input:
+//   input_ptr
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weight_ptr             - optional
+//   input_to_forget_weight_ptr            - optional
+//   input_to_cell_weight_ptr              - optional
+//   input_to_output_weight_ptr            - optional
+//
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weight_ptr         - optional
+//   recurrent_to_forget_weights_ptr
+//   recurrent_to_cell_weights_ptr
+//   recurrent_to_output_weights_ptr
+//
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights                 - optional
+//   cell_to_forget_weights                - optional
+//   cell_to_output_weights                - optional
+//
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weight_ptr                 - optional
+//
+// Weight scales (scalars) for each of the weights above.
+//   effective_input_to_input_scale_a      - optional
+//   effective_input_to_input_scale_b      - optional
+//   effective_input_to_forget_scale_a
+//   effective_input_to_forget_scale_b
+//   effective_input_to_cell_scale_a
+//   effective_input_to_cell_scale_b
+//   effective_input_to_output_scale_a
+//   effective_input_to_output_scale_b
+//   effective_recurrent_to_input_scale_a  - optional
+//   effective_recurrent_to_input_scale_b  - optional
+//   effective_recurrent_to_forget_scale_a
+//   effective_recurrent_to_forget_scale_b
+//   effective_recurrent_to_cell_scale_a
+//   effective_recurrent_to_cell_scale_b
+//   effective_recurrent_to_output_scale_a
+//   effective_recurrent_to_output_scale_b
+//   effective_proj_scale_a                - optional
+//   effective_proj_scale_b                - optional
+//
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr                   - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   layer_norm_input_weight_ptr           - optional
+//   layer_norm_forget_weight_ptr          - optional
+//   layer_norm_cell_weight_ptr            - optional
+//   layer_norm_output_weight_ptr          - optional
+//
+// Layer norm scales of size 'n_cell'.
+//   layer_norm_input_scale_a              - optional
+//   layer_norm_input_scale_b              - optional
+//   layer_norm_forget_scale_a             - optional
+//   layer_norm_forget_scale_b             - optional
+//   layer_norm_cell_scale_a               - optional
+//   layer_norm_cell_scale_b               - optional
+//   layer_norm_output_scale_a             - optional
+//   layer_norm_output_scale_b             - optional
+//
+// Scalar values:
+//   quantized_cell_clip: quantized clip value for cell.
+//   quantized_proj_clip: quantized clip value for projection.
+//   cell_state_scale: the power of two scale for cell state.
+//
+// Zero points:
+//   input_zp: zero point for input tensor.
+//   output_state_zp: zero point of output state.
+//   hidden_zp: zero point for hidden state.
+//
+// Temporary pre-allocated storage for the calculation. Each is of size
+// n_cell * n_batch.
+//   scratch0
+//   scratch1
+//   scratch2
+//   scratch3
+//   scratch4
+//   scratch5
+//   scratch6
+//   scratch7
+//
+// Outputs:
+//   output_state_ptr - size 'n_batch * n_output'
+//   cell_state_ptr   - size 'n_batch * n_cell'
+//   output_ptr       - size 'n_batch * n_output'
+//
+// Can move zero point calculation into Prepare() for better performance.
+// TODO(b/159947023): scratch5 is unused, remove.
+inline void LstmStepInteger8x8_8(
+    const int8_t* input_ptr, int32_t input_zp,
+    const int8_t* input_to_input_weight_ptr,
+    int32_t effective_input_to_input_scale_a,
+    int32_t effective_input_to_input_scale_b,
+    const int8_t* input_to_forget_weight_ptr,
+    int32_t effective_input_to_forget_scale_a,
+    int32_t effective_input_to_forget_scale_b,
+    const int8_t* input_to_cell_weight_ptr,
+    int32_t effective_input_to_cell_scale_a,
+    int32_t effective_input_to_cell_scale_b,
+    const int8_t* input_to_output_weight_ptr,
+    int32_t effective_input_to_output_scale_a,
+    int32_t effective_input_to_output_scale_b,
+    const int8_t* recurrent_to_input_weight_ptr,
+    int32_t effective_recurrent_to_input_scale_a,
+    int32_t effective_recurrent_to_input_scale_b,
+    const int8_t* recurrent_to_forget_weight_ptr,
+    int32_t effective_recurrent_to_forget_scale_a,
+    int32_t effective_recurrent_to_forget_scale_b,
+    const int8_t* recurrent_to_cell_weight_ptr,
+    int32_t effective_recurrent_to_cell_scale_a,
+    int32_t effective_recurrent_to_cell_scale_b,
+    const int8_t* recurrent_to_output_weight_ptr,
+    int32_t effective_recurrent_to_output_scale_a,
+    int32_t effective_recurrent_to_output_scale_b,
+    const int8_t* cell_to_input_weight_ptr,
+    int32_t effective_cell_to_input_scale_a,
+    int32_t effective_cell_to_input_scale_b,
+    const int8_t* cell_to_forget_weight_ptr,
+    int32_t effective_cell_to_forget_scale_a,
+    int32_t effective_cell_to_forget_scale_b,
+    const int8_t* cell_to_output_weight_ptr,
+    int32_t effective_cell_to_output_scale_a,
+    int32_t effective_cell_to_output_scale_b,
+    const int8_t* projection_weight_ptr, int32_t effective_proj_scale_a,
+    int32_t effective_proj_scale_b,
+    const int16_t* layer_norm_input_weight_ptr,
+    int32_t layer_norm_input_scale_a, int32_t layer_norm_input_scale_b,
+    const int16_t* layer_norm_forget_weight_ptr,
+    int32_t layer_norm_forget_scale_a, int32_t layer_norm_forget_scale_b,
+    const int16_t* layer_norm_cell_weight_ptr, int32_t layer_norm_cell_scale_a,
+    int32_t layer_norm_cell_scale_b,
+    const int16_t* layer_norm_output_weight_ptr,
+    int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b,
+    const int32_t* input_gate_bias_ptr, const int32_t* forget_gate_bias_ptr,
+    const int32_t* cell_gate_bias_ptr, const int32_t* output_gate_bias_ptr,
+    const int32_t* projection_bias_ptr, const TfLiteLSTMParams* params,
+    const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b,
+    const int32_t* intermediate_zp, int16_t quantized_cell_clip,
+    int8_t quantized_proj_clip, int n_batch, int n_cell, int n_input,
+    int n_output, int output_batch_leading_dim, int8_t* output_state_ptr,
+    int32_t output_state_zp, int16_t* cell_state_ptr, int8_t* output_ptr,
+    int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3,
+    int16_t* scratch4, int16_t* scratch5, int16_t* scratch6,
+    int16_t* scratch7) {
+  // TODO(b/159066113): scratch5 is unused, remove.
+
+  // Make named scratch buffers for the different gates.
+  int16_t* forget_gate_scratch = scratch2;
+  int16_t* cell_gate_scratch = scratch3;
+  int16_t* output_gate_scratch = scratch4;
+  // Note: only CIFG is supported here; there is no separate input gate, and
+  // the cell update derives it from the forget gate.
+
+  // Calculate the forget gate.
+  CalculateLstmGateInteger8x8_8(
+      input_ptr, input_zp, input_to_forget_weight_ptr,
+      effective_input_to_forget_scale_a, effective_input_to_forget_scale_b,
+      intermediate_scale_a[2], intermediate_scale_b[2], intermediate_zp[4],
+      output_state_ptr, output_state_zp, recurrent_to_forget_weight_ptr,
+      effective_recurrent_to_forget_scale_a,
+      effective_recurrent_to_forget_scale_b, intermediate_scale_a[3],
+      intermediate_scale_b[3], intermediate_zp[5],
+      layer_norm_forget_weight_ptr, layer_norm_forget_scale_a,
+      layer_norm_forget_scale_b, forget_gate_bias_ptr, n_batch, n_input,
+      n_output, n_cell, kTfLiteActSigmoid, forget_gate_scratch, scratch0,
+      scratch1);
+  // Calculate the cell update gate.
+  CalculateLstmGateInteger8x8_8(
+      input_ptr, input_zp, input_to_cell_weight_ptr,
+      effective_input_to_cell_scale_a, effective_input_to_cell_scale_b,
+      intermediate_scale_a[4], intermediate_scale_b[4], intermediate_zp[7],
+      output_state_ptr, output_state_zp, recurrent_to_cell_weight_ptr,
+      effective_recurrent_to_cell_scale_a, effective_recurrent_to_cell_scale_b,
+      intermediate_scale_a[5], intermediate_scale_b[5], intermediate_zp[8],
+      layer_norm_cell_weight_ptr, layer_norm_cell_scale_a,
+      layer_norm_cell_scale_b, cell_gate_bias_ptr, n_batch, n_input, n_output,
+      n_cell, kTfLiteActTanh, cell_gate_scratch, scratch0, scratch1);
+  // Update the cell state.
+  UpdateLstmCellInteger(n_batch, n_cell, cell_state_ptr,
+                        /*cell_state_scale=*/-15, /*input_gate=*/nullptr,
+                        forget_gate_scratch, cell_gate_scratch,
+                        /*use_cifg=*/true, quantized_cell_clip);
+  // Calculate the output gate.
+  CalculateLstmGateInteger8x8_8(
+      input_ptr, input_zp, input_to_output_weight_ptr,
+      effective_input_to_output_scale_a, effective_input_to_output_scale_b,
+      intermediate_scale_a[6], intermediate_scale_b[6], intermediate_zp[10],
+      output_state_ptr, output_state_zp, recurrent_to_output_weight_ptr,
+      effective_recurrent_to_output_scale_a,
+      effective_recurrent_to_output_scale_b, intermediate_scale_a[7],
+      intermediate_scale_b[7], intermediate_zp[11],
+      layer_norm_output_weight_ptr, layer_norm_output_scale_a,
+      layer_norm_output_scale_b, output_gate_bias_ptr, n_batch, n_input,
+      n_output, n_cell, kTfLiteActSigmoid, output_gate_scratch, scratch0,
+      scratch1);
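+  // For reference: CalculateLstmOutputInteger8x8_8 below computes
+  //   output_state = clip(project(tanh(cell_state) (.) output_gate))
+  // with Tanh as the fixed activation for this kernel, a projection back to
+  // n_output, and clipping applied only when quantized_proj_clip > 0.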
+  // Update the output state.
+  CalculateLstmOutputInteger8x8_8(
+      n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+      projection_weight_ptr, effective_proj_scale_a, effective_proj_scale_b,
+      projection_bias_ptr, output_state_zp, quantized_proj_clip,
+      output_state_ptr, scratch2);
+  // Copy output state to the output. Note that unlike float or hybrid, output
+  // is always contiguous.
+  std::memcpy(output_ptr, output_state_ptr,
+              n_batch * n_output * sizeof(int8_t));
+}
+
+}  // namespace
+
+TfLiteStatus EvalFloatLstm(
+    const TfLiteEvalTensor* input,
+    const TfLiteEvalTensor* input_to_input_weights,
+    const TfLiteEvalTensor* input_to_forget_weights,
+    const TfLiteEvalTensor* input_to_cell_weights,
+    const TfLiteEvalTensor* input_to_output_weights,
+    const TfLiteEvalTensor* recurrent_to_input_weights,
+    const TfLiteEvalTensor* recurrent_to_forget_weights,
+    const TfLiteEvalTensor* recurrent_to_cell_weights,
+    const TfLiteEvalTensor* recurrent_to_output_weights,
+    const TfLiteEvalTensor* cell_to_input_weights,
+    const TfLiteEvalTensor* cell_to_forget_weights,
+    const TfLiteEvalTensor* cell_to_output_weights,
+    const TfLiteEvalTensor* input_layer_norm_coefficients,
+    const TfLiteEvalTensor* forget_layer_norm_coefficients,
+    const TfLiteEvalTensor* cell_layer_norm_coefficients,
+    const TfLiteEvalTensor* output_layer_norm_coefficients,
+    const TfLiteEvalTensor* aux_input,
+    const TfLiteEvalTensor* aux_input_to_input_weights,
+    const TfLiteEvalTensor* aux_input_to_forget_weights,
+    const TfLiteEvalTensor* aux_input_to_cell_weights,
+    const TfLiteEvalTensor* aux_input_to_output_weights,
+    const TfLiteEvalTensor* input_gate_bias,
+    const TfLiteEvalTensor* forget_gate_bias,
+    const TfLiteEvalTensor* cell_gate_bias,
+    const TfLiteEvalTensor* output_gate_bias,
+    const TfLiteEvalTensor* projection_weights,
+    const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params,
+    bool forward_sequence, bool time_major, int output_offset,
+    float* scratch_buffer, TfLiteEvalTensor* output_state,
+    TfLiteEvalTensor* cell_state, TfLiteEvalTensor* output) {
+  TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3);
+  int max_time, n_batch;
+  if (input->dims->size == 3) {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  } else {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  }
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+
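+  // The caller-provided scratch_buffer is carved into per-gate areas below,
+  // each n_cell * n_batch floats wide:
+  //   CIFG:     [cell | forget | output]
+  //   non-CIFG: [input | cell | forget | output]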
+  // Index the scratch buffer pointers into the global scratch buffer.
+  float* input_gate_scratch = nullptr;
+  float* cell_gate_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_gate_scratch = scratch_buffer;
+    forget_gate_scratch = scratch_buffer + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer;
+    cell_gate_scratch = scratch_buffer + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer + 3 * n_cell * n_batch;
+  }
+
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
+  if (time_major) {
+    // Loop through the sequence.
+    const int input_step = n_batch * n_input;
+    const int output_step = n_batch * output_batch_leading_dim;
+    for (int t = 0; t < max_time; t++) {
+      // If this is the forward_sequence, step forward, otherwise step
+      // backwards.
+      const int t_rel = forward_sequence ? t : max_time - t - 1;
+      const float* input_ptr =
+          tflite::micro::GetTensorData<float>(input) + t_rel * input_step;
+      const float* aux_input_ptr = nullptr;
+      if (aux_input) {
+        aux_input_ptr = tflite::micro::GetTensorData<float>(aux_input) +
+                        t_rel * input_step;
+      }
+      float* output_ptr = tflite::micro::GetTensorData<float>(output) +
+                          t_rel * output_step + output_offset;
+
+      LstmStepFloat(
+          input_ptr,
+          input_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(input_to_input_weights),
+          input_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(input_to_forget_weights),
+          input_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(input_to_cell_weights),
+          input_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(input_to_output_weights),
+          aux_input_ptr,
+          aux_input_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    aux_input_to_input_weights),
+          aux_input_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    aux_input_to_forget_weights),
+          aux_input_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(aux_input_to_cell_weights),
+          aux_input_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    aux_input_to_output_weights),
+          recurrent_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    recurrent_to_input_weights),
+          recurrent_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    recurrent_to_forget_weights),
+          recurrent_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(recurrent_to_cell_weights),
+          recurrent_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    recurrent_to_output_weights),
+          cell_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(cell_to_input_weights),
+          cell_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(cell_to_forget_weights),
+          cell_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(cell_to_output_weights),
+          input_layer_norm_coefficients == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    input_layer_norm_coefficients),
+          forget_layer_norm_coefficients == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    forget_layer_norm_coefficients),
+          cell_layer_norm_coefficients == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    cell_layer_norm_coefficients),
+          output_layer_norm_coefficients == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(
+                    output_layer_norm_coefficients),
+          input_gate_bias == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(input_gate_bias),
+          forget_gate_bias == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(forget_gate_bias),
+          cell_gate_bias == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(cell_gate_bias),
+          output_gate_bias == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(output_gate_bias),
+          projection_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(projection_weights),
+          projection_bias == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<float>(projection_bias),
+          params, n_batch, n_cell, n_input, aux_input_size, n_output,
+          output_batch_leading_dim,
+          tflite::micro::GetTensorData<float>(output_state),
+          tflite::micro::GetTensorData<float>(cell_state), input_gate_scratch,
+          forget_gate_scratch, cell_gate_scratch, output_gate_scratch,
+          output_ptr);
+    }
+  } else {
+    for (int b = 0; b < n_batch; b++) {
+      const int input_step = n_input;
+      const int output_step = output_batch_leading_dim;
+      for (int t = 0; t < max_time; t++) {
+        // If this is the forward_sequence, step forward, otherwise step
+        // backwards.
+        const int t_rel = forward_sequence ? t : max_time - t - 1;
+        const int time_offset = b * max_time + t_rel;
+        const float* input_ptr = tflite::micro::GetTensorData<float>(input) +
+                                 time_offset * input_step;
+        const float* aux_input_ptr = nullptr;
+        if (aux_input) {
+          aux_input_ptr = tflite::micro::GetTensorData<float>(aux_input) +
+                          time_offset * input_step;
+        }
+        float* output_ptr = tflite::micro::GetTensorData<float>(output) +
+                            time_offset * output_step + output_offset;
+
+        // Offset the {output,cell}_state pointers to the right batch.
+        float* output_state_ptr =
+            tflite::micro::GetTensorData<float>(output_state) +
+            b * output_batch_leading_dim;
+        float* cell_state_ptr =
+            tflite::micro::GetTensorData<float>(cell_state) + b * n_cell;
+        // Offset the scratch pointers to the right batch.
+        float* input_gate_scratch_ptr =
+            input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr;
+        float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell;
+        float* cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell;
+        float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell;
+
+        LstmStepFloat(
+            input_ptr,
+            input_to_input_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(input_to_input_weights),
+            input_to_forget_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(input_to_forget_weights),
+            input_to_cell_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(input_to_cell_weights),
+            input_to_output_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(input_to_output_weights),
+            aux_input_ptr,
+            aux_input_to_input_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      aux_input_to_input_weights),
+            aux_input_to_forget_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      aux_input_to_forget_weights),
+            aux_input_to_cell_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      aux_input_to_cell_weights),
+            aux_input_to_output_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      aux_input_to_output_weights),
+            recurrent_to_input_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      recurrent_to_input_weights),
+            recurrent_to_forget_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      recurrent_to_forget_weights),
+            recurrent_to_cell_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      recurrent_to_cell_weights),
+            recurrent_to_output_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      recurrent_to_output_weights),
+            cell_to_input_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(cell_to_input_weights),
+            cell_to_forget_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(cell_to_forget_weights),
+            cell_to_output_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(cell_to_output_weights),
+            input_layer_norm_coefficients == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      input_layer_norm_coefficients),
+            forget_layer_norm_coefficients == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      forget_layer_norm_coefficients),
+            cell_layer_norm_coefficients == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      cell_layer_norm_coefficients),
+            output_layer_norm_coefficients == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(
+                      output_layer_norm_coefficients),
+            input_gate_bias == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(input_gate_bias),
+            forget_gate_bias == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(forget_gate_bias),
+            cell_gate_bias == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(cell_gate_bias),
+            output_gate_bias == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(output_gate_bias),
+            projection_weights == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(projection_weights),
+            projection_bias == nullptr
+                ? nullptr
+                : tflite::micro::GetTensorData<float>(projection_bias),
+            params,
+            /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output,
+            output_batch_leading_dim, output_state_ptr, cell_state_ptr,
+            input_gate_scratch_ptr, forget_gate_scratch_ptr,
+            cell_gate_scratch_ptr, output_gate_scratch_ptr, output_ptr);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
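+// Note on traversal, shared by the Eval*Lstm functions above and below: in
+// time_major mode a whole batch is fed to one LstmStep* call per timestep,
+// while in batch-major mode each batch entry is unrolled separately with
+// n_batch == 1 and the state/scratch pointers offset to that entry, so step t
+// of batch b writes to
+//   output + (b * max_time + t_rel) * output_step + output_offset.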
+TfLiteStatus EvalHybridLstm(
+    const HybridLstmScales* hybrid_lstm_scales, const TfLiteEvalTensor* input,
+    const TfLiteEvalTensor* input_to_input_weights,
+    const TfLiteEvalTensor* input_to_input_weights_ledger,
+    const TfLiteEvalTensor* input_to_forget_weights,
+    const TfLiteEvalTensor* input_to_forget_weights_ledger,
+    const TfLiteEvalTensor* input_to_cell_weights,
+    const TfLiteEvalTensor* input_to_cell_weights_ledger,
+    const TfLiteEvalTensor* input_to_output_weights,
+    const TfLiteEvalTensor* input_to_output_weights_ledger,
+    const TfLiteEvalTensor* recurrent_to_input_weights,
+    const TfLiteEvalTensor* recurrent_to_input_weights_ledger,
+    const TfLiteEvalTensor* recurrent_to_forget_weights,
+    const TfLiteEvalTensor* recurrent_to_forget_weights_ledger,
+    const TfLiteEvalTensor* recurrent_to_cell_weights,
+    const TfLiteEvalTensor* recurrent_to_cell_weights_ledger,
+    const TfLiteEvalTensor* recurrent_to_output_weights,
+    const TfLiteEvalTensor* recurrent_to_output_weights_ledger,
+    const TfLiteEvalTensor* cell_to_input_weights,
+    const TfLiteEvalTensor* cell_to_forget_weights,
+    const TfLiteEvalTensor* cell_to_output_weights,
+    const TfLiteEvalTensor* input_layer_norm_coefficients,
+    const TfLiteEvalTensor* forget_layer_norm_coefficients,
+    const TfLiteEvalTensor* cell_layer_norm_coefficients,
+    const TfLiteEvalTensor* output_layer_norm_coefficients,
+    const TfLiteEvalTensor* aux_input,
+    const TfLiteEvalTensor* aux_input_to_input_weights,
+    const TfLiteEvalTensor* aux_input_to_forget_weights,
+    const TfLiteEvalTensor* aux_input_to_cell_weights,
+    const TfLiteEvalTensor* aux_input_to_output_weights,
+    const TfLiteEvalTensor* input_gate_bias,
+    const TfLiteEvalTensor* forget_gate_bias,
+    const TfLiteEvalTensor* cell_gate_bias,
+    const TfLiteEvalTensor* output_gate_bias,
+    const TfLiteEvalTensor* projection_weights,
+    const TfLiteEvalTensor* projection_weights_ledger,
+    const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params,
+    bool forward_sequence, bool time_major, int output_offset,
+    float* scratch_buffer, float* input_sf, float* aux_input_sf,
+    float* output_state_sf, float* prod_scaling_factors,
+    float* recovered_cell_weights, int8_t* input_quantized,
+    int8_t* aux_input_quantized, int8_t* output_state_quantized,
+    int8_t* cell_state_quantized, float* scales,
+    TfLiteEvalTensor* output_state, TfLiteEvalTensor* cell_state,
+    int32_t* output_scratch_buffer, TfLiteEvalTensor* output,
+    int32_t* input_zp, int32_t* aux_input_zp, int32_t* output_state_zp,
+    int32_t* row_sums, int row_sums_size, bool* compute_row_sums) {
+  TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3);
+  const int n_input = input->dims->data[input->dims->size - 1];
+  int max_time, n_batch;
+  if (input->dims->size == 2) {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  } else {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  }
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+
+  float* input_gate_scratch = nullptr;
+  float* cell_gate_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_gate_scratch = scratch_buffer;
+    forget_gate_scratch = scratch_buffer + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer;
+    cell_gate_scratch = scratch_buffer + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer + 3 * n_cell * n_batch;
+  }
+
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
+
+  int32_t* input_zp_ptr = nullptr;
+  int32_t* aux_input_zp_ptr = nullptr;
+  int32_t* output_state_zp_ptr = nullptr;
+  int32_t* row_sums_ptr = nullptr;
+  if (params->asymmetric_quantize_inputs) {
+    input_zp_ptr = input_zp;
+    aux_input_zp_ptr = aux_input_zp;
+    output_state_zp_ptr = output_state_zp;
+    row_sums_ptr = row_sums;
+  }
+
+  if (time_major) {
+    // Feed the sequence into the LSTM step-by-step.
+    const int input_step = n_batch * n_input;
+    const int output_step = n_batch * output_batch_leading_dim;
+    for (int t = 0; t < max_time; t++) {
+      // If this is the forward_sequence, step forward, otherwise step
+      // backwards.
+      const int t_rel = forward_sequence ? t : max_time - t - 1;
+      const float* input_ptr =
+          tflite::micro::GetTensorData<float>(input) + t_rel * input_step;
+      const float* aux_input_ptr = nullptr;
+      if (aux_input) {
+        aux_input_ptr = tflite::micro::GetTensorData<float>(aux_input) +
+                        t_rel * input_step;
+      }
+      float* output_ptr = tflite::micro::GetTensorData<float>(output) +
+                          t_rel * output_step + output_offset;
+      LstmStepHybrid(
+          input_ptr,
+          input_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(input_to_input_weights),
+          input_to_input_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    input_to_input_weights_ledger),
+          hybrid_lstm_scales->input_to_input_weights_scale,
+          input_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(input_to_forget_weights),
+          input_to_forget_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    input_to_forget_weights_ledger),
+          hybrid_lstm_scales->input_to_forget_weights_scale,
+          input_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(input_to_cell_weights),
+          input_to_cell_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    input_to_cell_weights_ledger),
+          hybrid_lstm_scales->input_to_cell_weights_scale,
+          input_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(input_to_output_weights),
+          input_to_output_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    input_to_output_weights_ledger),
+          hybrid_lstm_scales->input_to_output_weights_scale, aux_input_ptr,
+          aux_input_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    aux_input_to_input_weights),
+          hybrid_lstm_scales->aux_input_to_input_weights_scale,
+          aux_input_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    aux_input_to_forget_weights),
+          hybrid_lstm_scales->aux_input_to_forget_weights_scale,
+          aux_input_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    aux_input_to_cell_weights),
+          hybrid_lstm_scales->aux_input_to_cell_weights_scale,
+          aux_input_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    aux_input_to_output_weights),
+          hybrid_lstm_scales->aux_input_to_output_weights_scale,
+          recurrent_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    recurrent_to_input_weights),
+          recurrent_to_input_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    recurrent_to_input_weights_ledger),
+          hybrid_lstm_scales->recurrent_to_input_weights_scale,
+          recurrent_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    recurrent_to_forget_weights),
+          recurrent_to_forget_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    recurrent_to_forget_weights_ledger),
+          hybrid_lstm_scales->recurrent_to_forget_weights_scale,
+          recurrent_to_cell_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    recurrent_to_cell_weights),
+          recurrent_to_cell_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    recurrent_to_cell_weights_ledger),
+          hybrid_lstm_scales->recurrent_to_cell_weights_scale,
+          recurrent_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(
+                    recurrent_to_output_weights),
+          recurrent_to_output_weights_ledger == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<uint8_t>(
+                    recurrent_to_output_weights_ledger),
+          hybrid_lstm_scales->recurrent_to_output_weights_scale,
+          cell_to_input_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(cell_to_input_weights),
+          hybrid_lstm_scales->cell_to_input_weights_scale,
+          cell_to_forget_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(cell_to_forget_weights),
+          hybrid_lstm_scales->cell_to_forget_weights_scale,
+          cell_to_output_weights == nullptr
+              ? nullptr
+              : tflite::micro::GetTensorData<int8_t>(cell_to_output_weights),
+          hybrid_lstm_scales->cell_to_output_weights_scale,
+          input_layer_norm_coefficients == nullptr
nullptr + : tflite::micro::GetTensorData( + input_layer_norm_coefficients), + forget_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + forget_layer_norm_coefficients), + cell_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + cell_layer_norm_coefficients), + output_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + output_layer_norm_coefficients), + input_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_gate_bias), + forget_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(forget_gate_bias), + cell_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_gate_bias), + output_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(output_gate_bias), + projection_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_weights), + projection_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + projection_weights_ledger), + hybrid_lstm_scales->projection_weights_scale, + projection_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_bias), + params, n_batch, n_cell, n_input, aux_input_size, n_output, + output_batch_leading_dim, input_gate_scratch, forget_gate_scratch, + cell_gate_scratch, output_gate_scratch, scales, input_sf, + aux_input_sf, output_state_sf, prod_scaling_factors, + recovered_cell_weights, input_quantized, aux_input_quantized, + output_state_quantized, cell_state_quantized, + tflite::micro::GetTensorData(output_state), + tflite::micro::GetTensorData(cell_state), + output_scratch_buffer, output_ptr, input_zp_ptr, aux_input_zp_ptr, + output_state_zp_ptr, row_sums_ptr, row_sums_size, compute_row_sums, + params->asymmetric_quantize_inputs); + } + } else { + for (int b = 0; b < n_batch; b++) { + const int input_step = n_input; + const int output_step = output_batch_leading_dim; + for (int t = 0; t < max_time; t++) { + // If this is the forward_sequence, step forward, otherwise step + // backwards. + const int t_rel = forward_sequence ? t : max_time - t - 1; + const int time_offset = b * max_time + t_rel; + const float* input_ptr = tflite::micro::GetTensorData(input) + + time_offset * input_step; + const float* aux_input_ptr = nullptr; + if (aux_input) { + aux_input_ptr = tflite::micro::GetTensorData(aux_input) + + time_offset * input_step; + } + float* output_ptr = tflite::micro::GetTensorData(output) + + time_offset * output_step + output_offset; + + // Offset the {output,cell}_state pointers to the right batch. + float* output_state_ptr = + tflite::micro::GetTensorData(output_state) + + b * output_batch_leading_dim; + float* cell_state_ptr = + tflite::micro::GetTensorData(cell_state) + b * n_cell; + // Offset the scratch pointers to the right batch. + float* input_gate_scratch_ptr = + input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr; + float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell; + float* cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell; + float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell; + + LstmStepHybrid( + input_ptr, + input_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_input_weights), + input_to_input_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_to_input_weights_ledger), + hybrid_lstm_scales->input_to_input_weights_scale, + input_to_forget_weights == nullptr + ? 
nullptr + : tflite::micro::GetTensorData(input_to_forget_weights), + input_to_forget_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_to_forget_weights_ledger), + hybrid_lstm_scales->input_to_forget_weights_scale, + input_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_cell_weights), + input_to_cell_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_to_cell_weights_ledger), + hybrid_lstm_scales->input_to_cell_weights_scale, + input_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_output_weights), + input_to_output_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_to_output_weights_ledger), + hybrid_lstm_scales->input_to_output_weights_scale, aux_input_ptr, + aux_input_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + aux_input_to_input_weights), + hybrid_lstm_scales->aux_input_to_input_weights_scale, + aux_input_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + aux_input_to_forget_weights), + hybrid_lstm_scales->aux_input_to_forget_weights_scale, + aux_input_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + aux_input_to_cell_weights), + hybrid_lstm_scales->aux_input_to_cell_weights_scale, + aux_input_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + aux_input_to_output_weights), + hybrid_lstm_scales->aux_input_to_output_weights_scale, + recurrent_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_input_weights), + recurrent_to_input_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_input_weights_ledger), + hybrid_lstm_scales->recurrent_to_input_weights_scale, + recurrent_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_forget_weights), + recurrent_to_forget_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_forget_weights_ledger), + hybrid_lstm_scales->recurrent_to_forget_weights_scale, + recurrent_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_cell_weights), + recurrent_to_cell_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_cell_weights_ledger), + hybrid_lstm_scales->recurrent_to_cell_weights_scale, + recurrent_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_output_weights), + recurrent_to_output_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_output_weights_ledger), + hybrid_lstm_scales->recurrent_to_output_weights_scale, + cell_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_input_weights), + hybrid_lstm_scales->cell_to_input_weights_scale, + cell_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_forget_weights), + hybrid_lstm_scales->cell_to_forget_weights_scale, + cell_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_output_weights), + hybrid_lstm_scales->cell_to_output_weights_scale, + input_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_layer_norm_coefficients), + forget_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + forget_layer_norm_coefficients), + cell_layer_norm_coefficients == nullptr + ? 
nullptr + : tflite::micro::GetTensorData( + cell_layer_norm_coefficients), + output_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + output_layer_norm_coefficients), + input_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_gate_bias), + forget_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(forget_gate_bias), + cell_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_gate_bias), + output_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(output_gate_bias), + projection_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_weights), + projection_weights_ledger == nullptr + ? nullptr + : tflite::micro::GetTensorData( + projection_weights_ledger), + hybrid_lstm_scales->projection_weights_scale, + projection_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_bias), + params, + /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output, + output_batch_leading_dim, input_gate_scratch_ptr, + forget_gate_scratch_ptr, cell_gate_scratch_ptr, + output_gate_scratch_ptr, scales, input_sf, aux_input_sf, + output_state_sf, prod_scaling_factors, recovered_cell_weights, + input_quantized, aux_input_quantized, output_state_quantized, + cell_state_quantized, output_state_ptr, cell_state_ptr, + output_scratch_buffer, output_ptr, input_zp_ptr, aux_input_zp_ptr, + output_state_zp_ptr, row_sums_ptr, row_sums_size, compute_row_sums, + params->asymmetric_quantize_inputs); + } + } + } + + return kTfLiteOk; +} + +TfLiteStatus EvalInteger8x8_16Lstm( + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const TfLiteEvalTensor* cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + bool forward_sequence, bool time_major, + const IntegerLstmParameter* integer_lstm_param, int32_t output_state_zp, + TfLiteEvalTensor* output_state, TfLiteEvalTensor* cell_state, + TfLiteEvalTensor* output, int16_t* scratch0, int16_t* scratch1, + int16_t* scratch2, int16_t* scratch3, int8_t* scratch4, int32_t* scratch5) { + TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3); + const int n_input = input->dims->data[input->dims->size - 1]; + int max_time, n_batch; + if (input->dims->size == 2) { + max_time = 1; + n_batch = input->dims->data[0]; + } else { + max_time = (time_major) ? input->dims->data[0] : input->dims->data[1]; + n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0]; + } + + // n_cell and n_output will be the same size when there is no projection. 
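+  // (Worked example, shapes hypothetical: with input_to_output_weights of
+  // shape {n_cell, n_input} = {20, 4} and recurrent_to_output_weights of
+  // shape {n_cell, n_output} = {20, 16}, the two lines below give
+  // n_cell == 20 and n_output == 16.)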
+ const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Get params for time/batch/sequence. + const int output_batch_leading_dim = + output->dims->data[output->dims->size - 1]; + + if (time_major) { + const int input_step = n_batch * n_input; + const int output_step = n_batch * output_batch_leading_dim; + for (int t = 0; t < max_time; t++) { + const int t_rel = t; + int8_t* output_ptr = + tflite::micro::GetTensorData(output) + t_rel * output_step; + const int8_t* input_ptr = + tflite::micro::GetTensorData(input) + t_rel * input_step; + LstmStepInteger8x8_16( + input_ptr, + input_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_input_weights), + integer_lstm_param->effective_input_to_input_scale_a, + integer_lstm_param->effective_input_to_input_scale_b, + input_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_forget_weights), + integer_lstm_param->effective_input_to_forget_scale_a, + integer_lstm_param->effective_input_to_forget_scale_b, + input_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_cell_weights), + integer_lstm_param->effective_input_to_cell_scale_a, + integer_lstm_param->effective_input_to_cell_scale_b, + input_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_output_weights), + integer_lstm_param->effective_input_to_output_scale_a, + integer_lstm_param->effective_input_to_output_scale_b, + recurrent_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_input_weights), + integer_lstm_param->effective_recurrent_to_input_scale_a, + integer_lstm_param->effective_recurrent_to_input_scale_b, + recurrent_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_forget_weights), + integer_lstm_param->effective_recurrent_to_forget_scale_a, + integer_lstm_param->effective_recurrent_to_forget_scale_b, + recurrent_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(recurrent_to_cell_weights), + integer_lstm_param->effective_recurrent_to_cell_scale_a, + integer_lstm_param->effective_recurrent_to_cell_scale_b, + recurrent_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_output_weights), + integer_lstm_param->effective_recurrent_to_output_scale_a, + integer_lstm_param->effective_recurrent_to_output_scale_b, + cell_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_input_weights), + integer_lstm_param->effective_cell_to_input_scale_a, + integer_lstm_param->effective_cell_to_input_scale_b, + cell_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_forget_weights), + integer_lstm_param->effective_cell_to_forget_scale_a, + integer_lstm_param->effective_cell_to_forget_scale_b, + cell_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_output_weights), + integer_lstm_param->effective_cell_to_output_scale_a, + integer_lstm_param->effective_cell_to_output_scale_b, + projection_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_weights), + integer_lstm_param->effective_proj_scale_a, + integer_lstm_param->effective_proj_scale_b, + integer_lstm_param->hidden_zp, + integer_lstm_param->effective_hidden_scale_a, + integer_lstm_param->effective_hidden_scale_b, + input_layer_norm_coefficients == nullptr + ? 
nullptr + : tflite::micro::GetTensorData( + input_layer_norm_coefficients), + integer_lstm_param->layer_norm_input_scale_a, + integer_lstm_param->layer_norm_input_scale_b, + forget_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + forget_layer_norm_coefficients), + integer_lstm_param->layer_norm_forget_scale_a, + integer_lstm_param->layer_norm_forget_scale_b, + cell_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + cell_layer_norm_coefficients), + integer_lstm_param->layer_norm_cell_scale_a, + integer_lstm_param->layer_norm_cell_scale_b, + output_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + output_layer_norm_coefficients), + integer_lstm_param->layer_norm_output_scale_a, + integer_lstm_param->layer_norm_output_scale_b, + input_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_gate_bias), + forget_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(forget_gate_bias), + cell_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_gate_bias), + output_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(output_gate_bias), + integer_lstm_param->quantized_cell_clip, + integer_lstm_param->quantized_proj_clip, + integer_lstm_param->cell_scale, + integer_lstm_param->input_variance_guard, + integer_lstm_param->forget_variance_guard, + integer_lstm_param->cell_variance_guard, + integer_lstm_param->output_variance_guard, + integer_lstm_param->input_to_forget_effective_bias, + integer_lstm_param->recurrent_to_forget_effective_bias, + integer_lstm_param->input_to_cell_effective_bias, + integer_lstm_param->recurrent_to_cell_effective_bias, + integer_lstm_param->input_to_output_effective_bias, + integer_lstm_param->recurrent_to_output_effective_bias, + integer_lstm_param->input_to_input_effective_bias, + integer_lstm_param->recurrent_to_input_effective_bias, + integer_lstm_param->projection_effective_bias, n_batch, n_cell, + n_input, n_output, tflite::micro::GetTensorData(output_state), + output_state_zp, tflite::micro::GetTensorData(cell_state), + output_ptr, scratch0, scratch1, scratch2, scratch3, scratch4, + scratch5); + } + } else { + for (int b = 0; b < n_batch; b++) { + const int input_step = n_input; + const int output_step = output_batch_leading_dim; + for (int t = 0; t < max_time; t++) { + // If this is the forward_sequence, step forward, otherwise step + // backwards. + const int t_rel = forward_sequence ? t : max_time - t - 1; + const int time_offset = b * max_time + t_rel; + const int8_t* input_ptr = tflite::micro::GetTensorData(input) + + time_offset * input_step; + int8_t* output_ptr = tflite::micro::GetTensorData(output) + + time_offset * output_step; + + // Offset the {output,cell}_state pointers to the right batch. + int8_t* output_state_ptr = + tflite::micro::GetTensorData(output_state) + + b * output_batch_leading_dim; + int16_t* cell_state_ptr = + tflite::micro::GetTensorData(cell_state) + b * n_cell; + + LstmStepInteger8x8_16( + input_ptr, + input_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_input_weights), + integer_lstm_param->effective_input_to_input_scale_a, + integer_lstm_param->effective_input_to_input_scale_b, + input_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_forget_weights), + integer_lstm_param->effective_input_to_forget_scale_a, + integer_lstm_param->effective_input_to_forget_scale_b, + input_to_cell_weights == nullptr + ? 
nullptr + : tflite::micro::GetTensorData(input_to_cell_weights), + integer_lstm_param->effective_input_to_cell_scale_a, + integer_lstm_param->effective_input_to_cell_scale_b, + input_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_output_weights), + integer_lstm_param->effective_input_to_output_scale_a, + integer_lstm_param->effective_input_to_output_scale_b, + recurrent_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_input_weights), + integer_lstm_param->effective_recurrent_to_input_scale_a, + integer_lstm_param->effective_recurrent_to_input_scale_b, + recurrent_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_forget_weights), + integer_lstm_param->effective_recurrent_to_forget_scale_a, + integer_lstm_param->effective_recurrent_to_forget_scale_b, + recurrent_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_cell_weights), + integer_lstm_param->effective_recurrent_to_cell_scale_a, + integer_lstm_param->effective_recurrent_to_cell_scale_b, + recurrent_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData( + recurrent_to_output_weights), + integer_lstm_param->effective_recurrent_to_output_scale_a, + integer_lstm_param->effective_recurrent_to_output_scale_b, + cell_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_input_weights), + integer_lstm_param->effective_cell_to_input_scale_a, + integer_lstm_param->effective_cell_to_input_scale_b, + cell_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_forget_weights), + integer_lstm_param->effective_cell_to_forget_scale_a, + integer_lstm_param->effective_cell_to_forget_scale_b, + cell_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_output_weights), + integer_lstm_param->effective_cell_to_output_scale_a, + integer_lstm_param->effective_cell_to_output_scale_b, + projection_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_weights), + integer_lstm_param->effective_proj_scale_a, + integer_lstm_param->effective_proj_scale_b, + integer_lstm_param->hidden_zp, + integer_lstm_param->effective_hidden_scale_a, + integer_lstm_param->effective_hidden_scale_b, + input_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_layer_norm_coefficients), + integer_lstm_param->layer_norm_input_scale_a, + integer_lstm_param->layer_norm_input_scale_b, + forget_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + forget_layer_norm_coefficients), + integer_lstm_param->layer_norm_forget_scale_a, + integer_lstm_param->layer_norm_forget_scale_b, + cell_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + cell_layer_norm_coefficients), + integer_lstm_param->layer_norm_cell_scale_a, + integer_lstm_param->layer_norm_cell_scale_b, + output_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + output_layer_norm_coefficients), + integer_lstm_param->layer_norm_output_scale_a, + integer_lstm_param->layer_norm_output_scale_b, + input_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_gate_bias), + forget_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(forget_gate_bias), + cell_gate_bias == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_gate_bias), + output_gate_bias == nullptr + ? 
nullptr + : tflite::micro::GetTensorData(output_gate_bias), + integer_lstm_param->quantized_cell_clip, + integer_lstm_param->quantized_proj_clip, + integer_lstm_param->cell_scale, + integer_lstm_param->input_variance_guard, + integer_lstm_param->forget_variance_guard, + integer_lstm_param->cell_variance_guard, + integer_lstm_param->output_variance_guard, + integer_lstm_param->input_to_forget_effective_bias, + integer_lstm_param->recurrent_to_forget_effective_bias, + integer_lstm_param->input_to_cell_effective_bias, + integer_lstm_param->recurrent_to_cell_effective_bias, + integer_lstm_param->input_to_output_effective_bias, + integer_lstm_param->recurrent_to_output_effective_bias, + integer_lstm_param->input_to_input_effective_bias, + integer_lstm_param->recurrent_to_input_effective_bias, + integer_lstm_param->projection_effective_bias, /*n_batch=*/1, + n_cell, n_input, n_output, output_state_ptr, output_state_zp, + cell_state_ptr, output_ptr, scratch0, scratch1, scratch2, scratch3, + scratch4, scratch5); + } + } + } + + return kTfLiteOk; +} + +TfLiteStatus EvalInteger8x8_8Lstm( + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const TfLiteEvalTensor* cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + TfLiteEvalTensor* output_state, TfLiteEvalTensor* cell_state, + TfLiteEvalTensor* output, const IntegerLstmParameter* integer_lstm_param, + int32_t input_zp, int32_t output_state_zp, int8_t* scratch0, + int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, int16_t* scratch4, + int16_t* scratch5, int16_t* scratch6, int16_t* scratch7) { + TFLITE_DCHECK(input->dims->size >= 2 && input->dims->size <= 3); + const int n_input = input->dims->data[input->dims->size - 1]; + int max_time, n_batch; + if (input->dims->size == 2) { + max_time = 1; + n_batch = input->dims->data[0]; + } else { + max_time = input->dims->data[0]; + n_batch = input->dims->data[1]; + } + + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Get params for time/batch/sequence. + const int output_batch_leading_dim = + output->dims->data[output->dims->size - 1]; + const int input_step = n_batch * n_input; + const int output_step = n_batch * output_batch_leading_dim; + + for (int t = 0; t < max_time; t++) { + const int t_rel = t; + int8_t* output_ptr = + tflite::micro::GetTensorData(output) + t_rel * output_step; + // Input can be int8 asymmetric or int16 symmetric. 
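+    // (input_zp below is the input zero point: non-zero for asymmetric int8
+    // input and 0 for symmetric int16 input, so the zero-point subtraction
+    // inside the step function drops out in the symmetric case.)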
+ const int8_t* input_ptr = + tflite::micro::GetTensorData(input) + t_rel * input_step; + LstmStepInteger8x8_8( + input_ptr, input_zp, + + input_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_input_weights), + integer_lstm_param->effective_input_to_input_scale_a, + integer_lstm_param->effective_input_to_input_scale_b, + + input_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_forget_weights), + integer_lstm_param->effective_input_to_forget_scale_a, + integer_lstm_param->effective_input_to_forget_scale_b, + + input_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_cell_weights), + integer_lstm_param->effective_input_to_cell_scale_a, + integer_lstm_param->effective_input_to_cell_scale_b, + + input_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(input_to_output_weights), + integer_lstm_param->effective_input_to_output_scale_a, + integer_lstm_param->effective_input_to_output_scale_b, + + recurrent_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(recurrent_to_input_weights), + integer_lstm_param->effective_recurrent_to_input_scale_a, + integer_lstm_param->effective_recurrent_to_input_scale_b, + + recurrent_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(recurrent_to_forget_weights), + integer_lstm_param->effective_recurrent_to_forget_scale_a, + integer_lstm_param->effective_recurrent_to_forget_scale_b, + + recurrent_to_cell_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(recurrent_to_cell_weights), + integer_lstm_param->effective_recurrent_to_cell_scale_a, + integer_lstm_param->effective_recurrent_to_cell_scale_b, + + recurrent_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(recurrent_to_output_weights), + integer_lstm_param->effective_recurrent_to_output_scale_a, + integer_lstm_param->effective_recurrent_to_output_scale_b, + + cell_to_input_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_input_weights), + integer_lstm_param->effective_cell_to_input_scale_a, + integer_lstm_param->effective_cell_to_input_scale_b, + + cell_to_forget_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_forget_weights), + integer_lstm_param->effective_cell_to_forget_scale_a, + integer_lstm_param->effective_cell_to_forget_scale_b, + + cell_to_output_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(cell_to_output_weights), + integer_lstm_param->effective_cell_to_output_scale_a, + integer_lstm_param->effective_cell_to_output_scale_b, + + projection_weights == nullptr + ? nullptr + : tflite::micro::GetTensorData(projection_weights), + integer_lstm_param->effective_proj_scale_a, + integer_lstm_param->effective_proj_scale_b, + + input_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + input_layer_norm_coefficients), + integer_lstm_param->layer_norm_input_scale_a, + integer_lstm_param->layer_norm_input_scale_b, + + forget_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + forget_layer_norm_coefficients), + integer_lstm_param->layer_norm_forget_scale_a, + integer_lstm_param->layer_norm_forget_scale_b, + + cell_layer_norm_coefficients == nullptr + ? nullptr + : tflite::micro::GetTensorData( + cell_layer_norm_coefficients), + integer_lstm_param->layer_norm_cell_scale_a, + integer_lstm_param->layer_norm_cell_scale_b, + + output_layer_norm_coefficients == nullptr + ? 
nullptr
+            : tflite::micro::GetTensorData<int16_t>(
+                  output_layer_norm_coefficients),
+        integer_lstm_param->layer_norm_output_scale_a,
+        integer_lstm_param->layer_norm_output_scale_b,
+
+        input_gate_bias == nullptr
+            ? nullptr
+            : tflite::micro::GetTensorData<int32_t>(input_gate_bias),
+        forget_gate_bias == nullptr
+            ? nullptr
+            : tflite::micro::GetTensorData<int32_t>(forget_gate_bias),
+        cell_gate_bias == nullptr
+            ? nullptr
+            : tflite::micro::GetTensorData<int32_t>(cell_gate_bias),
+        output_gate_bias == nullptr
+            ? nullptr
+            : tflite::micro::GetTensorData<int32_t>(output_gate_bias),
+        projection_bias == nullptr
+            ? nullptr
+            : tflite::micro::GetTensorData<int32_t>(projection_bias),
+
+        params, integer_lstm_param->intermediate_scale_a,
+        integer_lstm_param->intermediate_scale_b,
+        integer_lstm_param->intermediate_zp,
+        integer_lstm_param->quantized_cell_clip,
+        integer_lstm_param->quantized_proj_clip, n_batch, n_cell, n_input,
+        n_output, output_batch_leading_dim,
+        tflite::micro::GetTensorData<int8_t>(output_state), output_state_zp,
+        tflite::micro::GetTensorData<int16_t>(cell_state), output_ptr,
+        scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6,
+        scratch7);
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.h
new file mode 100644
index 00000000..218b4938
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_eval.h
@@ -0,0 +1,250 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_LSTM_EVAL_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_LSTM_EVAL_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+
+// Parameters for integer LSTM.
+// Consider splitting this into two Integer Parameters if more fields are
+// added.
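+//
+// A sketch of the (scale_a, scale_b) pairs stored below, assuming the usual
+// TfLite quantized-multiplier convention (see MultiplyByQuantizedMultiplier):
+//   real_scale ~= scale_a * 2^(scale_b - 31)
+// e.g. a hypothetical real scale of ~0.0015 could be held as
+// scale_a = 1610612736 (0.75 in Q31) with scale_b = -9.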
+struct IntegerLstmParameter { + int32_t effective_input_to_input_scale_a; + int32_t effective_input_to_input_scale_b; + int32_t effective_recurrent_to_input_scale_a; + int32_t effective_recurrent_to_input_scale_b; + int32_t effective_cell_to_input_scale_a; + int32_t effective_cell_to_input_scale_b; + int32_t effective_input_to_forget_scale_a; + int32_t effective_input_to_forget_scale_b; + int32_t effective_recurrent_to_forget_scale_a; + int32_t effective_recurrent_to_forget_scale_b; + int32_t effective_cell_to_forget_scale_a; + int32_t effective_cell_to_forget_scale_b; + int32_t effective_input_to_cell_scale_a; + int32_t effective_input_to_cell_scale_b; + int32_t effective_recurrent_to_cell_scale_a; + int32_t effective_recurrent_to_cell_scale_b; + int32_t effective_input_to_output_scale_a; + int32_t effective_input_to_output_scale_b; + int32_t effective_recurrent_to_output_scale_a; + int32_t effective_recurrent_to_output_scale_b; + int32_t effective_cell_to_output_scale_a; + int32_t effective_cell_to_output_scale_b; + int32_t effective_proj_scale_a; + int32_t effective_proj_scale_b; + int32_t effective_hidden_scale_a; + int32_t effective_hidden_scale_b; + int32_t layer_norm_input_scale_a; + int32_t layer_norm_input_scale_b; + int32_t layer_norm_forget_scale_a; + int32_t layer_norm_forget_scale_b; + int32_t layer_norm_cell_scale_a; + int32_t layer_norm_cell_scale_b; + int32_t layer_norm_output_scale_a; + int32_t layer_norm_output_scale_b; + // Quantized clip value for cell and projection. Zero value means no clipping. + int16_t quantized_cell_clip; + int8_t quantized_proj_clip; + int32_t hidden_zp; + int32_t cell_scale; + + int32_t input_variance_guard; + int32_t forget_variance_guard; + int32_t cell_variance_guard; + int32_t output_variance_guard; + + // Pre-calculate bias + zero_point * weight. + int32_t* input_to_forget_effective_bias; + int32_t* recurrent_to_forget_effective_bias; + int32_t* input_to_cell_effective_bias; + int32_t* recurrent_to_cell_effective_bias; + int32_t* input_to_output_effective_bias; + int32_t* recurrent_to_output_effective_bias; + int32_t* input_to_input_effective_bias; + int32_t* recurrent_to_input_effective_bias; + int32_t* projection_effective_bias; + + // Scale and zero point for intermediate tensors. + // Used only in the 8x8_8 case. 
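+  // (These arrays are passed to LstmStepInteger8x8_8 whole: each intermediate
+  // tensor gets its own (scale_a, scale_b) pair and zero point.)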
+ int32_t intermediate_scale_a[8]; + int32_t intermediate_scale_b[8]; + int32_t intermediate_zp[12]; +}; + +// Scales for hybrid op with integer inputs and float weights +struct HybridLstmScales { + float input_to_input_weights_scale; + float input_to_forget_weights_scale; + float input_to_cell_weights_scale; + float input_to_output_weights_scale; + float aux_input_to_input_weights_scale; + float aux_input_to_forget_weights_scale; + float aux_input_to_cell_weights_scale; + float aux_input_to_output_weights_scale; + float recurrent_to_input_weights_scale; + float recurrent_to_forget_weights_scale; + float recurrent_to_cell_weights_scale; + float recurrent_to_output_weights_scale; + float cell_to_input_weights_scale; + float cell_to_forget_weights_scale; + float cell_to_output_weights_scale; + float projection_weights_scale; +}; + +TfLiteStatus EvalFloatLstm( + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const TfLiteEvalTensor* cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* aux_input, + const TfLiteEvalTensor* aux_input_to_input_weights, + const TfLiteEvalTensor* aux_input_to_forget_weights, + const TfLiteEvalTensor* aux_input_to_cell_weights, + const TfLiteEvalTensor* aux_input_to_output_weights, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + bool forward_sequence, bool time_major, int output_offset, + float* scratch_buffer, TfLiteEvalTensor* output_state, + TfLiteEvalTensor* cell_state, TfLiteEvalTensor* output); + +TfLiteStatus EvalHybridLstm( + const HybridLstmScales* hybrid_lstm_scales, const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_input_weights_ledger, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_forget_weights_ledger, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_cell_weights_ledger, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* input_to_output_weights_ledger, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_input_weights_ledger, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights_ledger, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights_ledger, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* recurrent_to_output_weights_ledger, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const TfLiteEvalTensor* 
cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* aux_input, + const TfLiteEvalTensor* aux_input_to_input_weights, + const TfLiteEvalTensor* aux_input_to_forget_weights, + const TfLiteEvalTensor* aux_input_to_cell_weights, + const TfLiteEvalTensor* aux_input_to_output_weights, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_weights_ledger, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + bool forward_sequence, bool time_major, int output_offset, + float* scratch_buffer, float* input_sf, float* aux_input_sf, + float* output_state_sf, float* prod_scaling_factors, + float* recovered_cell_weights, int8_t* input_quantized, + int8_t* aux_input_quantized, int8_t* output_state_quantized, + int8_t* cell_state_quantized, float* scales, TfLiteEvalTensor* output_state, + TfLiteEvalTensor* cell_state, int32_t* output_scratch_buffer, + TfLiteEvalTensor* output, int32_t* input_zp, int32_t* aux_input_zp, + int32_t* output_state_zp, int32_t* row_sums, int row_sums_size, + bool* compute_row_sums); + +TfLiteStatus EvalInteger8x8_16Lstm( + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const TfLiteEvalTensor* cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + bool forward_sequence, bool time_major, + const IntegerLstmParameter* integer_lstm_param, int32_t output_state_zp, + TfLiteEvalTensor* output_state, TfLiteEvalTensor* cell_state, + TfLiteEvalTensor* output, int16_t* scratch0, int16_t* scratch1, + int16_t* scratch2, int16_t* scratch3, int8_t* scratch4, int32_t* scratch5); + +TfLiteStatus EvalInteger8x8_8Lstm( + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* input_to_input_weights, + const TfLiteEvalTensor* input_to_forget_weights, + const TfLiteEvalTensor* input_to_cell_weights, + const TfLiteEvalTensor* input_to_output_weights, + const TfLiteEvalTensor* recurrent_to_input_weights, + const TfLiteEvalTensor* recurrent_to_forget_weights, + const TfLiteEvalTensor* recurrent_to_cell_weights, + const TfLiteEvalTensor* recurrent_to_output_weights, + const TfLiteEvalTensor* cell_to_input_weights, + const TfLiteEvalTensor* cell_to_forget_weights, + const 
TfLiteEvalTensor* cell_to_output_weights, + const TfLiteEvalTensor* input_layer_norm_coefficients, + const TfLiteEvalTensor* forget_layer_norm_coefficients, + const TfLiteEvalTensor* cell_layer_norm_coefficients, + const TfLiteEvalTensor* output_layer_norm_coefficients, + const TfLiteEvalTensor* input_gate_bias, + const TfLiteEvalTensor* forget_gate_bias, + const TfLiteEvalTensor* cell_gate_bias, + const TfLiteEvalTensor* output_gate_bias, + const TfLiteEvalTensor* projection_weights, + const TfLiteEvalTensor* projection_bias, const TfLiteLSTMParams* params, + TfLiteEvalTensor* output_state, TfLiteEvalTensor* cell_state, + TfLiteEvalTensor* output, const IntegerLstmParameter* integer_lstm_param, + int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, + int16_t* scratch4, int16_t* scratch5, int16_t* scratch6, int16_t* scratch7); + +} // namespace tflite +#endif // TENSORFLOW_LITE_MICRO_KERNELS_LSTM_EVAL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_shared.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_shared.h new file mode 100644 index 00000000..ee34b848 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/lstm_shared.h @@ -0,0 +1,67 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_LSTM_SHARED_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_LSTM_SHARED_H_ + +namespace tflite { + +// Input Tensors of size {n_batch, n_input} +constexpr int kLstmInputTensor = 0; + +// Input weight tensors of size: {n_cell, n_input} +constexpr int kLstmInputToInputWeightsTensor = 1; // Optional +constexpr int kLstmInputToForgetWeightsTensor = 2; +constexpr int kLstmInputToCellWeightsTensor = 3; +constexpr int kLstmInputToOutputWeightsTensor = 4; + +// Recurrent weight tensors of size {n_cell, n_output} +constexpr int kLstmRecurrentToInputWeightsTensor = 5; // Optional +constexpr int kLstmRecurrentToForgetWeightsTensor = 6; +constexpr int kLstmRecurrentToCellWeightsTensor = 7; +constexpr int kLstmRecurrentToOutputWeightsTensor = 8; + +// Peephole weights tensors of size {n_cell}, representing a diagonal matrix. +constexpr int kLstmCellToInputWeightsTensor = 9; // Optional +constexpr int kLstmCellToForgetWeightsTensor = 10; // Optional +constexpr int kLstmCellToOutputWeightsTensor = 11; // Optional + +// Gates bias tensors of size {n_cell} +constexpr int kLstmInputGateBiasTensor = 12; // Optional +constexpr int kLstmForgetGateBiasTensor = 13; +constexpr int kLstmCellGateBiasTensor = 14; +constexpr int kLstmOutputGateBiasTensor = 15; + +// Projection weight tensor of size {n_output, n_cell} +constexpr int kLstmProjectionWeightsTensor = 16; // Optional +// Projection bias tensor of size {n_output} +constexpr int kLstmProjectionBiasTensor = 17; // Optional + +// These state tensors are defined as variable tensors, and will be modified by +// this op. 
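+// (Taken together, a fully-featured LSTM node wires up 24 input tensors,
+// indices 0..23: tensors 18/19 double as persistent state across invocations,
+// and 20..23 below are the optional layer-norm coefficients.)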
+constexpr int kLstmOutputStateTensor = 18;
+constexpr int kLstmCellStateTensor = 19;
+
+// Layer norm coefficient tensors of size {n_cell}, representing a diagonal
+// matrix.
+constexpr int kLstmInputLayerNormCoefficientsTensor = 20;   // Optional
+constexpr int kLstmForgetLayerNormCoefficientsTensor = 21;  // Optional
+constexpr int kLstmCellLayerNormCoefficientsTensor = 22;    // Optional
+constexpr int kLstmOutputLayerNormCoefficientsTensor = 23;  // Optional
+
+// Output tensors.
+constexpr int kLstmOutputTensor = 0;
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_LSTM_SHARED_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/maximum_minimum.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/maximum_minimum.cc
index a6d358fb..7964f1e6 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/maximum_minimum.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/maximum_minimum.cc
@@ -115,29 +115,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace maximum_minimum
 
 TfLiteRegistration Register_MAXIMUM() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/nullptr,
-          /*invoke=*/
-          maximum_minimum::Eval<maximum_minimum::kReference,
-                                maximum_minimum::MaximumOp>,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(
+      nullptr, nullptr,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MaximumOp>);
 }
 
 TfLiteRegistration Register_MINIMUM() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/nullptr,
-          /*invoke=*/
-          maximum_minimum::Eval<maximum_minimum::kReference,
-                                maximum_minimum::MinimumOp>,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(
+      nullptr, nullptr,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MinimumOp>);
 }
 
 }  // namespace micro
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h
index 0fac51b7..c4dec92d 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h
@@ -36,6 +36,8 @@ TfLiteRegistration Register_ADD_N();
 TfLiteRegistration Register_ASSIGN_VARIABLE();
 TfLiteRegistration Register_AVERAGE_POOL_2D();
 TfLiteRegistration Register_BATCH_TO_SPACE_ND();
+TfLiteRegistration Register_BROADCAST_ARGS();
+TfLiteRegistration Register_BROADCAST_TO();
 TfLiteRegistration Register_CALL_ONCE();
 TfLiteRegistration Register_CAST();
 // TODO(b/160234179): Change custom OPs to also return by value.
@@ -62,6 +64,7 @@ TfLiteRegistration Register_LOGICAL_AND(); TfLiteRegistration Register_LOGICAL_OR(); TfLiteRegistration Register_LOGISTIC(); TfLiteRegistration Register_MAX_POOL_2D(); +TfLiteRegistration Register_MIRROR_PAD(); TfLiteRegistration Register_PRELU(); TfLiteRegistration Register_MUL(); TfLiteRegistration Register_QUANTIZE(); @@ -73,12 +76,16 @@ TfLiteRegistration Register_SHAPE(); TfLiteRegistration Register_SLICE(); TfLiteRegistration Register_SPACE_TO_BATCH_ND(); TfLiteRegistration Register_SPACE_TO_DEPTH(); +TfLiteRegistration Register_SQUARED_DIFFERENCE(); TfLiteRegistration Register_SQUEEZE(); TfLiteRegistration Register_SUB(); TfLiteRegistration Register_SVDF(); TfLiteRegistration Register_TRANSPOSE(); TfLiteRegistration Register_TRANSPOSE_CONV(); +// TODO(b/230666079): resolve conflict with xtensa implementation +TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); TfLiteRegistration Register_VAR_HANDLE(); +TfLiteRegistration Register_WHILE(); TfLiteRegistration Register_ZEROS_LIKE(); namespace ops { @@ -99,14 +106,12 @@ TfLiteRegistration Register_LESS_EQUAL(); TfLiteRegistration Register_LOG(); TfLiteRegistration Register_LOGICAL_NOT(); TfLiteRegistration Register_MAXIMUM(); -TfLiteRegistration Register_MEAN(); TfLiteRegistration Register_MINIMUM(); TfLiteRegistration Register_NEG(); TfLiteRegistration Register_NOT_EQUAL(); TfLiteRegistration Register_PACK(); TfLiteRegistration Register_PAD(); TfLiteRegistration Register_PADV2(); -TfLiteRegistration Register_REDUCE_MAX(); TfLiteRegistration Register_RESHAPE(); TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR(); TfLiteRegistration Register_ROUND(); @@ -117,7 +122,6 @@ TfLiteRegistration Register_SPLIT_V(); TfLiteRegistration Register_SQRT(); TfLiteRegistration Register_SQUARE(); TfLiteRegistration Register_STRIDED_SLICE(); -TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); TfLiteRegistration Register_UNPACK(); TfLiteRegistration Register_L2_NORMALIZATION(); TfLiteRegistration Register_TANH(); diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.cc new file mode 100644 index 00000000..88b097c7 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.cc @@ -0,0 +1,809 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include "tensorflow/lite/micro/kernels/micro_tensor_utils.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <utility>
+
+#include "fixedpoint/fixedpoint.h"  // from @gemmlowp
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/cppmath.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace micro_tensor_utils {
+
+namespace {
+const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
+const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
+}  // namespace
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor) {
+  auto minmax = std::minmax_element(values, values + size);
+  *min_value = *minmax.first;
+  *max_value = *minmax.second;
+
+  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
+                                  *max_value, scaling_factor);
+}
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float min_value,
+                                     float max_value, float* scaling_factor) {
+  const int32_t kScale = 127;
+  const float range = std::max(std::abs(min_value), std::abs(max_value));
+  if (range == 0) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    return;
+  }
+  *scaling_factor = range / kScale;
+  const float scaling_factor_inv = kScale / range;
+  for (int i = 0; i < size; ++i) {
+    const int32_t quantized_value =
+        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
+    // Clamp: just in case some odd numeric offset.
+    quantized_values[i] = static_cast<int8_t>(
+        std::min(kScale, std::max(-kScale, quantized_value)));
+  }
+}
+
+void PortableAsymmetricQuantizeFloats(const float* values, const int size,
+                                      int8_t* quantized_values,
+                                      float* scaling_factor, int32_t* offset) {
+  const int32_t kMinScale = -128;
+  const int32_t kMaxScale = 127;
+  const double qmin_double = kMinScale;
+  const double qmax_double = kMaxScale;
+  const auto minmax = std::minmax_element(values, values + size);
+  const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
+  const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
+  if (rmin == rmax) {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    *offset = 0;
+    return;
+  } else {
+    double scale = (rmax - rmin) / (qmax_double - qmin_double);
+    const double zero_point_from_min = qmin_double - rmin / scale;
+    const double zero_point_from_max = qmax_double - rmax / scale;
+    const double zero_point_from_min_error =
+        std::abs(qmin_double) + std::abs(rmin / scale);
+    const double zero_point_from_max_error =
+        std::abs(qmax_double) + std::abs(rmax / scale);
+    const double zero_point_double =
+        zero_point_from_min_error < zero_point_from_max_error
+            ? zero_point_from_min
+            : zero_point_from_max;
+    int8_t nudged_zero_point = 0;
+    if (zero_point_double <= qmin_double) {
+      nudged_zero_point = kMinScale;
+    } else if (zero_point_double >= qmax_double) {
+      nudged_zero_point = kMaxScale;
+    } else {
+      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
+    }
+    *scaling_factor = scale;
+    *offset = nudged_zero_point;
+  }
+  const float scaling_factor_inv = 1.0f / *scaling_factor;
+  for (int i = 0; i < size; ++i) {
+    const int32_t quantized_value = static_cast<int32_t>(
+        TfLiteRound(*offset + values[i] * scaling_factor_inv));
+    quantized_values[i] =
+        std::min(kMaxScale, std::max(kMinScale, quantized_value));
+  }
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                                 int m_rows, int m_cols,
+                                                 const float* vector,
+                                                 int n_batch, float* result) {
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
+      const float* vector_in_batch = vector + b * m_cols;
+      for (int c = 0; c < m_cols; c++) {
+        dot_prod += *matrix_ptr++ * *vector_in_batch++;
+      }
+      *result_in_batch += dot_prod;
+      ++result_in_batch;
+    }
+  }
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result) {
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+      // TODO(b/230666277): remove this
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
+        dotprod += (*row_ptr) * (vectors[col]);
+      }  // for col
+      *result += dotprod * batch_scaling_factor;
+      ++result;
+    }  // for row
+  }  // for batch
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, const float* per_channel_scale,
+    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
+    bool* compute_row_sums, CpuBackendContext* context) {
+  if (input_offset == nullptr) {
+    PortableMatrixBatchVectorMultiplyAccumulate(
+        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
+    return;
+  }
+  if (!compute_row_sums || *compute_row_sums) {
+    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
+    if (compute_row_sums) {
+      *compute_row_sums = false;
+    }
+  }
+
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    const int32_t batch_offset = input_offset[batch];
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      int32_t dotprod = 0;
+      float scale = batch_scaling_factor;
+      if (per_channel_scale) {
+        scale *= per_channel_scale[row];
+      }
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
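+      // (For reference: __builtin_prefetch(addr, rw, locality) with rw == 0
+      // requests a read prefetch, and locality == 3 asks for the data to be
+      // kept in all cache levels; both arguments must be compile-time
+      // constants.)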
+ __builtin_prefetch(row_ptr, 0 /* prefetch for read */, + 3 /* temporal locality */); +#endif + for (int col = 0; col < m_cols; ++col, ++row_ptr) { + dotprod += (*row_ptr) * vectors[col]; + } // for col + dotprod -= row_sums[row] * batch_offset; + *result += dotprod * scale; + ++result; + } // for row + } // for batch +} + +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result) { + const int kBlockSize = 4; + TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0); + for (int batch = 0; batch < n_batch; batch++) { + const float* matrix_ptr = matrix; + for (int row = 0; row < m_rows; row++) { + float dot_prod = 0.0f; + const float* vector_in_batch = vector + batch * m_cols; + for (int i = segments[row]; i < segments[row + 1]; i++) { + const int block_start_index = indices[i] * kBlockSize; + const float* vector_block_in_batch_ptr = + vector_in_batch + block_start_index; + for (int c = 0; c < kBlockSize; c++) { + dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++; + } + } + result[batch * m_rows + row] += dot_prod; + } + } +} + +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result) { + const int kBlockSize = 16; + TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0); + for (int batch = 0; batch < n_batch; ++batch) { + const int8_t* matrix_ptr = matrix; + for (int row = 0; row < m_rows; ++row) { + int32_t dot_prod = 0; + const int8_t* vector_in_batch = vector + batch * m_cols; + for (int i = segments[row]; i < segments[row + 1]; ++i) { + const int block_start_index = indices[i] * kBlockSize; + const int8_t* vector_block_in_batch_ptr = + vector_in_batch + block_start_index; + for (int c = 0; c < kBlockSize; c++) { + dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++; + dot_prod += *matrix_ptr++ * input_offset; + } + } + const int32_t bias_value = bias_vector != nullptr ? 
+                                           bias_vector[row] : 0;
+      dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value,
+                                               output_multiplier, output_shift);
+      dot_prod += output_offset;
+      result[batch * m_rows + row] =
+          static_cast<int8_t>(ActivationFunctionWithMinMax(
+              dot_prod, output_activation_min, output_activation_max));
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  for (int batch = 0; batch < n_batch; batch++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int row = 0; row < m_rows; row++) {
+      float dot_prod = 0.0f;
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        const float* vector_in_batch = vector + batch * m_cols;
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+          }
+        }
+      }
+      result[batch * m_rows + row] += dot_prod;
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result) {
+  static const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    const uint8_t* ledger_ptr = ledger;
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    for (int row = 0; row < m_rows; ++row) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      int num_nonzero_blocks = *ledger_ptr++;
+      for (int i = 0; i < num_nonzero_blocks; i++) {
+        const int block_start_index = *ledger_ptr++ * kBlockSize;
+        const int8_t* vector_block_ptr = vectors + block_start_index;
+        for (int c = 0; c < kBlockSize; c++) {
+          dotprod += (*row_ptr++) * (*vector_block_ptr++);
+        }  // for block
+      }  // for num_nonzero_blocks
+      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
+    }  // for row
+  }  // for batch
+}
+
+template <typename T>
+void PortableMatrixBatchVectorMultiplyAccumulateImpl(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    T* output) {
+  const int16_t output_max = std::numeric_limits<T>::max();
+  const int16_t output_min = std::numeric_limits<T>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_output; ++row) {
+      int32_t acc = bias[row];
+      for (int col = 0; col < n_input; ++col) {
+        int8_t input_val = input[batch * n_input + col];
+        int8_t weights_val = input_to_gate_weights[row * n_input + col];
+        acc += input_val * weights_val;
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
+      acc += output_zp;
+      acc += output[batch * n_output + row];
+      if (acc > output_max) {
+        acc = output_max;
+      }
+      if (acc < output_min) {
+        acc = output_min;
+      }
+      output[batch * n_output + row] = static_cast<T>(acc);
+    }
+  }
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulateImpl(
+      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
+      n_output, output_zp, output);
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* input, const int32_t* bias,
+    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
+    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
+    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulateImpl(
+      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
+      n_output, output_zp, output);
+}
+
+void PortableMatrixBatchVectorMultiply(const int8_t* input,
+                                       int32_t input_zeropoint,
+                                       const int8_t* input_to_gate_weights,
+                                       int32_t input_to_gate_effective_scale_a,
+                                       int32_t input_to_gate_effective_scale_b,
+                                       int32_t n_batch, int32_t n_input,
+                                       int32_t n_cell, int8_t* gate_output,
+                                       int8_t gate_output_zp) {
+  const int32_t int8_max = std::numeric_limits<int8_t>::max();
+  const int32_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_cell; ++row) {
+      int32_t acc = 0;
+      for (int col = 0; col < n_input; ++col) {
+        int32_t input_val = input[batch * n_input + col];
+        int8_t weights_val = input_to_gate_weights[row * n_input + col];
+        acc += (input_val - input_zeropoint) * weights_val;
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
+                                          input_to_gate_effective_scale_b);
+      acc += gate_output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+void PortableMatrixBatchVectorMultiply(
+    const int16_t* hidden, const int8_t* hidden_to_output_weights,
+    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
+    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
+    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
+  const int16_t int8_max = std::numeric_limits<int8_t>::max();
+  const int16_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_output; ++row) {
+      int64_t acc = gate_bias[row];
+      for (int col = 0; col < n_hidden; ++col) {
+        int16_t input_val = hidden[batch * n_hidden + col];
+        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
+        int64_t curr = acc;
+        acc += input_val * weights_val;
+        if (input_val * weights_val > 0 && acc < curr) {
+          acc = std::numeric_limits<int32_t>::max();
+        }
+        if (input_val * weights_val < 0 && acc > curr) {
+          acc = std::numeric_limits<int32_t>::min();
+        }
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
+                                          proj_effective_scale_b);
+      acc += output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      proj_output[batch * n_output + row] = acc;
+    }
+  }
+}
+
+void PortableApplyLayerNorm(const int16_t* input,
+                            const int16_t* layer_norm_weights,
+                            const int32_t* bias, int32_t layer_norm_scale_a,
+                            int32_t layer_norm_scale_b, int32_t variance_limit,
+                            int n_batch, int n_input, int16_t* output) {
+  // The square of std::pow(2, 10), which is the extra factor that makes sure
+  // normalized values have enough resolution.
+  static const int kTwoToPower20 = 1 << 20;
+  for (int i = 0; i < n_batch; ++i) {
+    int64_t sum = 0;
+    int64_t sum_sq = 0;
+    for (int j = 0; j < n_input; ++j) {
+      const int32_t index = i * n_input + j;
+      int32_t val = static_cast<int32_t>(input[index]);
+      sum += val;
+      sum_sq += val * val;
+    }
+    int32_t mean =
+        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
+    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
+    int32_t temp = kTwoToPower20 / n_input;
+    int64_t variance =
+        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
+    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
+    if (variance2 < 1) {
+      variance2 = variance_limit;
+    }
+    int32_t stddev_inverse_a;
+    int stddev_inverse_b;
+    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
+                                     &stddev_inverse_a, &stddev_inverse_b);
+
+    for (int j = 0; j < n_input; ++j) {
+      const int32_t index = i * n_input + j;
+      int32_t val = static_cast<int32_t>(input[index]);
+      int32_t shifted = 1024 * val - mean;
+      int32_t rescaled = MultiplyByQuantizedMultiplier(
+          shifted, stddev_inverse_a, stddev_inverse_b);
+      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
+      int32_t val4 =
+          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
+      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
+                                                   layer_norm_scale_b + 12);
+      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
+      output[index] = static_cast<int16_t>(val5);
+    }
+  }
+}
+
+void PortableApplyLayerNormFloat(const int16_t* input,
+                                 const int16_t* layer_norm_weights,
+                                 int32_t layer_norm_scale_a,
+                                 int32_t layer_norm_scale_b,
+                                 const int32_t* bias, int n_batch, int n_input,
+                                 int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  const float layer_norm_scale =
+      layer_norm_scale_a *
+      std::pow(2.0, static_cast<double>(layer_norm_scale_b - 31));
+  const float bias_scale =
+      static_cast<float>(std::pow(2.0, -10)) * layer_norm_scale;
+
+  for (int batch = 0; batch < n_batch; ++batch) {
+    float sum = 0.0f;
+    float sum_sq = 0.0f;
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float value = static_cast<float>(input[index]);
+      sum += value;
+      sum_sq += value * value;
+    }
+    const float mean = sum / n_input;
+    float stddev_inv = 0.0f;
+    const float variance = sum_sq / n_input - mean * mean;
+    if (variance == 0) {
+      stddev_inv = 1.0f / std::sqrt(1e-8f);
+    } else {
+      stddev_inv = 1.0f / std::sqrt(variance);
+    }
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float normalized_value =
+          (static_cast<float>(input[index]) - mean) * stddev_inv;
+      const float weighted_normalized_value =
+          normalized_value * layer_norm_weights[i] * layer_norm_scale +
+          bias[i] * bias_scale;
+      const int32_t quant_output = static_cast<int32_t>(round(
+          weighted_normalized_value * static_cast<float>(std::pow(2, 12))));
+      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
+    }
+  }
+}
+
+void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
+                                            int32_t scalar, int32_t n_row,
+                                            int32_t n_col, int32_t* output) {
+  for (int i = 0; i < n_row; ++i) {
+    int32_t row_sum = 0;
+    for (int j = 0; j < n_col; ++j) {
+      row_sum += *matrix++;
+    }
+    output[i] += row_sum * scalar;
+  }
+}
+
+void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
+                          int32_t n_input, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int c = 0; c < n_input; c++) {
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      const int index = batch * n_input + c;
+      F3 sigmoid_input = F3::FromRaw(input[index]);
+      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
+      output[index] = sigmoid_output.raw();
+    }
+  }
+}
+
+void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
+                               int32_t n_input, int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input =
+          input[index] * static_cast<float>(std::pow(2, -12));
+      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
+      const int32_t quant_output = static_cast<int32_t>(
+          float_output * static_cast<float>(std::pow(2, 15)));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
+template <int IntegerBits>
+void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
+                           int32_t n_input, int16_t* output) {
+  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
+  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      FX tanh_input = FX::FromRaw(input[index]);
+      F0 tanh_output = gemmlowp::tanh(tanh_input);
+      output[index] = tanh_output.raw();
+    }
+  }
+}
+
+void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
+                       int32_t n_batch, int32_t n_input, int16_t* output) {
+  if (integer_bits > 6) {
+    TFLITE_ASSERT_FALSE;
+  }
+#define DISPATCH_TANH(i)                                       \
+  case i:                                                      \
+    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
+    break;
+  switch (integer_bits) {
+    DISPATCH_TANH(0);
+    DISPATCH_TANH(1);
+    DISPATCH_TANH(2);
+    DISPATCH_TANH(3);
+    DISPATCH_TANH(4);
+    DISPATCH_TANH(5);
+    DISPATCH_TANH(6);
+    default:
+      return;
+  }
+#undef DISPATCH_TANH
+}
+
+void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
+                            int32_t n_input, int32_t integer_bits,
+                            int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  const double two = 2.0;
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input =
+          input[index] * std::pow(two, static_cast<double>(integer_bits));
+      const float float_output = std::tanh(float_input);
+      const int32_t quant_output = static_cast<int32_t>(
+          float_output * static_cast<float>(std::pow(2, 15)));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
+void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
+                      int n_batch, int n_input, int shift, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const int16_t a = input_1[index];
+      const int16_t b = input_2[index];
+      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
+      output[index] =
+          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
+    }
+  }
+}
+
+void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
+                      int32_t multiplier, int32_t shift, int32_t n_batch,
+                      int32_t n_input, int32_t output_zp, int8_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const int16_t a = input_1[index];
+      const int16_t b = input_2[index];
+      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
+      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
+      value -= output_zp;
+      value = std::min(std::max(static_cast<int32_t>(-128), value),
+                       static_cast<int32_t>(127));
+
+      output[index] = static_cast<int8_t>(value);
+    }
+  }
+}
+
+void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
+                      int n_batch, int n_input, int16_t* output) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      int32_t sum = input_1[index] + input_2[index];
+      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
+      output[index] = static_cast<int16_t>(sum_clamped);
+    }
+  }
+}
+
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                     int v_size) {
+  float result = 0.0;
+  for (int v = 0; v < v_size; v++) {
+    result += *vector1++ * *vector2++;
+  }
+  return result;
+}
+
+namespace {
+inline int32_t VectorVectorDotProduct(const int16_t* vector1,
+                                      const int16_t* vector2, int v_size) {
+  int32_t result = 0;
+  for (int v = 0; v < v_size; v++) {
+    result += *vector1++ * *vector2++;
+  }
+  return result;
+}
+}  // namespace
+
+void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
+                                              const int16_t* vector2,
+                                              int v_size, int n_batch,
+                                              int32_t* result) {
+  for (int b = 0; b < n_batch; b++) {
+    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
+    vector1 += v_size;
+    vector2 += v_size;
+  }
+}
+
+void PortableVectorBatchVectorCwiseProductAccumulate(
+    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
+    int32_t multiplier, int shift, int16_t* result) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < v_size; v++) {
+      int32_t prod = vector[v] * *batch_vector++;
+      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
+      int32_t output = prod + *result;
+      output = std::max(std::min(static_cast<int32_t>(32767), output),
+                        static_cast<int32_t>(-32768));
+      *result++ = output;
+    }
+  }
+}
+
+void PortableSub1Vector(const float* vector, int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = 1.0f - *vector++;
+  }
+}
+
+void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
+  static const int16_t kOne = 32767;
+  for (int v = 0; v < v_size; v++) {
+    *result++ = kOne - *vector++;
+  }
+}
+
+void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
+                                  const float scale, float* result) {
+  for (int v = 0; v < v_size; ++v) {
+    *result++ = scale * *vector++;
+  }
+}
+
+void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
+                                     float* __restrict__ output_vector,
+                                     int v_size, int n_batch) {
+  for (int batch = 0; batch < n_batch; ++batch) {
+    float sum = 0.0f;
+    for (int i = 0; i < v_size; ++i) {
+      sum += input_vector[i];
+    }
+    const float mean = sum / v_size;
+    float sum_diff_sq = 0.0f;
+    for (int i = 0; i < v_size; ++i) {
+      const float diff = input_vector[i] - mean;
+      sum_diff_sq += diff * diff;
+    }
+    const float variance = sum_diff_sq / v_size;
+    constexpr float kNormalizationConstant = 1e-8f;
+    const float stddev_inv =
+        1.0f / std::sqrt(variance + kNormalizationConstant);
+    for (int i = 0; i < v_size; ++i) {
+      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
+    }
+    input_vector += v_size;
+    output_vector += v_size;
+  }
+}
+
+void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
+                                  const int8_t* recurrent, int8_t recurrent_zp,
+                                  int32_t input_effective_scale_a,
+                                  int32_t input_effective_scale_b,
+                                  int32_t recurrent_effective_scale_a,
+                                  int32_t recurrent_effective_scale_b,
+                                  int32_t n_batch, int32_t n_cell,
+                                  int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int i = 0; i < n_batch * n_cell; ++i) {
+    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
+    int32_t h = static_cast<int32_t>(recurrent[i]) -
+                static_cast<int32_t>(recurrent_zp);
+    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
+                                                     input_effective_scale_b);
+    int32_t h_scaled = MultiplyByQuantizedMultiplier(
+        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
+    int32_t y = h_scaled + x_scaled;
+    if (y > int16_max) {
+      y = int16_max;
+    }
+    if (y < int16_min) {
+      y = int16_min;
+    }
+    output[i] = static_cast<int16_t>(y);
+  }
+}
+
+}  // namespace micro_tensor_utils
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.h
new file mode 100644
index 00000000..673ba6a3
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_tensor_utils.h
@@ -0,0 +1,874 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file and the associated .cc file is branched from
+// tensorflow/lite/kernels/internal/reference/portable_tensor_utils*
+// TFLM needs to create its own because the original files are coupled with
+// the tensor_utils module, which we cannot reuse due to its use of the
+// Eigen library.
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_MICRO_TENSOR_UTILS_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_MICRO_TENSOR_UTILS_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+namespace tflite {
+
+// Not all backends support CpuBackendContext usage, so forward declare to
+// avoid pulling in its implementation.
+// TODO(b/230666277): consider removing this since micro does not utilize it
+class CpuBackendContext;
+
+namespace micro_tensor_utils {
+
+template <typename T>
+inline bool PortableIsZeroVector(const T* vector, int v_size) {
+  for (int i = 0; i < v_size; ++i) {
+    if (vector[i] != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor);
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float min_value,
+                                     float max_value, float* scaling_factor);
+
+void PortableAsymmetricQuantizeFloats(const float* values, const int size,
+                                      int8_t* quantized_values,
+                                      float* scaling_factor, int32_t* offset);
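+// Illustrative usage sketch (editor's note, not part of the upstream header):
+// the asymmetric variant returns a scale and a zero-point offset such that
+// real_value ~= scaling_factor * (quantized_value - offset), matching the
+// rounding done in the .cc implementation above.
+//
+//   float values[4] = {-0.5f, 0.0f, 0.25f, 1.0f};
+//   int8_t quantized[4];
+//   float scale;
+//   int32_t offset;
+//   PortableAsymmetricQuantizeFloats(values, 4, quantized, &scale, &offset);
+//   // values[i] is now approximated by scale * (quantized[i] - offset).
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.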
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, + int m_rows, int m_cols, + const float* vector, + int n_batch, float* result); + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result); + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context); + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vector, const float* scaling_factors, + int n_batch, int32_t* scratch, float* __restrict__ result, + CpuBackendContext* context); + +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result); + +void PortableSparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result); + +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result); + +void PortableSparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, + const int m_cols, const int8_t* __restrict__ vectors, + const float* scaling_factors, int n_batch, float* __restrict__ result); + +// Dot product of two vectors. 
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size); + +void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1, + const int16_t* vector2, + int v_size, int n_batch, + int32_t* result); + +void PortableVectorBatchVectorCwiseProductAccumulate( + const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch, + int32_t multiplier, int shift, int16_t* result); + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int16_t* output, CpuBackendContext* context); + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int8_t* output, CpuBackendContext* context); + +void PortableMatrixBatchVectorMultiply(const int8_t* input, + int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, + int32_t n_cell, int8_t* gate_output, + int8_t gate_output_zp); + +void PortableMatrixBatchVectorMultiply( + const int16_t* hidden, const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden, + int32_t n_output, int32_t output_zp, int8_t* proj_output); + +void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix, + int32_t scalar, int32_t n_row, + int32_t n_col, int32_t* output); + +void PortableApplyLayerNorm(const int16_t* input, + const int16_t* layer_norm_weights, + const int32_t* bias, int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, int32_t variance_limit, + int n_batch, int n_input, int16_t* output); + +void PortableApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output); + +void PortableApplySigmoid(const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); + +void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); + +void PortableApplyTanh(int32_t integer_bits, const int16_t* input, + int32_t n_batch, int32_t n_input, int16_t* output); + +void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch, + int32_t n_input, int32_t integer_bits, + int16_t* output); + +void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, + int n_batch, int n_input, int shift, int16_t* output); + +void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output); + +void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2, + int n_batch, int n_input, int16_t* output); + +template +inline void PortableCwiseClipping(T* vector, const int v_size, + const T& clipping_value) { + for (int i = 0; i < v_size; i++) { + vector[i] = std::max(std::min(clipping_value, vector[i]), + static_cast(-clipping_value)); + } +} + +// Batch vector initialization with another vector. 
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+                                     int n_batch, float* batch_vector);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
+void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);
+
+// Multiply all elements of vector with a scalar.
+void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                                  float* result);
+
+// Reduce-sum on a vector:
+// input_vector: pointer to input vector.
+// output_vector: pointer to output vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+template <typename INPUT, typename OUTPUT>
+inline void PortableReductionSumVector(const INPUT* input_vector,
+                                       OUTPUT* output_vector, int output_size,
+                                       int reduction_size) {
+  for (int o = 0; o < output_size; o++) {
+    OUTPUT result = 0;
+    for (int r = 0; r < reduction_size; r++) {
+      result += input_vector[r];
+    }
+    output_vector[o] = result;
+    input_vector += reduction_size;
+  }
+}
+
+// Layer norm for each batch.
+void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
+                                     float* __restrict__ output_vector,
+                                     int v_size, int n_batch);
+
+// Saturate Add.
+void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
+                                  const int8_t* recurrent, int8_t recurrent_zp,
+                                  int32_t input_effective_scale_a,
+                                  int32_t input_effective_scale_b,
+                                  int32_t recurrent_effective_scale_a,
+                                  int32_t recurrent_effective_scale_b,
+                                  int32_t n_batch, int32_t n_cell,
+                                  int16_t* output);
+
+// Add another vector for each batch in the batch vector.
+template <typename T>
+inline void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
+                                 T* batch_vector) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int i = 0; i < v_size; ++i) {
+      batch_vector[i] += vector[i];
+    }
+    batch_vector += v_size;
+  }
+}
+
+// Cwise product of two vectors.
+template <typename T>
+inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2,
+                                     int v_size, T* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = *vector1++ * *vector2++;
+  }
+}
+
+// Cwise product of a vector and a batch-vector.
+template <typename T>
+inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
+                                          const T* batch_vector, int n_batch,
+                                          T* result) {
+  for (int b = 0; b < n_batch; b++) {
+    VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
+    // Update the pointers.
+    result += v_size;
+    batch_vector += v_size;
+  }
+}
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to output vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+inline void ReductionSumVector(const float* input_vector, float* output_vector,
+                               int output_size, int reduction_size) {
+  PortableReductionSumVector(input_vector, output_vector, output_size,
+                             reduction_size);
+}
+
+// Same as above but input/output is 32 bit integer.
+inline void ReductionSumVector(const int32_t* input_vector,
+                               int32_t* output_vector, int output_size,
+                               int reduction_size) {
+  PortableReductionSumVector(input_vector, output_vector, output_size,
+                             reduction_size);
+}
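+// Illustrative example (editor's sketch, not part of the upstream header):
+// reducing a flattened 2x3 "matrix" to its two row sums. This is the same
+// pattern the hybrid matmul above uses to precompute row_sums for the
+// input_offset correction.
+//
+//   int32_t rows[6] = {1, 2, 3, 10, 20, 30};
+//   int32_t row_sums[2];
+//   ReductionSumVector(rows, row_sums, /*output_size=*/2,
+//                      /*reduction_size=*/3);
+//   // row_sums is now {6, 60}.
+
+// Same as above but input is 8 bit integer.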
+inline void ReductionSumVector(const int8_t* input_vector, + int32_t* output_vector, int output_size, + int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +// Cwise product and accumulate of two vectors. Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +template +inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1, + const T* __restrict__ vector2, + int v_size, + T* __restrict__ result) { + for (int v = 0; v < v_size; v++) { + *result++ += *vector1++ * *vector2++; + } +} + +// Batch vector initialization with another vector. +template +inline void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch, + T* batch_vector) { + for (int b = 0; b < n_batch; b++) { + std::copy_n(vector, v_size, batch_vector + b * v_size); + } +} + +inline void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min, + float* max, float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max, + scaling_factor); +} + +inline void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value, + max_value, scaling_factor); +} + +inline void AsymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, + float* scaling_factor, int32_t* offset) { + PortableAsymmetricQuantizeFloats(values, size, quantized_values, + scaling_factor, offset); +} + +// Helper function to quantize floats. +// float_data_ptr input float vectors +// n_batch number of input vectors +// n_data size of a single input vector +// quantized_data_ptr (out) vector with quantized data +// scaling_factors (out) scaling factors (one per vector) +// zero_points (out) zero points (one per vector) +// do_asymmetric controls if the quantization should be asymmetric. +inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch, + int n_data, int8_t* quantized_data_ptr, + float* scaling_factors, int32_t* zero_points, + bool do_asymmetric) { + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_data; + if (do_asymmetric) { + AsymmetricQuantizeFloats(float_data_ptr + offset, n_data, + quantized_data_ptr + offset, &scaling_factors[b], + &zero_points[b]); + } else { + float unused_min, unused_max; + SymmetricQuantizeFloats(float_data_ptr + offset, n_data, + quantized_data_ptr + offset, &unused_min, + &unused_max, &scaling_factors[b]); + } + } +} + +// Check if all entries of a vector are zero for float. +inline bool IsZeroVector(const float* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +// Check if all entries of a vector are zero for int8_t. +inline bool IsZeroVector(const int8_t* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized +// vector. +// Parameters: +// - input: batch vector of size n_batch * n_input; 16 bit. +// - layer_norm_weights: the quantized layer normalization weights. +// - bias: the bias for the layer normalization. +// - layer_norm_scale_a: multiplier for scale factor. +// - layer_norm_scale_b: shift for scale factor. +// - variance_limit: the guard to make sure the inverse does not overflow. +// - n_batch: the number of batches. 
+// - n_input: the size for input and output.
+// - output: the 16 bit output
+inline void ApplyLayerNorm(const int16_t* input,
+                           const int16_t* layer_norm_weights,
+                           const int32_t* bias, int32_t layer_norm_scale_a,
+                           int32_t layer_norm_scale_b, int32_t variance_limit,
+                           int n_batch, int n_input, int16_t* output) {
+  PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a,
+                         layer_norm_scale_b, variance_limit, n_batch, n_input,
+                         output);
+}
+
+// Same as above but the internal calculation is done in float.
+inline void ApplyLayerNormFloat(const int16_t* input,
+                                const int16_t* layer_norm_weights,
+                                int32_t layer_norm_scale_a,
+                                int32_t layer_norm_scale_b, const int32_t* bias,
+                                int n_batch, int n_input, int16_t* output) {
+  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
+                              layer_norm_scale_b, bias, n_batch, n_input,
+                              output);
+}
+
+// Apply Sigmoid to a quantized vector.
+// Parameters:
+// - input: batch vector of size n_batch * n_input; 16 bit.
+// - n_batch: the number of batches.
+// - n_input: the size for input and output.
+// - output: the 16 bit output
+// The input is in Q3.12 format and the output is in Q0.15 format.
+inline void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
+                         int16_t* output) {
+  PortableApplySigmoid(input, n_batch, n_input, output);
+}
+
+// Same as above but the internal calculation is done in float.
+inline void ApplySigmoidFloat(const int16_t* input, int32_t n_batch,
+                              int32_t n_input, int16_t* output) {
+  PortableApplySigmoidFloat(input, n_batch, n_input, output);
+}
+
+// Apply Tanh to a quantized vector.
+// Parameters:
+// - integer_bits: the integer bits of the input.
+//   Currently supports 0, 1, 2, 3, 4, 5, 6.
+// - input: batch vector of size n_batch * n_input; 16 bit.
+// - n_batch: the number of batches.
+// - n_input: the size for input and output.
+// - output: the 16 bit output
+// The input is in Qm.15-m format and the output is in Q0.15 format.
+inline void ApplyTanh(int32_t integer_bits, const int16_t* input,
+                      int32_t n_batch, int32_t n_input, int16_t* output) {
+  PortableApplyTanh(integer_bits, input, n_batch, n_input, output);
+}
+
+// Apply Tanh to a quantized vector. The internal calculation is in float.
+// - Input has 2^(integer_bits) as scale.
+// - Output has Q0.15 as scale.
+inline void ApplyTanhFloat(const int16_t* input, int32_t n_batch,
+                           int32_t n_input, int32_t integer_bits,
+                           int16_t* output) {
+  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
+}
+
+// Element-wise multiplication of two quantized vectors.
+// Parameters:
+// - input_1: batch vector of size n_batch * n_input; 16 bit.
+// - input_2: batch vector of size n_batch * n_input; 16 bit.
+// - n_batch: the number of batches.
+// - n_input: the size for input and output.
+// - shift: the shift needed to produce the output.
+// - output: the 16 bit output of size n_batch * n_input.
+// Output does not need to be initialized.
+inline void CwiseMul(const int16_t* input_1, const int16_t* input_2,
+                     int n_batch, int n_input, int shift, int16_t* output) {
+  PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output);
+}
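+// Editor's note, illustrative only (not from upstream docs): in the Q3.12
+// convention used by ApplySigmoid, a raw int16 value r represents the real
+// number r * 2^-12, and a Q0.15 output value r represents r * 2^-15. A
+// hypothetical call:
+//
+//   int16_t in[1] = {4096};  // 4096 * 2^-12 = 1.0 in Q3.12
+//   int16_t out[1];
+//   ApplySigmoid(in, /*n_batch=*/1, /*n_input=*/1, out);
+//   // out[0] is approximately sigmoid(1.0) * 2^15 ~= 23956 in Q0.15.
+
+// Element-wise multiplication of two quantized vectors with rescaling.
+// Parameters:
+// - input_1: batch vector of size n_batch * n_input; 16 bit.
+// - input_2: batch vector of size n_batch * n_input; 16 bit.
+// - multiplier: the multiplier part of scale.
+// - shift: the shift part of scale.
+// - n_batch: the number of batches.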
+// - n_input: the size for input and output. +// - output: the 8 bit output of size n_batch * n_input. +// - output_zp: the zero point of output. +// Output does not need to be initialized. +// Multiplier ("m") and shift ("s") are connected to scale ("s") with s = m * +// 2^(s - 31). +inline void CwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output) { + PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input, + output_zp, output); +} + +// Element-wise in-place clipping of a vector. Overloaded for float, int16_t, +// int8_t. Parameters: +// - vector: vector of size v_size. +// - v_size: the size of the vector. +// - clipping_value: the value used for clipping. +inline void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +inline void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +inline void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +// Element-wise saturating addition of two quantized vectors without rescaling. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 8 bit output of size n_batch * n_input. +// Output does not need to be initialized. +inline void CwiseAdd(const int16_t* input_1, const int16_t* input_2, + int n_batch, int n_input, int16_t* output) { + PortableCwiseAdd(input_1, input_2, n_batch, n_input, output); +} + +inline void MeanStddevNormalization(const float* input_vector, + float* output_vector, int v_size, + int n_batch) { + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +inline void Sub1Vector(const float* vector, int v_size, float* result) { + PortableSub1Vector(vector, v_size, result); +} + +inline void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) { + PortableSub1Vector(vector, v_size, result); +} + +// Multiply all elements of vector with a scalar. +inline void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result) { + PortableVectorScalarMultiply(vector, v_size, scale, result); +} + +// Saturate Add with rescale on both inputs. +inline void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, + int32_t n_batch, int32_t n_cell, + int16_t* output) { + PortableTwoGateSaturatingAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + +// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch +// dimension composed by input vectors independent from each other). The result +// of the multiplication is accumulated to the passed result buffer. +// More specifically, for a matrix M of shape [n, i] and a batched-vector +// of shape [i, batch] it will first compute the product of shape [n, batch]. 
+// This product will be accumulated to the result buffer.
+inline void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                                int m_cols, const float* vector,
+                                                int n_batch, float* result) {
+  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
+                                              n_batch, result);
+}
+
+// Same as the function above, but for int8 values with one scaling factor per
+// batch vector (hybrid quantization).
+inline void MatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vector, const float* scaling_factors,
+    int n_batch, float* __restrict__ result) {
+  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
+                                              scaling_factors, n_batch, result);
+}
+
+inline void MatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, const float* per_channel_scale,
+    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
+    bool* compute_row_sums, CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulate(
+      matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+      per_channel_scale, input_offset, scratch, row_sums, compute_row_sums,
+      context);
+}
+
+inline void MatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vector, const float* scaling_factors,
+    int n_batch, int32_t* scratch, float* __restrict__ result,
+    CpuBackendContext* context) {
+  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
+                                              scaling_factors, n_batch, result);
+}
+
+// Same as the function above, but the matrix is a sparse tensor with block
+// pattern 1x4.
+// This function assumes that m_cols is a multiple of the block size (4 in this
+// case) so that there's no incomplete block.
+inline void SparseMatrixBatchVectorMultiplyAccumulate1x4(
+    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
+    const int32_t* __restrict__ indices, int m_rows, int m_cols,
+    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
+      matrix, segments, indices, m_rows, m_cols, vector, n_batch, result);
+}
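+// Editor's note, an illustrative layout sketch derived only from the code in
+// the .cc file (not from extra upstream docs): for a hypothetical 2x32 matrix
+// where row 0 has a single non-zero 1x16 block covering columns 16..31 and
+// row 1 is entirely zero, the encoding described below would be:
+//
+//   matrix array: the 16 values of row 0's non-zero block, row major.
+//   ledger array: {1, 1,   // row 0: one non-zero block, at block index 1
+//                  0}      // row 1: no non-zero blocks
+
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+// 1. A matrix array stores non-zero blocks of the matrix in row major.
+// 2. A ledger array stores nrows groups, one group per row. Each group starts
+//    with an integer representing the number of non-zero blocks for the
+//    corresponding row and follows with column indexes of the first element
+//    of each non-zero block.
+// This function assumes that
+// 1. m_cols is a multiple of 16 so that all blocks are full blocks.
+// 2. m_cols < 254 * 16 so that block index can be represented by uint8.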
+inline void SparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vector, n_batch, result); +} + +// Same as the function above, but the matrix is a sparse tensor with block +// pattern 1x16. +// This function assumes that m_cols is a multiple of the block size (16 in this +// case) so that there's no incomplete block. Also, it assumes all offsets of +// input, output and filter are zero. +inline void SparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch, + input_offset, output_multiplier, output_shift, output_offset, + output_activation_min, output_activation_max, result); +} + +// Same as the function above, but the matrix is stored in block compressed +// sparse row format with block pattern 1x16 which consists of two arrays: +// 1. A matrix array stores non-zero blocks of the matrix in row major. +// 2. A ledger array stores nrows groups, one group per row. Each group starts +// with an integer representing the number of non-zero blocks for the +// corresponding row followed by column index of the first element of +// each non-zero block. +// This function assumes that +// 1. m_cols is a multiple of 16 so that all blocks are full blocks. +// 2. m_cols < 254 * 16 so that block index can be represented by uint8. +inline void SparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, + const int m_cols, const int8_t* __restrict__ vectors, + const float* scaling_factors, int n_batch, float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, + result); +} + +// Same as the above 8, 8, 8 integer matmul except for the presence of zero +// point and non-accumulative. +// TODO(b/148688698): remove this function by folding zero point calculation in +// prepare() function. +inline void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int16_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +// Same as above but has 16 bit and 8 bit input and 8 bit output. +// Used in projection when hidden is 16bit. 
+inline void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int8_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +// Same as the function above, but provides separate scaling factor for the +// matrix and the vectors. The scaling factors are multiplied in the +// scaling_factor_scratch buffer. +inline void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float matrix_scaling_factor, + const float* vector_scaling_factors, int n_batch, + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, float* scaling_factor_scratch, + CpuBackendContext* context) { + for (int b = 0; b < n_batch; ++b) { + scaling_factor_scratch[b] = + vector_scaling_factors[b] * matrix_scaling_factor; + } + MatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, + scaling_factor_scratch, n_batch, result, + per_channel_scale, input_offset, scratch, + row_sums, compute_row_sums, context); +} + +// Multiplies a matrix with a scalar and reduce the result on each row to a +// scalar. +// Parameters: +// - matrix: matrix of size n_row * n_col +// - scalar: the scalar that is multiplied to each element in the matrix +// - n_row: the row count of the matrix +// - n_col: the column count of the matrix +// - output: the 32bit output +// Note: We do not need saturation because the int8 * int8 is safe from overflow +// in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero +// initial output value is not exceptionally large. +inline void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, + int32_t n_row, int32_t n_col, + int32_t* output) { + PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output); +} + +// Same as the above 8, 8, 8 integer matmul except for the presence of zero +// point and non-accumulative. +// TODO(b/148688698): remove this function by folding zero point calculation in +// prepare() function. +inline void MatrixBatchVectorMultiply(const int8_t* input, + int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, + int32_t n_cell, int8_t* gate_output, + int8_t gate_output_zp) { + PortableMatrixBatchVectorMultiply( + input, input_zeropoint, input_to_gate_weights, + input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch, + n_input, n_cell, gate_output, gate_output_zp); +} + +// Same as above but has 16 bit and 8 bit input and 8 bit output. +// Used in projection when hidden is 16bit. 
+inline void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +template +inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size, + const T* batch_vector, + int n_batch, T* result) { + for (int b = 0; b < n_batch; b++) { + VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +// Same as above, but inputs are 16bit integer and output is 16bit integer. +inline void VectorBatchVectorCwiseProductAccumulate( + const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch, + int32_t multiplier, int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + +// Apply Rectified Linear to elements of a vector. +inline void ApplyReluToVector(const float* vector, int v_size, float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = std::max(0.0f, vector[v]); + } +} + +// Apply Rectified Linear 1 (cap to [-1;1]) to elements of a vector +inline void ApplyRelu1ToVector(const float* vector, int v_size, float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = std::max(-1.0f, std::min(vector[v], 1.0f)); + } +} + +// Apply Rectified Linear 6 (cap to [0;6]) to elements of a vector +inline void ApplyRelu6ToVector(const float* vector, int v_size, float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = std::max(0.0f, std::min(vector[v], 6.0f)); + } +} + +// Apply tanh to elements of a vector +inline void ApplyTanhToVector(const float* vector, int v_size, float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = std::tanh(vector[v]); + } +} + +// Apply signbit to elements of a vector +inline void ApplySignbitToVector(const float* vector, int v_size, + float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = std::signbit(vector[v]); + } +} + +// Apply sigmoid to elements of a vector. +inline void ApplySigmoidToVector(const float* vector, int v_size, + float* result) { + for (int v = 0; v < v_size; v++) { + result[v] = 1.0f / (1.0f + std::exp(-vector[v])); + } +} + +// Apply appropriate activation function to elements of a vector. 
+inline void ApplyActivationToVector(const float* vector, int v_size,
+                                    TfLiteFusedActivation activation,
+                                    float* result) {
+  switch (activation) {
+    case kTfLiteActNone:
+      return;
+    case kTfLiteActRelu:
+      return ApplyReluToVector(vector, v_size, result);
+    case kTfLiteActReluN1To1:
+      return ApplyRelu1ToVector(vector, v_size, result);
+    case kTfLiteActRelu6:
+      return ApplyRelu6ToVector(vector, v_size, result);
+    case kTfLiteActTanh:
+      return ApplyTanhToVector(vector, v_size, result);
+    case kTfLiteActSignBit:
+      return ApplySignbitToVector(vector, v_size, result);
+    case kTfLiteActSigmoid:
+      return ApplySigmoidToVector(vector, v_size, result);
+  }
+}
+
+}  // namespace micro_tensor_utils
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_MICRO_TENSOR_UTILS_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc
new file mode 100644
index 00000000..90d3bd9e
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc
@@ -0,0 +1,215 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+
+namespace tflite {
+namespace {
+
+struct OpDataMirrorPad {
+  int input_dims;
+  int output_size;
+  int offset;
+  int output_dims_num_elements_buffer_index;
+  int input_dims_num_elements_buffer_index;
+};
+
+// Helper method that fills the left and right pads.
+template <typename T>
+inline void GetPadding(const T* data, int offset, int64_t* left_pad,
+                       int64_t* right_pad) {
+  *left_pad = static_cast<int64_t>(*(data + offset * 2));
+  *right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
+}
+
+// Given a dimension index and the left/right padding, returns the
+// corresponding dimension in the input array.
+inline int GetInputDimension(int padded_dimension, int left_pad, int right_pad,
+                             int input_dim_size, int offset) {
+  if (padded_dimension < left_pad) {
+    const int original_ind = left_pad + offset - 1;
+    return original_ind - (std::min(padded_dimension, original_ind - offset));
+  }
+  padded_dimension -= left_pad;
+  if (padded_dimension >= input_dim_size) {
+    padded_dimension -= input_dim_size;
+    const int original_ind = input_dim_size - (1 + offset);
+    return original_ind - std::min(padded_dimension, original_ind);
+  }
+  return padded_dimension;
+}
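+// Editor's note, an illustrative example worked from the function above (not
+// from extra upstream docs): padding the row {1, 2, 3} with two values on the
+// left yields
+//
+//   REFLECT   (offset == 1): {3, 2, 1, 2, 3}
+//   SYMMETRIC (offset == 0): {2, 1, 1, 2, 3}
+
+// Given an index in the output array, returns the index of the corresponding
+// value in the input array.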
+int GetFlatIndex(int index, int num_dims,
+                 const TfLiteEvalTensor* padding_matrix,
+                 const TfLiteIntArray* input_dims,
+                 int* output_dims_num_elements, int* input_dims_num_elements,
+                 const int offset) {
+  int flat_index = 0;
+  int64_t left_pad = 0, right_pad = 0, dimension_index, index_in_input;
+
+  for (int i = 0; i < num_dims; ++i) {
+    switch (padding_matrix->type) {
+      case kTfLiteInt32:
+        GetPadding(padding_matrix->data.i32, i, &left_pad, &right_pad);
+        break;
+      case kTfLiteInt64:
+        GetPadding(padding_matrix->data.i64, i, &left_pad, &right_pad);
+        break;
+      default:
+        break;
+    }
+    dimension_index = index / output_dims_num_elements[i];
+
+    index_in_input = GetInputDimension(dimension_index, left_pad, right_pad,
+                                       input_dims->data[i], offset);
+
+    flat_index += index_in_input * (input_dims_num_elements)[i];
+    index %= output_dims_num_elements[i];
+  }
+
+  return flat_index;
+}
+
+template <typename T>
+void MirrorPad(const TfLiteEvalTensor* padding_matrix,
+               const TfLiteIntArray* input_dims, int* output_dims_num_elements,
+               int* input_dims_num_elements, const T* input_data,
+               T* output_data, const int offset, const int num_dims,
+               const int output_size) {
+  for (int i = 0; i < output_size; ++i) {
+    output_data[i] = input_data[GetFlatIndex(
+        i, num_dims, padding_matrix, input_dims, output_dims_num_elements,
+        input_dims_num_elements, offset)];
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TfLiteStatus status = kTfLiteOk;
+  const OpDataMirrorPad* data =
+      static_cast<const OpDataMirrorPad*>(node->user_data);
+
+  const TfLiteEvalTensor* input_tensor =
+      tflite::micro::GetEvalInput(context, node, 0);
+  const TfLiteEvalTensor* padding_matrix =
+      tflite::micro::GetEvalInput(context, node, 1);
+
+  TfLiteEvalTensor* output_tensor =
+      tflite::micro::GetEvalOutput(context, node, 0);
+  const int input_dims = data->input_dims;
+  const int output_size = data->output_size;
+
+  int* input_dims_num_elements = (int*)context->GetScratchBuffer(
+      context, data->input_dims_num_elements_buffer_index);
+  int* output_dims_num_elements = (int*)context->GetScratchBuffer(
+      context, data->output_dims_num_elements_buffer_index);
+
+  for (int i = 0; i < input_dims; i++) {
+    output_dims_num_elements[i] = 1;
+    input_dims_num_elements[i] = 1;
+  }
+
+  for (int i = input_dims - 2; i >= 0; i--) {
+    output_dims_num_elements[i] =
+        output_dims_num_elements[i + 1] * output_tensor->dims->data[i + 1];
+
+    input_dims_num_elements[i] =
+        input_dims_num_elements[i + 1] * input_tensor->dims->data[i + 1];
+  }
+
+  switch (output_tensor->type) {
+    case kTfLiteFloat32: {
+      MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements,
+                input_dims_num_elements,
+                tflite::micro::GetTensorData<float>(input_tensor),
+                tflite::micro::GetTensorData<float>(output_tensor),
+                data->offset, input_dims, output_size);
+      break;
+    }
+    case kTfLiteInt8: {
+      MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements,
+                input_dims_num_elements,
+                tflite::micro::GetTensorData<int8_t>(input_tensor),
+                tflite::micro::GetTensorData<int8_t>(output_tensor),
+                data->offset, input_dims, output_size);
+      break;
+    }
+    default:
+      status = kTfLiteError;
+      break;
+  }
+
+#undef TF_LITE_MIRROR_PAD
+
+  return status;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataMirrorPad));
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context =
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpDataMirrorPad* data = static_cast<OpDataMirrorPad*>(node->user_data);
+
+  TfLiteTensor* input_tensor = micro_context->AllocateTempInputTensor(node, 0);
+  TfLiteTensor* padding_matrix =
+      micro_context->AllocateTempInputTensor(node, 1);
+  TfLiteTensor* output_tensor =
+      micro_context->AllocateTempOutputTensor(node, 0);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
+                    NumDimensions(input_tensor));
+  auto* params =
+      reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+  if (params == nullptr) {
+    return kTfLiteError;
+  }
+
+  data->offset =
+      params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
+                                                                           : 1;
+  data->input_dims = NumDimensions(input_tensor);
+  data->output_size = NumElements(output_tensor);
+
+  TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
+      context, data->input_dims * sizeof(int),
+      &data->output_dims_num_elements_buffer_index));
+  TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
+      context, data->input_dims * sizeof(int),
+      &data->input_dims_num_elements_buffer_index));
+
+  micro_context->DeallocateTempTfLiteTensor(input_tensor);
+  micro_context->DeallocateTempTfLiteTensor(padding_matrix);
+  micro_context->DeallocateTempTfLiteTensor(output_tensor);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteRegistration Register_MIRROR_PAD() {
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul.cc
index e8295197..59f006b0 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul.cc
@@ -61,14 +61,7 @@ TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteRegistration Register_MUL() {
-  return {/*init=*/MulInit,
-          /*free=*/nullptr,
-          /*prepare=*/MulPrepare,
-          /*invoke=*/MulEval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(MulInit, MulPrepare, MulEval);
 }
 
 }  // namespace tflite
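A note on the registration change that repeats through the remaining hunks: every kernel now returns tflite::micro::RegisterOp(init, prepare, invoke) instead of a designated-initializer TfLiteRegistration. Judging only from the initializer lists this diff deletes, the helper must be equivalent to the following sketch (the name RegisterOpSketch and the exact signature are assumptions for illustration, not the library's declaration):

// Minimal sketch: keep the three callbacks, default every other field.
inline TfLiteRegistration RegisterOpSketch(
    void* (*init)(TfLiteContext*, const char*, size_t),
    TfLiteStatus (*prepare)(TfLiteContext*, TfLiteNode*),
    TfLiteStatus (*invoke)(TfLiteContext*, TfLiteNode*)) {
  return {/*init=*/init,
          /*free=*/nullptr,
          /*prepare=*/prepare,
          /*invoke=*/invoke,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
}

This centralizes the field defaults, so adding a field to TfLiteRegistration no longer requires touching every kernel.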
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc
index 86ab90aa..6d19ac7a 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc
@@ -37,11 +37,16 @@ void* MulInit(TfLiteContext* context, const char* buffer, size_t length) {
 
 TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node,
                                 TfLiteMulParams* params, OpDataMul* data) {
-  const TfLiteTensor* input1 = GetInput(context, node, kMulInput1Tensor);
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TfLiteTensor* input1 =
+      micro_context->AllocateTempInputTensor(node, kMulInput1Tensor);
   TF_LITE_ENSURE(context, input1 != nullptr);
-  const TfLiteTensor* input2 = GetInput(context, node, kMulInput2Tensor);
+  TfLiteTensor* input2 =
+      micro_context->AllocateTempInputTensor(node, kMulInput2Tensor);
   TF_LITE_ENSURE(context, input2 != nullptr);
-  TfLiteTensor* output = GetOutput(context, node, kMulOutputTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kMulOutputTensor);
   TF_LITE_ENSURE(context, output != nullptr);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
@@ -72,6 +77,9 @@ TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node,
                                       &data->output_activation_max_f32);
   }
 
+  micro_context->DeallocateTempTfLiteTensor(input1);
+  micro_context->DeallocateTempTfLiteTensor(input2);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/neg.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/neg.cc
index 74a95ca3..59dd8cb8 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/neg.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/neg.cc
@@ -51,14 +51,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace neg
 
 TfLiteRegistration Register_NEG() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/nullptr,
-          /*invoke=*/neg::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, nullptr, neg::Eval);
 }
 
 }  // namespace micro
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pack.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pack.cc
index 098a0482..56f3b96e 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pack.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pack.cc
@@ -108,14 +108,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace pack
 
 TfLiteRegistration Register_PACK() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/nullptr,
-          /*invoke=*/pack::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, nullptr, pack::Eval);
 }
 
 }  // namespace micro
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc
index e038de0b..b645f983 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc
@@ -43,19 +43,26 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TFLITE_DCHECK(node->user_data != nullptr);
   OpData* data = static_cast<OpData*>(node->user_data);
 
   TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  const TfLiteTensor* input = GetInput(context, node, /*index=*/0);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, /*index=*/0);
   TF_LITE_ENSURE(context, input != nullptr);
-  const TfLiteTensor* paddings = GetInput(context, node, /*index=*/1);
+  TfLiteTensor* paddings =
+      micro_context->AllocateTempInputTensor(node, /*index=*/1);
   TF_LITE_ENSURE(context, paddings != nullptr);
-  const TfLiteTensor* constant_values =
-      NumInputs(node) == 3 ? GetInput(context, node, /*index=*/2) : nullptr;
-  TfLiteTensor* output = GetOutput(context, node, /*index=*/0);
+  TfLiteTensor* constant_values =
+      NumInputs(node) == 3
+          ?
micro_context->AllocateTempInputTensor(node, /*index=*/2) + : nullptr; + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, /*index=*/0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, input->type, output->type); @@ -122,6 +129,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->output_zero_point = output->params.zero_point; } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(paddings); + if (constant_values != nullptr) { + micro_context->DeallocateTempTfLiteTensor(constant_values); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -209,26 +223,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace pad TfLiteRegistration Register_PAD() { - return {/*init=*/pad::Init, - /*free=*/nullptr, - /*prepare=*/pad::Prepare, - /*invoke=*/pad::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(pad::Init, pad::Prepare, pad::Eval); } // Also register Pad as PadV2. TfLiteRegistration Register_PADV2() { - return {/*init=*/pad::Init, - /*free=*/nullptr, - /*prepare=*/pad::Prepare, - /*invoke=*/pad::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(pad::Init, pad::Prepare, pad::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling.cc index b3781636..a2ef8b62 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling.cc @@ -88,25 +88,11 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } // namespace TfLiteRegistration Register_AVERAGE_POOL_2D() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/PoolingPrepare, - /*invoke=*/AverageEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, PoolingPrepare, AverageEval); } TfLiteRegistration Register_MAX_POOL_2D() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/PoolingPrepare, - /*invoke=*/MaxEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, PoolingPrepare, MaxEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc index fa693240..ddc18f0b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc @@ -54,9 +54,13 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); OpDataPooling* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kPoolingInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kPoolingInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kPoolingOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kPoolingOutputTensor); 
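+  // (Pattern note, valid for every Prepare() in this diff: tensors obtained
+  // from AllocateTempInputTensor()/AllocateTempOutputTensor() are only usable
+  // inside Prepare and must be released with DeallocateTempTfLiteTensor() on
+  // every return path, since they presumably live in a temporary section of
+  // the arena rather than in persistent allocations.)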
TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_STATUS( @@ -71,6 +75,9 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) { &data->activation_max); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu.cc index dc0c32c0..54cc0e02 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu.cc @@ -69,14 +69,7 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration Register_PRELU() { - return {/*init=*/PreluInit, - /*free=*/nullptr, - /*prepare=*/PreluPrepare, - /*invoke=*/PreluEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(PreluInit, PreluPrepare, PreluEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc index 8b840fcb..1a89cadf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc @@ -84,14 +84,22 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); PreluParams* params = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* alpha = GetInput(context, node, 1); + TfLiteTensor* alpha = micro_context->AllocateTempInputTensor(node, 1); TF_LITE_ENSURE(context, alpha != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); - return CalculatePreluParams(input, alpha, output, params); + TF_LITE_ENSURE_OK(context, + CalculatePreluParams(input, alpha, output, params)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(alpha); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize.cc index 97f5a004..b5eb9c3c 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize.cc @@ -34,14 +34,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } // namespace TfLiteRegistration Register_QUANTIZE() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/PrepareQuantizeReference, - /*invoke=*/EvalQuantizeReference, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, PrepareQuantizeReference, + EvalQuantizeReference); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc index 459b0966..94220529 100644 --- 
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc @@ -36,9 +36,11 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); // TODO(b/128934713): Add support for fixed-point per-channel quantization. @@ -51,15 +53,19 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->scale->size == 1); - TF_LITE_ENSURE(context, - input->type == kTfLiteFloat32 || input->type == kTfLiteInt32 || - input->type == kTfLiteInt16 || input->type == kTfLiteInt8); + TF_LITE_ENSURE( + context, input->type == kTfLiteFloat32 || input->type == kTfLiteInt32 || + input->type == kTfLiteInt16 || input->type == kTfLiteInt8 || + input->type == kTfLiteUInt8); TF_LITE_ENSURE(context, output->type == kTfLiteInt8 || output->type == kTfLiteInt16 || - output->type == kTfLiteInt32); + output->type == kTfLiteInt32 || + output->type == kTfLiteUInt8); if ((input->type == kTfLiteInt16 && output->type == kTfLiteInt8) || (input->type == kTfLiteInt8 && output->type == kTfLiteInt8) || + (input->type == kTfLiteInt8 && output->type == kTfLiteUInt8) || + (input->type == kTfLiteUInt8 && output->type == kTfLiteInt8) || (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) || (input->type == kTfLiteInt8 && output->type == kTfLiteInt32) || (input->type == kTfLiteInt16 && output->type == kTfLiteInt16) || @@ -77,6 +83,9 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, data->quantization_params.scale = static_cast(output->params.scale); data->input_zero_point = input->params.zero_point; + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -104,9 +113,9 @@ TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorData(output)); return kTfLiteOk; default: - TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", - TfLiteTypeGetName(input->type), - TfLiteTypeGetName(output->type)); + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); return kTfLiteError; } } else if (input->type == kTfLiteInt32) { @@ -127,9 +136,9 @@ TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorData(output)); break; default: - TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", - TfLiteTypeGetName(input->type), - TfLiteTypeGetName(output->type)); + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); return kTfLiteError; } } else if (input->type == kTfLiteInt16) { @@ -157,9 +166,9 @@ TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorData(output)); return kTfLiteOk; default: - TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", - 
TfLiteTypeGetName(input->type), - TfLiteTypeGetName(output->type)); + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); return kTfLiteError; } } else if (input->type == kTfLiteInt8) { @@ -174,6 +183,13 @@ TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) { data->input_zero_point, data->quantization_params.zero_point, tflite::micro::GetTensorData(output)); break; + case kTfLiteUInt8: + reference_ops::Requantize( + tflite::micro::GetTensorData(input), size, + data->requantize_output_multiplier, data->requantize_output_shift, + data->input_zero_point, data->quantization_params.zero_point, + tflite::micro::GetTensorData(output)); + break; case kTfLiteInt16: reference_ops::Requantize( tflite::micro::GetTensorData(input), size, @@ -189,15 +205,31 @@ TfLiteStatus EvalQuantizeReference(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorData(output)); break; default: - TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", - TfLiteTypeGetName(input->type), - TfLiteTypeGetName(output->type)); + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); + return kTfLiteError; + } + } else if (input->type == kTfLiteUInt8) { + size_t size = ElementCount(*input->dims); + switch (output->type) { + case kTfLiteInt8: + reference_ops::Requantize( + tflite::micro::GetTensorData(input), size, + data->requantize_output_multiplier, data->requantize_output_shift, + data->input_zero_point, data->quantization_params.zero_point, + tflite::micro::GetTensorData(output)); + break; + default: + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); return kTfLiteError; } } else { - TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", - TfLiteTypeGetName(input->type), - TfLiteTypeGetName(output->type)); + MicroPrintf("Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); return kTfLiteError; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc index 024b511a..422c0384 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc @@ -39,13 +39,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(NumInputs(node) == 1); TFLITE_DCHECK(NumOutputs(node) == 1); - const TfLiteTensor* input_resource_id_tensor = - GetInput(context, node, kInputVariableId); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input_resource_id_tensor = + micro_context->AllocateTempInputTensor(node, kInputVariableId); TFLITE_DCHECK(input_resource_id_tensor != nullptr); TFLITE_DCHECK(input_resource_id_tensor->type == kTfLiteResource); TFLITE_DCHECK(NumElements(input_resource_id_tensor) == 1); + micro_context->DeallocateTempTfLiteTensor(input_resource_id_tensor); + return kTfLiteOk; } @@ -58,14 +62,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalOutput(context, node, kOutputValue); TFLITE_DCHECK(output_value != nullptr); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. 
- // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); if (resources == nullptr) { MicroPrintf( "READ_VARIABLE requires resource variables. Please create " @@ -81,14 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace. TfLiteRegistration Register_READ_VARIABLE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc index 6339c98b..7e862ba1 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,320 +23,41 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/reduce.h" #include "tensorflow/lite/micro/micro_utils.h" namespace tflite { -namespace ops { -namespace micro { -namespace reduce { - -constexpr int kMaxNumberOfAxis = 4; -constexpr int kMaxNumberOfReducedAxis = 2; - -struct OpData { - int32_t multiplier; - int shift; - int temp_buffer_idx; - int resolved_axis_idx; - int input_zp; - float input_scale; - int output_zp; - float output_scale; - int num_output_elements; -}; void* InitReduce(TfLiteContext* context, const char* buffer, size_t length) { - return context->AllocatePersistentBuffer(context, sizeof(OpData)); -} - -TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { - // Inputs Tensor (dtype depends on quantization): - // [0] = Input - // [1] = Axis - const TfLiteTensor* input = GetInput(context, node, 0); - - // Outputs Tensor (dtype depends on quantization): - // [0] = Output - - // Validate number of inputs and outputs - TF_LITE_ENSURE_EQ(context, node->inputs->size, 2); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - - // Validate axis type - const TfLiteTensor* axis = GetInput(context, node, 1); - TF_LITE_ENSURE(context, axis != nullptr); - TF_LITE_ENSURE_TYPES_EQ(context, axis->type, kTfLiteInt32); - - if (input->type == kTfLiteInt8) { - OpData* data = static_cast(node->user_data); - const TfLiteTensor* output = GetOutput(context, node, 0); - const double real_multiplier = static_cast(input->params.scale) / - static_cast(output->params.scale); - QuantizeMultiplier(real_multiplier, &data->multiplier, &data->shift); - } - - return kTfLiteOk; + return context->AllocatePersistentBuffer(context, sizeof(OpDataReduce)); } TfLiteStatus PrepareMax(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE_OK(context, 
PrepareSimple(context, node)); - - OpData* op_data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, 0); - const TfLiteTensor* output = GetOutput(context, node, 0); - const TfLiteTensor* axis = GetInput(context, node, 1); - - op_data->input_scale = input->params.scale; - op_data->output_scale = output->params.scale; - op_data->num_output_elements = NumElements(output); - - context->RequestScratchBufferInArena(context, sizeof(int) * input->dims->size, - &op_data->temp_buffer_idx); - context->RequestScratchBufferInArena( - context, sizeof(int) * static_cast(ElementCount(*axis->dims)), - &op_data->resolved_axis_idx); - - return kTfLiteOk; + return PrepareMaxHelper(context, node, + static_cast(node->user_data)); } TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, 0); - OpData* op_data = reinterpret_cast(node->user_data); - const TfLiteTensor* output = GetOutput(context, node, 0); - if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { - const double real_multiplier = static_cast(input->params.scale) / - static_cast(output->params.scale); - QuantizeMultiplier(real_multiplier, &op_data->multiplier, &op_data->shift); - } - - int output_size = NumElements(output); - if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { - context->RequestScratchBufferInArena(context, output_size * sizeof(int32_t), - &op_data->temp_buffer_idx); - op_data->input_zp = input->params.zero_point; - op_data->input_scale = input->params.scale; - op_data->output_zp = output->params.zero_point; - op_data->output_scale = output->params.scale; - } - - TF_LITE_ENSURE_OK(context, PrepareSimple(context, node)); - // TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018) - return kTfLiteOk; -} - -void ResolveAxis(const int* axis_data, int axis_count, - tflite::MeanParams* op_params) { - int i = 0; - for (; i < axis_count; ++i) { - op_params->axis[i] = static_cast(axis_data[i]); - } - for (; i < 4; ++i) { - op_params->axis[i] = 1; - } - op_params->axis_count = axis_count; + return PrepareMeanOrSumHelper(context, node, + static_cast(node->user_data)); } TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) { - const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); - TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); - TfLiteReducerParams* params = - reinterpret_cast(node->builtin_data); - OpData* op_data = reinterpret_cast(node->user_data); - - int num_axis = static_cast(ElementCount(*axis->dims)); - int temp_index[kMaxNumberOfAxis]; - int resolved_axis[kMaxNumberOfReducedAxis]; - - tflite::MeanParams op_params; - ResolveAxis(tflite::micro::GetTensorData(axis), num_axis, &op_params); - - // Special case mean implementation exists for 4D mean across axes 1 and 2. - bool special_case_4d_axes_1_and_2 = - input->dims->size == 4 && op_params.axis_count == 2 && - ((op_params.axis[0] == 1 && op_params.axis[1] == 2) || - (op_params.axis[0] == 2 && op_params.axis[1] == 1)); - - switch (input->type) { - case kTfLiteFloat32: { - // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
- if (params->keep_dims && special_case_4d_axes_1_and_2) { - reference_ops::Mean(op_params, tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output)); - } else { - TF_LITE_ENSURE( - context, - reference_ops::Mean( - tflite::micro::GetTensorData(input), input->dims->data, - input->dims->size, tflite::micro::GetTensorData(output), - output->dims->data, output->dims->size, - tflite::micro::GetTensorData(axis), num_axis, - params->keep_dims, temp_index, resolved_axis, - tflite::micro::GetTensorData(output))); - } - } break; - case kTfLiteInt8: { - // Defer to specialized implementation for 4D Mean across axes 1 & 2. - if (params->keep_dims && special_case_4d_axes_1_and_2) { - reference_integer_ops::Mean( - op_params, op_data->multiplier, op_data->shift, - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), op_data->input_zp, - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output), op_data->output_zp); - } else if (op_data->input_zp == op_data->output_zp && - op_data->input_scale == op_data->output_scale) { - int32_t* temp_buffer = static_cast( - context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - TF_LITE_ENSURE( - context, - reference_ops::Mean( - tflite::micro::GetTensorData(input), input->dims->data, - input->dims->size, tflite::micro::GetTensorData(output), - output->dims->data, output->dims->size, - tflite::micro::GetTensorData(axis), num_axis, - params->keep_dims, temp_index, resolved_axis, temp_buffer)); - } else { - int32_t* temp_buffer = static_cast( - context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - TF_LITE_ENSURE( - context, - reference_ops::QuantizedMeanOrSum( - tflite::micro::GetTensorData(input), op_data->input_zp, - op_data->input_scale, input->dims->data, input->dims->size, - tflite::micro::GetTensorData(output), - op_data->output_zp, op_data->output_scale, output->dims->data, - output->dims->size, tflite::micro::GetTensorData(axis), - num_axis, params->keep_dims, temp_index, resolved_axis, - temp_buffer, false)); - } - } break; - case kTfLiteInt16: { - // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
- if (params->keep_dims && special_case_4d_axes_1_and_2) { - reference_integer_ops::Mean( - op_params, op_data->multiplier, op_data->shift, - tflite::micro::GetTensorShape(input), - tflite::micro::GetTensorData(input), op_data->input_zp, - tflite::micro::GetTensorShape(output), - tflite::micro::GetTensorData(output), op_data->output_zp); - } else if (op_data->input_zp == op_data->output_zp && - op_data->input_scale == op_data->output_scale) { - int32_t* temp_buffer = static_cast( - context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - TF_LITE_ENSURE( - context, - reference_ops::Mean(tflite::micro::GetTensorData(input), - input->dims->data, input->dims->size, - tflite::micro::GetTensorData(output), - output->dims->data, output->dims->size, - tflite::micro::GetTensorData(axis), - num_axis, params->keep_dims, temp_index, - resolved_axis, temp_buffer)); - } else { - int32_t* temp_buffer = static_cast( - context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - TF_LITE_ENSURE( - context, - reference_ops::QuantizedMeanOrSum( - tflite::micro::GetTensorData(input), op_data->input_zp, - op_data->input_scale, input->dims->data, input->dims->size, - tflite::micro::GetTensorData(output), - op_data->output_zp, op_data->output_scale, output->dims->data, - output->dims->size, tflite::micro::GetTensorData(axis), - num_axis, params->keep_dims, temp_index, resolved_axis, - temp_buffer, false)); - } - } break; - default: - TF_LITE_ENSURE_MSG(context, false, - "Currently, only float32, int8 or uint8 input type " - "is supported."); - } - return kTfLiteOk; + return EvalMeanHelper(context, node, + static_cast(node->user_data)); } TfLiteStatus EvalMax(TfLiteContext* context, TfLiteNode* node) { - const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); - TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); - TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); - TfLiteReducerParams* params = - static_cast(node->builtin_data); - OpData* op_data = static_cast(node->user_data); - - // Interpret an axis tensor with null dimensions as a scalar - int num_axis = static_cast(ElementCount(*axis->dims)); - int* temp_buffer = static_cast( - context->GetScratchBuffer(context, op_data->temp_buffer_idx)); - int* resolved_axis = static_cast( - context->GetScratchBuffer(context, op_data->resolved_axis_idx)); - switch (input->type) { - case kTfLiteFloat32: - TF_LITE_ENSURE( - context, - reference_ops::ReduceGeneric( - tflite::micro::GetTensorData(input), input->dims->data, - input->dims->size, tflite::micro::GetTensorData(output), - output->dims->data, output->dims->size, - tflite::micro::GetTensorData(axis), num_axis, - params->keep_dims, temp_buffer, resolved_axis, - std::numeric_limits::lowest(), - [](const float current, const float in) -> float { - return (in > current) ? 
in : current; - })); - break; - case kTfLiteInt8: - TF_LITE_ENSURE_EQ(context, static_cast(op_data->input_scale), - static_cast(op_data->output_scale)); - TF_LITE_ENSURE_EQ(context, op_data->input_zp, op_data->output_zp); - TF_LITE_ENSURE( - context, - reference_ops::ReduceGeneric( - tflite::micro::GetTensorData(input), input->dims->data, - input->dims->size, tflite::micro::GetTensorData(output), - output->dims->data, output->dims->size, - tflite::micro::GetTensorData(axis), num_axis, - params->keep_dims, temp_buffer, resolved_axis, - std::numeric_limits::lowest(), - [](const int8_t current, const int8_t in) -> int8_t { - return (in > current) ? in : current; - })); - break; - default: - TF_LITE_KERNEL_LOG(context, - "Only float32 and int8 types are supported.\n"); - return kTfLiteError; - } - return kTfLiteOk; + OpDataReduce* op_data = static_cast(node->user_data); + return EvalMaxHelper(context, node, op_data); } -} // namespace reduce - TfLiteRegistration Register_MEAN() { - return {/*init=*/reduce::InitReduce, - /*free=*/nullptr, - /*prepare=*/reduce::PrepareMeanOrSum, - /*invoke=*/reduce::EvalMean, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(InitReduce, PrepareMeanOrSum, EvalMean); } TfLiteRegistration Register_REDUCE_MAX() { - return {/*init=*/reduce::InitReduce, - /*free=*/nullptr, - /*prepare=*/reduce::PrepareMax, - /*invoke=*/reduce::EvalMax, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(InitReduce, PrepareMax, EvalMax); } -} // namespace micro -} // namespace ops } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.h new file mode 100644 index 00000000..cd94b3f5 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.h @@ -0,0 +1,61 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_REDUCE_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_REDUCE_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+extern const int kMaxNumberOfAxis;
+extern const int kMaxNumberOfReducedAxis;
+
+struct OpDataReduce {
+  int32_t multiplier;
+  int shift;
+  int temp_buffer_idx;
+  int resolved_axis_idx;
+  int input_zp;
+  float input_scale;
+  int output_zp;
+  float output_scale;
+  int num_output_elements;
+};
+
+TfLiteStatus PrepareMaxHelper(TfLiteContext* context, TfLiteNode* node,
+                              OpDataReduce* op_data);
+
+TfLiteStatus PrepareMeanOrSumHelper(TfLiteContext* context, TfLiteNode* node,
+                                    OpDataReduce* op_data);
+
+TfLiteStatus EvalMaxHelper(TfLiteContext* context, TfLiteNode* node,
+                           OpDataReduce* op_data);
+TfLiteStatus EvalMeanHelper(TfLiteContext* context, TfLiteNode* node,
+                            OpDataReduce* op_data);
+
+void ReduceResolveAxis(const int* axis_data, int axis_count,
+                       MeanParams* op_params);
+
+TfLiteRegistration Register_MEAN();
+TfLiteRegistration Register_REDUCE_MAX();
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_REDUCE_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce_common.cc
new file mode 100644
index 00000000..97452300
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce_common.cc
@@ -0,0 +1,311 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h" +#include "tensorflow/lite/kernels/internal/reference/reduce.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/reduce.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" +#include "tensorflow/lite/micro/micro_utils.h" + +namespace tflite { + +const int kMaxNumberOfAxis = 4; +const int kMaxNumberOfReducedAxis = 2; + +TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node, + int32_t* multiplier, int* shift) { + MicroContext* micro_context = GetMicroContext(context); + + // Inputs Tensor (dtype depends on quantization): + // [0] = Input + // [1] = Axis + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + + // Outputs Tensor (dtype depends on quantization): + // [0] = Output + + // Validate number of inputs and outputs + TF_LITE_ENSURE_EQ(context, node->inputs->size, 2); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + + // Validate axis type + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1); + TF_LITE_ENSURE(context, axis != nullptr); + TF_LITE_ENSURE_TYPES_EQ(context, axis->type, kTfLiteInt32); + + if (input->type == kTfLiteInt8) { + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + const double real_multiplier = static_cast(input->params.scale) / + static_cast(output->params.scale); + QuantizeMultiplier(real_multiplier, multiplier, shift); + micro_context->DeallocateTempTfLiteTensor(output); + } + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; +} + +TfLiteStatus PrepareMaxHelper(TfLiteContext* context, TfLiteNode* node, + OpDataReduce* op_data) { + TF_LITE_ENSURE_OK(context, PrepareSimple(context, node, &op_data->multiplier, + &op_data->shift)); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1); + + op_data->input_scale = input->params.scale; + op_data->output_scale = output->params.scale; + op_data->num_output_elements = NumElements(output); + + context->RequestScratchBufferInArena(context, sizeof(int) * input->dims->size, + &op_data->temp_buffer_idx); + context->RequestScratchBufferInArena( + context, sizeof(int) * static_cast(ElementCount(*axis->dims)), + &op_data->resolved_axis_idx); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(axis); + return kTfLiteOk; +} + +TfLiteStatus PrepareMeanOrSumHelper(TfLiteContext* context, TfLiteNode* node, + OpDataReduce* op_data) { + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { + const double real_multiplier = 
static_cast(input->params.scale) / + static_cast(output->params.scale); + QuantizeMultiplier(real_multiplier, &op_data->multiplier, &op_data->shift); + } + + int output_size = NumElements(output); + if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { + context->RequestScratchBufferInArena(context, output_size * sizeof(int32_t), + &op_data->temp_buffer_idx); + op_data->input_zp = input->params.zero_point; + op_data->input_scale = input->params.scale; + op_data->output_zp = output->params.zero_point; + op_data->output_scale = output->params.scale; + } + + TF_LITE_ENSURE_OK( + context, + PrepareSimple(context, node, &(op_data->multiplier), &(op_data->shift))); + // TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018) + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +void ResolveAxis(const int* axis_data, int axis_count, + tflite::MeanParams* op_params) { + int i = 0; + for (; i < axis_count; ++i) { + op_params->axis[i] = static_cast(axis_data[i]); + } + for (; i < 4; ++i) { + op_params->axis[i] = 1; + } + op_params->axis_count = axis_count; +} + +TfLiteStatus EvalMeanHelper(TfLiteContext* context, TfLiteNode* node, + OpDataReduce* op_data) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TfLiteReducerParams* params = + reinterpret_cast(node->builtin_data); + + int num_axis = static_cast(ElementCount(*axis->dims)); + int temp_index[kMaxNumberOfAxis]; + int resolved_axis[kMaxNumberOfReducedAxis]; + + tflite::MeanParams op_params; + ResolveAxis(tflite::micro::GetTensorData(axis), num_axis, &op_params); + + // Special case mean implementation exists for 4D mean across axes 1 and 2. + bool special_case_4d_axes_1_and_2 = + input->dims->size == 4 && op_params.axis_count == 2 && + ((op_params.axis[0] == 1 && op_params.axis[1] == 2) || + (op_params.axis[0] == 2 && op_params.axis[1] == 1)); + + switch (input->type) { + case kTfLiteFloat32: { + // Defer to specialized implementation for 4D Mean across axes 1 & 2. + if (params->keep_dims && special_case_4d_axes_1_and_2) { + reference_ops::Mean(op_params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + TF_LITE_ENSURE( + context, + reference_ops::Mean( + tflite::micro::GetTensorData(input), input->dims->data, + input->dims->size, tflite::micro::GetTensorData(output), + output->dims->data, output->dims->size, + tflite::micro::GetTensorData(axis), num_axis, + params->keep_dims, temp_index, resolved_axis, + tflite::micro::GetTensorData(output))); + } + } break; + case kTfLiteInt8: { + // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
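+        // Three paths below: (1) keep_dims 4-D mean over axes {1,2} has a
+        // specialized integer kernel; (2) if input and output share scale and
+        // zero point, a plain Mean into an int32 scratch buffer suffices;
+        // (3) otherwise QuantizedMeanOrSum rescales between the two
+        // quantization parameter sets.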
+ if (params->keep_dims && special_case_4d_axes_1_and_2) { + reference_integer_ops::Mean( + op_params, op_data->multiplier, op_data->shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), op_data->input_zp, + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), op_data->output_zp); + } else if (op_data->input_zp == op_data->output_zp && + op_data->input_scale == op_data->output_scale) { + int32_t* temp_buffer = static_cast( + context->GetScratchBuffer(context, op_data->temp_buffer_idx)); + TF_LITE_ENSURE( + context, + reference_ops::Mean( + tflite::micro::GetTensorData(input), input->dims->data, + input->dims->size, tflite::micro::GetTensorData(output), + output->dims->data, output->dims->size, + tflite::micro::GetTensorData(axis), num_axis, + params->keep_dims, temp_index, resolved_axis, temp_buffer)); + } else { + int32_t* temp_buffer = static_cast( + context->GetScratchBuffer(context, op_data->temp_buffer_idx)); + TF_LITE_ENSURE( + context, + reference_ops::QuantizedMeanOrSum( + tflite::micro::GetTensorData(input), op_data->input_zp, + op_data->input_scale, input->dims->data, input->dims->size, + tflite::micro::GetTensorData(output), + op_data->output_zp, op_data->output_scale, output->dims->data, + output->dims->size, tflite::micro::GetTensorData(axis), + num_axis, params->keep_dims, temp_index, resolved_axis, + temp_buffer, false)); + } + } break; + case kTfLiteInt16: { + // Defer to specialized implementation for 4D Mean across axes 1 & 2. + if (params->keep_dims && special_case_4d_axes_1_and_2) { + reference_integer_ops::Mean( + op_params, op_data->multiplier, op_data->shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), op_data->input_zp, + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), op_data->output_zp); + } else if (op_data->input_zp == op_data->output_zp && + op_data->input_scale == op_data->output_scale) { + int32_t* temp_buffer = static_cast( + context->GetScratchBuffer(context, op_data->temp_buffer_idx)); + TF_LITE_ENSURE( + context, + reference_ops::Mean(tflite::micro::GetTensorData(input), + input->dims->data, input->dims->size, + tflite::micro::GetTensorData(output), + output->dims->data, output->dims->size, + tflite::micro::GetTensorData(axis), + num_axis, params->keep_dims, temp_index, + resolved_axis, temp_buffer)); + } else { + int32_t* temp_buffer = static_cast( + context->GetScratchBuffer(context, op_data->temp_buffer_idx)); + TF_LITE_ENSURE( + context, + reference_ops::QuantizedMeanOrSum( + tflite::micro::GetTensorData(input), op_data->input_zp, + op_data->input_scale, input->dims->data, input->dims->size, + tflite::micro::GetTensorData(output), + op_data->output_zp, op_data->output_scale, output->dims->data, + output->dims->size, tflite::micro::GetTensorData(axis), + num_axis, params->keep_dims, temp_index, resolved_axis, + temp_buffer, false)); + } + } break; + default: + TF_LITE_ENSURE_MSG(context, false, + "Currently, only float32, int8 or uint8 input type " + "is supported."); + } + return kTfLiteOk; +} + +TfLiteStatus EvalMaxHelper(TfLiteContext* context, TfLiteNode* node, + OpDataReduce* op_data) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TfLiteReducerParams* params = + 
static_cast(node->builtin_data); + + // Interpret an axis tensor with null dimensions as a scalar + int num_axis = static_cast(ElementCount(*axis->dims)); + int* temp_buffer = static_cast( + context->GetScratchBuffer(context, op_data->temp_buffer_idx)); + int* resolved_axis = static_cast( + context->GetScratchBuffer(context, op_data->resolved_axis_idx)); + switch (input->type) { + case kTfLiteFloat32: + TF_LITE_ENSURE( + context, + reference_ops::ReduceGeneric( + tflite::micro::GetTensorData(input), input->dims->data, + input->dims->size, tflite::micro::GetTensorData(output), + output->dims->data, output->dims->size, + tflite::micro::GetTensorData(axis), num_axis, + params->keep_dims, temp_buffer, resolved_axis, + std::numeric_limits::lowest(), + [](const float current, const float in) -> float { + return (in > current) ? in : current; + })); + break; + case kTfLiteInt8: + TF_LITE_ENSURE_EQ(context, static_cast(op_data->input_scale), + static_cast(op_data->output_scale)); + TF_LITE_ENSURE_EQ(context, op_data->input_zp, op_data->output_zp); + TF_LITE_ENSURE( + context, + reference_ops::ReduceGeneric( + tflite::micro::GetTensorData(input), input->dims->data, + input->dims->size, tflite::micro::GetTensorData(output), + output->dims->data, output->dims->size, + tflite::micro::GetTensorData(axis), num_axis, + params->keep_dims, temp_buffer, resolved_axis, + std::numeric_limits::lowest(), + [](const int8_t current, const int8_t in) -> int8_t { + return (in > current) ? in : current; + })); + break; + default: + MicroPrintf("Only float32 and int8 types are supported."); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc index 8e47e2a0..832ba261 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
 ==============================================================================*/
 
+#include <cstring>
+
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
@@ -31,9 +33,13 @@ constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
   TF_LITE_ENSURE(context, input != nullptr);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
   TF_LITE_ENSURE(context, output != nullptr);
   // Tensorflow's Reshape allows one of the shape components to have the
   // special -1 value, meaning it will be calculated automatically based on the
@@ -68,6 +74,9 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
   TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
@@ -93,9 +102,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Do nothing for in-place reshape.
   if (input->data.raw != output->data.raw) {
     // Otherwise perform reshape with copy.
-    for (size_t i = 0; i < input_bytes; ++i) {
-      output->data.raw[i] = input->data.raw[i];
-    }
+    memcpy(output->data.raw, input->data.raw, input_bytes);
   }
   return kTfLiteOk;
 }
@@ -103,14 +110,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace reshape
 
 TfLiteRegistration Register_RESHAPE() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/reshape::Prepare,
-          /*invoke=*/reshape::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, reshape::Prepare, reshape::Eval);
 }
 
 }  // namespace micro
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc
index 465bf516..a90057b9 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc
@@ -30,12 +30,17 @@ constexpr int kSizeTensor = 1;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TfLiteTensor* size =
+      micro_context->AllocateTempInputTensor(node, kSizeTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
   TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
@@ -55,6 +60,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     return kTfLiteError;
   }
 
+
micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -103,14 +111,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_RESIZE_BILINEAR() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc index 63c302bb..ce507445 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc @@ -33,12 +33,17 @@ constexpr int kSizeTensor = 1; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* size = GetInput(context, node, kSizeTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* size = + micro_context->AllocateTempInputTensor(node, kSizeTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); // Our current implementations rely on the input being 4D, // and the size being 1D tensor with exactly 2 elements. 
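Both resize kernels enforce the same shape contract as the comment above: a 4-D NHWC input plus a 1-D size tensor {new_height, new_width}. A self-contained illustration of how that contract determines the output shape (hypothetical helper, not code from this diff):

#include <array>
#include <cstdio>

// Batch and channel dimensions pass through; height and width come from
// the size tensor.
static std::array<int, 4> ResizedShape(const std::array<int, 4>& nhwc,
                                       const std::array<int, 2>& size) {
  return {nhwc[0], size[0], size[1], nhwc[3]};
}

int main() {
  const auto out = ResizedShape({1, 8, 8, 3}, {16, 16});
  std::printf("%dx%dx%dx%d\n", out[0], out[1], out[2], out[3]);  // 1x16x16x3
  return 0;
}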
@@ -53,6 +58,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_KERNEL_LOG(context, "Dynamic tensors are unsupported in tfmicro."); return kTfLiteError; } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -107,14 +117,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace resize_nearest_neighbor TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/resize_nearest_neighbor::Prepare, - /*invoke=*/resize_nearest_neighbor::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, resize_nearest_neighbor::Prepare, + resize_nearest_neighbor::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc index 5804016b..0bda8783 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc @@ -29,9 +29,13 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -42,6 +46,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -61,14 +68,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace round TfLiteRegistration Register_ROUND() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/round::Prepare, - /*invoke=*/round::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, round::Prepare, round::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/shape.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/shape.cc index df962f62..02f663a8 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/shape.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/shape.cc @@ -60,14 +60,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_SHAPE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc 
b/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc index 51ee70de..212cf47f 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc @@ -45,16 +45,22 @@ void GetBeginAndSizeVectors(int dimensions, const TfLiteEvalTensor* begin, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TFLITE_DCHECK(input != nullptr); - const TfLiteTensor* begin = GetInput(context, node, kBeginTensor); + TfLiteTensor* begin = + micro_context->AllocateTempInputTensor(node, kBeginTensor); TFLITE_DCHECK(begin != nullptr); - const TfLiteTensor* size = GetInput(context, node, kSizeTensor); + TfLiteTensor* size = + micro_context->AllocateTempInputTensor(node, kSizeTensor); TFLITE_DCHECK(size != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TFLITE_DCHECK(output != nullptr); // Ensure validity of input tensor and its dimension. @@ -66,6 +72,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(NumDimensions(size) == 1); TFLITE_DCHECK(NumElements(begin) == NumElements(size)); TFLITE_DCHECK(NumDimensions(input) <= kMaxDim); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(begin); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -139,14 +151,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_SLICE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.cc index f6a30010..c2cee3c5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.cc @@ -83,14 +83,7 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_SOFTMAX() { - return {/*init=*/SoftmaxInit, - /*free=*/nullptr, - /*prepare=*/SoftmaxPrepare, - /*invoke=*/SoftmaxEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(SoftmaxInit, SoftmaxPrepare, SoftmaxEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.h index 8d605eab..7096d202 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,6 +23,13 @@ namespace tflite { void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length); +// Common helper function to SoftmaxPrepare. +TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, + const TfLiteTensor* input, + TfLiteTensor* output, + const TfLiteSoftmaxParams* params, + SoftmaxParams* op_data); + TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node); // This is the most generic TfLiteRegistration. The actual supported types may @@ -30,7 +37,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node); // (reference or optimized) must define this function. TfLiteRegistration Register_SOFTMAX(); -#if defined(XTENSA) +#if defined(XTENSA) || defined(CMSIS_NN) // Returns a TfLiteRegistration struct for kernel variant that only supports // int8 input and int16 output. TfLiteRegistration Register_SOFTMAX_INT8_INT16(); @@ -40,6 +47,23 @@ inline TfLiteRegistration Register_SOFTMAX_INT8_INT16() { } #endif +#if defined(CMSIS_NN) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8 input/output and uses the latency optimized implementations. +TfLiteRegistration Register_SOFTMAX_INT8(); + +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int16 input/output and uses the latency optimized implementations. +TfLiteRegistration Register_SOFTMAX_INT16(); + +#else +inline TfLiteRegistration Register_SOFTMAX_INT8() { return Register_SOFTMAX(); } + +inline TfLiteRegistration Register_SOFTMAX_INT16() { + return Register_SOFTMAX(); +} +#endif + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_SOFTMAX_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc index 5521b543..b5378dae 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/softmax.h" +#include "tensorflow/lite/micro/micro_context.h" namespace tflite { @@ -27,11 +28,59 @@ namespace { // Softmax parameter data that persists in user_data const int kInt16LUTArraySize = 513; +TfLiteStatus InitializeLutForInt16(TfLiteContext* context, + const TfLiteTensor* input, + TfLiteTensor* output, + SoftmaxParams* op_data) { + // Only allocate LUTs for KTfLiteInt16 data type + if (input->type == kTfLiteInt16) { + void* raw_exp_lut = context->AllocatePersistentBuffer( + context, sizeof(int16_t) * kInt16LUTArraySize); + TF_LITE_ENSURE(context, raw_exp_lut != nullptr); + op_data->exp_lut = reinterpret_cast(raw_exp_lut); + void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer( + context, sizeof(int16_t) * kInt16LUTArraySize); + TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr); + op_data->one_over_one_plus_x_lut = + reinterpret_cast(one_over_one_plus_x_lut); + } + + if (output->type == kTfLiteInt16) { + TF_LITE_ENSURE(context, + input->type == kTfLiteInt8 || input->type == kTfLiteInt16); + } else { + TF_LITE_ENSURE_EQ(context, input->type, output->type); + } + + // Populate LUT if required + if (input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + // exp LUT only used on negative values + // we consider exp(-10.0) is insignificant to accumulation + gen_lut( + [](float value) { return std::exp(value); }, -10.0f, 0.0f, -1.0f, 1.0f, + op_data->exp_lut); + gen_lut( + [](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f, -1.0f, + 1.0f, op_data->one_over_one_plus_x_lut); + op_data->zero_point = output->params.zero_point; + op_data->scale = output->params.scale; + } + + return kTfLiteOk; +} + +} // namespace + TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output, const TfLiteSoftmaxParams* params, SoftmaxParams* op_data) { + if (InitializeLutForInt16(context, input, output, op_data) != kTfLiteOk) { + return kTfLiteError; + } + if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { if (input->type == kTfLiteInt16) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); @@ -82,61 +131,32 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context, return kTfLiteOk; } -} // namespace - void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); return context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams)); } TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE(context, node->user_data != nullptr); SoftmaxParams* op_data = static_cast(node->user_data); - // Only allocate LUTs for KTfLiteInt16 data type - if (input->type == kTfLiteInt16) { - void* raw_exp_lut = context->AllocatePersistentBuffer( - context, sizeof(int16_t) * kInt16LUTArraySize); - 
-    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
-    op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
-    void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
-        context, sizeof(int16_t) * kInt16LUTArraySize);
-    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
-    op_data->one_over_one_plus_x_lut =
-        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
-  }
-
-  if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE(context,
-                   input->type == kTfLiteInt8 || input->type == kTfLiteInt16);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input->type, output->type);
-  }
-
-  // Populate LUT if required
-  if (input->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    // exp LUT only used on negative values
-    // we consider exp(-10.0) is insignificant to accumulation
-    gen_lut<float, int16_t, int16_t>(
-        [](float value) { return std::exp(value); }, -10.0f, 0.0f, -1.0f, 1.0f,
-        op_data->exp_lut);
-    gen_lut<float, int16_t, int16_t>(
-        [](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f, -1.0f,
-        1.0f, op_data->one_over_one_plus_x_lut);
-    op_data->zero_point = output->params.zero_point;
-    op_data->scale = output->params.scale;
-  }
   auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-  return CalculateSoftmaxParams(context, input, output, params, op_data);
+  auto ret_val =
+      CalculateSoftmaxParams(context, input, output, params, op_data);
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return ret_val;
 }
 
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc
index fdfb81bc..21f81312 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc
@@ -44,11 +44,15 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
   TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
 
   TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum);
@@ -57,6 +61,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum);
   TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
 
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
 
@@ -108,14 +114,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace.
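Every `Register_*` hunk in this diff, including the `Register_SPACE_TO_BATCH_ND` hunk just below, collapses the same boilerplate: a brace-initialized `TfLiteRegistration` with mostly-null members becomes a call to `tflite::micro::RegisterOp`. The helper is roughly equivalent to this simplified sketch (the real declaration lives in the TFLM micro kernel utilities):

```cpp
// Packs the three callbacks into a TfLiteRegistration and zeroes the rest,
// exactly like the hand-written initializers it replaces.
TfLiteRegistration RegisterOp(
    void* (*init)(TfLiteContext* context, const char* buffer, size_t length),
    TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node),
    TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node)) {
  return {/*init=*/init,
          /*free=*/nullptr,
          /*prepare=*/prepare,
          /*invoke=*/invoke,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
}
```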
TfLiteRegistration Register_SPACE_TO_BATCH_ND() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc index 7bd86bad..30519b27 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc @@ -39,11 +39,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); @@ -75,6 +78,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output->dims->data[kDepthRank] = input->dims->data[kDepthRank] * block_size * block_size; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -115,14 +121,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_SPACE_TO_DEPTH() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc index 82f17228..06584d45 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc @@ -69,7 +69,8 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* axis = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, axis != nullptr); // Dynamic output tensors are needed if axis tensor is not constant. @@ -77,6 +78,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // constant axis tensor for now. 
TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis), "Non constant axis tensor not supported"); + + micro_context->DeallocateTempTfLiteTensor(axis); return kTfLiteOk; } @@ -117,14 +120,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace split TfLiteRegistration Register_SPLIT() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/split::Prepare, - /*invoke=*/split::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, split::Prepare, split::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc index c2a01149..3ea35130 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc @@ -74,13 +74,14 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node, TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + MicroContext* micro_context = GetMicroContext(context); // Dynamic output tensors are needed if axis tensor is not constant. // But Micro doesn't support dynamic memory allocation, so we only support // constant axis tensor for now. - const TfLiteTensor* axis = GetInput(context, node, 2); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 2); TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis), "Non constant axis tensor not supported"); - + micro_context->DeallocateTempTfLiteTensor(axis); return kTfLiteOk; } @@ -120,14 +121,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace split_v TfLiteRegistration Register_SPLIT_V() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/split_v::Prepare, - /*invoke=*/split_v::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, split_v::Prepare, split_v::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/squared_difference.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squared_difference.cc new file mode 100644 index 00000000..ca924e26 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squared_difference.cc @@ -0,0 +1,247 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_context.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+namespace {
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  bool requires_broadcast;
+  ArithmeticParams arithmetic_params;
+};
+
+template <typename T>
+T SquaredDifference(T input1, T input2) {
+  const T difference = input1 - input2;
+  return difference * difference;
+}
+
+void* SquaredDifferenceInit(TfLiteContext* context, const char* buffer,
+                            size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
+TfLiteStatus SquaredDifferencePrepare(TfLiteContext* context,
+                                      TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+  data->requires_broadcast = false;
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TfLiteTensor* input1 =
+      micro_context->AllocateTempInputTensor(node, kInputTensor1);
+  TF_LITE_ENSURE(context, input1 != nullptr);
+  TfLiteTensor* input2 =
+      micro_context->AllocateTempInputTensor(node, kInputTensor2);
+  TF_LITE_ENSURE(context, input2 != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  // Ensure the quantization parameters are equivalent.
+  if (input1->type == kTfLiteInt8) {
+    const auto& input1_quantization_params = input1->params;
+    const auto& input2_quantization_params = input2->params;
+    const auto& output_quantization_params = output->params;
+    const int32_t integer_type_min = std::numeric_limits<int8_t>::min();
+    const int32_t integer_type_max = std::numeric_limits<int8_t>::max();
+    TF_LITE_ENSURE(context,
+                   input1_quantization_params.zero_point >= integer_type_min);
+    TF_LITE_ENSURE(context,
+                   input1_quantization_params.zero_point <= integer_type_max);
+    TF_LITE_ENSURE(context,
+                   input2_quantization_params.zero_point >= integer_type_min);
+    TF_LITE_ENSURE(context,
+                   input2_quantization_params.zero_point <= integer_type_max);
+    TF_LITE_ENSURE(context,
+                   output_quantization_params.zero_point >= integer_type_min);
+    TF_LITE_ENSURE(context,
+                   output_quantization_params.zero_point <= integer_type_max);
+    data->arithmetic_params.input1_offset =
+        -input1_quantization_params.zero_point;
+    data->arithmetic_params.input2_offset =
+        -input2_quantization_params.zero_point;
+    data->arithmetic_params.output_offset =
+        output_quantization_params.zero_point;
+
+    // shift to make integer for scales.
+    // 7 is selected so that maximum shifted result 255^2 * (1 << (7 * 2 ))
+    // does not overflow signed 32-bit integer
+    data->arithmetic_params.left_shift = 7;
+    const double twice_max_input_scale =
+        2.0 * static_cast<double>(std::max(input1_quantization_params.scale,
+                                           input2_quantization_params.scale));
+    const double real_input1_multiplier =
+        static_cast<double>(input1_quantization_params.scale) /
+        twice_max_input_scale;
+    double real_input2_multiplier =
+        static_cast<double>(input2_quantization_params.scale) /
+        twice_max_input_scale;
+    const double real_output_multiplier =
+        (twice_max_input_scale * twice_max_input_scale) /
+        static_cast<double>((1 << data->arithmetic_params.left_shift * 2) *
+                            output_quantization_params.scale);
+    QuantizeMultiplierSmallerThanOneExp(
+        real_input1_multiplier, &data->arithmetic_params.input1_multiplier,
+        &data->arithmetic_params.input1_shift);
+    QuantizeMultiplierSmallerThanOneExp(
+        real_input2_multiplier, &data->arithmetic_params.input2_multiplier,
+        &data->arithmetic_params.input2_shift);
+    QuantizeMultiplierSmallerThanOneExp(
+        real_output_multiplier, &data->arithmetic_params.output_multiplier,
+        &data->arithmetic_params.output_shift);
+    data->arithmetic_params.quantized_activation_min =
+        std::numeric_limits<int8_t>::min();
+    data->arithmetic_params.quantized_activation_max =
+        std::numeric_limits<int8_t>::max();
+  }
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  micro_context->DeallocateTempTfLiteTensor(input1);
+  micro_context->DeallocateTempTfLiteTensor(input2);
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return kTfLiteOk;
+}
+
+inline int8_t SquaredDifference(int8_t x, int8_t y,
+                                const ArithmeticParams& params) {
+  const int32_t input1_val = params.input1_offset + x;
+  const int32_t input2_val = params.input2_offset + y;
+  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+  const int32_t scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  const int32_t scaled_input2_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input2_val, params.input2_multiplier, params.input2_shift);
+  const int32_t raw_diff = scaled_input1_val - scaled_input2_val;
+
+  // Max of this is 255^2 * (1 << 14), so won't overflow 32 bits.
+  const int32_t squared_raw_diff = raw_diff * raw_diff;
+  const int32_t raw_output =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          squared_raw_diff, params.output_multiplier, params.output_shift) +
+      params.output_offset;
+  const int32_t clamped_output =
+      std::min(params.quantized_activation_max,
+               std::max(params.quantized_activation_min, raw_output));
+  return static_cast<int8_t>(clamped_output);
+}
+
+template <typename T>
+void EvalQuantizedSquaredDifference(TfLiteContext* context, TfLiteNode* node,
+                                    const OpData* data,
+                                    const TfLiteEvalTensor* input1,
+                                    const TfLiteEvalTensor* input2,
+                                    TfLiteEvalTensor* output) {
+  const auto* op_data = static_cast<const OpData*>(node->user_data);
+  if (data->requires_broadcast) {
+    reference_integer_ops::BroadcastBinaryFunction4DSlow(
+        op_data->arithmetic_params, tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<T>(output),
+        reference_integer_ops::CheckArithmeticParams, SquaredDifference);
+  } else {
+    const int flat_size = tflite::micro::GetTensorShape(input1).FlatSize();
+    reference_integer_ops::ElementWise(
+        flat_size, op_data->arithmetic_params,
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorData<T>(output),
+        reference_integer_ops::CheckArithmeticParams, SquaredDifference);
+  }
+}
+
+template <typename T>
+void EvalSquaredDifference(TfLiteContext* context, TfLiteNode* node,
+                           const OpData* data, const TfLiteEvalTensor* input1,
+                           const TfLiteEvalTensor* input2,
+                           TfLiteEvalTensor* output) {
+  if (data->requires_broadcast) {
+    reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<T>(output), SquaredDifference);
+  } else {
+    reference_ops::BinaryFunction<T, T, T>(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorData<T>(input1),
+        tflite::micro::GetTensorShape(input2),
+        tflite::micro::GetTensorData<T>(input2),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<T>(output), SquaredDifference);
+  }
+}
+
+TfLiteStatus SquaredDifferenceEval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalSquaredDifference<float>(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt8) {
+    EvalQuantizedSquaredDifference<int8_t>(context, node, data, input1, input2,
+                                           output);
+  } else {
+    MicroPrintf(
+        "SquaredDifference only supports FLOAT32, INT32 and INT8 now, got %d.",
+        output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteRegistration Register_SQUARED_DIFFERENCE() {
+  return tflite::micro::RegisterOp(
+      SquaredDifferenceInit, SquaredDifferencePrepare, SquaredDifferenceEval);
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc
b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc index 522c2d0e..e81b5b56 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc @@ -27,12 +27,19 @@ namespace tflite { namespace { struct SqueezeContext { - SqueezeContext(TfLiteContext* context, TfLiteNode* node) - : params(reinterpret_cast(node->builtin_data)), - input(GetInput(context, node, 0)), - output(GetOutput(context, node, 0)) {} + SqueezeContext(TfLiteContext* context, TfLiteNode* node) { + params = reinterpret_cast(node->builtin_data); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, 0); + output = micro_context->AllocateTempOutputTensor(node, 0); + } + ~SqueezeContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + } + MicroContext* micro_context; TfLiteSqueezeParams* params; - const TfLiteTensor* const input; + TfLiteTensor* input; TfLiteTensor* output; }; @@ -80,32 +87,31 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - SqueezeContext op_context(context, node); + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - if (op_context.input->type == kTfLiteString) { + if (input->type == kTfLiteString) { TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(op_context.input->type), - op_context.input->type); + TfLiteTypeGetName(input->type), input->type); return kTfLiteError; } - TF_LITE_ENSURE_EQ(context, op_context.input->bytes, op_context.output->bytes); - memcpy(op_context.output->data.raw, op_context.input->data.raw, - op_context.input->bytes); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + size_t input_byte_size; + size_t output_byte_size; + TF_LITE_ENSURE_OK(context, + TfLiteEvalTensorByteLength(input, &input_byte_size)); + TF_LITE_ENSURE_OK(context, + TfLiteEvalTensorByteLength(output, &output_byte_size)); + + TF_LITE_ENSURE_EQ(context, input_byte_size, output_byte_size); + memcpy(output->data.raw, input->data.raw, input_byte_size); return kTfLiteOk; } } // namespace TfLiteRegistration Register_SQUEEZE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc index 58582bbf..832e2ccd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc @@ -38,18 +38,27 @@ constexpr int kOutputTensor = 0; struct StridedSliceContext { StridedSliceContext(TfLiteContext* context, TfLiteNode* node) { params = reinterpret_cast(node->builtin_data); - input = GetInput(context, node, kInputTensor); - begin = GetInput(context, node, kBeginTensor); - end = GetInput(context, node, kEndTensor); - strides = GetInput(context, node, kStridesTensor); - output = GetOutput(context, node, kOutputTensor); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, kInputTensor); + begin = micro_context->AllocateTempInputTensor(node, 
kBeginTensor); + end = micro_context->AllocateTempInputTensor(node, kEndTensor); + strides = micro_context->AllocateTempInputTensor(node, kStridesTensor); + output = micro_context->AllocateTempOutputTensor(node, kOutputTensor); dims = NumDimensions(input); } + ~StridedSliceContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(begin); + micro_context->DeallocateTempTfLiteTensor(end); + micro_context->DeallocateTempTfLiteTensor(strides); + micro_context->DeallocateTempTfLiteTensor(output); + } const TfLiteStridedSliceParams* params; - const TfLiteTensor* input; - const TfLiteTensor* begin; - const TfLiteTensor* end; - const TfLiteTensor* strides; + MicroContext* micro_context; + TfLiteTensor* input; + TfLiteTensor* begin; + TfLiteTensor* end; + TfLiteTensor* strides; TfLiteTensor* output; int dims; }; @@ -184,14 +193,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace strided_slice TfLiteRegistration Register_STRIDED_SLICE() { - return {/*init=*/strided_slice::Init, - /*free=*/nullptr, - /*prepare=*/strided_slice::Prepare, - /*invoke=*/strided_slice::Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(strided_slice::Init, strided_slice::Prepare, + strided_slice::Eval); } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub.cc index de99149f..40bddbad 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub.cc @@ -162,14 +162,7 @@ TfLiteStatus SubEval(TfLiteContext* context, TfLiteNode* node) { } TfLiteRegistration Register_SUB() { - return {/*init=*/SubInit, - /*free=*/nullptr, - /*prepare=*/SubPrepare, - /*invoke=*/SubEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(SubInit, SubPrepare, SubEval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc index bb30780b..d6647462 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc @@ -83,15 +83,24 @@ TfLiteStatus SubPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataSub* data = static_cast(node->user_data); auto* params = reinterpret_cast(node->builtin_data); - const TfLiteTensor* input1 = GetInput(context, node, kSubInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kSubInputTensor1); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kSubInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kSubInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); - TfLiteTensor* output = GetOutput(context, node, kSubOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kSubOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_STATUS( CalculateOpDataSub(context, params, input1, input2, output, data)); + + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + 
micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf.cc index f8a2bed2..5994db94 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf.cc @@ -100,14 +100,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_SVDF() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/PrepareSvdf, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, PrepareSvdf, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc index d1cbd26e..fb92b4fd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc @@ -364,6 +364,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { const auto* params = static_cast(node->builtin_data); + MicroContext* micro_context = GetMicroContext(context); + // Validate Tensor Inputs (dtype depends on quantization): // [0] = Input, {2, batch_size, input_size} // [1] = Weights Feature, {2, num_filters, input_size} @@ -371,18 +373,19 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { // [3] = Bias (optional), {1, num_units} // [4] = Activation State (variable), // {2, batch_size, memory_size * num_filters} - const TfLiteTensor* input = GetInput(context, node, kSvdfInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kSvdfInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* weights_feature = - GetInput(context, node, kSvdfWeightsFeatureTensor); + TfLiteTensor* weights_feature = + micro_context->AllocateTempInputTensor(node, kSvdfWeightsFeatureTensor); TF_LITE_ENSURE(context, weights_feature != nullptr); - const TfLiteTensor* weights_time = - GetInput(context, node, kSvdfWeightsTimeTensor); + TfLiteTensor* weights_time = + micro_context->AllocateTempInputTensor(node, kSvdfWeightsTimeTensor); TF_LITE_ENSURE(context, weights_time != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kSvdfBiasTensor); - const TfLiteTensor* activation_state = - GetInput(context, node, kSvdfInputActivationStateTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kSvdfBiasTensor); + TfLiteTensor* activation_state = micro_context->AllocateTempInputTensor( + node, kSvdfInputActivationStateTensor); TF_LITE_ENSURE(context, activation_state != nullptr); // Define input constants based on input tensor definition above: @@ -402,7 +405,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { // Validate Tensor Output: // [0] = float/int8_t, {2, batch_size, num_units} TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TfLiteTensor* output = GetOutput(context, node, kSvdfOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kSvdfOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2); TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size); @@ -498,6 +502,12 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, 
TfLiteNode* node) { TF_LITE_ENSURE_OK(context, scratch_status); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(weights_feature); + micro_context->DeallocateTempTfLiteTensor(weights_time); + micro_context->DeallocateTempTfLiteTensor(activation_state); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(bias); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc index a9ede9eb..e97a9035 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc @@ -48,11 +48,14 @@ void* TanhInit(TfLiteContext* context, const char* buffer, size_t length) { TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, OpData* data) { + MicroContext* micro_context = GetMicroContext(context); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -69,6 +72,62 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31); } + + if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // These operators are implemented in fixed-point arithmetic, + // which intrinsically wants symmetric ranges (zero_point==0) + // and power-of-two scales (power-of-two is abbreviated below as POT). + // While more general support would be possible by means of rescaling, + // that would add some overhead and some loss of accuracy and wouldn't + // be used at the moment as current quantized LSTM applications are + // happy with symmetric, power-of-two-scales quantization. So we just + // implement that narrow case only for now. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + bool param_scale_pot = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + param_scale_pot &= + (data->input_left_shift == 0 || data->input_left_shift == 1); + + if (param_scale_pot) { + data->input_multiplier = 0; + } else { + // Calculate multiplier to change input scale to 1/(3*4096) + // as required by the table lookup. + // The number 3.0 in the multiplier comes from here, + // because the interval is [-10.7, 10.7] instead of [-8, 8]. + // So, in this scaling +/-2^17 represents +/-10.7. 
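The multiplier loop in the hunk that follows implements exactly the rescaling this comment block describes. A worked example with illustrative numbers only (an int16 input whose scale is 1/4096, i.e. plain Q3.12, which is not a power-of-two multiple of the required 3/4096 target):

```cpp
#include <cstdio>

int main() {
  // Mirrors the loop below: grow the multiplier into 15-bit headroom.
  double multiplier = (1.0 / 4096.0) * 4096.0 * 3.0;  // == 3.0
  int input_left_shift = 0;
  while (multiplier <= 32767.0 / 2.0 && input_left_shift <= 30) {
    input_left_shift++;
    multiplier *= 2.0;
  }
  // Prints "multiplier=24576 left_shift=13": the kernel would store
  // input_multiplier = 24576 and compensate for the 2^13 via the shift.
  std::printf("multiplier=%.0f left_shift=%d\n", multiplier, input_left_shift);
  return 0;
}
```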
+
+      double multiplier = static_cast<double>(input->params.scale) * 4096.0 * 3.0;
+      data->input_left_shift = 0;
+
+      while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
+        data->input_left_shift++;
+        multiplier = multiplier * 2.0;
+      }
+
+      data->input_multiplier = static_cast<int32_t>(multiplier);
+    }
+
+    int output_scale_log2_rounded;
+    TF_LITE_ENSURE(
+        context, CheckedLog2(output->params.scale, &output_scale_log2_rounded));
+    TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded,
+                      -kOutputFractionalBits);
+  }
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
 
@@ -77,10 +136,15 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = static_cast<OpData*>(node->user_data);
 
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  MicroContext* micro_context = GetMicroContext(context);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
   TF_LITE_ENSURE(context, input != nullptr);
   data->input_zero_point = input->params.zero_point;
-  return CalculateArithmeticOpData(context, node, data);
+  TF_LITE_ENSURE_OK(context, CalculateArithmeticOpData(context, node, data));
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -131,14 +195,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace activations
 
 TfLiteRegistration Register_TANH() {
-  return {/*init=*/activations::TanhInit,
-          /*free=*/nullptr,
-          /*prepare=*/activations::TanhPrepare,
-          /*invoke=*/activations::TanhEval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(
+      activations::TanhInit, activations::TanhPrepare, activations::TanhEval);
 }
 
 }  // namespace micro
 }  // namespace ops
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc
index bf43a073..9f77e04d 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc
@@ -18,18 +18,30 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" namespace tflite { namespace { +constexpr int kInputTensor = 0; +constexpr int kPermTensor = 1; +constexpr int kOutputTensor = 0; + struct TransposeContext { TransposeContext(TfLiteContext* context, TfLiteNode* node) { - input = GetInput(context, node, 0); - perm = GetInput(context, node, 1); - output = GetOutput(context, node, 0); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, kInputTensor); + perm = micro_context->AllocateTempInputTensor(node, kPermTensor); + output = micro_context->AllocateTempOutputTensor(node, kOutputTensor); } - const TfLiteTensor* input; - const TfLiteTensor* perm; + ~TransposeContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(perm); + micro_context->DeallocateTempTfLiteTensor(output); + } + MicroContext* micro_context; + TfLiteTensor* input; + TfLiteTensor* perm; TfLiteTensor* output; }; @@ -60,10 +72,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - TransposeContext op_context(context, node); - - const int32_t* perm_data = GetTensorData(op_context.perm); - const int size = op_context.perm->dims->data[0]; + const TfLiteEvalTensor* perm_tensor = + tflite::micro::GetEvalInput(context, node, kPermTensor); + const int32_t* perm_data = perm_tensor->data.i32; + const int size = perm_tensor->dims->data[0]; TransposeParams params; params.perm_count = size; for (int i = 0; i < size; ++i) { @@ -73,24 +85,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Transpose kernel only does rearranging values not numeric evaluations // on each cell. It's safe to implement per size of scalar type and this // trick keeps the total code size in a reasonable range. - switch (op_context.input->type) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kOutputTensor); + switch (input->type) { case kTfLiteFloat32: - reference_ops::Transpose(params, GetTensorShape(op_context.input), - GetTensorData(op_context.input), - GetTensorShape(op_context.output), - GetTensorData(op_context.output)); + reference_ops::Transpose(params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); break; case kTfLiteInt8: - reference_ops::Transpose(params, GetTensorShape(op_context.input), - GetTensorData(op_context.input), - GetTensorShape(op_context.output), - GetTensorData(op_context.output)); + reference_ops::Transpose(params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); break; default: TF_LITE_KERNEL_LOG(context, "Type %s is currently not supported by Transpose. 
" "Only float32 and int8 is supported", - TfLiteTypeGetName(op_context.input->type)); + TfLiteTypeGetName(input->type)); return kTfLiteError; } @@ -100,13 +116,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_TRANSPOSE() { - return {/*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc index cbd964a0..0b2afd5b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc @@ -94,13 +94,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. if (data_type != kTfLiteFloat32) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); int output_channels = filter->dims->data[kConvQuantizedDimension]; @@ -124,6 +129,13 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, &(data->bias_converted_buffer_index)) == kTfLiteOk); } } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(output); + if (bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(bias); + } } return kTfLiteOk; } @@ -141,11 +153,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const auto params = static_cast(node->builtin_data); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); // Get height and width of the output. 
@@ -212,6 +229,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Stride data->params.stride_width = params->stride_width; data->params.stride_height = params->stride_height; + + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); return kTfLiteOk; } @@ -245,7 +266,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); @@ -261,7 +282,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); @@ -272,7 +293,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { context->GetScratchBuffer(context, data.scratch_buffer_index)); // TODO(b/192090531): Remove this once all 8x16 transpose conv models use // 64-bit biases. - if (bias->type == kTfLiteInt16) { + if (bias != nullptr && bias->type == kTfLiteInt16) { std::int64_t* bias_converted_buffer = static_cast(context->GetScratchBuffer( context, data.bias_converted_buffer_index)); @@ -298,7 +319,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(filter), tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); @@ -316,14 +337,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace TfLiteRegistration Register_TRANSPOSE_CONV() { - return {/*init=*/Init, - /*free=*/nullptr, - /*prepare=*/Prepare, - /*invoke=*/Eval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; + return tflite::micro::RegisterOp(Init, Prepare, Eval); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cc new file mode 100644 index 00000000..7f3c50e4 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm.cc @@ -0,0 +1,1696 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include 
+#include 
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/lstm_eval.h"
+#include "tensorflow/lite/micro/kernels/lstm_shared.h"
+#include "tensorflow/lite/micro/kernels/micro_tensor_utils.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+
+namespace {
+
+constexpr int scratch_index_size = 12;
+
+struct UnidirectionalSequenceLstmOpData {
+  // If the lstm is layer norm.
+  bool use_layer_norm;
+  // The scratch index.
+  int scratch_index[scratch_index_size];
+
+  int32_t row_sums_size;
+  int32_t* row_sums;
+  bool compute_row_sums = false;
+
+  int32_t input_zero_point;
+  int32_t output_state_zero_point;
+
+  IntegerLstmParameter integer_lstm_param;
+  HybridLstmScales hybrid_lstm_scales;
+};
+
+TfLiteStatus PopulateQuantizedLstmParams8x8_16(
+    TfLiteContext* context, TfLiteNode* node,
+    IntegerLstmParameter* integer_lstm_param) {
+  MicroContext* micro_context = GetMicroContext(context);
+
+  // Calculate quantized clip for projection and cell.
+  const auto* params =
+      static_cast<TfLiteUnidirectionalSequenceLSTMParams*>(node->builtin_data);
+  const float cell_clip = params->cell_clip;
+  const float proj_clip = params->proj_clip;
+
+  TfLiteTensor* cell_state =
+      micro_context->AllocateTempInputTensor(node, kLstmCellStateTensor);
+  TF_LITE_ENSURE(context, cell_state != nullptr);
+  TF_LITE_ENSURE(context, cell_state->is_variable);
+  TfLiteTensor* output_tensor =
+      micro_context->AllocateTempOutputTensor(node, kLstmOutputTensor);
+
+  TF_LITE_ENSURE(context,
+                 cell_state->quantization.type != kTfLiteNoQuantization);
+  auto* cell_state_params =
+      static_cast<TfLiteAffineQuantization*>(cell_state->quantization.params);
+  TF_LITE_ENSURE(context,
+                 output_tensor->quantization.type != kTfLiteNoQuantization);
+  auto* proj_params = static_cast<TfLiteAffineQuantization*>(
+      output_tensor->quantization.params);
+  if (cell_clip > 0.0f) {
+    integer_lstm_param->quantized_cell_clip = static_cast<int16_t>(std::min(
+        std::max(cell_clip / cell_state_params->scale->data[0], -32768.0f),
+        32767.0f));
+  } else {
+    integer_lstm_param->quantized_cell_clip = 0;
+  }
+  if (proj_clip > 0.0f) {
+    integer_lstm_param->quantized_proj_clip = static_cast<int8_t>(std::min(
+        std::max(proj_clip / proj_params->scale->data[0], -128.0f), 127.0f));
+  } else {
+    integer_lstm_param->quantized_proj_clip = 0;
+  }
+
+  // Calculate effective scales.
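"Effective scales" collapse each quantized matmul path into a single factor: the weight scale times the scale of the activation feeding the matmul, divided by the scale of the intermediate the result lands in. For instance, the code that follows computes, in effect:

```cpp
// One gate path (taken verbatim from the hunks further below):
// input -> forget gate.
float effective_input_to_forget_scale =
    input_to_forget_weight_scale * input_scale / intermediate_scale[1];
```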
+ UnidirectionalSequenceLstmOpData* op_data = + static_cast(node->user_data); + const bool use_layer_norm = op_data->use_layer_norm; + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kLstmInputTensor); + + TfLiteTensor* input_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToInputWeightsTensor); + TfLiteTensor* input_to_forget_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToForgetWeightsTensor); + TfLiteTensor* input_to_cell_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToCellWeightsTensor); + TfLiteTensor* input_to_output_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToOutputWeightsTensor); + + TfLiteTensor* recurrent_to_input_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToInputWeightsTensor); + TfLiteTensor* recurrent_to_forget_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToForgetWeightsTensor); + TfLiteTensor* recurrent_to_cell_weights = + micro_context->AllocateTempInputTensor(node, + kLstmRecurrentToCellWeightsTensor); + TfLiteTensor* recurrent_to_output_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToOutputWeightsTensor); + + TfLiteTensor* cell_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToInputWeightsTensor); + TfLiteTensor* cell_to_forget_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToForgetWeightsTensor); + TfLiteTensor* cell_to_output_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToOutputWeightsTensor); + + TfLiteTensor* input_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmInputLayerNormCoefficientsTensor); + TfLiteTensor* forget_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmForgetLayerNormCoefficientsTensor); + TfLiteTensor* cell_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmCellLayerNormCoefficientsTensor); + TfLiteTensor* output_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmOutputLayerNormCoefficientsTensor); + + TfLiteTensor* projection_weights = micro_context->AllocateTempInputTensor( + node, kLstmProjectionWeightsTensor); + + TfLiteTensor* output_state = + micro_context->AllocateTempInputTensor(node, kLstmOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TF_LITE_ENSURE(context, output_state->is_variable); + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + const bool use_projection = (projection_weights != nullptr); + + // Get intermediate scales and zero points. + float intermediate_scale[5]; + int32_t intermediate_zp[5]; + for (int i = 0; i < 4; ++i) { + if (use_layer_norm) { + TfLiteTensor* intermediate = + micro_context->AllocateTempIntermediateTensor(node, i); + TF_LITE_ENSURE(context, + intermediate->quantization.type != kTfLiteNoQuantization); + auto* params_intermediate = static_cast( + intermediate->quantization.params); + intermediate_scale[i] = params_intermediate->scale->data[0]; + intermediate_zp[i] = params_intermediate->zero_point->data[0]; + if (intermediate != nullptr) { + micro_context->DeallocateTempTfLiteTensor(intermediate); + } + } else { + // Q3.12 for activation functions. 
+      intermediate_scale[i] = std::pow(2.0f, -12.0f);
+      intermediate_zp[i] = 0;
+    }
+  }
+  // In the absence of projection, hidden becomes output and this intermediate
+  // is ignored.
+  TfLiteTensor* hidden = micro_context->AllocateTempIntermediateTensor(node, 4);
+  TF_LITE_ENSURE(context, hidden->quantization.type != kTfLiteNoQuantization);
+  auto* hidden_params =
+      static_cast<TfLiteAffineQuantization*>(hidden->quantization.params);
+  intermediate_scale[4] = hidden_params->scale->data[0];
+  intermediate_zp[4] = hidden_params->zero_point->data[0];
+  if (hidden != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(hidden);
+  }
+
+  // Scales.
+  const float default_scale = 1.0;
+  float input_scale = default_scale;
+  float input_to_input_weight_scale = default_scale;
+  float recurrent_to_input_weight_scale = default_scale;
+  float cell_to_input_weight_scale = default_scale;
+  float input_to_forget_weight_scale = default_scale;
+  float recurrent_to_forget_weight_scale = default_scale;
+  float cell_to_forget_weight_scale = default_scale;
+  float input_to_cell_weight_scale = default_scale;
+  float recurrent_to_cell_weight_scale = default_scale;
+  float input_to_output_weight_scale = default_scale;
+  float recurrent_to_output_weight_scale = default_scale;
+  float cell_to_output_weight_scale = default_scale;
+  float projection_weight_scale = default_scale;
+  float layer_norm_input_scale = default_scale;
+  float layer_norm_forget_scale = default_scale;
+  float layer_norm_cell_scale = default_scale;
+  float layer_norm_output_scale = default_scale;
+  float output_state_scale = default_scale;
+  int cell_scale = 1;
+
+  // Effective scales.
+  float effective_input_to_input_scale = default_scale;
+  float effective_recurrent_to_input_scale = default_scale;
+  float effective_cell_to_input_scale = default_scale;
+  float effective_input_to_forget_scale = default_scale;
+  float effective_recurrent_to_forget_scale = default_scale;
+  float effective_cell_to_forget_scale = default_scale;
+  float effective_input_to_cell_scale = default_scale;
+  float effective_recurrent_to_cell_scale = default_scale;
+  float effective_input_to_output_scale = default_scale;
+  float effective_recurrent_to_output_scale = default_scale;
+  float effective_cell_to_output_scale = default_scale;
+  float effective_proj_scale = default_scale;
+  float effective_hidden_scale = default_scale;
+
+  // Populate scales.
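+  // Each scale below is read straight from the corresponding tensor's
+  // quantization parameters (TfLiteTensor::params.scale), which the TFLite
+  // converter records per tensor. Illustrative example (values assumed): a
+  // symmetric int8 weight tensor spanning roughly [-1.27, 1.27] would carry
+  // a scale of about 0.01, and the reads below return it unchanged.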
+  if (!use_cifg) {
+    input_to_input_weight_scale = input_to_input_weights->params.scale;
+    recurrent_to_input_weight_scale = recurrent_to_input_weights->params.scale;
+  }
+
+  if (use_peephole) {
+    if (!use_cifg) {
+      cell_to_input_weight_scale = cell_to_input_weights->params.scale;
+    }
+    cell_to_forget_weight_scale = cell_to_forget_weights->params.scale;
+    cell_to_output_weight_scale = cell_to_output_weights->params.scale;
+  }
+
+  if (use_layer_norm) {
+    if (!use_cifg) {
+      layer_norm_input_scale = input_layer_norm_coefficients->params.scale;
+    }
+    layer_norm_forget_scale = forget_layer_norm_coefficients->params.scale;
+    layer_norm_cell_scale = cell_layer_norm_coefficients->params.scale;
+    layer_norm_output_scale = output_layer_norm_coefficients->params.scale;
+  }
+
+  if (use_projection) {
+    projection_weight_scale = projection_weights->params.scale;
+  }
+  output_state_scale = output_state->params.scale;
+
+  input_to_forget_weight_scale = input_to_forget_weights->params.scale;
+  input_to_cell_weight_scale = input_to_cell_weights->params.scale;
+  input_to_output_weight_scale = input_to_output_weights->params.scale;
+  recurrent_to_forget_weight_scale = recurrent_to_forget_weights->params.scale;
+  recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale;
+  recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale;
+
+  // Check cell state (already used above).
+  TF_LITE_ENSURE(context, CheckedLog2(cell_state->params.scale, &cell_scale));
+  // TF_LITE_ENSURE(context, cell_scale <= -9);
+  integer_lstm_param->cell_scale = cell_scale;
+  input_scale = input->params.scale;
+
+  // Calculate effective scales.
+  if (!use_cifg) {
+    effective_input_to_input_scale =
+        input_to_input_weight_scale * input_scale / intermediate_scale[0];
+    effective_recurrent_to_input_scale = recurrent_to_input_weight_scale *
+                                         output_state_scale /
+                                         intermediate_scale[0];
+  }
+  effective_input_to_forget_scale =
+      input_to_forget_weight_scale * input_scale / intermediate_scale[1];
+  effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[1];
+
+  effective_input_to_cell_scale =
+      input_to_cell_weight_scale * input_scale / intermediate_scale[2];
+  effective_recurrent_to_cell_scale = recurrent_to_cell_weight_scale *
+                                      output_state_scale /
+                                      intermediate_scale[2];
+
+  effective_input_to_output_scale =
+      input_to_output_weight_scale * input_scale / intermediate_scale[3];
+  effective_recurrent_to_output_scale = recurrent_to_output_weight_scale *
+                                        output_state_scale /
+                                        intermediate_scale[3];
+
+  effective_hidden_scale =
+      std::pow(2.0f, -15.0f) / intermediate_scale[4] * std::pow(2.0f, -15.0f);
+
+  effective_proj_scale =
+      projection_weight_scale * intermediate_scale[4] / output_state_scale;
+
+  if (use_peephole) {
+    if (!use_cifg) {
+      effective_cell_to_input_scale =
+          std::pow(2.0f, static_cast<float>(cell_scale)) *
+          cell_to_input_weight_scale / intermediate_scale[0];
+    }
+    effective_cell_to_forget_scale =
+        std::pow(2.0f, static_cast<float>(cell_scale)) *
+        cell_to_forget_weight_scale / intermediate_scale[1];
+    effective_cell_to_output_scale =
+        std::pow(2.0f, static_cast<float>(cell_scale)) *
+        cell_to_output_weight_scale / intermediate_scale[3];
+  }
+
+  // Decompose scales.
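+  // QuantizeMultiplier() (declared in quantization_util.h) expresses a float
+  // multiplier M as a 32-bit integer multiplier q plus a power-of-two shift s
+  // such that M ~= (q / 2^31) * 2^s. Illustrative example (values assumed):
+  // M = 0.164 normalizes to 0.656 * 2^-2, so q = round(0.656 * 2^31) and
+  // s = -2. The *_a fields below receive q and the *_b fields receive s.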
+  int shift_output;
+  QuantizeMultiplier(static_cast<double>(effective_input_to_input_scale),
+                     &integer_lstm_param->effective_input_to_input_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_input_to_input_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_recurrent_to_input_scale),
+                     &integer_lstm_param->effective_recurrent_to_input_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_recurrent_to_input_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_cell_to_input_scale),
+                     &integer_lstm_param->effective_cell_to_input_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_cell_to_input_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_input_to_forget_scale),
+                     &integer_lstm_param->effective_input_to_forget_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_input_to_forget_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_recurrent_to_forget_scale),
+                     &integer_lstm_param->effective_recurrent_to_forget_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_recurrent_to_forget_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_cell_to_forget_scale),
+                     &integer_lstm_param->effective_cell_to_forget_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_cell_to_forget_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_input_to_cell_scale),
+                     &integer_lstm_param->effective_input_to_cell_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_input_to_cell_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_recurrent_to_cell_scale),
+                     &integer_lstm_param->effective_recurrent_to_cell_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_recurrent_to_cell_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_input_to_output_scale),
+                     &integer_lstm_param->effective_input_to_output_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_input_to_output_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_recurrent_to_output_scale),
+                     &integer_lstm_param->effective_recurrent_to_output_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_recurrent_to_output_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_cell_to_output_scale),
+                     &integer_lstm_param->effective_cell_to_output_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_cell_to_output_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_proj_scale),
+                     &integer_lstm_param->effective_proj_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_proj_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(effective_hidden_scale),
+                     &integer_lstm_param->effective_hidden_scale_a,
+                     &shift_output);
+  integer_lstm_param->effective_hidden_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(layer_norm_input_scale),
+                     &integer_lstm_param->layer_norm_input_scale_a,
+                     &shift_output);
+  integer_lstm_param->layer_norm_input_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(layer_norm_forget_scale),
+                     &integer_lstm_param->layer_norm_forget_scale_a,
+                     &shift_output);
+  integer_lstm_param->layer_norm_forget_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(layer_norm_cell_scale),
+                     &integer_lstm_param->layer_norm_cell_scale_a,
+                     &shift_output);
+  integer_lstm_param->layer_norm_cell_scale_b =
+      static_cast<int32_t>(shift_output);
+  QuantizeMultiplier(static_cast<double>(layer_norm_output_scale),
+                     &integer_lstm_param->layer_norm_output_scale_a,
+                     &shift_output);
+  integer_lstm_param->layer_norm_output_scale_b =
+      static_cast<int32_t>(shift_output);
+
+  integer_lstm_param->hidden_zp = intermediate_zp[4];
+
+  // 10000 is used to make sure the kernel logic does not overflow.
+  if (!use_cifg) {
+    integer_lstm_param->input_variance_guard =
+        std::max(1, static_cast<int32_t>(10000 * layer_norm_input_scale));
+  }
+  integer_lstm_param->forget_variance_guard =
+      std::max(1, static_cast<int32_t>(10000 * layer_norm_forget_scale));
+  integer_lstm_param->cell_variance_guard =
+      std::max(1, static_cast<int32_t>(10000 * layer_norm_cell_scale));
+  integer_lstm_param->output_variance_guard =
+      std::max(1, static_cast<int32_t>(10000 * layer_norm_output_scale));
+
+  if (cell_state != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(cell_state);
+  }
+  if (output_tensor != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(output_tensor);
+  }
+  if (input != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input);
+  }
+  if (input_to_input_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input_to_input_weights);
+  }
+  if (input_to_forget_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input_to_forget_weights);
+  }
+  if (input_to_cell_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input_to_cell_weights);
+  }
+  if (input_to_output_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input_to_output_weights);
+  }
+  if (recurrent_to_input_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(recurrent_to_input_weights);
+  }
+  if (recurrent_to_forget_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(recurrent_to_forget_weights);
+  }
+  if (recurrent_to_cell_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(recurrent_to_cell_weights);
+  }
+  if (recurrent_to_output_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(recurrent_to_output_weights);
+  }
+  if (cell_to_input_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(cell_to_input_weights);
+  }
+  if (cell_to_forget_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(cell_to_forget_weights);
+  }
+  if (cell_to_output_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(cell_to_output_weights);
+  }
+  if (input_layer_norm_coefficients != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(input_layer_norm_coefficients);
+  }
+  if (forget_layer_norm_coefficients != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(forget_layer_norm_coefficients);
+  }
+  if (cell_layer_norm_coefficients != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(cell_layer_norm_coefficients);
+  }
+  if (output_layer_norm_coefficients != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(output_layer_norm_coefficients);
+  }
+  if (projection_weights != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(projection_weights);
+  }
+  if (output_state != nullptr) {
+    micro_context->DeallocateTempTfLiteTensor(output_state);
+  }
+
+  return kTfLiteOk;
+}
+
+// Temporary buffers used for hybrid mode.
+enum HybridTempBuffer {
+  kPrimaryScratchBuffer = 0,
+  kInputQuantized = 1,
+  kOutputStateQuantized = 2,
+  kCellStateQuantized = 3,
+  kInputScalingFactors = 4,
+  kOutputStateScalingFactors = 5,
+  kProductScalingFactors = 6,
+  kRecoveredCellWeights = 7,
+  kAccumScratch = 8,
+  kInputZeroPoints = 9,
+  kOutputStateZeroPoints = 10,
+  kScales = 11,
+  kNumHybridTempBuffers = 12,
+};
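+
+// A note on hybrid mode: the weight tensors stay int8 (or uint8) while the
+// activations remain float, so every invocation quantizes activations on the
+// fly. The enum above names the per-invocation scratch areas that make this
+// possible without heap allocation; each index is resolved to an arena offset
+// via RequestScratchBufferInArena() during Prepare and mapped back to a
+// pointer with GetScratchBuffer() during Eval.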
+
+void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer,
+                                     size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(
+      context, sizeof(UnidirectionalSequenceLstmOpData));
+}
+
+// Populate the per-tensor weight scales consumed by the hybrid kernel.
+TfLiteStatus SetHybridScales(TfLiteContext* context, TfLiteNode* node) {
+  UnidirectionalSequenceLstmOpData* op_data =
+      reinterpret_cast<UnidirectionalSequenceLstmOpData*>(node->user_data);
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TfLiteTensor* input_to_input_weights = micro_context->AllocateTempInputTensor(
+      node, kLstmInputToInputWeightsTensor);
+  op_data->hybrid_lstm_scales.input_to_input_weights_scale =
+      (input_to_input_weights != nullptr) ? input_to_input_weights->params.scale
+                                          : 1.0f;
+
+  TfLiteTensor* input_to_forget_weights =
+      micro_context->AllocateTempInputTensor(node,
+                                             kLstmInputToForgetWeightsTensor);
+  op_data->hybrid_lstm_scales.input_to_forget_weights_scale =
+      (input_to_forget_weights != nullptr)
+          ? input_to_forget_weights->params.scale
+          : 1.0f;
+
+  TfLiteTensor* input_to_cell_weights = micro_context->AllocateTempInputTensor(
+      node, kLstmInputToCellWeightsTensor);
+  op_data->hybrid_lstm_scales.input_to_cell_weights_scale =
+      (input_to_cell_weights != nullptr) ? input_to_cell_weights->params.scale
+                                         : 1.0f;
+
+  TfLiteTensor* input_to_output_weights =
+      micro_context->AllocateTempInputTensor(node,
+                                             kLstmInputToOutputWeightsTensor);
+  op_data->hybrid_lstm_scales.input_to_output_weights_scale =
+      (input_to_output_weights != nullptr)
+          ? input_to_output_weights->params.scale
+          : 1.0f;
+
+  op_data->hybrid_lstm_scales.aux_input_to_input_weights_scale = 1.0f;
+  op_data->hybrid_lstm_scales.aux_input_to_forget_weights_scale = 1.0f;
+  op_data->hybrid_lstm_scales.aux_input_to_cell_weights_scale = 1.0f;
+  op_data->hybrid_lstm_scales.aux_input_to_output_weights_scale = 1.0f;
+
+  TfLiteTensor* recurrent_to_input_weights =
+      micro_context->AllocateTempInputTensor(
+          node, kLstmRecurrentToInputWeightsTensor);
+  op_data->hybrid_lstm_scales.recurrent_to_input_weights_scale =
+      (recurrent_to_input_weights != nullptr)
+          ? recurrent_to_input_weights->params.scale
+          : 1.0f;
+
+  TfLiteTensor* recurrent_to_forget_weights =
+      micro_context->AllocateTempInputTensor(
+          node, kLstmRecurrentToForgetWeightsTensor);
+  op_data->hybrid_lstm_scales.recurrent_to_forget_weights_scale =
+      (recurrent_to_forget_weights != nullptr)
+          ? recurrent_to_forget_weights->params.scale
+          : 1.0f;
+
+  TfLiteTensor* recurrent_to_cell_weights =
+      micro_context->AllocateTempInputTensor(node,
+                                             kLstmRecurrentToCellWeightsTensor);
+  op_data->hybrid_lstm_scales.recurrent_to_cell_weights_scale =
+      (recurrent_to_cell_weights != nullptr)
+          ? recurrent_to_cell_weights->params.scale
+          : 1.0f;
+
+  TfLiteTensor* recurrent_to_output_weights =
+      micro_context->AllocateTempInputTensor(
+          node, kLstmRecurrentToOutputWeightsTensor);
+  op_data->hybrid_lstm_scales.recurrent_to_output_weights_scale =
+      (recurrent_to_output_weights != nullptr)
+          ? recurrent_to_output_weights->params.scale
+          : 1.0f;
+
+  TfLiteTensor* cell_to_input_weights = micro_context->AllocateTempInputTensor(
+      node, kLstmCellToInputWeightsTensor);
+  op_data->hybrid_lstm_scales.cell_to_input_weights_scale =
+      (cell_to_input_weights != nullptr) ?
cell_to_input_weights->params.scale + : 1.0f; + + TfLiteTensor* cell_to_forget_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToForgetWeightsTensor); + op_data->hybrid_lstm_scales.cell_to_forget_weights_scale = + (cell_to_forget_weights != nullptr) ? cell_to_forget_weights->params.scale + : 1.0f; + + TfLiteTensor* cell_to_output_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToOutputWeightsTensor); + op_data->hybrid_lstm_scales.cell_to_output_weights_scale = + (cell_to_output_weights != nullptr) ? cell_to_output_weights->params.scale + : 1.0f; + + TfLiteTensor* projection_weights = micro_context->AllocateTempInputTensor( + node, kLstmProjectionWeightsTensor); + op_data->hybrid_lstm_scales.projection_weights_scale = + (projection_weights != nullptr) ? projection_weights->params.scale : 1.0f; + + if (input_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_input_weights); + } + + if (input_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_forget_weights); + } + + if (input_to_cell_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_cell_weights); + } + + if (input_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_output_weights); + } + + if (recurrent_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_input_weights); + } + + if (recurrent_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_forget_weights); + } + + if (recurrent_to_cell_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_cell_weights); + } + + if (recurrent_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_output_weights); + } + + if (cell_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_input_weights); + } + + if (cell_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_forget_weights); + } + + if (cell_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_output_weights); + } + + if (projection_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_weights); + } + + return kTfLiteOk; +} + +// Check that input tensor dimensions matches with each other. +TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, + TfLiteNode* node, int n_input, + int n_output, int n_cell, + bool use_layer_norm, bool is_integer) { + MicroContext* micro_context = GetMicroContext(context); + + const auto* params = reinterpret_cast(node->builtin_data); + + // Making sure clipping parameters have valid values. 
+ // == 0 means no clipping + // > 0 means clipping + TF_LITE_ENSURE(context, params->cell_clip >= 0); + TF_LITE_ENSURE(context, params->proj_clip >= 0); + + TfLiteTensor* input_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToInputWeightsTensor); + if (input_to_input_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input); + } + + TfLiteTensor* input_to_forget_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToForgetWeightsTensor); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input); + + TfLiteTensor* input_to_cell_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToCellWeightsTensor); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input); + + TfLiteTensor* recurrent_to_input_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToInputWeightsTensor); + if (recurrent_to_input_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0], + n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1], + n_output); + } + + TfLiteTensor* recurrent_to_forget_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToForgetWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0], + n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1], + n_output); + + TfLiteTensor* recurrent_to_cell_weights = + micro_context->AllocateTempInputTensor(node, + kLstmRecurrentToCellWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1], + n_output); + + // We make sure the input-gate's parameters are either both present (regular + // LSTM) or not at all (CIFG-LSTM). + const bool cifg_weights_all_or_none = + ((input_to_input_weights != nullptr) && + (recurrent_to_input_weights != nullptr)) || + ((input_to_input_weights == nullptr) && + (recurrent_to_input_weights == nullptr)); + TF_LITE_ENSURE(context, cifg_weights_all_or_none == true); + + TfLiteTensor* cell_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToInputWeightsTensor); + if (cell_to_input_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_TYPES_EQ( + context, cell_to_input_weights->type, + is_integer ? 
kTfLiteInt16 : input_to_forget_weights->type); + } + + TfLiteTensor* cell_to_forget_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToForgetWeightsTensor); + if (cell_to_forget_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_TYPES_EQ( + context, cell_to_forget_weights->type, + is_integer ? kTfLiteInt16 : input_to_forget_weights->type); + } + + TfLiteTensor* cell_to_output_weights = micro_context->AllocateTempInputTensor( + node, kLstmCellToOutputWeightsTensor); + if (cell_to_output_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_TYPES_EQ( + context, cell_to_output_weights->type, + is_integer ? kTfLiteInt16 : input_to_forget_weights->type); + } + + // Making sure the peephole weights are there all or none. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool peephole_weights_all_or_none = + ((cell_to_input_weights != nullptr || use_cifg) && + (cell_to_forget_weights != nullptr) && + (cell_to_output_weights != nullptr)) || + ((cell_to_input_weights == nullptr) && + (cell_to_forget_weights == nullptr) && + (cell_to_output_weights == nullptr)); + TF_LITE_ENSURE(context, peephole_weights_all_or_none == true); + + // Make sure the input gate bias is present only when not a CIFG-LSTM. + TfLiteTensor* input_gate_bias = + micro_context->AllocateTempInputTensor(node, kLstmInputGateBiasTensor); + if (use_cifg) { + TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr); + } else { + TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, input_gate_bias->type, kTfLiteInt32); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, input_gate_bias->type, kTfLiteFloat32); + } + } + + TfLiteTensor* forget_gate_bias = + micro_context->AllocateTempInputTensor(node, kLstmForgetGateBiasTensor); + TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteInt32); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, forget_gate_bias->type, kTfLiteFloat32); + } + + TfLiteTensor* cell_gate_bias = + micro_context->AllocateTempInputTensor(node, kLstmCellGateBiasTensor); + TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_gate_bias->dims->data[0], n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, cell_gate_bias->type, kTfLiteInt32); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, cell_gate_bias->type, kTfLiteFloat32); + } + + TfLiteTensor* output_gate_bias = + micro_context->AllocateTempInputTensor(node, kLstmOutputGateBiasTensor); + TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, output_gate_bias->type, kTfLiteInt32); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, output_gate_bias->type, kTfLiteFloat32); + } + + TfLiteTensor* projection_weights = micro_context->AllocateTempInputTensor( + node, kLstmProjectionWeightsTensor); + if (projection_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, 
projection_weights->dims->data[0], n_output); + TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell); + } + + TfLiteTensor* projection_bias = + micro_context->AllocateTempInputTensor(node, kLstmProjectionBiasTensor); + if (projection_bias != nullptr) { + TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, projection_bias->type, kTfLiteInt32); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, projection_bias->type, kTfLiteFloat32); + } + } + + // Making sure the projection tensors are consistent: + // 1) If projection weight is not present, then projection bias should not be + // present. + // 2) If projection weight is present, then projection bias is optional. + const bool projecton_tensors_consistent = + ((projection_weights != nullptr) || (projection_bias == nullptr)); + TF_LITE_ENSURE(context, projecton_tensors_consistent == true); + + if (use_layer_norm) { + TfLiteTensor* input_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmInputLayerNormCoefficientsTensor); + if (use_cifg) { + TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients, nullptr); + } else { + TF_LITE_ENSURE(context, input_layer_norm_coefficients != nullptr); + TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->size, 1); + TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->data[0], + n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, input_layer_norm_coefficients->type, + kTfLiteInt16); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, input_layer_norm_coefficients->type, + kTfLiteFloat32); + } + } + + TfLiteTensor* forget_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmForgetLayerNormCoefficientsTensor); + TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->size, 1); + TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0], + n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, forget_layer_norm_coefficients->type, + kTfLiteInt16); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, forget_layer_norm_coefficients->type, + kTfLiteFloat32); + } + + TfLiteTensor* cell_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmCellLayerNormCoefficientsTensor); + TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0], + n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, cell_layer_norm_coefficients->type, + kTfLiteInt16); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, cell_layer_norm_coefficients->type, + kTfLiteFloat32); + } + + TfLiteTensor* output_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmOutputLayerNormCoefficientsTensor); + TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->size, 1); + TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0], + n_cell); + if (is_integer) { + TF_LITE_ENSURE_TYPES_EQ(context, output_layer_norm_coefficients->type, + kTfLiteInt16); + } else { + TF_LITE_ENSURE_TYPES_EQ(context, output_layer_norm_coefficients->type, + kTfLiteFloat32); + } + if (input_layer_norm_coefficients != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_layer_norm_coefficients); + } + if (forget_layer_norm_coefficients != nullptr) { + micro_context->DeallocateTempTfLiteTensor(forget_layer_norm_coefficients); + } + if 
(cell_layer_norm_coefficients != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_layer_norm_coefficients); + } + if (output_layer_norm_coefficients != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output_layer_norm_coefficients); + } + } + + if (input_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_input_weights); + } + if (input_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_forget_weights); + } + if (input_to_cell_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_cell_weights); + } + if (recurrent_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_input_weights); + } + if (recurrent_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_forget_weights); + } + micro_context->DeallocateTempTfLiteTensor(recurrent_to_cell_weights); + if (cell_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_input_weights); + } + if (cell_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_forget_weights); + } + if (cell_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_to_output_weights); + } + if (input_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_gate_bias); + } + if (forget_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(forget_gate_bias); + } + if (cell_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_gate_bias); + } + if (output_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output_gate_bias); + } + if (projection_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_weights); + } + if (projection_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_bias); + } + + return kTfLiteOk; +} + +TfLiteStatus PrecomputeZeroPointTimesWeightWithBias( + TfLiteContext* context, int32_t zero_point, + const TfLiteTensor* weight_tensor, const TfLiteTensor* bias_tensor, + int32_t** output) { + if (weight_tensor == nullptr) { + return kTfLiteOk; + } + + const RuntimeShape& weight_shape = GetTensorShape(weight_tensor); + TF_LITE_ENSURE_EQ(context, weight_shape.DimensionsCount(), 2); + const int row = weight_shape.Dims(0); + const int col = weight_shape.Dims(1); + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + *output = static_cast( + context->AllocatePersistentBuffer(context, row * sizeof(int32_t))); + + if (bias_tensor == nullptr) { + memset(*output, 0, row * sizeof(int32_t)); + } else { + const int32_t* bias = GetTensorData(bias_tensor); + memcpy(*output, bias, row * sizeof(int32_t)); + } + if (zero_point != 0) { + const int8_t* weight = GetTensorData(weight_tensor); + micro_tensor_utils::MatrixScalarMultiplyAccumulate(weight, zero_point, row, + col, *output); + } + return kTfLiteOk; +} + +TfLiteStatus PopulatePrecomputedZPTimesWeightsWithBias( + TfLiteContext* context, UnidirectionalSequenceLstmOpData* op_data, + TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kLstmInputTensor); + TfLiteTensor* output_state = + micro_context->AllocateTempInputTensor(node, kLstmOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TF_LITE_ENSURE(context, output_state->is_variable); + + const int32_t input_zero_point = -input->params.zero_point; + const int32_t 
output_state_zero_point = -output_state->params.zero_point; + + TfLiteTensor* input_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToInputWeightsTensor); + TfLiteTensor* input_to_forget_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToForgetWeightsTensor); + TfLiteTensor* input_to_cell_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToCellWeightsTensor); + TfLiteTensor* input_to_output_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToOutputWeightsTensor); + + TfLiteTensor* recurrent_to_input_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToInputWeightsTensor); + TfLiteTensor* recurrent_to_forget_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToForgetWeightsTensor); + TfLiteTensor* recurrent_to_cell_weights = + micro_context->AllocateTempInputTensor(node, + kLstmRecurrentToCellWeightsTensor); + TfLiteTensor* recurrent_to_output_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToOutputWeightsTensor); + + TfLiteTensor* projection_weights = micro_context->AllocateTempInputTensor( + node, kLstmProjectionWeightsTensor); + TfLiteTensor* projection_bias = + micro_context->AllocateTempInputTensor(node, kLstmProjectionBiasTensor); + + IntegerLstmParameter* integer_lstm_params = &op_data->integer_lstm_param; + + TfLiteTensor* intermediate = + micro_context->AllocateTempIntermediateTensor(node, 4); + TF_LITE_ENSURE(context, + intermediate->quantization.type != kTfLiteNoQuantization); + const auto* params = + static_cast(intermediate->quantization.params); + const int32_t hidden_zp = params->zero_point->data[0]; + + // Get bias and perform zero point calculation. + // When there is layer normalization, the gate bias does not apply to matmul + // directly: + // y = ln(w * x + w * r + w * c) + b. + const bool is_layer_norm = op_data->use_layer_norm; + + // Forget gate. + TfLiteTensor* forget_gate_bias = is_layer_norm + ? nullptr + : micro_context->AllocateTempInputTensor( + node, kLstmForgetGateBiasTensor); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, input_zero_point, input_to_forget_weights, forget_gate_bias, + &(integer_lstm_params->input_to_forget_effective_bias))); + + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, output_state_zero_point, recurrent_to_forget_weights, + nullptr, &(integer_lstm_params->recurrent_to_forget_effective_bias))); + + // Modulation gate. + TfLiteTensor* cell_gate_bias = is_layer_norm + ? nullptr + : micro_context->AllocateTempInputTensor( + node, kLstmCellGateBiasTensor); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, input_zero_point, input_to_cell_weights, cell_gate_bias, + &(integer_lstm_params->input_to_cell_effective_bias))); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, output_state_zero_point, recurrent_to_cell_weights, nullptr, + &(integer_lstm_params->recurrent_to_cell_effective_bias))); + + // Output gate. + TfLiteTensor* output_gate_bias = is_layer_norm + ? 
nullptr + : micro_context->AllocateTempInputTensor( + node, kLstmOutputGateBiasTensor); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, input_zero_point, input_to_output_weights, output_gate_bias, + &(integer_lstm_params->input_to_output_effective_bias))); + + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, output_state_zero_point, recurrent_to_output_weights, + nullptr, &(integer_lstm_params->recurrent_to_output_effective_bias))); + + // Input gate. The calculation is only meaningful for non-cifg case. + TfLiteTensor* input_gate_bias = is_layer_norm + ? nullptr + : micro_context->AllocateTempInputTensor( + node, kLstmInputGateBiasTensor); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, input_zero_point, input_to_input_weights, input_gate_bias, + &(integer_lstm_params->input_to_input_effective_bias))); + TF_LITE_ENSURE_OK( + context, + PrecomputeZeroPointTimesWeightWithBias( + context, output_state_zero_point, recurrent_to_input_weights, nullptr, + &(integer_lstm_params->recurrent_to_input_effective_bias))); + + // Projection bias. The calculation is only meaningful for with projection. + TF_LITE_ENSURE_OK(context, + PrecomputeZeroPointTimesWeightWithBias( + context, hidden_zp, projection_weights, projection_bias, + &(integer_lstm_params->projection_effective_bias))); + + if (input != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input); + } + if (output_state != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output_state); + } + if (input_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_input_weights); + } + if (input_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_forget_weights); + } + if (input_to_cell_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_cell_weights); + } + if (input_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_output_weights); + } + if (recurrent_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_input_weights); + } + if (recurrent_to_forget_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_forget_weights); + } + if (recurrent_to_cell_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_cell_weights); + } + if (recurrent_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_output_weights); + } + if (projection_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_weights); + } + if (projection_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_bias); + } + if (forget_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(forget_gate_bias); + } + if (cell_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_gate_bias); + } + if (output_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output_gate_bias); + } + if (input_gate_bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_gate_bias); + } + + if (intermediate != nullptr) { + micro_context->DeallocateTempTfLiteTensor(intermediate); + } + + return kTfLiteOk; +} + +// Resize the output and state tensors based on the sizes of the input tensors. +// Allocate a temporary scratch tensor. Also check that the sizes of the input +// tensors match each other. 
+TfLiteStatus UnidirectionalSequenceLstmPrepare(TfLiteContext* context, + TfLiteNode* node) { + UnidirectionalSequenceLstmOpData* op_data = + reinterpret_cast(node->user_data); + + MicroContext* micro_context = GetMicroContext(context); + + // Check we have all the inputs and outputs we need. + bool use_layer_norm = false; + if (node->inputs->size == 24) { + TfLiteTensor* forget_layer_norm_coefficients = + micro_context->AllocateTempInputTensor( + node, kLstmForgetLayerNormCoefficientsTensor); + if (forget_layer_norm_coefficients == nullptr) { + use_layer_norm = false; + } else { + use_layer_norm = true; + } + if (forget_layer_norm_coefficients != nullptr) { + micro_context->DeallocateTempTfLiteTensor(forget_layer_norm_coefficients); + } + } else if (node->inputs->size == 20) { + // This is deprecated and is only kept here for backward compatibility. + use_layer_norm = false; + } else { + MicroPrintf("The LSTM Full kernel expects 20 or 24 inputs. Got %d inputs", + node->inputs->size); + return kTfLiteError; + } + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + op_data->use_layer_norm = use_layer_norm; + + // Inferring batch size, number of outputs and sequence length and + // number of cells from the input tensors. + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kLstmInputTensor); + op_data->input_zero_point = input->params.zero_point; + const bool is_integer = input->type == kTfLiteInt8; + TF_LITE_ENSURE(context, input->dims->size > 1); + const auto* params = + reinterpret_cast( + node->builtin_data); + const bool time_major = params->time_major; + const int n_batch = time_major ? input->dims->data[1] : input->dims->data[0]; + const int n_input = input->dims->data[2]; + + TfLiteTensor* input_to_output_weights = + micro_context->AllocateTempInputTensor(node, + kLstmInputToOutputWeightsTensor); + const int n_cell = input_to_output_weights->dims->data[0]; + TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input); + + TfLiteTensor* recurrent_to_output_weights = + micro_context->AllocateTempInputTensor( + node, kLstmRecurrentToOutputWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0], + n_cell); + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Check that input tensor dimensions matches with each other. + TF_LITE_ENSURE_OK( + context, CheckInputTensorDimensions(context, node, n_input, n_output, + n_cell, use_layer_norm, is_integer)); + + // Get the pointer to output, output_state and cell_state buffer tensors. + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kLstmOutputTensor); + + TfLiteTensor* output_state = + micro_context->AllocateTempInputTensor(node, kLstmOutputStateTensor); + TF_LITE_ENSURE(context, output_state != nullptr); + TF_LITE_ENSURE(context, output_state->is_variable); + op_data->output_state_zero_point = output_state->params.zero_point; + TfLiteTensor* cell_state = + micro_context->AllocateTempInputTensor(node, kLstmCellStateTensor); + TF_LITE_ENSURE(context, cell_state != nullptr); + TF_LITE_ENSURE(context, cell_state->is_variable); + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. 
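+  // For example (shapes assumed for illustration only): with n_batch = 2 and
+  // n_output = 16, output_state may arrive as [2, 16] or as a flat [32];
+  // either form passes the NumElements() checks below.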
+ TF_LITE_ENSURE_EQ(context, NumElements(output_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); + + // Check the shape of output tensor against that of input tensor + TF_LITE_ENSURE_EQ(context, output->dims->size, 3); + TF_LITE_ENSURE_EQ(context, input->dims->data[0], output->dims->data[0]); + TF_LITE_ENSURE_EQ(context, input->dims->data[1], output->dims->data[1]); + TF_LITE_ENSURE_EQ(context, output->dims->data[2], n_output); + + if (is_integer) { + const int num_intermediate_tensors = node->intermediates->size; + TF_LITE_ENSURE(context, num_intermediate_tensors == 5); + } + + TfLiteTensor* input_to_input_weights = micro_context->AllocateTempInputTensor( + node, kLstmInputToInputWeightsTensor); + + const bool use_cifg = (input_to_input_weights == nullptr); + + // Create a primary scratch buffer for hybrid and float + // If is_integer, primary scratch buffer has a different size + if (!is_integer) { + int scratch_buffer_size[2]; + scratch_buffer_size[0] = n_batch; + + if (use_cifg) { + // Reserving space for Cell, Forget, Output gates + scratch_buffer_size[1] = n_cell * 3; + } else { + // Reserving space for Input, Cell, Forget, Output gates + scratch_buffer_size[1] = n_cell * 4; + } + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, + scratch_buffer_size[0] * scratch_buffer_size[1] * + TfLiteTypeGetSize(input->type), + &(op_data->scratch_index[kPrimaryScratchBuffer]))); + } + + if (IsHybridOp(input, input_to_output_weights)) { + TF_LITE_ENSURE(context, kNumHybridTempBuffers <= scratch_index_size); + + TF_LITE_ENSURE_OK(context, SetHybridScales(context, node)); + + op_data->compute_row_sums = true; + + // Allocate temporary tensors to store quantized values of input, + // output_state and cell_state tensors. + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, + GetTensorShape(input).FlatSize() * + TfLiteTypeGetSize(input_to_output_weights->type), + &(op_data->scratch_index[kInputQuantized]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, + GetTensorShape(output_state).FlatSize() * + TfLiteTypeGetSize(input_to_output_weights->type), + &(op_data->scratch_index[kOutputStateQuantized]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, + GetTensorShape(cell_state).FlatSize() * + TfLiteTypeGetSize(input_to_output_weights->type), + &(op_data->scratch_index[kCellStateQuantized]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kScales]))); + + // Allocate temporary buffers to store scaling factors and product scaling + // factors. The latter is a convenience storage which allows to quantize + // a vector once (which produces the scaling factors) and multiply it with + // different matrices (which requires multiplying the scaling factors with + // the scaling factor of the matrix). 
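+
+    // Illustrative numbers (assumed, not from a real model): a float input
+    // row whose max |x| is 2.54 quantizes symmetrically to int8 with scaling
+    // factor 2.54 / 127 = 0.02; multiplying by a weight matrix of scale 0.01
+    // then needs the product scaling factor 0.02 * 0.01 = 0.0002 to recover
+    // float accumulator values.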
+ + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kInputScalingFactors]))); + + TF_LITE_ENSURE_OK( + context, context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kOutputStateScalingFactors]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kProductScalingFactors]))); + + // Allocate a temporary buffer to store the recovered cell weights. Since + // this is used for diagonal matrices, only need to store n_cell values. + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_cell * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kRecoveredCellWeights]))); + + // Allocate a temporary buffer to store the accumulated int32 values. + TF_LITE_ENSURE_OK( + context, + context->RequestScratchBufferInArena( + context, n_cell * n_batch * TfLiteTypeGetSize(kTfLiteInt32), + &(op_data->scratch_index[kAccumScratch]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kInputZeroPoints]))); + + TF_LITE_ENSURE_OK(context, + context->RequestScratchBufferInArena( + context, n_batch * TfLiteTypeGetSize(kTfLiteFloat32), + &(op_data->scratch_index[kOutputStateZeroPoints]))); + + int row_sums_rows = use_cifg ? 6 : 8; + TfLiteTensor* projection_weights = micro_context->AllocateTempInputTensor( + node, kLstmProjectionWeightsTensor); + if (projection_weights != nullptr) { + row_sums_rows += ceil(static_cast(n_output) / n_cell); + } + op_data->row_sums_size = row_sums_rows; + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + op_data->row_sums = static_cast(context->AllocatePersistentBuffer( + context, row_sums_rows * n_cell * sizeof(int32_t))); + if (projection_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(projection_weights); + } + } + + if (is_integer) { + // Integer UnidirectionalSequenceLSTM prepare function for 8x8->16. + // This code path needs 5 intermediate tensors per Op. + // Populate quantization parameters. + PopulateQuantizedLstmParams8x8_16(context, node, + &op_data->integer_lstm_param); + // Allocate scratch buffer. Need 4 16-bit buffer with size n_batch * n_cell + // and 1 8-bit buffer with size n_batch * n_cell. For integer + // UnidirectionalSequenceLSTM, we do not need the extra 32-bit buffer. + for (int i = 0; i < 5; ++i) { + TfLiteType buffer_type = kTfLiteInt16; + + if (i == 4) { + buffer_type = kTfLiteInt8; + } + + TF_LITE_ENSURE_OK( + context, + context->RequestScratchBufferInArena( + context, n_batch * n_cell * TfLiteTypeGetSize(buffer_type), + &(op_data->scratch_index[i]))); + } + + // Populate precomputed zp * weight. 
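+    // The precomputation relies on the identity
+    //   sum_j w[i][j] * (x[j] - zp) = sum_j w[i][j] * x[j] - zp * sum_j w[i][j]
+    // so folding bias[i] - zp * row_sum(w[i]) into an "effective bias" lets
+    // Eval run the matmul on raw quantized values with no per-step zero-point
+    // correction; this is why the zero points were captured negated above.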
+ TF_LITE_ENSURE_OK(context, PopulatePrecomputedZPTimesWeightsWithBias( + context, op_data, node)); + } + + if (input != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input); + } + if (input_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_output_weights); + } + if (recurrent_to_output_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(recurrent_to_output_weights); + } + if (output != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output); + } + if (output_state != nullptr) { + micro_context->DeallocateTempTfLiteTensor(output_state); + } + if (cell_state != nullptr) { + micro_context->DeallocateTempTfLiteTensor(cell_state); + } + + if (input_to_input_weights != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input_to_input_weights); + } + return kTfLiteOk; +} + +TfLiteStatus UnidirectionalSequenceLstmEval(TfLiteContext* context, + TfLiteNode* node) { + TFLITE_DCHECK(context->GetScratchBuffer != nullptr); + + const auto* params = + reinterpret_cast( + node->builtin_data); + const UnidirectionalSequenceLstmOpData* op_data = + reinterpret_cast(node->user_data); + const bool use_layer_norm = op_data->use_layer_norm; + const bool time_major = params->time_major; + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kLstmInputTensor); + + const TfLiteEvalTensor* input_to_input_weights = tflite::micro::GetEvalInput( + context, node, kLstmInputToInputWeightsTensor); + + const TfLiteEvalTensor* input_to_forget_weights = tflite::micro::GetEvalInput( + context, node, kLstmInputToForgetWeightsTensor); + + const TfLiteEvalTensor* input_to_cell_weights = + tflite::micro::GetEvalInput(context, node, kLstmInputToCellWeightsTensor); + + const TfLiteEvalTensor* input_to_output_weights = tflite::micro::GetEvalInput( + context, node, kLstmInputToOutputWeightsTensor); + + const TfLiteEvalTensor* recurrent_to_input_weights = + tflite::micro::GetEvalInput(context, node, + kLstmRecurrentToInputWeightsTensor); + + const TfLiteEvalTensor* recurrent_to_forget_weights = + tflite::micro::GetEvalInput(context, node, + kLstmRecurrentToForgetWeightsTensor); + + const TfLiteEvalTensor* recurrent_to_cell_weights = + tflite::micro::GetEvalInput(context, node, + kLstmRecurrentToCellWeightsTensor); + + const TfLiteEvalTensor* recurrent_to_output_weights = + tflite::micro::GetEvalInput(context, node, + kLstmRecurrentToOutputWeightsTensor); + + const TfLiteEvalTensor* cell_to_input_weights = + tflite::micro::GetEvalInput(context, node, kLstmCellToInputWeightsTensor); + + const TfLiteEvalTensor* cell_to_forget_weights = tflite::micro::GetEvalInput( + context, node, kLstmCellToForgetWeightsTensor); + + const TfLiteEvalTensor* cell_to_output_weights = tflite::micro::GetEvalInput( + context, node, kLstmCellToOutputWeightsTensor); + + const TfLiteEvalTensor* input_gate_bias = + tflite::micro::GetEvalInput(context, node, kLstmInputGateBiasTensor); + + const TfLiteEvalTensor* forget_gate_bias = + tflite::micro::GetEvalInput(context, node, kLstmForgetGateBiasTensor); + + const TfLiteEvalTensor* cell_gate_bias = + tflite::micro::GetEvalInput(context, node, kLstmCellGateBiasTensor); + + const TfLiteEvalTensor* output_gate_bias = + tflite::micro::GetEvalInput(context, node, kLstmOutputGateBiasTensor); + + const TfLiteEvalTensor* projection_weights = + tflite::micro::GetEvalInput(context, node, kLstmProjectionWeightsTensor); + + const TfLiteEvalTensor* projection_bias = + tflite::micro::GetEvalInput(context, node, 
kLstmProjectionBiasTensor); + + TfLiteEvalTensor* output_state = + tflite::micro::GetMutableEvalInput(context, node, kLstmOutputStateTensor); + + TfLiteEvalTensor* cell_state = + tflite::micro::GetMutableEvalInput(context, node, kLstmCellStateTensor); + + TFLITE_DCHECK(cell_state != nullptr); + + const TfLiteEvalTensor* input_layer_norm_coefficients = + use_layer_norm ? tflite::micro::GetEvalInput( + context, node, kLstmInputLayerNormCoefficientsTensor) + : nullptr; + const TfLiteEvalTensor* forget_layer_norm_coefficients = + use_layer_norm + ? tflite::micro::GetEvalInput(context, node, + kLstmForgetLayerNormCoefficientsTensor) + : nullptr; + const TfLiteEvalTensor* cell_layer_norm_coefficients = + use_layer_norm ? tflite::micro::GetEvalInput( + context, node, kLstmCellLayerNormCoefficientsTensor) + : nullptr; + const TfLiteEvalTensor* output_layer_norm_coefficients = + use_layer_norm + ? tflite::micro::GetEvalInput(context, node, + kLstmOutputLayerNormCoefficientsTensor) + : nullptr; + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kLstmOutputTensor); + + // Copy out the LSTM specific params so they can be passed in the function. + TfLiteLSTMParams lstm_params; + lstm_params.activation = params->activation; + lstm_params.cell_clip = params->cell_clip; + lstm_params.proj_clip = params->proj_clip; + lstm_params.asymmetric_quantize_inputs = params->asymmetric_quantize_inputs; + + switch (input_to_output_weights->type) { + case kTfLiteFloat32: { + // Index the scratch buffers pointers to the global scratch buffer. + return EvalFloatLstm( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + input_layer_norm_coefficients, forget_layer_norm_coefficients, + cell_layer_norm_coefficients, output_layer_norm_coefficients, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_gate_bias, output_gate_bias, + projection_weights, projection_bias, &lstm_params, + /*forward_sequence=*/true, time_major, + /*output_offset=*/0, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kPrimaryScratchBuffer])), + output_state, cell_state, output); + } break; + case kTfLiteUInt8: + case kTfLiteInt8: { + const bool is_hybrid = input->type == kTfLiteFloat32; + if (is_hybrid) { + // Index the scratch buffers pointers to the global scratch buffer. 
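+        // Every handle stored in op_data->scratch_index during Prepare is
+        // mapped back to a pointer here; e.g. the int8 buffer holding the
+        // quantized input is fetched as
+        //   reinterpret_cast<int8_t*>(context->GetScratchBuffer(
+        //       context, op_data->scratch_index[kInputQuantized]));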
+ UnidirectionalSequenceLstmOpData* op_data_rw = + reinterpret_cast( + node->user_data); + return EvalHybridLstm( + &(op_data->hybrid_lstm_scales), input, input_to_input_weights, + /*input_to_input_weights_ledger*/ nullptr, input_to_forget_weights, + /*input_to_forget_weights_ledger*/ nullptr, input_to_cell_weights, + /*input_to_cell_weights_ledger*/ nullptr, input_to_output_weights, + /*input_to_output_weights_ledger*/ nullptr, + recurrent_to_input_weights, + /*recurrent_to_input_weights_ledger*/ nullptr, + recurrent_to_forget_weights, + /*recurrent_to_forget_weights_ledger*/ nullptr, + recurrent_to_cell_weights, + /*recurrent_to_cell_weights_ledger*/ nullptr, + recurrent_to_output_weights, + /*recurrent_to_output_weights_ledger*/ nullptr, + cell_to_input_weights, cell_to_forget_weights, + cell_to_output_weights, input_layer_norm_coefficients, + forget_layer_norm_coefficients, cell_layer_norm_coefficients, + output_layer_norm_coefficients, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_gate_bias, output_gate_bias, + projection_weights, /*projection_weights_ledger*/ nullptr, + projection_bias, &lstm_params, + /*forward_sequence=*/true, time_major, + /*output_offset=*/0, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kPrimaryScratchBuffer])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kInputScalingFactors])), + /*aux_input_sf=*/nullptr, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kOutputStateScalingFactors])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kProductScalingFactors])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kRecoveredCellWeights])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kInputQuantized])), + /*aux_input_quantized=*/nullptr, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kOutputStateQuantized])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kCellStateQuantized])), + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kScales])), + output_state, cell_state, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kAccumScratch])), + output, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kInputZeroPoints])), + /*aux_input_zp=*/nullptr, + reinterpret_cast(context->GetScratchBuffer( + context, op_data->scratch_index[kOutputStateZeroPoints])), + op_data_rw->row_sums, op_data_rw->row_sums_size, + &op_data_rw->compute_row_sums); + } else { + return EvalInteger8x8_16Lstm( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, + cell_to_output_weights, input_layer_norm_coefficients, + forget_layer_norm_coefficients, cell_layer_norm_coefficients, + output_layer_norm_coefficients, input_gate_bias, forget_gate_bias, + cell_gate_bias, output_gate_bias, projection_weights, + projection_bias, &lstm_params, /*forward_sequence=*/true, + time_major, &op_data->integer_lstm_param, + op_data->output_state_zero_point, output_state, 
+            cell_state, output,
+            reinterpret_cast<int16_t*>(
+                context->GetScratchBuffer(context, op_data->scratch_index[0])),
+            reinterpret_cast<int16_t*>(
+                context->GetScratchBuffer(context, op_data->scratch_index[1])),
+            reinterpret_cast<int16_t*>(
+                context->GetScratchBuffer(context, op_data->scratch_index[2])),
+            reinterpret_cast<int16_t*>(
+                context->GetScratchBuffer(context, op_data->scratch_index[3])),
+            reinterpret_cast<int8_t*>(
+                context->GetScratchBuffer(context, op_data->scratch_index[4])),
+            nullptr);
+      }
+    } break;
+    default:
+      MicroPrintf("Type %s is not currently supported.",
+                  TfLiteTypeGetName(input_to_output_weights->type));
+      return kTfLiteError;
+  }
+}
+
+}  // namespace
+
+TfLiteRegistration Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  return tflite::micro::RegisterOp(UnidirectionalSequenceLstmInit,
+                                   UnidirectionalSequenceLstmPrepare,
+                                   UnidirectionalSequenceLstmEval);
+}
+
+}  // namespace tflite
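The kernel above registers itself through the `tflite::micro::RegisterOp` helper instead of a hand-rolled `TfLiteRegistration` initializer. For orientation, here is a minimal usage sketch (not part of this diff) showing how an application would make the kernel resolvable; it assumes the `MicroMutableOpResolver::AddUnidirectionalSequenceLSTM()` convenience method of this TFLM snapshot:

```cpp
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// Resolver with room for one op; real applications size this to their model.
tflite::MicroMutableOpResolver<1> op_resolver;

void RegisterLstmOp() {
  // Internally wires Register_UNIDIRECTIONAL_SEQUENCE_LSTM() into the
  // resolver so the interpreter can find Init/Prepare/Eval by builtin opcode.
  op_resolver.AddUnidirectionalSequenceLSTM();
}
```

diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test_config.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test_config.h
new file mode 100644
index 00000000..e37c0efd
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test_config.h
@@ -0,0 +1,244 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.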
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_TEST_CONFIG_H_
+#define TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_TEST_CONFIG_H_
+
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+namespace testing {
+
+// TODO(b/230666079) enable below tests for xtensa when the xtensa
+// kernel is reconciled with reference kernel
+#if !defined(XTENSA)
+
+typedef struct LstmIntegerTestConfig {
+  const int n_batch;
+  const int n_input;
+  const int n_cell;
+  const int n_output;
+  const int sequence_length;
+  const bool time_major;
+  const bool use_cifg;
+  const bool use_peephole;
+  const bool use_projection_weights;
+  const bool use_projection_bias;
+  const bool use_layer_norm;
+  const bool use_8x8_8_implementation;
+  float intermediate_scale[5][2];
+  int intermediate_zp[5][2];
+  TfLiteAffineQuantization* intermediate_qparam;
+
+  const float* input;
+  int8_t* input_quant;
+
+  const float* input_to_input_weights;
+  int8_t* lstm_i2i_quant;
+  const float* input_to_forget_weights;
+  int8_t* lstm_i2f_quant;
+  const float* input_to_cell_weights;
+  int8_t* lstm_i2c_quant;
+  const float* input_to_output_weights;
+  int8_t* lstm_i2o_quant;
+
+  const float* recurrent_to_input_weights;
+  int8_t* lstm_r2i_quant;
+  const float* recurrent_to_forget_weights;
+  int8_t* lstm_r2f_quant;
+  const float* recurrent_to_cell_weights;
+  int8_t* lstm_r2c_quant;
+  const float* recurrent_to_output_weights;
+  int8_t* lstm_r2o_quant;
+
+  const float* cell_to_input_weights;
+  int16_t* lstm_c2i_quant;
+  const float* cell_to_forget_weights;
+  int16_t* lstm_c2f_quant;
+  const float* cell_to_output_weights;
+  int16_t* lstm_c2o_quant;
+
+  const float* input_gate_bias;
+  int32_t* lstm_igate_bias_quant;
+  const float* forget_gate_bias;
+  int32_t* lstm_fgate_bias_quant;
+  const float* cell_gate_bias;
+  int32_t* lstm_cgate_bias_quant;
+  const float* output_gate_bias;
+  int32_t* lstm_ogate_bias_quant;
+
+  const float* projection_weights;
+  int8_t* lstm_proj_w_quant;
+  const float* projection_bias;
+  int32_t* projection_bias_quant;
+
+  int16_t* output_state;
+  int16_t* cell_state;
+
+  const float* input_layer_norm_coefficients;
+  int16_t* lstm_input_layer_norm_coeff_quant;
+  const float* forget_layer_norm_coefficients;
+  int16_t* lstm_forget_layer_norm_coeff_quant;
+  const float* cell_layer_norm_coefficients;
+  int16_t* lstm_cell_layer_norm_coeff_quant;
+  const float* output_layer_norm_coefficients;
+  int16_t* lstm_output_layer_norm_coeff_quant;
+
+  int8_t* output;
+  const int8_t* expected_output;
+
+  bool asymmetric_quantize_inputs;
+  const float ranges[25][2];
+} LstmIntegerTestConfig;
+
+typedef struct LstmFloatTestConfig {
+  const int n_batch;
+  const int n_input;
+  const int n_cell;
+  const int n_output;
+  const int sequence_length;
+  const bool time_major;
+  const bool use_cifg;
+  const bool use_peephole;
+  const bool use_projection_weights;
+  const bool use_projection_bias;
+  const bool use_layer_norm;
+  const float cell_clip;
+  const float proj_clip;
+
+  const float* input_original;
+  float* input;
+
+  const float* input_to_input_weights;
+  const float* input_to_forget_weights;
+  const float* input_to_cell_weights;
+  const float* input_to_output_weights;
+
+  const float* recurrent_to_input_weights;
+  const float* recurrent_to_forget_weights;
+  const float* recurrent_to_cell_weights;
+  const float* recurrent_to_output_weights;
+
+  const float* cell_to_input_weights;
+  const float* cell_to_forget_weights;
+  const float* cell_to_output_weights;
+
+  const float* input_gate_bias;
+  const float* forget_gate_bias;
+  const float* cell_gate_bias;
+  const float* output_gate_bias;
+
+  const float* projection_weights;
+  const float* projection_bias;
+
+  float* output_state;
+  float* cell_state;
+
+  const float* input_layer_norm_coefficients;
+  const float* forget_layer_norm_coefficients;
+  const float* cell_layer_norm_coefficients;
+  const float* output_layer_norm_coefficients;
+
+  float* output;
+  const float* expected_output_original;
+  float* expected_output;
+} LstmFloatTestConfig;
+
+typedef struct LstmWeightQuantizationBuffers {
+  int8_t* lstm_i2i_quant;
+  float* lstm_i2i_scale;
+  int* lstm_i2i_zp;
+  TfLiteAffineQuantization* lstm_i2i_qparam;
+
+  int8_t* lstm_i2f_quant;
+  float* lstm_i2f_scale;
+  int* lstm_i2f_zp;
+  TfLiteAffineQuantization* lstm_i2f_qparam;
+
+  int8_t* lstm_i2c_quant;
+  float* lstm_i2c_scale;
+  int* lstm_i2c_zp;
+  TfLiteAffineQuantization* lstm_i2c_qparam;
+
+  int8_t* lstm_i2o_quant;
+  float* lstm_i2o_scale;
+  int* lstm_i2o_zp;
+  TfLiteAffineQuantization* lstm_i2o_qparam;
+
+  int8_t* lstm_r2i_quant;
+  float* lstm_r2i_scale;
+  int* lstm_r2i_zp;
+  TfLiteAffineQuantization* lstm_r2i_qparam;
+
+  int8_t* lstm_r2f_quant;
+  float* lstm_r2f_scale;
+  int* lstm_r2f_zp;
+  TfLiteAffineQuantization* lstm_r2f_qparam;
+
+  int8_t* lstm_r2c_quant;
+  float* lstm_r2c_scale;
+  int* lstm_r2c_zp;
+  TfLiteAffineQuantization* lstm_r2c_qparam;
+
+  int8_t* lstm_r2o_quant;
+  float* lstm_r2o_scale;
+  int* lstm_r2o_zp;
+  TfLiteAffineQuantization* lstm_r2o_qparam;
+
+  int8_t* lstm_c2i_quant;
+  float* lstm_c2i_scale;
+  int* lstm_c2i_zp;
+  TfLiteAffineQuantization* lstm_c2i_qparam;
+
+  int8_t* lstm_c2f_quant;
+  float* lstm_c2f_scale;
+  int* lstm_c2f_zp;
+  TfLiteAffineQuantization* lstm_c2f_qparam;
+
+  int8_t* lstm_c2o_quant;
+  float* lstm_c2o_scale;
+  int* lstm_c2o_zp;
+  TfLiteAffineQuantization* lstm_c2o_qparam;
+
+  int8_t* lstm_proj_w_quant;
+  float* lstm_proj_w_scale;
+  int* lstm_proj_w_zp;
+  TfLiteAffineQuantization* lstm_proj_w_qparam;
+} LstmWeightQuantizationBuffers;
+
+extern LstmIntegerTestConfig lstm_integer_no_peephole_config;
+
+extern LstmIntegerTestConfig lstm_integer_peephole_config;
+
+extern LstmFloatTestConfig lstm_no_cifg_no_peephole_no_proj_config;
+
+extern LstmFloatTestConfig lstm_cifg_peephole_no_proj_config;
+
+extern LstmFloatTestConfig lstm_no_cifg_peephole_proj_config;
+
+extern LstmFloatTestConfig lstm_no_cifg_peephole_proj_bias_config;
+
+extern LstmWeightQuantizationBuffers lstm_no_cifg_no_peephole_no_proj_buffers;
+
+extern LstmWeightQuantizationBuffers lstm_cifg_peephole_no_proj_buffers;
+
+extern LstmWeightQuantizationBuffers lstm_no_cifg_peephole_proj_buffers;
+
+extern LstmFloatTestConfig cifg_peephole_no_proj_config_layer_norm;
+
+#endif  // !defined(XTENSA)
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_TEST_CONFIG_H_
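A sketch of how a kernel test might consume one of the extern configs declared above (illustrative only; the real harness lives in the corresponding `*_test.cc`, which is not part of this excerpt):

```cpp
#include "tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test_config.h"

void SanityCheckFloatConfig() {
  const tflite::testing::LstmFloatTestConfig& cfg =
      tflite::testing::lstm_no_cifg_no_peephole_no_proj_config;
  // The expected output spans sequence_length * n_batch * n_output elements
  // (ordering depends on cfg.time_major).
  const int expected_elements =
      cfg.sequence_length * cfg.n_batch * cfg.n_output;
  (void)expected_elements;  // consumed by the real test harness
}
```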
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/unpack.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unpack.cc
index 13bb7dcf..d199add0
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/unpack.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/unpack.cc
@@ -103,14 +103,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace unpack
 
 TfLiteRegistration Register_UNPACK() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/nullptr,
-          /*invoke=*/unpack::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, nullptr, unpack::Eval);
 }
 
 }  // namespace micro
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc
index 2efffdb6..db044f3f
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc
@@ -46,14 +46,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const auto* params =
       reinterpret_cast<TfLiteVarHandleParams*>(node->builtin_data);
 
-  // Casting to TfliteIntArray is required since we are re-using
-  // GetExecutionPlan from TfLiteContext. On TFLM this method returns a
-  // MicroGraph.
-  // TODO(b/188226309): Design a cleaner way to get a graph from kernel context.
-  MicroGraph* graph_info;
-  context->GetExecutionPlan(context,
-                            reinterpret_cast<TfLiteIntArray**>(&graph_info));
-  MicroResourceVariables* resources = graph_info->GetResourceVariables();
+  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
+  MicroGraph& graph_info = micro_context->graph();
+
+  MicroResourceVariables* resources = graph_info.GetResourceVariables();
   if (resources == nullptr) {
     MicroPrintf(
         "VAR_HANDLE requires resource variables. Please create "
@@ -91,14 +87,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace.
 
 TfLiteRegistration Register_VAR_HANDLE() {
-  return {/*init=*/Init,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
 }
 
 }  // namespace tflite
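Note that VAR_HANDLE's `Prepare` now fails unless the application has supplied resource variables to the interpreter. A setup sketch follows; it is hedged, since the exact `MicroInterpreter`/`MicroResourceVariables` constructor signatures vary between TFLM revisions, and the arena, model, and resolver names here are placeholders:

```cpp
#include "tensorflow/lite/micro/micro_allocator.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_resource_variable.h"

void RunWithResourceVariables(const tflite::Model* model,
                              const tflite::MicroOpResolver& resolver) {
  static uint8_t tensor_arena[16 * 1024];
  tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create(
      tensor_arena, sizeof(tensor_arena), tflite::GetMicroErrorReporter());
  // Without this, VAR_HANDLE's Prepare() logs "VAR_HANDLE requires resource
  // variables..." and returns an error.
  tflite::MicroResourceVariables* resources =
      tflite::MicroResourceVariables::Create(allocator, /*num_variables=*/4);
  tflite::MicroInterpreter interpreter(model, resolver, allocator, resources);
  interpreter.AllocateTensors();
  interpreter.Invoke();
}
```

diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc
new file mode 100644
index 00000000..811c9eae
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc
@@ -0,0 +1,133 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.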
+==============================================================================*/
+
+#include <stddef.h>
+
+#include <cstring>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_context.h"
+#include "tensorflow/lite/micro/micro_graph.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+
+namespace {
+
+struct OpData {
+  int cond_subgraph_index;
+  int body_subgraph_index;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpData));
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  const auto* params =
+      reinterpret_cast<TfLiteWhileParams*>(node->builtin_data);
+
+  op_data->cond_subgraph_index = params->cond_subgraph_index;
+  op_data->body_subgraph_index = params->body_subgraph_index;
+
+  // The first input is the condition.
+  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
+
+  size_t num_inputs = node->inputs->size;
+  size_t num_outputs = node->outputs->size;
+
+  MicroGraph& graph_info = micro_context->graph();
+
+  TF_LITE_ENSURE(context,
+                 op_data->cond_subgraph_index < graph_info.NumSubgraphs());
+  TF_LITE_ENSURE(context,
+                 op_data->body_subgraph_index < graph_info.NumSubgraphs());
+
+  TF_LITE_ENSURE_EQ(context, num_inputs,
+                    graph_info.NumSubgraphInputs(op_data->cond_subgraph_index));
+  TF_LITE_ENSURE_EQ(context, num_inputs,
+                    graph_info.NumSubgraphInputs(op_data->body_subgraph_index));
+  TF_LITE_ENSURE_EQ(context, num_inputs, num_outputs);
+  TF_LITE_ENSURE_EQ(
+      context, num_outputs,
+      graph_info.NumSubgraphOutputs(op_data->body_subgraph_index));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
+  MicroGraph* graph_info = &micro_context->graph();
+
+  TF_LITE_ENSURE_OK(context,
+                    tflite::micro::CopyOpInputsToSubgraphInputs(
+                        context, node, graph_info, op_data->cond_subgraph_index,
+                        /*first_tensor_idx=*/0));
+
+  TF_LITE_ENSURE_OK(context,
+                    graph_info->InvokeSubgraph(op_data->cond_subgraph_index));
+
+  TfLiteEvalTensor* cond_subgraph_output = graph_info->GetSubgraphOutput(
+      op_data->cond_subgraph_index, /*tensor_idx=*/0);
+  bool cond_value = cond_subgraph_output->data.b[0];
+
+  TF_LITE_ENSURE_OK(context,
+                    tflite::micro::CopyOpInputsToSubgraphInputs(
+                        context, node, graph_info, op_data->body_subgraph_index,
+                        /*first_tensor_idx=*/0));
+  TF_LITE_ENSURE_OK(context,
+                    tflite::micro::CopyOpInputsToOpOutputs(context, node));
+
+  while (cond_value == true) {
+    // Copy output of this iteration back to the body input.
+    TF_LITE_ENSURE_OK(
+        context, tflite::micro::CopyOpOutputsToSubgraphInputs(
+                     context, node, graph_info, op_data->body_subgraph_index));
+    TF_LITE_ENSURE_OK(context,
+                      graph_info->InvokeSubgraph(op_data->body_subgraph_index));
+
+    TF_LITE_ENSURE_OK(
+        context, tflite::micro::CopySubgraphOutputsToOpOutputs(
+                     context, node, graph_info, op_data->body_subgraph_index));
+    TF_LITE_ENSURE_OK(
+        context, tflite::micro::CopyOpOutputsToSubgraphInputs(
+                     context, node, graph_info, op_data->cond_subgraph_index));
+    TF_LITE_ENSURE_OK(context,
+                      graph_info->InvokeSubgraph(op_data->cond_subgraph_index));
+
+    cond_subgraph_output = graph_info->GetSubgraphOutput(
+        op_data->cond_subgraph_index, /*tensor_idx=*/0);
+    cond_value = cond_subgraph_output->data.b[0];
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace.
+
+TfLiteRegistration Register_WHILE() {
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc
index ce403927..fd6e6612
--- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc
@@ -25,15 +25,20 @@ constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input;
-  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input));
-  TfLiteTensor* output;
-  TF_LITE_ENSURE_OK(context,
-                    GetOutputSafe(context, node, kOutputTensor, &output));
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
   output->type = input->type;
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
   return kTfLiteOk;
 }
@@ -76,14 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 
 TfLiteRegistration Register_ZEROS_LIKE() {
-  return {/*init=*/nullptr,
-          /*free=*/nullptr,
-          /*prepare=*/Prepare,
-          /*invoke=*/Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
+  return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
 }
 
 }  // namespace tflite
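The zeros_like change above is the same MicroContext migration applied across kernels in this diff: `GetInputSafe`/`GetOutputSafe` become temp-tensor allocations that must be released before `Prepare` returns. A generic sketch of the pattern for a hypothetical single-input, single-output op, using only the APIs visible in the hunk above:

```cpp
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"

TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);

  // Temp TfLiteTensor views are only valid inside Prepare()...
  TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
  TF_LITE_ENSURE(context, output != nullptr);

  output->type = input->type;

  // ...and every temp allocation must be handed back, otherwise the
  // allocator's temp bookkeeping (see IsAllTempDeallocated() later in this
  // diff) reports a leak.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}
```

diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc
new file mode 100644
index 00000000..edab2b83
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc
@@ -0,0 +1,351 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.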
+==============================================================================*/
+#include "tensorflow/lite/micro/micro_allocation_info.h"
+
+#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+
+namespace {
+constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation";
+constexpr int kUninitializedLifetime = -1;
+}  // namespace
+
+// Mark the given AllocationInfo as first created at the specified allocation
+// scope count. Only the first creation must be recorded since the allocation
+// scope count monotonically increases throughout the lifetime marking process.
+void AllocationInfoBuilder::UpdateFirstCreated(AllocationInfo* current,
+                                               int allocation_scope_count) {
+  TFLITE_DCHECK(current->first_created <= allocation_scope_count);
+  if (current->first_created == kUninitializedLifetime) {
+    current->first_created = allocation_scope_count;
+  }
+}
+
+// Mark the given AllocationInfo as last used at the specified allocation scope
+// count. Update the last used marker every time, since the allocation scope
+// count monotonically increases through the lifetime marking process.
+void AllocationInfoBuilder::UpdateLastUsed(AllocationInfo* current,
+                                           int allocation_scope_count) {
+  TFLITE_DCHECK(current->last_used <= allocation_scope_count);
+  current->last_used = allocation_scope_count;
+}
+
+TfLiteStatus AllocationInfoBuilder::MarkSubgraphLifetimesIfNecessary(
+    const Operator* op, internal::ScratchBufferRequest* scratch_buffer_requests,
+    ScratchBufferHandle* scratch_buffer_handles,
+    SubgraphAllocations* allocations) {
+  int first_subgraph_index = -1;
+  int second_subgraph_index = -1;
+  const OperatorCode* opcode =
+      model_->operator_codes()->Get(op->opcode_index());
+  switch (opcode->builtin_code()) {
+    case BuiltinOperator_IF: {
+      first_subgraph_index =
+          op->builtin_options_as_IfOptions()->then_subgraph_index();
+      second_subgraph_index =
+          op->builtin_options_as_IfOptions()->else_subgraph_index();
+      break;
+    }
+    case BuiltinOperator_CALL_ONCE: {
+      first_subgraph_index =
+          op->builtin_options_as_CallOnceOptions()->init_subgraph_index();
+      break;
+    }
+    case BuiltinOperator_WHILE: {
+      first_subgraph_index =
+          op->builtin_options_as_WhileOptions()->cond_subgraph_index();
+      second_subgraph_index =
+          op->builtin_options_as_WhileOptions()->body_subgraph_index();
+      break;
+    }
+    default: {
+      break;
+    }
+  }
+  if (first_subgraph_index != -1) {
+    // Enter a new allocation scope for each subgraph.
+    allocation_scope_count_++;
+    TF_LITE_ENSURE_STATUS(
+        MarkAllocationLifetimes(first_subgraph_index, scratch_buffer_requests,
+                                scratch_buffer_handles, allocations));
+  }
+  if (second_subgraph_index != -1) {
+    // Enter a new allocation scope for each subgraph.
+    allocation_scope_count_++;
+    TF_LITE_ENSURE_STATUS(
+        MarkAllocationLifetimes(second_subgraph_index, scratch_buffer_requests,
+                                scratch_buffer_handles, allocations));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus AllocationInfoBuilder::CreateAllocationInfo(
+    int scratch_buffer_request_count) {
+  size_t subgraph_offsets_length = model_->subgraphs()->size() * sizeof(size_t);
+  info_.subgraph_offsets =
+      reinterpret_cast<size_t*>(non_persistent_allocator_->AllocateTemp(
+          subgraph_offsets_length, alignof(size_t)));
+  if (info_.subgraph_offsets == nullptr) {
+    TF_LITE_REPORT_ERROR(
+        reporter_,
+        "Failed to allocate memory for memory planning, %d bytes required",
+        subgraph_offsets_length);
+    return kTfLiteError;
+  }
+  size_t tensor_count = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    // Add all tensors in each subgraph to the AllocationInfo array. Even weight
+    // tensors are added but marked with needs_allocating = false. Including all
+    // tensors in the graph here simplifies logic.
+    info_.subgraph_offsets[subgraph_idx] = tensor_count;
+    tensor_count += model_->subgraphs()->Get(subgraph_idx)->tensors()->size();
+  }
+  info_.tensor_count = tensor_count;
+
+  // Scratch buffer allocations follow tensor allocations, so the scratch offset
+  // is equal to the number of tensor allocations.
+  info_.scratch_offset = tensor_count;
+  info_.allocation_info_count = tensor_count + scratch_buffer_request_count;
+  info_.scratch_buffer_count = scratch_buffer_request_count;
+  size_t bytes = sizeof(AllocationInfo) * info_.allocation_info_count;
+
+  // Allocate an array of AllocationInfo structs from the temp section. This
+  // struct will be used by AllocationInfoBuilder to find buffer usage.
+  info_.allocation_info = reinterpret_cast<AllocationInfo*>(
+      non_persistent_allocator_->AllocateTemp(bytes, alignof(AllocationInfo)));
+  if (info_.allocation_info == nullptr) {
+    TF_LITE_REPORT_ERROR(
+        reporter_,
+        "Failed to allocate memory for memory planning, %d bytes required",
+        bytes);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus AllocationInfoBuilder::FreeAllocationInfo() {
+  non_persistent_allocator_->DeallocateTemp(
+      reinterpret_cast<uint8_t*>(info_.allocation_info));
+  non_persistent_allocator_->DeallocateTemp(
+      reinterpret_cast<uint8_t*>(info_.subgraph_offsets));
+  return kTfLiteOk;
+}
+
+TfLiteStatus AllocationInfoBuilder::ValidateSubgraph(
+    const SubGraph* subgraph, TfLiteEvalTensor* eval_tensors) {
+  uint32_t operators_size = NumSubgraphOperators(subgraph);
+
+  for (uint32_t i = 0; i < operators_size; i++) {
+    const auto op = subgraph->operators()->Get(i);
+    for (size_t n = 0;
+         op->intermediates() != nullptr && n < op->intermediates()->size();
+         n++) {
+      const int tensor_index = op->intermediates()->Get(n);
+      size_t tensor_size = -1;
+      TF_LITE_ENSURE_STATUS(TfLiteEvalTensorByteLength(
+          &eval_tensors[tensor_index], &tensor_size));
+      if (tensor_size != 0) {
+        MicroPrintf(
+            "Does not support intermediate tensor with non-zero size: %d",
+            tensor_size);
+        return kTfLiteError;
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo(
+    const int32_t* offline_offsets, SubgraphAllocations* allocations) {
+  AllocationInfo* allocation_info = info_.allocation_info;
+  // Initialize allocation info for every tensor in every subgraph.
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx);
+    TfLiteEvalTensor* eval_tensors = allocations[subgraph_idx].tensors;
+    AllocationInfo* subgraph_allocation_info =
+        &allocation_info[info_.subgraph_offsets[subgraph_idx]];
+
+    // Ensure constraints are met.
+    TF_LITE_ENSURE_STATUS(ValidateSubgraph(subgraph, eval_tensors));
+
+    for (size_t i = 0; i < subgraph->tensors()->size(); ++i) {
+      AllocationInfo* current = &subgraph_allocation_info[i];
+      current->output_ptr = &(eval_tensors[i].data.data);
+
+      TF_LITE_ENSURE_STATUS(
+          TfLiteEvalTensorByteLength(&eval_tensors[i], &current->bytes));
+
+      current->first_created = kUninitializedLifetime;
+      current->last_used = kUninitializedLifetime;
+      current->needs_allocating =
+          (eval_tensors[i].data.data == nullptr) &&
+          (!subgraph->tensors()->Get(i)->is_variable()) &&
+          (current->bytes != 0);
+      if (offline_offsets) {
+        current->offline_offset = offline_offsets[i];
+      } else {
+        current->offline_offset = kOnlinePlannedBuffer;
+      }
+    }
+  }
+  // Initialize allocation info for every scratch buffer.
+  AllocationInfo* scratch_allocation_info =
+      &allocation_info[info_.scratch_offset];
+  for (size_t i = 0; i < info_.scratch_buffer_count; i++) {
+    AllocationInfo* current = &scratch_allocation_info[i];
+    current->first_created = kUninitializedLifetime;
+    current->last_used = kUninitializedLifetime;
+    current->needs_allocating = true;
+    current->offline_offset = kOnlinePlannedBuffer;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus AllocationInfoBuilder::MarkAllocationLifetimes(
+    int subgraph_idx, internal::ScratchBufferRequest* scratch_buffer_requests,
+    ScratchBufferHandle* scratch_buffer_handles,
+    SubgraphAllocations* allocations) {
+  const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx);
+
+  AllocationInfo* allocation_info = info_.allocation_info;
+  // Each subgraph's tensor allocations are in a contiguous block starting at
+  // subgraph_offsets_[subgraph index] with one entry per tensor.
+  AllocationInfo* subgraph_allocation_info =
+      &allocation_info[info_.subgraph_offsets[subgraph_idx]];
+
+  uint32_t operators_size = NumSubgraphOperators(subgraph);
+  // Mark all inputs as created at the start of the subgraph invocation.
+  for (size_t i = 0;
+       subgraph->inputs() != nullptr && i < subgraph->inputs()->size(); ++i) {
+    const int tensor_index = subgraph->inputs()->Get(i);
+    AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+    UpdateFirstCreated(current, allocation_scope_count_);
+  }
+
+  for (uint32_t i = 0; i < operators_size; i++) {
+    // Each operator has a new allocation scope.
+    allocation_scope_count_++;
+    const auto* op = subgraph->operators()->Get(i);
+    // Figure out when the first creation and use of each tensor is.
+    for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size();
+         ++n) {
+      const int tensor_index = op->outputs()->Get(n);
+      AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+      UpdateFirstCreated(current, allocation_scope_count_);
+    }
+
+    // Keep track of scope count before any subgraphs, so that scratch buffers'
+    // lifetime within a control flow op properly overlaps with all subgraphs.
+    int start_allocation_scope_count = allocation_scope_count_;
+
+    // Control flow operators can invoke subgraphs. Plan these subgraphs
+    // before continuing on to the rest of the graph.
+    MarkSubgraphLifetimesIfNecessary(op, scratch_buffer_requests,
+                                     scratch_buffer_handles, allocations);
+
+    // Figure out when the last use of each tensor is.
+    for (size_t n = 0; op->inputs() != nullptr && n < op->inputs()->size();
+         ++n) {
+      const int tensor_index = op->inputs()->Get(n);
+      // Optional bias tensors can have an index of -1 when they are omitted.
+      if (tensor_index >= 0) {
+        AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+        // No need to update creation since it is either marked by the subgraph
+        // or producer op, or it is not part of the memory plan (weight, bias
+        // tensor).
+        UpdateLastUsed(current, allocation_scope_count_);
+      }
+    }
+    for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size();
+         ++n) {
+      const int tensor_index = op->outputs()->Get(n);
+      AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+      UpdateLastUsed(current, allocation_scope_count_);
+    }
+
+    // Mark the lifetime of scratch buffers belonging to the current node. This
+    // operation is O(N * M) where N is the total number of visited nodes and M
+    // is the total number of scratch buffers.
+    // TODO(b/217794030): Optimize this memory planning code.
+    AllocationInfo* scratch_allocation_info =
+        &allocation_info[info_.scratch_offset];
+    for (size_t scratch_idx = 0; scratch_idx < info_.scratch_buffer_count;
+         scratch_idx++) {
+      internal::ScratchBufferRequest request =
+          scratch_buffer_requests[scratch_idx];
+      AllocationInfo* current = &scratch_allocation_info[scratch_idx];
+      if (request.node_idx == static_cast<int>(i) &&
+          request.subgraph_idx == static_cast<int>(subgraph_idx)) {
+        ScratchBufferHandle* current_handle =
+            &(scratch_buffer_handles[scratch_idx]);
+        current->output_ptr = reinterpret_cast<void**>(&current_handle->data);
+        current->bytes = request.bytes;
+        UpdateFirstCreated(current, start_allocation_scope_count);
+        UpdateLastUsed(current, allocation_scope_count_);
+      }
+    }
+  }
+
+  // Mark all outputs as persistent to the end of the subgraph invocation.
+  for (size_t i = 0;
+       subgraph->outputs() != nullptr && i < subgraph->outputs()->size(); ++i) {
+    const int tensor_index = subgraph->outputs()->Get(i);
+    AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+    UpdateLastUsed(current, allocation_scope_count_);
+  }
+  return kTfLiteOk;
+}
+
+// Get offline tensors allocation plan. See
+// micro/docs/memory_management.md for more info.
+TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets(
+    const int32_t** offline_planner_offsets) {
+  if (model_->metadata()) {
+    for (size_t i = 0; i < model_->metadata()->size(); ++i) {
+      auto metadata = model_->metadata()->Get(i);
+      if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
+                  strlen(kOfflineMemAllocMetadata)) == 0) {
+        const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
+            model_->buffers();
+        auto* buffer = (*buffers)[metadata->buffer()];
+        auto* array = buffer->data();
+        const uint32_t* metadata_buffer =
+            reinterpret_cast<const uint32_t*>(array->data());
+        const size_t nbr_tensors = static_cast<size_t>(metadata_buffer[2]);
+        *offline_planner_offsets =
+            reinterpret_cast<const int32_t*>(&metadata_buffer[3]);
+
+        if (info_.tensor_count != nbr_tensors) {
+          TF_LITE_REPORT_ERROR(reporter_,
+                               "Nbr of offline buffer offsets (%d) in metadata "
+                               "not equal nbr tensors (%d)\n",
+                               nbr_tensors, info_.tensor_count);
+          return kTfLiteError;
+        }
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
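For reference, a sketch of the "OfflineMemoryAllocation" metadata blob that `GetOfflinePlannedOffsets()` parses above. The parsing code only pins down word [2] (tensor count) and the offsets starting at word [3]; the meaning of words [0] and [1] is recalled from TFLM's micro/docs/memory_management.md and should be treated as an assumption:

```cpp
#include <cstdint>

// Hypothetical offline plan for a 3-tensor model.
const uint32_t kOfflinePlan[] = {
    1,           // [0] offline planning version (per memory_management.md)
    0,           // [1] subgraph index the plan applies to
    3,           // [2] number of offsets that follow == tensor count
    0, 64, 128,  // [3..] per-tensor arena offsets, read back as int32_t
};
```

diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h
new file mode 100644
index 00000000..bc6825ef
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h
@@ -0,0 +1,145 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/flatbuffer_utils.h"
+#include "tensorflow/lite/micro/micro_allocator.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// Used to hold information used during allocation calculations.
+struct AllocationInfo {
+  size_t bytes;
+  void** output_ptr;
+  int first_created;
+  int last_used;
+  int32_t offline_offset;
+  bool needs_allocating;
+};
+
+// Used to hold the allocation info list and related metadata for the entire
+// graph (including subgraphs). Since all subgraphs are planned together, the
+// allocation info list contains allocations for all subgraphs. Track the offset
+// into this list for each subgraph then reserve space to track all allocations.
+//
+// The AllocationInfo list is a contiguous list of allocations across all
+// subgraphs and scratch buffers. Each element here is marked as
+// s<subgraph index>t<tensor index>. The following is a possible
+// AllocationInfo list:
+// [s0t0, s0t1, s1t0, s2t1, s1t2, s3t0, s3t1, scratch0, scratch1, scratch2]
+//
+// For this example, the subgraph offsets would be [0, 2, 5] and the scratch
+// offset would be 7.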
+struct GraphAllocationInfo {
+  AllocationInfo* allocation_info;
+  size_t allocation_info_count;
+  size_t* subgraph_offsets;
+  size_t scratch_offset;
+  size_t tensor_count;
+  size_t scratch_buffer_count;
+};
+
+// A helper class to construct AllocationInfo array. This array contains the
+// lifetime of tensors / scratch_buffer and will be used to calculate the memory
+// plan. Methods need to be called in order from `Create`, `Init`, `Add*`, to
+// `Finish`.
+class AllocationInfoBuilder {
+ public:
+  AllocationInfoBuilder(const Model* model,
+                        INonPersistentBufferAllocator* non_persistent_allocator,
+                        ErrorReporter* reporter)
+      : model_(model),
+        non_persistent_allocator_(non_persistent_allocator),
+        reporter_(reporter) {}
+
+  // Check if model contains offline planned buffer offsets.
+  //  - If there's no metadata available, offline_planner_offsets is not set
+  //  - If there's metadata available, offline_planner_offsets will point to the
+  //    first offset in the metadata buffer list.
+  TfLiteStatus GetOfflinePlannedOffsets(
+      const int32_t** offline_planner_offsets);
+
+  // Allocate memory for the allocation info array as well as offsets into that
+  // array for each subgraph.
+  TfLiteStatus CreateAllocationInfo(int scratch_buffer_request_count);
+
+  // Release memory used for the allocation info array.
+  TfLiteStatus FreeAllocationInfo();
+
+  // Initialize AllocationInfo for all tensors and scratch buffers in the graph.
+  TfLiteStatus InitializeAllocationInfo(const int32_t* offline_offsets,
+                                        SubgraphAllocations* allocations);
+
+  // Mark the scope of each tensor and scratch buffer across the graph. Enter
+  // all possible subgraphs invoked by each control flow operator. This method
+  // marks the maximum lifetime of each buffer so that tensors are correctly
+  // planned for all valid invocation flows.
+  TfLiteStatus MarkAllocationLifetimes(
+      int subgraph_idx, internal::ScratchBufferRequest* scratch_buffer_request,
+      ScratchBufferHandle* scratch_buffer_handles,
+      SubgraphAllocations* allocations);
+
+  // Identify control flow operators and recursively mark all subgraphs which
+  // that operator can invoke. The lifetime of all tensors within a subgraph
+  // can only be extended. The order of subgraph invocation does not matter
+  // since subgraphs within the same control flow operator are executed
+  // within their own allocation scope (planned buffers in a subgraph cannot
+  // persist beyond the end of that subgraph's invocation).
+  TfLiteStatus MarkSubgraphLifetimesIfNecessary(
+      const Operator* op,
+      internal::ScratchBufferRequest* scratch_buffer_requests,
+      ScratchBufferHandle* scratch_buffer_handles,
+      SubgraphAllocations* allocations);
+
+  // Returns the number of allocations.
+  int AllocationCount() const { return info_.allocation_info_count; }
+
+  // Returns a pointer to the built AllocationInfo array.
+  AllocationInfo* Finish() const { return info_.allocation_info; }
+
+ private:
+  // Mark the given AllocationInfo as first created at the specified allocation
+  // scope count. Only the first creation must be recorded since the allocation
+  // scope count monotonically increases throughout the lifetime marking
+  // process.
+  void UpdateFirstCreated(AllocationInfo* current, int allocation_scope_count);
+
+  // Mark the given AllocationInfo as last used at the specified allocation
+  // scope count. Update the last used marker every time, since the allocation
+  // scope count monotonically increases through the lifetime marking process.
+  void UpdateLastUsed(AllocationInfo* current, int allocation_scope_count);
+
+  // Validate if a subgraph satisfies assumptions.
+  TfLiteStatus ValidateSubgraph(const SubGraph* subgraph,
+                                TfLiteEvalTensor* eval_tensors);
+
+  const tflite::Model* model_ = nullptr;
+  INonPersistentBufferAllocator* non_persistent_allocator_ = nullptr;
+  ErrorReporter* reporter_ = nullptr;
+
+  GraphAllocationInfo info_;
+  int allocation_scope_count_ = 0;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_
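The class comment's "`Create`, `Init`, `Add*`, `Finish`" ordering maps onto the new method names; the concrete sequence, mirroring how `MicroAllocator::CommitStaticMemoryPlan` drives the builder later in this diff (variable names here are placeholders, error handling elided):

```cpp
// Sketch of the required AllocationInfoBuilder call order.
AllocationInfoBuilder builder(model, non_persistent_allocator, reporter);
builder.CreateAllocationInfo(scratch_buffer_request_count);
// e.g. two subgraphs with 3 and 4 tensors plus 2 scratch requests give
// subgraph_offsets {0, 3}, scratch_offset 7, allocation_info_count 9.
const int32_t* offline_offsets = nullptr;
builder.GetOfflinePlannedOffsets(&offline_offsets);
builder.InitializeAllocationInfo(offline_offsets, subgraph_allocations);
builder.MarkAllocationLifetimes(/*subgraph_idx=*/0, scratch_requests,
                                scratch_handles, subgraph_allocations);
AllocationInfo* plan_input = builder.Finish();  // fed to the memory planner
// ... CreatePlan() / CommitPlan() run here ...
builder.FreeAllocationInfo();                   // release the temp arrays
```

diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc
index ec203b9f..7e5192cf
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc
@@ -19,20 +19,22 @@ limitations under the License.
 #include <cstdint>
 
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/core/api/tensor_utils.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h"
 #include "tensorflow/lite/micro/compatibility.h"
 #include "tensorflow/lite/micro/flatbuffer_utils.h"
 #include "tensorflow/lite/micro/memory_helpers.h"
 #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h"
 #include "tensorflow/lite/micro/memory_planner/micro_memory_planner.h"
+#include "tensorflow/lite/micro/micro_allocation_info.h"
 #include "tensorflow/lite/micro/micro_arena_constants.h"
 #include "tensorflow/lite/micro/micro_error_reporter.h"
-#include "tensorflow/lite/micro/simple_memory_allocator.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/schema/schema_utils.h"
@@ -48,26 +50,17 @@ constexpr size_t kMaxScratchBuffersPerOp = 12;
 // needs a node id assignment.
 constexpr int kUnassignedScratchBufferRequestIndex = -1;
 
-// Used to hold information used during allocation calculations.
-struct AllocationInfo {
-  size_t bytes;
-  void** output_ptr;
-  int first_created;
-  int last_used;
-  int32_t offline_offset;
-  bool needs_allocating;
-};
-
-constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation";
 const TfLiteIntArray kZeroLengthIntArray = {};
 
 class MicroBuiltinDataAllocator : public BuiltinDataAllocator {
  public:
-  explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator)
-      : memory_allocator_(memory_allocator) {}
+  explicit MicroBuiltinDataAllocator(
+      IPersistentBufferAllocator* persistent_allocator)
+      : persistent_allocator_(persistent_allocator) {}
 
   void* Allocate(size_t size, size_t alignment_hint) override {
-    return memory_allocator_->AllocateFromTail(size, alignment_hint);
+    return persistent_allocator_->AllocatePersistentBuffer(size,
+                                                           alignment_hint);
   }
   void Deallocate(void* data) override {
     // Do not deallocate, builtin data needs to be available for the life time
@@ -77,169 +70,9 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator {
   TF_LITE_REMOVE_VIRTUAL_DELETE
 
  private:
-  SimpleMemoryAllocator* memory_allocator_;
+  IPersistentBufferAllocator* persistent_allocator_;
 };
 
-// A helper class to construct AllocationInfo array.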
-// This array contains the
-// lifetime of tensors / scratch_buffer and will be used to calculate the memory
-// plan. Methods need to be called in order from `Init`, `Add*`, to
-// `Finish`.
-class AllocationInfoBuilder {
- public:
-  AllocationInfoBuilder(AllocationInfo* info, size_t tensor_count,
-                        size_t scratch_buffer_count, ErrorReporter* reporter)
-      : info_(info),
-        tensor_count_(tensor_count),
-        buffer_count_(scratch_buffer_count),
-        reporter_(reporter) {}
-
-  // Check if model contains offline planned buffer offsets.
-  //  - If there's no metadata available, offline_planner_offsets is not set
-  //  - If there's metadata available, offline_planner_offsets will point to the
-  //    first offset in the metadata buffer list.
-  TfLiteStatus GetOfflinePlannedOffsets(
-      const Model* model, const int32_t** offline_planner_offsets);
-
-  // Add allocaiton information for the tensors.
-  TfLiteStatus AddTensors(const SubGraph* subgraph,
-                          const int32_t* offline_offsets,
-                          TfLiteEvalTensor* eval_tensors);
-
-  // Add allocation information for the scratch buffers.
-  TfLiteStatus AddScratchBuffers(
-      internal::ScratchBufferRequest* scratch_buffer_requests,
-      ScratchBufferHandle* scratch_buffer_handles);
-
-  // Returns a pointer to the built AllocationInfo array.
-  const AllocationInfo* Finish() const { return info_; }
-
- private:
-  AllocationInfo* info_ = nullptr;
-  size_t tensor_count_ = 0;
-  size_t buffer_count_ = 0;
-  ErrorReporter* reporter_ = nullptr;
-};
-
-TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph,
-                                               const int32_t* offline_offsets,
-                                               TfLiteEvalTensor* eval_tensors) {
-  TFLITE_DCHECK(eval_tensors != nullptr);
-
-  // Set up allocation info for all tensors.
-  for (size_t i = 0; i < tensor_count_; ++i) {
-    AllocationInfo* current = &info_[i];
-    current->output_ptr = &(eval_tensors[i].data.data);
-
-    TF_LITE_ENSURE_STATUS(
-        TfLiteEvalTensorByteLength(&eval_tensors[i], &current->bytes));
-
-    current->first_created = -1;
-    current->last_used = -1;
-    current->needs_allocating = (eval_tensors[i].data.data == nullptr) &&
-                                (!subgraph->tensors()->Get(i)->is_variable());
-    if (offline_offsets) {
-      current->offline_offset = offline_offsets[i];
-    } else {
-      current->offline_offset = kOnlinePlannedBuffer;
-    }
-  }
-
-  uint32_t operators_size = NumSubgraphOperators(subgraph);
-
-  for (size_t i = 0;
-       subgraph->inputs() != nullptr && i < subgraph->inputs()->size(); ++i) {
-    const int tensor_index = subgraph->inputs()->Get(i);
-    AllocationInfo* current = &info_[tensor_index];
-    current->first_created = 0;
-  }
-
-  // Mark all outputs as persistent to the end of the invocation.
-  for (size_t i = 0;
-       subgraph->outputs() != nullptr && i < subgraph->outputs()->size(); ++i) {
-    const int tensor_index = subgraph->outputs()->Get(i);
-    AllocationInfo* current = &info_[tensor_index];
-    current->last_used = operators_size - 1;
-  }
-
-  // Figure out when the first and last use of each tensor is.
-  for (int i = (operators_size - 1); i >= 0; --i) {
-    const auto* op = subgraph->operators()->Get(i);
-    for (size_t n = 0; op->inputs() != nullptr && n < op->inputs()->size();
-         ++n) {
-      const int tensor_index = op->inputs()->Get(n);
-      AllocationInfo* current = &info_[tensor_index];
-      if (((current->last_used == -1) || (current->last_used < i))) {
-        current->last_used = i;
-      }
-    }
-    for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size();
-         ++n) {
-      const int tensor_index = op->outputs()->Get(n);
-      AllocationInfo* current = &info_[tensor_index];
-      if ((current->first_created == -1) || (current->first_created > i)) {
-        current->first_created = i;
-      }
-      // Since operator outputs are written to, they must be marked as used.
-      if ((current->last_used == -1) || (current->last_used < i)) {
-        current->last_used = i;
-      }
-    }
-  }
-  return kTfLiteOk;
-}
-
-// Get offline tensors allocation plan. See
-// micro/docs/memory_management.md for more info.
-TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets(
-    const Model* model, const int32_t** offline_planner_offsets) {
-  if (model->metadata()) {
-    for (size_t i = 0; i < model->metadata()->size(); ++i) {
-      auto metadata = model->metadata()->Get(i);
-      if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
-                  strlen(kOfflineMemAllocMetadata)) == 0) {
-        const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
-            model->buffers();
-        auto* buffer = (*buffers)[metadata->buffer()];
-        auto* array = buffer->data();
-        const uint32_t* metadata_buffer =
-            reinterpret_cast<const uint32_t*>(array->data());
-        const size_t nbr_tensors = static_cast<size_t>(metadata_buffer[2]);
-        *offline_planner_offsets =
-            reinterpret_cast<const int32_t*>(&metadata_buffer[3]);
-
-        if (tensor_count_ != nbr_tensors) {
-          TF_LITE_REPORT_ERROR(reporter_,
-                               "Nbr of offline buffer offsets (%d) in metadata "
-                               "not equal nbr tensors (%d)\n",
-                               nbr_tensors, tensor_count_);
-          return kTfLiteError;
-        }
-      }
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus AllocationInfoBuilder::AddScratchBuffers(
-    internal::ScratchBufferRequest* scratch_buffer_requests,
-    ScratchBufferHandle* scratch_buffer_handles) {
-  // Set up allocation info for buffers.
-  for (size_t i = tensor_count_; i < tensor_count_ + buffer_count_; ++i) {
-    internal::ScratchBufferRequest* current_request =
-        &(scratch_buffer_requests[i - tensor_count_]);
-    ScratchBufferHandle* current_handle =
-        &(scratch_buffer_handles[i - tensor_count_]);
-
-    AllocationInfo* current = &info_[i];
-    current->output_ptr = reinterpret_cast<void**>(&current_handle->data);
-    current->bytes = current_request->bytes;
-    current->first_created = current_request->node_idx;
-    current->last_used = current_request->node_idx;
-    current->offline_offset = kOnlinePlannedBuffer;
-    current->needs_allocating = true;
-  }
-  return kTfLiteOk;
-}
-
 TfLiteStatus CreatePlan(ErrorReporter* error_reporter,
                         MicroMemoryPlanner* planner,
                         const AllocationInfo* allocation_info,
@@ -282,6 +115,7 @@ TfLiteStatus CommitPlan(ErrorReporter* error_reporter,
   }
   return kTfLiteOk;
 }
+
 }  // namespace
 
 namespace internal {
@@ -319,8 +153,9 @@ void* GetFlatbufferTensorBuffer(
 }
 
 TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
-    SimpleMemoryAllocator* allocator, bool allocate_temp,
-    const tflite::Tensor& flatbuffer_tensor,
+    IPersistentBufferAllocator* persistent_buffer_allocator,
+    INonPersistentBufferAllocator* non_persistent_buffer_allocator,
+    bool allocate_temp, const tflite::Tensor& flatbuffer_tensor,
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     ErrorReporter* error_reporter, TfLiteTensor* result) {
   TFLITE_DCHECK(result != nullptr);
@@ -385,10 +220,11 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
   TfLiteAffineQuantization* quantization =
       allocate_temp
           ? reinterpret_cast<TfLiteAffineQuantization*>(
-                allocator->AllocateTemp(sizeof(TfLiteAffineQuantization),
-                                        alignof(TfLiteAffineQuantization)))
+                non_persistent_buffer_allocator->AllocateTemp(
+                    sizeof(TfLiteAffineQuantization),
+                    alignof(TfLiteAffineQuantization)))
           : reinterpret_cast<TfLiteAffineQuantization*>(
-                allocator->AllocateFromTail(
+                persistent_buffer_allocator->AllocatePersistentBuffer(
                     sizeof(TfLiteAffineQuantization),
                     alignof(TfLiteAffineQuantization)));
   if (quantization == nullptr) {
@@ -402,12 +238,14 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
   // zero_point is stored as a int64_t.
   quantization->zero_point =
       allocate_temp
-          ? reinterpret_cast<TfLiteIntArray*>(allocator->AllocateTemp(
-                TfLiteIntArrayGetSizeInBytes(channels),
-                alignof(TfLiteIntArray)))
-          : reinterpret_cast<TfLiteIntArray*>(allocator->AllocateFromTail(
-                TfLiteIntArrayGetSizeInBytes(channels),
-                alignof(TfLiteIntArray)));
+          ? reinterpret_cast<TfLiteIntArray*>(
+                non_persistent_buffer_allocator->AllocateTemp(
+                    TfLiteIntArrayGetSizeInBytes(channels),
+                    alignof(TfLiteIntArray)))
+          : reinterpret_cast<TfLiteIntArray*>(
+                persistent_buffer_allocator->AllocatePersistentBuffer(
+                    TfLiteIntArrayGetSizeInBytes(channels),
+                    alignof(TfLiteIntArray)));
   if (quantization->zero_point == nullptr) {
     TF_LITE_REPORT_ERROR(error_reporter,
                          "Unable to allocate quantization->zero_point.\n");
@@ -437,7 +275,7 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
 }
 
 TfLiteStatus InitializeTfLiteEvalTensorFromFlatbuffer(
-    SimpleMemoryAllocator* allocator, const tflite::Tensor& flatbuffer_tensor,
+    const tflite::Tensor& flatbuffer_tensor,
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     ErrorReporter* error_reporter, TfLiteEvalTensor* result) {
   *result = {};
@@ -480,7 +318,8 @@ size_t MicroAllocator::GetDefaultTailUsage(bool is_memory_planner_given) {
 MicroAllocator::MicroAllocator(SimpleMemoryAllocator* memory_allocator,
                                MicroMemoryPlanner* memory_planner,
                                ErrorReporter* error_reporter)
-    : memory_allocator_(memory_allocator),
+    : non_persistent_buffer_allocator_(memory_allocator),
+      persistent_buffer_allocator_(memory_allocator),
       memory_planner_(memory_planner),
       error_reporter_(error_reporter),
       model_is_allocating_(false) {}
@@ -509,7 +348,7 @@ MicroAllocator* MicroAllocator::Create(uint8_t* tensor_arena, size_t arena_size,
   // By default create GreedyMemoryPlanner.
   // If a different MemoryPlanner is needed, use the other api.
-  uint8_t* memory_planner_buffer = memory_allocator->AllocateFromTail(
+  uint8_t* memory_planner_buffer = memory_allocator->AllocatePersistentBuffer(
       sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner));
   GreedyMemoryPlanner* memory_planner =
       new (memory_planner_buffer) GreedyMemoryPlanner();
@@ -524,7 +363,7 @@ MicroAllocator* MicroAllocator::Create(SimpleMemoryAllocator* memory_allocator,
   TFLITE_DCHECK(error_reporter != nullptr);
   TFLITE_DCHECK(memory_planner != nullptr);
 
-  uint8_t* allocator_buffer = memory_allocator->AllocateFromTail(
+  uint8_t* allocator_buffer = memory_allocator->AllocatePersistentBuffer(
       sizeof(MicroAllocator), alignof(MicroAllocator));
   MicroAllocator* allocator = new (allocator_buffer)
       MicroAllocator(memory_allocator, memory_planner, error_reporter);
@@ -543,10 +382,12 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) {
 
   model_is_allocating_ = true;
 
-  uint8_t* data_allocator_buffer = memory_allocator_->AllocateFromTail(
-      sizeof(MicroBuiltinDataAllocator), alignof(MicroBuiltinDataAllocator));
-  builtin_data_allocator_ =
-      new (data_allocator_buffer) MicroBuiltinDataAllocator(memory_allocator_);
+  uint8_t* data_allocator_buffer =
+      persistent_buffer_allocator_->AllocatePersistentBuffer(
+          sizeof(MicroBuiltinDataAllocator),
+          alignof(MicroBuiltinDataAllocator));
+  builtin_data_allocator_ = new (data_allocator_buffer)
+      MicroBuiltinDataAllocator(persistent_buffer_allocator_);
 
   if (InitScratchBufferData() != kTfLiteOk) {
     return nullptr;
@@ -554,7 +395,7 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) {
 
   // Allocate struct to store eval tensors, nodes and registrations.
   SubgraphAllocations* output = reinterpret_cast<SubgraphAllocations*>(
-      memory_allocator_->AllocateFromTail(
+      persistent_buffer_allocator_->AllocatePersistentBuffer(
           sizeof(SubgraphAllocations) * model->subgraphs()->size(),
           alignof(SubgraphAllocations)));
   if (output == nullptr) {
@@ -579,7 +420,7 @@ TfLiteStatus MicroAllocator::FinishModelAllocation(
     return kTfLiteError;
   }
 
-  // TODO(b/187993197): Track scratch buffers for each subgraph.
+  // Allocate scratch buffer metadata and buffers for variable tensors.
   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size();
        subgraph_idx++) {
     const SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx);
@@ -587,19 +428,20 @@ TfLiteStatus MicroAllocator::FinishModelAllocation(
     TF_LITE_ENSURE_STATUS(AllocateScratchBufferHandles(
         scratch_buffer_handles, scratch_buffer_request_count_));
-    TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(
-        model, subgraph_allocations[subgraph_idx].tensors,
-        *scratch_buffer_handles, subgraph_idx));
     TF_LITE_ENSURE_STATUS(AllocateVariables(
         subgraph, subgraph_allocations[subgraph_idx].tensors));
   }
+
+  // Plan all subgraphs and scratch buffers together.
+  TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(model, subgraph_allocations,
+                                               *scratch_buffer_handles));
   model_is_allocating_ = false;
   return kTfLiteOk;
 }
 
 void* MicroAllocator::AllocatePersistentBuffer(size_t bytes) {
-  return memory_allocator_->AllocateFromTail(bytes,
-                                             MicroArenaBufferAlignment());
+  return persistent_buffer_allocator_->AllocatePersistentBuffer(
+      bytes, MicroArenaBufferAlignment());
 }
 
 TfLiteStatus MicroAllocator::RequestScratchBufferInArena(size_t bytes,
@@ -635,6 +477,7 @@ TfLiteStatus MicroAllocator::RequestScratchBufferInArena(size_t bytes,
   // allocating:
   current_request->bytes = bytes;
   current_request->node_idx = kUnassignedScratchBufferRequestIndex;
+  current_request->subgraph_idx = subgraph_idx;
 
   // Assign the current request index to the out-param:
   *buffer_idx = scratch_buffer_request_count_;
@@ -647,7 +490,7 @@ TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) {
   // When a node has finished preparing, all temp allocations performed by the
   // kernel should be cleaned up:
-  ResetTempAllocations();
+  TF_LITE_ENSURE_STATUS(ResetTempAllocations());
 
   // Find and update any new scratch buffer requests for the current node:
   internal::ScratchBufferRequest* requests = GetScratchBufferRequests();
@@ -665,7 +508,8 @@ TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) {
 
   // Ensure that the head is re-adjusted to allow for another at-most
   // kMaxScratchBuffersPerOp scratch buffer requests in the next operator:
-  TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize(
+  TF_LITE_ENSURE_STATUS(non_persistent_buffer_allocator_->ResizeBuffer(
+      scratch_buffer_head_,
       sizeof(internal::ScratchBufferRequest) *
           (scratch_buffer_request_count_ + kMaxScratchBuffersPerOp),
       alignof(internal::ScratchBufferRequest)));
@@ -674,7 +518,8 @@ TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) {
 }
 
 size_t MicroAllocator::used_bytes() const {
-  return memory_allocator_->GetUsedBytes();
+  return non_persistent_buffer_allocator_->GetNonPersistentUsedBytes() +
+         persistent_buffer_allocator_->GetPersistentUsedBytes();
 }
 
 TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations(
@@ -690,7 +535,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations(
   // Initialize NodeAndRegistrations for the subgraph.
   NodeAndRegistration* output = reinterpret_cast<NodeAndRegistration*>(
-      memory_allocator_->AllocateFromTail(
+      persistent_buffer_allocator_->AllocatePersistentBuffer(
           sizeof(NodeAndRegistration) * operators_size,
           alignof(NodeAndRegistration)));
   if (output == nullptr) {
@@ -703,6 +548,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations(
   }
   return kTfLiteOk;
 }
+
 TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensor(
     const Model* model, const SubgraphAllocations* subgraph_allocations,
     int tensor_index, int subgraph_index) {
@@ -740,6 +586,30 @@ TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensor(
   return tensor;
 }
 
+void MicroAllocator::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) {
+  TFLITE_DCHECK(tensor != nullptr);
+
+  if (tensor->quantization.type == kTfLiteAffineQuantization) {
+    TFLITE_DCHECK(tensor->quantization.params != nullptr);
+    TfLiteAffineQuantization* quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            tensor->quantization.params);
+
+    non_persistent_buffer_allocator_->DeallocateTemp(
+        reinterpret_cast<uint8_t*>(quantization->zero_point));
+    non_persistent_buffer_allocator_->DeallocateTemp(
+        reinterpret_cast<uint8_t*>(quantization));
+  }
+
+  // Clear the data in case someone still accesses the tensor arena by mistake.
+  tensor->quantization.type = kTfLiteNoQuantization;
+  tensor->quantization.params = nullptr;
+  tensor->data.data = nullptr;
+  tensor->dims = nullptr;
+  non_persistent_buffer_allocator_->DeallocateTemp(
+      reinterpret_cast<uint8_t*>(tensor));
+}
+
 TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor(
     const Model* model, const SubgraphAllocations* subgraph_allocations,
     int tensor_index, int subgraph_index) {
@@ -749,9 +619,9 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor(
   // This value is allocated from temporary arena space. It is guaranteed to be
   // around for at least the scope of the calling function. Since this struct
   // allocation takes place in temp space, no need to own or cleanup.
-  TfLiteTensor* tensor =
-      reinterpret_cast<TfLiteTensor*>(memory_allocator_->AllocateTemp(
-          sizeof(TfLiteTensor), alignof(TfLiteTensor)));
+  TfLiteTensor* tensor = reinterpret_cast<TfLiteTensor*>(
+      non_persistent_buffer_allocator_->AllocateTemp(sizeof(TfLiteTensor),
+                                                     alignof(TfLiteTensor)));
 
   // Populate any fields from the flatbuffer, since this TfLiteTensor struct is
   // allocated in the temp section of the arena, ensure that additional
@@ -780,8 +650,12 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor(
   return tensor;
 }
 
-void MicroAllocator::ResetTempAllocations() {
-  memory_allocator_->ResetTempAllocations();
+TfLiteStatus MicroAllocator::ResetTempAllocations() {
+  return non_persistent_buffer_allocator_->ResetTempAllocations();
+}
+
+bool MicroAllocator::IsAllTempDeallocated() {
+  return non_persistent_buffer_allocator_->IsAllTempDeallocated();
 }
 
 TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors(
@@ -794,8 +668,8 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors(
     TFLITE_DCHECK(subgraph != nullptr);
 
     size_t alloc_count = subgraph->tensors()->size();
-    TfLiteEvalTensor* tensors =
-        reinterpret_cast<TfLiteEvalTensor*>(memory_allocator_->AllocateFromTail(
+    TfLiteEvalTensor* tensors = reinterpret_cast<TfLiteEvalTensor*>(
+        persistent_buffer_allocator_->AllocatePersistentBuffer(
             sizeof(TfLiteEvalTensor) * alloc_count, alignof(TfLiteEvalTensor)));
     if (tensors == nullptr) {
       TF_LITE_REPORT_ERROR(
@@ -808,8 +682,8 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors(
 
     for (size_t i = 0; i < alloc_count; ++i) {
       TfLiteStatus status = internal::InitializeTfLiteEvalTensorFromFlatbuffer(
-          memory_allocator_, *subgraph->tensors()->Get(i), model->buffers(),
-          error_reporter_, &tensors[i]);
+          *subgraph->tensors()->Get(i), model->buffers(), error_reporter_,
+          &tensors[i]);
       if (status != kTfLiteOk) {
         TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d",
                              i);
@@ -820,6 +694,7 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors(
   }
   return kTfLiteOk;
 }
+
 TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph,
                                                TfLiteEvalTensor* eval_tensors) {
   for (size_t i = 0; i < subgraph->tensors()->size(); ++i) {
@@ -829,8 +704,9 @@ TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph,
       TF_LITE_ENSURE_STATUS(
           TfLiteEvalTensorByteLength(&eval_tensors[i], &buffer_size));
 
-      eval_tensors[i].data.data = memory_allocator_->AllocateFromTail(
-          buffer_size, MicroArenaBufferAlignment());
+      eval_tensors[i].data.data =
+          persistent_buffer_allocator_->AllocatePersistentBuffer(
+              buffer_size, MicroArenaBufferAlignment());
 
       if (eval_tensors[i].data.data == nullptr) {
         TF_LITE_REPORT_ERROR(error_reporter_,
@@ -844,8 +720,9 @@ TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph,
 }
 
 TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensorInternal() {
-  return reinterpret_cast<TfLiteTensor*>(memory_allocator_->AllocateFromTail(
-      sizeof(TfLiteTensor), alignof(TfLiteTensor)));
+  return reinterpret_cast<TfLiteTensor*>(
+      persistent_buffer_allocator_->AllocatePersistentBuffer(
+          sizeof(TfLiteTensor), alignof(TfLiteTensor)));
 }
 
 TfLiteStatus MicroAllocator::PopulateTfLiteTensorFromFlatbuffer(
@@ -855,7 +732,8 @@ TfLiteStatus MicroAllocator::PopulateTfLiteTensorFromFlatbuffer(
   // allocations in the tail can be recorded. Once the interpreter has APIs for
   // accessing buffers on TfLiteEvalTensor this method can be dropped.
  return internal::InitializeTfLiteTensorFromFlatbuffer(
-      memory_allocator_, allocate_temp,
+      persistent_buffer_allocator_, non_persistent_buffer_allocator_,
+      allocate_temp,
       *model->subgraphs()->Get(subgraph_idx)->tensors()->Get(tensor_index),
       model->buffers(), error_reporter_, tensor);
 }
@@ -865,8 +743,8 @@ ErrorReporter* MicroAllocator::error_reporter() const {
 }
 
 TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(
-    const Model* model, TfLiteEvalTensor* eval_tensors,
-    ScratchBufferHandle* scratch_buffer_handles, int subgraph_idx) {
+    const Model* model, SubgraphAllocations* allocations,
+    ScratchBufferHandle* scratch_buffer_handles) {
   size_t head_usage = 0;
   // Create static memory plan
   // 1. Calculate AllocationInfo to know the lifetime of each tensor/buffer.
@@ -878,69 +756,52 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(
   // allocated from the temp section and cleaned up at the bottom of this
   // function.
 
-  const SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx);
-  size_t allocation_info_count =
-      subgraph->tensors()->size() + scratch_buffer_request_count_;
-  size_t bytes = sizeof(AllocationInfo) * allocation_info_count;
-
-  // Allocate an array of AllocationInfo structs from the temp section. This
-  // struct will be used by AllocationInfoBuilder to find buffer usage.
-  AllocationInfo* allocation_info = reinterpret_cast<AllocationInfo*>(
-      memory_allocator_->AllocateTemp(bytes, alignof(AllocationInfo)));
-  if (allocation_info == nullptr) {
-    TF_LITE_REPORT_ERROR(
-        error_reporter_,
-        "Failed to allocate memory for allocation_info, %d bytes required",
-        bytes);
-    return kTfLiteError;
-  }
-
   // Use the AllocationInfoBuilder class to help determine where buffers are
   // used in the subgraph.
-  AllocationInfoBuilder builder(allocation_info, subgraph->tensors()->size(),
-                                scratch_buffer_request_count_, error_reporter_);
+  AllocationInfoBuilder builder(model, non_persistent_buffer_allocator_,
+                                error_reporter_);
+  TF_LITE_ENSURE_STATUS(
+      builder.CreateAllocationInfo(scratch_buffer_request_count_));
 
   const int32_t* offline_planner_offsets = nullptr;
   TF_LITE_ENSURE_STATUS(
-      builder.GetOfflinePlannedOffsets(model, &offline_planner_offsets));
+      builder.GetOfflinePlannedOffsets(&offline_planner_offsets));
   TF_LITE_ENSURE_STATUS(
-      builder.AddTensors(subgraph, offline_planner_offsets, eval_tensors));
+      builder.InitializeAllocationInfo(offline_planner_offsets, allocations));
 
   internal::ScratchBufferRequest* scratch_buffer_requests =
       GetScratchBufferRequests();
-
-  TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_requests,
-                                                  scratch_buffer_handles));
+  TF_LITE_ENSURE_STATUS(builder.MarkAllocationLifetimes(
+      0, scratch_buffer_requests, scratch_buffer_handles, allocations));
+  int allocation_info_count = builder.AllocationCount();
+  AllocationInfo* allocation_info = builder.Finish();
 
   // Remaining arena size that memory planner can use for calculating offsets.
  size_t remaining_arena_size =
-      memory_allocator_->GetAvailableMemory(MicroArenaBufferAlignment());
-  uint8_t* planner_arena = memory_allocator_->AllocateTemp(
+      non_persistent_buffer_allocator_->GetAvailableMemory(
+          MicroArenaBufferAlignment());
+  uint8_t* planner_arena = non_persistent_buffer_allocator_->AllocateTemp(
       remaining_arena_size, MicroArenaBufferAlignment());
   TF_LITE_ENSURE(error_reporter_, planner_arena != nullptr);
   memory_planner_->Init(planner_arena, remaining_arena_size);
   TF_LITE_ENSURE_STATUS(CreatePlan(error_reporter_, memory_planner_,
                                    allocation_info, allocation_info_count));
 
-  // Reset all temp allocations used above:
-  memory_allocator_->ResetTempAllocations();
-
-  size_t actual_available_arena_size =
-      memory_allocator_->GetAvailableMemory(MicroArenaBufferAlignment());
-
-  // Make sure we have enough arena size.
-  if (memory_planner_->GetMaximumMemorySize() > actual_available_arena_size) {
-    TF_LITE_REPORT_ERROR(
-        error_reporter_,
-        "Arena size is too small for all buffers. Needed %u but only "
-        "%u was available.",
-        memory_planner_->GetMaximumMemorySize(), actual_available_arena_size);
-    return kTfLiteError;
-  }
   // Commit the plan.
-  TF_LITE_ENSURE_STATUS(CommitPlan(error_reporter_, memory_planner_,
-                                   memory_allocator_->GetHeadBuffer(),
-                                   allocation_info, allocation_info_count));
+  TF_LITE_ENSURE_STATUS(
+      CommitPlan(error_reporter_, memory_planner_,
+                 non_persistent_buffer_allocator_->GetOverlayMemoryAddress(),
+                 allocation_info, allocation_info_count));
+
+  // Reset all temp allocations used above:
+  builder.FreeAllocationInfo();
+  non_persistent_buffer_allocator_->DeallocateTemp(planner_arena);
+  TF_LITE_ENSURE_STATUS(
+      non_persistent_buffer_allocator_->ResetTempAllocations());
+  TF_LITE_ENSURE_STATUS(
+      non_persistent_buffer_allocator_->DeallocateResizableBuffer(
+          scratch_buffer_head_));
+
 #ifdef TF_LITE_SHOW_MEMORY_USE
   memory_planner_->PrintMemoryPlan();
 #endif
@@ -958,8 +819,9 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(
   // The head is used for storing scratch buffer allocations before finalizing a
   // memory plan in this function. Ensure that the head is set to the largest
   // memory plan sent through the allocator:
-  TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize(
-      max_head_buffer_usage_, MicroArenaBufferAlignment()));
+  TF_LITE_ENSURE_STATUS(
+      non_persistent_buffer_allocator_->ReserveNonPersistentOverlayMemory(
+          max_head_buffer_usage_, MicroArenaBufferAlignment()));
   return kTfLiteOk;
 }
 
@@ -975,7 +837,7 @@ TfLiteStatus MicroAllocator::AllocateScratchBufferHandles(
 
   // Allocate a consecutive block of memory to store the scratch buffer handles.
   // This alignment ensures quick lookup during inference time for the model:
   *scratch_buffer_handles = reinterpret_cast<ScratchBufferHandle*>(
-      memory_allocator_->AllocateFromTail(
+      persistent_buffer_allocator_->AllocatePersistentBuffer(
           sizeof(ScratchBufferHandle) * handle_count,
           alignof(ScratchBufferHandle)));
 
@@ -990,17 +852,20 @@ TfLiteStatus MicroAllocator::InitScratchBufferData() {
   // All requests will be stored in the head section. Each kernel is allowed at
   // most kMaxScratchBuffersPerOp requests.
  // Adjust the head to reserve at most that many requests to begin:
-  TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize(
-      sizeof(internal::ScratchBufferRequest) * kMaxScratchBuffersPerOp,
-      alignof(internal::ScratchBufferRequest)));
+  scratch_buffer_head_ =
+      non_persistent_buffer_allocator_->AllocateResizableBuffer(
+          sizeof(internal::ScratchBufferRequest) * kMaxScratchBuffersPerOp,
+          alignof(internal::ScratchBufferRequest));
+  if (scratch_buffer_head_ == nullptr) {
+    return kTfLiteError;
+  }
 
   return kTfLiteOk;
 }
 
 internal::ScratchBufferRequest* MicroAllocator::GetScratchBufferRequests() {
-  return reinterpret_cast<internal::ScratchBufferRequest*>(
-      AlignPointerUp(memory_allocator_->GetHeadBuffer(),
-                     alignof(internal::ScratchBufferRequest)));
+  return reinterpret_cast<internal::ScratchBufferRequest*>(AlignPointerUp(
+      scratch_buffer_head_, alignof(internal::ScratchBufferRequest)));
 }
 
 BuiltinDataAllocator* MicroAllocator::GetBuiltinDataAllocator() {
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h
index f76fe29f..35b07f16 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h
@@ -21,10 +21,10 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/micro/arena_allocator/simple_memory_allocator.h"
 #include "tensorflow/lite/micro/compatibility.h"
 #include "tensorflow/lite/micro/flatbuffer_utils.h"
 #include "tensorflow/lite/micro/memory_planner/micro_memory_planner.h"
-#include "tensorflow/lite/micro/simple_memory_allocator.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
@@ -38,8 +38,9 @@ namespace internal {
 // TODO(b/162311891): Drop this method when the interpreter has an API for
 // returning buffers on TfLiteEvalTensor.
 TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
-    SimpleMemoryAllocator* allocator, bool allocate_temp,
-    const tflite::Tensor& flatbuffer_tensor,
+    IPersistentBufferAllocator* persistent_buffer_allocator,
+    INonPersistentBufferAllocator* non_persistent_buffer_allocator,
+    bool allocate_temp, const tflite::Tensor& flatbuffer_tensor,
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     ErrorReporter* error_reporter, TfLiteTensor* result);
 
@@ -61,6 +62,7 @@ typedef struct {
   // determine the lifetime of the buffer. In AllocationInfo, this buffer will
   // have `before` = node_idx and `after` = node_idx.
   int node_idx;
+  int subgraph_idx;
 } ScratchBufferRequest;
 
 }  // namespace internal
@@ -185,10 +187,16 @@ class MicroAllocator {
       const Model* model, const SubgraphAllocations* subgraph_allocations,
       int tensor_index, int subgraph_index);
 
+  virtual void DeallocateTempTfLiteTensor(TfLiteTensor*);
+
   // Resets all temporary allocations. This method should be called after a
   // chain of temp allocations (e.g. chain of TfLiteTensor objects via
   // AllocateTfLiteTensor()).
-  virtual void ResetTempAllocations();
+  virtual TfLiteStatus ResetTempAllocations();
+
+  // Returns true if all temporary buffers including temp TfLiteTensor are
+  // already deallocated.
+  virtual bool IsAllTempDeallocated();
 
   // Allocates a persistent buffer which has the same lifetime as the allocator.
   // The memory is immediately available and is allocated from the tail of the
@@ -260,8 +268,8 @@ class MicroAllocator {
   // ScratchBufferHandle structs that will point to allocated buffers also in
   // the head section.
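// For illustration only: the request/fetch pattern these scratch-buffer APIs
// serve on the kernel side; `op_data` and the byte count are hypothetical:
//
//   // Prepare stage: only an index is handed back; memory is planned later.
//   TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
//       context, /*bytes=*/1024, &op_data->scratch_idx));
//   // Eval stage: the index resolves to an address inside the overlay arena.
//   void* scratch = context->GetScratchBuffer(context, op_data->scratch_idx);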
  virtual TfLiteStatus CommitStaticMemoryPlan(
-      const Model* model, TfLiteEvalTensor* eval_tensors,
-      ScratchBufferHandle* scratch_buffer_handles, int subgraph_idx);
+      const Model* model, SubgraphAllocations* allocations,
+      ScratchBufferHandle* scratch_buffer_handles);
 
   // Allocates an array of ScratchBufferHandle structs in the tail section for a
   // given number of handles.
@@ -278,7 +286,8 @@ class MicroAllocator {
   internal::ScratchBufferRequest* GetScratchBufferRequests();
 
   // A simple memory allocator that always allocates from the arena tail or head.
-  SimpleMemoryAllocator* memory_allocator_;
+  INonPersistentBufferAllocator* non_persistent_buffer_allocator_;
+  IPersistentBufferAllocator* persistent_buffer_allocator_;
 
   // Allocator used to allocate persistent builtin data.
   BuiltinDataAllocator* builtin_data_allocator_;
@@ -293,6 +302,9 @@ class MicroAllocator {
   // section when a model is allocating.
   size_t scratch_buffer_request_count_ = 0;
 
+  // Holds the ScratchBufferRequest buffer while a model is allocating.
+  uint8_t* scratch_buffer_head_ = nullptr;
+
   // Holds the byte length of the memory plan with the largest head usage. Used
   // to ensure that multi-tenant allocations can share the head for buffers.
   size_t max_head_buffer_usage_ = 0;
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc
new file mode 100644
index 00000000..9ec694b8
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc
@@ -0,0 +1,129 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/micro_context.h"
+
+#include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+
+namespace tflite {
+MicroContext::MicroContext(MicroAllocator* allocator, const Model* model,
+                           MicroGraph* graph)
+    : allocator_(*allocator), graph_(*graph), model_(model) {}
+
+MicroContext::~MicroContext() {}
+
+void* MicroContext::AllocatePersistentBuffer(size_t bytes) {
+  return allocator_.AllocatePersistentBuffer(bytes);
+}
+
+TfLiteStatus MicroContext::RequestScratchBufferInArena(size_t bytes,
+                                                       int* buffer_idx) {
+  return allocator_.RequestScratchBufferInArena(
+      bytes, graph_.GetCurrentSubgraphIndex(), buffer_idx);
+}
+
+void* MicroContext::GetScratchBuffer(int buffer_idx) {
+  ScratchBufferHandle* handle = scratch_buffer_handles_ + buffer_idx;
+  return handle->data;
+}
+
+TfLiteTensor* MicroContext::AllocateTempTfLiteTensor(int tensor_idx) {
+  return allocator_.AllocateTempTfLiteTensor(model_, graph_.GetAllocations(),
+                                             tensor_idx,
+                                             graph_.GetCurrentSubgraphIndex());
+}
+
+int MicroContext::GetTensorIndex(int index, int max_size,
+                                 const int* tensor_indices) {
+  if (index >= 0 && index < max_size) {
+    const int tensor_index = tensor_indices[index];
+    if (tensor_index != kTfLiteOptionalTensor) {
+      return tensor_index;
+    }
+  }
+  return -1;
+}
+
+TfLiteTensor* MicroContext::AllocateTempInputTensor(const TfLiteNode* node,
+                                                    int index) {
+  const int tensor_index =
+      GetTensorIndex(index, node->inputs->size, node->inputs->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return AllocateTempTfLiteTensor(tensor_index);
+}
+
+TfLiteTensor* MicroContext::AllocateTempOutputTensor(const TfLiteNode* node,
+                                                     int index) {
+  const int tensor_index =
+      GetTensorIndex(index, node->outputs->size, node->outputs->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return AllocateTempTfLiteTensor(tensor_index);
+}
+
+TfLiteTensor* MicroContext::AllocateTempIntermediateTensor(
+    const TfLiteNode* node, int index) {
+  const int tensor_index = GetTensorIndex(index, node->intermediates->size,
+                                          node->intermediates->data);
+  if (tensor_index < 0) {
+    return nullptr;
+  }
+  return AllocateTempTfLiteTensor(tensor_index);
+}
+
+void MicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) {
+  return allocator_.DeallocateTempTfLiteTensor(tensor);
+}
+
+TfLiteEvalTensor* MicroContext::GetEvalTensor(int tensor_idx) {
+  return &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]
+              .tensors[tensor_idx];
+}
+
+void MicroContext::SetScratchBufferHandles(
+    ScratchBufferHandle* scratch_buffer_handles) {
+  scratch_buffer_handles_ = scratch_buffer_handles;
+}
+
+TfLiteStatus MicroContext::set_external_context(
+    void* external_context_payload) {
+  if (external_context_payload == nullptr ||
+      external_context_payload_ != nullptr) {
+    MicroPrintf(
+        "Attempting to set external context to %x but it was %x already",
+        external_context_payload, external_context_payload_);
+    return kTfLiteError;
+  }
+
+  external_context_payload_ = external_context_payload;
+  return kTfLiteOk;
+}
+
+void MicroContextReportOpError(struct TfLiteContext* context,
+                               const char* format, ...)
+{
+  va_list args;
+  va_start(args, format);
+  GetMicroErrorReporter()->Report(format, args);
+  va_end(args);
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h
new file mode 100644
index 00000000..e7be6544
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h
@@ -0,0 +1,161 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/micro/micro_allocator.h"
+#include "tensorflow/lite/micro/micro_graph.h"
+
+namespace tflite {
+// MicroContext is eventually going to become the API between TFLM and the
+// kernels, replacing all the functions in TfLiteContext. The end state is for
+// kernel code to look like:
+//
+//   MicroContext* micro_context = GetMicroContext(context);
+//   micro_context-><TFLM kernel API>
+class MicroContext {
+ public:
+  // Does not take any ownership, and all pointers must refer to valid objects
+  // that outlive the one constructed.
+  explicit MicroContext(MicroAllocator* allocator, const Model* model,
+                        MicroGraph* graph);
+  virtual ~MicroContext();
+
+  // Allocates a persistent buffer which has the same lifetime as the
+  // interpreter. Returns nullptr on failure.
+  // The memory is allocated from the tail.
+  // This method is only available in the Init or Prepare stage.
+  // Virtual so that it can be faked for kernel tests.
+  virtual void* AllocatePersistentBuffer(size_t bytes);
+
+  // Requests a scratch buffer in the arena through static memory planning.
+  // This method is only available in the Prepare stage; the buffer is allocated
+  // by the interpreter between the Prepare and Eval stages. In the Eval stage,
+  // the GetScratchBuffer API can be used to fetch the address.
+  // Virtual so that it can be faked for kernel tests.
+  virtual TfLiteStatus RequestScratchBufferInArena(size_t bytes,
+                                                   int* buffer_idx);
+
+  // Gets the scratch buffer pointer.
+  // This method is only available in the Eval stage.
+  // Virtual so that it can be faked for kernel tests.
+  virtual void* GetScratchBuffer(int buffer_idx);
+
+  // Returns a temporary TfLiteTensor struct for a given index.
+  // Virtual so that it can be faked for kernel tests.
+  virtual TfLiteTensor* AllocateTempTfLiteTensor(int tensor_idx);
+
+  // Returns a temporary TfLiteTensor struct for the specified input tensor of
+  // a given node. This is the recommended API over the deprecated
+  // GetInput/GetInputSafe to get a temp input tensor. The returned tensor shall
+  // be freed via calling DeallocateTempTfLiteTensor.
+  virtual TfLiteTensor* AllocateTempInputTensor(const TfLiteNode* node,
+                                                int index);
+
+  // Returns a temporary TfLiteTensor struct for the specified output tensor of
+  // a given node.
+  // This is the recommended API over the deprecated
+  // GetOutput/GetOutputSafe to get a temp output tensor. The returned tensor
+  // shall be freed via calling DeallocateTempTfLiteTensor.
+  virtual TfLiteTensor* AllocateTempOutputTensor(const TfLiteNode* node,
+                                                 int index);
+
+  // Returns a temporary TfLiteTensor struct for the specified intermediate
+  // tensor of a given node. This is the recommended API over the deprecated
+  // GetIntermediates/GetIntermediatesSafe to get a temp intermediate tensor.
+  // The returned tensor shall be freed via calling DeallocateTempTfLiteTensor.
+  virtual TfLiteTensor* AllocateTempIntermediateTensor(const TfLiteNode* node,
+                                                       int index);
+
+  // Deallocates a temp TfLiteTensor.
+  // Virtual so that it can be faked for kernel tests.
+  virtual void DeallocateTempTfLiteTensor(TfLiteTensor* tensor);
+
+  // Returns a TfLiteEvalTensor struct for a given index.
+  // Virtual so that it can be faked for kernel tests.
+  virtual TfLiteEvalTensor* GetEvalTensor(int tensor_idx);
+
+  // Does not take ownership of the pointer; the pointer must refer to a valid
+  // object that outlives this class instance.
+  // This can only be called once to set one external context.
+  TfLiteStatus set_external_context(void* external_context_payload);
+
+  void* external_context() { return external_context_payload_; }
+
+  MicroGraph& graph() { return graph_; }
+
+  // Sets the pointer to a list of ScratchBufferHandle instances.
+  // Not an API between TFLM and kernels. Primarily used by the framework for
+  // housekeeping in MicroContext.
+  void SetScratchBufferHandles(ScratchBufferHandle* scratch_buffer_handles);
+
+ private:
+  // Returns the tensor index as tensor_indices[index]. tensor_indices is of
+  // max_size. Returns -1 if index is not in the valid range of tensor_indices.
+  int GetTensorIndex(int index, int max_size, const int* tensor_indices);
+
+  MicroAllocator& allocator_;
+  MicroGraph& graph_;
+  const Model* model_;
+
+  ScratchBufferHandle* scratch_buffer_handles_ = nullptr;
+  void* external_context_payload_ = nullptr;
+
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+inline MicroContext* GetMicroContext(const struct TfLiteContext* context) {
+  return reinterpret_cast<MicroContext*>(context->impl_);
+}
+
+// Deprecated API. Prefer using the MicroContext API directly from the
+// kernels.
+// TODO(b/213010668): migrate all existing kernels to use MicroContext, delete
+// these functions, and remove corresponding members from the TfLiteContext
+// struct for TFLM.
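// For illustration only: a sketch of the kernel-side pattern this class
// enables; Prepare is the standard TFLM kernel hook, and what is done with
// `input`/`output` here is hypothetical:
//
//   TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
//     MicroContext* micro_context = GetMicroContext(context);
//     TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
//     TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
//     // ... validate shapes/types using input and output ...
//     micro_context->DeallocateTempTfLiteTensor(input);
//     micro_context->DeallocateTempTfLiteTensor(output);
//     return kTfLiteOk;
//   }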
+inline void* MicroContextAllocatePersistentBuffer(TfLiteContext* ctx,
+                                                  size_t bytes) {
+  return GetMicroContext(ctx)->AllocatePersistentBuffer(bytes);
+}
+inline TfLiteStatus MicroContextRequestScratchBufferInArena(TfLiteContext* ctx,
+                                                            size_t bytes,
+                                                            int* buffer_idx) {
+  return GetMicroContext(ctx)->RequestScratchBufferInArena(bytes, buffer_idx);
+}
+inline void* MicroContextGetScratchBuffer(TfLiteContext* ctx, int buffer_idx) {
+  return GetMicroContext(ctx)->GetScratchBuffer(buffer_idx);
+}
+inline TfLiteTensor* MicroContextGetTensor(const struct TfLiteContext* context,
+                                           int tensor_idx) {
+  return GetMicroContext(context)->AllocateTempTfLiteTensor(tensor_idx);
+}
+inline TfLiteEvalTensor* MicroContextGetEvalTensor(
+    const struct TfLiteContext* context, int tensor_idx) {
+  return GetMicroContext(context)->GetEvalTensor(tensor_idx);
+}
+inline TfLiteExternalContext* MicroContextGetExternalContext(
+    TfLiteContext* context, TfLiteExternalContextType unused) {
+  return reinterpret_cast<TfLiteExternalContext*>(
+      GetMicroContext(context)->external_context());
+}
+
+// Requests that an error be reported with format string msg.
+void MicroContextReportOpError(struct TfLiteContext* context,
+                               const char* format, ...);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc
index 0abe0173..d9b2176e 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc
@@ -209,6 +209,9 @@ TfLiteStatus MicroGraph::ResetVariableTensors() {
       }
     }
   }
+  if (resource_variables_ != nullptr) {
+    resource_variables_->ResetAll();
+  }
 
   return kTfLiteOk;
 }
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc
index 3f8c44ad..f726a5f3 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc
@@ -51,7 +51,8 @@ MicroInterpreter::MicroInterpreter(const Model* model,
       tensors_allocated_(false),
       initialization_status_(kTfLiteError),
       input_tensors_(nullptr),
-      output_tensors_(nullptr) {
+      output_tensors_(nullptr),
+      micro_context_(&allocator_, model_, &graph_) {
   Init(profiler);
 }
 
@@ -69,7 +70,8 @@ MicroInterpreter::MicroInterpreter(const Model* model,
       tensors_allocated_(false),
       initialization_status_(kTfLiteError),
       input_tensors_(nullptr),
-      output_tensors_(nullptr) {
+      output_tensors_(nullptr),
+      micro_context_(&allocator_, model_, &graph_) {
   Init(profiler);
 }
 
@@ -80,12 +82,10 @@ MicroInterpreter::~MicroInterpreter() {
 }
 
 void MicroInterpreter::Init(MicroProfiler* profiler) {
-  context_.impl_ = static_cast<void*>(this);
-  context_.ReportError = ReportOpError;
-  context_.GetTensor = GetTensor;
-  context_.ReportError = ReportOpError;
-  context_.GetTensor = GetTensor;
-  context_.GetEvalTensor = GetEvalTensor;
+  context_.impl_ = static_cast<void*>(&micro_context_);
+  context_.ReportError = MicroContextReportOpError;
+  context_.GetTensor = MicroContextGetTensor;
+  context_.GetEvalTensor = MicroContextGetEvalTensor;
   context_.profiler = profiler;
 
   initialization_status_ = kTfLiteOk;
@@ -200,18 +200,18 @@ TfLiteStatus MicroInterpreter::AllocateTensors() {
   TF_LITE_ENSURE_STATUS(PrepareNodeAndRegistrationDataFromFlatbuffer());
 
   // Only allow AllocatePersistentBuffer in Init stage.
-  context_.AllocatePersistentBuffer = AllocatePersistentBuffer;
+  context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer;
   context_.RequestScratchBufferInArena = nullptr;
   context_.GetScratchBuffer = nullptr;
-  context_.GetExecutionPlan = GetGraph;
   context_.GetExternalContext = nullptr;
 
   TF_LITE_ENSURE_STATUS(graph_.InitSubgraphs());
 
   // Both AllocatePersistentBuffer and RequestScratchBufferInArena are
   // available in Prepare stage.
-  context_.RequestScratchBufferInArena = RequestScratchBufferInArena;
-  // GetExternalContext become available in Prepare stage.
-  context_.GetExternalContext = GetExternalContext;
+  context_.RequestScratchBufferInArena =
+      MicroContextRequestScratchBufferInArena;
+  // external_context becomes available in Prepare stage.
+  context_.GetExternalContext = MicroContextGetExternalContext;
 
   TF_LITE_ENSURE_STATUS(graph_.PrepareSubgraphs());
 
@@ -219,12 +219,14 @@ TfLiteStatus MicroInterpreter::AllocateTensors() {
   // allowed. Kernels can only fetch scratch buffers via GetScratchBuffer.
   context_.AllocatePersistentBuffer = nullptr;
   context_.RequestScratchBufferInArena = nullptr;
-  context_.GetScratchBuffer = GetScratchBuffer;
+  context_.GetScratchBuffer = MicroContextGetScratchBuffer;
 
   TF_LITE_ENSURE_OK(&context_,
                     allocator_.FinishModelAllocation(
                         model_, graph_.GetAllocations(),
                         &scratch_buffer_handles_));
 
+  micro_context_.SetScratchBufferHandles(scratch_buffer_handles_);
+
   // TODO(b/162311891): Drop these allocations when the interpreter supports
   // handling buffers from TfLiteEvalTensor.
   input_tensors_ =
@@ -320,97 +322,9 @@ TfLiteStatus MicroInterpreter::ResetVariableTensors() {
   return graph_.ResetVariableTensors();
 }
 
-void* MicroInterpreter::AllocatePersistentBuffer(TfLiteContext* ctx,
-                                                 size_t bytes) {
-  return reinterpret_cast<MicroInterpreter*>(ctx->impl_)
-      ->allocator_.AllocatePersistentBuffer(bytes);
-}
-
-TfLiteStatus MicroInterpreter::RequestScratchBufferInArena(TfLiteContext* ctx,
-                                                           size_t bytes,
-                                                           int* buffer_idx) {
-  MicroInterpreter* interpreter =
-      reinterpret_cast<MicroInterpreter*>(ctx->impl_);
-  return interpreter->allocator_.RequestScratchBufferInArena(
-      bytes, interpreter->graph_.GetCurrentSubgraphIndex(), buffer_idx);
-}
-
-void* MicroInterpreter::GetScratchBuffer(TfLiteContext* ctx, int buffer_idx) {
-  MicroInterpreter* interpreter =
-      reinterpret_cast<MicroInterpreter*>(ctx->impl_);
-  ScratchBufferHandle* handle =
-      interpreter->scratch_buffer_handles_ + buffer_idx;
-  return handle->data;
-}
-
-void MicroInterpreter::ReportOpError(struct TfLiteContext* context,
-                                     const char* format, ...)
-{
-#ifndef TF_LITE_STRIP_ERROR_STRINGS
-  MicroInterpreter* interpreter =
-      static_cast<MicroInterpreter*>(context->impl_);
-  va_list args;
-  va_start(args, format);
-  TF_LITE_REPORT_ERROR(interpreter->error_reporter_, format, args);
-  va_end(args);
-#endif
-}
-
-TfLiteTensor* MicroInterpreter::GetTensor(const struct TfLiteContext* context,
-                                          int tensor_idx) {
-  MicroInterpreter* interpreter =
-      static_cast<MicroInterpreter*>(context->impl_);
-  return interpreter->allocator_.AllocateTempTfLiteTensor(
-      interpreter->model_, interpreter->graph_.GetAllocations(), tensor_idx,
-      interpreter->get_subgraph_index());
-}
-
-TfLiteEvalTensor* MicroInterpreter::GetEvalTensor(
-    const struct TfLiteContext* context, int tensor_idx) {
-  MicroInterpreter* interpreter =
-      reinterpret_cast<MicroInterpreter*>(context->impl_);
-  return &interpreter->graph_
-              .GetAllocations()[interpreter->get_subgraph_index()]
-              .tensors[tensor_idx];
-}
-
-TfLiteStatus MicroInterpreter::GetGraph(struct TfLiteContext* context,
-                                        TfLiteIntArray** args) {
-  MicroInterpreter* interpreter =
-      reinterpret_cast<MicroInterpreter*>(context->impl_);
-  *args = reinterpret_cast<TfLiteIntArray*>(&interpreter->graph_);
-  return kTfLiteOk;
-}
-
 TfLiteStatus MicroInterpreter::SetMicroExternalContext(
     void* external_context_payload) {
-  if (external_context_payload == nullptr ||
-      external_context_payload_ != nullptr) {
-    MicroPrintf(
-        "Attempting to set external context to %x but it was %x already",
-        external_context_payload, external_context_payload_);
-    return kTfLiteError;
-  }
-
-  external_context_payload_ = external_context_payload;
-  return kTfLiteOk;
-}
-
-void* MicroInterpreter::GetMicroExternalContext() {
-  return external_context_payload_;
-}
-
-// This callback is an implementation for TfLiteContext::GetExternalContext
-// interface.
-TfLiteExternalContext* MicroInterpreter::GetExternalContext(
-    TfLiteContext* context, TfLiteExternalContextType unused) {
-  // TODO(b/205754757): TfLiteExternalContextType is unused in TFLM. This
-  // function is only called by the framework as a way to conform to existing
-  // interface. Users should use GetMicroExternalContext api in kernel_util.h
-  // to get context and shall not directly use this function.
-  MicroInterpreter* interpreter =
-      reinterpret_cast<MicroInterpreter*>(context->impl_);
-  return reinterpret_cast<TfLiteExternalContext*>(
-      interpreter->GetMicroExternalContext());
+  return micro_context_.set_external_context(external_context_payload);
 }
 
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h
index 33123af6..aa409386 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/micro/micro_allocator.h"
+#include "tensorflow/lite/micro/micro_context.h"
 #include "tensorflow/lite/micro/micro_graph.h"
 #include "tensorflow/lite/micro/micro_op_resolver.h"
 #include "tensorflow/lite/micro/micro_profiler.h"
@@ -79,10 +80,6 @@ class MicroInterpreter {
   // one external context.
   TfLiteStatus SetMicroExternalContext(void* external_context_payload);
 
-  // This function is used by TfLiteContext::GetExternalContext() to get the
-  // external context.
- void* GetMicroExternalContext(); - TfLiteTensor* input(size_t index); size_t inputs_size() const { return model_->subgraphs()->Get(0)->inputs()->size(); @@ -150,26 +147,6 @@ class MicroInterpreter { // Gets the current subgraph index used from within context methods. int get_subgraph_index() { return graph_.GetCurrentSubgraphIndex(); } - // Static functions that are bound to the TfLiteContext instance: - static void* AllocatePersistentBuffer(TfLiteContext* ctx, size_t bytes); - static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* ctx, - size_t bytes, - int* buffer_idx); - static void* GetScratchBuffer(TfLiteContext* ctx, int buffer_idx); - static void ReportOpError(struct TfLiteContext* context, const char* format, - ...); - static TfLiteTensor* GetTensor(const struct TfLiteContext* context, - int tensor_idx); - static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context, - int tensor_idx); - static TfLiteStatus GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args); - - // This callback is an implementation for TfLiteContext::GetExternalContext - // interface. - static TfLiteExternalContext* GetExternalContext( - TfLiteContext* context, TfLiteExternalContextType unused); - const Model* model_; const MicroOpResolver& op_resolver_; ErrorReporter* error_reporter_; @@ -181,12 +158,13 @@ class MicroInterpreter { TfLiteStatus initialization_status_; ScratchBufferHandle* scratch_buffer_handles_ = nullptr; - void* external_context_payload_ = nullptr; // TODO(b/162311891): Clean these pointers up when this class supports buffers // from TfLiteEvalTensor. TfLiteTensor** input_tensors_; TfLiteTensor** output_tensors_; + + MicroContext micro_context_; }; } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h index 8784f29a..237bd595 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,9 +25,11 @@ limitations under the License. 
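// For illustration only: the default-argument registrations added below let a
// user keep the stock kernel or swap in an alternative TfLiteRegistration
// (e.g. a platform-optimized variant); the resolver size and the optimized
// registration name are hypothetical:
//
//   tflite::MicroMutableOpResolver<2> resolver;
//   resolver.AddAdd();  // stock kernel via the default Register_ADD()
//   resolver.AddDepthwiseConv2D(Register_MY_FAST_DEPTHWISE_CONV_2D());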
#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/kernels/conv.h" +#include "tensorflow/lite/micro/kernels/depthwise_conv.h" #include "tensorflow/lite/micro/kernels/ethosu.h" #include "tensorflow/lite/micro/kernels/fully_connected.h" #include "tensorflow/lite/micro/kernels/micro_ops.h" +#include "tensorflow/lite/micro/kernels/reduce.h" #include "tensorflow/lite/micro/kernels/softmax.h" #include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -119,8 +121,8 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseAbs); } - TfLiteStatus AddAdd() { - return AddBuiltin(BuiltinOperator_ADD, tflite::Register_ADD(), ParseAdd); + TfLiteStatus AddAdd(const TfLiteRegistration& registration = Register_ADD()) { + return AddBuiltin(BuiltinOperator_ADD, registration, ParseAdd); } TfLiteStatus AddAddN() { @@ -153,6 +155,16 @@ class MicroMutableOpResolver : public MicroOpResolver { Register_BATCH_TO_SPACE_ND(), ParseBatchToSpaceNd); } + TfLiteStatus AddBroadcastArgs() { + return AddBuiltin(BuiltinOperator_BROADCAST_ARGS, Register_BROADCAST_ARGS(), + ParseBroadcastArgs); + } + + TfLiteStatus AddBroadcastTo() { + return AddBuiltin(BuiltinOperator_BROADCAST_TO, Register_BROADCAST_TO(), + ParseBroadcastTo); + } + TfLiteStatus AddCallOnce() { return AddBuiltin(BuiltinOperator_CALL_ONCE, Register_CALL_ONCE(), ParseCallOnce); @@ -197,9 +209,10 @@ class MicroMutableOpResolver : public MicroOpResolver { tflite::Register_DEPTH_TO_SPACE(), ParseDepthToSpace); } - TfLiteStatus AddDepthwiseConv2D() { - return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, - Register_DEPTHWISE_CONV_2D(), ParseDepthwiseConv2D); + TfLiteStatus AddDepthwiseConv2D( + const TfLiteRegistration& registration = Register_DEPTHWISE_CONV_2D()) { + return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, registration, + ParseDepthwiseConv2D); } TfLiteStatus AddDequantize() { @@ -356,9 +369,13 @@ class MicroMutableOpResolver : public MicroOpResolver { tflite::Register_MAX_POOL_2D(), ParsePool); } + TfLiteStatus AddMirrorPad() { + return AddBuiltin(BuiltinOperator_MIRROR_PAD, tflite::Register_MIRROR_PAD(), + ParseMirrorPad); + } + TfLiteStatus AddMean() { - return AddBuiltin(BuiltinOperator_MEAN, tflite::ops::micro::Register_MEAN(), - ParseReducer); + return AddBuiltin(BuiltinOperator_MEAN, Register_MEAN(), ParseReducer); } TfLiteStatus AddMinimum() { @@ -411,8 +428,8 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddReduceMax() { - return AddBuiltin(BuiltinOperator_REDUCE_MAX, - tflite::ops::micro::Register_REDUCE_MAX(), ParseReducer); + return AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX(), + ParseReducer); } TfLiteStatus AddRelu() { @@ -539,10 +556,9 @@ class MicroMutableOpResolver : public MicroOpResolver { } TfLiteStatus AddUnidirectionalSequenceLSTM() { - return AddBuiltin( - BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - tflite::ops::micro::Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), - ParseUnidirectionalSequenceLSTM); + return AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, + Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), + ParseUnidirectionalSequenceLSTM); } TfLiteStatus AddVarHandle() { @@ -550,6 +566,10 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseVarHandle); } + TfLiteStatus AddWhile() { + return AddBuiltin(BuiltinOperator_WHILE, Register_WHILE(), ParseWhile); + } + TfLiteStatus AddZerosLike() { return AddBuiltin(BuiltinOperator_ZEROS_LIKE, 
                      Register_ZEROS_LIKE(), ParseZerosLike);
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.cc
index d8a86c6b..72f3d37f 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/micro/micro_profiler.h"
 
+#include <cinttypes>
 
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -38,7 +39,7 @@ void MicroProfiler::EndEvent(uint32_t event_handle) {
   end_ticks_[event_handle] = GetCurrentTimeTicks();
 }
 
-int32_t MicroProfiler::GetTotalTicks() const {
+uint32_t MicroProfiler::GetTotalTicks() const {
   int32_t ticks = 0;
   for (int i = 0; i < num_events_; ++i) {
     ticks += end_ticks_[i] - start_ticks_[i];
@@ -49,8 +50,9 @@ int32_t MicroProfiler::GetTotalTicks() const {
 void MicroProfiler::Log() const {
 #if !defined(TF_LITE_STRIP_ERROR_STRINGS)
   for (int i = 0; i < num_events_; ++i) {
-    int32_t ticks = end_ticks_[i] - start_ticks_[i];
-    MicroPrintf("%s took %d ticks (%d ms).", tags_[i], ticks, TicksToMs(ticks));
+    uint32_t ticks = end_ticks_[i] - start_ticks_[i];
+    MicroPrintf("%s took %" PRIu32 " ticks (%d ms).", tags_[i], ticks,
+                TicksToMs(ticks));
   }
 #endif
 }
@@ -59,8 +61,8 @@ void MicroProfiler::LogCsv() const {
 #if !defined(TF_LITE_STRIP_ERROR_STRINGS)
   MicroPrintf("\"Event\",\"Tag\",\"Ticks\"");
   for (int i = 0; i < num_events_; ++i) {
-    int32_t ticks = end_ticks_[i] - start_ticks_[i];
-    MicroPrintf("%d,%s,%d", i, tags_[i], ticks);
+    uint32_t ticks = end_ticks_[i] - start_ticks_[i];
+    MicroPrintf("%d,%s,%" PRIu32, i, tags_[i], ticks);
   }
 #endif
 }
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.h
index 8a1ba5de..41f41a35 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_profiler.h
@@ -51,7 +51,7 @@ class MicroProfiler {
   // Returns the sum of the ticks taken across all the events. This number
   // is only meaningful if all of the events are disjoint (the end time of
   // event[i] <= start time of event[i+1]).
-  int32_t GetTotalTicks() const;
+  uint32_t GetTotalTicks() const;
 
   // Prints the profiling information of each of the events in human readable
   // form.
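// For illustration only: a minimal profiling sketch against this class; the
// BeginEvent(tag) API is assumed from the surrounding MicroProfiler interface,
// and `interpreter` is whatever code is being measured:
//
//   tflite::MicroProfiler profiler;
//   uint32_t handle = profiler.BeginEvent("Invoke");
//   interpreter.Invoke();
//   profiler.EndEvent(handle);
//   profiler.Log();  // one human-readable line per event
//   MicroPrintf("total: %" PRIu32 " ticks", profiler.GetTotalTicks());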
@@ -68,8 +68,8 @@ class MicroProfiler {
   static constexpr int kMaxEvents = 1024;
 
   const char* tags_[kMaxEvents];
-  int32_t start_ticks_[kMaxEvents];
-  int32_t end_ticks_[kMaxEvents];
+  uint32_t start_ticks_[kMaxEvents];
+  uint32_t end_ticks_[kMaxEvents];
   int num_events_ = 0;
 
   TF_LITE_REMOVE_VIRTUAL_DELETE;
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc
index e7232f47..c4577773 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc
@@ -124,6 +124,14 @@ TfLiteStatus MicroResourceVariables::Assign(int id,
   return kTfLiteOk;
 }
 
+TfLiteStatus MicroResourceVariables::ResetAll() {
+  for (int i = 0; i < num_resource_variables_; i++) {
+    MicroResourceVariable variable = resource_variables_[i];
+    memset(variable.resource_buffer, 0, variable.bytes);
+  }
+  return kTfLiteOk;
+}
+
 int MicroResourceVariables::FindId(const char* container,
                                    const char* shared_name) {
   for (int i = 0; i < num_resource_variables_; i++) {
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h
index 0bab8e13..e8df991c 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h
@@ -51,6 +51,9 @@ class MicroResourceVariables {
   // in order to allocate the resource buffer.
   TfLiteStatus Assign(int id, const TfLiteEvalTensor* tensor);
 
+  // Zeros out all resource buffers.
+  TfLiteStatus ResetAll();
+
  private:
   int FindId(const char* container, const char* shared_name);
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_time.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_time.cc
index bbe3f1a8..2d74fdba 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_time.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_time.cc
@@ -38,21 +38,21 @@ namespace tflite {
 // for a platform to support Tensorflow Lite for Microcontrollers profiling.
 // This returns 0 by default because timing is an optional feature that builds
 // without errors on platforms that do not need it.
-int32_t ticks_per_second() { return 0; }
+uint32_t ticks_per_second() { return 0; }
 
 // Reference implementation of the GetCurrentTimeTicks() function that's
 // required for a platform to support Tensorflow Lite for Microcontrollers
 // profiling. This returns 0 by default because timing is an optional feature
 // that builds without errors on platforms that do not need it.
-int32_t GetCurrentTimeTicks() { return 0; }
+uint32_t GetCurrentTimeTicks() { return 0; }
 
 #else  // defined(TF_LITE_USE_CTIME)
 
 // For platforms that support ctime, we implement the micro_time interface in
 // this central location.
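// For illustration only: a platform that defines its own timer (instead of
// TF_LITE_USE_CTIME) would implement the uint32_t interface along these lines;
// the HAL call is hypothetical:
//
//   uint32_t ticks_per_second() { return 1000000; }  // e.g. a 1 MHz timer
//   uint32_t GetCurrentTimeTicks() { return my_hal_read_timer_us(); }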
-int32_t ticks_per_second() { return CLOCKS_PER_SEC; }
+uint32_t ticks_per_second() { return CLOCKS_PER_SEC; }
 
-int32_t GetCurrentTimeTicks() { return clock(); }
+uint32_t GetCurrentTimeTicks() { return clock(); }
 
 #endif
 
}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_time.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_time.h
index fac9069b..7a8ab455 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/micro_time.h
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_time.h
@@ -21,14 +21,14 @@ namespace tflite {
 
 // These functions should be implemented by each target platform, and provide an
 // accurate tick count along with how many ticks there are per second.
-int32_t ticks_per_second();
+uint32_t ticks_per_second();
 
 // Return time in ticks. The meaning of a tick varies per platform.
-int32_t GetCurrentTimeTicks();
+uint32_t GetCurrentTimeTicks();
 
-inline int32_t TicksToMs(int32_t ticks) {
-  return static_cast<int32_t>(1000.0f * static_cast<float>(ticks) /
-                              static_cast<float>(ticks_per_second()));
+inline uint32_t TicksToMs(int32_t ticks) {
+  return static_cast<uint32_t>(1000.0f * static_cast<float>(ticks) /
+                               static_cast<float>(ticks_per_second()));
 }
 
 }  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc b/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc
index 951664b5..bfd7605d 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc
@@ -27,12 +27,12 @@ MockMicroGraph::MockMicroGraph(SimpleMemoryAllocator* allocator)
       free_count_(0) {
   memset(invoke_counts_, 0, sizeof(invoke_counts_));
   mock_tensor_ =
-      reinterpret_cast<TfLiteEvalTensor*>(allocator_->AllocateFromTail(
+      reinterpret_cast<TfLiteEvalTensor*>(allocator_->AllocatePersistentBuffer(
          sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor)));
   int* dims_array = reinterpret_cast<int*>(
-      allocator_->AllocateFromTail(3 * sizeof(int), alignof(int)));
+      allocator_->AllocatePersistentBuffer(3 * sizeof(int), alignof(int)));
   float* data_array = reinterpret_cast<float*>(
-      allocator_->AllocateFromTail(2 * sizeof(float), alignof(float)));
+      allocator_->AllocatePersistentBuffer(2 * sizeof(float), alignof(float)));
   int dims[] = {2, 1, 2};
   memcpy(dims_array, dims, 3 * sizeof(int));
   mock_tensor_->dims = testing::IntArrayFromInts(dims_array);
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc
index 349018d4..fd84370a 100644
--- a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc
+++ b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc
@@ -17,12 +17,12 @@ limitations under the License.
#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" #include "tensorflow/lite/micro/micro_allocator.h" #include "tensorflow/lite/micro/micro_error_reporter.h" -#include "tensorflow/lite/micro/recording_simple_memory_allocator.h" namespace tflite { @@ -59,12 +59,13 @@ RecordingMicroAllocator* RecordingMicroAllocator::Create( arena_size); TFLITE_DCHECK(simple_memory_allocator != nullptr); - uint8_t* memory_planner_buffer = simple_memory_allocator->AllocateFromTail( - sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner)); + uint8_t* memory_planner_buffer = + simple_memory_allocator->AllocatePersistentBuffer( + sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner)); GreedyMemoryPlanner* memory_planner = new (memory_planner_buffer) GreedyMemoryPlanner(); - uint8_t* allocator_buffer = simple_memory_allocator->AllocateFromTail( + uint8_t* allocator_buffer = simple_memory_allocator->AllocatePersistentBuffer( sizeof(RecordingMicroAllocator), alignof(RecordingMicroAllocator)); RecordingMicroAllocator* allocator = new (allocator_buffer) RecordingMicroAllocator( @@ -108,11 +109,11 @@ void RecordingMicroAllocator::PrintAllocations() const { TF_LITE_REPORT_ERROR( error_reporter(), "[RecordingMicroAllocator] Arena allocation head %d bytes", - recording_memory_allocator_->GetHeadUsedBytes()); + recording_memory_allocator_->GetNonPersistentUsedBytes()); TF_LITE_REPORT_ERROR( error_reporter(), "[RecordingMicroAllocator] Arena allocation tail %d bytes", - recording_memory_allocator_->GetTailUsedBytes()); + recording_memory_allocator_->GetPersistentUsedBytes()); PrintRecordedAllocation(RecordedAllocationType::kTfLiteEvalTensorData, "TfLiteEvalTensor data", "allocations"); PrintRecordedAllocation(RecordedAllocationType::kPersistentTfLiteTensorData, diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.h index 6b039c03..0667287f 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_RECORDING_MICRO_ALLOCATOR_H_ #define TENSORFLOW_LITE_MICRO_RECORDING_MICRO_ALLOCATOR_H_ +#include "tensorflow/lite/micro/arena_allocator/recording_simple_memory_allocator.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/micro_allocator.h" -#include "tensorflow/lite/micro/recording_simple_memory_allocator.h" namespace tflite { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc new file mode 100644 index 00000000..c89483e1 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc @@ -0,0 +1,113 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/test_helper_custom_ops.h" + +#include +#include +#include +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/schema/schema_generated.h" + +// TODO(b/170464050): Use TFLM test only version of schema_utils. + +namespace tflite { +namespace testing { + +const TfLiteRegistration* PackerOp::getRegistration() { + return GetMutableRegistration(); +} + +TfLiteRegistration* PackerOp::GetMutableRegistration() { + static TfLiteRegistration r; + r.init = Init; + r.prepare = Prepare; + r.invoke = Invoke; + r.free = Free; + return &r; +} + +void* PackerOp::Init(TfLiteContext* context, const char* buffer, + size_t length) { + freed_ = false; + // Do nothing. + return nullptr; +} + +void PackerOp::Free(TfLiteContext* context, void* buffer) { freed_ = true; } + +TfLiteStatus PackerOp::Prepare(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + +TfLiteStatus PackerOp::Invoke(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input1 != nullptr); + const int32_t* input1_data = input1->data.i32; + TF_LITE_ENSURE_EQ(context, input1->dims->size, 1); + const int32_t input1_len = input1->dims->data[0]; + + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, input2 != nullptr); + const int32_t* input2_data = input2->data.i32; + TF_LITE_ENSURE_EQ(context, input2->dims->size, 1); + const int32_t input2_len = input2->dims->data[0]; + + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); + int32_t* output_data = output->data.i32; + int32_t output_len = output->dims->data[0]; + + // Fill output with input: first with the first tensor, then with the second + // tensor up to the size of the output tensor. 
+ int cnt = 0; + int i; + for (i = 0; i < input1_len && cnt < output_len; i++, cnt++) { + output_data[cnt] = input1_data[i]; + } + if (cnt >= output_len) { + return kTfLiteOk; + } + + for (i = 0; i < input2_len && cnt < output_len; i++, cnt++) { + output_data[cnt] = input2_data[i]; + } + if (cnt >= output_len) { + return kTfLiteOk; + } + + for (; cnt < output_len; cnt++) { + output_data[cnt] = 0; + } + return kTfLiteOk; +} + +bool PackerOp::freed_ = false; + +} // namespace testing +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h new file mode 100644 index 00000000..b8c025a7 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ +#define TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ + +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/portable_type_to_tflitetype.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace testing { + +class PackerOp { + public: + static const TfLiteRegistration* getRegistration(); + static TfLiteRegistration* GetMutableRegistration(); + static void* Init(TfLiteContext* context, const char* buffer, size_t length); + static void Free(TfLiteContext* context, void* buffer); + static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); + static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); + + private: + static bool freed_; +}; + +} // namespace testing +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc index 2a5700e6..2411bbf8 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc @@ -29,7 +29,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/micro/test_helper_custom_ops.h" #include "tensorflow/lite/schema/schema_generated.h" // TODO(b/170464050): Use TFLM test only version of schema_utils. @@ -40,7 +43,9 @@ namespace { class StackAllocator : public flatbuffers::Allocator { public: - StackAllocator() : data_(data_backing_), data_size_(0) {} + StackAllocator(size_t alignment) : data_size_(0) { + data_ = AlignPointerUp(data_backing_, alignment); + } uint8_t* allocate(size_t size) override { TFLITE_DCHECK((data_size_ + size) <= kStackAllocatorSize); @@ -52,10 +57,10 @@ class StackAllocator : public flatbuffers::Allocator { void deallocate(uint8_t* p, size_t) override {} - static StackAllocator& instance() { + static StackAllocator& instance(size_t alignment = 1) { // Avoid using true dynamic memory allocation to be portable to bare metal. static char inst_memory[sizeof(StackAllocator)]; - static StackAllocator* inst = new (inst_memory) StackAllocator; + static StackAllocator* inst = new (inst_memory) StackAllocator(alignment); return *inst; } @@ -73,7 +78,8 @@ flatbuffers::FlatBufferBuilder* BuilderInstance() { static char inst_memory[sizeof(flatbuffers::FlatBufferBuilder)]; static flatbuffers::FlatBufferBuilder* inst = new (inst_memory) flatbuffers::FlatBufferBuilder( - StackAllocator::kStackAllocatorSize, &StackAllocator::instance()); + StackAllocator::kStackAllocatorSize, + &StackAllocator::instance(MicroArenaBufferAlignment())); return inst; } @@ -104,7 +110,9 @@ class ModelBuilder { // Adds a node to the model with given input and output Tensors. 
  Node AddNode(Operator op, std::initializer_list<int32_t> inputs,
-               std::initializer_list<int32_t> outputs);
+               std::initializer_list<int32_t> outputs,
+               std::initializer_list<int32_t> intermediates =
+                   std::initializer_list<int32_t>{});
 
   void AddMetadata(const char* description_string,
                    const int32_t* metadata_buffer_data, size_t num_elements);
@@ -159,12 +167,17 @@ ModelBuilder::Operator ModelBuilder::RegisterOp(BuiltinOperator op,
 
 ModelBuilder::Node ModelBuilder::AddNode(
     ModelBuilder::Operator op, std::initializer_list<int32_t> inputs,
-    std::initializer_list<int32_t> outputs) {
+    std::initializer_list<int32_t> outputs,
+    std::initializer_list<int32_t> intermediates) {
   TFLITE_DCHECK(next_operator_id_ <= kMaxOperators);
   operators_[next_operator_id_] = tflite::CreateOperator(
       *builder_, op, builder_->CreateVector(inputs.begin(), inputs.size()),
       builder_->CreateVector(outputs.begin(), outputs.size()),
-      BuiltinOptions_NONE);
+      BuiltinOptions_NONE,
+      /*builtin_options=*/0,
+      /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS,
+      /*mutating_variable_inputs=*/0,
+      builder_->CreateVector(intermediates.begin(), intermediates.size()));
   next_operator_id_++;
   return next_operator_id_ - 1;
 }
@@ -268,9 +281,12 @@ const Model* BuildSimpleStatefulModel() {
   const int median_tensor = model_builder.AddTensor(TensorType_INT8, {3});
   const int invoke_count_tensor =
       model_builder.AddTensor(TensorType_INT32, {1});
+  const int intermediate_tensor =
+      model_builder.AddTensor(TensorType_FLOAT32, {0});
 
   model_builder.AddNode(op_id, {input_tensor},
-                        {median_tensor, invoke_count_tensor});
+                        {median_tensor, invoke_count_tensor},
+                        {intermediate_tensor});
   return model_builder.BuildModel({input_tensor},
                                   {median_tensor, invoke_count_tensor});
 }
@@ -903,6 +919,344 @@ const Model* BuildSimpleModelWithSubgraphsAndIf() {
   return model;
 }
 
+const Model* BuildSimpleModelWithSubgraphsAndWhile() {
+  using flatbuffers::Offset;
+  flatbuffers::FlatBufferBuilder* builder = BuilderInstance();
+
+  constexpr size_t buffers_size = 1;
+  const Offset<Buffer> buffers[buffers_size] = {
+      CreateBuffer(*builder),
+  };
+  const int32_t data_tensor_shape[] = {1, 1};
+  constexpr size_t while_tensors_size = 4;
+  constexpr size_t op_tensors_size = 3;
+  const Offset<Tensor> subgraph0_tensors[while_tensors_size] = {
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("input_tensor0"), 0, false),
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("input_tensor1"), 0, false),
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("output_tensor0"), 0, false),
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("output_tensor1"), 0, false),
+  };
+  const Offset<Tensor> subgraph1_tensors[op_tensors_size] = {
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("input_tensor1"), 0, false),
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("input_tensor2"), 0, false),
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_BOOL, 0,
+                   builder->CreateString("condition_tensor"), 0, false),
+  };
+  const Offset<Tensor> subgraph2_tensors[op_tensors_size] = {
+      CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1),
+                   TensorType_FLOAT32, 0,
+                   builder->CreateString("input_tensor0"), 0, false),
+      CreateTensor(*builder,
builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("output_tensor0"), 0, false), + }; + + constexpr size_t inputs_size = 2; + const int32_t inputs[inputs_size] = {0, 1}; + constexpr size_t while_outputs_size = 2; + const int32_t while_outputs[while_outputs_size] = {2, 3}; + constexpr size_t cond_outputs_size = 1; + const int32_t cond_outputs[cond_outputs_size] = {2}; + constexpr size_t add_outputs_size = 1; + const int32_t add_outputs[add_outputs_size] = {2}; + constexpr size_t add_subgraph_outputs_size = 2; + const int32_t add_subgraph_outputs[add_subgraph_outputs_size] = {2, 1}; + constexpr size_t operators_size = 1; + const Offset subgraph0_operators[operators_size] = { + CreateOperator(*builder, 0, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(while_outputs, while_outputs_size), + BuiltinOptions_WhileOptions, + CreateWhileOptions(*builder, 1, 2).Union()), + }; + const Offset subgraph1_operators[operators_size] = { + CreateOperator(*builder, 1, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(cond_outputs, cond_outputs_size), + BuiltinOptions_NONE), + }; + const Offset subgraph2_operators[operators_size] = { + CreateOperator(*builder, 2, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(add_outputs, add_outputs_size), + BuiltinOptions_NONE), + }; + constexpr size_t subgraphs_size = 3; + const Offset subgraphs[subgraphs_size] = { + CreateSubGraph(*builder, builder->CreateVector(subgraph0_tensors, 4), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(while_outputs, while_outputs_size), + builder->CreateVector(subgraph0_operators, operators_size), + builder->CreateString("while_subgraph")), + CreateSubGraph(*builder, builder->CreateVector(subgraph1_tensors, 3), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(cond_outputs, cond_outputs_size), + builder->CreateVector(subgraph1_operators, operators_size), + builder->CreateString("cond_subgraph")), + CreateSubGraph(*builder, builder->CreateVector(subgraph2_tensors, 3), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(add_subgraph_outputs, + add_subgraph_outputs_size), + builder->CreateVector(subgraph2_operators, operators_size), + builder->CreateString("body_subgraph")), + }; + constexpr size_t operator_codes_size = 3; + const Offset operator_codes[operator_codes_size] = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_WHILE), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_LESS), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_ADD), + }; + const Offset model_offset = CreateModel( + *builder, 0, builder->CreateVector(operator_codes, operator_codes_size), + builder->CreateVector(subgraphs, subgraphs_size), + builder->CreateString("test_model"), + builder->CreateVector(buffers, buffers_size)); + FinishModelBuffer(*builder, model_offset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + return model; +} + +// Build a model with If and two subgraphs: two data tensors A1 of size 2, A2 of +// size 4 are first concatenated, then cut to a new tensor A3 
of size 3; the new +// tensor A3 of size 3 is then concatenated with A2 tensor of size 4 to produce +// a final output tensor A4. This model is specially crafted to capture the +// corner case outlined in go/avoid-memory-corruption-in-if-operator. +// +// Subgraph0 +// A0(1) A2_0(4) A1_0(2) +// | | | ---+ +// v v v | +// +--------------+ | +// | IF | | +// +------+-------+ | +// | A3_0(3) | +// v | +// +--------------+ | +// | CUSTOM |<---+ +// +------+-------+ +// | +// v +// A4_0(8) +// +// Subgraph1/2 +// A1_1(2) A2_1(4) +// | | +// v v +// +---------------+ +// | CUSTOM | +// +-------+-------+ +// | +// v A3_1(3) +// +// And it leads to memory plan as below +// +// Subgraph0 Layout +// +// +// <------------A4_0 -------------> <----- A2_0-------> <----A3_0 ---> +// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ +// | | | | | | | | | 3 | 4 | 5 | 6 | | | | +// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ +// +// +----+----+----+ +// | 1 | 2 | A0 | +// +----+----+----+ +// <---A1_0--> +// +// Subgraph 1 Layout +// +// +----+----+----+----+----+----+----+----+----+ +// | | | | | | | | | | +// +----+----+----+----+----+----+----+----+----+ +// +// +// <------A2_1 -------><----A3_1 ---><--A1_1---> +// +// +// A1_1 of subgraph 1 will overlap with A2_0 of subgraph 0. +// In a buggy implementation of IF, two overwrite may happen: +// 1. copying input from A1_0 to A1_1 overwrites A2_0 before A2_0 is copied to +// A2_1; thus subgraph 1 produce incorrect output. +// 2. copying output from A3_1 to A4_0 overwrites A1_0, which should remain +// intact so that it can be used by the OP after the IF operator in subgraph 0 +// + +const Model* BuildModelWithIfAndSubgraphInputTensorOverlap() { + using flatbuffers::Offset; + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + + constexpr TensorType kTensorType = TensorType_INT32; + constexpr int kBlockSize = + tflite::MicroArenaBufferAlignment() / sizeof(int32_t); + constexpr size_t kBuffersCount = 1; + const Offset buffers[kBuffersCount] = { + CreateBuffer(*builder), + }; + const int32_t kConditionTensorShape[] = {1}; + const int32_t kIfInput1TensorShape[] = {2 * kBlockSize}; + const int32_t kIfInput2TensorShape[] = {4 * kBlockSize}; + const int32_t kIfOutputTensorShape[] = {3 * kBlockSize}; + const int32_t kFinalOutputTensorShape[] = {8 * kBlockSize}; + constexpr size_t kSubgraph0TensorsCount = 5; + const Offset kSubgraph0Tensors[kSubgraph0TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kConditionTensorShape, 1), + TensorType_BOOL, 0, + builder->CreateString("condition tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor1"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor2"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("if_output_tensor"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kFinalOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("final_output_tensor"), + 0, false), + }; + + // Subgraph 1 is the chosen path if condition tensor in IF is true. 
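The comment block above pins down why this test exists: with a shared arena, the destination of one argument copy can overlap the source of a later one, so the order of plain `memcpy` calls decides whether data survives. One way to make such copies order-independent, sketched here purely for illustration (the actual IF kernel fix may differ), is to snapshot all sources into scratch memory before writing any destination:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative only: stage every source tensor into scratch memory first,
// then write all destinations. This removes any dependence on copy order
// when one destination region overlaps a later source region.
void CopyArgsViaScratch(uint8_t* scratch, uint8_t* const dst[],
                        const uint8_t* const src[], const size_t len[],
                        int count) {
  size_t offset = 0;
  for (int i = 0; i < count; ++i) {  // 1) snapshot every source
    std::memcpy(scratch + offset, src[i], len[i]);
    offset += len[i];
  }
  offset = 0;
  for (int i = 0; i < count; ++i) {  // 2) only then write the destinations
    std::memcpy(dst[i], scratch + offset, len[i]);
    offset += len[i];
  }
}
```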
+ constexpr size_t kSubgraph1TensorsCount = 3; + const Offset kSubgraph1Tensors[kSubgraph1TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_input_tensor2"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_output_tensor"), 0, false), + }; + + // Subgraph 2 is the chosen path if condition tensor in IF is false + constexpr size_t kSubgraph2TensorsCount = 3; + const Offset kSubgraph2Tensors[kSubgraph2TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor1"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor2"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("if_output_tensor"), 0, + false), + }; + + constexpr int kIfOpCodeIndex = 0; + constexpr int kCustomOpCodeIndex = 1; + + constexpr size_t kIfInputsCount = 3; + const int32_t kIfInputs[kIfInputsCount] = {0, 1, 2}; + constexpr size_t kOutputsCount = 1; + const int32_t kIfOutputs[kOutputsCount] = {3}; + constexpr size_t kOpAfterIfInputsCount = 2; + const int32_t kOpAfterIfInputs[kOpAfterIfInputsCount] = {3, 2}; + const int32_t kOpAfterIfOutputs[kOutputsCount] = {4}; + constexpr size_t kOperatorsCount = 2; + const Offset kSubgraph0Operators[kOperatorsCount] = { + CreateOperator(*builder, kIfOpCodeIndex, + builder->CreateVector(kIfInputs, kIfInputsCount), + builder->CreateVector(kIfOutputs, kOutputsCount), + BuiltinOptions_IfOptions, + CreateIfOptions(*builder, 1, 2).Union()), + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kOpAfterIfInputs, kOpAfterIfInputsCount), + builder->CreateVector(kOpAfterIfOutputs, kOutputsCount)), + }; + + constexpr size_t kSubgraph1InputsCount = 2; + const int32_t kSubgraph1Inputs[kSubgraph1InputsCount] = {0, 1}; + constexpr size_t kSubgraph1OutputsCount = 1; + const int32_t kSubgraph1Outputs[kSubgraph1OutputsCount] = {2}; + constexpr size_t kSubgraph1OperatorsCount = 1; + const Offset kSubgraph1Operators[kSubgraph1OperatorsCount] = { + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kSubgraph1Inputs, kSubgraph1InputsCount), + builder->CreateVector(kSubgraph1Outputs, kSubgraph1OutputsCount), + BuiltinOptions_NONE), + }; + + constexpr size_t kSubgraph2InputsCount = 2; + const int32_t kSubgraph2Inputs[kSubgraph2InputsCount] = {0, 1}; + constexpr size_t kSubgraph2OutputsCount = 1; + const int32_t kSubgraph2Outputs[kSubgraph2OutputsCount] = {2}; + constexpr size_t kSubgraph2OperatorsCount = 1; + const Offset kSubgraph2Operators[kSubgraph2OperatorsCount] = { + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kSubgraph2Inputs, kSubgraph2InputsCount), + builder->CreateVector(kSubgraph2Outputs, kSubgraph2OutputsCount), + BuiltinOptions_NONE), + }; + + constexpr size_t kSubgraphsCount = 3; + const Offset kSubgraphs[kSubgraphsCount] = { + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph0Tensors, kSubgraph0TensorsCount), + builder->CreateVector(kIfInputs, kIfInputsCount), + builder->CreateVector(kOpAfterIfOutputs, kOutputsCount), + 
builder->CreateVector(kSubgraph0Operators, kOperatorsCount), + builder->CreateString("if_subgraph")), + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph1Tensors, kSubgraph1TensorsCount), + builder->CreateVector(kSubgraph1Inputs, kSubgraph1InputsCount), + builder->CreateVector(kSubgraph1Outputs, kSubgraph1OutputsCount), + builder->CreateVector(kSubgraph1Operators, kSubgraph1OperatorsCount), + builder->CreateString("then_subgraph")), + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph2Tensors, kSubgraph2TensorsCount), + builder->CreateVector(kSubgraph2Inputs, kSubgraph2InputsCount), + builder->CreateVector(kSubgraph2Outputs, kSubgraph2OutputsCount), + builder->CreateVector(kSubgraph2Operators, kSubgraph2OperatorsCount), + builder->CreateString("else_subgraph")), + }; + + constexpr size_t kOperatorCodesCount = 2; + const Offset kOperatorCodes[kOperatorCodesCount] = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, "if", + /*version=*/0, BuiltinOperator_IF), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "custom_packer_op", + /*version=*/0, BuiltinOperator_CUSTOM), + }; + const Offset kModelOffset = CreateModel( + *builder, 0, builder->CreateVector(kOperatorCodes, kOperatorCodesCount), + builder->CreateVector(kSubgraphs, kSubgraphsCount), + builder->CreateString("test_model"), + builder->CreateVector(buffers, kBuffersCount)); + FinishModelBuffer(*builder, kModelOffset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + return model; +} + // Mock model with one main subgraph containing a single CALL_ONCE op (with null // inputs and outputs) which invokes a second subgraph which has null inputs and // outputs. @@ -996,8 +1350,11 @@ TfLiteStatus SimpleStatefulOp::Prepare(TfLiteContext* context, OpData* data = reinterpret_cast(node->user_data); // Make sure that the input is in uint8_t with at least 1 data entry. 
- const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + if (input->type != kTfLiteInt8) return kTfLiteError; if (NumElements(input->dims) == 0) return kTfLiteError; @@ -1010,6 +1367,7 @@ TfLiteStatus SimpleStatefulOp::Prepare(TfLiteContext* context, context->AllocatePersistentBuffer(context, sizeof(int))); *data->invoke_count = 0; + micro_context->DeallocateTempTfLiteTensor(input); return kTfLiteOk; } @@ -1018,9 +1376,10 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, OpData* data = reinterpret_cast(node->user_data); *data->invoke_count += 1; - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - const uint8_t* input_data = GetTensorData(input); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + const uint8_t* input_data = input->data.uint8; int size = NumElements(input->dims); uint8_t* sorting_buffer = reinterpret_cast( @@ -1038,14 +1397,14 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, } } - TfLiteTensor* median; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kMedianTensor, &median)); - uint8_t* median_data = GetTensorData(median); - TfLiteTensor* invoke_count; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kInvokeCount, &invoke_count)); - int32_t* invoke_count_data = GetTensorData(invoke_count); + TfLiteEvalTensor* median = + tflite::micro::GetEvalOutput(context, node, kMedianTensor); + TF_LITE_ENSURE(context, median != nullptr); + uint8_t* median_data = median->data.uint8; + TfLiteEvalTensor* invoke_count = + tflite::micro::GetEvalOutput(context, node, kInvokeCount); + TF_LITE_ENSURE(context, invoke_count != nullptr); + int32_t* invoke_count_data = invoke_count->data.i32; median_data[0] = sorting_buffer[size / 2]; invoke_count_data[0] = *data->invoke_count; @@ -1132,18 +1491,20 @@ TfLiteStatus MultipleInputs::Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus MultipleInputs::Invoke(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input != nullptr); const int32_t* input_data = input->data.i32; - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &input1)); + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, input1 != nullptr); const int32_t* input_data1 = input1->data.i32; - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &input2)); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, 2); + TF_LITE_ENSURE(context, input2 != nullptr); const int32_t* input_data2 = input2->data.i32; - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); int32_t* output_data = output->data.i32; output_data[0] = 0; // Catch output tensor sharing memory with an input tensor @@ -1195,6 +1556,7 @@ AllOpsResolver GetOpResolver() { 
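The Prepare/Invoke rewrites above follow TFLM's two-phase tensor access model: full `TfLiteTensor` metadata is only available as a temporary allocation from the `MicroContext` during Prepare and must be handed back, while Invoke sees only the lightweight `TfLiteEvalTensor`. Condensed from the changes above, the pattern is:

```cpp
// Prepare: borrow full tensor metadata from the MicroContext and return it
// before leaving the function.
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  MicroContext* micro_context = GetMicroContext(context);
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, input != nullptr);
  // ... inspect input->type, input->dims, quantization parameters ...
  micro_context->DeallocateTempTfLiteTensor(input);
  return kTfLiteOk;
}

// Invoke: only data pointers and dims are needed, so the cheaper
// TfLiteEvalTensor accessors replace GetInputSafe/GetOutputSafe.
TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
  TF_LITE_ENSURE(context, input != nullptr && output != nullptr);
  output->data.i32[0] = input->data.i32[0];  // trivial pass-through body
  return kTfLiteOk;
}
```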
op_resolver.AddCustom("multiple_inputs_op", MultipleInputs::GetMutableRegistration()); op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration()); + op_resolver.AddCustom("custom_packer_op", PackerOp::GetMutableRegistration()); return op_resolver; } @@ -1243,6 +1605,22 @@ const Model* GetSimpleModelWithSubgraphsAndIf() { return model; } +const Model* GetSimpleModelWithSubgraphsAndWhile() { + static Model* model = nullptr; + if (!model) { + model = const_cast(BuildSimpleModelWithSubgraphsAndWhile()); + } + return model; +} + +const Model* GetModelWithIfAndSubgraphInputTensorOverlap() { + static Model* model = nullptr; + if (!model) { + model = const_cast(BuildModelWithIfAndSubgraphInputTensorOverlap()); + } + return model; +} + const Model* GetSimpleModelWithNullInputsAndOutputs() { static Model* model = nullptr; if (!model) { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h index 6ed7efd1..5441ce3e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h @@ -157,6 +157,13 @@ const Model* GetSimpleStatefulModel(); // Returns a flatbuffer model with "if" and two subgraphs. const Model* GetSimpleModelWithSubgraphsAndIf(); +// Returns a flatbuffer model with "while" and three subgraphs. +const Model* GetSimpleModelWithSubgraphsAndWhile(); + +// Returns a flatbuffer model with "if" and two subgraphs and the input tensor 1 +// of "if" subgraph overlaps with the input tensor 2 of subgraph 1. +const Model* GetModelWithIfAndSubgraphInputTensorOverlap(); + // Returns a flatbuffer model with null subgraph/operator inputs and outputs. const Model* GetSimpleModelWithNullInputsAndOutputs(); diff --git a/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h b/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h index 83a0ac6c..52d7fdef 100644 --- a/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h +++ b/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h @@ -61,6 +61,7 @@ struct TfLiteTypeToType {}; // Specializations below MATCH_TYPE_AND_TFLITE_TYPE(int32_t, kTfLiteInt32); MATCH_TYPE_AND_TFLITE_TYPE(uint32_t, kTfLiteUInt32); MATCH_TYPE_AND_TFLITE_TYPE(int16_t, kTfLiteInt16); +MATCH_TYPE_AND_TFLITE_TYPE(uint16_t, kTfLiteUInt16); MATCH_TYPE_AND_TFLITE_TYPE(int64_t, kTfLiteInt64); MATCH_TYPE_AND_TFLITE_TYPE(float, kTfLiteFloat32); MATCH_TYPE_AND_TFLITE_TYPE(unsigned char, kTfLiteUInt8); diff --git a/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h b/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h index 3620145d..d30dbfe8 100644 --- a/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h +++ b/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h @@ -394,6 +394,12 @@ struct BucketizeOptionsT; struct GeluOptions; struct GeluOptionsT; +struct DynamicUpdateSliceOptions; +struct DynamicUpdateSliceOptionsT; + +struct UnsortedSegmentProdOptions; +struct UnsortedSegmentProdOptionsT; + struct OperatorCode; struct OperatorCodeT; @@ -435,11 +441,12 @@ enum TensorType { TensorType_RESOURCE = 13, TensorType_VARIANT = 14, TensorType_UINT32 = 15, + TensorType_UINT16 = 16, TensorType_MIN = TensorType_FLOAT32, - TensorType_MAX = TensorType_UINT32 + TensorType_MAX = TensorType_UINT16 }; -inline const TensorType (&EnumValuesTensorType())[16] { +inline const TensorType 
(&EnumValuesTensorType())[17] { static const TensorType values[] = { TensorType_FLOAT32, TensorType_FLOAT16, @@ -456,13 +463,14 @@ inline const TensorType (&EnumValuesTensorType())[16] { TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, - TensorType_UINT32 + TensorType_UINT32, + TensorType_UINT16 }; return values; } inline const char * const *EnumNamesTensorType() { - static const char * const names[17] = { + static const char * const names[18] = { "FLOAT32", "FLOAT16", "INT32", @@ -479,13 +487,14 @@ inline const char * const *EnumNamesTensorType() { "RESOURCE", "VARIANT", "UINT32", + "UINT16", nullptr }; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_UINT32)) return ""; + if (flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_UINT16)) return ""; const size_t index = static_cast(e); return EnumNamesTensorType()[index]; } @@ -868,11 +877,14 @@ enum BuiltinOperator { BuiltinOperator_RANDOM_UNIFORM = 148, BuiltinOperator_MULTINOMIAL = 149, BuiltinOperator_GELU = 150, + BuiltinOperator_DYNAMIC_UPDATE_SLICE = 151, + BuiltinOperator_RELU_0_TO_1 = 152, + BuiltinOperator_UNSORTED_SEGMENT_PROD = 153, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_GELU + BuiltinOperator_MAX = BuiltinOperator_UNSORTED_SEGMENT_PROD }; -inline const BuiltinOperator (&EnumValuesBuiltinOperator())[151] { +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[154] { static const BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -1024,13 +1036,16 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[151] { BuiltinOperator_BUCKETIZE, BuiltinOperator_RANDOM_UNIFORM, BuiltinOperator_MULTINOMIAL, - BuiltinOperator_GELU + BuiltinOperator_GELU, + BuiltinOperator_DYNAMIC_UPDATE_SLICE, + BuiltinOperator_RELU_0_TO_1, + BuiltinOperator_UNSORTED_SEGMENT_PROD }; return values; } inline const char * const *EnumNamesBuiltinOperator() { - static const char * const names[152] = { + static const char * const names[155] = { "ADD", "AVERAGE_POOL_2D", "CONCATENATION", @@ -1182,13 +1197,16 @@ inline const char * const *EnumNamesBuiltinOperator() { "RANDOM_UNIFORM", "MULTINOMIAL", "GELU", + "DYNAMIC_UPDATE_SLICE", + "RELU_0_TO_1", + "UNSORTED_SEGMENT_PROD", nullptr }; return names; } inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { - if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_GELU)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_UNSORTED_SEGMENT_PROD)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOperator()[index]; } @@ -1311,11 +1329,13 @@ enum BuiltinOptions { BuiltinOptions_RandomOptions = 114, BuiltinOptions_BucketizeOptions = 115, BuiltinOptions_GeluOptions = 116, + BuiltinOptions_DynamicUpdateSliceOptions = 117, + BuiltinOptions_UnsortedSegmentProdOptions = 118, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_GeluOptions + BuiltinOptions_MAX = BuiltinOptions_UnsortedSegmentProdOptions }; -inline const BuiltinOptions (&EnumValuesBuiltinOptions())[117] { +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[119] { static const BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -1433,13 +1453,15 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[117] { BuiltinOptions_AssignVariableOptions, BuiltinOptions_RandomOptions, BuiltinOptions_BucketizeOptions, - BuiltinOptions_GeluOptions 
+ BuiltinOptions_GeluOptions, + BuiltinOptions_DynamicUpdateSliceOptions, + BuiltinOptions_UnsortedSegmentProdOptions }; return values; } inline const char * const *EnumNamesBuiltinOptions() { - static const char * const names[118] = { + static const char * const names[120] = { "NONE", "Conv2DOptions", "DepthwiseConv2DOptions", @@ -1557,13 +1579,15 @@ inline const char * const *EnumNamesBuiltinOptions() { "RandomOptions", "BucketizeOptions", "GeluOptions", + "DynamicUpdateSliceOptions", + "UnsortedSegmentProdOptions", nullptr }; return names; } inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { - if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_GeluOptions)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_UnsortedSegmentProdOptions)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOptions()[index]; } @@ -2036,6 +2060,14 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_GeluOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DynamicUpdateSliceOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_UnsortedSegmentProdOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -2996,6 +3028,22 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_GeluOptions ? reinterpret_cast(value) : nullptr; } + tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() const { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? + reinterpret_cast(value) : nullptr; + } + tflite::UnsortedSegmentProdOptionsT *AsUnsortedSegmentProdOptions() { + return type == BuiltinOptions_UnsortedSegmentProdOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::UnsortedSegmentProdOptionsT *AsUnsortedSegmentProdOptions() const { + return type == BuiltinOptions_UnsortedSegmentProdOptions ? 
+ reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -10595,6 +10643,100 @@ inline flatbuffers::Offset CreateGeluOptions( flatbuffers::Offset CreateGeluOptions(flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct DynamicUpdateSliceOptionsT : public flatbuffers::NativeTable { + typedef DynamicUpdateSliceOptions TableType; + DynamicUpdateSliceOptionsT() { + } +}; + +struct DynamicUpdateSliceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DynamicUpdateSliceOptionsT NativeTableType; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DynamicUpdateSliceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DynamicUpdateSliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DynamicUpdateSliceOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit DynamicUpdateSliceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DynamicUpdateSliceOptionsBuilder &operator=(const DynamicUpdateSliceOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDynamicUpdateSliceOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + DynamicUpdateSliceOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateDynamicUpdateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct UnsortedSegmentProdOptionsT : public flatbuffers::NativeTable { + typedef UnsortedSegmentProdOptions TableType; + int32_t num_segments; + UnsortedSegmentProdOptionsT() + : num_segments(0) { + } +}; + +struct UnsortedSegmentProdOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef UnsortedSegmentProdOptionsT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NUM_SEGMENTS = 4 + }; + int32_t num_segments() const { + return GetField(VT_NUM_SEGMENTS, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NUM_SEGMENTS) && + verifier.EndTable(); + } + UnsortedSegmentProdOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(UnsortedSegmentProdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct UnsortedSegmentProdOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_num_segments(int32_t num_segments) { + fbb_.AddElement(UnsortedSegmentProdOptions::VT_NUM_SEGMENTS, num_segments, 0); + } + explicit UnsortedSegmentProdOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + 
UnsortedSegmentProdOptionsBuilder &operator=(const UnsortedSegmentProdOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateUnsortedSegmentProdOptions( + flatbuffers::FlatBufferBuilder &_fbb, + int32_t num_segments = 0) { + UnsortedSegmentProdOptionsBuilder builder_(_fbb); + builder_.add_num_segments(num_segments); + return builder_.Finish(); +} + +flatbuffers::Offset CreateUnsortedSegmentProdOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; int8_t deprecated_builtin_code; @@ -11093,6 +11235,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const tflite::GeluOptions *builtin_options_as_GeluOptions() const { return builtin_options_type() == tflite::BuiltinOptions_GeluOptions ? static_cast(builtin_options()) : nullptr; } + const tflite::DynamicUpdateSliceOptions *builtin_options_as_DynamicUpdateSliceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DynamicUpdateSliceOptions ? static_cast(builtin_options()) : nullptr; + } + const tflite::UnsortedSegmentProdOptions *builtin_options_as_UnsortedSegmentProdOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_UnsortedSegmentProdOptions ? static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -11593,6 +11741,14 @@ template<> inline const tflite::GeluOptions *Operator::builtin_options_as inline const tflite::DynamicUpdateSliceOptions *Operator::builtin_options_as() const { + return builtin_options_as_DynamicUpdateSliceOptions(); +} + +template<> inline const tflite::UnsortedSegmentProdOptions *Operator::builtin_options_as() const { + return builtin_options_as_UnsortedSegmentProdOptions(); +} + struct OperatorBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -15679,6 +15835,55 @@ inline flatbuffers::Offset CreateGeluOptions(flatbuffers::FlatBuffe _approximate); } +inline DynamicUpdateSliceOptionsT *DynamicUpdateSliceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new DynamicUpdateSliceOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void DynamicUpdateSliceOptions::UnPackTo(DynamicUpdateSliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset DynamicUpdateSliceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateDynamicUpdateSliceOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateDynamicUpdateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DynamicUpdateSliceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateDynamicUpdateSliceOptions( + _fbb); +} + +inline UnsortedSegmentProdOptionsT *UnsortedSegmentProdOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new 
UnsortedSegmentProdOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void UnsortedSegmentProdOptions::UnPackTo(UnsortedSegmentProdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = num_segments(); _o->num_segments = _e; } +} + +inline flatbuffers::Offset UnsortedSegmentProdOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateUnsortedSegmentProdOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateUnsortedSegmentProdOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnsortedSegmentProdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UnsortedSegmentProdOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _num_segments = _o->num_segments; + return tflite::CreateUnsortedSegmentProdOptions( + _fbb, + _num_segments); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new OperatorCodeT(); UnPackTo(_o, _resolver); @@ -16618,6 +16823,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return true; } } @@ -17100,6 +17313,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -17570,6 +17791,14 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateGeluOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateDynamicUpdateSliceOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(value); + return CreateUnsortedSegmentProdOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -18040,6 +18269,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL value = new tflite::GeluOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_DynamicUpdateSliceOptions: { + value = new tflite::DynamicUpdateSliceOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + value = new tflite::UnsortedSegmentProdOptionsT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -18627,6 +18864,16 @@ inline void BuiltinOptionsUnion::Reset() { delete ptr; break; } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_UnsortedSegmentProdOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = 
nullptr; diff --git a/code/components/tflite-lib_20220417.zip b/code/components/tflite-lib_20220417.zip new file mode 100644 index 00000000..fcc45d22 Binary files /dev/null and b/code/components/tflite-lib_20220417.zip differ diff --git a/code/components/tflite-lib_20220716.zip b/code/components/tflite-lib_20220716.zip new file mode 100644 index 00000000..38814b7a Binary files /dev/null and b/code/components/tflite-lib_20220716.zip differ diff --git a/code/components/tfmicro.zip b/code/components/tfmicro.zip deleted file mode 100644 index 5e57aa05..00000000 Binary files a/code/components/tfmicro.zip and /dev/null differ diff --git a/code/main/main.cpp b/code/main/main.cpp index 25f3795d..05f0a889 100644 --- a/code/main/main.cpp +++ b/code/main/main.cpp @@ -103,18 +103,8 @@ bool Init_NVS_SDCard() } return false; } - - // Card has been initialized, print its properties sdmmc_card_print_info(stdout, card); - - // Init the GPIO - // Flash ausschalten - - // gpio_pad_select_gpio(FLASH_GPIO); - // gpio_set_direction(FLASH_GPIO, GPIO_MODE_OUTPUT); - // gpio_set_level(FLASH_GPIO, 0); - return true; } @@ -149,9 +139,7 @@ extern "C" void app_main(void) Camera.LightOnOff(false); xDelay = 2000 / portTICK_PERIOD_MS; printf("nach init camera: sleep for : %ldms\n", (long) xDelay); -// LogFile.WriteToFile("Startsequence 06"); vTaskDelay( xDelay ); -// LogFile.WriteToFile("Startsequence 07"); if (!Init_NVS_SDCard()) @@ -186,9 +174,7 @@ extern "C" void app_main(void) xDelay = 2000 / portTICK_PERIOD_MS; printf("main: sleep for : %ldms\n", (long) xDelay); -// LogFile.WriteToFile("Startsequence 06"); vTaskDelay( xDelay ); -// LogFile.WriteToFile("Startsequence 07"); setup_time(); setBootTime(); LogFile.WriteToFile("============================================================================================="); @@ -196,23 +182,18 @@ LogFile.WriteToFile("============================================================================================="); LogFile.SwitchOnOff(false); - - - std::string zw = gettimestring("%Y%m%d-%H%M%S"); printf("time %s\n", zw.c_str()); - - size_t _hsize = getESPHeapSize(); if (_hsize < 4000000) { - std::string _zws = "Not enought PSRAM available. Expected 4.194.304 MByte - available: " + std::to_string(_hsize); - _zws = _zws + "\nEither not initialzed or too small (2MByte only) or not present at all. Firmware cannot start!!"; - printf(_zws.c_str()); - LogFile.SwitchOnOff(true); - LogFile.WriteToFile(_zws); - LogFile.SwitchOnOff(false); + std::string _zws = "Not enough PSRAM available. Expected 4,194,304 bytes (4 MByte) - available: " + std::to_string(_hsize); + _zws = _zws + "\nEither not initialized, too small (2 MByte only) or not present at all. Firmware cannot start!"; + printf("%s", _zws.c_str()); + LogFile.SwitchOnOff(true); + LogFile.WriteToFile(_zws); + LogFile.SwitchOnOff(false); } else { if (cam != ESP_OK) { ESP_LOGE(TAGMAIN, "Failed to initialize camera module. 
" diff --git a/code/main/server_main.h b/code/main/server_main.h index 00033b66..ccf4c7cb 100644 --- a/code/main/server_main.h +++ b/code/main/server_main.h @@ -20,5 +20,4 @@ httpd_handle_t start_webserver(void); void register_server_main_uri(httpd_handle_t server, const char *base_path); - #endif diff --git a/code/main/version.cpp b/code/main/version.cpp index 34a3aeed..591dd9da 100644 --- a/code/main/version.cpp +++ b/code/main/version.cpp @@ -1,4 +1,4 @@ -const char* GIT_REV="7187101"; +const char* GIT_REV="dc27911"; const char* GIT_TAG=""; const char* GIT_BRANCH="master"; -const char* BUILD_TIME="2022-02-22 19:05"; \ No newline at end of file +const char* BUILD_TIME="2022-07-17 09:03"; \ No newline at end of file diff --git a/code/main/version.h b/code/main/version.h index 6c838710..79ec6223 100644 --- a/code/main/version.h +++ b/code/main/version.h @@ -13,7 +13,7 @@ extern "C" #include "Helper.h" #include -const char* GIT_BASE_BRANCH = "master - v10.5.2 - 2022-02-22"; +const char* GIT_BASE_BRANCH = "master - v10.6.0 - 2022-07-17"; const char* git_base_branch(void) diff --git a/code/platformio.ini b/code/platformio.ini index b082bd42..c26b91de 100644 --- a/code/platformio.ini +++ b/code/platformio.ini @@ -14,8 +14,8 @@ src_dir = main [env:esp32cam] -;platform = espressif32@2.1.0 -platform = espressif32 +platform = espressif32@4.4 +;platform = espressif32 board = esp32cam framework = espidf @@ -35,6 +35,7 @@ lib_deps = jomjol_time_sntp jomjol_logfile jomjol_mqtt + jomjol_influxdb jomjol_controlGPIO diff --git a/code/sdkconfig.esp32cam b/code/sdkconfig.esp32cam index 2433bd76..28ab25b6 100644 --- a/code/sdkconfig.esp32cam +++ b/code/sdkconfig.esp32cam @@ -140,6 +140,14 @@ CONFIG_EXAMPLE_CONNECT_IPV6_PREF_LOCAL_LINK=y # CONFIG_EXAMPLE_CONNECT_IPV6_PREF_UNIQUE_LOCAL is not set # end of Example Connection Configuration +# +# ESP-NN +# +# CONFIG_NN_ANSI_C is not set +CONFIG_NN_OPTIMIZED=y +CONFIG_NN_OPTIMIZATIONS=1 +# end of ESP-NN + # # Compiler options # @@ -1225,6 +1233,9 @@ CONFIG_OV2640_SUPPORT=y # CONFIG_GC032A_SUPPORT is not set # CONFIG_GC0308_SUPPORT is not set # CONFIG_BF3005_SUPPORT is not set +CONFIG_BF20A6_SUPPORT=y +# CONFIG_SC101IOT_SUPPORT is not set +CONFIG_SC030IOT_SUPPORT=y # CONFIG_SCCB_HARDWARE_I2C_PORT0 is not set CONFIG_SCCB_HARDWARE_I2C_PORT1=y CONFIG_SCCB_CLK_FREQ=100000 diff --git a/code/test/components/jomjol-flowcontroll/test_cnnflowcontroll.cpp b/code/test/components/jomjol-flowcontroll/test_cnnflowcontroll.cpp new file mode 100644 index 00000000..18129ada --- /dev/null +++ b/code/test/components/jomjol-flowcontroll/test_cnnflowcontroll.cpp @@ -0,0 +1,92 @@ +#include +#include + +class UnderTest : public ClassFlowCNNGeneral { + public: + using ClassFlowCNNGeneral::ZeigerEval; + using ClassFlowCNNGeneral::ZeigerEvalHybrid; + using ClassFlowCNNGeneral::ClassFlowCNNGeneral; + +}; + + +void setUp(void) +{ + // set stuff up here +} + +void tearDown(void) +{ + // clean stuff up here +} + + + +/** + * @brief test if all combinations of digit + * evaluation are running correctly + */ +void test_ZeigerEval() +{ + UnderTest undertest = UnderTest(nullptr, Digital100); + + // the 5.2 is already above 5.0 and the previous digit too (3) + int result = undertest.ZeigerEval(5.2, 3); + TEST_ASSERT_EQUAL(5, result); + + // the 5.2 is already above 5.0 and the previous digit not (9) + // so the current digit shoult be reduced (4.9) + TEST_ASSERT_EQUAL(4, undertest.ZeigerEval(5.2, 9)); + + // the 4.4 (digital100) is not above 5 and the previous digit (analog) too (9.3) + 
+ // the 4.4 (digital100) is well clear of the transition, so it stays 4 even + // though the previous digit (analog, 9.3) has not passed zero + TEST_ASSERT_EQUAL(4, undertest.ZeigerEval(4.4, 9)); + + // the 4.5 (digital100) is not just past a transition either, so it stays 4 + // with the previous digit at 0 + TEST_ASSERT_EQUAL(4, undertest.ZeigerEval(4.5, 0)); + +} + +/** + * @brief test that all combinations of digit + * evaluation run correctly + */ +void test_ZeigerEvalHybrid() { + UnderTest undertest = UnderTest(nullptr, Digital100); + + // 5.2 with no previous digit should round down to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.2, 0, -1)); + + // 5.3 with no previous digit should truncate to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.3, 0, -1)); + + // 5.7 with no previous digit should truncate to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.7, 0, -1)); + + // 5.8 with no previous digit should round up to 6 + TEST_ASSERT_EQUAL(6, undertest.ZeigerEvalHybrid(5.8, 0, -1)); + + // 5.7 with a previous digit in the 0.3-0.7 band should round up to 6 + TEST_ASSERT_EQUAL(6, undertest.ZeigerEvalHybrid(5.7, 0.7, 1)); + + // 5.3 with a previous digit in the 0.3-0.7 band should round down to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.3, 0.7, 1)); + + // 5.3 with the previous digit <= 0.5 should truncate to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.3, 0.1, 1)); + + // 5.3 with the previous digit >= 9.5 should be reduced to 4 + TEST_ASSERT_EQUAL(4, undertest.ZeigerEvalHybrid(5.3, 9.6, 9)); + + // 5.7 with the previous digit >= 9.5 should truncate to 5 + TEST_ASSERT_EQUAL(5, undertest.ZeigerEvalHybrid(5.7, 9.6, 9)); + + // the 4.5 (digital100) stays 4 because the previous digit (analog, 9.6) has not passed zero + TEST_ASSERT_EQUAL(4, undertest.ZeigerEvalHybrid(4.5, 9.6, 0)); + + // the same with the previous digit evaluated as 9 + TEST_ASSERT_EQUAL(4, undertest.ZeigerEvalHybrid(4.5, 9.6, 9)); + // the 4.5 (digital100) stays 4 because the previous digit (analog, 9.5) has not passed zero + TEST_ASSERT_EQUAL(4, undertest.ZeigerEvalHybrid(4.5, 9.5, 9)); + +} + diff --git a/code/test/test_suite_flowcontroll.cpp b/code/test/test_suite_flowcontroll.cpp new file mode 100644 index 00000000..48b2993e --- /dev/null +++ b/code/test/test_suite_flowcontroll.cpp @@ -0,0 +1,16 @@ +#include +#include "components/jomjol-flowcontroll/test_cnnflowcontroll.cpp" + +/** + * @brief startup the test.
Like a test-suite + * all test methods must be called here + */ +extern "C" void app_main() +{ + UNITY_BEGIN(); + + RUN_TEST(test_ZeigerEval); + RUN_TEST(test_ZeigerEvalHybrid); + + UNITY_END(); +} \ No newline at end of file diff --git a/code/version.cpp b/code/version.cpp index 34a3aeed..591dd9da 100644 --- a/code/version.cpp +++ b/code/version.cpp @@ -1,4 +1,4 @@ -const char* GIT_REV="7187101"; +const char* GIT_REV="dc27911"; const char* GIT_TAG=""; const char* GIT_BRANCH="master"; -const char* BUILD_TIME="2022-02-22 19:05"; \ No newline at end of file +const char* BUILD_TIME="2022-07-17 09:03"; \ No newline at end of file diff --git a/firmware/ana-s2-q-20220213.tflite b/firmware/ana-s2-q-20220213.tflite deleted file mode 100644 index e666f11e..00000000 Binary files a/firmware/ana-s2-q-20220213.tflite and /dev/null differ diff --git a/firmware/ana-s3-q-20220105.tflite b/firmware/ana-s3-q-20220105.tflite deleted file mode 100644 index 5718b1ec..00000000 Binary files a/firmware/ana-s3-q-20220105.tflite and /dev/null differ diff --git a/firmware/bootloader.bin b/firmware/bootloader.bin index ae05db13..c5435579 100644 Binary files a/firmware/bootloader.bin and b/firmware/bootloader.bin differ diff --git a/firmware/dig-s2-q-20220211.tflite b/firmware/dig-s2-q-20220211.tflite deleted file mode 100644 index 26cb56eb..00000000 Binary files a/firmware/dig-s2-q-20220211.tflite and /dev/null differ diff --git a/firmware/firmware.bin b/firmware/firmware.bin index 6314d0e3..e09402a6 100644 Binary files a/firmware/firmware.bin and b/firmware/firmware.bin differ diff --git a/firmware/html.zip b/firmware/html.zip index ec069237..37a4d4df 100644 Binary files a/firmware/html.zip and b/firmware/html.zip differ diff --git a/sd-card/config/ana-s3-q-20220105.tflite b/sd-card/config/ana-s3-q-20220105.tflite deleted file mode 100644 index 5718b1ec..00000000 Binary files a/sd-card/config/ana-s3-q-20220105.tflite and /dev/null differ diff --git a/sd-card/config/ana0910s3_longq.tflite b/sd-card/config/ana0910s3_longq.tflite deleted file mode 100644 index 045b6a5b..00000000 Binary files a/sd-card/config/ana0910s3_longq.tflite and /dev/null differ diff --git a/sd-card/config/ana1000s2.tflite b/sd-card/config/ana1000s2.tflite new file mode 100644 index 00000000..72ae8ba2 Binary files /dev/null and b/sd-card/config/ana1000s2.tflite differ diff --git a/sd-card/config/config.ini b/sd-card/config/config.ini index 9c3c1005..94a9c5cd 100644 --- a/sd-card/config/config.ini +++ b/sd-card/config/config.ini @@ -21,19 +21,17 @@ FlipImageSize = false /config/ref1.jpg 442 142 [Digits] -Model = /config/dig-s2-q-20220211.tflite +Model = /config/dig1400s2q.tflite ;LogImageLocation = /log/digit ;LogfileRetentionInDays = 3 -ModelInputSize = 20 32 main.dig1 294 126 30 54 main.dig2 343 126 30 54 main.dig3 391 126 30 54 [Analog] -Model = /config/ana-s3-q-20220105.tflite +Model = /config/ana1000s2.tflite ;LogImageLocation = /log/analog ;LogfileRetentionInDays = 3 -ModelInputSize = 32 32 ExtendedResolution = true main.ana1 432 230 92 92 main.ana2 379 332 92 92 @@ -46,6 +44,9 @@ PreValueUse = true PreValueAgeStartup = 720 AllowNegativeRates = false main.MaxRateValue = 0.05 +;main.MaxRateType = AbsoluteChange +;main.ExtendedResolution = false +;main.IgnoreLeadingNaN = true ErrorMessage = true CheckDigitIncreaseConsistency = false diff --git a/sd-card/config/dig-s2-q-20220211.tflite b/sd-card/config/dig-s2-q-20220211.tflite deleted file mode 100644 index 26cb56eb..00000000 Binary files a/sd-card/config/dig-s2-q-20220211.tflite and /dev/null 
differ diff --git a/sd-card/config/dig1330s1q.tflite b/sd-card/config/dig1330s1q.tflite deleted file mode 100644 index 438327f8..00000000 Binary files a/sd-card/config/dig1330s1q.tflite and /dev/null differ diff --git a/sd-card/config/dig1400s2q.tflite b/sd-card/config/dig1400s2q.tflite new file mode 100644 index 00000000..99a28f53 Binary files /dev/null and b/sd-card/config/dig1400s2q.tflite differ diff --git a/sd-card/html/edit_config_param.html b/sd-card/html/edit_config_param.html index 1d4c485d..db2f1bd9 100644 --- a/sd-card/html/edit_config_param.html +++ b/sd-card/html/edit_config_param.html @@ -298,6 +298,22 @@ textarea { Path to CNN model file for image recognition + + + + + + + CNNGoodThreshold + + + + + + EXPERIMENTAL - NOT WORKING FOR ALL CNNs! - Threshold the classification confidence must exceed for the value to be accepted (only meaningful for digits) + + + @@ -326,6 +342,8 @@ textarea { Time to keep the separated digit images (in days - "0" = forever) + @@ -371,6 +390,8 @@ textarea { Time to keep the separated digit images (in days - "0" = forever) +
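`CNNGoodThreshold` is flagged experimental above; conceptually it discards a classification whose best class score does not clear the configured confidence bar, so the reading falls back to "N" instead of a doubtful digit. A minimal sketch of such a gate (hypothetical names, not the firmware's actual code):

```cpp
// Return the index of the winning class, or -1 ("N", not a number) when its
// score stays below good_threshold. Hypothetical helper for illustration.
int GateClassification(const float* scores, int num_classes,
                       float good_threshold) {
  int best = 0;
  for (int i = 1; i < num_classes; ++i) {
    if (scores[i] > scores[best]) best = i;
  }
  return (scores[best] >= good_threshold) ? best : -1;
}
```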

PostProcessing

@@ -636,6 +658,98 @@ textarea { Password for MQTT authentication + + + + + + Enable MQTT Retain Flag + + + + + + Enable or disable the retain flag for all MQTT entries + + + + +
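The new `SetRetainFlag` option maps to MQTT's retain bit: the broker stores the last retained message per topic and replays it to every new subscriber, so a dashboard sees the most recent reading immediately instead of waiting for the next measurement cycle. With ESP-IDF's esp-mqtt client the flag is the last argument of the publish call (the topic name here is a placeholder, not necessarily the firmware's):

```cpp
#include "mqtt_client.h"

// Publish a reading with the retain bit set. A length of 0 lets esp-mqtt
// take strlen(value). The retain argument mirrors the SetRetainFlag option.
void PublishValueRetained(esp_mqtt_client_handle_t client, const char* value) {
  esp_mqtt_client_publish(client, "watermeter/main/value", value,
                          /*len=*/0, /*qos=*/1, /*retain=*/1);
}
```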

InfluxDB

+ + + + + + + Uri + + + + + + URI of the HTTP interface to InfluxDB, without trailing slash, e.g. http://IP-Address:Port + + + + + + + + Database + + + + + + Database name in which to publish the read value. + + + + + + + + Measurement + + + + + + Measurement name under which to publish the read value. + + + + + + + + user + + + + + + User for InfluxDB authentication + + + + + + + + password + + + + + + Password for InfluxDB authentication + + +
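The Uri/Database/Measurement/user/password fields describe an InfluxDB 1.x HTTP write target: the value is posted in line protocol to `<Uri>/write?db=<Database>`. A minimal sketch with ESP-IDF's `esp_http_client` (URL, database, and measurement names are placeholders; the actual `jomjol_influxdb` component may differ, for instance in how it handles authentication):

```cpp
#include <cstdio>
#include <cstring>
#include "esp_http_client.h"

// POST one line-protocol record, e.g. "watermeter value=123.45", to an
// InfluxDB 1.x endpoint. Illustrative only.
void WriteToInfluxDB(const char* uri, const char* database, const char* line) {
  char url[256];
  std::snprintf(url, sizeof(url), "%s/write?db=%s", uri, database);

  esp_http_client_config_t config = {};
  config.url = url;
  esp_http_client_handle_t client = esp_http_client_init(&config);
  esp_http_client_set_method(client, HTTP_METHOD_POST);
  esp_http_client_set_post_field(client, line, std::strlen(line));
  esp_http_client_perform(client);
  esp_http_client_cleanup(client);
}
```

A call such as `WriteToInfluxDB("http://192.168.1.10:8086", "meter", "watermeter value=123.45");` would then publish one reading (all names here are examples).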

AutoTimer

@@ -678,7 +792,7 @@ textarea { - + @@ -1769,21 +1883,18 @@ function UpdateInput() { WriteParameter(param, category, "Alignment", "SearchFieldY", false); WriteParameter(param, category, "Alignment", "AlignmentAlgo", true); -// WriteParameter(param, category, "Digits", "Model", false); + WriteParameter(param, category, "Digits", "CNNGoodThreshold", true); WriteParameter(param, category, "Digits", "LogImageLocation", true); WriteParameter(param, category, "Digits", "LogfileRetentionInDays", true); - WriteParameter(param, category, "Digits", "ModelInputSize", false); +// WriteParameter(param, category, "Digits", "ModelInputSize", false); -// WriteParameter(param, category, "Analog", "Model", false); WriteParameter(param, category, "Analog", "LogImageLocation", true); WriteParameter(param, category, "Analog", "LogfileRetentionInDays", true); -// WriteParameter(param, category, "Analog", "ExtendedResolution", true); - WriteParameter(param, category, "Analog", "ModelInputSize", false); +// WriteParameter(param, category, "Analog", "ModelInputSize", false); WriteParameter(param, category, "PostProcessing", "PreValueUse", true); WriteParameter(param, category, "PostProcessing", "PreValueAgeStartup", true); WriteParameter(param, category, "PostProcessing", "AllowNegativeRates", true); -// WriteParameter(param, category, "PostProcessing", "MaxRateValue", true); WriteParameter(param, category, "PostProcessing", "ErrorMessage", true); WriteParameter(param, category, "PostProcessing", "CheckDigitIncreaseConsistency", true); @@ -1792,7 +1903,14 @@ function UpdateInput() { WriteParameter(param, category, "MQTT", "ClientID", true); WriteParameter(param, category, "MQTT", "user", true); WriteParameter(param, category, "MQTT", "password", true); + WriteParameter(param, category, "MQTT", "SetRetainFlag", true); + WriteParameter(param, category, "InfluxDB", "Uri", true); + WriteParameter(param, category, "InfluxDB", "Database", true); + WriteParameter(param, category, "InfluxDB", "Measurement", true); + WriteParameter(param, category, "InfluxDB", "user", true); + WriteParameter(param, category, "InfluxDB", "password", true); + WriteParameter(param, category, "GPIO", "IO0", true); WriteParameter(param, category, "GPIO", "IO1", true); WriteParameter(param, category, "GPIO", "IO3", true); @@ -1847,6 +1965,7 @@ function ReadParameterAll() category["Analog"]["enabled"] = document.getElementById("Category_Analog_enabled").checked; category["Digits"]["enabled"] = document.getElementById("Category_Digits_enabled").checked; category["MQTT"]["enabled"] = document.getElementById("Category_MQTT_enabled").checked; + category["InfluxDB"]["enabled"] = document.getElementById("Category_InfluxDB_enabled").checked; category["GPIO"]["enabled"] = document.getElementById("Category_GPIO_enabled").checked; ReadParameter(param, "MakeImage", "LogImageLocation", true); @@ -1864,15 +1983,16 @@ function ReadParameterAll() ReadParameter(param, "Alignment", "SearchFieldY", false); ReadParameter(param, "Alignment", "AlignmentAlgo", true); - ReadParameter(param, "Digits", "Model", false); + ReadParameter(param, "Digits", "Model", false); + ReadParameter(param, "Digits", "CNNGoodThreshold", true); ReadParameter(param, "Digits", "LogImageLocation", true); ReadParameter(param, "Digits", "LogfileRetentionInDays", true); - ReadParameter(param, "Digits", "ModelInputSize", false); +// ReadParameter(param, "Digits", "ModelInputSize", false); ReadParameter(param, "Analog", "Model", false); ReadParameter(param, "Analog", "LogImageLocation", true); 
ReadParameter(param, "Analog", "LogfileRetentionInDays", true); - ReadParameter(param, "Analog", "ModelInputSize", false); +// ReadParameter(param, "Analog", "ModelInputSize", false); ReadParameter(param, "PostProcessing", "PreValueUse", true); ReadParameter(param, "PostProcessing", "PreValueAgeStartup", true); @@ -1885,6 +2005,12 @@ function ReadParameterAll() ReadParameter(param, "MQTT", "ClientID", true); ReadParameter(param, "MQTT", "user", true); ReadParameter(param, "MQTT", "password", true); + ReadParameter(param, "MQTT", "SetRetainFlag", true); + + ReadParameter(param, "InfluxDB", "Uri", true); + ReadParameter(param, "InfluxDB", "Measurement", true); + ReadParameter(param, "InfluxDB", "user", true); + ReadParameter(param, "InfluxDB", "password", true); ReadParameter(param, "GPIO", "IO0", true); ReadParameter(param, "GPIO", "IO1", true); diff --git a/sd-card/html/gethost.js b/sd-card/html/gethost.js index 8a9df763..751e4b11 100644 --- a/sd-card/html/gethost.js +++ b/sd-card/html/gethost.js @@ -13,7 +13,7 @@ function getbasepath(){ { // host = "http://192.168.2.219"; // jomjol interner test // host = "http://192.168.178.46"; // jomjol interner test - host = "http://192.168.178.34"; // jomjol interner Real + host = "http://192.168.178.62"; // jomjol interner Real // host = "http://192.168.43.191"; // host = "."; // jomjol interner localhost diff --git a/sd-card/html/index_configure.html b/sd-card/html/index_configure.html index 090cae08..da765725 100644 --- a/sd-card/html/index_configure.html +++ b/sd-card/html/index_configure.html @@ -73,7 +73,7 @@ li.dropdown { -

Configure Watermeter

+

Configure Watermeter

  • Main Page @@ -97,5 +97,41 @@ li.dropdown {
    + + + \ No newline at end of file diff --git a/sd-card/html/ota_page.html b/sd-card/html/ota_page.html index 7476fd23..e86831fa 100644 --- a/sd-card/html/ota_page.html +++ b/sd-card/html/ota_page.html @@ -162,6 +162,7 @@ function init(){ function doUpdate() { if (confirm("Are you sure to update the firmware?")) { var stringota = "/ota?file=firmware.bin"; + document.getElementById("doUpdate").disabled = true; var xhttp = new XMLHttpRequest(); @@ -188,6 +189,8 @@ function doUpdate() { function doUpdatehtml() { if (confirm("Are you sure to update the /html content?")) { var stringota = "/ota?task=unziphtml"; + document.getElementById("doUpdatehtml").disabled = true; + var xhttp = new XMLHttpRequest(); diff --git a/sd-card/html/readconfigparam.js b/sd-card/html/readconfigparam.js index adaf490c..7977fce9 100644 --- a/sd-card/html/readconfigparam.js +++ b/sd-card/html/readconfigparam.js @@ -83,9 +83,11 @@ function ParseConfig() { category[catname]["found"] = false; param[catname] = new Object(); ParamAddValue(param, catname, "Model"); + ParamAddValue(param, catname, "CNNGoodThreshold", 1); ParamAddValue(param, catname, "LogImageLocation"); ParamAddValue(param, catname, "LogfileRetentionInDays"); - ParamAddValue(param, catname, "ModelInputSize", 2); +// ParamAddValue(param, catname, "ModelInputSize", 2); + var catname = "Analog"; category[catname] = new Object(); @@ -95,7 +97,7 @@ function ParseConfig() { ParamAddValue(param, catname, "Model"); ParamAddValue(param, catname, "LogImageLocation"); ParamAddValue(param, catname, "LogfileRetentionInDays"); - ParamAddValue(param, catname, "ModelInputSize", 2); +// ParamAddValue(param, catname, "ModelInputSize", 2); var catname = "PostProcessing"; category[catname] = new Object(); @@ -124,7 +126,19 @@ function ParseConfig() { ParamAddValue(param, catname, "ClientID"); ParamAddValue(param, catname, "user"); ParamAddValue(param, catname, "password"); - + ParamAddValue(param, catname, "SetRetainFlag"); + + var catname = "InfluxDB"; + category[catname] = new Object(); + category[catname]["enabled"] = false; + category[catname]["found"] = false; + param[catname] = new Object(); + ParamAddValue(param, catname, "Uri"); + ParamAddValue(param, catname, "Database"); + ParamAddValue(param, catname, "Measurement"); + ParamAddValue(param, catname, "user"); + ParamAddValue(param, catname, "password"); + var catname = "GPIO"; category[catname] = new Object(); category[catname]["enabled"] = false; diff --git a/sd-card/html/testcnn.html b/sd-card/html/testcnn.html new file mode 100644 index 00000000..ab2b44c2 --- /dev/null +++ b/sd-card/html/testcnn.html @@ -0,0 +1 @@ +

    Not-a-Number

    \ No newline at end of file diff --git a/sd-card/html/version.txt b/sd-card/html/version.txt index 2e7dfda2..83ecc0b2 100644 --- a/sd-card/html/version.txt +++ b/sd-card/html/version.txt @@ -1 +1 @@ -13.1.1 \ No newline at end of file +14.2.1 \ No newline at end of file