diff --git a/FeatureRequest.md b/FeatureRequest.md index e2e97fd1..9f7f29c4 100644 --- a/FeatureRequest.md +++ b/FeatureRequest.md @@ -11,6 +11,10 @@ ____ +#### #26 Changes behaviour for "N" replacement + +* in case the higher digit has already increased by at least 1 - don't set the "N" to the last value, but to "0" +* https://github.com/jomjol/AI-on-the-edge-device/issues/792 #### #25 Trigger Measurement via MQTT diff --git a/README.md b/README.md index fc27496a..a36a7b8d 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,15 @@ In other cases you can contact the developer via email: ~/.ssh/id_rsa_base64 + - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa + - chmod 600 ~/.ssh/id_rsa + - echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config + +before_script: + # Add gitlab ssh key + - *add_ssh_key + # Set git config + - *set_git_config + +.build_esp32s3: &build_esp32s3 + - idf.py set-target esp32s3 build + +.build_esp32: &build_esp32 + - idf.py set-target esp32 build + +build_demo: + stage: build + image: $CI_DOCKER_REGISTRY/esp32-ci-env:esp-nn + tags: + - build + script: + # Clone IDF + - git clone --recursive --single-branch -b release/v4.4 --reference-if-able /local_references/gitlab/ https://gitlab-ci-token:${BOT_TOKEN}@gitlab.espressif.cn:6688/espressif/esp-idf.git + - cd esp-idf + - ./install.sh + - . ./export.sh + - cd .. + # Build examples now + - cd test_app + # Build esp32s3 + - *build_esp32s3 + # Build esp32 + - *build_esp32 + - cd - diff --git a/code/components/esp-nn/CMakeLists.txt b/code/components/esp-nn/CMakeLists.txt new file mode 100644 index 00000000..da463779 --- /dev/null +++ b/code/components/esp-nn/CMakeLists.txt @@ -0,0 +1,48 @@ +idf_build_get_property(idf_target IDF_TARGET) + +set(c_srcs + "src/activation_functions/esp_nn_relu_ansi.c" + "src/basic_math/esp_nn_add_ansi.c" + "src/basic_math/esp_nn_mul_ansi.c" + "src/convolution/esp_nn_conv_ansi.c" + "src/convolution/esp_nn_depthwise_conv_ansi.c" + "src/fully_connected/esp_nn_fully_connected_ansi.c" + "src/softmax/esp_nn_softmax_ansi.c" + "src/softmax/esp_nn_softmax_opt.c" + "src/pooling/esp_nn_avg_pool_ansi.c" + "src/pooling/esp_nn_max_pool_ansi.c") + +if(CONFIG_IDF_TARGET_ESP32S3) + set(s3_srcs + "src/common/esp_nn_common_functions_esp32s3.S" + "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S" + "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S" + "src/activation_functions/esp_nn_relu_s8_esp32s3.S" + "src/basic_math/esp_nn_add_s8_esp32s3.S" + "src/basic_math/esp_nn_mul_s8_esp32s3.S" + "src/convolution/esp_nn_conv_esp32s3.c" + "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c" + "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S" + "src/convolution/esp_nn_conv_s16_mult8_1x1_esp32s3.S" + "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S" + "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S" + "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S" + "src/pooling/esp_nn_max_pool_s8_esp32s3.S" + "src/pooling/esp_nn_avg_pool_s8_esp32s3.S") +endif() + +idf_component_register(SRCS "${c_srcs}" + "${s3_srcs}" + INCLUDE_DIRS "include" "src/common") + 
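+# ESP32-S3 builds include the assembly kernels above, so they get tuned compile options (-mlongcalls, -O2, no loop unrolling); other targets build only the ANSI C sources and just suppress unused-function warnings.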
+if(CONFIG_IDF_TARGET_ESP32S3) + target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function) +else() + target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function) +endif() \ No newline at end of file diff --git a/code/components/esp-nn/Kconfig.projbuild b/code/components/esp-nn/Kconfig.projbuild new file mode 100644 index 00000000..3bd683fc --- /dev/null +++ b/code/components/esp-nn/Kconfig.projbuild @@ -0,0 +1,29 @@ +menu "ESP-NN" + +choice NN_OPTIMIZATIONS + bool "Optimization for nn functions" + default NN_OPTIMIZED + help + Use ANSI-C versions for verification and debug purpose. + Optimisations are automatically picked up for a chipset. + For ESP32-S3, assembly Optimisations are selected. + For ESP32, just the ANSI C versions are selected for now. + +config NN_ANSI_C + bool "ANSI C" + help + ANSI C versions for verification and debug purposes. +config NN_OPTIMIZED + bool "Optimized versions" + help + Optimisations are automatically picked up for a chipset. + For ESP32-S3, assembly Optimisations are selected. + For ESP32, just the ANSI C versions are selected for now. +endchoice + +config NN_OPTIMIZATIONS + int + default 0 if NN_ANSI_C + default 1 if NN_OPTIMIZED + +endmenu diff --git a/code/components/esp-nn/LICENSE b/code/components/esp-nn/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/code/components/esp-nn/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/code/components/esp-nn/README.md b/code/components/esp-nn/README.md new file mode 100644 index 00000000..0d988c55 --- /dev/null +++ b/code/components/esp-nn/README.md @@ -0,0 +1,54 @@ +# ESP-NN + +The library contains optimised NN (Neural Network) functions for various Espressif chipsets. + +* Supported platforms: + * TensorFlow Lite Micro (TFLite Micro). 
Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples) + +* Supported ESP chipsets include: + * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3) + * ESP32 (ANSI C versions) + +## Performance + +### Kernelwise performance for s8 versions: + + * Kernelwise performance on ESP32-S3 chip + * Numbers are ticks taken for the kernel to execute + * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB + + | Function | ANSI C | ESP32-S3 Opt | Opt Ratio | Data info | Memory | + | ----------------| --------|---------|---------|-------------|-----------| + | elementwise_add | 320397 | 87119 | 3.68 | size = 1615 | External | + | elementwise_mul | 125958 | 44239 | 2.85 | size = 1615 | External | + | convolution | 4663012 | 428675 | 10.88 | input(10,10), filter(64x1x1x64) | External | + | convolution | 301014 | 32433 | 9.28 | input(8,8), filter(16x1x1x16) | External | + | convolution | 2115418 | 1020923 | 2.07 | input(10,10), filter(64x3x3x3) | External | + | depthwise conv | 1190062 | 203278 | 5.85 | input (18, 18), pad(0,0), stride(1,1) filter: 1x3x3x16 | External | + | depthwise conv | 837072 | 182335 | 4.59 | input (12, 12), pad(1,1), stride(1,1) filter: 8x5x5x4 | External | + | max pool | 485714 | 76747 | 6.33 | input(16,16), filter (1x3x3x16) | Internal | + | avg pool | 541462 | 160580 | 3.37 | input(16,16), filter (1x3x3x16) | Internal | + | fully connected | 15853 | 9547 | 1.66 | len: 265, ch = 3 | Internal | + | prelu (relu6) | 19472 | 2734 | 7.12 | size = 1615 | Internal | + + +## Configuration + + * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS` + * There are two options presented: + * Optimized versions + * ANSI C + + * Default selection is for `Optimized versions`. For ESP32-S3, assembly versions are automatically selected, whereas for ESP32, ANSI-C versions are selected by default. + * For debugging purposes, you may want to select `ANSI C` + + +## Contributing + +If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on GitHub. + +For general questions related to this library, please use the esp32.com forum. + +## Copyrights and License + +All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE. diff --git a/code/components/esp-nn/include/esp_nn.h b/code/components/esp-nn/include/esp_nn.h new file mode 100644 index 00000000..a4081871 --- /dev/null +++ b/code/components/esp-nn/include/esp_nn.h @@ -0,0 +1,46 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(CONFIG_NN_OPTIMIZED) +#ifdef CONFIG_IDF_TARGET_ESP32S3 +#define ARCH_ESP32_S3 1 +#endif +#ifdef CONFIG_IDF_TARGET_ESP32 +#define ARCH_ESP32 1 +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* reference kernels included by default */ +#include "esp_nn_ansi_headers.h" + +#if defined(CONFIG_NN_OPTIMIZED) +#ifdef ARCH_ESP32_S3 +#include "esp_nn_esp32s3.h" +#endif +#ifdef ARCH_ESP32 +#include "esp_nn_esp32.h" +#endif +#else +#include "esp_nn_ansi_c.h" +#endif + +#ifdef __cplusplus +} +#endif diff --git a/code/components/esp-nn/include/esp_nn_ansi_c.h b/code/components/esp-nn/include/esp_nn_ansi_c.h new file mode 100644 index 00000000..1612228c --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_ansi_c.h @@ -0,0 +1,46 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file Header definitions to include for ANSI C versions. + * These are just typedefs to pick up ANSI versions. + */ + +#pragma once + +#include "esp_nn_ansi_headers.h" + +#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi +#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi + +#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi + +#define esp_nn_conv_s8 esp_nn_conv_s8_ansi + +#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi +#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi + +#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi +#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi + +#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi + +#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi +#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi + +#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi + +#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi +#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi +#define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi diff --git a/code/components/esp-nn/include/esp_nn_ansi_headers.h b/code/components/esp-nn/include/esp_nn_ansi_headers.h new file mode 100644 index 00000000..f871537d --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_ansi_headers.h @@ -0,0 +1,283 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +/** + * @file Header definitions to include for esp_nn reference functions + */ + +#include + +/************************** Basic math functions ****************************/ + +/** + * @brief elementwise addition + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * shift values are expected to be <= 0 + */ +void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); +/** + * @brief elementwise multiplication + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * output shift is expected to be <= 0 + */ +void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + + +/************************** Convolution functions *****************************/ + +/** + * @brief depthwise convolution per channel + * + * @note inputs type: int8_t, output: int8_t + * Version used in tflite is per channel. + * This version follows the same footsprints. + * Meaning, it has per out_channel shift and multiplier for + * requantization + * + * optimization notes: Though input_offset is int32 type, + * offset values are contained in 8 bits [-128, 127] + */ +void esp_nn_depthwise_conv_s8_ansi(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief 2d-convolution channelwise + * + * @note operation: result += (input + offset) * filter + * + * inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_conv_s8_ansi(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +int esp_nn_get_conv_scratch_size_ansi(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_ch, + 
const uint16_t out_ch, + const uint16_t filter_wd, + const uint16_t filter_ht); +void esp_nn_set_conv_scratch_buf_ansi(const void *buf); + +int esp_nn_get_depthwise_conv_scratch_size_ansi(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t ch_mult, + const uint16_t filter_wd, + const uint16_t filter_ht); +void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf); + +/************************** Activation functions *****************************/ + +/** + * @brief relu6 + * + * @note inout: int8_t + */ +void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size); + +/************************** Pooling functions *****************************/ + + +/** + * @brief max_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_max_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + +/** + * @brief avg_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_avg_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + + +/************************** Fully connected functions ***********************/ + +/** + * @brief fully connected + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, + const int32_t input_offset, + const uint16_t row_len, + const int8_t *filter_data, + const int32_t filter_offset, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief Get scratch buffer size needed by softmax function + * + * @param width + * @param height + * @return size in bytes + * + * @note buffer must be 4 byte aligned + */ +int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height); + +/* ANSI C function to be hooked up when optimised version needed */ +int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height); + +/** + * @brief Set scratch buffer to be used by softmax function + * + * @param buffer this can be NULL if one needs to unset it + * must be aligned to 4 bytes + */ +void esp_nn_set_softmax_scratch_buf_ansi(void *buffer); + +/* ANSI C function to be hooked up when optimised version needed */ +void esp_nn_set_softmax_scratch_buf_opt(void *buffer); + +/** + * @brief reference softmax function + * + * @note inputs type: int8_t, output: int8_t + */ +void esp_nn_softmax_s8_ansi(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const 
int32_t shift, + const int32_t diff_min, + int8_t *output_data); + +/** + * @brief optimised version of softmax function + * + * @note the function uses extra buffer (4 * width bytes) + * hence, scratch buffers must be set before calling this. + */ +void esp_nn_softmax_s8_opt(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data); diff --git a/code/components/esp-nn/include/esp_nn_esp32.h b/code/components/esp-nn/include/esp_nn_esp32.h new file mode 100644 index 00000000..03fd8216 --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_esp32.h @@ -0,0 +1,48 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file Header definitions to include for esp_nn optimized functions for + * the ESP32 platform. + * We are hooking up just the C versions for now. + * The file hence is exactly same as `esp_nn_ansi_c.h` + */ + +#pragma once + +#include "esp_nn_ansi_headers.h" + +#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi +#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi + +#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi + +#define esp_nn_conv_s8 esp_nn_conv_s8_ansi + +#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi +#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi + +#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi +#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi + +#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi + +#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi +#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi + +#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi + +#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt +#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt +#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt diff --git a/code/components/esp-nn/include/esp_nn_esp32s3.h b/code/components/esp-nn/include/esp_nn_esp32s3.h new file mode 100644 index 00000000..58b544e4 --- /dev/null +++ b/code/components/esp-nn/include/esp_nn_esp32s3.h @@ -0,0 +1,261 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** + * @file Header definitions to include for esp_nn optimized functions for + * the ESP32-S3 platform + */ + +#pragma once + +#include +#include "esp_nn_ansi_headers.h" + +/************************** Basic math functions *****************************/ + + +/** + * @brief elementwise addition + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * shift values are expected to be <= 0 + */ +void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + +/** + * @brief elementwise multiplication + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * output shift is expected to be <= 0 + */ +void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size); + + +/************************** Convolution functions *****************************/ + +/** + * @brief depthwise convolution per channel + * + * @note inputs type: int8_t, output: int8_t + * Version used in tflite is per channel. + * This version follows the same footsprints. 
+ * Meaning, it has per out_channel shift and multiplier for + * requantization + * + * optimization notes: Though input_offset is int32 type, + * offset values are contained in 8 bits [-128, 127] + */ +void esp_nn_depthwise_conv_s8_esp32s3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief 2d - convolution channelwise + * + * @note operation: result += (input + offset) * filter + * + * inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_conv_s8_esp32s3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +int esp_nn_get_conv_scratch_size_esp32s3(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_ch, + const uint16_t out_ch, + const uint16_t filter_wd, + const uint16_t filter_ht); +void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf); + +int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t ch_mult, + const uint16_t filter_wd, + const uint16_t filter_ht); +void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf); + +/************************** Pooling functions *****************************/ + +/** + * @brief max_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_max_pool_s8_esp32s3(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + +/** + * @brief avg_pool + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + */ +void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels); + + 
+/************************** Fully connected functions *****************************/ + +/** + * @brief fully connected + * + * @note inputs type: int8_t, output: int8_t + * input offsets: although int32_t, they are contained in 8 bits [-128, 127] + * + * Current version works only on aligned input. + * row_len and channels should both be multiple of 8. + */ +void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data, + const int32_t input_offset, + const uint16_t row_len, + const int8_t *filter_data, + const int32_t filter_offset, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief relu6 + * + * @note inout: int8_t + */ +void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size); + +/********************** function defines ***************************/ + +#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3 +#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3 + +#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3 + +#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3 +#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3 + +#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3 +#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3 + +#define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3 + +#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3 + +#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3 +#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3 + +#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3 + +#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt +#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt +#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt diff --git a/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c b/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c new file mode 100644 index 00000000..1d4c3d11 --- /dev/null +++ b/code/components/esp-nn/src/activation_functions/esp_nn_relu_ansi.c @@ -0,0 +1,30 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include + +void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size) +{ + int32_t i; + + for (i = 0; i < size; i++) { + int32_t ip = data[i]; + + ip = max(ip, 0); + data[i] = min(ip, 6); + } +} diff --git a/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c b/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c new file mode 100644 index 00000000..617386cf --- /dev/null +++ b/code/components/esp-nn/src/basic_math/esp_nn_add_ansi.c @@ -0,0 +1,97 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data, + const uint8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + uint8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size) +{ + for (int i = 0; i < size; i++) { + int32_t tmp1 = input1_data[i] + input1_offset; + int32_t tmp2 = input2_data[i] + input2_offset; + + tmp1 <<= left_shift; + tmp2 <<= left_shift; + + tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult); + tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult); + + tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift); + tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift); + + int32_t out = tmp1 + tmp2; + out = esp_nn_sat_round_doubling_high_mul(out, out_mult); + out = esp_nn_div_by_power_of_two(out, -out_shift); + out = out + out_offset; + + out = max(activation_min, min(out, activation_max)); + output[i] = (uint8_t) out; + } +} + +void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + const int32_t input1_mult, + const int32_t input2_mult, + const int32_t input1_shift, + const int32_t input2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size) +{ + for (int i = 0; i < size; i++) { + int32_t tmp1 = input1_data[i] + input1_offset; + int32_t tmp2 = input2_data[i] + input2_offset; + + tmp1 <<= left_shift; + tmp2 <<= left_shift; + + tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult); + tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult); + + tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift); + tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift); + + int32_t out = tmp1 + tmp2; + out = esp_nn_sat_round_doubling_high_mul(out, out_mult); + out = esp_nn_div_by_power_of_two(out, -out_shift); + out = out + out_offset; + + out = max(activation_min, min(out, activation_max)); + output[i] = (int8_t) out; + } +} diff --git 
a/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c b/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c new file mode 100644 index 00000000..db8e8cc0 --- /dev/null +++ b/code/components/esp-nn/src/basic_math/esp_nn_mul_ansi.c @@ -0,0 +1,42 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, + const int8_t *input2_data, + const int32_t input1_offset, + const int32_t input2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t activation_min, + const int32_t activation_max, + const int32_t size) +{ + for (int i = 0; i < size; i++) { + int32_t tmp1 = input1_data[i] + input1_offset; + int32_t tmp2 = input2_data[i] + input2_offset; + + int32_t out = tmp1 * tmp2; + out = esp_nn_multiply_by_quantized_mult(out, out_mult, out_shift); + out = out + out_offset; + + out = max(activation_min, min(out, activation_max)); + output[i] = (int8_t) out; + } +} diff --git a/code/components/esp-nn/src/common/common_functions.h b/code/components/esp-nn/src/common/common_functions.h new file mode 100644 index 00000000..9a5f0dcc --- /dev/null +++ b/code/components/esp-nn/src/common/common_functions.h @@ -0,0 +1,218 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +/** + * c99 standard still doesn't strictly inline functions + * We need to use attribute as well to do this. + */ +#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline + +/* min/max macros */ +#ifndef max +#define max(a, b) ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a > _b ? _a : _b; \ +}) + +#define min(a, b) ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? _a : _b; \ +}) +#endif + +__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in) +{ + __asm__ volatile("nsau %0, %0" : "+r" (in)); + return in; +} + +__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64) +{ + int32_t sign = (int32_t) (val64 >> 63); + int32_t to_add = sign & ((1ul << 31) - 1); + return (int32_t) ((int64_t) (val64 + to_add) >> 31); +} + +/** + * Signed saturate a 32 bit value to 8 bits keeping output in 32 bit variable. 
+ */ +__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in) +{ + __asm__ volatile("clamps %0, %0, 7" : "+a"(in)); + return in; +} + +__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1) +{ + int32_t result; + int64_t in0_64 = (int64_t) in0; + bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN); + + /* Nudge value */ + int64_t nudge_val = 1 << 30; + if ((in0 < 0) ^ (in1 < 0)) { + nudge_val = 1 - nudge_val; + } + + /* Multiply and add nudge */ + int64_t mult = in0_64 * in1 + nudge_val; + + /* Round and pickup 32 bits */ + result = esp_nn_pick_sat_high32_of64(mult); + + return overflow ? INT32_MAX : result; +} + +/** + * fast version + * this will fail for values closer to INT32_MAX and INT32_MIN by `1 << (exponent - 1)`. + * We can afford to do this because we are at the very last stage of filter. + * Also it is pretty rare condition as our output is going to be 8 bit. + */ +__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent) +{ + int32_t to_add = (1 << (exponent - 1)) - (val < 0); + return (int32_t) ((val + to_add) >> exponent); +} + +__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent) +{ + int32_t result; + + const int32_t mask = (1 << exponent) - 1; + const int32_t remainder = val & mask; + + result = val >> exponent; + int32_t threshold = (mask >> 1) + (result < 0); + + if (remainder > threshold) { + result += 1; + } + return result; +} + +__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift) +{ + int32_t left_shift = shift > 0 ? shift : 0; + int32_t right_shift = shift > 0 ? 0 : -shift; + int32_t result = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult); + return esp_nn_div_by_power_of_two(result, right_shift); +} + +__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift) +{ + int32_t left_shift = max(shift, 0); + int32_t right_shift = left_shift - shift; + + int64_t nudge_val = 1 << 30; + int64_t in0_64 = (int64_t) (x << left_shift); + + /* Multiply and add nudge */ + int64_t mult_64 = in0_64 * mult + nudge_val; + int32_t result = (int32_t) (mult_64 >> 31); + if (right_shift) { + result = esp_nn_div_by_power_of_two_fast(result, right_shift); + } + return result; +} + +static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t pad_val, + const uint16_t pad_wd, + const uint16_t pad_ht) +{ + /* memset with pad_val */ + memset(dst, pad_val, ((input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht)) * channels * 2); + dst += (pad_wd + input_wd + pad_wd) * channels; + + for (int i = 0; i < input_ht; i++) { + dst += pad_wd * channels; + for (int j = 0; j < input_wd * channels; j++) { + *dst++ = *src++; + } + dst += pad_wd * channels; + } +} + +#if 0 +static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t pad_val, + const uint16_t pad_wd, + const uint16_t pad_ht) +{ + for (int i = 0; i < input_ht; i++) { + for (int j = 0; j < input_wd * channels; j++) { + *dst++ = *src++; + } + memset(dst, pad_val, pad_wd * channels); + dst += pad_wd * channels; + } + /* pad end `pad_ht` lines at end */ + memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels); +} +#endif + +/** + * @brief convert 8 bit input data to 16 bit + * + * @param src 
int8_t source data + * @param dst int16_t dst data + * @param size length of data + * @param offset offset to be added to src data. Range: [-128, 127] + */ +__NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst, + const int size, const int32_t offset) +{ + int i = 0; + for (; i < size; i += 2) { + dst[i + 0] = src[i + 0] + offset; + dst[i + 1] = src[i + 1] + offset; + } + if(i < size) { + dst[i] = src[i] + offset; + } +} + +/** + * @brief convert 8 bit input data to 16 bit + * + * @param src int8_t source data + * @param dst int16_t dst data + * @param size length of data + */ +__NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size) +{ + int i = 0; + for (; i < size; i += 2) { + dst[i + 0] = src[i + 0]; + dst[i + 1] = src[i + 1]; + } + if(i < size) { + dst[i] = src[i]; + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c new file mode 100644 index 00000000..d04f78e1 --- /dev/null +++ b/code/components/esp-nn/src/convolution/esp_nn_conv_ansi.c @@ -0,0 +1,175 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +int esp_nn_get_conv_scratch_size_ansi(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_ch, + const uint16_t out_ch, + const uint16_t filter_wd, + const uint16_t filter_ht) +{ + return 0; +} + +void esp_nn_set_conv_scratch_buf_ansi(const void *buf) +{ + +} + +/** + * Assumption 1: i/p channels == o/p channels + * Assumption 2: Pointers are valid + * Assumption 3: dialation width = 1 + */ +void esp_nn_conv_u8_ansi(const uint8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t filter_offset, + const int32_t *bias, + uint8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {//channel_loop + int32_t result = 0; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const 
int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + for (int in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + int32_t input_index = (idx_y * input_wd + idx_x) * in_channels + in_ch_idx; + int32_t filter_index = ((out_ch_idx * filter_ht + filter_y_idx) + * filter_wd + filter_x_idx) * in_channels + + in_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index] + filter_offset; + result += input_val * filter_val; + } + } + } + if (bias) { + result += bias[out_ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + int out_index = (out_y * out_wd + out_x) * out_channels + out_ch_idx; + out_data[out_index] = (uint8_t) result; + } + } + } +} + +/** + * Assumption 1: i/p channels == o/p channels + * Assumption 2: Pointers are valid + * Assumption 3: dialation width = 1 + */ +void esp_nn_conv_s8_ansi(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + const int32_t base_y = stride_ht * out_y - pad_ht; + const int32_t base_x = stride_wd * out_x - pad_wd; + + const int32_t filter_y_start = max(0, -base_y); + const int32_t filter_x_start = max(0, -base_x); + + const int32_t filter_y_end = min(filter_ht, input_ht - base_y); + const int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += + (input_data[input_base_offset + in_ch_idx] + input_offset) * + filter_data[filter_base_offset + in_ch_idx]; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c b/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c new file mode 100644 index 00000000..ea8fdfa5 --- /dev/null +++ 
b/code/components/esp-nn/src/convolution/esp_nn_conv_esp32s3.c @@ -0,0 +1,436 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +static int16_t *scratch_buffer = NULL; + +extern void esp_nn_conv_s16_mult8_1x1_esp32s3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max, + void *buffer /* scratch buffer */); + +extern void esp_nn_conv_s16_mult4_1x1_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max, + void *buffer /* scratch buffer */); + +extern void esp_nn_conv_s16_mult8_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst, + const int size, const int32_t offset); + +extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size); + +static void esp_nn_conv_s8_unrolled(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 
0; + + const int32_t base_y = stride_ht * out_y - pad_ht; + const int32_t base_x = stride_wd * out_x - pad_wd; + + const int32_t filter_y_start = max(0, -base_y); + const int32_t filter_x_start = max(0, -base_x); + + const int32_t filter_y_end = min(filter_ht, input_ht - base_y); + const int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += + (input_data[input_base_offset + in_ch_idx] + input_offset) * + filter_data[filter_base_offset + in_ch_idx]; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + + for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { + for (filter_x_idx = 0; filter_x_idx < filter_wd; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * filter_ht * filter_wd + + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; + const int8_t *input_data_ptr = input_data + input_base_offset; + const int8_t *filter_data_ptr = filter_data + filter_base_offset; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += (*input_data_ptr++ + input_offset) * *filter_data_ptr++; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid_3x3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_channels, + const int32_t input_offset, + const uint16_t stride_wd, 
+ const uint16_t stride_ht, + const int8_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + for (filter_y_idx = 0; filter_y_idx < 3; filter_y_idx++) { + for (filter_x_idx = 0; filter_x_idx < 3; filter_x_idx++) { + const int32_t in_row = base_y + filter_y_idx; + const int32_t in_col = base_x + filter_x_idx; + int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels; + int32_t filter_base_offset = out_ch_idx * in_channels * 3 * 3 + + (filter_y_idx * 3 + filter_x_idx) * in_channels; + const int8_t *input_data_ptr = input_data + input_base_offset; + const int8_t *filter_data_ptr = filter_data + filter_base_offset; + for (in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { + conv_out += (*input_data_ptr++ + input_offset) * *filter_data_ptr++; + } + } + } + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +static void esp_nn_conv_s8_pad_valid_ch3_3x3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int32_t out_ch_idx, out_y, out_x, filter_y_idx; + + /* use scratch_buffer to pre-compute offset factor */ + int16_t *filter_sum = (int16_t *) scratch_buffer; + const int8_t *filter_ptr = filter_data; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int16_t sum_val = 0; + for (int i = 0; i < 9; i++) { + sum_val += *filter_ptr++; + sum_val += *filter_ptr++; + sum_val += *filter_ptr++; + } + *filter_sum++ = sum_val; + } + + for (out_y = 0; out_y < out_ht; out_y++) { + for (out_x = 0; out_x < out_wd; out_x++) { + const int8_t *filter_data_ptr = filter_data; + const int32_t base_y = stride_ht * out_y; + const int32_t base_x = stride_wd * out_x; + const int8_t *input_base_ptr = input_data + (base_y * input_wd + base_x) * 3; + int16_t *filter_sum = (int16_t *) scratch_buffer; + for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { + int32_t conv_out = 0; + + for (filter_y_idx = 0; filter_y_idx < 3; filter_y_idx++) { + const int8_t *input_data_ptr = input_base_ptr + (filter_y_idx * input_wd) * 3; + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += 
(*input_data_ptr++) * (*filter_data_ptr++); + + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + conv_out += (*input_data_ptr++) * (*filter_data_ptr++); + } + + conv_out += *filter_sum++ * input_offset; + + if (bias) { + conv_out += bias[out_ch_idx]; + } + conv_out = esp_nn_multiply_by_quantized_mult_fast(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); + conv_out += out_offset; + conv_out = max(conv_out, activation_min); + conv_out = min(conv_out, activation_max); + *out_data++ = (int8_t) conv_out; + } + } + } +} + +int esp_nn_get_conv_scratch_size_esp32s3(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t in_ch, + const uint16_t out_ch, + const uint16_t filter_wd, + const uint16_t filter_ht) +{ + int filter_size = filter_wd * filter_ht * in_ch * out_ch; + int input_size = input_wd * input_ht * in_ch; + int transpose_buf_size = 8 * in_ch; /* to store intermediate data */ + int align_buf_size = 32; /* extra buffer for alignment */ + return 2 * (filter_size + input_size + transpose_buf_size) + align_buf_size; +} + +void esp_nn_set_conv_scratch_buf_esp32s3(void *buf) +{ + scratch_buffer = (int16_t *) buf; +} + +void esp_nn_conv_s8_esp32s3(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int filter_size = filter_wd * filter_ht * channels * out_channels; + int input_size = input_wd * input_ht * channels; + int align_len = 16 - (filter_size & 15); + int16_t *filter_data16 = scratch_buffer; + int16_t *input_data16 = scratch_buffer + filter_size + align_len; + + if (scratch_buffer == NULL) { + printf("esp_nn_conv error! 
scratch_buffer not set!\n"); + return; + } + + if (channels % 8 == 0 && filter_wd == 1 && filter_ht == 1 && + pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { + int scratch_offset = (int) (filter_data16 + filter_size); + void *scratch_buf = (void *) (scratch_offset + 16 - (scratch_offset & 15)); + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_conv_s16_mult8_1x1_esp32s3( + input, input_wd, input_ht, channels, input_offset, filter_data16, + bias, out_data, out_wd, out_ht, out_channels, out_offset, + out_shift, out_mult, activation_min, activation_max, scratch_buf); + } else if (channels % 4 == 0 && filter_wd == 1 && filter_ht == 1 && + (input_wd * input_ht) % 16 == 0 && /* TODO: remove this check */ + pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { + int scratch_offset = (int) (input_data16 + input_size); + void *scratch_buf = (void *) (scratch_offset + 16 - (scratch_offset & 15)); + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input, input_data16, input_size, input_offset); + esp_nn_conv_s16_mult4_1x1_esp32s3( + input_data16, input_wd, input_ht, channels, filter_data16, + bias, out_data, out_wd, out_ht, out_channels, out_offset, + out_shift, out_mult, activation_min, activation_max, scratch_buf); + } else if (channels % 8 == 0) { + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input, input_data16, input_size, input_offset); + esp_nn_conv_s16_mult8_esp32s3( + input_data16, input_wd, input_ht, channels, pad_wd, pad_ht, + stride_wd, stride_ht, filter_data16, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_channels, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else if (pad_wd == 0 && pad_ht == 0) { + if (filter_wd == 3 && filter_ht == 3 && channels == 3) { + esp_nn_conv_s8_pad_valid_ch3_3x3(input, input_wd, input_ht, input_offset, + stride_wd, stride_ht, filter_data, bias, + out_data, out_wd, out_ht, out_channels, out_offset, + out_shift, out_mult, activation_min, activation_max); + } else { + esp_nn_conv_s8_pad_valid(input, input_wd, input_ht, channels, input_offset, + stride_wd, stride_ht, filter_data, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_channels, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else { + /* Basic unrolled version */ + esp_nn_conv_s8_unrolled(input, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, + filter_data, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_channels, out_offset, out_shift, + out_mult, activation_min, activation_max); + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c new file mode 100644 index 00000000..9cac6cef --- /dev/null +++ b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_ansi.c @@ -0,0 +1,97 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +int esp_nn_get_depthwise_conv_scratch_size_ansi(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t ch_mult, + const uint16_t filter_wd, + const uint16_t filter_ht) +{ + return 0; +} + +void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf) +{ + +} + +void esp_nn_depthwise_conv_s8_ansi(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + for (int ch_mult_idx = 0; ch_mult_idx < ch_mult; ch_mult_idx++) { + int32_t result = 0; + const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[out_ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } + } +} diff --git a/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c new file mode 100644 index 00000000..c588c48f --- /dev/null +++ b/code/components/esp-nn/src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c @@ -0,0 +1,483 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +static int16_t *scratch_buffer = NULL; + +extern void esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult8_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult4_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t 
pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_depthwise_conv_s16_mult1_esp32s3(const int16_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int16_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max); + +extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size); + +extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst, + const int size, const int32_t offset); + +static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + int ch_mult_idx = 0; + for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) { + int32_t result0 = 0, result1 = 0, result2 = 0, result3 = 0; + const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val0 = filter_data[filter_index + 0]; + int32_t filter_val1 = filter_data[filter_index + 1]; + int32_t filter_val2 = filter_data[filter_index + 2]; + int32_t filter_val3 = filter_data[filter_index + 3]; + result0 += input_val * filter_val0; + result1 += input_val * filter_val1; + result2 += input_val * filter_val2; + result3 += input_val * filter_val3; + } + } + if (bias) { + result0 += bias[out_ch_idx + 0]; + result1 += bias[out_ch_idx + 1]; 
+ result2 += bias[out_ch_idx + 2]; + result3 += bias[out_ch_idx + 3]; + } + result0 = esp_nn_multiply_by_quantized_mult(result0, + out_mult[out_ch_idx + 0], out_shift[out_ch_idx + 0]); + result1 = esp_nn_multiply_by_quantized_mult(result1, + out_mult[out_ch_idx + 1], out_shift[out_ch_idx + 1]); + result2 = esp_nn_multiply_by_quantized_mult(result2, + out_mult[out_ch_idx + 2], out_shift[out_ch_idx + 2]); + result3 = esp_nn_multiply_by_quantized_mult(result3, + out_mult[out_ch_idx + 3], out_shift[out_ch_idx + 3]); + + result0 += out_offset; + result1 += out_offset; + result2 += out_offset; + result3 += out_offset; + + result0 = max(result0, activation_min); + result1 = max(result1, activation_min); + result2 = max(result2, activation_min); + result3 = max(result3, activation_min); + + result0 = min(result0, activation_max); + result1 = min(result1, activation_max); + result2 = min(result2, activation_max); + result3 = min(result3, activation_max); + + out_data[out_idx++] = result0; + out_data[out_idx++] = result1; + out_data[out_idx++] = result2; + out_data[out_idx++] = result3; + } + + /* left-over */ + for (; ch_mult_idx < ch_mult; ch_mult_idx++) { + int32_t result = 0; + const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; + + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[out_ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } + } +} + +void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int out_idx = 0; + for (int out_y = 0; out_y < out_ht; out_y++) { //height loop + const int16_t base_y = (out_y * stride_ht) - pad_ht; + for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop + const int16_t base_x = (out_x * stride_wd) - pad_wd; + for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop + int32_t result = 0; + /* Select filter so as the point doesn't lie outside block */ + int filter_y_start = max(0, -base_y); + int filter_x_start = max(0, -base_x); + int filter_y_end = min(filter_ht, 
input_ht - base_y); + int filter_x_end = min(filter_wd, input_wd - base_x); + + for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { + const int32_t idx_y = base_y + filter_y_idx; + for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { + const int32_t idx_x = base_x + filter_x_idx; + int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; + int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * channels + ch_idx; + int32_t input_val = input_data[input_index] + input_offset; + int32_t filter_val = filter_data[filter_index]; + result += input_val * filter_val; + } + } + if (bias) { + result += bias[ch_idx]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult[ch_idx], out_shift[ch_idx]); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + + out_data[out_idx++] = result; + } + } + } +} + +int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const uint16_t ch_mult, + const uint16_t filter_wd, + const uint16_t filter_ht) +{ + int filter_size = filter_wd * filter_ht * channels * ch_mult; + int padding_used = ((filter_wd == 3) && (filter_ht == 3)) * 2; + int input_size = (input_wd + padding_used) * (input_ht + padding_used) * channels; + return 2 * (filter_size + input_size) + 16; //16 for alignment +} + +void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf) +{ + scratch_buffer = (int16_t *) buf; +} + +/** + * Assumption 1: i/p channels == o/p channels + * Assumption 2: Pointers are valid + * Assumption 3: dialation width = 1 + */ +void esp_nn_depthwise_conv_s8_esp32s3(const int8_t *input_data, + const uint16_t input_wd, + const uint16_t input_ht, + const uint16_t channels, + const int32_t input_offset, + const uint16_t pad_wd, + const uint16_t pad_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t ch_mult, + const int8_t *filter_data, + const uint16_t filter_wd, + const uint16_t filter_ht, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_wd, + const uint16_t out_ht, + const int32_t out_offset, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + int filter_size = filter_wd * filter_ht * channels * ch_mult; + int align_len = 16 - (filter_size & 15); + int input_size = input_wd * input_ht * channels; + int16_t *filter_data16 = scratch_buffer; + int16_t *input_data16 = scratch_buffer + filter_size + align_len; + if (scratch_buffer == NULL) { + printf("esp_nn_depthwise_conv error! 
scratch_buffer not set!\n"); + return; + } + + if ((ch_mult == 1) && (channels % 8 == 0)) { + if ((filter_wd == 3) && (filter_ht == 3)) { + if ((channels % 16 == 0) && (pad_wd == 1) && (pad_ht == 1)) { + /* process in 8 bits */ + int8_t *filter_aligned = (int8_t *) scratch_buffer; + int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; + memcpy(filter_aligned, filter_data, filter_size); + esp_nn_aligned_s8_pad_with_value(input_data, input_padded, input_wd, input_ht, channels, + -input_offset, pad_wd, pad_ht); + esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + 2 * pad_wd, + input_ht + 2 * pad_ht, channels, input_offset, + stride_wd, stride_ht, filter_aligned, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else if ((pad_wd == 0) && (pad_ht == 0) && + // because this does not handle padding offset cases yet, run just for stride (1, 1). + // end padding of input with `-input_offset` should solve this + (stride_wd == 1) && (stride_ht == 1)) { + /* process in 8 bits */ + int8_t *filter_aligned = (int8_t *) scratch_buffer; + memcpy(filter_aligned, filter_data, filter_size); + esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_data, input_wd, input_ht, channels, input_offset, + stride_wd, stride_ht, filter_aligned, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { /* (channels % 8) == 0 && pad_wd == 1 && pad_ht == 1 */ + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); + esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else { // all other ch_mult == 1, `channels % 8 == 0` + esp_nn_depthwise_conv_s8_ch_mult1(input_data, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, + filter_data, filter_wd, filter_ht, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else if (ch_mult % 8 == 0) { + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); + if (filter_wd == 3 && filter_ht == 3) { + esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { + esp_nn_depthwise_conv_s16_mult8_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } + } else if (ch_mult % 4 == 0) { + esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); + esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); + esp_nn_depthwise_conv_s16_mult4_esp32s3(input_data16, input_wd, input_ht, channels, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data16, filter_wd, filter_ht, bias, + out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } else { + 
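+        /* None of the specialised paths above applies: ch_mult is neither 1 with
+         * channels % 8 == 0, nor a multiple of 8 or 4, so fall back to the
+         * generic unrolled C implementation. */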
esp_nn_depthwise_conv_s8_unrolled(input_data, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data, filter_wd, filter_ht, + bias, out_data, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + } +} diff --git a/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c b/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c new file mode 100644 index 00000000..6d800bc5 --- /dev/null +++ b/code/components/esp-nn/src/fully_connected/esp_nn_fully_connected_ansi.c @@ -0,0 +1,50 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, + const int32_t input_offset, + const uint16_t row_len, + const int8_t *filter_data, + const int32_t filter_offset, + const int32_t *bias, + int8_t *out_data, + const uint16_t out_channels, + const int32_t out_offset, + const int32_t out_shift, + const int32_t out_mult, + const int32_t activation_min, + const int32_t activation_max) +{ + for (int32_t out_c = 0; out_c < out_channels; ++out_c) { + int32_t result = 0; + for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { + int32_t filter_index = row_len * out_c + data_idx; + int32_t input_val = input_data[data_idx]; + int32_t filter_val = filter_data[filter_index]; + result += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias) { + result += bias[out_c]; + } + result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); + result += out_offset; + result = max(result, activation_min); + result = min(result, activation_max); + out_data[out_c] = (int8_t) result; + } +} diff --git a/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c b/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c new file mode 100644 index 00000000..03846aa0 --- /dev/null +++ b/code/components/esp-nn/src/pooling/esp_nn_avg_pool_ansi.c @@ -0,0 +1,72 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
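+
+/* Reference (ANSI C) implementation of signed 8-bit average pooling: the pooling
+ * window is clipped to the input box, the window sum is averaged with rounding to
+ * nearest, and the result is clamped to [activation_min, activation_max]. */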
+ +#include + +#include + +void esp_nn_avg_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels) +{ + int32_t base_y = -pad_ht; + for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { + int32_t base_x = -pad_wd; + for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { + for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) { + int32_t result = 0; + int32_t filter_cnt = 0; + /* Make sure filter does not cross the input box */ + int32_t filter_y_start = max(0, -base_y); + int32_t filter_x_start = max(0, -base_x); + + int32_t filter_y_end = min(filter_ht, input_ht - base_y); + int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) { + for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) { + int32_t in_x_idx = base_x + filter_x; + int32_t in_y_idx = base_y + filter_y; + int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx; + result += input[input_index]; + filter_cnt++; + } + } + + /* Rounded average */ + result = result > 0 ? (result + filter_cnt / 2) / filter_cnt + : (result - filter_cnt / 2) / filter_cnt; + + /* Activation function */ + result = max(result, activation_min); + result = min(result, activation_max); + + int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx; + output[output_index] = (int8_t) result; + } + } + } +} diff --git a/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c b/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c new file mode 100644 index 00000000..4ca5c42d --- /dev/null +++ b/code/components/esp-nn/src/pooling/esp_nn_max_pool_ansi.c @@ -0,0 +1,66 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
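+
+/* Reference (ANSI C) implementation of signed 8-bit max pooling: the pooling
+ * window is clipped to the input box, the maximum value inside it is taken and
+ * then clamped to [activation_min, activation_max]. */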
+ +#include + +#include + +void esp_nn_max_pool_s8_ansi(const int8_t *input, + const uint16_t input_wd, + const uint16_t input_ht, + int8_t *output, + const uint16_t output_wd, + const uint16_t output_ht, + const uint16_t stride_wd, + const uint16_t stride_ht, + const uint16_t filter_wd, + const uint16_t filter_ht, + const uint16_t pad_wd, + const uint16_t pad_ht, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t channels) +{ + int32_t base_y = -pad_ht; + for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { + int32_t base_x = -pad_wd; + for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { + /* Make sure filter does not cross the input box */ + int32_t filter_y_start = max(0, -base_y); + int32_t filter_x_start = max(0, -base_x); + int32_t filter_y_end = min(filter_ht, input_ht - base_y); + int32_t filter_x_end = min(filter_wd, input_wd - base_x); + + for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) { + int8_t result = INT8_MIN; + + for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) { + for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) { + int32_t in_x_idx = base_x + filter_x; + int32_t in_y_idx = base_y + filter_y; + int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx; + result = max(input[input_index], result); + } + } + + /* Activation function */ + result = max(result, activation_min); + result = min(result, activation_max); + + int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx; + output[output_index] = result; + } + } + } +} diff --git a/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c b/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c new file mode 100644 index 00000000..d71a8616 --- /dev/null +++ b/code/components/esp-nn/src/softmax/esp_nn_softmax_ansi.c @@ -0,0 +1,88 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "softmax_common.h" + +int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height) +{ + (void) width; + (void) height; + return 0; +} + +void esp_nn_set_softmax_scratch_buf_ansi(void *buffer) +{ + (void) buffer; + return; +} + +void esp_nn_softmax_s8_ansi(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data) +{ + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input mult, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. 
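+    //
+    // In outline, each row is then processed in three passes:
+    //   1) find max_in_row, so that every input_diff = in_ptr[col] - max_in_row is <= 0,
+    //   2) rescale each difference into Q5.26, take exp() with
+    //      esp_nn_exp_on_negative_values() and accumulate the exponentials into
+    //      sum_of_exps at ACCUM_BITS precision,
+    //   3) turn sum_of_exps into a reciprocal scale with ONE_OVER_ONE_X() and
+    //      multiply every exponential by it, giving int8 outputs offset by -128
+    //      (columns with input_diff < diff_min are set straight to -128).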
+#define ACCUM_BITS 12 +#define DIFF_BITS 5 + + const int32_t mask = (1 << shift); + int32_t col = 0; + const int8_t *in_ptr = input_data; + int8_t *out_ptr = output_data; + + for (int row_idx = 0; row_idx < height; row_idx++) { + int8_t max_in_row = in_ptr[0]; + for (col = 1; col < width; col++) { + max_in_row = max(max_in_row, in_ptr[col]); + } + + int32_t input_diff = 0; + int32_t sum_of_exps = 0; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); + } + } + + const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); + const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); + const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); + const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; + out_ptr[col] = (int8_t) esp_nn_saturate8(result); + } else { + out_ptr[col] = -128; + } + } + in_ptr += width; + out_ptr += width; + } +} diff --git a/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c b/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c new file mode 100644 index 00000000..93337d32 --- /dev/null +++ b/code/components/esp-nn/src/softmax/esp_nn_softmax_opt.c @@ -0,0 +1,108 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "softmax_common.h" +#include + +static int32_t *scratch_buf = NULL; + +/** + * @brief Get scratch buffer size needed by softmax function + * + * @param width + * @param height + * @return size in bytes + * + * @note buffer must be 4 byte aligned + */ +int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height) +{ + (void) height; + return width * 4; +} + +/** + * @brief Set scratch buffer to be used by softmax function + * + * @param buffer this can be NULL if one needs to unset it + * must be aligned to 4 bytes + */ +void esp_nn_set_softmax_scratch_buf_opt(void *buffer) +{ + scratch_buf = (int32_t *) buffer; +} + +void esp_nn_softmax_s8_opt(const int8_t *input_data, + const int32_t height, + const int32_t width, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output_data) +{ + if (scratch_buf == NULL) { + printf("%s error! scratch buffer not set\n", __FUNCTION__); + return; + } + // The representation chosen for the input to the exp() function is Q5.26. 
+ // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input mult, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. +#define ACCUM_BITS 12 +#define DIFF_BITS 5 + + const int32_t mask = (1 << shift); + int32_t col = 0; + const int8_t *in_ptr = input_data; + int8_t *out_ptr = output_data; + + for (int row_idx = 0; row_idx < height; row_idx++) { + int8_t max_in_row = in_ptr[0]; + for (col = 1; col < width; col++) { + max_in_row = max(max_in_row, in_ptr[col]); + } + + int32_t input_diff = 0; + int32_t sum_of_exps = 0; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); + const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); + scratch_buf[col] = exp_raw; // store to avoid duplicate calculation later + sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); + } + } + + const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); + const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); + const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; + + for (col = 0; col < width; col++) { + input_diff = in_ptr[col] - max_in_row; + if (input_diff >= diff_min) { + int32_t exp_raw = scratch_buf[col]; + const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); + const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; + out_ptr[col] = (int8_t) esp_nn_saturate8(result); + } else { + out_ptr[col] = -128; + } + } + in_ptr += width; + out_ptr += width; + } +} diff --git a/code/components/esp-nn/src/softmax/softmax_common.h b/code/components/esp-nn/src/softmax/softmax_common.h new file mode 100644 index 00000000..254d6ace --- /dev/null +++ b/code/components/esp-nn/src/softmax/softmax_common.h @@ -0,0 +1,104 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0 +#define MASK_IF_NON_ZERO(x) (x) != 0 ? 
~0 : 0 +#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b)) +#define SAT_HIGH_MUL(x, y) esp_nn_sat_round_doubling_high_mul((x), (y)) +#define DIV_POW2(x,y) esp_nn_div_by_power_of_two((x), (y)) + +__NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp) +{ + const int32_t thresh = ((1 << (31 - exp)) - 1); + int32_t result = val << exp; + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), INT32_MAX, result); + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), INT32_MIN, result); + return result; +} + +/** + * @brief Calculate `1 / (1 + x)` for x in [0, 1] + * + * @param val input value to calculate `1/(1+x)` for + * @return `int32_t` result + * @note Newton-Raphson division + * + * https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division + * Refer to that page for the logic behind the 48/17 and 32/17 constants. + * Pseudocode: https://en.wikipedia.org/wiki/Division_algorithm#Pseudocode + */ +__NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val) +{ + const int64_t sum = (int64_t) val + INT32_MAX; + const int32_t half_denominator = (int32_t) ((sum + (sum >= 0 ? 1 : -1)) / 2L); + int32_t constant_48_over_17 = 1515870810; + int32_t constant_neg_32_over_17 = -1010580540; + int32_t x = constant_48_over_17 + SAT_HIGH_MUL(half_denominator, constant_neg_32_over_17); + const int32_t fixed_2_one = (1 << 29); + + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2); + + return mul_power_of_2(x, 1); +} + +#define ONE_OVER_ONE_X(x) esp_nn_one_over_one_plus_x_for_x_in_0_1((x)) + +/** + * @brief Return exp(x) for x < 0. 
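+ *
+ * In outline: the Q5.26 argument is split into a remainder in [-1/4, 0) and a
+ * non-negative multiple of 1/4. exp() of the remainder is approximated with a
+ * degree-4 Taylor polynomial centred on -1/8, and the result is multiplied by
+ * the precomputed constants exp(-1/4), exp(-1/2), exp(-1), ... exp(-16), one
+ * for each set bit of that multiple. An input of 0 returns INT32_MAX (~1.0).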
+ * + */ +__NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val) +{ + int32_t shift = 24; + + const int32_t one_quarter = (1 << shift); + int32_t mask = one_quarter - 1; + const int32_t val_mod_minus_quarter = (val & mask) - one_quarter; + const int32_t remainder = val_mod_minus_quarter - val; + + // calculate exponent for x in [-1/4, 0) in `result` + const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28); + const int32_t x2 = SAT_HIGH_MUL(x, x); + const int32_t x3 = SAT_HIGH_MUL(x2, x); + const int32_t x4 = SAT_HIGH_MUL(x2, x2); + const int32_t one_over_3 = 715827883; + const int32_t one_over_8 = 1895147668; + + const int32_t x4_over_4 = DIV_POW2(x4, 2); + const int32_t x4_over_4_plus_x3_over_6_plus_x2_over_2 = DIV_POW2(SAT_HIGH_MUL(x4_over_4 + x3, one_over_3) + x2, 1); + int32_t result = one_over_8 + SAT_HIGH_MUL(one_over_8, x + x4_over_4_plus_x3_over_6_plus_x2_over_2); + +#define SELECT_IF_NON_ZERO(x) { \ + mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \ + result = SELECT_USING_MASK(mask, SAT_HIGH_MUL(result, x), result); \ +} + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mask = MASK_IF_ZERO(val); + return SELECT_USING_MASK(mask, INT32_MAX, result); +} \ No newline at end of file diff --git a/code/components/esp-nn/test_app/CMakeLists.txt b/code/components/esp-nn/test_app/CMakeLists.txt new file mode 100644 index 00000000..8d332768 --- /dev/null +++ b/code/components/esp-nn/test_app/CMakeLists.txt @@ -0,0 +1,9 @@ +# The following lines of boilerplate have to be in your project's +# CMakeLists in this exact order for cmake to work correctly +cmake_minimum_required(VERSION 3.5) + +set(EXTRA_COMPONENT_DIRS "../" "../tests/") +set(IDF_EXCLUDE_COMPONENTS test test_app) + +include($ENV{IDF_PATH}/tools/cmake/project.cmake) +project(test_app) diff --git a/code/components/esp-nn/test_app/main/CMakeLists.txt b/code/components/esp-nn/test_app/main/CMakeLists.txt new file mode 100644 index 00000000..04161254 --- /dev/null +++ b/code/components/esp-nn/test_app/main/CMakeLists.txt @@ -0,0 +1,7 @@ + +set(COMPONENT_SRCS "main.c") +set(COMPONENT_ADD_INCLUDEDIRS "") + +set(COMPONENT_PRIV_REQUIRES tests) + +register_component() diff --git a/code/components/esp-nn/test_app/main/component.mk b/code/components/esp-nn/test_app/main/component.mk new file mode 100644 index 00000000..5d85ad38 --- /dev/null +++ b/code/components/esp-nn/test_app/main/component.mk @@ -0,0 +1,8 @@ +# +# Main component makefile. +# +# This Makefile can be left empty. By default, it will take the sources in the +# src/ directory, compile them and link them into lib(subdirectory_name).a +# in the build directory. This behaviour is entirely configurable, +# please read the ESP-IDF documents if you need to do this. +# diff --git a/code/components/esp-nn/test_app/main/main.c b/code/components/esp-nn/test_app/main/main.c new file mode 100644 index 00000000..267e35f2 --- /dev/null +++ b/code/components/esp-nn/test_app/main/main.c @@ -0,0 +1,87 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include +#include + +#include +#include + +static const char *TAG = "test_app"; +static uint32_t start_c, start_opt, total_c, total_opt; + +void profile_c_start() +{ + /* initiate profiling */ + start_c = esp_cpu_get_ccount(); +} + +void profile_c_end() +{ + /* record profile number */ + total_c = esp_cpu_get_ccount() - start_c; +} + +void profile_opt_start() +{ + /* initiate profiling */ + start_opt = esp_cpu_get_ccount(); +} + +void profile_opt_end() +{ + /* record profile number */ + total_opt = esp_cpu_get_ccount() - start_opt; +} + +void app_main() +{ + /* s8 tests */ + ESP_LOGI(TAG, "Running s8 tests..."); + esp_nn_add_elementwise_s8_test(); + printf("add, c %u opt %u\n", total_c, total_opt); + esp_nn_mul_elementwise_s8_test(); + printf("mul, c %u opt %u\n", total_c, total_opt); + esp_nn_depthwise_conv_s8_test(); + printf("depthwise, c %u opt %u\n", total_c, total_opt); + esp_nn_conv_s8_test(); + printf("conv2d, c %u opt %u\n", total_c, total_opt); + + esp_nn_relu6_s8_test(); + printf("relu, c %u opt %u\n", total_c, total_opt); + esp_nn_avg_pool_s8_test(); + printf("avg_pool, c %u opt %u\n", total_c, total_opt); + esp_nn_max_pool_s8_test(); + printf("max_pool, c %u opt %u\n", total_c, total_opt); + esp_nn_fully_connected_s8_test(); + printf("fully_connected, c %u opt %u\n", total_c, total_opt); + esp_nn_softmax_s8_test(); + printf("softmax, c %u opt %u\n", total_c, total_opt); + ESP_LOGI(TAG, "s8 tests done!\n"); + + /* u8 tests */ + //ESP_LOGI(TAG, "Running u8 tests..."); + //esp_nn_add_elementwise_u8_test(); + //esp_nn_depthwise_conv_u8_test(); + //esp_nn_conv_u8_test(); + //esp_nn_avg_pool_u8_test(); + //esp_nn_max_pool_u8_test(); + //esp_nn_fully_connected_u8_test(); + //ESP_LOGI(TAG, "u8 tests done!\n"); +} diff --git a/code/components/esp-nn/test_app/sdkconfig.defaults b/code/components/esp-nn/test_app/sdkconfig.defaults new file mode 100644 index 00000000..bb37aac5 --- /dev/null +++ b/code/components/esp-nn/test_app/sdkconfig.defaults @@ -0,0 +1,5 @@ + +# +# esp-nn +# +CONFIG_NN_ESP32=y diff --git a/code/components/esp-nn/tests/CMakeLists.txt b/code/components/esp-nn/tests/CMakeLists.txt new file mode 100644 index 00000000..97ec946f --- /dev/null +++ b/code/components/esp-nn/tests/CMakeLists.txt @@ -0,0 +1,15 @@ + +set(COMPONENT_ADD_INCLUDEDIRS ./include/) +set(COMPONENT_SRCS "src/basic_math_test.c" + "src/convolution_test.c" + "src/fully_connected_test.c" + "src/pooling_test.c" + "src/relu_test.c" + "src/softmax_test.c") + +set(COMPONENT_REQUIRES ) +set(COMPONENT_PRIV_REQUIRES esp-nn) + +register_component() + +target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function) diff --git a/code/components/esp-nn/tests/README.md b/code/components/esp-nn/tests/README.md new file mode 100644 index 00000000..41c94235 --- /dev/null +++ b/code/components/esp-nn/tests/README.md @@ -0,0 +1,4 @@ +# Tests for esp_nn library + +- Include these in your test framework and run the framework. 
+- For IDF test please refer `test_app` diff --git a/code/components/esp-nn/tests/component.mk b/code/components/esp-nn/tests/component.mk new file mode 100644 index 00000000..2860f3ff --- /dev/null +++ b/code/components/esp-nn/tests/component.mk @@ -0,0 +1,5 @@ +#FIXME + +COMPONENT_ADD_INCLUDEDIRS := include/ + +COMPONENT_SRCDIRS := src/ diff --git a/code/components/esp-nn/tests/include/test_functions.h b/code/components/esp-nn/tests/include/test_functions.h new file mode 100644 index 00000000..3e882efa --- /dev/null +++ b/code/components/esp-nn/tests/include/test_functions.h @@ -0,0 +1,48 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +/* int8_t ops tests */ +void esp_nn_add_elementwise_s8_test(); +void esp_nn_mul_elementwise_s8_test(); + +void esp_nn_depthwise_conv_s8_test(); +void esp_nn_conv_s8_test(); + +void esp_nn_avg_pool_s8_test(); +void esp_nn_max_pool_s8_test(); + +void esp_nn_fully_connected_s8_test(); + +void esp_nn_relu6_s8_test(); + +void esp_nn_softmax_s8_test(); + +/* uint8_t ops tests */ +void esp_nn_add_elementwise_u8_test(); + +void esp_nn_depthwise_conv_u8_test(); +void esp_nn_conv_u8_test(); + +void esp_nn_avg_pool_u8_test(); +void esp_nn_max_pool_u8_test(); + +void esp_nn_fully_connected_u8_test(); + +/* instructions test functions */ +void compare_instructions_test(); +void arith_instructions_test(); +void min_max_instructions_test(); +void bitwise_instructions_test(); +void load_store_instructions_test(); diff --git a/code/components/esp-nn/tests/include/test_utils.h b/code/components/esp-nn/tests/include/test_utils.h new file mode 100644 index 00000000..a152549b --- /dev/null +++ b/code/components/esp-nn/tests/include/test_utils.h @@ -0,0 +1,87 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
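The tests README above says to pull these sources into an existing test framework; besides the entry points declared in test_functions.h, the framework must also provide the four profile_* callbacks (declared in test_utils.h below). A minimal, hypothetical host-side stub is sketched here, using clock() instead of the ESP32 cycle counter; it assumes the test sources and the ANSI-C esp-nn kernels build for the host, which this patch itself does not set up.

```c
#include <stdio.h>
#include <time.h>

#include "test_functions.h"

/* The esp-nn tests call these hooks around the ANSI-C and optimized runs.
 * On target they typically read the CPU cycle counter; on a host build a
 * coarse clock() is enough to make the tests link and print something. */
static clock_t start_c, start_opt;

void profile_c_start(void)   { start_c = clock(); }
void profile_c_end(void)     { printf("C path:   %ld ticks\n", (long) (clock() - start_c)); }
void profile_opt_start(void) { start_opt = clock(); }
void profile_opt_end(void)   { printf("opt path: %ld ticks\n", (long) (clock() - start_opt)); }

int main(void)
{
    /* Run a subset of the s8 tests; each prints pass/fail per iteration.
     * Off-target the "optimized" path simply resolves to the ANSI-C kernels. */
    esp_nn_add_elementwise_s8_test();
    esp_nn_relu6_s8_test();
    esp_nn_softmax_s8_test();
    return 0;
}
```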
+ +#include +#include +#include +#include + +/* mult value range */ +#define MULT_MAX INT32_MAX +#define MULT_MIN 0 + +/* shift value range */ +#define SHIFT_MIN -31 +#define SHIFT_MAX 30 + +/** + * @brief callback function to run before C function + */ +void profile_c_start(); + +/** + * @brief callback function to run after C function + */ +void profile_c_end(); + +/** + * @brief callback function to run before optimized function + */ +void profile_opt_start(); + +/** + * @brief callback function to run after optimized function + */ +void profile_opt_end(); + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" + +#define CHECK_EQUAL(ARRAY1, ARRAY2, size) ({ \ + bool res = true; \ + for (int _i = 0; _i < size; _i++) { \ + if (ARRAY1[_i] != ARRAY2[_i]) { \ + res = false; \ + break; \ + } \ + } \ + res; \ +}) + +#define PRINT_ARRAY_INT(ARRAY, width, height) ({ \ + int *_array = (int *) ARRAY; \ + for (int _j = 0; _j < height; _j++) { \ + for (int _i = 0; _i < width; _i++) { \ + printf("%d\t", _array[width * _j + _i]); \ + } \ + printf("\n"); \ + } \ + printf("\n"); \ +}) + +#define PRINT_ARRAY_HEX(ARRAY, width, height) ({ \ + uint8_t *_array = (uint8_t *) ARRAY; \ + for (int _j = 0; _j < height; _j++) { \ + for (int _i = 0; _i < width; _i++) { \ + printf("%02x\t", _array[width * _j + _i]); \ + } \ + printf("\n"); \ + } \ + printf("\n"); \ +}) diff --git a/code/components/esp-nn/tests/src/basic_math_test.c b/code/components/esp-nn/tests/src/basic_math_test.c new file mode 100644 index 00000000..5b96b990 --- /dev/null +++ b/code/components/esp-nn/tests/src/basic_math_test.c @@ -0,0 +1,343 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
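The comparison and dump macros above are GNU statement expressions (the ({ ... }) form), so they evaluate to a value and can be used inline; GCC or Clang is assumed. A tiny hypothetical usage sketch, assuming test_utils.h is on the include path:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#include "test_utils.h"

int main(void)
{
    int8_t expected[4] = {1, 2, 3, 4};
    int8_t actual[4]   = {1, 2, 3, 5};

    bool same = CHECK_EQUAL(expected, actual, 4);  /* false: last element differs */
    if (!same) {
        printf(ANSI_COLOR_RED "mismatch\n" ANSI_COLOR_RESET);
        PRINT_ARRAY_HEX(actual, 4, 1);             /* one row of four hex bytes */
        PRINT_ARRAY_HEX(expected, 4, 1);
    }
    return 0;
}
```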
+ +#include +#include +#include +#include +#include + +#include +#include +#include "test_utils.h" + +#if CONFIG_IDF_CMAKE +#define IDF_HEAP_CAPS 1 + +#if IDF_HEAP_CAPS +#include "esp_heap_caps.h" +#endif +#endif + +void esp_nn_add_elementwise_s8_test() +{ + /* prepare data */ + const int size = 1600 + 8 + 7; /* odd len to test leftover */ + int8_t *input1; + int8_t *input2; + int8_t *out_data_c; + int8_t *out_data_opt; + int8_t *input1_orig = NULL; + int8_t *input2_orig = NULL; + int8_t *out_c_orig = NULL; + int8_t *out_opt_orig = NULL; + int32_t input1_offset = 34; + int32_t input2_offset = 35; + int32_t output_offset = 36; + int32_t input1_shift = -8; // right_shift amt always <= 0 + int32_t input2_shift = -8; // right_shift amt always <= 0 + int32_t output_shift = -9; // right_shift amt always <= 0 + int32_t left_shift = 15; // always +ve + int32_t input1_mult = INT32_MAX; + int32_t input2_mult = INT32_MAX; + int32_t output_mult = INT32_MAX; + int32_t activation_min = -128; + int32_t activation_max = 127; + + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // all zeros + input1_offset = 0; + input2_offset = 0; + output_offset = 0; + input1_mult = 0; + input2_mult = 0; + output_mult = 0; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 0; + break; + case 1: // hit min + input1_offset = -127; + input2_offset = -127; + output_offset = -128; + input1_mult = MULT_MIN; + input2_mult = MULT_MIN; + output_mult = MULT_MIN; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 0; + break; + case 2: // hit max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + input1_mult = MULT_MAX; + input2_mult = MULT_MAX; + output_mult = MULT_MAX; + input1_shift = SHIFT_MIN; + input2_shift = SHIFT_MIN; + output_shift = SHIFT_MIN; + left_shift = 30 - 8; // since input is 8 bits + break; + case 3: // hit extreme max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + input1_mult = MULT_MAX; + input2_mult = MULT_MAX; + output_mult = MULT_MAX; + input1_shift = 0; + input2_shift = 0; + output_shift = 0; + left_shift = 30 - 8; // -8 since input is 8 bit + break; + default: // practical random input + input1_offset = rand() % 256 - 127; // range [-127, 128] + input2_offset = rand() % 256 - 127; // range [-127, 128] + output_offset = rand() % 256 - 128; // range [-128, 127] + input1_mult = MULT_MAX / 2 + rand() % INT16_MAX; + input2_mult = MULT_MAX / 2 + rand() % INT16_MAX; + output_mult = MULT_MAX / 2 + rand() % INT16_MAX; + input1_shift = -8 + rand() % 4; + input2_shift = -8 + rand() % 4; + output_shift = -8 + rand() % 4; + left_shift = rand() % 15; + } +#if IDF_HEAP_CAPS + input1_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + input2_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_c_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_opt_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input1 = 16 + input1_orig - ((uint32_t) input1_orig & 0xf); + input2 = 16 + input2_orig - ((uint32_t) input2_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input1 = memalign(16, size); + input2 = memalign(16, size); + out_data_c = memalign(16, size); + out_data_opt = memalign(16, size); + + input1_orig = input1; + input2_orig = input2; + out_c_orig = out_data_c; + 
out_opt_orig = out_data_opt; +#endif + + for (int i = 0; i < size; ++i) { + input1[i] = rand() % 256 - 128; + input2[i] = rand() % 256 - 128; + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + /* C function */ + esp_nn_add_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, + input1_mult, input2_mult, input1_shift, input2_shift, + left_shift, out_data_c, output_offset, output_mult, + output_shift, activation_min, activation_max, size); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_add_elementwise_s8(input1, input2, input1_offset, input2_offset, + input1_mult, input2_mult, input1_shift, input2_shift, + left_shift, out_data_opt, output_offset, output_mult, + output_shift, activation_min, activation_max, size); + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, size, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, size, 1); + printf("Input1:\n"); + PRINT_ARRAY_HEX(input1, size, 1); + printf("Input2:\n"); + PRINT_ARRAY_HEX(input2, size, 1); + printf("in1_shift %d, in2_shift %d, left_shift %d, out_shift %d\n", + input1_shift, input2_shift, left_shift, output_shift); + printf("in1_mult %d, in2_mult %d, out_mult %d\n", input1_mult, input2_mult, output_mult); + goto elementwise_add_test_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + +elementwise_add_test_cleanup: + if (input1_orig) { + free(input1_orig); + } + if (input2_orig) { + free(input2_orig); + } + if (out_data_c) { + free(out_c_orig); + } + if (out_data_opt) { + free(out_opt_orig); + } + } +} + +void esp_nn_mul_elementwise_s8_test() +{ + /* prepare data */ + const int size = 1600 + 8 + 7; /* odd len to test leftover */ + int8_t *input1; + int8_t *input2; + int8_t *out_data_c; + int8_t *out_data_opt; + int32_t input1_offset = 34; + int32_t input2_offset = 35; + int32_t output_offset = 36; + int32_t output_shift = -7; + int32_t output_mult = MULT_MAX; // max out_mult + int32_t activation_min = -128; + int32_t activation_max = 127; + int8_t *input1_orig = NULL; + int8_t *input2_orig = NULL; + int8_t *out_c_orig = NULL; + int8_t *out_opt_orig = NULL; + + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // all zeros + input1_offset = 0; + input2_offset = 0; + output_offset = 0; + output_mult = 0; + output_shift = 0; + break; + case 1: // hit min + input1_offset = -127; + input2_offset = -127; + output_offset = -128; + output_mult = MULT_MIN; + output_shift = 0; + break; + case 2: // hit max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + output_mult = MULT_MAX; + output_shift = SHIFT_MIN; + break; + case 3: // hit extreme max + input1_offset = 128; + input2_offset = 128; + output_offset = -127; + output_mult = MULT_MAX; + output_shift = 0; + break; + default: // practical random input + input1_offset = rand() % 256 - 127; // range [-127, 128] + input2_offset = rand() % 256 - 127; // range [-127, 128] + output_offset = rand() % 256 - 128; // range [-128, 127] + output_mult = MULT_MAX / 2 + rand() % INT16_MAX; + output_shift = -8 + rand() % 4; + } + +#if IDF_HEAP_CAPS + input1_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + input2_orig = (int8_t *) heap_caps_malloc(size + 32, 
MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_c_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_opt_orig = (int8_t *) heap_caps_malloc(size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input1 = 16 + input1_orig - ((uint32_t) input1_orig & 0xf); + input2 = 16 + input2_orig - ((uint32_t) input2_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input1 = memalign(16, size); + input2 = memalign(16, size); + out_data_c = memalign(16, size); + out_data_opt = memalign(16, size); + + input1_orig = input1; + input2_orig = input2; + out_c_orig = out_data_c; + out_opt_orig = out_data_opt; +#endif + + for (int i = 0; i < size; ++i) { + input1[i] = rand() % 256 - 128; + input2[i] = rand() % 256 - 128; + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + /* C function */ + esp_nn_mul_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, + out_data_c, output_offset, output_mult, output_shift, + activation_min, activation_max, size); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + /* Optimized function */ + esp_nn_mul_elementwise_s8(input1, input2, input1_offset, input2_offset, + out_data_opt, output_offset, output_mult, output_shift, + activation_min, activation_max, size); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, size, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, size, 1); + printf("Input1:\n"); + PRINT_ARRAY_HEX(input1, size, 1); + printf("Input2:\n"); + PRINT_ARRAY_HEX(input2, size, 1); + goto elementwise_mult_test_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + +elementwise_mult_test_cleanup: + if (input1_orig) { + free(input1_orig); + } + if (input2_orig) { + free(input2_orig); + } + if (out_data_c) { + free(out_c_orig); + } + if (out_data_opt) { + free(out_opt_orig); + } + } +} diff --git a/code/components/esp-nn/tests/src/convolution_test.c b/code/components/esp-nn/tests/src/convolution_test.c new file mode 100644 index 00000000..f3802257 --- /dev/null +++ b/code/components/esp-nn/tests/src/convolution_test.c @@ -0,0 +1,571 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
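The elementwise tests above (and the convolution tests below) keep two pointers per buffer: the original allocation, which is the one passed to free(), and a 16-byte-aligned view handed to the kernels. A standalone sketch of that idiom, assuming only the C standard library (uintptr_t is used here in place of the tests' uint32_t cast):

```c
#include <stdint.h>
#include <stdlib.h>

/* Round a pointer up to the next 16-byte boundary. This always advances by
 * 1..16 bytes, which is why the buffers are over-allocated by 32 bytes. */
static int8_t *align16(int8_t *orig)
{
    return orig + 16 - ((uintptr_t) orig & 0xf);
}

int main(void)
{
    const int size = 1600 + 8 + 7;
    int8_t *buf_orig = malloc(size + 32);   /* keep this pointer for free() */
    if (buf_orig == NULL)
        return 1;
    int8_t *buf = align16(buf_orig);        /* aligned view passed to the kernels */
    (void) buf;                             /* ... fill and use buf here ... */
    free(buf_orig);                         /* never free the aligned pointer */
    return 0;
}
```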
+ +#include +#include +#include +#include +#include + +#include +#include "test_utils.h" + +#if CONFIG_IDF_CMAKE +#define IDF_HEAP_CAPS 1 + +#if IDF_HEAP_CAPS +#include "esp_heap_caps.h" +#endif +#endif + +void esp_nn_depthwise_conv_s8_test() +{ + int8_t *input = NULL, *filter_data = NULL, *out_data_c = NULL, *out_data_opt = NULL; + int32_t *bias = NULL; + int32_t input_offset = 5; /* some number in [-128, 127] */ + int32_t out_offset = 7; + int32_t activation_min = -125; + int32_t activation_max = 120; + void *scratch_buf = NULL; + + /* independent variables */ + int input_wd, input_ht, channels; + uint16_t filter_ht, filter_wd, ch_mult; + uint16_t pad_wd, pad_ht, stride_wd, stride_ht; + + // run for 10 iterations + for (int itr = 0; itr < 10; itr++) { + /* prepare data */ + switch (itr) { + case 0: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) + input_wd = 18; + input_ht = 18; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 1: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (1,1) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 2: // (ch_mult 1, (channels % 8) = 0), filter (3,3), pad (1,1) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 24; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 3: // other filter sizes (ch_mult 1, (channels % 8) = 0) + input_wd = 10; + input_ht = 10; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 24; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 4: // other filter sizes (ch_mult 8 = 0) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 8; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 5: // other filter sizes (ch_mult 8 = 0) + input_wd = 12; + input_ht = 12; + filter_ht = 5; + filter_wd = 5; + ch_mult = 8; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 6: // other filter sizes (ch_mult 4 = 0) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 4; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 7: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) + input_wd = 6; + input_ht = 6; + filter_ht = 3; + filter_wd = 3; + ch_mult = 1; + channels = 16; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + default: + input_wd = 4; + input_ht = 4; + filter_ht = 3; + filter_wd = 3; + ch_mult = 4; + channels = 4; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + } + + uint16_t out_wd = (input_wd - filter_wd + 1) / stride_wd; + uint16_t out_ht = (input_ht - filter_ht + 1) / stride_ht; + int in_size = input_wd * input_ht * channels; + int out_size = out_wd * out_ht * channels * ch_mult; + int filter_size = filter_wd * filter_ht * channels * ch_mult + 4; + int bias_size = channels * ch_mult + 1; + int32_t out_shift[channels * ch_mult]; + int32_t out_mult[channels * ch_mult]; + +#if IDF_HEAP_CAPS + int8_t *input_orig = (int8_t *) heap_caps_malloc(in_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int8_t *out_c_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int8_t *out_opt_orig = (int8_t *) heap_caps_malloc(out_size 
+ 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + filter_data = (int8_t *) heap_caps_malloc(filter_size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + bias = (int32_t *) heap_caps_malloc(bias_size * 4, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + input = 16 + input_orig - ((uint32_t) input_orig & 0xf); + out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + input = memalign(16, in_size + 16); + filter_data = memalign(16, filter_size); + out_data_c = memalign(16, out_size + 16); + out_data_opt = memalign(16, out_size + 16); + bias = memalign(16, bias_size * 4); + int8_t *input_orig = input; + int8_t *out_c_orig = out_data_c; + int8_t *out_opt_orig = out_data_opt; +#endif + if (bias == NULL || input == NULL || filter_data == NULL || + out_data_c == NULL || out_data_opt == NULL || bias == NULL) { + printf(ANSI_COLOR_RED"%s[%d] allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + goto dc_s8_cleanup; + } + + /* Generate input data */ + for (int i = 0; i < in_size; ++i) { + input[i] = rand() % 128; + } + + /* Generate filter data */ + for (int i = 0; i < filter_size; ++i) { + filter_data[i] = rand() % 256 - 128; + } + + /* Generate bias data */ + for (int i = 0; i < channels * ch_mult; ++i) { + bias[i + 1] = rand() % INT16_MAX; //0th index left for unalignment + out_shift[i] = -8 + rand() % 3; + out_mult[i] = 0x7eb0e200 + rand() % 50; + } + + int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(input_wd, input_ht, + channels, ch_mult, + filter_wd, filter_ht); + if (scratch_buf_size > 0) { +#if IDF_HEAP_CAPS + scratch_buf = heap_caps_malloc(scratch_buf_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); +#else + scratch_buf = memalign(16, scratch_buf_size); + int align_sz = 0; +#endif + if (scratch_buf == NULL) { + printf(ANSI_COLOR_RED"%s[%d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, + __FUNCTION__, itr, scratch_buf_size); + goto dc_s8_cleanup; + } + esp_nn_set_depthwise_conv_scratch_buf(scratch_buf + align_sz); + } + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + + /* C function */ + esp_nn_depthwise_conv_s8_ansi(input, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data + 4, filter_wd, filter_ht, + bias + 1, out_data_c, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_depthwise_conv_s8(input, input_wd, input_ht, channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, ch_mult, + filter_data + 4, filter_wd, filter_ht, + bias + 1, out_data_opt, out_wd, out_ht, out_offset, out_shift, + out_mult, activation_min, activation_max); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, in_size / input_ht, input_ht); + printf("Filter data:\n"); + PRINT_ARRAY_HEX(filter_data + 4, (filter_size - 4) / filter_ht, filter_ht); + printf("bias data:\n"); + PRINT_ARRAY_INT(bias + 1, ch_mult * channels, 1); + goto dc_s8_cleanup; + } + 
printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + + dc_s8_cleanup: + if (input) { + free(input_orig); + } + if (filter_data) { + free(filter_data); + } + if (out_data_c) { + free(out_c_orig); + } + if (out_data_opt) { + free(out_opt_orig); + } + if (bias) { + free(bias); + } + if (scratch_buf) { + free(scratch_buf); + } + } +} + +void esp_nn_conv_s8_test() +{ + const int32_t input_offset = 5; /* some number in [-128, 127] */ + const int32_t activation_min = -125; + const int32_t activation_max = 122; + const int32_t out_offset = 3; + + void *scratch_buf = NULL; + int8_t *input_orig; + int8_t *out_c_orig; + int8_t *out_opt_orig; + int8_t *filter_data; + int32_t *bias; + + /* independent variable */ + int in_wd, in_ht, in_channels, out_channels; + uint16_t filter_ht, filter_wd; + uint16_t pad_wd, pad_ht, stride_wd, stride_ht; + + // run for 10 iterations + for (int itr = 0; itr < 10; itr++) { + switch (itr) { + case 0: // ch % 8 == 0 && filter (1,1), padding (0,0) + in_wd = 10; + in_ht = 10; + in_channels = 64; + out_channels = 64; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 1: // ch % 4 == 0 && (in_wd * in_ht) % 16 == 0 + in_wd = 4; + in_ht = 4; + in_channels = 20; + out_channels = 8; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 2: // ch, filter (3x3x3) + in_wd = 10; + in_ht = 10; + in_channels = 3; + out_channels = 64; + filter_ht = 3; + filter_wd = 3; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 3: // remaining pad (0, 0) + in_wd = 10; + in_ht = 10; + in_channels = 3; + out_channels = 64; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + case 4: // unopt case + in_wd = 10; + in_ht = 10; + in_channels = 12; + out_channels = 64; + filter_ht = 3; + filter_wd = 3; + pad_wd = 1; + pad_ht = 1; + stride_wd = 1; + stride_ht = 1; + break; + case 5: // ch % 8 == 0 & stride (2,2) + in_wd = 16; + in_ht = 16; + in_channels = 16; + out_channels = 16; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 2; + stride_ht = 2; + break; + case 6: // ch % 8 == 0 && filter (1,1), padding (0,0) + in_wd = 2; + in_ht = 2; + in_channels = 8; + out_channels = 8; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + default: // ch % 8 == 0 + in_wd = 8; + in_ht = 8; + in_channels = 16; + out_channels = 16; + filter_ht = 1; + filter_wd = 1; + pad_wd = 0; + pad_ht = 0; + stride_wd = 1; + stride_ht = 1; + break; + } + + /* prepare data */ + uint16_t out_wd = (in_wd - filter_wd + 1) / stride_wd; + uint16_t out_ht = (in_ht - filter_ht + 1) / stride_ht; + + int in_size = in_wd * in_ht * in_channels; + int filter_size = filter_wd * filter_ht * in_channels * out_channels + 2; + int out_size = out_wd * out_ht * out_channels; + +#if IDF_HEAP_CAPS + input_orig = (int8_t *) heap_caps_malloc(in_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_c_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + out_opt_orig = (int8_t *) heap_caps_malloc(out_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + filter_data = (int8_t *) heap_caps_malloc(filter_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + bias = (int32_t *) heap_caps_malloc(128 + sizeof (int32_t) * out_channels, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + + int8_t *input = 16 + input_orig - ((uint32_t) input_orig & 
0xf); + int8_t *out_data_c = 16 + out_c_orig - ((uint32_t) out_c_orig & 0xf); + int8_t *out_data_opt = 16 + out_opt_orig - ((uint32_t) out_opt_orig & 0xf); +#else + int8_t *input = memalign(16, in_size); + int8_t *out_data_c = memalign(16, out_size); + int8_t *out_data_opt = memalign(16, out_size); + filter_data = memalign(16, filter_size); + bias = calloc(1, 128 + sizeof (int32_t) * out_channels); + input_orig = input; + out_c_orig = out_data_c; + out_opt_orig = out_data_opt; +#endif + int32_t *out_shift = calloc(1, 128 + sizeof (int32_t) * out_channels); + int32_t *out_mult = calloc(1, 128 + sizeof (int32_t) * out_channels); + + if (input == NULL || filter_data == NULL || + out_data_c == NULL || out_data_opt == NULL) { + printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto conv_s8_cleanup; + } + + if (bias == NULL || out_shift == NULL || out_mult == NULL) { + printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto conv_s8_cleanup; + } + + /* Generate input data between -128 -> +127 */ + for (int i = 0; i < in_size; ++i) { + input[i] = rand() % 255 - 128; + } + + /* Generate filter data between -128 -> +127 */ + for (int i = 0; i < filter_size; ++i) { + filter_data[i] = rand() % 256 - 128; + } + + /* Generate bias data */ + for (int i = 0; i < out_channels; ++i) { + bias[i] = (int32_t)rand() % UINT16_MAX + UINT8_MAX; + } + + /* Shift and multiplier */ + for (int i = 0; i < out_channels; ++i) { + out_shift[i] = -10 + rand() % 2; + out_mult[i] = 0x7f67f4f8 + rand() % 50; + } + + int scratch_buf_size = esp_nn_get_conv_scratch_size(in_wd, in_ht, in_channels, + out_channels, filter_wd, filter_ht); + if (scratch_buf_size > 0) { +#if IDF_HEAP_CAPS + void *scratch_buf = heap_caps_malloc(scratch_buf_size + 32, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT); + int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); +#else + void *scratch_buf = memalign(16, scratch_buf_size); + int align_sz = 0; +#endif + if (scratch_buf == NULL) { + printf(ANSI_COLOR_RED"%s scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, __FUNCTION__, scratch_buf_size); + goto conv_s8_cleanup; + } + esp_nn_set_conv_scratch_buf(scratch_buf + align_sz); + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + + /* C function */ + esp_nn_conv_s8_ansi(input, in_wd, in_ht, in_channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, + filter_data + 2, filter_wd, filter_ht, bias, + out_data_c, out_wd, out_ht, out_channels, out_offset, out_shift, + out_mult, activation_min, activation_max); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_conv_s8(input, in_wd, in_ht, in_channels, input_offset, + pad_wd, pad_ht, stride_wd, stride_ht, + filter_data + 2, filter_wd, filter_ht, bias, + out_data_opt, out_wd, out_ht, out_channels, out_offset, out_shift, + out_mult, activation_min, activation_max); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, in_size / in_ht, in_ht); + printf("Filter data:\n"); + PRINT_ARRAY_HEX(filter_data + 2, (filter_size - 2) / filter_ht, filter_ht); + printf("bias data:\n"); + 
PRINT_ARRAY_INT(bias, out_channels, 1); + goto conv_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + + conv_s8_cleanup: + if (input) { + free(input_orig); + } + if (filter_data) { + free(filter_data); + } + if (out_data_c) { + free(out_c_orig); + } + if (out_data_opt) { + free(out_opt_orig); + } + if (bias) { + free(bias); + } + if (out_shift) { + free(out_shift); + } + if (out_mult) { + free(out_mult); + } + if (scratch_buf) { + free(scratch_buf); + } + } +} diff --git a/code/components/esp-nn/tests/src/fully_connected_test.c b/code/components/esp-nn/tests/src/fully_connected_test.c new file mode 100644 index 00000000..d0210b46 --- /dev/null +++ b/code/components/esp-nn/tests/src/fully_connected_test.c @@ -0,0 +1,111 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include "test_utils.h" + + +void esp_nn_fully_connected_s8_test() +{ + /* prepare data */ + static uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */ + static uint16_t out_channels = 3; + int8_t input[row_len]; + int8_t filter_data[row_len * out_channels]; + int8_t output_c[out_channels], output_opt[out_channels]; + static int32_t activation_min = -128; + static int32_t activation_max = 127; + static int32_t input_offset = 0; + static int32_t filter_offset = 0; + int32_t out_shift = -10; + static int32_t out_offset = 127; + int32_t out_mult = 0x59e492c4; + for (int itr = 0; itr < 5; itr++) { + out_mult = INT32_MAX / row_len + rand() % INT16_MAX; + switch (itr) { + case 0: + out_shift = -10; + break; + case 1: + out_shift = SHIFT_MIN; + break; + case 2: + out_shift = SHIFT_MAX; + break; + case 3: + out_shift = 0; + break; + default: + out_shift = -10 + rand() % 5; + break; + } + if (itr == 0) { + out_shift = SHIFT_MAX; + } + /* Generate input and filter data */ + for (int i = 0; i < row_len; ++i) { + input[i] = rand() % 256 - 128; + } + for (int i = 0; i < row_len * out_channels; ++i) { + filter_data[i] = rand() % 256 - 128; + } + + if (itr == 0) { + /* enable profiler */ + profile_c_start(); + } + + /* C function */ + esp_nn_fully_connected_s8_ansi(input, input_offset, row_len, filter_data, filter_offset, + NULL, output_c, out_channels, out_offset, out_shift, out_mult, + activation_min, activation_max); + + if (itr == 0) { + profile_c_end(); + profile_opt_start(); + } + + /* Optimized function */ + esp_nn_fully_connected_s8(input, input_offset, row_len, filter_data, filter_offset, + NULL, output_opt, out_channels, out_offset, out_shift, out_mult, + activation_min, activation_max); + + if (itr == 0) { + /* disable profiler */ + profile_opt_end(); + } + + bool ret = CHECK_EQUAL(output_c, output_opt, out_channels); + if (ret == false) { + printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + printf("Output: \n"); + PRINT_ARRAY_HEX(output_opt, out_channels, 1); + printf("Expected: \n"); + 
PRINT_ARRAY_HEX(output_c, out_channels, 1); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, row_len, 1); + printf("Filter data:\n"); + PRINT_ARRAY_HEX(filter_data, row_len, out_channels); + printf("Out shift: %d\n", out_shift); + printf("Out mult: %x\n", out_mult); + return; + } + printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); + } +} diff --git a/code/components/esp-nn/tests/src/pooling_test.c b/code/components/esp-nn/tests/src/pooling_test.c new file mode 100644 index 00000000..c1c889e1 --- /dev/null +++ b/code/components/esp-nn/tests/src/pooling_test.c @@ -0,0 +1,184 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include "test_utils.h" + + +void esp_nn_avg_pool_s8_test() +{ + /* prepare data */ + const uint16_t input_wd = 16; + const uint16_t input_ht = 16; + const uint16_t channels = 16; /* With TFLite example, I have seen it 256 */ + const int size = input_wd * input_ht * channels; + int8_t *input, *output_c, *output_opt; + const int32_t activation_min = -128; + const int32_t activation_max = 127; + const uint16_t pad_wd = 1; + const uint16_t pad_ht = 1; + const uint16_t stride_wd = 1; + const uint16_t stride_ht = 1; + const uint16_t filter_ht = 3; + const uint16_t filter_wd = 3; + const uint16_t out_wd = input_wd / stride_wd; + const uint16_t out_ht = input_ht / stride_ht; + const int out_size = out_wd * out_ht * channels; + + input = memalign(16, size); + output_c = memalign(16, out_size); + output_opt = memalign(16, out_size); + + if (input == NULL || output_c == NULL || output_opt == NULL) { + printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto avg_pool_s8_cleanup; + } + /** + * width/height, channels etc look suspicious but it it true. + * It actually depends upon where in model this is actually placed. + * If at the end wd/ht tends to be smaller and depth larger. 
+ */ + + for (int i = 0; i < size; ++i) { + input[i] = rand() % 256 - 128; + } + + /* enable profiler */ + profile_c_start(); + + /* C function */ + esp_nn_avg_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + profile_c_end(); + profile_opt_start(); + + /* Optimized function */ + esp_nn_avg_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + /* disable profiler */ + profile_opt_end(); + + + bool ret = CHECK_EQUAL(output_c, output_opt, out_size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__); + printf("Output: \n"); + PRINT_ARRAY_HEX(output_opt, out_wd * channels, out_ht); + printf("Expected: \n"); + PRINT_ARRAY_HEX(output_c, out_wd * channels, out_ht); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, input_wd * channels, input_ht); + goto avg_pool_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__); + +avg_pool_s8_cleanup: + if (input) { + free(input); + } + if (output_c) { + free(output_c); + } + if (output_opt) { + free(output_opt); + } +} + +void esp_nn_max_pool_s8_test() +{ + /* prepare data */ + const uint16_t input_wd = 16; + const uint16_t input_ht = 16; + const uint16_t channels = 16; /* With TFLite example, I have seen it 256 */ + int8_t *input, *output_c, *output_opt; + const int size = input_wd * input_ht * channels; + const int32_t activation_min = -128; + const int32_t activation_max = 127; + const uint16_t pad_wd = 1; + const uint16_t pad_ht = 1; + const uint16_t stride_wd = 1; + const uint16_t stride_ht = 1; + const uint16_t filter_ht = 3; + const uint16_t filter_wd = 3; + const uint16_t out_wd = input_wd / stride_wd; + const uint16_t out_ht = input_ht / stride_ht; + const int out_size = out_wd * out_ht * channels; + + input = memalign(16, size); + output_c = memalign(16, out_size); + output_opt = memalign(16, out_size); + + if (input == NULL || output_c == NULL || output_opt == NULL) { + printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto max_pool_s8_cleanup; + } + + for (int i = 0; i < size; ++i) { + input[i] = rand() % 256 - 128; + } + + /* enable profiler */ + profile_c_start(); + + /* C function */ + esp_nn_max_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + profile_c_end(); + profile_opt_start(); + + /* Optimized function */ + esp_nn_max_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, + stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, + activation_min, activation_max, channels); + + /* disable profiler */ + profile_opt_end(); + + + bool ret = CHECK_EQUAL(output_c, output_opt, out_wd * out_ht * channels); + if (ret == false) { + printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__); + printf("Output: \n"); + PRINT_ARRAY_HEX(output_opt, out_wd * out_ht * channels, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(output_c, out_wd * out_ht * channels, 1); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, 8, size / 8); + goto max_pool_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__); + +max_pool_s8_cleanup: + if (input) { + free(input); + } + if (output_c) { + free(output_c); + } + if (output_opt) { + free(output_opt); + } +} 
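For reference, the tests use two different output-size conventions: the convolution and depthwise tests compute out = (in - filter + 1) / stride, while the pooling tests above use out = in / stride (pad 1 around a 3x3 window, so the size only shrinks with the stride). A small hypothetical helper restating both formulas with the tests' own numbers:

```c
#include <stdio.h>

/* Output width/height as computed by the conv and depthwise tests. */
static int out_dim_conv(int in, int filter, int stride)
{
    return (in - filter + 1) / stride;
}

/* Output width/height as computed by the pooling tests (pad 1, 3x3 window). */
static int out_dim_pool(int in, int stride)
{
    return in / stride;
}

int main(void)
{
    printf("conv 10x10, 3x3, stride 1 -> %d\n", out_dim_conv(10, 3, 1)); /* 8  */
    printf("conv  6x6,  3x3, stride 2 -> %d\n", out_dim_conv(6, 3, 2));  /* 2  */
    printf("pool 16x16,      stride 1 -> %d\n", out_dim_pool(16, 1));    /* 16 */
    return 0;
}
```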
diff --git a/code/components/esp-nn/tests/src/relu_test.c b/code/components/esp-nn/tests/src/relu_test.c new file mode 100644 index 00000000..ce6f13f1 --- /dev/null +++ b/code/components/esp-nn/tests/src/relu_test.c @@ -0,0 +1,83 @@ +// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include "test_utils.h" + +void esp_nn_relu6_s8_test() +{ + const int size = 1600 + 8 + 7; + int8_t *input, *inout_ansi, *inout_opt; + + input = memalign(16, size); + inout_ansi = memalign(16, size); + inout_opt = memalign(16, size); + + if (input == NULL || inout_ansi == NULL || inout_opt == NULL) { + printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto relu6_s8_cleanup; + } + /* Generate filter data between -128 -> +127 */ + for (int i = 0; i < size; ++i) { + input[i] = rand() % 255 - 128; + inout_ansi[i] = input[i]; + inout_opt[i] = input[i]; + } + + /* enable profiler */ + profile_c_start(); + + /* C function */ + esp_nn_relu6_s8_ansi(inout_ansi, size); + + profile_c_end(); + profile_opt_start(); + + /* Optimized function */ + esp_nn_relu6_s8(inout_opt, size); + + /* disable profiler */ + profile_opt_end(); + + bool ret = CHECK_EQUAL(inout_ansi, inout_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__); + printf("Output: \n"); + PRINT_ARRAY_HEX(inout_opt, size, 1); + printf("Expected: \n"); + PRINT_ARRAY_HEX(inout_ansi, size, 1); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, size, 1); + goto relu6_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__); + +relu6_s8_cleanup: + if (input) { + free (input); + } + if (inout_ansi) { + free (inout_ansi); + } + if (inout_opt) { + free (inout_opt); + } + +} diff --git a/code/components/esp-nn/tests/src/softmax_test.c b/code/components/esp-nn/tests/src/softmax_test.c new file mode 100644 index 00000000..f7c734cd --- /dev/null +++ b/code/components/esp-nn/tests/src/softmax_test.c @@ -0,0 +1,101 @@ +// Copyright 2022 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include + +#include +#include "test_utils.h" + +void esp_nn_softmax_s8_test() +{ + const int32_t height = 8; + const int32_t width = 32; + const int32_t diff_min = -128; + const int32_t mult = INT32_MAX / 2; + const int32_t shift = 7; + void *scratch_buf = NULL; + const int size = width * height; + int8_t *input, *out_ansi, *out_opt; + + input = memalign(16, size); + out_ansi = memalign(16, size); + out_opt = memalign(16, size); + + if (input == NULL || out_ansi == NULL || out_opt == NULL) { + printf(ANSI_COLOR_RED"%s buffer allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); + goto softmax_s8_cleanup; + } + + /* Generate input data between -128 -> +127 */ + for (int i = 0; i < size; ++i) { + input[i] = rand() % 255 - 128; + } + + /* enable profiler */ + profile_c_start(); + + /* C function */ + esp_nn_softmax_s8_ansi(input, height, width, mult, shift, diff_min, out_ansi); + + profile_c_end(); + + int32_t scratch_buf_size = esp_nn_get_softmax_scratch_size(width, height); + if (scratch_buf_size) { + scratch_buf = memalign(4, scratch_buf_size); + if (scratch_buf == NULL) { + printf(ANSI_COLOR_RED"%s scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, __FUNCTION__, scratch_buf_size); + goto softmax_s8_cleanup; + } + esp_nn_set_softmax_scratch_buf(scratch_buf); + } + + profile_opt_start(); + + /* Optimized function */ + esp_nn_softmax_s8(input, height, width, mult, shift, diff_min, out_opt); + + /* disable profiler */ + profile_opt_end(); + + bool ret = CHECK_EQUAL(out_ansi, out_opt, size); + if (ret == false) { + printf(ANSI_COLOR_RED"%s failed\n"ANSI_COLOR_RESET, __FUNCTION__); + printf("Output: \n"); + PRINT_ARRAY_HEX(out_opt, width, height); + printf("Expected: \n"); + PRINT_ARRAY_HEX(out_ansi, width, height); + printf("Input:\n"); + PRINT_ARRAY_HEX(input, width, height); + goto softmax_s8_cleanup; + } + printf(ANSI_COLOR_GREEN"%s passed\n"ANSI_COLOR_RESET, __FUNCTION__); + +softmax_s8_cleanup: + if (input) { + free (input); + } + if (out_ansi) { + free (out_ansi); + } + if (out_opt) { + free (out_opt); + } + if (scratch_buf) { + free (scratch_buf); + } +} diff --git a/code/components/esp32-camera-master_neu_20220121.zip b/code/components/esp32-camera-master_neu_20220121.zip deleted file mode 100644 index 3acbcf1a..00000000 Binary files a/code/components/esp32-camera-master_neu_20220121.zip and /dev/null differ diff --git a/code/components/esp32-camera-master_old_version.zip b/code/components/esp32-camera-master_old_version.zip deleted file mode 100644 index c0c60f8f..00000000 Binary files a/code/components/esp32-camera-master_old_version.zip and /dev/null differ diff --git a/code/components/jomjol_flowcontroll/ClassFlow.cpp b/code/components/jomjol_flowcontroll/ClassFlow.cpp index ff14c1b2..f15844d5 100644 --- a/code/components/jomjol_flowcontroll/ClassFlow.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlow.cpp @@ -19,7 +19,6 @@ void ClassFlow::SetInitialParameter(void) std::vector ClassFlow::ZerlegeZeile(std::string input, std::string delimiter) { std::vector Output; -// std::string delimiter = " =,"; input = trim(input, delimiter); size_t pos = findDelimiterPos(input, delimiter); diff --git a/code/components/jomjol_flowcontroll/ClassFlow.h b/code/components/jomjol_flowcontroll/ClassFlow.h index 4df4777c..92184d32 100644 --- a/code/components/jomjol_flowcontroll/ClassFlow.h +++ b/code/components/jomjol_flowcontroll/ClassFlow.h @@ -26,7 +26,6 @@ struct HTMLInfo class ClassFlow { protected: -// std::vector ZerlegeZeile(string input); 
std::vector ZerlegeZeile(string input, string delimiter = " =, \t"); bool isNewParagraph(string input); bool GetNextParagraph(FILE* pfile, string& aktparamgraph); diff --git a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp index e3092308..9644b265 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.cpp @@ -197,33 +197,6 @@ int ClassFlowCNNGeneral::ZeigerEvalHybrid(float zahl, float zahl_vorgaenger, int return ((int) trunc(zahl) + 10) % 10; } -/* -int ClassFlowCNNGeneral::ZeigerEvalHybrid_NEU(float zahl, float zahl_vorgaenger) -{ - int ergebnis_nachkomma = ((int) floor(zahl * 10) + 10) % 10; - int ergebnis_vorkomma = ((int) floor(zahl) + 10) % 10; - int ergebnis, ergebnis_rating; - - - if (zahl_vorgaenger < 0) - return ergebnis_vorkomma % 10; - - ergebnis_rating = ergebnis_nachkomma - zahl_vorgaenger; - if (ergebnis_nachkomma >= 5) - ergebnis_rating-=5; - else - ergebnis_rating+=5; - ergebnis = (int) round(zahl); - if (ergebnis_rating < 0) - ergebnis-=1; - if (ergebnis == -1) - ergebnis+=10; - - ergebnis = (ergebnis + 10) % 10; - return ergebnis; - -} -*/ int ClassFlowCNNGeneral::ZeigerEval(float zahl, int ziffer_vorgaenger) @@ -309,11 +282,12 @@ bool ClassFlowCNNGeneral::ReadParameter(FILE* pfile, string& aktparamgraph) { CNNGoodThreshold = std::stof(zerlegt[1]); } - if ((toUpper(zerlegt[0]) == "MODELINPUTSIZE") && (zerlegt.size() > 2)) +/* if ((toUpper(zerlegt[0]) == "MODELINPUTSIZE") && (zerlegt.size() > 2)) { this->modelxsize = std::stoi(zerlegt[1]); this->modelysize = std::stoi(zerlegt[2]); } +*/ if (zerlegt.size() >= 5) { general* _analog = GetGENERAL(zerlegt[0], true); @@ -334,11 +308,14 @@ bool ClassFlowCNNGeneral::ReadParameter(FILE* pfile, string& aktparamgraph) } } + if (!getNetworkParameter()) + return false; - for (int _ana = 0; _ana < GENERAL.size(); ++_ana) + + for (int _ana = 0; _ana < GENERAL.size(); ++_ana) for (int i = 0; i < GENERAL[_ana]->ROI.size(); ++i) { - GENERAL[_ana]->ROI[i]->image = new CImageBasis(modelxsize, modelysize, 3); + GENERAL[_ana]->ROI[i]->image = new CImageBasis(modelxsize, modelysize, modelchannel); GENERAL[_ana]->ROI[i]->image_org = new CImageBasis(GENERAL[_ana]->ROI[i]->deltax, GENERAL[_ana]->ROI[i]->deltay, 3); } @@ -499,13 +476,11 @@ void ClassFlowCNNGeneral::DrawROI(CImageBasis *_zw) } } -bool ClassFlowCNNGeneral::doNeuralNetwork(string time) +bool ClassFlowCNNGeneral::getNetworkParameter() { if (disabled) return true; - string logPath = CreateLogFolder(time); - CTfLiteClass *tflite = new CTfLiteClass; string zwcnn = "/sdcard" + cnnmodelfile; zwcnn = FormatFileName(zwcnn); @@ -513,7 +488,6 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (!tflite->LoadModel(zwcnn)) { printf("Can't read model file /sdcard%s\n", cnnmodelfile.c_str()); LogFile.WriteToFile("Cannot load model"); - delete tflite; return false; } @@ -521,6 +495,11 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (CNNType == AutoDetect) { + tflite->GetInputDimension(false); + modelxsize = tflite->ReadInputDimenstion(0); + modelysize = tflite->ReadInputDimenstion(1); + modelchannel = tflite->ReadInputDimenstion(2); + int _anzoutputdimensions = tflite->GetAnzOutPut(); switch (_anzoutputdimensions) { @@ -549,6 +528,30 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) } } + delete tflite; + return true; +} + +bool ClassFlowCNNGeneral::doNeuralNetwork(string time) +{ + if (disabled) + return true; + + 
string logPath = CreateLogFolder(time); + + CTfLiteClass *tflite = new CTfLiteClass; + string zwcnn = "/sdcard" + cnnmodelfile; + zwcnn = FormatFileName(zwcnn); + printf(zwcnn.c_str());printf("\n"); + if (!tflite->LoadModel(zwcnn)) { + printf("Can't read model file /sdcard%s\n", cnnmodelfile.c_str()); + LogFile.WriteToFile("Cannot load model"); + + delete tflite; + return false; + } + tflite->MakeAllocate(); + for (int _ana = 0; _ana < GENERAL.size(); ++_ana) { for (int i = 0; i < GENERAL[_ana]->ROI.size(); ++i) @@ -581,14 +584,15 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (isLogImage) { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; if (isLogImageSelect) { if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); } else { - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); } } } break; @@ -617,7 +621,18 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (debugdetailgeneral) LogFile.WriteToFile(_zwres); if (isLogImage) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, &GENERAL[_ana]->ROI[i]->result_float, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + } } break; case DigitalHyprid10: { @@ -641,7 +656,18 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (debugdetailgeneral) LogFile.WriteToFile(_zwres); if (isLogImage) - LogImage(logPath, GENERAL[_ana]->ROI[i]->name, &GENERAL[_ana]->ROI[i]->result_float, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, NULL, &GENERAL[_ana]->ROI[i]->result_klasse, time, GENERAL[_ana]->ROI[i]->image_org); + } + } } break; case DoubleHyprid10: @@ -649,6 +675,7 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) int _num, _numplus, _numminus; float _val, _valplus, _valminus; float _fit; + float _result_save_file; tflite->LoadInputImageBasis(GENERAL[_ana]->ROI[i]->image); tflite->Invoke(); @@ -680,10 +707,13 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) if (result < 0) result = result + 10; + _result_save_file = result; + if (_fit < CNNGoodThreshold) { GENERAL[_ana]->ROI[i]->isReject = true; result = -1; + _result_save_file+= 100; // Für den Fall, dass fit nicht ausreichend, soll trotzdem das Ergebnis mit "-10x.y" abgespeichert werden. 
string zw = "Value Rejected due to Threshold (Fit: " + to_string(_fit) + "Threshold: " + to_string(CNNGoodThreshold); printf("Value Rejected due to Threshold (Fit: %f, Threshold: %f\n", _fit, CNNGoodThreshold); LogFile.WriteToFile(zw); @@ -693,9 +723,23 @@ bool ClassFlowCNNGeneral::doNeuralNetwork(string time) GENERAL[_ana]->ROI[i]->isReject = false; } + GENERAL[_ana]->ROI[i]->result_float = result; printf("Result General(Analog)%i: %f\n", i, GENERAL[_ana]->ROI[i]->result_float); + if (isLogImage) + { + string _imagename = GENERAL[_ana]->name + "_" + GENERAL[_ana]->ROI[i]->name; + if (isLogImageSelect) + { + if (LogImageSelect.find(GENERAL[_ana]->ROI[i]->name) != std::string::npos) + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + else + { + LogImage(logPath, _imagename, &_result_save_file, NULL, time, GENERAL[_ana]->ROI[i]->image_org); + } + } } break; diff --git a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h index e9c5c3ce..32fcf9bd 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h +++ b/code/components/jomjol_flowcontroll/ClassFlowCNNGeneral.h @@ -24,7 +24,7 @@ protected: float CNNGoodThreshold; string cnnmodelfile; - int modelxsize, modelysize; + int modelxsize, modelysize, modelchannel; bool isLogImageSelect; string LogImageSelect; ClassFlowAlignment* flowpostalignment; @@ -39,6 +39,8 @@ protected: bool doNeuralNetwork(string time); bool doAlignAndCut(string time); + bool getNetworkParameter(); + public: ClassFlowCNNGeneral(ClassFlowAlignment *_flowalign, t_CNNType _cnntype = AutoDetect); diff --git a/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h b/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h index 181332d0..98432886 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h +++ b/code/components/jomjol_flowcontroll/ClassFlowDefineTypes.h @@ -37,6 +37,7 @@ struct NumberPost { float PreValue; // letzter Wert, der gut ausgelesen wurde float Value; // letzer ausgelesener Wert, inkl. Korrekturen string ReturnRateValue; // RückgabewertRate + string ReturnChangeAbsolute; // RückgabewertRate string ReturnRawValue; // Rohwert (mit N & führenden 0) string ReturnValue; // korrigierter Rückgabewert, ggf. 
mit Fehlermeldung string ReturnPreValue; // korrigierter Rückgabewert ohne Fehlermeldung diff --git a/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp b/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp index ceaf9671..f4e014e9 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowMQTT.cpp @@ -149,6 +149,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) std::string resultraw = ""; std::string resultrate = ""; std::string resulttimestamp = ""; + std::string resultchangabs = ""; string zw = ""; string namenumber = ""; @@ -180,6 +181,7 @@ bool ClassFlowMQTT::doFlow(string zwtime) resultraw = (*NUMBERS)[i]->ReturnRawValue; resulterror = (*NUMBERS)[i]->ErrorMessageText; resultrate = (*NUMBERS)[i]->ReturnRateValue; + resultchangabs = (*NUMBERS)[i]->ReturnChangeAbsolute; resulttimestamp = (*NUMBERS)[i]->timeStamp; namenumber = (*NUMBERS)[i]->name; @@ -200,6 +202,10 @@ bool ClassFlowMQTT::doFlow(string zwtime) if (resultrate.length() > 0) MQTTPublish(zw, resultrate, SetRetainFlag); + zw = namenumber + "changeabsolut"; + if (resultchangabs.length() > 0) + MQTTPublish(zw, resultchangabs, SetRetainFlag); + zw = namenumber + "raw"; if (resultraw.length() > 0) MQTTPublish(zw, resultraw, SetRetainFlag); diff --git a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp index f61e0140..b3c60fad 100644 --- a/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp +++ b/code/components/jomjol_flowcontroll/ClassFlowPostProcessing.cpp @@ -77,6 +77,8 @@ void ClassFlowPostProcessing::SetPreValue(float zw, string _numbers, bool _exter if (NUMBERS[j]->name == _numbers) { NUMBERS[j]->PreValue = zw; + NUMBERS[j]->ReturnPreValue = std::to_string(zw); + NUMBERS[j]->PreValueOkay = true; if (_extern) { time(&(NUMBERS[j]->lastvalue)); @@ -541,7 +543,6 @@ void ClassFlowPostProcessing::InitNUMBERS() _number->ReturnRawValue = ""; // Rohwert (mit N & führenden 0) _number->ReturnValue = ""; // korrigierter Rückgabewert, ggf. mit Fehlermeldung -// _number->ReturnValueNoError = ""; // korrigierter Rückgabewert ohne Fehlermeldung _number->ErrorMessageText = ""; // Fehlermeldung bei Consistency Check _number->ReturnPreValue = ""; _number->PreValueOkay = false; @@ -560,7 +561,6 @@ void ClassFlowPostProcessing::InitNUMBERS() _number->Value = 0; // letzer ausgelesener Wert, inkl. Korrekturen _number->ReturnRawValue = ""; // Rohwert (mit N & führenden 0) _number->ReturnValue = ""; // korrigierter Rückgabewert, ggf. 
mit Fehlermeldung -// _number->ReturnValueNoError = ""; // korrigierter Rückgabewert ohne Fehlermeldung _number->ErrorMessageText = ""; // Fehlermeldung bei Consistency Check _number->Nachkomma = _number->AnzahlAnalog; @@ -722,7 +722,7 @@ bool ClassFlowPostProcessing::doFlow(string zwtime) if (NUMBERS[j]->useMaxRateValue && PreValueUse && NUMBERS[j]->PreValueOkay) { - float _ratedifference; + float _ratedifference; if (NUMBERS[j]->RateType == RateChange) _ratedifference = NUMBERS[j]->FlowRateAct; else @@ -745,6 +745,7 @@ bool ClassFlowPostProcessing::doFlow(string zwtime) NUMBERS[j]->ReturnValue = RundeOutput(NUMBERS[j]->Value, NUMBERS[j]->Nachkomma); NUMBERS[j]->ReturnPreValue = RundeOutput(NUMBERS[j]->PreValue, NUMBERS[j]->Nachkomma); + NUMBERS[j]->ReturnChangeAbsolute = RundeOutput(NUMBERS[j]->Value - NUMBERS[j]->PreValue, NUMBERS[j]->Nachkomma); NUMBERS[j]->ErrorMessageText = "no error"; UpdatePreValueINI = true; diff --git a/code/components/jomjol_tfliteclass/CTfLiteClass.cpp b/code/components/jomjol_tfliteclass/CTfLiteClass.cpp index df008a1b..15affbc0 100644 --- a/code/components/jomjol_tfliteclass/CTfLiteClass.cpp +++ b/code/components/jomjol_tfliteclass/CTfLiteClass.cpp @@ -87,6 +87,19 @@ void CTfLiteClass::GetInputDimension(bool silent = false) } } +int CTfLiteClass::ReadInputDimenstion(int _dim) +{ + if (_dim == 0) + return im_width; + if (_dim == 1) + return im_height; + if (_dim == 2) + return im_channel; + + return -1; +} + + int CTfLiteClass::GetAnzOutPut(bool silent) { diff --git a/code/components/jomjol_tfliteclass/CTfLiteClass.h b/code/components/jomjol_tfliteclass/CTfLiteClass.h index cab5a0e3..ef98c1fa 100644 --- a/code/components/jomjol_tfliteclass/CTfLiteClass.h +++ b/code/components/jomjol_tfliteclass/CTfLiteClass.h @@ -71,5 +71,6 @@ class CTfLiteClass float GetOutputValue(int nr); void GetInputDimension(bool silent); + int ReadInputDimenstion(int _dim); }; diff --git a/code/components/tflite-lib/CMakeLists.txt b/code/components/tflite-lib/CMakeLists.txt index fab7027a..eed31a57 100644 --- a/code/components/tflite-lib/CMakeLists.txt +++ b/code/components/tflite-lib/CMakeLists.txt @@ -1,3 +1,5 @@ +## TODO: GLOB is not a good way to collect files. Use explicit file list instead + cmake_minimum_required(VERSION 3.5) set(tflite_dir "${CMAKE_CURRENT_SOURCE_DIR}/tensorflow/lite") @@ -16,14 +18,27 @@ file(GLOB srcs_kernels "${tfmicro_kernels_dir}/*.c" "${tfmicro_kernels_dir}/*.cc") +# remove sources which will be provided by esp_nn +list(REMOVE_ITEM srcs_kernels + "${tfmicro_kernels_dir}/add.cc" + "${tfmicro_kernels_dir}/conv.cc" + "${tfmicro_kernels_dir}/depthwise_conv.cc" + "${tfmicro_kernels_dir}/fully_connected.cc" + "${tfmicro_kernels_dir}/mul.cc" + "${tfmicro_kernels_dir}/pooling.cc") + +FILE(GLOB esp_nn_kernels + "${tfmicro_kernels_dir}/esp_nn/*.cc") + set(lib_srcs "${srcs_micro}" "${srcs_kernels}" + "${esp_nn_kernels}" "${src_micro_frontend}" "${tflite_dir}/kernels/kernel_util.cc" "${tflite_dir}/micro/memory_planner/greedy_memory_planner.cc" "${tflite_dir}/micro/memory_planner/linear_memory_planner.cc" - "${tflite_dir}/c/common.c" + "${tflite_dir}/c/common.cc" "${tflite_dir}/core/api/error_reporter.cc" "${tflite_dir}/core/api/flatbuffer_conversions.cc" "${tflite_dir}/core/api/op_resolver.cc" @@ -36,15 +51,17 @@ idf_component_register( INCLUDE_DIRS "." 
"third_party/gemmlowp" "third_party/flatbuffers/include" "third_party/ruy" - "third_party/kissfft") + "third_party/kissfft" + REQUIRES "esp-nn") # Reduce the level of paranoia to be able to compile TF sources target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-maybe-uninitialized -Wno-missing-field-initializers + -DESP_NN # enables ESP-NN optimizations by Espressif -Wno-type-limits) -target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-nonnull -Wno-nonnull -Wno-nonnull) -target_compile_options(${COMPONENT_LIB} PRIVATE $<$: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -DESP -DESP_NN -Wno-return-type -Wno-strict-aliasing -std=gnu++14 -Wno-return-type -Wno-strict-aliasing -std=gnu++14 -Wno-return-type -Wno-strict-aliasing -std=gnu++14 >) +target_compile_options(${COMPONENT_LIB} PRIVATE -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-nonnull) +target_compile_options(${COMPONENT_LIB} PRIVATE $<$: -std=c++11 -fno-rtti -fno-exceptions -fno-threadsafe-statics -fno-unwind-tables -ffunction-sections -fdata-sections -fmessage-length=0 -DTF_LITE_STATIC_MEMORY -DTF_LITE_DISABLE_X86_NEON -O3 -Werror -Wsign-compare -Wdouble-promotion -Wshadow -Wunused-variable -Wmissing-field-initializers -Wunused-function -Wswitch -Wvla -Wall -Wextra -Wstrict-aliasing -Wno-unused-parameter -Wno-return-type -Wno-strict-aliasing -std=gnu++14 >) target_compile_options(${COMPONENT_LIB} INTERFACE $<$>:-DTF_LITE_STATIC_MEMORY>) target_link_libraries(${COMPONENT_LIB} PRIVATE -lm) diff --git a/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h b/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h new file mode 100644 index 00000000..b9d42845 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/builtin_op_data.h @@ -0,0 +1,22 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Compatibility shim for new location of interface definitions. 
+ +#ifndef TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ +#define TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ + +#include "tensorflow/lite/c/builtin_op_data.h" + +#endif // TENSORFLOW_LITE_BUILTIN_OP_DATA_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/builtin_ops.h b/code/components/tflite-lib/tensorflow/lite/builtin_ops.h new file mode 100644 index 00000000..19ce3e2c --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/builtin_ops.h @@ -0,0 +1,187 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_BUILTIN_OPS_H_ +#define TENSORFLOW_LITE_BUILTIN_OPS_H_ + +// DO NOT EDIT MANUALLY: This file is automatically generated by +// `schema/builtin_ops_header/generator.cc`. + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// The enum for builtin operators. +// Note: CUSTOM, DELEGATE, and PLACEHOLDER_FOR_GREATER_OP_CODES are 3 special +// ops which are not real built-in ops. +typedef enum { + kTfLiteBuiltinAdd = 0, + kTfLiteBuiltinAveragePool2d = 1, + kTfLiteBuiltinConcatenation = 2, + kTfLiteBuiltinConv2d = 3, + kTfLiteBuiltinDepthwiseConv2d = 4, + kTfLiteBuiltinDepthToSpace = 5, + kTfLiteBuiltinDequantize = 6, + kTfLiteBuiltinEmbeddingLookup = 7, + kTfLiteBuiltinFloor = 8, + kTfLiteBuiltinFullyConnected = 9, + kTfLiteBuiltinHashtableLookup = 10, + kTfLiteBuiltinL2Normalization = 11, + kTfLiteBuiltinL2Pool2d = 12, + kTfLiteBuiltinLocalResponseNormalization = 13, + kTfLiteBuiltinLogistic = 14, + kTfLiteBuiltinLshProjection = 15, + kTfLiteBuiltinLstm = 16, + kTfLiteBuiltinMaxPool2d = 17, + kTfLiteBuiltinMul = 18, + kTfLiteBuiltinRelu = 19, + kTfLiteBuiltinReluN1To1 = 20, + kTfLiteBuiltinRelu6 = 21, + kTfLiteBuiltinReshape = 22, + kTfLiteBuiltinResizeBilinear = 23, + kTfLiteBuiltinRnn = 24, + kTfLiteBuiltinSoftmax = 25, + kTfLiteBuiltinSpaceToDepth = 26, + kTfLiteBuiltinSvdf = 27, + kTfLiteBuiltinTanh = 28, + kTfLiteBuiltinConcatEmbeddings = 29, + kTfLiteBuiltinSkipGram = 30, + kTfLiteBuiltinCall = 31, + kTfLiteBuiltinCustom = 32, + kTfLiteBuiltinEmbeddingLookupSparse = 33, + kTfLiteBuiltinPad = 34, + kTfLiteBuiltinUnidirectionalSequenceRnn = 35, + kTfLiteBuiltinGather = 36, + kTfLiteBuiltinBatchToSpaceNd = 37, + kTfLiteBuiltinSpaceToBatchNd = 38, + kTfLiteBuiltinTranspose = 39, + kTfLiteBuiltinMean = 40, + kTfLiteBuiltinSub = 41, + kTfLiteBuiltinDiv = 42, + kTfLiteBuiltinSqueeze = 43, + kTfLiteBuiltinUnidirectionalSequenceLstm = 44, + kTfLiteBuiltinStridedSlice = 45, + kTfLiteBuiltinBidirectionalSequenceRnn = 46, + kTfLiteBuiltinExp = 47, + kTfLiteBuiltinTopkV2 = 48, + kTfLiteBuiltinSplit = 49, + kTfLiteBuiltinLogSoftmax = 50, + kTfLiteBuiltinDelegate = 51, + kTfLiteBuiltinBidirectionalSequenceLstm = 52, + kTfLiteBuiltinCast = 53, + kTfLiteBuiltinPrelu = 54, + kTfLiteBuiltinMaximum = 55, + kTfLiteBuiltinArgMax = 56, + kTfLiteBuiltinMinimum = 57, + kTfLiteBuiltinLess = 58, + kTfLiteBuiltinNeg = 59, + kTfLiteBuiltinPadv2 = 60, + 
kTfLiteBuiltinGreater = 61, + kTfLiteBuiltinGreaterEqual = 62, + kTfLiteBuiltinLessEqual = 63, + kTfLiteBuiltinSelect = 64, + kTfLiteBuiltinSlice = 65, + kTfLiteBuiltinSin = 66, + kTfLiteBuiltinTransposeConv = 67, + kTfLiteBuiltinSparseToDense = 68, + kTfLiteBuiltinTile = 69, + kTfLiteBuiltinExpandDims = 70, + kTfLiteBuiltinEqual = 71, + kTfLiteBuiltinNotEqual = 72, + kTfLiteBuiltinLog = 73, + kTfLiteBuiltinSum = 74, + kTfLiteBuiltinSqrt = 75, + kTfLiteBuiltinRsqrt = 76, + kTfLiteBuiltinShape = 77, + kTfLiteBuiltinPow = 78, + kTfLiteBuiltinArgMin = 79, + kTfLiteBuiltinFakeQuant = 80, + kTfLiteBuiltinReduceProd = 81, + kTfLiteBuiltinReduceMax = 82, + kTfLiteBuiltinPack = 83, + kTfLiteBuiltinLogicalOr = 84, + kTfLiteBuiltinOneHot = 85, + kTfLiteBuiltinLogicalAnd = 86, + kTfLiteBuiltinLogicalNot = 87, + kTfLiteBuiltinUnpack = 88, + kTfLiteBuiltinReduceMin = 89, + kTfLiteBuiltinFloorDiv = 90, + kTfLiteBuiltinReduceAny = 91, + kTfLiteBuiltinSquare = 92, + kTfLiteBuiltinZerosLike = 93, + kTfLiteBuiltinFill = 94, + kTfLiteBuiltinFloorMod = 95, + kTfLiteBuiltinRange = 96, + kTfLiteBuiltinResizeNearestNeighbor = 97, + kTfLiteBuiltinLeakyRelu = 98, + kTfLiteBuiltinSquaredDifference = 99, + kTfLiteBuiltinMirrorPad = 100, + kTfLiteBuiltinAbs = 101, + kTfLiteBuiltinSplitV = 102, + kTfLiteBuiltinUnique = 103, + kTfLiteBuiltinCeil = 104, + kTfLiteBuiltinReverseV2 = 105, + kTfLiteBuiltinAddN = 106, + kTfLiteBuiltinGatherNd = 107, + kTfLiteBuiltinCos = 108, + kTfLiteBuiltinWhere = 109, + kTfLiteBuiltinRank = 110, + kTfLiteBuiltinElu = 111, + kTfLiteBuiltinReverseSequence = 112, + kTfLiteBuiltinMatrixDiag = 113, + kTfLiteBuiltinQuantize = 114, + kTfLiteBuiltinMatrixSetDiag = 115, + kTfLiteBuiltinRound = 116, + kTfLiteBuiltinHardSwish = 117, + kTfLiteBuiltinIf = 118, + kTfLiteBuiltinWhile = 119, + kTfLiteBuiltinNonMaxSuppressionV4 = 120, + kTfLiteBuiltinNonMaxSuppressionV5 = 121, + kTfLiteBuiltinScatterNd = 122, + kTfLiteBuiltinSelectV2 = 123, + kTfLiteBuiltinDensify = 124, + kTfLiteBuiltinSegmentSum = 125, + kTfLiteBuiltinBatchMatmul = 126, + kTfLiteBuiltinPlaceholderForGreaterOpCodes = 127, + kTfLiteBuiltinCumsum = 128, + kTfLiteBuiltinCallOnce = 129, + kTfLiteBuiltinBroadcastTo = 130, + kTfLiteBuiltinRfft2d = 131, + kTfLiteBuiltinConv3d = 132, + kTfLiteBuiltinImag = 133, + kTfLiteBuiltinReal = 134, + kTfLiteBuiltinComplexAbs = 135, + kTfLiteBuiltinHashtable = 136, + kTfLiteBuiltinHashtableFind = 137, + kTfLiteBuiltinHashtableImport = 138, + kTfLiteBuiltinHashtableSize = 139, + kTfLiteBuiltinReduceAll = 140, + kTfLiteBuiltinConv3dTranspose = 141, + kTfLiteBuiltinVarHandle = 142, + kTfLiteBuiltinReadVariable = 143, + kTfLiteBuiltinAssignVariable = 144, + kTfLiteBuiltinBroadcastArgs = 145, + kTfLiteBuiltinRandomStandardNormal = 146, + kTfLiteBuiltinBucketize = 147, + kTfLiteBuiltinRandomUniform = 148, + kTfLiteBuiltinMultinomial = 149, + kTfLiteBuiltinGelu = 150, + kTfLiteBuiltinDynamicUpdateSlice = 151, +} TfLiteBuiltinOperator; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_LITE_BUILTIN_OPS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h b/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h index 678dfae6..d2524969 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h +++ b/code/components/tflite-lib/tensorflow/lite/c/c_api_types.h @@ -98,6 +98,7 @@ typedef enum { kTfLiteResource = 14, kTfLiteVariant = 15, kTfLiteUInt32 = 16, + kTfLiteUInt16 = 17, } TfLiteType; // Legacy. 
Will be deprecated in favor of TfLiteAffineQuantization. @@ -111,6 +112,12 @@ typedef struct TfLiteQuantizationParams { int32_t zero_point; } TfLiteQuantizationParams; +// -------------------------------------------------------------------------- +// Opaque types used by c_api_opaque.h. + +// TfLiteOpaqueTensor is an opaque version of TfLiteTensor; +typedef struct TfLiteOpaqueTensor TfLiteOpaqueTensor; + #ifdef __cplusplus } // extern C #endif diff --git a/code/components/tflite-lib/tensorflow/lite/c/common.c b/code/components/tflite-lib/tensorflow/lite/c/common.cc similarity index 89% rename from code/components/tflite-lib/tensorflow/lite/c/common.c rename to code/components/tflite-lib/tensorflow/lite/c/common.cc index d149d22c..956e9d69 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/common.c +++ b/code/components/tflite-lib/tensorflow/lite/c/common.cc @@ -21,6 +21,8 @@ limitations under the License. #include #endif // TF_LITE_STATIC_MEMORY +extern "C" { + size_t TfLiteIntArrayGetSizeInBytes(int size) { static TfLiteIntArray dummy; @@ -34,13 +36,13 @@ size_t TfLiteIntArrayGetSizeInBytes(int size) { int TfLiteIntArrayEqual(const TfLiteIntArray* a, const TfLiteIntArray* b) { if (a == b) return 1; - if (a == NULL || b == NULL) return 0; + if (a == nullptr || b == nullptr) return 0; return TfLiteIntArrayEqualsArray(a, b->size, b->data); } int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size, const int b_data[]) { - if (a == NULL) return (b_size == 0); + if (a == nullptr) return (b_size == 0); if (a->size != b_size) return 0; int i = 0; for (; i < a->size; i++) @@ -52,7 +54,7 @@ int TfLiteIntArrayEqualsArray(const TfLiteIntArray* a, int b_size, TfLiteIntArray* TfLiteIntArrayCreate(int size) { size_t alloc_size = TfLiteIntArrayGetSizeInBytes(size); - if (alloc_size <= 0) return NULL; + if (alloc_size <= 0) return nullptr; TfLiteIntArray* ret = (TfLiteIntArray*)malloc(alloc_size); if (!ret) return ret; ret->size = size; @@ -60,7 +62,7 @@ TfLiteIntArray* TfLiteIntArrayCreate(int size) { } TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) { - if (!src) return NULL; + if (!src) return nullptr; TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size); if (ret) { memcpy(ret->data, src->data, src->size * sizeof(int)); @@ -99,7 +101,7 @@ void TfLiteTensorDataFree(TfLiteTensor* t) { t->allocation_type == kTfLitePersistentRo) { free(t->data.raw); } - t->data.raw = NULL; + t->data.raw = nullptr; } void TfLiteQuantizationFree(TfLiteQuantization* quantization) { @@ -108,31 +110,31 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { (TfLiteAffineQuantization*)(quantization->params); if (q_params->scale) { TfLiteFloatArrayFree(q_params->scale); - q_params->scale = NULL; + q_params->scale = nullptr; } if (q_params->zero_point) { TfLiteIntArrayFree(q_params->zero_point); - q_params->zero_point = NULL; + q_params->zero_point = nullptr; } free(q_params); } - quantization->params = NULL; + quantization->params = nullptr; quantization->type = kTfLiteNoQuantization; } void TfLiteSparsityFree(TfLiteSparsity* sparsity) { - if (sparsity == NULL) { + if (sparsity == nullptr) { return; } if (sparsity->traversal_order) { TfLiteIntArrayFree(sparsity->traversal_order); - sparsity->traversal_order = NULL; + sparsity->traversal_order = nullptr; } if (sparsity->block_map) { TfLiteIntArrayFree(sparsity->block_map); - sparsity->block_map = NULL; + sparsity->block_map = nullptr; } if (sparsity->dim_metadata) { @@ -141,13 +143,13 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { 
TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i]; if (metadata.format == kTfLiteDimSparseCSR) { TfLiteIntArrayFree(metadata.array_segments); - metadata.array_segments = NULL; + metadata.array_segments = nullptr; TfLiteIntArrayFree(metadata.array_indices); - metadata.array_indices = NULL; + metadata.array_indices = nullptr; } } free(sparsity->dim_metadata); - sparsity->dim_metadata = NULL; + sparsity->dim_metadata = nullptr; } free(sparsity); @@ -156,16 +158,16 @@ void TfLiteSparsityFree(TfLiteSparsity* sparsity) { void TfLiteTensorFree(TfLiteTensor* t) { TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); - t->dims = NULL; + t->dims = nullptr; if (t->dims_signature) { TfLiteIntArrayFree((TfLiteIntArray *) t->dims_signature); } - t->dims_signature = NULL; + t->dims_signature = nullptr; TfLiteQuantizationFree(&t->quantization); TfLiteSparsityFree(t->sparsity); - t->sparsity = NULL; + t->sparsity = nullptr; } void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, @@ -185,7 +187,7 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, tensor->is_variable = is_variable; tensor->quantization.type = kTfLiteNoQuantization; - tensor->quantization.params = NULL; + tensor->quantization.params = nullptr; } TfLiteStatus TfLiteTensorCopy(const TfLiteTensor* src, TfLiteTensor* dst) { @@ -229,6 +231,8 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "NOTYPE"; case kTfLiteFloat32: return "FLOAT32"; + case kTfLiteUInt16: + return "UINT16"; case kTfLiteInt16: return "INT16"; case kTfLiteInt32: @@ -263,14 +267,6 @@ const char* TfLiteTypeGetName(TfLiteType type) { return "Unknown type"; } -TfLiteDelegate TfLiteDelegateCreate(void) { - TfLiteDelegate d = { - .data_ = NULL, - .Prepare = NULL, - .CopyFromBufferHandle = NULL, - .CopyToBufferHandle = NULL, - .FreeBufferHandle = NULL, - .flags = kTfLiteDelegateFlagsNone, - }; - return d; -} +TfLiteDelegate TfLiteDelegateCreate() { return TfLiteDelegate{}; } + +} // extern "C" diff --git a/code/components/tflite-lib/tensorflow/lite/c/common.h b/code/components/tflite-lib/tensorflow/lite/c/common.h index 7056d1e2..6a109e1e 100644 --- a/code/components/tflite-lib/tensorflow/lite/c/common.h +++ b/code/components/tflite-lib/tensorflow/lite/c/common.h @@ -173,8 +173,9 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); } \ } while (false) #else // TF_LITE_STRIP_ERROR_STRINGS -#define TF_LITE_KERNEL_LOG(context, ...) -#define TF_LITE_MAYBE_KERNEL_LOG(context, ...) +#define UNUSED(...) (void)sizeof(#__VA_ARGS__) +#define TF_LITE_KERNEL_LOG(context, ...) UNUSED(__VA_ARGS__) +#define TF_LITE_MAYBE_KERNEL_LOG(context, ...) UNUSED(__VA_ARGS__) #endif // TF_LITE_STRIP_ERROR_STRINGS // Check whether value is true, and if not return kTfLiteError from @@ -316,6 +317,7 @@ typedef union TfLitePtrUnion { uint8_t* uint8; bool* b; int16_t* i16; + uint16_t* ui16; TfLiteComplex64* c64; TfLiteComplex128* c128; int8_t* int8; @@ -459,7 +461,8 @@ typedef struct TfLiteTensor { // Optional. Encodes shapes with unknown dimensions with -1. This field is // only populated when unknown dimensions exist in a read-write tensor (i.e. // an input or output tensor). (e.g. `dims` contains [1, 1, 1, 3] and - // `dims_signature` contains [1, -1, -1, 3]). + // `dims_signature` contains [1, -1, -1, 3]). Note that this field only + // exists when TF_LITE_STATIC_MEMORY is not defined. 
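The common.h hunk above replaces the empty stripped-logging macros with a `sizeof` over the stringified arguments, so a log call still parses as an ordinary statement yet reduces to a discarded constant expression with no runtime cost. A small, runnable illustration of that pattern; the macros are copied from the hunk, the surrounding program is hypothetical:

```cpp
#include <cstdio>

// As added above for the TF_LITE_STRIP_ERROR_STRINGS branch.
#define UNUSED(...) (void)sizeof(#__VA_ARGS__)
#define TF_LITE_KERNEL_LOG(context, ...) UNUSED(__VA_ARGS__)

int main() {
  int rc = 7;
  TF_LITE_KERNEL_LOG(nullptr, "op failed, rc=%d", rc);  // compiles to a no-op
  std::printf("still runs, rc=%d\n", rc);
  return 0;
}
```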
const TfLiteIntArray* dims_signature; } TfLiteTensor; diff --git a/code/components/tflite-lib/tensorflow/lite/context_util.h b/code/components/tflite-lib/tensorflow/lite/context_util.h new file mode 100644 index 00000000..7c8a5abd --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/context_util.h @@ -0,0 +1,51 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This provides a few C++ helpers that are useful for manipulating C structures +// in C++. +#ifndef TENSORFLOW_LITE_CONTEXT_UTIL_H_ +#define TENSORFLOW_LITE_CONTEXT_UTIL_H_ + +#include + +#include "tensorflow/lite/c/common.h" + +namespace tflite { + +// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite +// C api uses. Can't use the google array_view, since we can't depend on even +// absl for embedded device reasons. +class TfLiteIntArrayView { + public: + // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null + // and this view does not take ownership of it. + explicit TfLiteIntArrayView(const TfLiteIntArray* int_array) + : int_array_(int_array) {} + + TfLiteIntArrayView(const TfLiteIntArrayView&) = default; + TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default; + + typedef const int* const_iterator; + const_iterator begin() const { return int_array_->data; } + const_iterator end() const { return &int_array_->data[int_array_->size]; } + size_t size() const { return end() - begin(); } + int operator[](size_t pos) const { return int_array_->data[pos]; } + + private: + const TfLiteIntArray* int_array_; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_CONTEXT_UTIL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc index dfa0ccfd..e92d754f 100644 --- a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -208,6 +208,14 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseBatchToSpaceNd(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_BROADCAST_ARGS: { + return ParseBroadcastArgs(op, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_BROADCAST_TO: { + return ParseBroadcastTo(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_CALL_ONCE: { return ParseCallOnce(op, error_reporter, allocator, builtin_data); } @@ -336,6 +344,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseLogSoftmax(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_LSTM: { + return ParseLSTM(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_MAXIMUM: { return ParseMaximum(op, error_reporter, allocator, builtin_data); } @@ -605,37 +617,6 @@ 
TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_LSTM: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) { - params->activation = - ConvertActivation(lstm_params->fused_activation_function()); - params->cell_clip = lstm_params->cell_clip(); - params->proj_clip = lstm_params->proj_clip(); - switch (lstm_params->kernel_type()) { - case LSTMKernelType_FULL: - params->kernel_type = kTfLiteLSTMFullKernel; - break; - case LSTMKernelType_BASIC: - params->kernel_type = kTfLiteLSTMBasicKernel; - break; - default: - TF_LITE_REPORT_ERROR(error_reporter, - "Unhandled LSTM kernel type: %d", - lstm_params->kernel_type()); - return kTfLiteError; - } - params->asymmetric_quantize_inputs = - lstm_params->asymmetric_quantize_inputs(); - } else { - TF_LITE_REPORT_ERROR(error_reporter, - "No valid LSTM builtin options exist"); - return kTfLiteError; - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: { return ParseUnidirectionalSequenceLSTM(op, error_reporter, allocator, builtin_data); @@ -883,7 +864,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_SCATTER_ND: case BuiltinOperator_DENSIFY: case BuiltinOperator_SEGMENT_SUM: - case BuiltinOperator_BROADCAST_TO: case BuiltinOperator_RFFT2D: case BuiltinOperator_IMAG: case BuiltinOperator_REAL: @@ -891,7 +871,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_HASHTABLE_FIND: case BuiltinOperator_HASHTABLE_IMPORT: case BuiltinOperator_HASHTABLE_SIZE: - case BuiltinOperator_BROADCAST_ARGS: + case BuiltinOperator_DYNAMIC_UPDATE_SLICE: return kTfLiteOk; case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES: return kTfLiteError; @@ -916,6 +896,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, case TensorType_INT16: *type = kTfLiteInt16; return kTfLiteOk; + case TensorType_UINT16: + *type = kTfLiteUInt16; + return kTfLiteOk; case TensorType_INT32: *type = kTfLiteInt32; return kTfLiteOk; @@ -1085,6 +1068,22 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator*, ErrorReporter*, return kTfLiteOk; } +// We have this parse function instead of directly returning kTfLiteOk from the +// switch-case in ParseOpData because this function is used as part of the +// selective registration for the OpResolver implementation in micro. +TfLiteStatus ParseBroadcastArgs(const Operator*, ErrorReporter*, + BuiltinDataAllocator*, void**) { + return kTfLiteOk; +} + +// We have this parse function instead of directly returning kTfLiteOk from the +// switch-case in ParseOpData because this function is used as part of the +// selective registration for the OpResolver implementation in micro. 
+TfLiteStatus ParseBroadcastTo(const Operator*, ErrorReporter*, + BuiltinDataAllocator*, void**) { + return kTfLiteOk; +} + TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data) { @@ -1605,6 +1604,40 @@ TfLiteStatus ParseLogSoftmax(const Operator*, ErrorReporter*, return kTfLiteOk; } +TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + auto params = safe_allocator.Allocate(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + if (const auto* lstm_params = op->builtin_options_as_LSTMOptions()) { + params->activation = + ConvertActivation(lstm_params->fused_activation_function()); + params->cell_clip = lstm_params->cell_clip(); + params->proj_clip = lstm_params->proj_clip(); + switch (lstm_params->kernel_type()) { + case LSTMKernelType_FULL: + params->kernel_type = kTfLiteLSTMFullKernel; + break; + case LSTMKernelType_BASIC: + params->kernel_type = kTfLiteLSTMBasicKernel; + break; + default: + TF_LITE_REPORT_ERROR(error_reporter, "Unhandled LSTM kernel type: %d", + lstm_params->kernel_type()); + return kTfLiteError; + } + params->asymmetric_quantize_inputs = + lstm_params->asymmetric_quantize_inputs(); + } else { + TF_LITE_REPORT_ERROR(error_reporter, "No valid LSTM builtin options exist"); + return kTfLiteError; + } + *builtin_data = params.release(); + return kTfLiteOk; +} + // We have this parse function instead of directly returning kTfLiteOk from the // switch-case in ParseOpData because this function is used as part of the // selective registration for the OpResolver implementation in micro. @@ -2337,6 +2370,31 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter, return kTfLiteOk; } +TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + std::unique_ptr + params = safe_allocator.Allocate(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + + const WhileOptions* schema_params = op->builtin_options_as_WhileOptions(); + + if (schema_params != nullptr) { + params->cond_subgraph_index = schema_params->cond_subgraph_index(); + params->body_subgraph_index = schema_params->body_subgraph_index(); + } else { + // TODO(b/157480169): We should either return kTfLiteError or fill in some + // reasonable defaults in the params struct. We are not doing so until we + // better undertand the ramifications of changing the legacy behavior. + } + + *builtin_data = params.release(); + return kTfLiteOk; +} + // We have this parse function instead of directly returning kTfLiteOk from the // switch-case in ParseOpData because this function is used as part of the // selective registration for the OpResolver implementation in micro. 
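The flatbuffer_conversions changes above move LSTM parsing out of the large switch in `ParseOpDataTfLite` into a standalone `ParseLSTM`, and add `ParseBroadcastArgs`, `ParseBroadcastTo` and `ParseWhile` with the same signature; per the comments in the hunk, this supports selective registration in the micro OpResolver. The sketch below only illustrates that pattern and uses stand-in types instead of the real TFLite ones:

```cpp
#include <cstdio>

enum Status { kOk, kError };
struct Operator {};  // stand-in for the flatbuffer Operator type
using Parser = Status (*)(const Operator*, void** builtin_data);

static Status ParseLstmSketch(const Operator*, void** data)  { *data = nullptr; return kOk; }
static Status ParseWhileSketch(const Operator*, void** data) { *data = nullptr; return kOk; }

// A build that only needs a few ops can reference (and therefore link) just
// those parse functions; builtin codes match the enum added in builtin_ops.h.
struct Entry { int builtin_code; Parser parse; };
static const Entry kParsers[] = {
    {16  /* LSTM  */, ParseLstmSketch},
    {119 /* WHILE */, ParseWhileSketch},
};

int main() {
  Operator op;
  void* builtin_data = nullptr;
  for (const Entry& e : kParsers)
    std::printf("builtin %d -> %s\n", e.builtin_code,
                e.parse(&op, &builtin_data) == kOk ? "parsed" : "error");
  return 0;
}
```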
diff --git a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h index 8cf889d8..cd6637bc 100644 --- a/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/code/components/tflite-lib/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -98,6 +98,15 @@ TfLiteStatus ParseBatchToSpaceNd(const Operator* op, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseBroadcastArgs(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + +TfLiteStatus ParseBroadcastTo(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + TfLiteStatus ParseCallOnce(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); @@ -232,6 +241,9 @@ TfLiteStatus ParseLogSoftmax(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseLSTM(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseMaximum(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); @@ -379,6 +391,9 @@ TfLiteStatus ParseVarHandle(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseWhile(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseZerosLike(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h index 0671ce73..ab0c8f96 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/portable_tensor_utils.h @@ -60,9 +60,8 @@ void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch, // Cwise product of two vectors. template -inline void VectorVectorCwiseProduct(const T* __restrict__ vector1, - const T* __restrict__ vector2, int v_size, - T* __restrict__ result) { +inline void VectorVectorCwiseProduct(const T* vector1, const T* vector2, + int v_size, T* result) { for (int v = 0; v < v_size; v++) { *result++ = *vector1++ * *vector2++; } @@ -117,6 +116,367 @@ void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch, } } +// Checks if all entries of vector are zero for float. +bool IsZeroVector(const float* vector, int v_size); + +// Checks if all entries of vector are zero for int8. +bool IsZeroVector(const int8_t* vector, int v_size); + +// Quantizes a buffer of floating point values using a symmetric quantization +// (i.e. linear quantization without an offset) to 8-bit signed integers. +// It also outputs the range (min, max) of the floating point buffer, and the +// scaling factor used to quantize the values. +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor); + +// Quantizes a buffer of floating point values using a symmetric quantization +// (i.e. linear quantization without an offset) to 8-bit signed integers. 
+// It uses the range (min, max) provided to the function to calculate the +// appropriate scaling factor to quantize the values. +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor); + +void AsymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* scaling_factor, + int32_t* offset); + +// Helper function to quantize floats. +// float_data_ptr input float vectors +// n_batch number of input vectors +// n_data size of a single input vector +// quantized_data_ptr (out) vector with quantized data +// scaling_factors (out) scaling factors (one per vector) +// zero_points (out) zero points (one per vector) +// do_asymmetric controls if the quantization should be asymmetric. +inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch, + int n_data, int8_t* quantized_data_ptr, + float* scaling_factors, int32_t* zero_points, + bool do_asymmetric) { + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_data; + if (do_asymmetric) { + tensor_utils::AsymmetricQuantizeFloats( + float_data_ptr + offset, n_data, quantized_data_ptr + offset, + &scaling_factors[b], &zero_points[b]); + } else { + float unused_min, unused_max; + tensor_utils::SymmetricQuantizeFloats( + float_data_ptr + offset, n_data, quantized_data_ptr + offset, + &unused_min, &unused_max, &scaling_factors[b]); + } + } +} + +// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch +// dimension composed by input vectors independent from each other). The result +// of the multiplication is accumulated to the passed result buffer. +// More specifically, for a matrix M of shape [n, i] and a batched-vector +// of shape [i, batch] it will first compute the product of shape [n, batch]. +// This product will be accumulated to the result buffer. +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result); + +// Same as the function above, but the matrix is a sparse tensor with block +// pattern 1x4. +// This function assumes that m_cols is a multiple of the block size (4 in this +// case) so that there's no incomplete block. +void SparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result); + +// Same as the function above, but the matrix is stored in block compressed +// sparse row format with block pattern 1x16 which consists of two arrays: +// 1. A matrix array stores non-zero blocks of the matrix in row major. +// 2. A ledger array stores nrows groups, one group per row. Each group starts +// with an integer representing the number of non-zero blocks for the +// corresponding row and follows with column indexes of the first element +// of each non-zero block. +// This function assumes that +// 1. m_cols is a multiple of 16 so that all blocks are full blocks. +// 2. m_cols < 254 * 16 so that block index can be represented by uint8. +void SparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result); + +// Same as the function above, but for values quantized using symmetric +// quantization (e.g. by calling SymmetricQuantizeFloats). 
+// The passed scaling factors is a buffer of the quantization scaling factors +// that will be used to dequentize the products into the final result buffer. +// These scaling factors are the multiplication of the matrix scaling factor +// by the vector's scaling factor, one per batch (i.e. this allows quantizing +// each batch in the batch-vector matrix independently). +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result); + +// Same as the function above except that vector values +// are quantized with asymmetric quantization per-batch and the matrix +// is quantized per row. +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result, const float* __restrict__ per_channel_scale, + const int32_t* __restrict__ input_offset); + +// Same as the function above, but the matrix is a sparse tensor with block +// pattern 1x16. +// This function assumes that m_cols is a multiple of the block size (16 in this +// case) so that there's no incomplete block. Also, it assumes all offsets of +// input, output and filter are zero. +void SparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result); + +// Same as the function above, but the matrix is stored in block compressed +// sparse row format with block pattern 1x16 which consists of two arrays: +// 1. A matrix array stores non-zero blocks of the matrix in row major. +// 2. A ledger array stores nrows groups, one group per row. Each group starts +// with an integer representing the number of non-zero blocks for the +// corresponding row followed by column index of the first element of +// each non-zero block. +// This function assumes that +// 1. m_cols is a multiple of 16 so that all blocks are full blocks. +// 2. m_cols < 254 * 16 so that block index can be represented by uint8. +void SparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger, + const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result); + +// Same as the above 8, 8, 8 integer matmul except for the presence of zero +// point and non-accumulative. +// TODO(b/148688698): remove this function by folding zero point calculation in +// prepare() function. +void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp); + +// Same as above but has 16 bit and 8 bit input and 8 bit output. +// Used in projection when hidden is 16bit. 
+void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output); + +// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized +// vector. +// Parameters: +// - input: batch vector of size n_batch * n_input; 16 bit. +// - layer_norm_weights: the quantized layer normalization weights. +// - bias: the bias for the layer normalization. +// - layer_norm_scale_a: multiplier for scale factor. +// - layer_norm_scale_b: shift for scale factor. +// - variance_limit: the guard to make sure the inverse does not overflow. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, + const int32_t* bias, int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, int32_t variance_limit, + int n_batch, int n_input, int16_t* output); + +// Same as above but the internal calculation is done in float. +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output); + +// Apply Sigmoid to a quantized vector. +// Parameters: +// - input: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +// The input is in Q3.12 format and the output is in Q0.15 format. +void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output); + +// Same as above but the internal calcualtion is float. +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output); + +// Apply Tanh to a quantized vector. +// Parameters: +// - integer_bits: the integer bits of the input. +// Currently supports 0, 1, 2, 3, 4, 5, 6. +// - input: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 16 bit output +// The input is in Qm.15-m format and the output is in Q0.15 format. +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); + +// Apply Tanh to a quantized vector. Tbe internal calculation is in float. +// - Input has 2^(integer_bits) as scale. +// - Output has Q0.15 as scale. +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output); + +// Element-wise multiplication of two quantized vectors. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - shift: the shift needed to produce the output. +// - output: the 16 bit output of size n_batch * n_input. +// Output does not need to be initialized. +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int16_t* output); + +// Element-wise multiplication of two quantized vectors. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. 
+// - n_input: the size for input and output. +// - shift: the shift needed to produce the output. +// - output: the 8 bit output of size n_batch * n_input. +// Output does not need to be initialized. +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int8_t* output); + +// Element-wise multiplication of two quantized vectors with rescaling. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - multiplier: the multiplier part of scale. +// - shift: the shift part of scale. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 8 bit output of size n_batch * n_input. +// - output_zp: the zero point of output. +// Output does not need to be initialized. +// Multiplier ("m") and shift ("s") are connected to scale ("s") with s = m * +// 2^(s - 31). +void CwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output); + +// Element-wise saturating addition of two quantized vectors without rescaling. +// Parameters: +// - input_1: batch vector of size n_batch * n_input; 16 bit. +// - input_2: batch vector of size n_batch * n_input; 16 bit. +// - n_batch: the number of batches. +// - n_input: the size for input and output. +// - output: the 8 bit output of size n_batch * n_input. +// Output does not need to be initialized. +void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int16_t* output); + +// Element-wise in-place clipping of a vector. Overloaded for float, int16_t, +// int8_t. Parameters: +// - vector: vector of size v_size. +// - v_size: the size of the vector. +// - clipping_value: the value used for clipping. +void CwiseClipping(float* vector, const int v_size, const float clipping_value); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value); +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value); + +// Dot product of two vectors. +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size); + +// Dot product of two batch vectors of size n_batch * v_size: +// vector1 = [x_1_1, x_1_2, ..., x_1_vsize, +// x_2_1, x_2_2, ..., x_2_vsize, +// ... +// x_nbatch_1,..., x_nbatch_vsize] +// vector2 = [y_1_1, y_1_2, ..., y_1_vsize, +// y_2_1, y_2_2, ..., y_2_vsize, +// ... +// y_nbatch_1,..., y_nbatch_vsize] +// Then result will be a vector of n_batch size starting from 'result': +// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize, +// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize, +// ... +// x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize] +template +inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2, + int v_size, int n_batch, + T* result) { + for (int b = 0; b < n_batch; b++) { + result[b] = VectorVectorDotProduct(vector1, vector2, v_size); + vector1 += v_size; + vector2 += v_size; + } +} + +// Same as above but input is 16bit and output is 32bit. +void BatchVectorBatchVectorDotProduct(const int16_t* vector1, + const int16_t* vector2, int v_size, + int n_batch, int32_t* result); + +// Same as above, but inputs are 16bit integer and output is 16bit integer. 
+void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void Sub1Vector(const float* vector, int v_size, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG) for int16 input. +// "vector" has range [0, 32767] because it is the output of sigmoid function. +void Sub1Vector(const int16_t* vector, int v_size, int16_t* result); + +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// output_vector: float pointer to vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void ReductionSumVector(const float* input_vector, float* output_vector, + int output_size, int reduction_size); + +// Same as above but input/output is 32 bit integer. +void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + +// Same as above but input is 8 bit integer. +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + +// Layer norm for each batch. +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch); + +// Saturate Add with rescale on both inputs. +void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output); + } // namespace tensor_utils } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 5fe01da2..767ad6ab 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/kernels/internal/tensor_utils_common.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h new file mode 100644 index 00000000..d93c316d --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_args.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +template +void BroadcastArgs(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Gets data at the backward index i of the shape tensor. Returns 1 if the + // index is out of range. + auto get_shape_data = [](const RuntimeShape& shape, const T* data, + int backward_idx) -> T { + int forward_idx = shape.FlatSize() - 1 - backward_idx; + if (forward_idx < 0) return 1; + return data[forward_idx]; + }; + + int output_num_elements = output_shape.FlatSize(); + for (int i = 0; i < output_num_elements; ++i) { + int backward_i = output_num_elements - 1 - i; + int shape1_i = get_shape_data(input1_shape, input1_data, i); + int shape2_i = get_shape_data(input2_shape, input2_data, i); + if (shape1_i == 1) { + output_data[backward_i] = shape2_i; + } else if (shape2_i == 1) { + output_data[backward_i] = shape1_i; + } else { + TFLITE_CHECK_EQ(shape1_i, shape2_i); + output_data[backward_i] = shape1_i; + } + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_ARGS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h new file mode 100644 index 00000000..f106b2b5 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/broadcast_to.h @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace reference_ops { +template +void BroadcastImpl(const NdArrayDesc& input_desc, const char* input_data, + const NdArrayDesc& output_desc, char* output_data, + int indexes[N], int dim, const int last_broadcasting_dim, + const int type_size) { + // Copy data from input to output. 
+ if (dim == last_broadcasting_dim) { + int copy_size = output_desc.strides[dim] * type_size; + const char* data_src = + input_data + SubscriptToIndex(input_desc, indexes) * type_size; + char* data_dst = + output_data + SubscriptToIndex(output_desc, indexes) * type_size; + for (int i = 0; i < output_desc.extents[dim]; ++i, data_dst += copy_size) { + memcpy(data_dst, data_src, copy_size); + } + return; + } + + // Recursive call to find the next broadcasting. + for (indexes[dim] = 0; indexes[dim] < input_desc.extents[dim]; + ++indexes[dim]) { + BroadcastImpl(input_desc, input_data, output_desc, output_data, indexes, + dim + 1, last_broadcasting_dim, type_size); + } + + // Duplicate data in output tensor. + indexes[dim] = 0; + if (input_desc.extents[dim] != output_desc.extents[dim]) { + int copy_size = output_desc.strides[dim] * type_size; + char* data_src = + output_data + SubscriptToIndex(output_desc, indexes) * type_size; + char* data_dst = data_src + copy_size; + for (int i = 1; i < output_desc.extents[dim]; ++i, data_dst += copy_size) { + memcpy(data_dst, data_src, copy_size); + } + } +} + +template +inline void BroadcastTo(const RuntimeShape& unextended_input_shape, + const char* input_data, + const RuntimeShape& unextended_output_shape, + char* output_data, TfLiteType data_type) { + NdArrayDesc input_desc; + NdArrayDesc output_desc; + CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_input_shape), + &input_desc); + CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape), + &output_desc); + + // Get the last dimension has broadcasting. At this dimension, the data is + // copied from input tensor to output tensor. + int last_broadcast_dim = -1; + for (int i = N - 1; i >= 0; --i) { + if (input_desc.extents[i] != output_desc.extents[i]) { + last_broadcast_dim = i; + break; + } + } + + // If non-broadcasting, just copy data from input to output tensor. + if (last_broadcast_dim == -1) { + memcpy(output_data, input_data, + unextended_input_shape.FlatSize() * TfLiteTypeGetSize(data_type)); + return; + } + + // Broadcasting using memcpy. + int indexes[N] = {0}; + BroadcastImpl(input_desc, input_data, output_desc, output_data, indexes, 0, + last_broadcast_dim, TfLiteTypeGetSize(data_type)); +} +} // namespace reference_ops +} // namespace tflite +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BROADCAST_TO_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h index 5a6369d8..ac5f04f6 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/conv.h @@ -43,7 +43,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, (void)im2col_data; // only used in optimized code. (void)im2col_shape; // only used in optimized code. 
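As a stand-alone illustration of the broadcasting rule implemented by the BroadcastArgs reference kernel added above: shapes are aligned from their last dimension, a dimension of 1 stretches to match the other shape, and any other mismatch is an error. The sketch below is not part of the patch; it uses plain vectors instead of TFLite shape tensors, purely to show the rule.

#include <cstdio>
#include <vector>

// Plain-C++ mirror of the broadcast-shape rule (illustration only).
std::vector<int> BroadcastShapes(const std::vector<int>& a,
                                 const std::vector<int>& b) {
  const int out_rank = a.size() > b.size() ? a.size() : b.size();
  std::vector<int> out(out_rank, 1);
  for (int i = 0; i < out_rank; ++i) {  // i counts from the last dimension
    const int da = i < static_cast<int>(a.size()) ? a[a.size() - 1 - i] : 1;
    const int db = i < static_cast<int>(b.size()) ? b[b.size() - 1 - i] : 1;
    if (da == 1) {
      out[out_rank - 1 - i] = db;
    } else if (db == 1 || da == db) {
      out[out_rank - 1 - i] = da;
    } else {
      std::printf("shapes are not broadcastable\n");
      return {};
    }
  }
  return out;
}

int main() {
  // {8, 1, 3} and {4, 3} broadcast to {8, 4, 3}.
  for (int d : BroadcastShapes({8, 1, 3}, {4, 3})) std::printf("%d ", d);
  std::printf("\n");
  return 0;
}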
const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -52,14 +52,20 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; float total = 0.f; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -74,10 +80,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, if (!is_point_inside_image) { continue; } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - float input_value = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + float input_value = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; float filter_value = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; total += (input_value * filter_value); @@ -126,7 +133,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -135,6 +142,10 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -143,6 +154,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; int32_t acc = 0; for (int 
filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -158,9 +170,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; acc += @@ -206,7 +220,7 @@ inline void HybridConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -215,18 +229,24 @@ inline void HybridConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { const int in_x = in_x_origin + dilation_width_factor * filter_x; const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -235,7 +255,8 @@ inline void HybridConvPerChannel( if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { int32_t input_val = input_data[Offset( - input_shape, batch, in_y, in_x, in_channel)]; + input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h index 3a4164d3..3f869a3a 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h @@ -48,7 +48,7 @@ inline void ConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const 
int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -59,6 +59,10 @@ inline void ConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -67,6 +71,7 @@ inline void ConvPerChannel( for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; @@ -82,9 +87,11 @@ inline void ConvPerChannel( continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 32 bits accumulator. @@ -126,12 +133,13 @@ inline void ConvPerChannel( // Fixed-point per-channel-quantization convolution reference kernel. // 16-bit data and 8-bit filter +template inline void ConvPerChannel( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int16_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, - const std::int64_t* bias_data, const RuntimeShape& output_shape, + const AccumScalar* bias_data, const RuntimeShape& output_shape, int16_t* output_data) { // Get parameters. 
const int stride_width = params.stride_width; @@ -151,7 +159,7 @@ inline void ConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int input_depth = input_shape.Dims(3); const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); @@ -162,6 +170,10 @@ inline void ConvPerChannel( const int input_width = input_shape.Dims(2); const int filter_height = filter_shape.Dims(1); const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; const int output_height = output_shape.Dims(1); const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { @@ -170,7 +182,8 @@ inline void ConvPerChannel( for (int out_x = 0; out_x < output_width; ++out_x) { const int in_x_origin = (out_x * stride_width) - pad_width; for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - std::int64_t acc = 0; + auto group = out_channel / filters_per_group; + AccumScalar acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { const int in_y = in_y_origin + dilation_height_factor * filter_y; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -185,9 +198,11 @@ inline void ConvPerChannel( continue; } - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - int32_t input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; int32_t filter_val = filter_data[Offset( filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 64 bits accumulator. 
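The convolution hunks above add grouped-convolution support: when the filter's innermost dimension (filter_shape.Dims(3)) is smaller than the input depth, the input channels are split into groups = input_depth / filter_input_depth, and each output channel only reads the contiguous slice of input channels belonging to its group (the old behaviour is the special case groups == 1). A minimal stand-alone sketch of that index math, with hypothetical sizes that are not taken from the patch:

#include <cstdio>

int main() {
  // Hypothetical sizes, chosen only to illustrate the indexing used above.
  const int input_depth = 8;         // channels of the input tensor
  const int filter_input_depth = 4;  // filter_shape.Dims(3)
  const int output_depth = 6;        // number of filters / output channels

  const int groups = input_depth / filter_input_depth;  // 2
  const int filters_per_group = output_depth / groups;  // 3

  for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
    const int group = out_channel / filters_per_group;
    const int first_in_channel = group * filter_input_depth;
    std::printf("out_channel %d -> group %d reads input channels [%d, %d)\n",
                out_channel, group, first_in_channel,
                first_in_channel + filter_input_depth);
  }
  return 0;
}

With these sizes, output channels 0..2 read input channels [0, 4) and output channels 3..5 read [4, 8), which is exactly the "in_channel + group * filter_input_depth" offset used in the kernels above.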
diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index 1a469fa9..42920d16 100644
--- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -34,12 +34,13 @@ inline void FullyConnected(
   const int32_t output_activation_min = params.quantized_activation_min;
   const int32_t output_activation_max = params.quantized_activation_max;
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
   const int filter_dim_count = filter_shape.DimensionsCount();
-  const int batches = output_shape.Dims(0);
-  const int output_depth = output_shape.Dims(1);
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = output_shape.Dims(output_dim_count - 1);
   TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
   for (int b = 0; b < batches; ++b) {
@@ -62,11 +63,12 @@ inline void FullyConnected(
   }
 }
+template <typename AccumScalar>
 inline void FullyConnected(
     const FullyConnectedParams& params, const RuntimeShape& input_shape,
     const int16_t* input_data, const RuntimeShape& filter_shape,
     const int8_t* filter_data, const RuntimeShape& bias_shape,
-    const int64_t* bias_data, const RuntimeShape& output_shape,
+    const AccumScalar* bias_data, const RuntimeShape& output_shape,
     int16_t* output_data) {
   const int32_t filter_offset = params.weights_offset;
   const int32_t output_multiplier = params.output_multiplier;
@@ -85,7 +87,7 @@ inline void FullyConnected(
   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
   for (int b = 0; b < batches; ++b) {
     for (int out_c = 0; out_c < output_depth; ++out_c) {
-      int64_t acc = 0;
+      AccumScalar acc = 0;
       for (int d = 0; d < accum_depth; ++d) {
         int32_t input_val = input_data[b * accum_depth + d];
         int32_t filter_val = filter_data[out_c * accum_depth + d];
diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
index 284c0f21..3397f869 100644
--- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
+++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h
@@ -119,15 +119,16 @@ inline void TransposeConv(
   }
 }
-// int16_t input (zero_point=0), int8_t filter, int64 accumulator
+// int16_t input (zero_point=0), int8_t filter, int32 or int64 accumulator
+template <typename Scalar>
 inline void TransposeConv(
     const ConvParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const RuntimeShape& input_shape,
     const int16_t* input_data, const RuntimeShape& filter_shape,
     const int8_t* filter_data, const RuntimeShape& bias_shape,
-    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    const Scalar* bias_data, const RuntimeShape& output_shape,
     int16_t* output_data, const RuntimeShape& im2col_shape,
     int8_t* im2col_data,
-    std::int64_t* scratch_buffer) {
+    Scalar* scratch_buffer) {
   const int
stride_width = params.stride_width; const int stride_height = params.stride_height; const int pad_width = params.padding_values.width; @@ -157,7 +158,7 @@ inline void TransposeConv( const int num_elements = output_shape.FlatSize(); // We need to initialize scratch_buffer to all 0s, as we apply the same // 'scatter' based trick as in float version. - memset(scratch_buffer, 0, num_elements * sizeof(std::int64_t)); + memset(scratch_buffer, 0, num_elements * sizeof(Scalar)); // Loop through input elements one at a time. for (int batch = 0; batch < batches; ++batch) { @@ -198,8 +199,8 @@ inline void TransposeConv( for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - std::int64_t acc = scratch_buffer[Offset(output_shape, batch, out_y, - out_x, out_channel)]; + Scalar acc = scratch_buffer[Offset(output_shape, batch, out_y, out_x, + out_channel)]; if (bias_data) { acc += bias_data[out_channel]; } diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h new file mode 100644 index 00000000..17b113eb --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/lstm_cell.h @@ -0,0 +1,422 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ + +#include +#include +#include + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/concatenation.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const float* input_data, const RuntimeShape& unextended_prev_activ_shape, + const float* prev_activ_data, const RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& unextended_bias_shape, + const float* bias_data, const RuntimeShape& unextended_prev_state_shape, + const float* prev_state_data, + const RuntimeShape& unextended_output_state_shape, float* output_state_data, + const RuntimeShape& unextended_output_activ_shape, float* output_activ_data, + const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data, + const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + + const int weights_dim_count = weights_shape.DimensionsCount(); + const int batches = + MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0, + output_state_shape, 0, output_activ_shape, 0); + const int height = + MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1, + output_state_shape, 1, output_activ_shape, 1); + const int width = + MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2, + output_state_shape, 2, output_activ_shape, 2); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); + const int total_input_depth = prev_activ_depth + input_depth; + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + const int intern_activ_depth = + 
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + + // Concatenate prev_activ and input data together + float const* concat_input_arrays_data[2] = {input_data, prev_activ_data}; + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, concat_temp_data); + + // Fully connected + tflite::FullyConnectedParams fc_params; + fc_params.float_activation_min = std::numeric_limits::lowest(); + fc_params.float_activation_max = std::numeric_limits::max(); + FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, + weights_data, bias_shape, bias_data, activ_temp_shape, + activ_temp_data); + + // Memory state update (the LSTM "guts") + for (int b = 0; b < batches; ++b) { + for (int w = 0; w < width; ++w) { + for (int h = 0; h < height; ++h) { + for (int c = 0; c < output_depth; ++c) { + const float input_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 0 * output_depth + c)])); + const float new_input = std::tanh(activ_temp_data[Offset( + activ_temp_shape, b, h, w, 1 * output_depth + c)]); + const float forget_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 2 * output_depth + c)])); + const float output_gate = + 1.f / + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 3 * output_depth + c)])); + const float new_state = + input_gate * new_input + + forget_gate * + prev_state_data[Offset(prev_state_shape, b, h, w, c)]; + output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state; + output_activ_data[Offset(output_activ_shape, b, h, w, c)] = + output_gate * std::tanh(new_state); + } + } + } + } +} + +// Quantized LSTM cell implementation. +// The quantization of the input, output arrays is as follows: +// - The input activations are quantized as uint8 on the interval +// [-1, 127/128]. +// The rationale for that is that is the natural interval for output +// activations (see next point) and these need to be concatenated together. +// We could accommodate different ranges by re-scaling, but we empirically +// found that setting the input activations range to be [-1, 127/128] in the +// first place, removing the need for re-scaling, greatly improves accuracy. +// - The output activations are quantized as uint8 on the interval +// [-1, 127/128]. +// The rationale for that is that the definition of a LSTM cell makes them +// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128] +// makes for simpler, more accurate fixed-point arithmetic. +// - The output-at-previous-timestep state array is obviously quantized as +// the output activations. +// - The internal LSTM memory (not the output-at-previous-timestep, the other +// internal state array) is int16-quantized and may use any power-of-two, +// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call +// StateIntegerBits below, see the below discussion of that template +// parameter ("The StateIntegerBits template parameter"). 
+// - The output of the internal fully-connected node is int16-quantized
+//   on the interval [-8, 8 * 32767/32768], the rationale for which is
+//   explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+//   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
+//   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
+//   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that this is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter step in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+inline void LstmCell(const LstmCellParams& params,
+                     const RuntimeShape& unextended_input_shape,
+                     const uint8_t* input_data_uint8,
+                     const RuntimeShape& unextended_prev_activ_shape,
+                     const uint8_t* prev_activ_data_uint8,
+                     const RuntimeShape& weights_shape,
+                     const uint8_t* weights_data_uint8,
+                     const RuntimeShape& unextended_bias_shape,
+                     const int32_t* bias_data_int32,
+                     const RuntimeShape& unextended_prev_state_shape,
+                     const int16_t* prev_state_data_int16,
+                     const RuntimeShape& unextended_output_state_shape,
+                     int16_t* output_state_data_int16,
+                     const RuntimeShape& unextended_output_activ_shape,
+                     uint8_t* output_activ_data_uint8,
+                     const RuntimeShape& unextended_concat_temp_shape,
+                     uint8_t* concat_temp_data_uint8,
+                     const RuntimeShape& unextended_activ_temp_shape,
+                     int16_t* activ_temp_data_int16, void* gemmlowp_context) {
+  (void)gemmlowp_context;  // only used in optimized code.
+  int32_t weights_zero_point = params.weights_zero_point;
+  int32_t accum_multiplier = params.accum_multiplier;
+  int accum_shift = params.accum_shift;
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape prev_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+  const RuntimeShape bias_shape =
+      RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+  const RuntimeShape prev_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+  const RuntimeShape output_state_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+  const RuntimeShape output_activ_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+  const RuntimeShape concat_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+  const RuntimeShape activ_temp_shape =
+      RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+  // Gather dimensions information, and perform consistency checks.
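As a quick numeric check of the [-8, 8] discussion in the long comment above: assuming gemmlowp's FixedPoint<int16_t, kIntegerBits> maps the signed 16-bit raw range onto roughly [-2^kIntegerBits, 2^kIntegerBits), the spacing between representable values is 2^kIntegerBits / 2^15, which reproduces the 2.4e-4 and 4.9e-4 increments quoted there. A stand-alone sketch (not part of the patch):

#include <cstdio>

// Spacing between representable values for a 16-bit fixed-point format with
// the given number of integer bits (assumption: range is [-2^bits, 2^bits)).
constexpr double Resolution(int integer_bits) {
  return (1 << integer_bits) / 32768.0;  // 2^integer_bits / 2^15
}

int main() {
  std::printf("3 integer bits, range [-8, 8)  : %.1e\n", Resolution(3));  // ~2.4e-4
  std::printf("4 integer bits, range [-16, 16): %.1e\n", Resolution(4));  // ~4.9e-4
  return 0;
}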
+ const int weights_dim_count = weights_shape.DimensionsCount(); + const int outer_size = MatchingFlatSizeSkipDim( + input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape, + output_activ_shape); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); + const int total_input_depth = prev_activ_depth + input_depth; + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + const int intern_activ_depth = + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3); + const int fc_output_depth = + MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3); + const int fc_accum_depth = total_input_depth; + TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); + + // Depth-concatenate prev_activ and input data together. + uint8_t const* concat_input_arrays_data[2] = {input_data_uint8, + prev_activ_data_uint8}; + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, + concat_temp_data_uint8); + + // Implementation of the fully connected node inside the LSTM cell. + // The operands are 8-bit integers, the accumulators are internally 32bit + // integers, and the output is 16-bit fixed-point with 3 integer bits so + // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that + // is explained in the function comment above. + for (int b = 0; b < fc_batches; ++b) { + for (int out_c = 0; out_c < fc_output_depth; ++out_c) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32_t accum = bias_data_int32[out_c]; + // Accumulation loop. + for (int d = 0; d < fc_accum_depth; ++d) { + int16_t input_val = + concat_temp_data_uint8[b * fc_accum_depth + d] - 128; + int16_t weights_val = + weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point; + accum += input_val * weights_val; + } + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, using 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + accum = + MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift); + // Saturate, cast to int16, and store to the temporary activations array. + accum = std::max(-32768, std::min(32767, accum)); + activ_temp_data_int16[out_c + fc_output_depth * b] = accum; + } + } + + // Rest of the LSTM cell: tanh and logistic math functions, and some adds + // and muls, all done in 16-bit fixed-point. + for (int b = 0; b < outer_size; ++b) { + for (int c = 0; c < output_depth; ++c) { + // Define the fixed-point data types that we will use here. All use + // int16 as the underlying integer type i.e. all are 16-bit fixed-point. + // They only differ by the number of integral vs. fractional bits, + // determining the range of values that they can represent. 
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal Tanh node, still in fixed-point.
+      // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+      // code size, and we already used above a Tanh on an input with 3 integer
+      // bits, and per the table in the above function comment there is no
+      // significant accuracy to be lost by clamping to [-8, +8] for a
+      // 3-integer-bits representation, let us just do that. This helps people
+      // porting this to targets where code footprint must be minimized.
+      F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+      // Store the new internal state back to memory, as 16-bit integers.
+      // Note: here we store the original value with StateIntegerBits, not
+      // the rescaled 3-integer-bits value fed to tanh.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+ int16_t rescaled_output_activ = + gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8); + int16_t clamped_output_activ = std::max( + -128, std::min(127, rescaled_output_activ)); + output_activ_data_uint8[b * output_depth + c] = + 128 + clamped_output_activ; + } + } +} + +} // namespace reference_ops +} // namespace tflite +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LSTM_CELL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 4cc51cb4..4684be64 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -227,6 +227,41 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( } } +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result) { + const int kBlockSize = 16; + TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0); + for (int batch = 0; batch < n_batch; ++batch) { + const int8_t* matrix_ptr = matrix; + for (int row = 0; row < m_rows; ++row) { + int32_t dot_prod = 0; + const int8_t* vector_in_batch = vector + batch * m_cols; + for (int i = segments[row]; i < segments[row + 1]; ++i) { + const int block_start_index = indices[i] * kBlockSize; + const int8_t* vector_block_in_batch_ptr = + vector_in_batch + block_start_index; + for (int c = 0; c < kBlockSize; c++) { + dot_prod += *matrix_ptr * *vector_block_in_batch_ptr++; + dot_prod += *matrix_ptr++ * input_offset; + } + } + const int32_t bias_value = bias_vector != nullptr ? bias_vector[row] : 0; + dot_prod = MultiplyByQuantizedMultiplier(dot_prod + bias_value, + output_multiplier, output_shift); + dot_prod += output_offset; + result[batch * m_rows + row] = + static_cast(ActivationFunctionWithMinMax( + dot_prod, output_activation_min, output_activation_max)); + } + } +} + void PortableSparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h new file mode 100644 index 00000000..0416db09 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -0,0 +1,333 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ + +#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h" + +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif + +namespace tflite { +namespace tensor_utils { + +// Check if all entries of a vector are zero for float. +bool IsZeroVector(const float* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +// Check if all entries of a vector are zero for int8_t. +bool IsZeroVector(const int8_t* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* min, float* max, + float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max, + scaling_factor); +} + +void SymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float min_value, + float max_value, float* scaling_factor) { + PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value, + max_value, scaling_factor); +} + +void AsymmetricQuantizeFloats(const float* values, const int size, + int8_t* quantized_values, float* scaling_factor, + int32_t* offset) { + PortableAsymmetricQuantizeFloats(values, size, quantized_values, + scaling_factor, offset); +} + +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + n_batch, result); +} + +void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t* __restrict__ vector, + const float* scaling_factors, + int n_batch, + float* __restrict__ result) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + scaling_factors, n_batch, result); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, scratch, row_sums, compute_row_sums, + context); +} + +void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t* __restrict__ vector, + const float* scaling_factors, + int n_batch, int32_t* scratch, + float* __restrict__ result, + CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, + scaling_factors, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate1x4( + const float* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const float* __restrict__ vector, int n_batch, float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( + matrix, segments, 
indices, m_rows, m_cols, vector, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate( + const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, + int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, + float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vector, n_batch, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + + int8_t* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + matrix, segments, indices, m_rows, m_cols, vector, bias_vector, n_batch, + input_offset, output_multiplier, output_shift, output_offset, + output_activation_min, output_activation_max, result); +} + +void SparseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, + const int m_cols, const int8_t* __restrict__ vectors, + const float* scaling_factors, int n_batch, float* __restrict__ result) { + PortableSparseMatrixBatchVectorMultiplyAccumulate( + matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, + result); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int16_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* input, const int32_t* bias, + const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, + int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp, + int32_t* scratch, int8_t* output, CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input, + n_output, output_zp, scratch, output, context); +} + +void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, + int32_t n_row, int32_t n_col, + int32_t* output) { + PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output); +} + +void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp) { + PortableMatrixBatchVectorMultiply( + input, input_zeropoint, input_to_gate_weights, + input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch, + n_input, n_cell, gate_output, gate_output_zp); +} + +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + 
PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + +void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, + const int32_t* bias, int32_t layer_norm_scale_a, + int32_t layer_norm_scale_b, int32_t variance_limit, + int n_batch, int n_input, int16_t* output) { + PortableApplyLayerNorm(input, layer_norm_weights, bias, layer_norm_scale_a, + layer_norm_scale_b, variance_limit, n_batch, n_input, + output); +} + +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output) { + PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a, + layer_norm_scale_b, bias, n_batch, n_input, + output); +} + +void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoid(input, n_batch, n_input, output); +} + +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoidFloat(input, n_batch, n_input, output); +} + +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + PortableApplyTanh(integer_bits, input, n_batch, n_input, output); +} + +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output) { + PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output); +} + +void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int shift, int16_t* output) { + PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output); +} + +void CwiseMul(const int16_t* input_1, const int16_t* input_2, + int32_t multiplier, int32_t shift, int32_t n_batch, + int32_t n_input, int32_t output_zp, int8_t* output) { + PortableCwiseMul(input_1, input_2, multiplier, shift, n_batch, n_input, + output_zp, output); +} + +void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, + int n_input, int16_t* output) { + PortableCwiseAdd(input_1, input_2, n_batch, n_input, output); +} + +void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size) { + return PortableVectorVectorDotProduct(vector1, vector2, v_size); +} + +void BatchVectorBatchVectorDotProduct(const int16_t* vector1, + const int16_t* vector2, int v_size, + int n_batch, int32_t* result) { + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result); +} + +void Sub1Vector(const float* vector, int v_size, float* result) { + PortableSub1Vector(vector, v_size, result); +} + +void 
Sub1Vector(const int16_t* vector, int v_size, int16_t* result) { + PortableSub1Vector(vector, v_size, result); +} + +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result) { + PortableVectorScalarMultiply(vector, v_size, scale, result); +} + +void ReductionSumVector(const float* input_vector, float* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch) { + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output) { + PortableTwoGateSaturatingAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + +} // namespace tensor_utils +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 1e411e16..6c404d5e 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -87,6 +87,15 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate( int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, float* __restrict__ result); +void PortableSparseMatrixBatchVectorMultiplyAccumulate1x16( + const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments, + const int32_t* __restrict__ indices, int m_rows, int m_cols, + const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector, + int n_batch, const int32_t input_offset, const int32_t output_multiplier, + const int32_t output_shift, const int32_t output_offset, + const int32_t output_activation_min, const int32_t output_activation_max, + int8_t* __restrict__ result); + void PortableSparseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h index 3fa43ce9..d0ebc95a 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/internal/reference/sub.h @@ -273,6 +273,9 @@ void 
BroadcastQuantSubSlow(const ArithmeticParams& params, const T* input2_data, const RuntimeShape& output_shape, T* output_data) { ruy::profiler::ScopeLabel label("BroadcastQuantSubSlow/T"); + TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N); + TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N); + TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N); NdArrayDesc desc1; NdArrayDesc desc2; NdArrayDesc output_desc; diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc index 75529296..10b37ed3 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc +++ b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/context_util.h" #include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -466,10 +467,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, const int d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1); const int d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1); if (!(d1 == d2 || d1 == 1 || d2 == 1)) { - context->ReportError(context, - "Given shapes, %s and %s, are not broadcastable.", - GetShapeDebugString(input1->dims).c_str(), - GetShapeDebugString(input2->dims).c_str()); + TF_LITE_KERNEL_LOG(context, + "Given shapes, %s and %s, are not broadcastable.", + GetShapeDebugString(input1->dims).c_str(), + GetShapeDebugString(input2->dims).c_str()); return kTfLiteError; } @@ -504,11 +505,11 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, if (min_value == 0) max_value = 0; if (!(d1 == 1 || d1 == max_value) || !(d2 == 1 || d2 == max_value) || !(d3 == 1 || d3 == max_value)) { - context->ReportError( - context, "Given shapes, %s, %s and %s, are not broadcastable.", - GetShapeDebugString(input1->dims).c_str(), - GetShapeDebugString(input2->dims).c_str(), - GetShapeDebugString(input3->dims).c_str()); + TF_LITE_KERNEL_LOG(context, + "Given shapes, %s, %s and %s, are not broadcastable.", + GetShapeDebugString(input1->dims).c_str(), + GetShapeDebugString(input2->dims).c_str(), + GetShapeDebugString(input3->dims).c_str()); return kTfLiteError; } shape->data[out_dims - i - 1] = max_value; @@ -529,6 +530,9 @@ int TfLiteTypeGetSize(TfLiteType type) { return 1; case kTfLiteBool: return sizeof(bool); + case kTfLiteUInt16: + static_assert(sizeof(uint16_t) == 2, ""); + return 2; case kTfLiteInt16: static_assert(sizeof(int16_t) == 2, ""); return 2; @@ -575,4 +579,15 @@ bool IsMobilePlatform() { return false; } +bool HasUnspecifiedDimension(const TfLiteTensor* tensor) { +#ifndef TF_LITE_STATIC_MEMORY + if (tensor->dims_signature) { + for (int i : TfLiteIntArrayView(tensor->dims_signature)) { + if (i == -1) return true; + } + } +#endif // TF_LITE_STATIC_MEMORY + return false; +} + } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h index d082e7b0..22689436 100644 --- a/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h +++ b/code/components/tflite-lib/tensorflow/lite/kernels/kernel_util.h @@ -314,6 +314,9 @@ int TfLiteTypeGetSize(TfLiteType type); // Whether the current platform is mobile (Android or iOS). 
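Reviewer note on the kernel_util.cc hunk above: the new HasUnspecifiedDimension() reports whether a tensor's dims_signature still contains a -1, i.e. a dynamic dimension. A minimal usage sketch follows; the helper name EnsureStaticShape is editorial and not part of the patch, only HasUnspecifiedDimension() and the TF_LITE_ENSURE macro come from the sources above.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

// Hypothetical guard: fail early when a tensor still has a dynamic (-1)
// dimension in its dims_signature.
TfLiteStatus EnsureStaticShape(TfLiteContext* context,
                               const TfLiteTensor* tensor) {
  TF_LITE_ENSURE(context, !tflite::HasUnspecifiedDimension(tensor));
  return kTfLiteOk;
}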
bool IsMobilePlatform(); +// Returns whether there is unspecified dimension in the tensor's dim signature. +bool HasUnspecifiedDimension(const TfLiteTensor* tensor); + } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc b/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc index 8777cd28..6fa1b31b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/all_ops_resolver.cc @@ -29,8 +29,12 @@ AllOpsResolver::AllOpsResolver() { AddAssignVariable(); AddAveragePool2D(); AddBatchToSpaceNd(); + AddBroadcastArgs(); + AddBroadcastTo(); AddCallOnce(); + AddCast(); AddCeil(); + AddCircularBuffer(); AddConcatenation(); AddConv2D(); AddCos(); @@ -49,9 +53,12 @@ AllOpsResolver::AllOpsResolver() { AddFloorDiv(); AddFloorMod(); AddFullyConnected(); + AddGather(); + AddGatherNd(); AddGreater(); AddGreaterEqual(); AddHardSwish(); + AddIf(); AddL2Normalization(); AddL2Pool2D(); AddLeakyRelu(); @@ -66,6 +73,7 @@ AllOpsResolver::AllOpsResolver() { AddMaximum(); AddMean(); AddMinimum(); + AddMirrorPad(); AddMul(); AddNeg(); AddNotEqual(); @@ -85,6 +93,7 @@ AllOpsResolver::AllOpsResolver() { AddRsqrt(); AddShape(); AddSin(); + AddSlice(); AddSoftmax(); AddSpaceToBatchNd(); AddSpaceToDepth(); @@ -101,6 +110,8 @@ AllOpsResolver::AllOpsResolver() { AddTransposeConv(); AddUnpack(); AddVarHandle(); + AddWhile(); + AddZerosLike(); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc new file mode 100644 index 00000000..5a5ba9ab --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.cc @@ -0,0 +1,107 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/fake_micro_context.h" + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_arena_constants.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" +#include "tensorflow/lite/micro/simple_memory_allocator.h" + +namespace tflite { +namespace { +// Dummy static variables to allow creation of dummy MicroAllocator. +// All tests are guarateed to run serially. 
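Reviewer note on the all_ops_resolver.cc hunk above: it registers a batch of newly supported ops (BROADCAST_ARGS, BROADCAST_TO, CAST, CIRCULAR_BUFFER, GATHER, GATHER_ND, IF, MIRROR_PAD, SLICE, WHILE, ZEROS_LIKE). AllOpsResolver pulls every kernel into the binary; the sketch below shows the usual size-conscious alternative, assuming the matching Add* helpers exist on MicroMutableOpResolver as they do on AllOpsResolver. The op selection and resolver size are illustrative only.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

// Register only the ops a given model actually uses.
TfLiteStatus RegisterSelectedOps(tflite::MicroMutableOpResolver<4>& resolver) {
  TF_LITE_ENSURE_STATUS(resolver.AddConv2D());
  TF_LITE_ENSURE_STATUS(resolver.AddFullyConnected());
  TF_LITE_ENSURE_STATUS(resolver.AddBroadcastTo());  // newly available above
  TF_LITE_ENSURE_STATUS(resolver.AddGather());       // newly available above
  return kTfLiteOk;
}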
+static constexpr int KDummyTensorArenaSize = 256; +static uint8_t dummy_tensor_arena[KDummyTensorArenaSize]; +} // namespace + +FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, + SimpleMemoryAllocator* allocator, + MicroGraph* micro_graph) + : MicroContext( + MicroAllocator::Create(dummy_tensor_arena, KDummyTensorArenaSize, + GetMicroErrorReporter()), + nullptr, micro_graph), + tensors_(tensors), + allocator_(allocator) {} + +TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { + allocated_tensor_count_++; + return &tensors_[tensor_index]; +} + +void FakeMicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { + allocated_tensor_count_--; +} + +bool FakeMicroContext::IsAllTempTfLiteTensorDeallocated() { + return !allocated_tensor_count_; +} + +TfLiteEvalTensor* FakeMicroContext::GetEvalTensor(int tensor_index) { + TfLiteEvalTensor* eval_tensor = + reinterpret_cast(allocator_->AllocateTemp( + sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor))); + TFLITE_DCHECK(eval_tensor != nullptr); + + // In unit tests, the TfLiteTensor pointer contains the source of truth for + // buffers and values: + eval_tensor->data = tensors_[tensor_index].data; + eval_tensor->dims = tensors_[tensor_index].dims; + eval_tensor->type = tensors_[tensor_index].type; + return eval_tensor; +} + +void* FakeMicroContext::AllocatePersistentBuffer(size_t bytes) { + // FakeMicroContext use SimpleMemoryAllocator, which does not automatically + // apply the buffer alignment like MicroAllocator. + // The buffer alignment is potentially wasteful but allows the + // fake_micro_context to work correctly with optimized kernels. + return allocator_->AllocatePersistentBuffer(bytes, + MicroArenaBufferAlignment()); +} + +TfLiteStatus FakeMicroContext::RequestScratchBufferInArena(size_t bytes, + int* buffer_index) { + TFLITE_DCHECK(buffer_index != nullptr); + + if (scratch_buffer_count_ == kNumScratchBuffers_) { + MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).", + kNumScratchBuffers_); + return kTfLiteError; + } + + // For tests, we allocate scratch buffers from the tail and keep them around + // for the lifetime of model. This means that the arena size in the tests will + // be more than what we would have if the scratch buffers could share memory. + scratch_buffers_[scratch_buffer_count_] = + allocator_->AllocatePersistentBuffer(bytes, MicroArenaBufferAlignment()); + TFLITE_DCHECK(scratch_buffers_[scratch_buffer_count_] != nullptr); + + *buffer_index = scratch_buffer_count_++; + return kTfLiteOk; +} + +void* FakeMicroContext::GetScratchBuffer(int buffer_index) { + TFLITE_DCHECK(scratch_buffer_count_ <= kNumScratchBuffers_); + if (buffer_index >= scratch_buffer_count_) { + return nullptr; + } + return scratch_buffers_[buffer_index]; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h new file mode 100644 index 00000000..99933c19 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/fake_micro_context.h @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ +#define TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ + +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_graph.h" + +namespace tflite { +// A fake of MicroContext for kernel util tests. +class FakeMicroContext : public MicroContext { + public: + FakeMicroContext(TfLiteTensor* tensors, SimpleMemoryAllocator* allocator, + MicroGraph* micro_graph); + + void* AllocatePersistentBuffer(size_t bytes) override; + TfLiteStatus RequestScratchBufferInArena(size_t bytes, + int* buffer_index) override; + void* GetScratchBuffer(int buffer_index) override; + + TfLiteTensor* AllocateTempTfLiteTensor(int tensor_index) override; + void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) override; + bool IsAllTempTfLiteTensorDeallocated(); + + TfLiteEvalTensor* GetEvalTensor(int tensor_index) override; + + private: + static constexpr int kNumScratchBuffers_ = 12; + + int scratch_buffer_count_ = 0; + uint8_t* scratch_buffers_[kNumScratchBuffers_]; + + TfLiteTensor* tensors_; + int allocated_tensor_count_ = 0; + + SimpleMemoryAllocator* allocator_; + + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_FAKE_MICRO_CONTEXT_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/ibuffer_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/ibuffer_allocator.h new file mode 100644 index 00000000..3767cb9f --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/ibuffer_allocator.h @@ -0,0 +1,100 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_ + +#include +#include + +#include "tensorflow/lite/c/c_api_types.h" + +namespace tflite { +// Interface classes that the TFLM framework relies on to get buffers it needs. +// There are two types of buffers that the TFLM framework requires: persistent +// and non-persistent. Persistent buffers, once allocated, are never freed by +// the TFLM framework. Non-persist buffers can be allocated and deallocated by +// the TFLM framework. This file defines two interfaces classes that TFLM +// framework will rely on to manage these buffers. + +// Interface class for managing persistent buffers. 
+class IPersistentBufferAllocator {
+ public:
+  IPersistentBufferAllocator() {}
+  virtual ~IPersistentBufferAllocator() {}
+
+  // Allocates persistent memory. The persistent buffer is never freed.
+  virtual uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) = 0;
+
+  // Returns the size of all persistent allocations in bytes.
+  virtual size_t GetPersistentUsedBytes() const = 0;
+};
+
+// Interface class for managing non-persistent buffers.
+// The default non-persistent buffers are temp buffers that are not resizable.
+// Support of at least one resizable buffer is required.
+class INonPersistentBufferAllocator {
+ public:
+  INonPersistentBufferAllocator() {}
+  virtual ~INonPersistentBufferAllocator() {}
+
+  // Allocates a temporary buffer. This buffer is not resizable.
+  virtual uint8_t* AllocateTemp(size_t size, size_t alignment) = 0;
+
+  // Signals that a temporary buffer is no longer needed.
+  virtual void DeallocateTemp(uint8_t* buf) = 0;
+
+  // Returns true if all temporary buffers are already deallocated.
+  virtual bool IsAllTempDeallocated() = 0;
+
+  // Signals that all temporary allocations can be reclaimed. TFLM calls this
+  // API when it knows that all temporary buffers that it requested have been
+  // deallocated. The goal of this API is to let implementations of
+  // INonPersistentBufferAllocator reuse buffers with reasonable complexity.
+  virtual TfLiteStatus ResetTempAllocations() = 0;
+
+  // Returns a buffer that is resizable via ResizeBuffer().
+  virtual uint8_t* AllocateResizableBuffer(size_t size, size_t alignment) = 0;
+
+  // Resizes a buffer previously returned by AllocateResizableBuffer().
+  virtual TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size,
+                                    size_t alignment) = 0;
+
+  // Frees up the memory occupied by the resizable buffer.
+  virtual TfLiteStatus DeallocateResizableBuffer(uint8_t* resizable_buf) = 0;
+
+  // Returns a pointer to the start of the overlay memory, which is used for
+  // activation tensors and scratch buffers by kernels at Invoke stage.
+  virtual uint8_t* GetOverlayMemoryAddress() const = 0;
+
+  // Reserves the size of the overlay memory. The overlay is reserved for the
+  // kernels at Invoke stage. It is referred to as the overlay because, before
+  // the Invoke stage, the same memory can be used for temp buffers. The layout
+  // of the memory is planned by the memory planner separately at Invoke stage.
+  virtual TfLiteStatus ReserveNonPersistentOverlayMemory(size_t size,
+                                                         size_t alignment) = 0;
+
+  // Returns the size of the non-persistent buffers in use.
+  virtual size_t GetNonPersistentUsedBytes() const = 0;
+
+  // Returns the number of bytes available with a given alignment. This number
+  // takes into account any temporary allocations.
+ virtual size_t GetAvailableMemory(size_t alignment) const = 0; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_IBUFFER_ALLOCATOR_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc index 90afe832..2ec3a1bf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/activations_common.cc @@ -117,15 +117,21 @@ TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); ReluOpData* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kActivationsInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kActivationsOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kActivationsOutputTensor); TF_LITE_ENSURE(context, output != nullptr); if (input->type == kTfLiteInt8) { CalculateReluOpData(input, output, data); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -133,7 +139,9 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); Relu6OpData* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kActivationsInputTensor); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kActivationsInputTensor); TF_LITE_ENSURE(context, input != nullptr); if (input->type == kTfLiteInt8) { @@ -142,6 +150,8 @@ TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) { data->zero_int8 = input->params.zero_point; } + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc index 3d0c841e..b285b800 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_common.cc @@ -80,11 +80,15 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); TFLITE_DCHECK(node->builtin_data != nullptr); - const TfLiteTensor* input1 = GetInput(context, node, kAddInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kAddInputTensor1); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kAddInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kAddInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); - TfLiteTensor* output = GetOutput(context, node, kAddOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kAddOutputTensor); TF_LITE_ENSURE(context, output != nullptr); OpDataAdd* data = static_cast(node->user_data); @@ -93,6 +97,9 @@ TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS( CalculateOpDataAdd(context, params, input1, input2, output, data)); + 
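Reviewer note on the ibuffer_allocator.h interfaces introduced just above: they define the two allocator contracts the TFLM framework codes against. The sketch below is a hypothetical, minimal arena-backed implementation of IPersistentBufferAllocator to illustrate the contract (aligned, never-freed allocations). The class name and front-of-arena layout are editorial; TFLM's own allocators grow persistent allocations from the arena tail, and alignment is assumed to be a power of two.

#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/micro/ibuffer_allocator.h"

// Hypothetical bump allocator satisfying IPersistentBufferAllocator.
class StaticPersistentAllocator : public tflite::IPersistentBufferAllocator {
 public:
  StaticPersistentAllocator(uint8_t* arena, size_t arena_size)
      : arena_(arena), arena_size_(arena_size), used_(0) {}

  uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) override {
    // Round the running offset up to the requested (power-of-two) alignment.
    const size_t aligned = (used_ + alignment - 1) & ~(alignment - 1);
    if (aligned + size > arena_size_) return nullptr;  // arena exhausted
    used_ = aligned + size;
    return arena_ + aligned;  // never freed, as required by the interface
  }

  size_t GetPersistentUsedBytes() const override { return used_; }

 private:
  uint8_t* arena_;
  size_t arena_size_;
  size_t used_;
};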
micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc index b57a2ae6..5d0ab724 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/add_n.cc @@ -50,18 +50,19 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, num_inputs >= 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input_tensor_first; - TF_LITE_ENSURE_OK( - context, GetInputSafe(context, node, kInputTensor0, &input_tensor_first)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input_tensor_first = + micro_context->AllocateTempInputTensor(node, kInputTensor0); + TF_LITE_ENSURE(context, input_tensor_first != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); // Check that all tensors have the same shape and type. TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_tensor_first->type); for (int i = kInputTensor0 + 1; i < num_inputs; ++i) { - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, i, &input)); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i); + TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE(context, HaveSameShapes(input_tensor_first, input)); TF_LITE_ENSURE_TYPES_EQ(context, input_tensor_first->type, input->type); @@ -72,6 +73,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, input_tensor_first->params.scale == input->params.scale); } + + micro_context->DeallocateTempTfLiteTensor(input); } if (output->type == kTfLiteFloat32) { @@ -123,6 +126,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(input_tensor_first); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc index a583a067..e28ebebb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/assign_variable.cc @@ -52,21 +52,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { input_resource_id_tensor->type == kTfLiteInt32)); TF_LITE_ENSURE_EQ(context, NumElements(input_resource_id_tensor->dims), 1); - const TfLiteTensor* input_value = GetInput(context, node, kInputValue); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + TfLiteTensor* input_value = + micro_context->AllocateTempInputTensor(node, kInputValue); TFLITE_DCHECK(input_value != nullptr); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. 
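Reviewer note: the activations, add and add_n hunks above all apply the same migration. Prepare() no longer calls GetInput()/GetOutput() directly; it borrows temporary TfLiteTensor views from MicroContext and hands them back before returning. A condensed sketch of that pattern follows; the function name and tensor indices are illustrative, not taken from any single kernel in this patch.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"

TfLiteStatus ExamplePrepare(TfLiteContext* context, TfLiteNode* node) {
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);

  // Borrow temp tensor views for the duration of Prepare().
  TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
  TF_LITE_ENSURE(context, output != nullptr);

  // ... inspect shapes/quantization params and fill the op's user_data ...
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

  // Hand the views back so the underlying temp allocations can be reclaimed.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}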
- MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); TF_LITE_ENSURE_OK(context, resources->Allocate(input_resource_id_tensor->data.i32[0], context, input_value)); + micro_context->DeallocateTempTfLiteTensor(input_value); return kTfLiteOk; } @@ -79,14 +77,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalInput(context, node, kInputValue); TFLITE_DCHECK(input_value != nullptr); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); if (resources == nullptr) { MicroPrintf( "ASSIGN_VARIABLE requires resource variables. Please create " diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc index a6fa0462..07b680df 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/batch_to_space_nd.cc @@ -41,8 +41,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, input != nullptr && output != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum); @@ -51,6 +55,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc new file mode 100644 index 00000000..fa333249 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_args.cc @@ -0,0 +1,97 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/broadcast_args.h" + +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { +namespace { +constexpr int kShape1Tensor = 0; +constexpr int kShape2Tensor = 1; +constexpr int kOutputTensor = 0; + +TfLiteStatus BroadcastArgsPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, NumInputs(node) == 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* shape1 = + micro_context->AllocateTempInputTensor(node, kShape1Tensor); + TfLiteTensor* shape2 = + micro_context->AllocateTempInputTensor(node, kShape2Tensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + + TF_LITE_ENSURE(context, + shape1->type == kTfLiteInt32 || shape1->type == kTfLiteInt64); + TF_LITE_ENSURE_EQ(context, shape1->type, shape2->type); + TF_LITE_ENSURE_EQ(context, shape1->type, output->type); + + // Ensures the shapes are 1D tensor. + TF_LITE_ENSURE_EQ(context, NumDimensions(shape1), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(shape2), 1); + + // Ensure the shape of the output tensor is compatible + TF_LITE_ENSURE_EQ(context, NumDimensions(output), 1); + + micro_context->DeallocateTempTfLiteTensor(shape1); + micro_context->DeallocateTempTfLiteTensor(shape2); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus BroadcastArgsEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* shape1 = + micro::GetEvalInput(context, node, kShape1Tensor); + const TfLiteEvalTensor* shape2 = + micro::GetEvalInput(context, node, kShape2Tensor); + TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor); + + if (output->type == kTfLiteInt32) { + reference_ops::BroadcastArgs( + micro::GetTensorShape(shape1), micro::GetTensorData(shape1), + micro::GetTensorShape(shape2), micro::GetTensorData(shape2), + micro::GetTensorShape(output), micro::GetTensorData(output)); + } else { + reference_ops::BroadcastArgs( + micro::GetTensorShape(shape1), micro::GetTensorData(shape1), + micro::GetTensorShape(shape2), micro::GetTensorData(shape2), + micro::GetTensorShape(output), micro::GetTensorData(output)); + } + + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_BROADCAST_ARGS() { + return {/*init=*/nullptr, + /*free=*/nullptr, + /*prepare=*/BroadcastArgsPrepare, + /*invoke=*/BroadcastArgsEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite \ No newline at end of file diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc new file mode 100644 index 00000000..5302faf1 --- /dev/null +++ 
b/code/components/tflite-lib/tensorflow/lite/micro/kernels/broadcast_to.cc @@ -0,0 +1,129 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/broadcast_to.h" + +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { + +namespace { +constexpr int kInputTensor = 0; +constexpr int kShapeTensor = 1; +constexpr int kOutputTensor = 0; +// Support a maximum of 5 dimensions in TFLM. +constexpr int kMaxDims = 5; + +TfLiteStatus ValidateOutputTensor(TfLiteContext* context, TfLiteTensor* input, + TfLiteTensor* shape, TfLiteTensor* output) { + // Ensures the shape is 1D tensor. + TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1); + + // Ensure output dims is not less than input dims. + int input_num_dims = NumDimensions(input); + int output_num_dims = NumDimensions(output); + int shape_num_dims = SizeOfDimension(shape, 0); + TF_LITE_ENSURE_MSG(context, output_num_dims == shape_num_dims, + "Output must match with the expected shape dimension."); + TF_LITE_ENSURE_MSG(context, input_num_dims <= output_num_dims, + "Output shape must be broadcastable from input shape."); + TF_LITE_ENSURE_MSG(context, output_num_dims <= kMaxDims, + "BroadcastTo only supports 1-5D tensor."); + + // Check if output shape is broadcastable from input shape. + auto get_shape_data = [shape](int i) -> int32_t { + if (shape->type == kTfLiteInt32) { + return GetTensorData(shape)[i]; + } else { + return GetTensorData(shape)[i]; + } + }; + + int extending_dims = output_num_dims - input_num_dims; + for (int idx = 0; idx < input_num_dims; ++idx) { + TF_LITE_ENSURE_MSG( + context, + (SizeOfDimension(input, idx) == 1 || + SizeOfDimension(input, idx) == get_shape_data(extending_dims + idx)), + "Output shape must be broadcastable from input shape."); + } + + // Validating the shape of the output tensor. 
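Reviewer note: ValidateOutputTensor above enforces the usual right-aligned broadcast rule before BroadcastTo runs. Stated outside the kernel plumbing, the rule is simply the following; the helper below is an editorial illustration, not part of the upstream sources.

#include <cstddef>
#include <vector>

// Input is broadcastable to the requested shape iff, after right-aligning the
// two shapes, every input dimension is either 1 or equal to the output one.
bool BroadcastableTo(const std::vector<int>& in, const std::vector<int>& out) {
  if (in.size() > out.size()) return false;
  const size_t offset = out.size() - in.size();
  for (size_t i = 0; i < in.size(); ++i) {
    if (in[i] != 1 && in[i] != out[offset + i]) return false;
  }
  return true;
}
// BroadcastableTo({1, 3}, {4, 2, 3}) -> true
// BroadcastableTo({2, 3}, {3, 3})    -> false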
+ tflite::RuntimeShape output_shape = tflite::GetTensorShape(output); + for (int idx = 0; idx < output_num_dims; ++idx) { + TF_LITE_ENSURE(context, output_shape.Dims(idx) == get_shape_data(idx)); + } + return kTfLiteOk; +} + +TfLiteStatus BroadcastToPrepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, NumInputs(node) == 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* shape = + micro_context->AllocateTempInputTensor(node, kShapeTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + + TF_LITE_ENSURE_MSG(context, (NumDimensions(input) <= kMaxDims), + "BroadcastTo only supports 1-5D tensor."); + + TF_LITE_ENSURE(context, + shape->type == kTfLiteInt32 || shape->type == kTfLiteInt64); + TF_LITE_ENSURE_EQ(context, input->type, output->type); + + // Does not support String type due to its variable size. This limitation is + // the same as TFLite. + TF_LITE_ENSURE(context, input->type != kTfLiteString); + + TF_LITE_ENSURE_STATUS(ValidateOutputTensor(context, input, shape, output)); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(shape); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus BroadcastToEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kInputTensor); + TfLiteEvalTensor* output = micro::GetEvalOutput(context, node, kOutputTensor); + + // BroadcastTo op support upto 5 dims, different from 8 dims in TFLite. + reference_ops::BroadcastTo( + micro::GetTensorShape(input), input->data.raw, + micro::GetTensorShape(output), output->data.raw, input->type); + return kTfLiteOk; +} +} // namespace + +TfLiteRegistration Register_BROADCAST_TO() { + return {/*init=*/nullptr, + /*free=*/nullptr, + /*prepare=*/BroadcastToPrepare, + /*invoke=*/BroadcastToEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite \ No newline at end of file diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc index 97fded0c..4db39f7d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/call_once.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_graph.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -50,16 +51,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumInputs(node) == 0); TF_LITE_ENSURE(context, NumOutputs(node) == 0); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. 
- MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); TF_LITE_ENSURE(context, - op_data->init_subgraph_index < graph_info->NumSubgraphs()); + op_data->init_subgraph_index < graph_info.NumSubgraphs()); return kTfLiteOk; } @@ -72,16 +68,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); TF_LITE_ENSURE_OK(context, - graph_info->InvokeSubgraph(op_data->init_subgraph_index)); + graph_info.InvokeSubgraph(op_data->init_subgraph_index)); op_data->has_run = true; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc index 0314e523..dc651a24 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cast.cc @@ -28,11 +28,19 @@ constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -83,6 +91,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt32: return copyToTensor(context, tflite::micro::GetTensorData(input), output, num_elements); + case kTfLiteUInt32: + return copyToTensor(context, + tflite::micro::GetTensorData(input), output, + num_elements); case kTfLiteFloat32: return copyToTensor(context, tflite::micro::GetTensorData(input), output, num_elements); diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc index f929ce62..d0a48f91 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/ceil.cc @@ -29,9 +29,13 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); 
TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -42,6 +46,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc index 0bb4d476..682efb43 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/circular_buffer_common.cc @@ -39,9 +39,13 @@ const int kCircularBufferCyclesMaxIndex = 0; // 'cycles_max' const TfLiteStatus kTfLiteAbort = static_cast(-9); TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = - GetInput(context, node, kCircularBufferInputTensor); - TfLiteTensor* output = GetOutput(context, node, kCircularBufferOutputTensor); + + MicroContext * micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context-> AllocateTempInputTensor(node, kCircularBufferInputTensor); + TfLiteTensor* output = + micro_context-> AllocateTempOutputTensor(node, kCircularBufferOutputTensor); TFLITE_DCHECK(node->user_data != nullptr); OpDataCircularBuffer* op_data = @@ -85,6 +89,9 @@ TfLiteStatus CircularBufferPrepare(TfLiteContext* context, TfLiteNode* node) { op_data->cycles_until_run = op_data->cycles_max; node->user_data = op_data; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc index eb39d9ea..925c3fb5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/comparisons.cc @@ -540,9 +540,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); - const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); if (input1->type == kTfLiteInt8) { @@ -570,6 +574,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->params.input2_shift = input2_shift; } + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc index 8f45ac6a..d727a0d5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/concatenation.cc @@ -115,13 +115,19 @@ TfLiteStatus 
Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteConcatenationParams* params = reinterpret_cast(node->builtin_data); - const TfLiteTensor* input_tensor = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input_tensor = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input_tensor != nullptr); TfLiteType input_type = input_tensor->type; - const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output_tensor = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output_tensor != nullptr); TfLiteType output_type = output_tensor->type; + micro_context->DeallocateTempTfLiteTensor(input_tensor); + micro_context->DeallocateTempTfLiteTensor(output_tensor); + // Check activation and input type TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); TF_LITE_ENSURE(context, @@ -138,7 +144,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Shapes with dimensions >4 are not yet supported with static allocation. for (int i = 0; i < num_inputs; ++i) { - const TfLiteTensor* input = GetInput(context, node, i); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, i); TF_LITE_ENSURE(context, input != nullptr); int num_dimensions = NumDimensions(input); @@ -150,13 +156,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { num_dimensions); return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(input); } // Calculate OpData. TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); switch (output_type) { // Already know in/outtypes are same. @@ -183,10 +191,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Allocate persistent scale and zeropoint buffers. // Store input scale and zero point values in OpParams: for (int i = 0; i < node->inputs->size; ++i) { - const TfLiteTensor* t = GetInput(context, node, i); + TfLiteTensor* t = micro_context->AllocateTempInputTensor(node, i); TF_LITE_ENSURE(context, t != nullptr); input_scales[i] = t->params.scale; input_zero_points[i] = t->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(t); } data->params.input_scale = input_scales; @@ -202,6 +211,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h index 4089a965..06b35e1e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -79,7 +79,8 @@ TfLiteRegistration Register_CONV_2D(); #if defined(XTENSA) // Returns a TfLiteRegistration struct for kernel variant that only supports -// int8 inputs and outputs. +// int8 activations and int8 weights and always calls the reference +// implementation. 
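Reviewer note on the conv.h hunk: it documents per-variant registrations such as Register_CONV_2D_INT8REF() and, under CMSIS_NN, Register_CONV_2D_INT8()/Register_CONV_2D_INT16(). The sketch below shows how an application could pin one of these variants instead of the default Register_CONV_2D(); it assumes MicroMutableOpResolver::AddConv2D() accepts an explicit TfLiteRegistration, which is not shown in this patch and is flagged here as an assumption.

#include "tensorflow/lite/micro/kernels/conv.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

TfLiteStatus UseInt8ReferenceConv(tflite::MicroMutableOpResolver<1>& resolver) {
  // Assumed overload: AddConv2D(const TfLiteRegistration&). On targets without
  // the specialised build, Register_CONV_2D_INT8REF() falls back to the
  // generic conv kernel via the inline stub declared alongside it.
  return resolver.AddConv2D(tflite::Register_CONV_2D_INT8REF());
}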
TfLiteRegistration Register_CONV_2D_INT8REF(); #else inline TfLiteRegistration Register_CONV_2D_INT8REF() { @@ -87,6 +88,25 @@ inline TfLiteRegistration Register_CONV_2D_INT8REF() { } #endif +#if defined(CMSIS_NN) +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int8 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration Register_CONV_2D_INT8(); + +// Returns a TfLiteRegistration struct for kernel variant that only supports +// int16 activations and int8 weights and uses the latency optimized +// implementations. +TfLiteRegistration Register_CONV_2D_INT16(); + +#else +inline TfLiteRegistration Register_CONV_2D_INT8() { return Register_CONV_2D(); } + +inline TfLiteRegistration Register_CONV_2D_INT16() { + return Register_CONV_2D(); +} +#endif + } // namespace tflite #endif // TENSORFLOW_LITE_MICRO_KERNELS_CONV_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc index 6887e423..7115f7ba 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/conv_common.cc @@ -93,13 +93,18 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, params.dilation_width_factor, height, width, filter_height, filter_width, padding, &out_height, &out_width); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kConvBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); // Note that quantized inference requires that all tensors have their @@ -119,6 +124,11 @@ TfLiteStatus CalculateOpDataConv(TfLiteContext* context, TfLiteNode* node, data->filter_zero_point = filter->params.zero_point; data->output_zero_point = output->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(bias); + return kTfLiteOk; } @@ -129,12 +139,16 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataConv* data = static_cast(node->user_data); const auto& params = *(static_cast(node->builtin_data)); + MicroContext* micro_context = GetMicroContext(context); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, 
kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); const int input_width = input->dims->data[2]; @@ -174,6 +188,10 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) { context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, data)); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc index 2dc9f98f..61f7af23 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/cumsum.cc @@ -47,8 +47,12 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* axis = GetInput(context, node, kAxisTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* axis = + micro_context->AllocateTempInputTensor(node, kAxisTensor); TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 || input->type == kTfLiteInt8); @@ -58,7 +62,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE_EQ(context, input->type, output->type); TF_LITE_ENSURE(context, HaveSameShapes(input, output)); @@ -91,6 +96,10 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { &data->output_activation_max)); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc index ae42ee1b..cce93c9c 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depth_to_space.cc @@ -40,11 +40,14 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); @@ -83,6 +86,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { 
output->dims->data[kWidthRank] = output_width; output->dims->data[kDepthRank] = output_channels; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc index 49167f38..3bf07274 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/depthwise_conv_common.cc @@ -94,13 +94,18 @@ TfLiteStatus CalculateOpDataDepthwiseConv( params.dilation_width_factor, height, width, filter_height, filter_width, padding, &out_height, &out_width); - const TfLiteTensor* input = GetInput(context, node, kConvInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kConvBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kConvOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); // Note that quantized inference requires that all tensors have their @@ -120,6 +125,11 @@ TfLiteStatus CalculateOpDataDepthwiseConv( data->filter_zero_point = filter->params.zero_point; data->output_zero_point = output->params.zero_point; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(bias); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } @@ -130,14 +140,16 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataConv* data = static_cast(node->user_data); const auto& params = *(static_cast(node->builtin_data)); + MicroContext* micro_context = GetMicroContext(context); - TfLiteTensor* output = GetOutput(context, node, kDepthwiseConvOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kDepthwiseConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = - GetInput(context, node, kDepthwiseConvInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kDepthwiseConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = - GetInput(context, node, kDepthwiseConvWeightsTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kDepthwiseConvWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); const int input_width = input->dims->data[2]; @@ -180,6 +192,10 @@ TfLiteStatus DepthwiseConvPrepare(TfLiteContext* context, TfLiteNode* node) { context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, data)); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + return kTfLiteOk; } diff --git 
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc index 00b47f57..4be5ad89 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/dequantize_common.cc @@ -33,10 +33,12 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + MicroContext* micro_context = GetMicroContext(context); + // TODO(b/140515557): Add cached dequant to improve hybrid model performance. - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE(context, @@ -54,6 +56,10 @@ TfLiteStatus DequantizePrepare(TfLiteContext* context, TfLiteNode* node) { data->quantization_params.zero_point = input->params.zero_point; data->quantization_params.scale = static_cast(input->params.scale); data->output_zero_point = output->params.zero_point; + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc index 5ac343cf..efe57e2f 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/detection_postprocess.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include +#include #include "flatbuffers/flexbuffers.h" #include "tensorflow/lite/c/builtin_op_data.h" @@ -152,14 +154,17 @@ void Free(TfLiteContext* context, void* buffer) {} TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { auto* op_data = static_cast(node->user_data); + MicroContext* micro_context = GetMicroContext(context); + // Inputs: box_encodings, scores, anchors TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); - const TfLiteTensor* input_box_encodings = - GetInput(context, node, kInputTensorBoxEncodings); - const TfLiteTensor* input_class_predictions = - GetInput(context, node, kInputTensorClassPredictions); - const TfLiteTensor* input_anchors = - GetInput(context, node, kInputTensorAnchors); + TfLiteTensor* input_box_encodings = + micro_context->AllocateTempInputTensor(node, kInputTensorBoxEncodings); + TfLiteTensor* input_class_predictions = + micro_context->AllocateTempInputTensor(node, + kInputTensorClassPredictions); + TfLiteTensor* input_anchors = + micro_context->AllocateTempInputTensor(node, kInputTensorAnchors); TF_LITE_ENSURE_EQ(context, NumDimensions(input_box_encodings), 3); TF_LITE_ENSURE_EQ(context, NumDimensions(input_class_predictions), 3); TF_LITE_ENSURE_EQ(context, NumDimensions(input_anchors), 2); @@ -217,6 +222,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // num_detections TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4); + micro_context->DeallocateTempTfLiteTensor(input_box_encodings); + micro_context->DeallocateTempTfLiteTensor(input_class_predictions); + micro_context->DeallocateTempTfLiteTensor(input_anchors); + return kTfLiteOk; } @@ -313,9 +322,10 @@ TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node, void DecreasingPartialArgSort(const float* values, int num_values, int num_to_sort, int* indices) { std::iota(indices, indices + num_values, 0); - std::partial_sort( - indices, indices + num_to_sort, indices + num_values, - [&values](const int i, const int j) { return values[i] > values[j]; }); + std::partial_sort(indices, indices + num_to_sort, indices + num_values, + [&values](const int i, const int j) { + return std::tie(values[i], j) > std::tie(values[j], i); + }); } template diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc index 581e532b..366dd610 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elementwise.cc @@ -38,11 +38,13 @@ bool IsLogicalSupportedType(const TfLiteType type) { typedef bool (*IsSupportedType)(TfLiteType); template TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (!IsSupportedType(input->type)) { @@ -50,6 +52,9 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { 
TfLiteTypeGetName(input->type), input->type); return kTfLiteError; } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc index 7e785f2f..b2cd19cc 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/elu.cc @@ -80,13 +80,16 @@ void EvalUsingLookupTable(const OpData* data, const TfLiteEvalTensor* input, } TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); // Use LUT to handle quantized elu path. @@ -97,7 +100,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { }; PopulateLookupTable(input, output, transform, data); } - + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md new file mode 100644 index 00000000..b0c215fb --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/README.md @@ -0,0 +1,11 @@ +# Info + +These are the Espressif chipset-specific replacement kernels. +The kernels call optimized routines or reference routines, depending on the selected optimization option. + +By default, optimizations are selected if available. +To change this behaviour, please make the appropriate `ESP-NN` menu selection after running: + +``` +idf.py menuconfig +``` diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc new file mode 100644 index 00000000..47a17d9f --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/add.cc @@ -0,0 +1,209 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/add.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" +#include "tensorflow/lite/micro/kernels/add.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +#include + +#if ESP_NN +#include +#endif + +long long add_total_time = 0; + +namespace tflite { + +void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params, + const OpDataAdd* data, const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params; + SetActivationParams(data->output_activation_min_f32, + data->output_activation_max_f32, &op_params); + if (data->requires_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} + +TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteAddParams* params, const OpDataAdd* data, + const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, + TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params; + op_params.left_shift = data->left_shift; + op_params.input1_offset = data->input1_offset; + op_params.input1_multiplier = data->input1_multiplier; + op_params.input1_shift = data->input1_shift; + op_params.input2_offset = data->input2_offset; + op_params.input2_multiplier = data->input2_multiplier; + op_params.input2_shift = data->input2_shift; + op_params.output_offset = data->output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = data->output_shift; + SetActivationParams(data->output_activation_min, data->output_activation_max, + &op_params); + bool need_broadcast = reference_ops::ProcessBroadcastShapes( + tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), &op_params); + + switch (output->type) { + case kTfLiteInt8: { + if (need_broadcast) { + reference_integer_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { +#if ESP_NN + const int8_t *input1_data = tflite::micro::GetTensorData(input1); + const int8_t *input2_data = tflite::micro::GetTensorData(input2); + int8_t *out_data = tflite::micro::GetTensorData(output); + + esp_nn_add_elementwise_s8(input1_data, + input2_data, + 
data->input1_offset, + data->input2_offset, + data->input1_multiplier, + data->input2_multiplier, + data->input1_shift, + data->input2_shift, + data->left_shift, + out_data, + data->output_offset, + data->output_multiplier, + data->output_shift, + data->output_activation_min, + data->output_activation_max, + MatchingElementsSize(tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorShape(output)) + ); +#else + reference_integer_ops::Add( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + } + break; + } + case kTfLiteInt16: { + if (need_broadcast) { + reference_ops::BroadcastAdd4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + false); + } + break; + } + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(output->type), output->type); + return kTfLiteError; + } + + return kTfLiteOk; +} + +void* AddInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataAdd)); +} + +TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataAdd* data = static_cast(node->user_data); + + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, kAddInputTensor1); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, kAddInputTensor2); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kAddOutputTensor); + + long long start_time = esp_timer_get_time(); + + if (output->type == kTfLiteFloat32) { + EvalAdd(context, node, params, data, input1, input2, output); + } else if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { + TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data, + input1, input2, output)); + } else { + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(output->type), + output->type); + return kTfLiteError; + } + add_total_time += esp_timer_get_time() - start_time; + + return kTfLiteOk; +} + +TfLiteRegistration Register_ADD() { + return {/*init=*/AddInit, + /*free=*/nullptr, + /*prepare=*/AddPrepare, + /*invoke=*/AddEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc new file mode 100644 index 00000000..09260482 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/conv.cc @@ -0,0 +1,319 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/conv.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#include "freertos/FreeRTOS.h" +#include + +#if ESP_NN +#include +#endif + + +long long conv_total_time = 0; + +namespace tflite { +namespace { + +struct NodeData { + OpDataConv op_data; +#if ESP_NN + int buffer_idx; +#endif +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(NodeData)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + NodeData* data = static_cast(node->user_data); + const auto& params = + *(static_cast(node->builtin_data)); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + const int input_width = input->dims->data[2]; + const int input_height = input->dims->data[1]; + const int filter_width = filter->dims->data[2]; + const int filter_height = filter->dims->data[1]; + const int output_width = output->dims->data[2]; + const int output_height = output->dims->data[1]; + + // Dynamically allocate per-channel quantization parameters. + const int num_channels = filter->dims->data[kConvQuantizedDimension]; + data->op_data.per_channel_output_multiplier = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + data->op_data.per_channel_output_shift = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + + // All per-channel quantized tensors need valid zero point and scale arrays. 
+ if (input->type == kTfLiteInt8) { + TF_LITE_ENSURE_EQ(context, filter->quantization.type, + kTfLiteAffineQuantization); + + const auto* affine_quantization = + static_cast(filter->quantization.params); + TFLITE_DCHECK(affine_quantization != nullptr); + TFLITE_DCHECK(affine_quantization->scale != nullptr); + TFLITE_DCHECK(affine_quantization->zero_point != nullptr); + + TF_LITE_ENSURE(context, + affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, + affine_quantization->zero_point->size); + } + + TF_LITE_ENSURE_STATUS(CalculateOpDataConv( + context, node, params, input_width, input_height, filter_width, + filter_height, output_width, output_height, input->type, &data->op_data)); + +#if ESP_NN + if (input->type == kTfLiteInt8) { + int scratch_buf_size = esp_nn_get_conv_scratch_size( + input_width, input_height, input->dims->data[3], + output->dims->data[3], filter_width, filter_height); + if (scratch_buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, scratch_buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; + } + } +#endif + + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + + return kTfLiteOk; +} + +#if ESP_NN +// Fixed-point per-channel-quantization convolution Int8 function wrapper. +inline void EvalQuantizedPerChannel( + TfLiteContext* context, TfLiteNode* node, const TfLiteConvParams& params, + const NodeData& data, const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + + if (dilation_width_factor == 1 && dilation_height_factor == 1) { + // Get parameters. + RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter); + RuntimeShape input_shape = tflite::micro::GetTensorShape(input); + RuntimeShape output_shape = tflite::micro::GetTensorShape(output); + RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int32_t input_offset = -data.op_data.input_zero_point; + const int32_t output_offset = data.op_data.output_zero_point; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = data.op_data.padding.width; + const int pad_height = data.op_data.padding.height; + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Set min and max value of the output. + const int32_t activation_min = data.op_data.output_activation_min; + const int32_t activation_max = data.op_data.output_activation_max; + + // Consistency check. 
+ TFLITE_DCHECK_LE(activation_min, activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + + if (tflite::micro::GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + void *scratch_buf = NULL; + if (data.buffer_idx > -1) { + scratch_buf = context->GetScratchBuffer(context, data.buffer_idx); + } + esp_nn_set_conv_scratch_buf(scratch_buf); + + const int input_size = input_width * input_height * input_depth; + const int output_size = output_width * output_height * output_depth; + + for (int i_batch = 0; i_batch < batch_size; i_batch++) { + esp_nn_conv_s8(input_data + i_batch * input_size, + input_width, input_height, input_depth, input_offset, + pad_width, pad_height, stride_width, stride_height, + tflite::micro::GetTensorData(filter), + filter_width, filter_height, + tflite::micro::GetTensorData(bias), + output_data + i_batch * output_size, + output_width, output_height, output_depth, output_offset, + data.op_data.per_channel_output_shift, + data.op_data.per_channel_output_multiplier, + activation_min, activation_max); + } + } else { + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} +#endif + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor) + : nullptr; + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); + + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto& params = + *(reinterpret_cast(node->builtin_data)); + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = *(static_cast(node->user_data)); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + long long start_time = esp_timer_get_time(); + switch (input->type) { // Already know in/out types are same. 
+ case kTfLiteFloat32: { + tflite::reference_ops::Conv( + ConvParamsFloat(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + tflite::micro::GetTensorShape(nullptr), nullptr); + break; + } + case kTfLiteInt8: { +#if ESP_NN + EvalQuantizedPerChannel(context, node, params, data, input, filter, + bias, output); +#else + reference_integer_ops::ConvPerChannel( + ConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + } + case kTfLiteUInt8: { + //EvalQuantized + reference_ops::Conv(ConvParamsQuantized(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output), + tflite::micro::GetTensorShape(nullptr), nullptr, + nullptr); + break; + } + default: + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + conv_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_CONV_2D() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/Prepare, + /*invoke=*/Eval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc new file mode 100644 index 00000000..5f2d9d50 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/depthwise_conv.cc @@ -0,0 +1,319 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/depthwise_conv.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#include "freertos/FreeRTOS.h" +#include + +#if ESP_NN +#include +#endif + +long long dc_total_time = 0; + +namespace tflite { +namespace { + +struct NodeData { + OpDataConv op_data; +#if ESP_NN + int buffer_idx; +#endif +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(NodeData)); +} + +#if ESP_NN +inline void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + const TfLiteDepthwiseConvParams& params, + const NodeData& data, + const TfLiteEvalTensor* input, + const TfLiteEvalTensor* filter, + const TfLiteEvalTensor* bias, + TfLiteEvalTensor* output) { + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + + if (dilation_width_factor == 1 && dilation_height_factor == 1) { + // Get parameters. + RuntimeShape input_shape = tflite::micro::GetTensorShape(input); + RuntimeShape filter_shape = tflite::micro::GetTensorShape(filter); + RuntimeShape output_shape = tflite::micro::GetTensorShape(output); + RuntimeShape bias_shape = tflite::micro::GetTensorShape(bias); + + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = -data.op_data.input_zero_point; + const int32_t output_offset = data.op_data.output_zero_point; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = data.op_data.padding.width; + const int pad_height = data.op_data.padding.height; + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Set min and max value of the output. + const int32_t activation_min = data.op_data.output_activation_min; + const int32_t activation_max = data.op_data.output_activation_max; + + // Consistency check. 
+ TFLITE_DCHECK_LE(activation_min, activation_max); + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + if (tflite::micro::GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + const int input_size = input_width * input_height * input_depth; + const int output_size = output_width * output_height * output_depth; + void *scratch_buf = NULL; + if (data.buffer_idx > -1) { + scratch_buf = context->GetScratchBuffer(context, data.buffer_idx); + } + esp_nn_set_depthwise_conv_scratch_buf(scratch_buf); + + for (int i_batch = 0; i_batch < batch_size; i_batch++) { + esp_nn_depthwise_conv_s8(input_data + i_batch * input_size, input_width, + input_height, input_depth, input_offset, + pad_width, pad_height, + stride_width, stride_height, depth_multiplier, + tflite::micro::GetTensorData(filter), + filter_width, filter_height, + tflite::micro::GetTensorData(bias), + output_data + i_batch * output_size, + output_width, output_height, output_offset, + data.op_data.per_channel_output_shift, + data.op_data.per_channel_output_multiplier, + activation_min, activation_max); + } + } else { + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } +} +#endif + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + NodeData* data = static_cast(node->user_data); + const TfLiteDepthwiseConvParams& params = + *(static_cast(node->builtin_data)); + + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kConvInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kConvWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kConvBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kConvOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + const int input_width = input->dims->data[2]; + const int input_height = input->dims->data[1]; + const int filter_width = filter->dims->data[2]; + const int filter_height = filter->dims->data[1]; + const int output_width = output->dims->data[2]; + const int output_height = output->dims->data[1]; + + // Dynamically allocate per-channel quantization parameters. + const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; + data->op_data.per_channel_output_multiplier = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + data->op_data.per_channel_output_shift = + static_cast(context->AllocatePersistentBuffer( + context, num_channels * sizeof(int32_t))); + + // All per-channel quantized tensors need valid zero point and scale arrays. 
+ if (input->type == kTfLiteInt8) { + TF_LITE_ENSURE_EQ(context, filter->quantization.type, + kTfLiteAffineQuantization); + + const auto* affine_quantization = + static_cast(filter->quantization.params); + TFLITE_DCHECK(affine_quantization != nullptr); + TFLITE_DCHECK(affine_quantization->scale != nullptr); + TFLITE_DCHECK(affine_quantization->zero_point != nullptr); + + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]); + + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, + affine_quantization->zero_point->size); + } + + TF_LITE_ENSURE_STATUS(CalculateOpDataDepthwiseConv( + context, node, params, input_width, input_height, filter_width, + filter_height, output_width, output_height, input->type, &data->op_data)); + +#if ESP_NN + if (input->type == kTfLiteInt8) { + int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size( + input_width, input_height, input->dims->data[3], + params.depth_multiplier, filter_width, filter_height); + if (scratch_buf_size > 0) { + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, scratch_buf_size, &data->buffer_idx)); + } else { + data->buffer_idx = -1; + } + } +#endif + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(bias); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto& params = + *(reinterpret_cast(node->builtin_data)); + const NodeData& data = *(static_cast(node->user_data)); + + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor); + const TfLiteEvalTensor* bias = + (NumInputs(node) == 3) + ? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor) + : nullptr; + + long long start_time = esp_timer_get_time(); + switch (input->type) { // Already know in/out types are same. 
+ case kTfLiteFloat32: + tflite::reference_ops::DepthwiseConv( + DepthwiseConvParamsFloat(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + case kTfLiteInt8: +#if ESP_NN + EvalQuantizedPerChannel(context, node, params, data, input, filter, bias, + output); +#else + reference_integer_ops::DepthwiseConvPerChannel( + DepthwiseConvParamsQuantized(params, data.op_data), + data.op_data.per_channel_output_multiplier, + data.op_data.per_channel_output_shift, + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + case kTfLiteUInt8: + //EvalQuantized(context, node, params, &data, input, filter, bias, output); + reference_ops::DepthwiseConv( + DepthwiseConvParamsQuantized(params, data.op_data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + default: + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + dc_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_DEPTHWISE_CONV_2D() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/Prepare, + /*invoke=*/Eval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc new file mode 100644 index 00000000..5e1705da --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/fully_connected.cc @@ -0,0 +1,198 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/fully_connected.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +#if ESP_NN +#include +#endif + +#include + +long long fc_total_time = 0; + +namespace tflite { +namespace { + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, + sizeof(OpDataFullyConnected)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TFLITE_DCHECK(node->user_data != nullptr); + TFLITE_DCHECK(node->builtin_data != nullptr); + + auto* data = static_cast(node->user_data); + const auto params = + static_cast(node->builtin_data); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* filter = micro_context->AllocateTempInputTensor( + node, kFullyConnectedWeightsTensor); + TF_LITE_ENSURE(context, filter != nullptr); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor( + node, kFullyConnectedOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected( + context, params->activation, input->type, + input, filter, bias, output, data)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + if (bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(bias); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + const auto* params = + static_cast(node->builtin_data); + + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kFullyConnectedInputTensor); + const TfLiteEvalTensor* filter = + tflite::micro::GetEvalInput(context, node, kFullyConnectedWeightsTensor); + const TfLiteEvalTensor* bias = + tflite::micro::GetEvalInput(context, node, kFullyConnectedBiasTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); + + TFLITE_DCHECK(node->user_data != nullptr); + const auto& data = + *(static_cast(node->user_data)); + + long long start_time = esp_timer_get_time(); + // Checks in Prepare ensure input, output and filter types are all the same. 
+ switch (input->type) { + case kTfLiteFloat32: { + tflite::reference_ops::FullyConnected( + FullyConnectedParamsFloat(params->activation), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + + case kTfLiteInt8: { + const int32_t* bias_data = + nullptr != bias ? tflite::micro::GetTensorData(bias) + : nullptr; +#if ESP_NN + const RuntimeShape& filter_shape = tflite::micro::GetTensorShape(filter); + const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = output_shape.Dims(0); + const int output_depth = output_shape.Dims(1); + TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2)); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + const int8_t *filter_data = tflite::micro::GetTensorData(filter); + + for (int b = 0; b < batches; ++b) { + esp_nn_fully_connected_s8(input_data, -data.input_zero_point, + accum_depth, + filter_data, -data.filter_zero_point, + bias_data, output_data, output_depth, + data.output_zero_point, + data.output_shift, data.output_multiplier, + data.output_activation_min, + data.output_activation_max); + input_data += accum_depth; + output_data += output_depth; + } +#else + tflite::reference_integer_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), bias_data, + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); +#endif + break; + } + + case kTfLiteUInt8: { + tflite::reference_ops::FullyConnected( + FullyConnectedParamsQuantized(data), + tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(filter), + tflite::micro::GetTensorData(filter), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(bias), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + break; + } + default: { + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + } + fc_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_FULLY_CONNECTED() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/Prepare, + /*invoke=*/Eval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc new file mode 100644 index 00000000..0e8a82f4 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/mul.cc @@ -0,0 +1,131 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/mul.h" + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h" +#include "tensorflow/lite/kernels/internal/reference/mul.h" +#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +#if ESP_NN +#include +#endif + +#include + +long long mul_total_time = 0; + +namespace tflite { +#if ESP_NN +void MulEvalQuantized(TfLiteContext* context, TfLiteNode* node, + const OpDataMul* data, const TfLiteEvalTensor* input1, + const TfLiteEvalTensor* input2, + TfLiteEvalTensor* output) { + tflite::ArithmeticParams op_params = {}; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + op_params.float_activation_max = data->output_activation_max_f32; + op_params.input1_offset = -data->input1_zero_point; + op_params.input2_offset = -data->input2_zero_point; + op_params.output_offset = data->output_zero_point; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = data->output_shift; + + bool need_broadcast = reference_ops::ProcessBroadcastShapes( + tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), &op_params); + + if (need_broadcast) { + reference_integer_ops::BroadcastMul4DSlow( + op_params, tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorData(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorData(input2), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); + } else { + const int8_t *input1_data = tflite::micro::GetTensorData(input1); + const int8_t *input2_data = tflite::micro::GetTensorData(input2); + int8_t *out_data = tflite::micro::GetTensorData(output); + + esp_nn_mul_elementwise_s8(input1_data, input2_data, op_params.input1_offset, + op_params.input2_offset, out_data, op_params.output_offset, + op_params.output_multiplier, op_params.output_shift, + op_params.quantized_activation_min, op_params.quantized_activation_max, + MatchingElementsSize(tflite::micro::GetTensorShape(input1), + tflite::micro::GetTensorShape(input2), + tflite::micro::GetTensorShape(output))); + } +} +#endif + +TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataMul* data = static_cast(node->user_data); + + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, kMulInput1Tensor); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, kMulInput2Tensor); + 
TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kMulOutputTensor); + + long long start_time = esp_timer_get_time(); + switch (input1->type) { + case kTfLiteInt8: +#if ESP_NN + MulEvalQuantized(context, node, data, input1, input2, output); +#else + EvalMulQuantizedReference(context, node, data, input1, input2, output); +#endif + break; + case kTfLiteInt32: + EvalMulQuantizedReference(context, node, data, input1, input2, output); + break; + case kTfLiteFloat32: + EvalMulFloatReference(context, node, params, data, input1, input2, + output); + break; + default: + MicroPrintf("Type %s (%d) not supported.", + TfLiteTypeGetName(input1->type), input1->type); + return kTfLiteError; + } + mul_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +TfLiteRegistration Register_MUL() { + return {/*init=*/MulInit, + /*free=*/nullptr, + /*prepare=*/MulPrepare, + /*invoke=*/MulEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc new file mode 100644 index 00000000..d55bab82 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/esp_nn/pooling.cc @@ -0,0 +1,245 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/kernels/internal/reference/pooling.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/pooling.h" + +#if ESP_NN +#include +#endif + +#include + +long long pooling_total_time = 0; + +namespace tflite { + +namespace { +#if ESP_NN +void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, + const TfLitePoolParams* params, const OpDataPooling* data, + const TfLiteEvalTensor* input, + TfLiteEvalTensor* output) { + + const int stride_height = params->stride_height; + const int stride_width = params->stride_width; + const int filter_height = params->filter_height; + const int filter_width = params->filter_width; + const int activation_min = data->activation_min; + const int activation_max = data->activation_max; + const int pad_height = data->padding.height; + const int pad_width = data->padding.width; + + const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input); + const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output); + TFLITE_DCHECK_LE(activation_min, activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int input_size = input_width * input_height * depth; + const int output_size = output_width * output_height * depth; + + if (depth % 4 == 0) { // S3 version only supports channels multiple of 4 + for (int batch = 0; batch < batches; ++batch) { + esp_nn_avg_pool_s8(input_data, input_width, input_height, + output_data, output_width, output_height, + stride_width, stride_height, + filter_width, filter_height, + pad_width, pad_height, + activation_min, activation_max, depth); + input_data += input_size; + output_data += output_size; + } + } else { + for (int batch = 0; batch < batches; ++batch) { + esp_nn_avg_pool_s8_ansi(input_data, input_width, input_height, + output_data, output_width, output_height, + stride_width, stride_height, + filter_width, filter_height, + pad_width, pad_height, + activation_min, activation_max, depth); + input_data += input_size; + output_data += output_size; + } + } +} + +void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, const OpDataPooling* data, + const TfLiteEvalTensor* input, TfLiteEvalTensor* output) { + + const int stride_height = params->stride_height; + const int stride_width = params->stride_width; + const int filter_height = params->filter_height; + const int filter_width = params->filter_width; + const int activation_min = data->activation_min; + const int activation_max = data->activation_max; + const int pad_height = data->padding.height; + const int pad_width = data->padding.width; + + const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input); + const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output); + TFLITE_DCHECK_LE(activation_min, activation_max); + 
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + const int8_t *input_data = tflite::micro::GetTensorData(input); + int8_t *output_data = tflite::micro::GetTensorData(output); + + const int input_size = input_width * input_height * depth; + const int output_size = output_width * output_height * depth; + if (depth % 4 == 0) { // S3 version only supports channels multiple of 4 + for (int batch = 0; batch < batches; ++batch) { + esp_nn_max_pool_s8(input_data, input_width, input_height, + output_data, output_width, output_height, + stride_width, stride_height, + filter_width, filter_height, + pad_width, pad_height, + activation_min, activation_max, depth); + input_data += input_size; + output_data += output_size; + } + } else { + for (int batch = 0; batch < batches; ++batch) { + esp_nn_max_pool_s8_ansi(input_data, input_width, input_height, + output_data, output_width, output_height, + stride_width, stride_height, + filter_width, filter_height, + pad_width, pad_height, + activation_min, activation_max, depth); + input_data += input_size; + output_data += output_size; + } + } +} +#endif + +TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataPooling* data = + static_cast(node->user_data); + + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kPoolingInputTensor); + TfLiteEvalTensor* output = + micro::GetEvalOutput(context, node, kPoolingOutputTensor); + + long long start_time = esp_timer_get_time(); + // Inputs and outputs share the same type, guaranteed by the converter. 
+ switch (input->type) { + case kTfLiteFloat32: + AveragePoolingEvalFloat(context, node, params, data, input, output); + break; + case kTfLiteInt8: +#if ESP_NN + AverageEvalQuantized(context, node, params, data, input, output); +#else + AveragePoolingEvalQuantized(context, node, params, data, input, output); +#endif + break; + default: + TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + pooling_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->builtin_data != nullptr); + auto* params = reinterpret_cast(node->builtin_data); + + TFLITE_DCHECK(node->user_data != nullptr); + const OpDataPooling* data = + static_cast(node->user_data); + + const TfLiteEvalTensor* input = + micro::GetEvalInput(context, node, kPoolingInputTensor); + TfLiteEvalTensor* output = + micro::GetEvalOutput(context, node, kPoolingOutputTensor); + + long long start_time = esp_timer_get_time(); + switch (input->type) { + case kTfLiteFloat32: + MaxPoolingEvalFloat(context, node, params, data, input, output); + break; + case kTfLiteInt8: +#if ESP_NN + MaxEvalQuantized(context, node, params, data, input, output); +#else + MaxPoolingEvalQuantized(context, node, params, data, input, output); +#endif + break; + default: + TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } + pooling_total_time += esp_timer_get_time() - start_time; + return kTfLiteOk; +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataPooling)); +} + +} // namespace + +TfLiteRegistration Register_AVERAGE_POOL_2D() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/PoolingPrepare, + /*invoke=*/AverageEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +TfLiteRegistration Register_MAX_POOL_2D() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/PoolingPrepare, + /*invoke=*/MaxEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc index 253769a3..d1b0f6cb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/exp.cc @@ -27,11 +27,15 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type); @@ -40,6 +44,9 @@ TfLiteStatus 
Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc index bea3ca7e..6dcba4d5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/expand_dims.cc @@ -84,22 +84,31 @@ TfLiteStatus VerifyTensorDim(TfLiteContext* context, const TfLiteTensor* input, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - const TfLiteTensor* axis; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kAxisTensor, &axis)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* axis = + micro_context->AllocateTempInputTensor(node, kAxisTensor); + TF_LITE_ENSURE(context, axis != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); output->type = input->type; if (IsDynamicTensor(axis)) { TF_LITE_KERNEL_LOG(context, "DynamicTensor is not yet supported by Expand_Dims."); return kTfLiteError; } - return VerifyTensorDim(context, input, axis, output); + TF_LITE_ENSURE_OK(context, VerifyTensorDim(context, input, axis, output)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } template diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc index 18de3458..d8a2b09d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fill.cc @@ -65,14 +65,18 @@ constexpr int kValueTensor = 1; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + // Ensure inputs and outputs exist. - const TfLiteTensor* dims; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kDimsTensor, &dims)); - const TfLiteTensor* value; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kValueTensor, &value)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* dims = + micro_context->AllocateTempInputTensor(node, kDimsTensor); + TF_LITE_ENSURE(context, dims != nullptr); + TfLiteTensor* value = + micro_context->AllocateTempInputTensor(node, kValueTensor); + TF_LITE_ENSURE(context, value != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); // The value tensor must be a scalar. 
TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0); @@ -90,6 +94,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, EnsureEq(context, output->dims, dims)); } + micro_context->DeallocateTempTfLiteTensor(dims); + micro_context->DeallocateTempTfLiteTensor(value); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc index 006296a9..d11e4969 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_div.cc @@ -31,22 +31,28 @@ constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor1, &input1)); - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor2, &input2)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); + TF_LITE_ENSURE(context, input1 != nullptr); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); + TF_LITE_ENSURE(context, input2 != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc index 42f2236c..083bd5cb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/floor_mod.cc @@ -36,22 +36,28 @@ constexpr int kOutputTensor = 0; // OLD-TODO(b/117912880): Support quantization. 
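// [Editorial note; not part of this patch.] The change repeated across these
// kernel Prepare()/CalculateOpData() functions swaps the old
// GetInput()/GetOutput()/GetInputSafe() accessors for temporary tensors obtained
// from the MicroContext, which must be released before returning. The general
// shape of the migrated code is as follows (tensor indices and variable names
// are placeholders; each kernel uses its own constants):
//
//   MicroContext* micro_context = GetMicroContext(context);
//   TfLiteTensor* input  = micro_context->AllocateTempInputTensor(node, 0);
//   TF_LITE_ENSURE(context, input != nullptr);
//   TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0);
//   TF_LITE_ENSURE(context, output != nullptr);
//   // ... type/shape checks and op-data setup using input/output ...
//   micro_context->DeallocateTempTfLiteTensor(input);
//   micro_context->DeallocateTempTfLiteTensor(output);
//   return kTfLiteOk;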
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor1, &input1)); - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputTensor2, &input2)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kInputTensor1); + TF_LITE_ENSURE(context, input1 != nullptr); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kInputTensor2); + TF_LITE_ENSURE(context, input2 != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); TF_LITE_ENSURE_TYPES_EQ(context, input1->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc index a9f35dba..c0be3814 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/fully_connected.cc @@ -35,6 +35,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TFLITE_DCHECK(node->user_data != nullptr); TFLITE_DCHECK(node->builtin_data != nullptr); @@ -42,23 +44,33 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const auto params = static_cast(node->builtin_data); - const TfLiteTensor* input = - GetInput(context, node, kFullyConnectedInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kFullyConnectedInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = - GetInput(context, node, kFullyConnectedWeightsTensor); + TfLiteTensor* filter = micro_context->AllocateTempInputTensor( + node, kFullyConnectedWeightsTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kFullyConnectedBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kFullyConnectedOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kFullyConnectedBiasTensor); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor( + node, kFullyConnectedOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); - return CalculateOpDataFullyConnected(context, params->activation, input->type, - input, filter, bias, output, data); + TF_LITE_ENSURE_OK(context, CalculateOpDataFullyConnected( + context, params->activation, input->type, + input, filter, bias, output, data)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + if (bias != nullptr) { + 
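// [Editorial note; not part of this patch.] The bias input of FULLY_CONNECTED is
// optional, so AllocateTempInputTensor() can return nullptr here; the temporary
// bias tensor is therefore only deallocated when it was actually allocated: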
micro_context->DeallocateTempTfLiteTensor(bias); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc index db050626..0b7c23f9 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather.cc @@ -97,19 +97,23 @@ TfLiteStatus Gather(const TfLiteGatherParams* params, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const auto* params = reinterpret_cast(node->builtin_data); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - const TfLiteTensor* coords; - TF_LITE_ENSURE_OK(context, - GetInputSafe(context, node, kInputPositions, &coords)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* coords = + micro_context->AllocateTempInputTensor(node, kInputPositions); + TF_LITE_ENSURE(context, coords != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + switch (coords->type) { case kTfLiteInt32: break; @@ -176,6 +180,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = axis + 1; i < input->dims->size; ++i) { output_shape->data[output_index++] = input->dims->data[i]; } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(coords); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc index 393b931f..c604ae15 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/gather_nd.cc @@ -28,16 +28,19 @@ constexpr int kOutputTensor = 0; constexpr int MAX_INDICES_ND = 5; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* params; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kParams, ¶ms)); - const TfLiteTensor* indices; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kIndices, &indices)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* params = micro_context->AllocateTempInputTensor(node, kParams); + TF_LITE_ENSURE(context, params != nullptr); + TfLiteTensor* indices = + micro_context->AllocateTempInputTensor(node, kIndices); + TF_LITE_ENSURE(context, indices != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); switch (params->type) { case kTfLiteFloat32: @@ -98,6 +101,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { 
output_shape->data[output_index++] = params->dims->data[i]; } output_shape->size = output_index; + + micro_context->DeallocateTempTfLiteTensor(params); + micro_context->DeallocateTempTfLiteTensor(indices); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc index ee32e0dc..8f846522 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/hard_swish_common.cc @@ -32,13 +32,17 @@ const int kHardSwishInputTensor = 0; const int kHardSwishOutputTensor = 0; TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TFLITE_DCHECK(node->user_data != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kHardSwishInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kHardSwishInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kHardSwishOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kHardSwishOutputTensor); TF_LITE_ENSURE(context, output != nullptr); if (input->type == kTfLiteInt8) { @@ -73,6 +77,9 @@ TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) { ¶ms->reluish_multiplier_fixedpoint_int16); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc index 2b98f117..050aeac4 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/if.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_graph.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -50,36 +51,33 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, node->inputs->size > 0); // The first input is the condition. - const TfLiteTensor* cond; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0); + + TF_LITE_ENSURE(context, cond != nullptr); TF_LITE_ENSURE_EQ(context, cond->type, kTfLiteBool); TF_LITE_ENSURE_EQ(context, NumElements(cond), 1); + micro_context->DeallocateTempTfLiteTensor(cond); + // The first input of the node is the condition. The rest of inputs are // passed to the branch subgraphs. Therefore, the number of subgraph inputs // will be the number of node inputs - 1. size_t num_inputs = node->inputs->size - 1; size_t num_outputs = node->outputs->size; - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. 
- MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); + MicroGraph& graph_info = micro_context->graph(); TF_LITE_ENSURE(context, - op_data->then_subgraph_index < graph_info->NumSubgraphs()); + op_data->then_subgraph_index < graph_info.NumSubgraphs()); TF_LITE_ENSURE(context, - op_data->else_subgraph_index < graph_info->NumSubgraphs()); + op_data->else_subgraph_index < graph_info.NumSubgraphs()); - TF_LITE_ENSURE_EQ( - context, num_inputs, - graph_info->NumSubgraphInputs(op_data->then_subgraph_index)); + TF_LITE_ENSURE_EQ(context, num_inputs, + graph_info.NumSubgraphInputs(op_data->then_subgraph_index)); TF_LITE_ENSURE_EQ( context, num_outputs, - graph_info->NumSubgraphOutputs(op_data->then_subgraph_index)); + graph_info.NumSubgraphOutputs(op_data->then_subgraph_index)); return kTfLiteOk; } @@ -87,66 +85,30 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const OpData* op_data = reinterpret_cast(node->user_data); - const TfLiteTensor* cond; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &cond)); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + TfLiteTensor* cond = micro_context->AllocateTempInputTensor(node, 0); + + TF_LITE_ENSURE(context, cond != nullptr); bool cond_value = cond->data.b[0]; + micro_context->DeallocateTempTfLiteTensor(cond); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - - // Currently we copy the input / output between the subgraphs. This isn't - // optimized yet. + MicroGraph* graph_info = µ_context->graph(); + // Currently we copy the input / output between the subgraphs. int active_branch_subgraph_index = cond_value ? op_data->then_subgraph_index : op_data->else_subgraph_index; - for (size_t i = 0; - i < graph_info->NumSubgraphInputs(active_branch_subgraph_index); ++i) { - const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, i + 1); - - TfLiteEvalTensor* subgraph_input = - graph_info->GetSubgraphInput(active_branch_subgraph_index, i); - - // These checks must occur in Eval since TfLiteEvalTensors are not available - // during Prepare. 
- size_t input_bytes; - size_t subgraph_input_bytes; - TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength(input, &input_bytes)); - TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength( - subgraph_input, &subgraph_input_bytes)); - TF_LITE_ENSURE_TYPES_EQ(context, input->type, subgraph_input->type); - TF_LITE_ENSURE_EQ(context, input_bytes, subgraph_input_bytes); - memcpy(subgraph_input->data.raw, input->data.raw, input_bytes); - } + TF_LITE_ENSURE_OK(context, + tflite::micro::CopyOpInputsToSubgraphInputs( + context, node, graph_info, active_branch_subgraph_index, + /*first_tensor_idx=*/1)); TF_LITE_ENSURE_OK(context, graph_info->InvokeSubgraph(active_branch_subgraph_index)); - for (size_t i = 0; - i < graph_info->NumSubgraphOutputs(active_branch_subgraph_index); ++i) { - const TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, i); + TF_LITE_ENSURE_OK( + context, tflite::micro::CopySubgraphOutputsToOpOutputs( + context, node, graph_info, active_branch_subgraph_index)); - TfLiteEvalTensor* subgraph_output = - graph_info->GetSubgraphOutput(active_branch_subgraph_index, i); - - // These checks must occur in Eval since TfLiteEvalTensors are not available - // during Prepare. - size_t output_bytes; - size_t subgraph_output_bytes; - TF_LITE_ENSURE_OK(context, - TfLiteEvalTensorByteLength(output, &output_bytes)); - TF_LITE_ENSURE_OK(context, TfLiteEvalTensorByteLength( - subgraph_output, &subgraph_output_bytes)); - TF_LITE_ENSURE_TYPES_EQ(context, output->type, subgraph_output->type); - TF_LITE_ENSURE_EQ(context, output_bytes, subgraph_output_bytes); - memcpy(output->data.raw, subgraph_output->data.raw, output_bytes); - } return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.cc index 7debacc2..fd84dec1 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.cc @@ -24,7 +24,6 @@ namespace tflite { namespace micro { // TODO(b/161841696): Consider moving away from global arena buffers: -constexpr int KernelRunner::kNumScratchBuffers_; constexpr int KernelRunner::kKernelRunnerBufferSize_; uint8_t KernelRunner::kKernelRunnerBuffer_[]; @@ -32,22 +31,23 @@ KernelRunner::KernelRunner(const TfLiteRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, void* builtin_data) - : allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(), + : registration_(registration), + allocator_(SimpleMemoryAllocator::Create(GetMicroErrorReporter(), kKernelRunnerBuffer_, kKernelRunnerBufferSize_)), - registration_(registration), - tensors_(tensors), - mock_micro_graph_(allocator_) { + mock_micro_graph_(allocator_), + fake_micro_context_(tensors, allocator_, &mock_micro_graph_) { // Prepare TfLiteContext: - context_.impl_ = static_cast(this); - context_.ReportError = ReportOpError; + context_.impl_ = static_cast(&fake_micro_context_); + context_.ReportError = MicroContextReportOpError; context_.recommended_num_threads = 1; - context_.GetTensor = GetTensor; - context_.GetEvalTensor = GetEvalTensor; - context_.AllocatePersistentBuffer = AllocatePersistentBuffer; - context_.RequestScratchBufferInArena = RequestScratchBufferInArena; - context_.GetScratchBuffer = GetScratchBuffer; - context_.GetExecutionPlan = GetGraph; + context_.GetTensor = MicroContextGetTensor; + context_.GetEvalTensor = 
MicroContextGetEvalTensor; + context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; + context_.RequestScratchBufferInArena = + MicroContextRequestScratchBufferInArena; + context_.GetScratchBuffer = MicroContextGetScratchBuffer; + context_.recommended_num_threads = 0; // Prepare TfLiteNode: @@ -56,14 +56,24 @@ KernelRunner::KernelRunner(const TfLiteRegistration& registration, node_.builtin_data = builtin_data; } +bool KernelRunner::ValidateTempBufferDeallocated() { + return fake_micro_context_.IsAllTempTfLiteTensorDeallocated(); +} + TfLiteStatus KernelRunner::InitAndPrepare(const char* init_data, size_t length) { if (registration_.init) { node_.user_data = registration_.init(&context_, init_data, length); } + + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); + if (registration_.prepare) { TF_LITE_ENSURE_STATUS(registration_.prepare(&context_, &node_)); } + + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); + return kTfLiteOk; } @@ -72,101 +82,11 @@ TfLiteStatus KernelRunner::Invoke() { MicroPrintf("TfLiteRegistration missing invoke function pointer!"); return kTfLiteError; } - return registration_.invoke(&context_, &node_); -} -TfLiteTensor* KernelRunner::GetTensor(const struct TfLiteContext* context, - int tensor_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); + TF_LITE_ENSURE_STATUS(registration_.invoke(&context_, &node_)); - return &runner->tensors_[tensor_index]; -} + TF_LITE_ENSURE(&context_, ValidateTempBufferDeallocated()); -TfLiteEvalTensor* KernelRunner::GetEvalTensor( - const struct TfLiteContext* context, int tensor_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - TfLiteEvalTensor* eval_tensor = - reinterpret_cast(runner->allocator_->AllocateTemp( - sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor))); - TFLITE_DCHECK(eval_tensor != nullptr); - - // In unit tests, the TfLiteTensor pointer contains the source of truth for - // buffers and values: - eval_tensor->data = runner->tensors_[tensor_index].data; - eval_tensor->dims = runner->tensors_[tensor_index].dims; - eval_tensor->type = runner->tensors_[tensor_index].type; - return eval_tensor; -} - -void* KernelRunner::AllocatePersistentBuffer(TfLiteContext* context, - size_t bytes) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - return runner->allocator_->AllocateFromTail(bytes, - MicroArenaBufferAlignment()); -} - -TfLiteStatus KernelRunner::RequestScratchBufferInArena(TfLiteContext* context, - size_t bytes, - int* buffer_index) { - TFLITE_DCHECK(context != nullptr); - TFLITE_DCHECK(buffer_index != nullptr); - - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - if (runner->scratch_buffer_count_ == kNumScratchBuffers_) { - MicroPrintf("Exceeded the maximum number of scratch tensors allowed (%d).", - kNumScratchBuffers_); - return kTfLiteError; - } - - // For tests, we allocate scratch buffers from the tail and keep them around - // for the lifetime of model. This means that the arena size in the tests will - // be more than what we would have if the scratch buffers could share memory. 
- runner->scratch_buffers_[runner->scratch_buffer_count_] = - runner->allocator_->AllocateFromTail(bytes, MicroArenaBufferAlignment()); - TFLITE_DCHECK(runner->scratch_buffers_[runner->scratch_buffer_count_] != - nullptr); - - *buffer_index = runner->scratch_buffer_count_++; - return kTfLiteOk; -} - -void* KernelRunner::GetScratchBuffer(TfLiteContext* context, int buffer_index) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - - TFLITE_DCHECK(runner->scratch_buffer_count_ <= kNumScratchBuffers_); - if (buffer_index >= runner->scratch_buffer_count_) { - return nullptr; - } - return runner->scratch_buffers_[buffer_index]; -} - -void KernelRunner::ReportOpError(struct TfLiteContext* context, - const char* format, ...) { - va_list args; - va_start(args, format); - GetMicroErrorReporter()->Report(format, args); - va_end(args); -} - -TfLiteStatus KernelRunner::GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args) { - TFLITE_DCHECK(context != nullptr); - KernelRunner* runner = reinterpret_cast(context->impl_); - TFLITE_DCHECK(runner != nullptr); - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - *args = reinterpret_cast(runner->GetMockGraph()); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h index 03919f85..9dddde50 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_runner.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/fake_micro_context.h" #include "tensorflow/lite/micro/mock_micro_graph.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" @@ -50,40 +51,22 @@ class KernelRunner { // to stub out MicroGraph methods and track invocations on each subgraph. MockMicroGraph* GetMockGraph() { return &mock_micro_graph_; } - protected: - static TfLiteTensor* GetTensor(const struct TfLiteContext* context, - int tensor_index); - static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context, - int tensor_index); - static void* AllocatePersistentBuffer(TfLiteContext* context, size_t bytes); - static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* context, - size_t bytes, - int* buffer_index); - static void* GetScratchBuffer(TfLiteContext* context, int buffer_index); - static void ReportOpError(struct TfLiteContext* context, const char* format, - ...); - // This method matches GetExecutionPlan from TfLiteContext since TFLM reuses - // this method to get the MicroGraph from an operator context. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - static TfLiteStatus GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args); + // Returns true if all temp buffer in tests are deallocated. + // TODO(b/209453859): move this function to private after deallocation checks + // are enabled for all kernel tests. 
+ bool ValidateTempBufferDeallocated(); private: - static constexpr int kNumScratchBuffers_ = 12; - static constexpr int kKernelRunnerBufferSize_ = 10000; static uint8_t kKernelRunnerBuffer_[kKernelRunnerBufferSize_]; - SimpleMemoryAllocator* allocator_ = nullptr; - const TfLiteRegistration& registration_; - TfLiteTensor* tensors_ = nullptr; - MockMicroGraph mock_micro_graph_; - TfLiteContext context_ = {}; TfLiteNode node_ = {}; + const TfLiteRegistration& registration_; - int scratch_buffer_count_ = 0; - uint8_t* scratch_buffers_[kNumScratchBuffers_]; + SimpleMemoryAllocator* allocator_; + MockMicroGraph mock_micro_graph_; + FakeMicroContext fake_micro_context_; }; } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc index 73069064..14664b91 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/kernel_util.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { namespace micro { @@ -119,13 +120,83 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context, return kTfLiteOk; } -// Returns a blob of payload data. The payload is subjected to interpretation by -// the OP. This is the recommended API for an OP to get an external context. OP -// should use this instead of directly calling GetExternalContext function in -// context. -void* GetExternalContext(TfLiteContext* context) { - return reinterpret_cast( - context->GetExternalContext(context, kTfLiteMaxExternalContexts)); +// Verify that both tensors have the same type and size, then return the size +// of both tensors in bytes if they are the same, or -1 if they are different. +size_t ValidateAndGetTensorSizes(const TfLiteEvalTensor* tensor1, + const TfLiteEvalTensor* tensor2) { + TFLITE_DCHECK(tensor1->type == tensor2->type); + size_t tensor1_size = 0; + size_t tensor2_size = 0; + TfLiteEvalTensorByteLength(tensor1, &tensor1_size); + TfLiteEvalTensorByteLength(tensor2, &tensor2_size); + return (tensor1_size == tensor2_size) ? 
tensor1_size : -1; +} + +TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, node->inputs->size == node->outputs->size); + for (int i = 0; i < node->inputs->size; i++) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + int bytes = ValidateAndGetTensorSizes(input, output); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(output->data.raw, input->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx, + int first_tensor_idx) { + TF_LITE_ENSURE(context, + static_cast(node->inputs->size - first_tensor_idx) == + graph_info->NumSubgraphInputs(subgraph_idx)); + for (int i = 0; i < node->inputs->size - first_tensor_idx; i++) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i + first_tensor_idx); + TfLiteEvalTensor* subgraph_input = + graph_info->GetSubgraphInput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(input, subgraph_input); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(subgraph_input->data.raw, input->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx) { + TF_LITE_ENSURE(context, static_cast(node->outputs->size) == + graph_info->NumSubgraphInputs(subgraph_idx)); + for (int i = 0; i < node->outputs->size; i++) { + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + TfLiteEvalTensor* subgraph_input = + graph_info->GetSubgraphInput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(output, subgraph_input); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(subgraph_input->data.raw, output->data.raw, bytes); + } + return kTfLiteOk; +} + +TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx) { + TF_LITE_ENSURE(context, static_cast(node->outputs->size) == + graph_info->NumSubgraphOutputs(subgraph_idx)); + for (int i = 0; i < node->outputs->size; i++) { + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, i); + TfLiteEvalTensor* subgraph_output = + graph_info->GetSubgraphOutput(subgraph_idx, i); + int bytes = ValidateAndGetTensorSizes(output, subgraph_output); + TF_LITE_ENSURE(context, bytes >= 0); + memcpy(output->data.raw, subgraph_output->data.raw, bytes); + } + return kTfLiteOk; } } // namespace micro diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h index 1bd266d1..6c5d7f18 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/kernel_util.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/micro/micro_context.h" namespace tflite { namespace micro { @@ -69,23 +70,33 @@ TfLiteStatus CreateWritableTensorDimsWithCopy(TfLiteContext* context, TfLiteTensor* tensor, TfLiteEvalTensor* eval_tensor); -// Returns a blob of payload data. The payload is subjected to interpretation by -// the OP. 
This is the recommended API for an OP to get an external context. OP -// should use this instead of directly calling GetExternalContext function in -// context. Example usage: -// -// An application can set an external context through interpreter as below -// interpreter->SetMicroExternalContext(pointer_to_your_payload); -// -// Inside an OP that needs this payload, it get the payload pointer by: -// Prepare(TfliteContext * context) { -// ... -// payload_ptr = -// reinterpret_cast(GetMicroExternalContext(context)) -// ... -// } -// -void* GetMicroExternalContext(TfLiteContext* context); +// Copy all op input tensors to op output tensors. Requires all op input tensor +// shapes and types to be identical to op output tensor shapes and types. +TfLiteStatus CopyOpInputsToOpOutputs(TfLiteContext* context, TfLiteNode* node); + +// Copy all op input tensors to subgraph input tensors. Requires all op input +// tensor shapes and types to be identical to subgraph input tensor shapes and +// types. +TfLiteStatus CopyOpInputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx, + int first_tensor_idx); + +// Copy all op output tensors to subgraph input tensors. Requires all op output +// tensor shapes and types to be identical to subgraph input tensor shapes and +// types. +TfLiteStatus CopyOpOutputsToSubgraphInputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx); + +// Copy all subgraph output tensors to op outputs. Requires all subgraph output +// tensor shapes and types to be identical to op output tensor shapes and types. +TfLiteStatus CopySubgraphOutputsToOpOutputs(TfLiteContext* context, + TfLiteNode* node, + MicroGraph* graph_info, + int subgraph_idx); } // namespace micro } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc index 926f1ffb..250cd3be 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2_pool_2d.cc @@ -36,15 +36,18 @@ constexpr int kTensorShapeRank = 4; enum { kBatchRank = 0, kHeightRank, kWidthRank, kChannelRank }; TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + auto* params = static_cast(node->builtin_data); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), kTensorShapeRank); TF_LITE_ENSURE_EQ(context, NumDimensions(output), kTensorShapeRank); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -82,6 +85,9 @@ TfLiteStatus L2Prepare(TfLiteContext* context, TfLiteNode* node) { output->dims->data[kWidthRank] = out_width; output->dims->data[kChannelRank] = channels_out; + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; } diff --git 
a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc index 930710d4..289e4de5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/l2norm.cc @@ -49,11 +49,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE(context, output != nullptr); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) <= 4); TF_LITE_ENSURE(context, @@ -69,6 +72,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Our implementations don't currently support activations. TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc index 21cc99fc..3d1ffebb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/leaky_relu_common.cc @@ -30,13 +30,16 @@ const int kOutputTensor = 0; TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); if (output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { @@ -62,6 +65,9 @@ TfLiteStatus CalculateOpDataLeakyRelu(TfLiteContext* context, data->output_shift_identity = static_cast(output_shift_identity); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc index 5443c914..0af74def 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/log_softmax.cc @@ -43,13 +43,16 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = 
GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE(context, HaveSameShapes(input, output)); @@ -89,6 +92,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node) { data->depth = static_cast(input_shape.Dims(trailing_dim)); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc index 05765ad4..a79fd6bb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/logistic_common.cc @@ -32,9 +32,13 @@ const int kLogisticOutputTensor = 0; TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context, TfLiteNode* node, OpDataLogistic* data) { - const TfLiteTensor* input = GetInput(context, node, kLogisticInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kLogisticInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kLogisticOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kLogisticOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -55,6 +59,53 @@ TfLiteStatus CalculateArithmeticOpDataLogistic(TfLiteContext* context, data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31); } + + if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // See comments in TanhPrepare about requiring zero_point==0 + // and a power-of-two ("POT") scale. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + bool param_scale_pot = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + param_scale_pot &= (data->input_left_shift == 0); + + if (param_scale_pot) { + data->input_multiplier = 0; + } else { + // Calculate multiplier to change input scale to 1/(3*4096) + // as required by the table lookup. 
+ // In this scaling +/-2^17 represents +/-10.7 + double multiplier = + static_cast(input->params.scale) * 4096.0 * 3.0; + + data->input_left_shift = 0; + + while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) { + data->input_left_shift++; + multiplier = multiplier * 2.0; + } + + data->input_multiplier = static_cast(multiplier); + } + + int output_scale_log2_rounded; + TF_LITE_ENSURE( + context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); + TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, + -kOutputFractionalBits); + } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h index 0fac51b7..17064fee 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/micro_ops.h @@ -36,6 +36,8 @@ TfLiteRegistration Register_ADD_N(); TfLiteRegistration Register_ASSIGN_VARIABLE(); TfLiteRegistration Register_AVERAGE_POOL_2D(); TfLiteRegistration Register_BATCH_TO_SPACE_ND(); +TfLiteRegistration Register_BROADCAST_ARGS(); +TfLiteRegistration Register_BROADCAST_TO(); TfLiteRegistration Register_CALL_ONCE(); TfLiteRegistration Register_CAST(); // TODO(b/160234179): Change custom OPs to also return by value. @@ -62,6 +64,7 @@ TfLiteRegistration Register_LOGICAL_AND(); TfLiteRegistration Register_LOGICAL_OR(); TfLiteRegistration Register_LOGISTIC(); TfLiteRegistration Register_MAX_POOL_2D(); +TfLiteRegistration Register_MIRROR_PAD(); TfLiteRegistration Register_PRELU(); TfLiteRegistration Register_MUL(); TfLiteRegistration Register_QUANTIZE(); @@ -79,6 +82,7 @@ TfLiteRegistration Register_SVDF(); TfLiteRegistration Register_TRANSPOSE(); TfLiteRegistration Register_TRANSPOSE_CONV(); TfLiteRegistration Register_VAR_HANDLE(); +TfLiteRegistration Register_WHILE(); TfLiteRegistration Register_ZEROS_LIKE(); namespace ops { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc new file mode 100644 index 00000000..a19561f6 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mirror_pad.cc @@ -0,0 +1,222 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" + +namespace tflite { +namespace { + +struct OpDataMirrorPad { + int input_dims; + int output_size; + int offset; + int output_dims_num_elements_buffer_index; + int input_dims_num_elements_buffer_index; +}; + +// Helper method that fills the left and right pads. 
+template +inline void GetPadding(const T* data, int offset, int64_t* left_pad, + int64_t* right_pad) { + *left_pad = static_cast(*(data + offset * 2)); + *right_pad = static_cast(*(data + offset * 2 + 1)); +} + +// Given dimension index and the left/right padding. +// Returns the corresponding dimension in the input array. +inline int GetInputDimension(int padded_dimension, int left_pad, int right_pad, + int input_dim_size, int offset) { + if (padded_dimension < left_pad) { + const int original_ind = left_pad + offset - 1; + return original_ind - (std::min(padded_dimension, original_ind - offset)); + } + padded_dimension -= left_pad; + if (padded_dimension >= input_dim_size) { + padded_dimension -= input_dim_size; + const int original_ind = input_dim_size - (1 + offset); + return original_ind - std::min(padded_dimension, original_ind); + } + return padded_dimension; +} + +// Given and index in output array, returns the index of the value +// in input array. +int GetFlatIndex(int index, int num_dims, + const TfLiteEvalTensor* padding_matrix, + const TfLiteIntArray* input_dims, + int* output_dims_num_elements, int* input_dims_num_elements, + const int offset) { + int flat_index = 0; + int64_t left_pad = 0, right_pad = 0, dimension_index, index_in_input; + + for (int i = 0; i < num_dims; ++i) { + switch (padding_matrix->type) { + case kTfLiteInt32: + GetPadding(padding_matrix->data.i32, i, &left_pad, &right_pad); + break; + case kTfLiteInt64: + GetPadding(padding_matrix->data.i64, i, &left_pad, &right_pad); + break; + default: + break; + } + dimension_index = index / output_dims_num_elements[i]; + + index_in_input = GetInputDimension(dimension_index, left_pad, right_pad, + input_dims->data[i], offset); + + flat_index += index_in_input * (input_dims_num_elements)[i]; + index %= output_dims_num_elements[i]; + } + + return flat_index; +} + +template +void MirrorPad(const TfLiteEvalTensor* padding_matrix, + const TfLiteIntArray* input_dims, int* output_dims_num_elements, + int* input_dims_num_elements, const T* input_data, + T* output_data, const int offset, const int num_dims, + const int output_size) { + for (int i = 0; i < output_size; ++i) { + output_data[i] = input_data[GetFlatIndex( + i, num_dims, padding_matrix, input_dims, output_dims_num_elements, + input_dims_num_elements, offset)]; + } +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + TfLiteStatus status = kTfLiteOk; + const OpDataMirrorPad* data = + static_cast(node->user_data); + + const TfLiteEvalTensor* input_tensor = + tflite::micro::GetEvalInput(context, node, 0); + const TfLiteEvalTensor* padding_matrix = + tflite::micro::GetEvalInput(context, node, 1); + + TfLiteEvalTensor* output_tensor = + tflite::micro::GetEvalOutput(context, node, 0); + const int input_dims = data->input_dims; + const int output_size = data->output_size; + + int* input_dims_num_elements = (int*)context->GetScratchBuffer( + context, data->input_dims_num_elements_buffer_index); + int* output_dims_num_elements = (int*)context->GetScratchBuffer( + context, data->output_dims_num_elements_buffer_index); + + for (int i = 0; i < input_dims; i++) { + output_dims_num_elements[i] = 1; + input_dims_num_elements[i] = 1; + } + + for (int i = input_dims - 2; i >= 0; i--) { + output_dims_num_elements[i] = + output_dims_num_elements[i + 1] * output_tensor->dims->data[i + 1]; + + input_dims_num_elements[i] = + input_dims_num_elements[i + 1] * input_tensor->dims->data[i + 1]; + } + + switch 
(output_tensor->type) { + case kTfLiteFloat32: { + MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements, + input_dims_num_elements, + tflite::micro::GetTensorData(input_tensor), + tflite::micro::GetTensorData(output_tensor), + data->offset, input_dims, output_size); + break; + } + case kTfLiteInt8: { + MirrorPad(padding_matrix, input_tensor->dims, output_dims_num_elements, + input_dims_num_elements, + tflite::micro::GetTensorData(input_tensor), + tflite::micro::GetTensorData(output_tensor), + data->offset, input_dims, output_size); + break; + } + default: + status = kTfLiteError; + break; + } + +#undef TF_LITE_MIRROR_PAD + + return status; +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpDataMirrorPad)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TFLITE_DCHECK(node->user_data != nullptr); + OpDataMirrorPad* data = static_cast(node->user_data); + + TfLiteTensor* input_tensor = micro_context->AllocateTempInputTensor(node, 0); + TfLiteTensor* padding_matrix = + micro_context->AllocateTempInputTensor(node, 1); + TfLiteTensor* output_tensor = + micro_context->AllocateTempOutputTensor(node, 0); + + TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0), + NumDimensions(input_tensor)); + auto* params = + reinterpret_cast(node->builtin_data); + if (params == nullptr) { + return kTfLiteError; + } + + data->offset = + params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0 + : 1; + data->input_dims = NumDimensions(input_tensor); + data->output_size = NumElements(output_tensor); + + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, data->input_dims * sizeof(int), + &data->output_dims_num_elements_buffer_index)); + TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena( + context, data->input_dims * sizeof(int), + &data->input_dims_num_elements_buffer_index)); + + micro_context->DeallocateTempTfLiteTensor(input_tensor); + micro_context->DeallocateTempTfLiteTensor(padding_matrix); + micro_context->DeallocateTempTfLiteTensor(output_tensor); + return kTfLiteOk; +} + +} // namespace + +TfLiteRegistration Register_MIRROR_PAD() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/Prepare, + /*invoke=*/Eval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc index 86ab90aa..6d19ac7a 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/mul_common.cc @@ -37,11 +37,16 @@ void* MulInit(TfLiteContext* context, const char* buffer, size_t length) { TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node, TfLiteMulParams* params, OpDataMul* data) { - const TfLiteTensor* input1 = GetInput(context, node, kMulInput1Tensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kMulInput1Tensor); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kMulInput2Tensor); + TfLiteTensor* 
input2 = + micro_context->AllocateTempInputTensor(node, kMulInput2Tensor); TF_LITE_ENSURE(context, input2 != nullptr); - TfLiteTensor* output = GetOutput(context, node, kMulOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kMulOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); @@ -72,6 +77,9 @@ TfLiteStatus CalculateOpDataMul(TfLiteContext* context, TfLiteNode* node, &data->output_activation_max_f32); } + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc index e038de0b..1428b16e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pad.cc @@ -43,19 +43,26 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TFLITE_DCHECK(node->user_data != nullptr); OpData* data = static_cast(node->user_data); TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, /*index=*/0); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, /*index=*/0); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* paddings = GetInput(context, node, /*index=*/1); + TfLiteTensor* paddings = + micro_context->AllocateTempInputTensor(node, /*index=*/1); TF_LITE_ENSURE(context, paddings != nullptr); - const TfLiteTensor* constant_values = - NumInputs(node) == 3 ? GetInput(context, node, /*index=*/2) : nullptr; - TfLiteTensor* output = GetOutput(context, node, /*index=*/0); + TfLiteTensor* constant_values = + NumInputs(node) == 3 + ? 
micro_context->AllocateTempInputTensor(node, /*index=*/2) + : nullptr; + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, /*index=*/0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, input->type, output->type); @@ -122,6 +129,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->output_zero_point = output->params.zero_point; } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(paddings); + if (constant_values != nullptr) { + micro_context->DeallocateTempTfLiteTensor(constant_values); + } + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc index fa693240..ddc18f0b 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/pooling_common.cc @@ -54,9 +54,13 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); OpDataPooling* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kPoolingInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kPoolingInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kPoolingOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kPoolingOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_STATUS( @@ -71,6 +75,9 @@ TfLiteStatus PoolingPrepare(TfLiteContext* context, TfLiteNode* node) { &data->activation_max); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc index 8b840fcb..1a89cadf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/prelu_common.cc @@ -84,14 +84,22 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); PreluParams* params = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* alpha = GetInput(context, node, 1); + TfLiteTensor* alpha = micro_context->AllocateTempInputTensor(node, 1); TF_LITE_ENSURE(context, alpha != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); - return CalculatePreluParams(input, alpha, output, params); + TF_LITE_ENSURE_OK(context, + CalculatePreluParams(input, alpha, output, params)); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(alpha); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc 
b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc index 459b0966..cca3489d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/quantize_common.cc @@ -36,9 +36,11 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); // TODO(b/128934713): Add support for fixed-point per-channel quantization. @@ -77,6 +79,9 @@ TfLiteStatus PrepareQuantizeReference(TfLiteContext* context, data->quantization_params.scale = static_cast(output->params.scale); data->input_zero_point = input->params.zero_point; + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc index 024b511a..f9124f04 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/read_variable.cc @@ -39,13 +39,17 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(NumInputs(node) == 1); TFLITE_DCHECK(NumOutputs(node) == 1); - const TfLiteTensor* input_resource_id_tensor = - GetInput(context, node, kInputVariableId); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input_resource_id_tensor = + micro_context->AllocateTempInputTensor(node, kInputVariableId); TFLITE_DCHECK(input_resource_id_tensor != nullptr); TFLITE_DCHECK(input_resource_id_tensor->type == kTfLiteResource); TFLITE_DCHECK(NumElements(input_resource_id_tensor) == 1); + micro_context->DeallocateTempTfLiteTensor(input_resource_id_tensor); + return kTfLiteOk; } @@ -58,14 +62,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetEvalOutput(context, node, kOutputValue); TFLITE_DCHECK(output_value != nullptr); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); if (resources == nullptr) { MicroPrintf( "READ_VARIABLE requires resource variables. 
Please create " diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc index 6339c98b..40aed2d5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reduce.cc @@ -50,10 +50,12 @@ void* InitReduce(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + // Inputs Tensor (dtype depends on quantization): // [0] = Input // [1] = Axis - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); // Outputs Tensor (dtype depends on quantization): // [0] = Output @@ -63,28 +65,31 @@ TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); // Validate axis type - const TfLiteTensor* axis = GetInput(context, node, 1); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1); TF_LITE_ENSURE(context, axis != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, axis->type, kTfLiteInt32); if (input->type == kTfLiteInt8) { OpData* data = static_cast(node->user_data); - const TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); const double real_multiplier = static_cast(input->params.scale) / static_cast(output->params.scale); QuantizeMultiplier(real_multiplier, &data->multiplier, &data->shift); + micro_context->DeallocateTempTfLiteTensor(output); } - + micro_context->DeallocateTempTfLiteTensor(axis); + micro_context->DeallocateTempTfLiteTensor(input); return kTfLiteOk; } TfLiteStatus PrepareMax(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, PrepareSimple(context, node)); + MicroContext* micro_context = GetMicroContext(context); OpData* op_data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, 0); - const TfLiteTensor* output = GetOutput(context, node, 0); - const TfLiteTensor* axis = GetInput(context, node, 1); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 1); op_data->input_scale = input->params.scale; op_data->output_scale = output->params.scale; @@ -96,13 +101,17 @@ TfLiteStatus PrepareMax(TfLiteContext* context, TfLiteNode* node) { context, sizeof(int) * static_cast(ElementCount(*axis->dims)), &op_data->resolved_axis_idx); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(axis); return kTfLiteOk; } TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); OpData* op_data = reinterpret_cast(node->user_data); - const TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); if (input->type == kTfLiteInt8 || input->type == kTfLiteInt16) { const double real_multiplier = static_cast(input->params.scale) / static_cast(output->params.scale); @@ -121,6 +130,8 @@ TfLiteStatus 
PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, PrepareSimple(context, node)); // TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018) + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc index 8e47e2a0..d14ed82e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/reshape.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -31,9 +33,13 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); // Tensorflow's Reshape allows one of the shape components to have the // special -1 value, meaning it will be calculated automatically based on the @@ -68,6 +74,9 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -93,9 +102,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Do nothing for in-place reshape. if (input->data.raw != output->data.raw) { // Otherwise perform reshape with copy. 
- for (size_t i = 0; i < input_bytes; ++i) { - output->data.raw[i] = input->data.raw[i]; - } + memcpy(output->data.raw, input->data.raw, input_bytes); } return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc index 465bf516..55c23846 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_bilinear.cc @@ -30,12 +30,17 @@ constexpr int kSizeTensor = 1; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* size = GetInput(context, node, kSizeTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* size = + micro_context->AllocateTempInputTensor(node, kSizeTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1); @@ -55,6 +60,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc index 63c302bb..a02159af 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/resize_nearest_neighbor.cc @@ -33,12 +33,17 @@ constexpr int kSizeTensor = 1; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* size = GetInput(context, node, kSizeTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* size = + micro_context->AllocateTempInputTensor(node, kSizeTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); // Our current implementations rely on the input being 4D, // and the size being 1D tensor with exactly 2 elements. 
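The kernel changes above (MUL, PAD, pooling, PRELU, quantize, reduce, reshape, resize) all apply the same refactoring: Prepare() no longer calls GetInput()/GetOutput() but borrows temporary TfLiteTensor views from the MicroContext and hands them back before returning. A minimal sketch of that pattern for a hypothetical one-input/one-output kernel; only the MicroContext/TF_LITE_ENSURE calls are taken from the hunks above, everything else is illustrative:

```cpp
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_context.h"

namespace tflite {
namespace {

// Hypothetical kernel Prepare() showing the temp-tensor pattern: every
// AllocateTemp*Tensor() is paired with a DeallocateTempTfLiteTensor()
// before the function returns.
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  MicroContext* micro_context = GetMicroContext(context);

  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, input != nullptr);
  TfLiteTensor* output =
      micro_context->AllocateTempOutputTensor(node, /*index=*/0);
  TF_LITE_ENSURE(context, output != nullptr);

  // Shape/type validation and OpData initialization happen here, while the
  // full TfLiteTensor structs (dims, quantization params) are available.
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

  // Release the temporaries; Eval() later works on TfLiteEvalTensor instead.
  micro_context->DeallocateTempTfLiteTensor(input);
  micro_context->DeallocateTempTfLiteTensor(output);
  return kTfLiteOk;
}

}  // namespace
}  // namespace tflite
```

Keeping the full TfLiteTensor only inside Prepare() lets the interpreter reclaim that metadata; at invoke time the kernels in this diff read tensors through tflite::micro::GetEvalInput()/GetEvalOutput().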
@@ -53,6 +58,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_KERNEL_LOG(context, "Dynamic tensors are unsupported in tfmicro."); return kTfLiteError; } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc index 5804016b..76d8e6bf 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/round.cc @@ -29,9 +29,13 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -42,6 +46,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { for (int i = 0; i < output->dims->size; ++i) { TF_LITE_ENSURE_EQ(context, output->dims->data[i], input->dims->data[i]); } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc index 51ee70de..40d9fdd7 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/slice.cc @@ -45,16 +45,22 @@ void GetBeginAndSizeVectors(int dimensions, const TfLiteEvalTensor* begin, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TFLITE_DCHECK(input != nullptr); - const TfLiteTensor* begin = GetInput(context, node, kBeginTensor); + TfLiteTensor* begin = + micro_context->AllocateTempInputTensor(node, kBeginTensor); TFLITE_DCHECK(begin != nullptr); - const TfLiteTensor* size = GetInput(context, node, kSizeTensor); + TfLiteTensor* size = + micro_context->AllocateTempInputTensor(node, kSizeTensor); TFLITE_DCHECK(size != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TFLITE_DCHECK(output != nullptr); // Ensure validity of input tensor and its dimension. 
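MIRROR_PAD and the reduce kernels above also plan their working memory through the scratch-buffer protocol: Prepare() asks the arena for a buffer and stores the returned index, and Eval() resolves that index into a pointer that is only valid for the current invocation. A short sketch with a hypothetical OpData layout; RequestScratchBufferInArena() and GetScratchBuffer() are the TfLiteContext hooks used in the diff, the rest is illustrative:

```cpp
#include "tensorflow/lite/c/common.h"

// Hypothetical op data holding the scratch buffer handle returned by the
// arena planner (an index, not a pointer).
struct OpDataExample {
  int scratch_buffer_index;
  int num_elements;
};

// Prepare(): request the buffer. Only the size is recorded here; the handle
// stays valid across invocations.
TfLiteStatus PrepareExample(TfLiteContext* context, TfLiteNode* node) {
  OpDataExample* data = static_cast<OpDataExample*>(node->user_data);
  TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
      context, data->num_elements * sizeof(int), &data->scratch_buffer_index));
  return kTfLiteOk;
}

// Eval(): resolve the handle into a pointer for this invocation only.
TfLiteStatus EvalExample(TfLiteContext* context, TfLiteNode* node) {
  OpDataExample* data = static_cast<OpDataExample*>(node->user_data);
  int* scratch = static_cast<int*>(
      context->GetScratchBuffer(context, data->scratch_buffer_index));
  for (int i = 0; i < data->num_elements; ++i) {
    scratch[i] = 0;  // use the scratch area as per-invocation working memory
  }
  return kTfLiteOk;
}
```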
@@ -66,6 +72,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(NumDimensions(size) == 1); TFLITE_DCHECK(NumElements(begin) == NumElements(size)); TFLITE_DCHECK(NumDimensions(input) <= kMaxDim); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(begin); + micro_context->DeallocateTempTfLiteTensor(size); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc index 5521b543..d93f5f26 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/softmax_common.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/softmax.h" +#include "tensorflow/lite/micro/micro_context.h" namespace tflite { @@ -90,12 +91,14 @@ void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, input != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE(context, node->user_data != nullptr); @@ -136,7 +139,12 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { } auto* params = static_cast(node->builtin_data); - return CalculateSoftmaxParams(context, input, output, params, op_data); + auto ret_val = + CalculateSoftmaxParams(context, input, output, params, op_data); + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return ret_val; } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc index fdfb81bc..4e01becb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_batch_nd.cc @@ -44,11 +44,15 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, input != nullptr && output != nullptr); TF_LITE_ENSURE(context, NumDimensions(input) >= kInputOutputMinDimensionNum); @@ -57,6 +61,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { 
TF_LITE_ENSURE(context, NumDimensions(output) <= kInputOutputMaxDimensionNum); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc index 7bd86bad..9c0cc445 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/space_to_depth.cc @@ -39,11 +39,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); @@ -75,6 +78,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output->dims->data[kDepthRank] = input->dims->data[kDepthRank] * block_size * block_size; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc index 82f17228..5d90d983 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split.cc @@ -69,7 +69,8 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node, } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* axis = GetInput(context, node, 0); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 0); TF_LITE_ENSURE(context, axis != nullptr); // Dynamic output tensors are needed if axis tensor is not constant. @@ -77,6 +78,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // constant axis tensor for now. TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis), "Non constant axis tensor not supported"); + + micro_context->DeallocateTempTfLiteTensor(axis); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc index c2a01149..c1c41124 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/split_v.cc @@ -74,13 +74,14 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node, TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + MicroContext* micro_context = GetMicroContext(context); // Dynamic output tensors are needed if axis tensor is not constant. // But Micro doesn't support dynamic memory allocation, so we only support // constant axis tensor for now. 
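The SQUEEZE, STRIDED_SLICE and TRANSPOSE hunks that follow convert their per-op helper structs to an RAII shape: the constructor allocates the temporary tensors from the MicroContext and a newly added destructor releases them, so every exit path out of Prepare() still returns the temporaries. A sketch of that shape for a hypothetical two-tensor op, assuming the same headers as the earlier sketch:

```cpp
// Hypothetical RAII helper mirroring the converted *Context structs below:
// temporaries are acquired in the constructor and always released in the
// destructor, regardless of how Prepare() exits.
struct ExampleOpContext {
  ExampleOpContext(TfLiteContext* context, TfLiteNode* node) {
    micro_context = GetMicroContext(context);
    input = micro_context->AllocateTempInputTensor(node, /*index=*/0);
    output = micro_context->AllocateTempOutputTensor(node, /*index=*/0);
  }
  ~ExampleOpContext() {
    micro_context->DeallocateTempTfLiteTensor(input);
    micro_context->DeallocateTempTfLiteTensor(output);
  }
  MicroContext* micro_context;
  TfLiteTensor* input;
  TfLiteTensor* output;
};

// Usage inside Prepare(): the destructor runs on every return path, so the
// early-return TF_LITE_ENSURE checks need no explicit cleanup.
TfLiteStatus PrepareExample(TfLiteContext* context, TfLiteNode* node) {
  ExampleOpContext op_context(context, node);
  TF_LITE_ENSURE(context, op_context.input != nullptr);
  TF_LITE_ENSURE(context, op_context.output != nullptr);
  TF_LITE_ENSURE_TYPES_EQ(context, op_context.input->type,
                          op_context.output->type);
  return kTfLiteOk;
}
```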
- const TfLiteTensor* axis = GetInput(context, node, 2); + TfLiteTensor* axis = micro_context->AllocateTempInputTensor(node, 2); TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis), "Non constant axis tensor not supported"); - + micro_context->DeallocateTempTfLiteTensor(axis); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc index 522c2d0e..0eb767db 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/squeeze.cc @@ -27,12 +27,19 @@ namespace tflite { namespace { struct SqueezeContext { - SqueezeContext(TfLiteContext* context, TfLiteNode* node) - : params(reinterpret_cast(node->builtin_data)), - input(GetInput(context, node, 0)), - output(GetOutput(context, node, 0)) {} + SqueezeContext(TfLiteContext* context, TfLiteNode* node) { + params = reinterpret_cast(node->builtin_data); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, 0); + output = micro_context->AllocateTempOutputTensor(node, 0); + } + ~SqueezeContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); + } + MicroContext* micro_context; TfLiteSqueezeParams* params; - const TfLiteTensor* const input; + TfLiteTensor* input; TfLiteTensor* output; }; @@ -80,18 +87,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - SqueezeContext op_context(context, node); + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); - if (op_context.input->type == kTfLiteString) { + if (input->type == kTfLiteString) { TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(op_context.input->type), - op_context.input->type); + TfLiteTypeGetName(input->type), input->type); return kTfLiteError; } - TF_LITE_ENSURE_EQ(context, op_context.input->bytes, op_context.output->bytes); - memcpy(op_context.output->data.raw, op_context.input->data.raw, - op_context.input->bytes); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + size_t input_byte_size; + size_t output_byte_size; + TF_LITE_ENSURE_OK(context, + TfLiteEvalTensorByteLength(input, &input_byte_size)); + TF_LITE_ENSURE_OK(context, + TfLiteEvalTensorByteLength(output, &output_byte_size)); + + TF_LITE_ENSURE_EQ(context, input_byte_size, output_byte_size); + memcpy(output->data.raw, input->data.raw, input_byte_size); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc index 58582bbf..d5b73b8f 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/strided_slice.cc @@ -38,18 +38,27 @@ constexpr int kOutputTensor = 0; struct StridedSliceContext { StridedSliceContext(TfLiteContext* context, TfLiteNode* node) { params = reinterpret_cast(node->builtin_data); - input = GetInput(context, node, kInputTensor); - begin = GetInput(context, node, kBeginTensor); - end = GetInput(context, node, kEndTensor); - strides = GetInput(context, node, kStridesTensor); - output = GetOutput(context, node, kOutputTensor); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, kInputTensor); + begin = 
micro_context->AllocateTempInputTensor(node, kBeginTensor); + end = micro_context->AllocateTempInputTensor(node, kEndTensor); + strides = micro_context->AllocateTempInputTensor(node, kStridesTensor); + output = micro_context->AllocateTempOutputTensor(node, kOutputTensor); dims = NumDimensions(input); } + ~StridedSliceContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(begin); + micro_context->DeallocateTempTfLiteTensor(end); + micro_context->DeallocateTempTfLiteTensor(strides); + micro_context->DeallocateTempTfLiteTensor(output); + } const TfLiteStridedSliceParams* params; - const TfLiteTensor* input; - const TfLiteTensor* begin; - const TfLiteTensor* end; - const TfLiteTensor* strides; + MicroContext* micro_context; + TfLiteTensor* input; + TfLiteTensor* begin; + TfLiteTensor* end; + TfLiteTensor* strides; TfLiteTensor* output; int dims; }; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc index bb30780b..d6647462 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/sub_common.cc @@ -83,15 +83,24 @@ TfLiteStatus SubPrepare(TfLiteContext* context, TfLiteNode* node) { OpDataSub* data = static_cast(node->user_data); auto* params = reinterpret_cast(node->builtin_data); - const TfLiteTensor* input1 = GetInput(context, node, kSubInputTensor1); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input1 = + micro_context->AllocateTempInputTensor(node, kSubInputTensor1); TF_LITE_ENSURE(context, input1 != nullptr); - const TfLiteTensor* input2 = GetInput(context, node, kSubInputTensor2); + TfLiteTensor* input2 = + micro_context->AllocateTempInputTensor(node, kSubInputTensor2); TF_LITE_ENSURE(context, input2 != nullptr); - TfLiteTensor* output = GetOutput(context, node, kSubOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kSubOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_STATUS( CalculateOpDataSub(context, params, input1, input2, output, data)); + + micro_context->DeallocateTempTfLiteTensor(input1); + micro_context->DeallocateTempTfLiteTensor(input2); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc index d1cbd26e..fb92b4fd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/svdf_common.cc @@ -364,6 +364,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { const auto* params = static_cast(node->builtin_data); + MicroContext* micro_context = GetMicroContext(context); + // Validate Tensor Inputs (dtype depends on quantization): // [0] = Input, {2, batch_size, input_size} // [1] = Weights Feature, {2, num_filters, input_size} @@ -371,18 +373,19 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { // [3] = Bias (optional), {1, num_units} // [4] = Activation State (variable), // {2, batch_size, memory_size * num_filters} - const TfLiteTensor* input = GetInput(context, node, kSvdfInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kSvdfInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* 
weights_feature = - GetInput(context, node, kSvdfWeightsFeatureTensor); + TfLiteTensor* weights_feature = + micro_context->AllocateTempInputTensor(node, kSvdfWeightsFeatureTensor); TF_LITE_ENSURE(context, weights_feature != nullptr); - const TfLiteTensor* weights_time = - GetInput(context, node, kSvdfWeightsTimeTensor); + TfLiteTensor* weights_time = + micro_context->AllocateTempInputTensor(node, kSvdfWeightsTimeTensor); TF_LITE_ENSURE(context, weights_time != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kSvdfBiasTensor); - const TfLiteTensor* activation_state = - GetInput(context, node, kSvdfInputActivationStateTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kSvdfBiasTensor); + TfLiteTensor* activation_state = micro_context->AllocateTempInputTensor( + node, kSvdfInputActivationStateTensor); TF_LITE_ENSURE(context, activation_state != nullptr); // Define input constants based on input tensor definition above: @@ -402,7 +405,8 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { // Validate Tensor Output: // [0] = float/int8_t, {2, batch_size, num_units} TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); - TfLiteTensor* output = GetOutput(context, node, kSvdfOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kSvdfOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2); TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size); @@ -498,6 +502,12 @@ TfLiteStatus PrepareSvdf(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, scratch_status); } + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(weights_feature); + micro_context->DeallocateTempTfLiteTensor(weights_time); + micro_context->DeallocateTempTfLiteTensor(activation_state); + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(bias); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc index a9ede9eb..a9f01c71 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/tanh.cc @@ -48,11 +48,14 @@ void* TanhInit(TfLiteContext* context, const char* buffer, size_t length) { TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, OpData* data) { + MicroContext* micro_context = GetMicroContext(context); TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); @@ -69,6 +72,62 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31); } + + if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // These operators are implemented in fixed-point arithmetic, + 
// which intrinsically wants symmetric ranges (zero_point==0) + // and power-of-two scales (power-of-two is abbreviated below as POT). + // While more general support would be possible by means of rescaling, + // that would add some overhead and some loss of accuracy and wouldn't + // be used at the moment as current quantized LSTM applications are + // happy with symmetric, power-of-two-scales quantization. So we just + // implement that narrow case only for now. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + bool param_scale_pot = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + param_scale_pot &= + (data->input_left_shift == 0 || data->input_left_shift == 1); + + if (param_scale_pot) { + data->input_multiplier = 0; + } else { + // Calculate multiplier to change input scale to 1/(3*4096) + // as required by the table lookup. + // The number 3.0 in the multiplier comes from here, + // because the interval is [-10.7, 10.7] instead of [-8, 8]. + // So, in this scaling +/-2^17 represents +/-10.7. + + double multiplier = + static_cast(input->params.scale) * 4096.0 * 3.0; + data->input_left_shift = 0; + + while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) { + data->input_left_shift++; + multiplier = multiplier * 2.0; + } + + data->input_multiplier = static_cast(multiplier); + } + + int output_scale_log2_rounded; + TF_LITE_ENSURE( + context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); + TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, + -kOutputFractionalBits); + } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } @@ -77,10 +136,15 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { OpData* data = static_cast(node->user_data); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); data->input_zero_point = input->params.zero_point; - return CalculateArithmeticOpData(context, node, data); + TF_LITE_ENSURE_OK(context, CalculateArithmeticOpData(context, node, data)); + + micro_context->DeallocateTempTfLiteTensor(input); + return kTfLiteOk; } } // namespace diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc index bf43a073..ba3d6e94 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose.cc @@ -18,18 +18,30 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" namespace tflite { namespace { +constexpr int kInputTensor = 0; +constexpr int kPermTensor = 1; +constexpr int kOutputTensor = 0; + struct TransposeContext { TransposeContext(TfLiteContext* context, TfLiteNode* node) { - input = GetInput(context, node, 0); - perm = GetInput(context, node, 1); - output = GetOutput(context, node, 0); + micro_context = GetMicroContext(context); + input = micro_context->AllocateTempInputTensor(node, kInputTensor); + perm = micro_context->AllocateTempInputTensor(node, kPermTensor); + output = micro_context->AllocateTempOutputTensor(node, kOutputTensor); } - const TfLiteTensor* input; - const TfLiteTensor* perm; + ~TransposeContext() { + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(perm); + micro_context->DeallocateTempTfLiteTensor(output); + } + MicroContext* micro_context; + TfLiteTensor* input; + TfLiteTensor* perm; TfLiteTensor* output; }; @@ -60,10 +72,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - TransposeContext op_context(context, node); - - const int32_t* perm_data = GetTensorData(op_context.perm); - const int size = op_context.perm->dims->data[0]; + const TfLiteEvalTensor* perm_tensor = + tflite::micro::GetEvalInput(context, node, kPermTensor); + const int32_t* perm_data = perm_tensor->data.i32; + const int size = perm_tensor->dims->data[0]; TransposeParams params; params.perm_count = size; for (int i = 0; i < size; ++i) { @@ -73,24 +85,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Transpose kernel only does rearranging values not numeric evaluations // on each cell. It's safe to implement per size of scalar type and this // trick keeps the total code size in a reasonable range. - switch (op_context.input->type) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, kOutputTensor); + switch (input->type) { case kTfLiteFloat32: - reference_ops::Transpose(params, GetTensorShape(op_context.input), - GetTensorData(op_context.input), - GetTensorShape(op_context.output), - GetTensorData(op_context.output)); + reference_ops::Transpose(params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); break; case kTfLiteInt8: - reference_ops::Transpose(params, GetTensorShape(op_context.input), - GetTensorData(op_context.input), - GetTensorShape(op_context.output), - GetTensorData(op_context.output)); + reference_ops::Transpose(params, tflite::micro::GetTensorShape(input), + tflite::micro::GetTensorData(input), + tflite::micro::GetTensorShape(output), + tflite::micro::GetTensorData(output)); break; default: TF_LITE_KERNEL_LOG(context, "Type %s is currently not supported by Transpose. 
" "Only float32 and int8 is supported", - TfLiteTypeGetName(op_context.input->type)); + TfLiteTypeGetName(input->type)); return kTfLiteError; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc index cbd964a0..dcf007c5 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/transpose_conv.cc @@ -94,13 +94,18 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. if (data_type != kTfLiteFloat32) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); - const TfLiteTensor* bias = - GetOptionalInputTensor(context, node, kBiasTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TfLiteTensor* bias = + micro_context->AllocateTempInputTensor(node, kBiasTensor); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); int output_channels = filter->dims->data[kConvQuantizedDimension]; @@ -124,6 +129,13 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, &(data->bias_converted_buffer_index)) == kTfLiteOk); } } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); + micro_context->DeallocateTempTfLiteTensor(output); + if (bias != nullptr) { + micro_context->DeallocateTempTfLiteTensor(bias); + } } return kTfLiteOk; } @@ -141,11 +153,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const auto params = static_cast(node->builtin_data); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); TF_LITE_ENSURE(context, input != nullptr); - const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + TfLiteTensor* filter = + micro_context->AllocateTempInputTensor(node, kFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); // Get height and width of the output. 
@@ -212,6 +229,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Stride data->params.stride_width = params->stride_width; data->params.stride_height = params->stride_height; + + micro_context->DeallocateTempTfLiteTensor(output); + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(filter); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc index 2efffdb6..8354c918 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/var_handle.cc @@ -46,14 +46,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const auto* params = reinterpret_cast(node->builtin_data); - // Casting to TfliteIntArray is required since we are re-using - // GetExecutionPlan from TfLiteContext. On TFLM this method returns a - // MicroGraph. - // TODO(b/188226309): Design a cleaner way to get a graph from kernel context. - MicroGraph* graph_info; - context->GetExecutionPlan(context, - reinterpret_cast(&graph_info)); - MicroResourceVariables* resources = graph_info->GetResourceVariables(); + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph& graph_info = micro_context->graph(); + + MicroResourceVariables* resources = graph_info.GetResourceVariables(); if (resources == nullptr) { MicroPrintf( "VAR_HANDLE requires resource variables. Please create " diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc new file mode 100644 index 00000000..576c19b0 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/while.cc @@ -0,0 +1,140 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_graph.h" +#include "tensorflow/lite/micro/micro_utils.h" + +namespace tflite { + +namespace { + +struct OpData { + int cond_subgraph_index; + int body_subgraph_index; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + return context->AllocatePersistentBuffer(context, sizeof(OpData)); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* op_data = reinterpret_cast(node->user_data); + const auto* params = + reinterpret_cast(node->builtin_data); + + op_data->cond_subgraph_index = params->cond_subgraph_index; + op_data->body_subgraph_index = params->body_subgraph_index; + + // The first input is the condition. + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + + size_t num_inputs = node->inputs->size; + size_t num_outputs = node->outputs->size; + + MicroGraph& graph_info = micro_context->graph(); + + TF_LITE_ENSURE(context, + op_data->cond_subgraph_index < graph_info.NumSubgraphs()); + TF_LITE_ENSURE(context, + op_data->body_subgraph_index < graph_info.NumSubgraphs()); + + TF_LITE_ENSURE_EQ(context, num_inputs, + graph_info.NumSubgraphInputs(op_data->cond_subgraph_index)); + TF_LITE_ENSURE_EQ(context, num_inputs, + graph_info.NumSubgraphInputs(op_data->body_subgraph_index)); + TF_LITE_ENSURE_EQ(context, num_inputs, num_outputs); + TF_LITE_ENSURE_EQ( + context, num_outputs, + graph_info.NumSubgraphOutputs(op_data->body_subgraph_index)); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const OpData* op_data = reinterpret_cast(node->user_data); + + tflite::MicroContext* micro_context = tflite::GetMicroContext(context); + MicroGraph* graph_info = µ_context->graph(); + + TF_LITE_ENSURE_OK(context, + tflite::micro::CopyOpInputsToSubgraphInputs( + context, node, graph_info, op_data->cond_subgraph_index, + /*first_tensor_idx=*/0)); + + TF_LITE_ENSURE_OK(context, + graph_info->InvokeSubgraph(op_data->cond_subgraph_index)); + + TfLiteEvalTensor* cond_subgraph_output = graph_info->GetSubgraphOutput( + op_data->cond_subgraph_index, /*tensor_idx=*/0); + bool cond_value = cond_subgraph_output->data.b[0]; + + TF_LITE_ENSURE_OK(context, + tflite::micro::CopyOpInputsToSubgraphInputs( + context, node, graph_info, op_data->body_subgraph_index, + /*first_tensor_idx=*/0)); + TF_LITE_ENSURE_OK(context, + tflite::micro::CopyOpInputsToOpOutputs(context, node)); + + while (cond_value == true) { + // Copy output of this iteration back to the body input. 
+ TF_LITE_ENSURE_OK( + context, tflite::micro::CopyOpOutputsToSubgraphInputs( + context, node, graph_info, op_data->body_subgraph_index)); + TF_LITE_ENSURE_OK(context, + graph_info->InvokeSubgraph(op_data->body_subgraph_index)); + + TF_LITE_ENSURE_OK( + context, tflite::micro::CopySubgraphOutputsToOpOutputs( + context, node, graph_info, op_data->body_subgraph_index)); + TF_LITE_ENSURE_OK( + context, tflite::micro::CopyOpOutputsToSubgraphInputs( + context, node, graph_info, op_data->cond_subgraph_index)); + TF_LITE_ENSURE_OK(context, + graph_info->InvokeSubgraph(op_data->cond_subgraph_index)); + + cond_subgraph_output = graph_info->GetSubgraphOutput( + op_data->cond_subgraph_index, /*tensor_idx=*/0); + cond_value = cond_subgraph_output->data.b[0]; + } + + return kTfLiteOk; +} + +} // namespace. + +TfLiteRegistration Register_WHILE() { + return {/*init=*/Init, + /*free=*/nullptr, + /*prepare=*/Prepare, + /*invoke=*/Eval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc b/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc index ce403927..733564c9 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/kernels/zeros_like.cc @@ -25,15 +25,20 @@ constexpr int kInputTensor = 0; constexpr int kOutputTensor = 0; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kOutputTensor, &output)); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* output = + micro_context->AllocateTempOutputTensor(node, kOutputTensor); + TF_LITE_ENSURE(context, output != nullptr); output->type = input->type; + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(output); return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc new file mode 100644 index 00000000..ab313e66 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.cc @@ -0,0 +1,319 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/micro/micro_allocation_info.h" + +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" + +namespace tflite { + +namespace { +constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation"; +constexpr int kUninitializedLifetime = -1; +} // namespace + +// Mark the given Allocation info as first created at the specified allocation +// scope count. Only the first creation must be recorded since the allocation +// scope count monotonically increases throughout the lifetime marking process. +void AllocationInfoBuilder::UpdateFirstCreated(AllocationInfo* current, + int allocation_scope_count) { + TFLITE_DCHECK(current->first_created <= allocation_scope_count); + if (current->first_created == kUninitializedLifetime) { + current->first_created = allocation_scope_count; + } +} + +// Mark the given AllocationInfo as last used at the specified allocation scope +// count. Update the last used marker every time, since the allocation scope +// count monotonically increases through the lifetime marking process. +void AllocationInfoBuilder::UpdateLastUsed(AllocationInfo* current, + int allocation_scope_count) { + TFLITE_DCHECK(current->last_used <= allocation_scope_count); + current->last_used = allocation_scope_count; +} + +TfLiteStatus AllocationInfoBuilder::MarkSubgraphLifetimesIfNecessary( + const Operator* op, internal::ScratchBufferRequest* scratch_buffer_requests, + ScratchBufferHandle* scratch_buffer_handles, + SubgraphAllocations* allocations) { + int first_subgraph_index = -1; + int second_subgraph_index = -1; + const OperatorCode* opcode = + model_->operator_codes()->Get(op->opcode_index()); + switch (opcode->builtin_code()) { + case BuiltinOperator_IF: { + first_subgraph_index = + op->builtin_options_as_IfOptions()->then_subgraph_index(); + second_subgraph_index = + op->builtin_options_as_IfOptions()->else_subgraph_index(); + break; + } + case BuiltinOperator_CALL_ONCE: { + first_subgraph_index = + op->builtin_options_as_CallOnceOptions()->init_subgraph_index(); + break; + } + case BuiltinOperator_WHILE: { + first_subgraph_index = + op->builtin_options_as_WhileOptions()->cond_subgraph_index(); + second_subgraph_index = + op->builtin_options_as_WhileOptions()->body_subgraph_index(); + break; + } + default: { + break; + } + } + if (first_subgraph_index != -1) { + // Enter a new allocation scope for each subgraph. + allocation_scope_count_++; + TF_LITE_ENSURE_STATUS( + MarkAllocationLifetimes(first_subgraph_index, scratch_buffer_requests, + scratch_buffer_handles, allocations)); + } + if (second_subgraph_index != -1) { + // Enter a new allocation scope for each subgraph. 
+ allocation_scope_count_++; + TF_LITE_ENSURE_STATUS( + MarkAllocationLifetimes(second_subgraph_index, scratch_buffer_requests, + scratch_buffer_handles, allocations)); + } + return kTfLiteOk; +} + +TfLiteStatus AllocationInfoBuilder::CreateAllocationInfo( + int scratch_buffer_request_count) { + size_t subgraph_offsets_length = model_->subgraphs()->size() * sizeof(size_t); + info_.subgraph_offsets = + reinterpret_cast(non_persistent_allocator_->AllocateTemp( + subgraph_offsets_length, alignof(size_t))); + if (info_.subgraph_offsets == nullptr) { + TF_LITE_REPORT_ERROR( + reporter_, + "Failed to allocate memory for memory planning, %d bytes required", + subgraph_offsets_length); + return kTfLiteError; + } + size_t tensor_count = 0; + for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size(); + subgraph_idx++) { + // Add all tensors in each subgraph to the AllocationInfo array. Even weight + // tensors are added but marked with needs_allocating = false. Including all + // tensors in the graph here simplifies logic. + info_.subgraph_offsets[subgraph_idx] = tensor_count; + tensor_count += model_->subgraphs()->Get(subgraph_idx)->tensors()->size(); + } + info_.tensor_count = tensor_count; + + // Scratch buffer allocations follow tensor allocations, so the scratch offset + // is equal to the number of tensor allocations. + info_.scratch_offset = tensor_count; + info_.allocation_info_count = tensor_count + scratch_buffer_request_count; + info_.scratch_buffer_count = scratch_buffer_request_count; + size_t bytes = sizeof(AllocationInfo) * info_.allocation_info_count; + + // Allocate an array of AllocationInfo structs from the temp section. This + // struct will be used by AllocationInfoBuilder to find buffer usage. + info_.allocation_info = reinterpret_cast( + non_persistent_allocator_->AllocateTemp(bytes, alignof(AllocationInfo))); + if (info_.allocation_info == nullptr) { + TF_LITE_REPORT_ERROR( + reporter_, + "Failed to allocate memory for memory planning, %d bytes required", + bytes); + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus AllocationInfoBuilder::FreeAllocationInfo() { + non_persistent_allocator_->DeallocateTemp( + reinterpret_cast(info_.allocation_info)); + non_persistent_allocator_->DeallocateTemp( + reinterpret_cast(info_.subgraph_offsets)); + return kTfLiteOk; +} + +TfLiteStatus AllocationInfoBuilder::InitializeAllocationInfo( + const int32_t* offline_offsets, SubgraphAllocations* allocations) { + AllocationInfo* allocation_info = info_.allocation_info; + // Initialize allocation info for every tensor in every subgraph. 
+ for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size(); + subgraph_idx++) { + const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx); + TfLiteEvalTensor* eval_tensors = allocations[subgraph_idx].tensors; + AllocationInfo* subgraph_allocation_info = + &allocation_info[info_.subgraph_offsets[subgraph_idx]]; + for (size_t i = 0; i < subgraph->tensors()->size(); ++i) { + AllocationInfo* current = &subgraph_allocation_info[i]; + current->output_ptr = &(eval_tensors[i].data.data); + + TF_LITE_ENSURE_STATUS( + TfLiteEvalTensorByteLength(&eval_tensors[i], ¤t->bytes)); + + current->first_created = kUninitializedLifetime; + current->last_used = kUninitializedLifetime; + current->needs_allocating = (eval_tensors[i].data.data == nullptr) && + (!subgraph->tensors()->Get(i)->is_variable()); + if (offline_offsets) { + current->offline_offset = offline_offsets[i]; + } else { + current->offline_offset = kOnlinePlannedBuffer; + } + } + } + // Initialize allocation info for every scratch buffer. + AllocationInfo* scratch_allocation_info = + &allocation_info[info_.scratch_offset]; + for (size_t i = 0; i < info_.scratch_buffer_count; i++) { + AllocationInfo* current = &scratch_allocation_info[i]; + current->first_created = -1; + current->last_used = -1; + current->needs_allocating = true; + current->offline_offset = kOnlinePlannedBuffer; + } + return kTfLiteOk; +} + +TfLiteStatus AllocationInfoBuilder::MarkAllocationLifetimes( + int subgraph_idx, internal::ScratchBufferRequest* scratch_buffer_requests, + ScratchBufferHandle* scratch_buffer_handles, + SubgraphAllocations* allocations) { + const SubGraph* subgraph = model_->subgraphs()->Get(subgraph_idx); + + AllocationInfo* allocation_info = info_.allocation_info; + // Each subgraph's tensor allocations are in a contiguous block starting at + // subgraph_offsets_[subgraph index] with one entry per tensor. + AllocationInfo* subgraph_allocation_info = + &allocation_info[info_.subgraph_offsets[subgraph_idx]]; + + uint32_t operators_size = NumSubgraphOperators(subgraph); + // Mark all inputs as created at the start of the subgraph invocation. + for (size_t i = 0; + subgraph->inputs() != nullptr && i < subgraph->inputs()->size(); ++i) { + const int tensor_index = subgraph->inputs()->Get(i); + AllocationInfo* current = &subgraph_allocation_info[tensor_index]; + UpdateFirstCreated(current, allocation_scope_count_); + } + + for (uint32_t i = 0; i < operators_size; i++) { + // Each operator has a new allocation scope. + allocation_scope_count_++; + const auto* op = subgraph->operators()->Get(i); + // Figure out when the first creation and use of each tensor is. + for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size(); + ++n) { + const int tensor_index = op->outputs()->Get(n); + AllocationInfo* current = &subgraph_allocation_info[tensor_index]; + UpdateFirstCreated(current, allocation_scope_count_); + } + + // Keep track of scope count before any subgraphs, so that scratch buffers' + // lifetime within a control flow op properly overlaps with all subgraphs. + int start_allocation_scope_count = allocation_scope_count_; + + // Control flow operators can invoke subgraphs. Plan these subgraphs + // before continuing on to the rest of the graph. + MarkSubgraphLifetimesIfNecessary(op, scratch_buffer_requests, + scratch_buffer_handles, allocations); + + // Figure out when the last use of each tensor is. 
+    for (size_t n = 0; op->inputs() != nullptr && n < op->inputs()->size();
+         ++n) {
+      const int tensor_index = op->inputs()->Get(n);
+      // Optional bias tensors can have an index of -1 when they are omitted.
+      if (tensor_index >= 0) {
+        AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+        // No need to update creation since it is either marked by the subgraph
+        // or producer op, or it is not part of the memory plan (weight, bias
+        // tensor).
+        UpdateLastUsed(current, allocation_scope_count_);
+      }
+    }
+    for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size();
+         ++n) {
+      const int tensor_index = op->outputs()->Get(n);
+      AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+      UpdateLastUsed(current, allocation_scope_count_);
+    }
+
+    // Mark the lifetime of scratch buffers belonging to the current node. This
+    // operation is O(N * M) where N is the total number of visited nodes and M
+    // is the total number of scratch buffers.
+    // TODO(b/217794030): Optimize this memory planning code.
+    AllocationInfo* scratch_allocation_info =
+        &allocation_info[info_.scratch_offset];
+    for (size_t scratch_idx = 0; scratch_idx < info_.scratch_buffer_count;
+         scratch_idx++) {
+      internal::ScratchBufferRequest request =
+          scratch_buffer_requests[scratch_idx];
+      AllocationInfo* current = &scratch_allocation_info[scratch_idx];
+      if (request.node_idx == static_cast<int>(i) &&
+          request.subgraph_idx == static_cast<int>(subgraph_idx)) {
+        ScratchBufferHandle* current_handle =
+            &(scratch_buffer_handles[scratch_idx]);
+        current->output_ptr = reinterpret_cast<void**>(&current_handle->data);
+        current->bytes = request.bytes;
+        UpdateFirstCreated(current, start_allocation_scope_count);
+        UpdateLastUsed(current, allocation_scope_count_);
+      }
+    }
+  }
+
+  // Mark all outputs as persistent to the end of the subgraph invocation.
+  for (size_t i = 0;
+       subgraph->outputs() != nullptr && i < subgraph->outputs()->size(); ++i) {
+    const int tensor_index = subgraph->outputs()->Get(i);
+    AllocationInfo* current = &subgraph_allocation_info[tensor_index];
+    UpdateLastUsed(current, allocation_scope_count_);
+  }
+  return kTfLiteOk;
+}
+
+// Get offline tensors allocation plan. See
+// micro/docs/memory_management.md for more info.
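A short hand-worked trace of the scope counting above may help (the op and tensor names are made up for illustration and are not part of the patch). Take a single-subgraph model with two operators, where op0 reads subgraph input IN and writes T0, and op1 reads T0 and writes the subgraph output OUT:

    scope 0 (subgraph entry)   IN.first_created  = 0
    scope 1 (op0)              T0.first_created  = 1, IN.last_used = 1
    scope 2 (op1)              OUT.first_created = 2, T0.last_used = 2
    subgraph exit              OUT.last_used     = 2

The memory planner can then reuse arena space for buffers whose [first_created, last_used] ranges do not intersect, e.g. IN and OUT in this trace.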
+TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets(
+    const int32_t** offline_planner_offsets) {
+  if (model_->metadata()) {
+    for (size_t i = 0; i < model_->metadata()->size(); ++i) {
+      auto metadata = model_->metadata()->Get(i);
+      if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
+                  strlen(kOfflineMemAllocMetadata)) == 0) {
+        const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
+            model_->buffers();
+        auto* buffer = (*buffers)[metadata->buffer()];
+        auto* array = buffer->data();
+        const uint32_t* metadata_buffer =
+            reinterpret_cast<const uint32_t*>(array->data());
+        const size_t nbr_tensors = static_cast<size_t>(metadata_buffer[2]);
+        *offline_planner_offsets =
+            reinterpret_cast<const int32_t*>(&metadata_buffer[3]);
+
+        if (info_.tensor_count != nbr_tensors) {
+          TF_LITE_REPORT_ERROR(reporter_,
+                               "Nbr of offline buffer offsets (%d) in metadata "
+                               "not equal nbr tensors (%d)\n",
+                               nbr_tensors, info_.tensor_count);
+          return kTfLiteError;
+        }
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h
new file mode 100644
index 00000000..af303307
--- /dev/null
+++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocation_info.h
@@ -0,0 +1,141 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/flatbuffer_utils.h"
+#include "tensorflow/lite/micro/micro_allocator.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// Used to hold information used during allocation calculations.
+struct AllocationInfo {
+  size_t bytes;
+  void** output_ptr;
+  int first_created;
+  int last_used;
+  int32_t offline_offset;
+  bool needs_allocating;
+};
+
+// Used to hold the allocation info list and related metadata for the entire
+// graph (including subgraphs). Since all subgraphs are planned together, the
+// allocation info list contains allocations for all subgraphs. Track the
+// offset into this list for each subgraph, then reserve space to track all
+// allocations.
+//
+// The AllocationInfo list is a contiguous list of allocations across all
+// subgraphs and scratch buffers. Each element here is marked as
+// s<subgraph index>t<tensor index>. The following is a possible
+// AllocationInfo list:
+// [s0t0, s0t1, s1t0, s1t1, s1t2, s2t0, s2t1, scratch0, scratch1, scratch2]
+//
+// For this example, the subgraph offsets would be [0, 2, 5] and the scratch
+// offset would be 7.
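A minimal sketch of how these offsets are meant to be used follows; the helper names are made up for illustration and are not part of the patch, and they assume the GraphAllocationInfo fields declared just below.

// Illustrative only: locate the AllocationInfo entry for tensor `t` of
// subgraph `s`, and for scratch buffer `k`, given the layout described above.
AllocationInfo* TensorInfo(const GraphAllocationInfo& info, int s, int t) {
  return &info.allocation_info[info.subgraph_offsets[s] + t];
}
AllocationInfo* ScratchInfo(const GraphAllocationInfo& info, int k) {
  return &info.allocation_info[info.scratch_offset + k];
}

With the example list above, TensorInfo(info, 1, 2) would return the entry at index 2 + 2 = 4 (s1t2), and ScratchInfo(info, 0) the entry at index 7.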
+struct GraphAllocationInfo { + AllocationInfo* allocation_info; + size_t allocation_info_count; + size_t* subgraph_offsets; + size_t scratch_offset; + size_t tensor_count; + size_t scratch_buffer_count; +}; + +// A helper class to construct AllocationInfo array. This array contains the +// lifetime of tensors / scratch_buffer and will be used to calculate the memory +// plan. Methods need to be called in order from `Create`, Init`, `Add*`, to +// `Finish`. +class AllocationInfoBuilder { + public: + AllocationInfoBuilder(const Model* model, + INonPersistentBufferAllocator* non_persistent_allocator, + ErrorReporter* reporter) + : model_(model), + non_persistent_allocator_(non_persistent_allocator), + reporter_(reporter) {} + + // Check if model contains offline planned buffer offsets. + // - If there's no metadata available, offline_planner_offsets is not set + // - If there's metadata available, offline_planner_offsets will point to the + // first offset in the metadata buffer list. + TfLiteStatus GetOfflinePlannedOffsets( + const int32_t** offline_planner_offsets); + + // Allocate memory for the allocation info array as well as offsets into that + // array for each subgraph. + TfLiteStatus CreateAllocationInfo(int scratch_buffer_request_count); + + // Release memory used for the allocation info array. + TfLiteStatus FreeAllocationInfo(); + + // Initialize AllocationInfo for all tensors and scratch buffers in the graph. + TfLiteStatus InitializeAllocationInfo(const int32_t* offline_offsets, + SubgraphAllocations* allocations); + + // Mark the scope of each tensor and scratch buffer across the graph. Enter + // all possible subgraphs invoked by each control flow operator. This method + // marks the maximum lifetime of each buffer so that tensors are correctly + // planned for all valid invocation flows. + TfLiteStatus MarkAllocationLifetimes( + int subgraph_idx, internal::ScratchBufferRequest* scratch_buffer_request, + ScratchBufferHandle* scratch_buffer_handles, + SubgraphAllocations* allocations); + + // Identify control flow operators and recursively mark all subgraphs which + // that operator can invoke. The lifetime of all tensors within a subgraph + // can only be extended. The order of subgraph invocation does not matter + // since subgraphs within the same control flow operator are executed + // within their own allocation scope (planned buffers in a subgraph cannot + // persist beyond the end of that subgraph's invocation). + TfLiteStatus MarkSubgraphLifetimesIfNecessary( + const Operator* op, + internal::ScratchBufferRequest* scratch_buffer_requests, + ScratchBufferHandle* scratch_buffer_handles, + SubgraphAllocations* allocations); + + // Returns the number of allocations. + int AllocationCount() const { return info_.allocation_info_count; } + + // Returns a pointer to the built AllocationInfo array. + AllocationInfo* Finish() const { return info_.allocation_info; } + + private: + // Mark the given Allocation info as first created at the specified allocation + // scope count. Only the first creation must be recorded since the allocation + // scope count monotonically increases throughout the lifetime marking + // process. + void UpdateFirstCreated(AllocationInfo* current, int allocation_scope_count); + + // Mark the given AllocationInfo as last used at the specified allocation + // scope + // count. Update the last used marker every time, since the allocation scope + // count monotonically increases through the lifetime marking process. 
+ void UpdateLastUsed(AllocationInfo* current, int allocation_scope_count); + + const tflite::Model* model_ = nullptr; + INonPersistentBufferAllocator* non_persistent_allocator_ = nullptr; + ErrorReporter* reporter_ = nullptr; + + GraphAllocationInfo info_; + int allocation_scope_count_ = 0; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_ALLOCATION_INFO_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc index ec203b9f..b71c7502 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" @@ -30,6 +31,7 @@ limitations under the License. #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" #include "tensorflow/lite/micro/memory_planner/micro_memory_planner.h" +#include "tensorflow/lite/micro/micro_allocation_info.h" #include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" @@ -48,26 +50,17 @@ constexpr size_t kMaxScratchBuffersPerOp = 12; // needs a node id assignment. constexpr int kUnassignedScratchBufferRequestIndex = -1; -// Used to hold information used during allocation calculations. -struct AllocationInfo { - size_t bytes; - void** output_ptr; - int first_created; - int last_used; - int32_t offline_offset; - bool needs_allocating; -}; - -constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation"; const TfLiteIntArray kZeroLengthIntArray = {}; class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: - explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) - : memory_allocator_(memory_allocator) {} + explicit MicroBuiltinDataAllocator( + IPersistentBufferAllocator* persistent_allocator) + : persistent_allocator_(persistent_allocator) {} void* Allocate(size_t size, size_t alignment_hint) override { - return memory_allocator_->AllocateFromTail(size, alignment_hint); + return persistent_allocator_->AllocatePersistentBuffer(size, + alignment_hint); } void Deallocate(void* data) override { // Do not deallocate, builtin data needs to be available for the life time @@ -77,169 +70,9 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator { TF_LITE_REMOVE_VIRTUAL_DELETE private: - SimpleMemoryAllocator* memory_allocator_; + IPersistentBufferAllocator* persistent_allocator_; }; -// A helper class to construct AllocationInfo array. This array contains the -// lifetime of tensors / scratch_buffer and will be used to calculate the memory -// plan. Methods need to be called in order from `Init`, `Add*`, to `Finish`. -class AllocationInfoBuilder { - public: - AllocationInfoBuilder(AllocationInfo* info, size_t tensor_count, - size_t scratch_buffer_count, ErrorReporter* reporter) - : info_(info), - tensor_count_(tensor_count), - buffer_count_(scratch_buffer_count), - reporter_(reporter) {} - - // Check if model contains offline planned buffer offsets. 
- // - If there's no metadata available, offline_planner_offsets is not set - // - If there's metadata available, offline_planner_offsets will point to the - // first offset in the metadata buffer list. - TfLiteStatus GetOfflinePlannedOffsets( - const Model* model, const int32_t** offline_planner_offsets); - - // Add allocaiton information for the tensors. - TfLiteStatus AddTensors(const SubGraph* subgraph, - const int32_t* offline_offsets, - TfLiteEvalTensor* eval_tensors); - - // Add allocation information for the scratch buffers. - TfLiteStatus AddScratchBuffers( - internal::ScratchBufferRequest* scratch_buffer_requests, - ScratchBufferHandle* scratch_buffer_handles); - - // Returns a pointer to the built AllocationInfo array. - const AllocationInfo* Finish() const { return info_; } - - private: - AllocationInfo* info_ = nullptr; - size_t tensor_count_ = 0; - size_t buffer_count_ = 0; - ErrorReporter* reporter_ = nullptr; -}; - -TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, - const int32_t* offline_offsets, - TfLiteEvalTensor* eval_tensors) { - TFLITE_DCHECK(eval_tensors != nullptr); - - // Set up allocation info for all tensors. - for (size_t i = 0; i < tensor_count_; ++i) { - AllocationInfo* current = &info_[i]; - current->output_ptr = &(eval_tensors[i].data.data); - - TF_LITE_ENSURE_STATUS( - TfLiteEvalTensorByteLength(&eval_tensors[i], ¤t->bytes)); - - current->first_created = -1; - current->last_used = -1; - current->needs_allocating = (eval_tensors[i].data.data == nullptr) && - (!subgraph->tensors()->Get(i)->is_variable()); - if (offline_offsets) { - current->offline_offset = offline_offsets[i]; - } else { - current->offline_offset = kOnlinePlannedBuffer; - } - } - - uint32_t operators_size = NumSubgraphOperators(subgraph); - - for (size_t i = 0; - subgraph->inputs() != nullptr && i < subgraph->inputs()->size(); ++i) { - const int tensor_index = subgraph->inputs()->Get(i); - AllocationInfo* current = &info_[tensor_index]; - current->first_created = 0; - } - - // Mark all outputs as persistent to the end of the invocation. - for (size_t i = 0; - subgraph->outputs() != nullptr && i < subgraph->outputs()->size(); ++i) { - const int tensor_index = subgraph->outputs()->Get(i); - AllocationInfo* current = &info_[tensor_index]; - current->last_used = operators_size - 1; - } - - // Figure out when the first and last use of each tensor is. - for (int i = (operators_size - 1); i >= 0; --i) { - const auto* op = subgraph->operators()->Get(i); - for (size_t n = 0; op->inputs() != nullptr && n < op->inputs()->size(); - ++n) { - const int tensor_index = op->inputs()->Get(n); - AllocationInfo* current = &info_[tensor_index]; - if (((current->last_used == -1) || (current->last_used < i))) { - current->last_used = i; - } - } - for (size_t n = 0; op->outputs() != nullptr && n < op->outputs()->size(); - ++n) { - const int tensor_index = op->outputs()->Get(n); - AllocationInfo* current = &info_[tensor_index]; - if ((current->first_created == -1) || (current->first_created > i)) { - current->first_created = i; - } - // Since operator outputs are written to, they must be marked as used. - if ((current->last_used == -1) || (current->last_used < i)) { - current->last_used = i; - } - } - } - return kTfLiteOk; -} - -// Get offline tensors allocation plan. See -// micro/docs/memory_management.md for more info. 
-TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets( - const Model* model, const int32_t** offline_planner_offsets) { - if (model->metadata()) { - for (size_t i = 0; i < model->metadata()->size(); ++i) { - auto metadata = model->metadata()->Get(i); - if (strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata, - strlen(kOfflineMemAllocMetadata)) == 0) { - const flatbuffers::Vector>* buffers = - model->buffers(); - auto* buffer = (*buffers)[metadata->buffer()]; - auto* array = buffer->data(); - const uint32_t* metadata_buffer = - reinterpret_cast(array->data()); - const size_t nbr_tensors = static_cast(metadata_buffer[2]); - *offline_planner_offsets = - reinterpret_cast(&metadata_buffer[3]); - - if (tensor_count_ != nbr_tensors) { - TF_LITE_REPORT_ERROR(reporter_, - "Nbr of offline buffer offsets (%d) in metadata " - "not equal nbr tensors (%d)\n", - nbr_tensors, tensor_count_); - return kTfLiteError; - } - } - } - } - return kTfLiteOk; -} - -TfLiteStatus AllocationInfoBuilder::AddScratchBuffers( - internal::ScratchBufferRequest* scratch_buffer_requests, - ScratchBufferHandle* scratch_buffer_handles) { - // Set up allocation info for buffers. - for (size_t i = tensor_count_; i < tensor_count_ + buffer_count_; ++i) { - internal::ScratchBufferRequest* current_request = - &(scratch_buffer_requests[i - tensor_count_]); - ScratchBufferHandle* current_handle = - &(scratch_buffer_handles[i - tensor_count_]); - - AllocationInfo* current = &info_[i]; - current->output_ptr = reinterpret_cast(¤t_handle->data); - current->bytes = current_request->bytes; - current->first_created = current_request->node_idx; - current->last_used = current_request->node_idx; - current->offline_offset = kOnlinePlannedBuffer; - current->needs_allocating = true; - } - return kTfLiteOk; -} - TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MicroMemoryPlanner* planner, const AllocationInfo* allocation_info, @@ -282,6 +115,7 @@ TfLiteStatus CommitPlan(ErrorReporter* error_reporter, } return kTfLiteOk; } + } // namespace namespace internal { @@ -319,8 +153,9 @@ void* GetFlatbufferTensorBuffer( } TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( - SimpleMemoryAllocator* allocator, bool allocate_temp, - const tflite::Tensor& flatbuffer_tensor, + IPersistentBufferAllocator* persistent_buffer_allocator, + INonPersistentBufferAllocator* non_persistent_buffer_allocator, + bool allocate_temp, const tflite::Tensor& flatbuffer_tensor, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result) { TFLITE_DCHECK(result != nullptr); @@ -385,10 +220,11 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( TfLiteAffineQuantization* quantization = allocate_temp ? reinterpret_cast( - allocator->AllocateTemp(sizeof(TfLiteAffineQuantization), - alignof(TfLiteAffineQuantization))) + non_persistent_buffer_allocator->AllocateTemp( + sizeof(TfLiteAffineQuantization), + alignof(TfLiteAffineQuantization))) : reinterpret_cast( - allocator->AllocateFromTail( + persistent_buffer_allocator->AllocatePersistentBuffer( sizeof(TfLiteAffineQuantization), alignof(TfLiteAffineQuantization))); if (quantization == nullptr) { @@ -402,12 +238,14 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( // zero_point is stored as a int64_t. quantization->zero_point = allocate_temp - ? reinterpret_cast(allocator->AllocateTemp( - TfLiteIntArrayGetSizeInBytes(channels), - alignof(TfLiteIntArray))) - : reinterpret_cast(allocator->AllocateFromTail( - TfLiteIntArrayGetSizeInBytes(channels), - alignof(TfLiteIntArray))); + ? 
reinterpret_cast( + non_persistent_buffer_allocator->AllocateTemp( + TfLiteIntArrayGetSizeInBytes(channels), + alignof(TfLiteIntArray))) + : reinterpret_cast( + persistent_buffer_allocator->AllocatePersistentBuffer( + TfLiteIntArrayGetSizeInBytes(channels), + alignof(TfLiteIntArray))); if (quantization->zero_point == nullptr) { TF_LITE_REPORT_ERROR(error_reporter, "Unable to allocate quantization->zero_point.\n"); @@ -437,7 +275,7 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( } TfLiteStatus InitializeTfLiteEvalTensorFromFlatbuffer( - SimpleMemoryAllocator* allocator, const tflite::Tensor& flatbuffer_tensor, + const tflite::Tensor& flatbuffer_tensor, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteEvalTensor* result) { *result = {}; @@ -480,7 +318,8 @@ size_t MicroAllocator::GetDefaultTailUsage(bool is_memory_planner_given) { MicroAllocator::MicroAllocator(SimpleMemoryAllocator* memory_allocator, MicroMemoryPlanner* memory_planner, ErrorReporter* error_reporter) - : memory_allocator_(memory_allocator), + : non_persistent_buffer_allocator_(memory_allocator), + persistent_buffer_allocator_(memory_allocator), memory_planner_(memory_planner), error_reporter_(error_reporter), model_is_allocating_(false) {} @@ -509,7 +348,7 @@ MicroAllocator* MicroAllocator::Create(uint8_t* tensor_arena, size_t arena_size, // By default create GreedyMemoryPlanner. // If a different MemoryPlanner is needed, use the other api. - uint8_t* memory_planner_buffer = memory_allocator->AllocateFromTail( + uint8_t* memory_planner_buffer = memory_allocator->AllocatePersistentBuffer( sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner)); GreedyMemoryPlanner* memory_planner = new (memory_planner_buffer) GreedyMemoryPlanner(); @@ -524,7 +363,7 @@ MicroAllocator* MicroAllocator::Create(SimpleMemoryAllocator* memory_allocator, TFLITE_DCHECK(error_reporter != nullptr); TFLITE_DCHECK(memory_planner != nullptr); - uint8_t* allocator_buffer = memory_allocator->AllocateFromTail( + uint8_t* allocator_buffer = memory_allocator->AllocatePersistentBuffer( sizeof(MicroAllocator), alignof(MicroAllocator)); MicroAllocator* allocator = new (allocator_buffer) MicroAllocator(memory_allocator, memory_planner, error_reporter); @@ -543,10 +382,12 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) { model_is_allocating_ = true; - uint8_t* data_allocator_buffer = memory_allocator_->AllocateFromTail( - sizeof(MicroBuiltinDataAllocator), alignof(MicroBuiltinDataAllocator)); - builtin_data_allocator_ = - new (data_allocator_buffer) MicroBuiltinDataAllocator(memory_allocator_); + uint8_t* data_allocator_buffer = + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(MicroBuiltinDataAllocator), + alignof(MicroBuiltinDataAllocator)); + builtin_data_allocator_ = new (data_allocator_buffer) + MicroBuiltinDataAllocator(persistent_buffer_allocator_); if (InitScratchBufferData() != kTfLiteOk) { return nullptr; @@ -554,7 +395,7 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) { // Allocate struct to store eval tensors, nodes and registrations. 
SubgraphAllocations* output = reinterpret_cast( - memory_allocator_->AllocateFromTail( + persistent_buffer_allocator_->AllocatePersistentBuffer( sizeof(SubgraphAllocations) * model->subgraphs()->size(), alignof(SubgraphAllocations))); if (output == nullptr) { @@ -579,7 +420,7 @@ TfLiteStatus MicroAllocator::FinishModelAllocation( return kTfLiteError; } - // TODO(b/187993197): Track scratch buffers for each subgraph. + // Allocate scratch buffer metadata and buffers for variable tensors. for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size(); subgraph_idx++) { const SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx); @@ -587,19 +428,20 @@ TfLiteStatus MicroAllocator::FinishModelAllocation( TF_LITE_ENSURE_STATUS(AllocateScratchBufferHandles( scratch_buffer_handles, scratch_buffer_request_count_)); - TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan( - model, subgraph_allocations[subgraph_idx].tensors, - *scratch_buffer_handles, subgraph_idx)); TF_LITE_ENSURE_STATUS(AllocateVariables( subgraph, subgraph_allocations[subgraph_idx].tensors)); } + + // Plan all subgraphs and scratch buffers together. + TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(model, subgraph_allocations, + *scratch_buffer_handles)); model_is_allocating_ = false; return kTfLiteOk; } void* MicroAllocator::AllocatePersistentBuffer(size_t bytes) { - return memory_allocator_->AllocateFromTail(bytes, - MicroArenaBufferAlignment()); + return persistent_buffer_allocator_->AllocatePersistentBuffer( + bytes, MicroArenaBufferAlignment()); } TfLiteStatus MicroAllocator::RequestScratchBufferInArena(size_t bytes, @@ -635,6 +477,7 @@ TfLiteStatus MicroAllocator::RequestScratchBufferInArena(size_t bytes, // allocating: current_request->bytes = bytes; current_request->node_idx = kUnassignedScratchBufferRequestIndex; + current_request->subgraph_idx = subgraph_idx; // Assign the current request index to the out-param: *buffer_idx = scratch_buffer_request_count_; @@ -647,7 +490,7 @@ TfLiteStatus MicroAllocator::RequestScratchBufferInArena(size_t bytes, TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) { // When a node has finished preparing, all temp allocations performed by the // kernel should be cleaned up: - ResetTempAllocations(); + TF_LITE_ENSURE_STATUS(ResetTempAllocations()); // Find and update any new scratch buffer requests for the current node: internal::ScratchBufferRequest* requests = GetScratchBufferRequests(); @@ -665,7 +508,8 @@ TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) { // Ensure that the head is re-adjusted to allow for another at-most // kMaxScratchBuffersPerOp scratch buffer requests in the next operator: - TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize( + TF_LITE_ENSURE_STATUS(non_persistent_buffer_allocator_->ResizeBuffer( + scratch_buffer_head_, sizeof(internal::ScratchBufferRequest) * (scratch_buffer_request_count_ + kMaxScratchBuffersPerOp), alignof(internal::ScratchBufferRequest))); @@ -674,7 +518,8 @@ TfLiteStatus MicroAllocator::FinishPrepareNodeAllocations(int node_id) { } size_t MicroAllocator::used_bytes() const { - return memory_allocator_->GetUsedBytes(); + return non_persistent_buffer_allocator_->GetNonPersistentUsedBytes() + + persistent_buffer_allocator_->GetPersistentUsedBytes(); } TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( @@ -690,7 +535,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( // Initialize NodeAndRegistrations for the subgraph. 
NodeAndRegistration* output = reinterpret_cast( - memory_allocator_->AllocateFromTail( + persistent_buffer_allocator_->AllocatePersistentBuffer( sizeof(NodeAndRegistration) * operators_size, alignof(NodeAndRegistration))); if (output == nullptr) { @@ -703,6 +548,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( } return kTfLiteOk; } + TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensor( const Model* model, const SubgraphAllocations* subgraph_allocations, int tensor_index, int subgraph_index) { @@ -740,6 +586,30 @@ TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensor( return tensor; } +void MicroAllocator::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { + TFLITE_DCHECK(tensor != nullptr); + + if (tensor->quantization.type == kTfLiteAffineQuantization) { + TFLITE_DCHECK(tensor->quantization.params != nullptr); + TfLiteAffineQuantization* quantization = + reinterpret_cast( + tensor->quantization.params); + + non_persistent_buffer_allocator_->DeallocateTemp( + reinterpret_cast(quantization->zero_point)); + non_persistent_buffer_allocator_->DeallocateTemp( + reinterpret_cast(quantization)); + } + + // Clear the data in case someone still access tensor arena by mistake + tensor->quantization.type = kTfLiteNoQuantization; + tensor->quantization.params = nullptr; + tensor->data.data = nullptr; + tensor->dims = nullptr; + non_persistent_buffer_allocator_->DeallocateTemp( + reinterpret_cast(tensor)); +} + TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor( const Model* model, const SubgraphAllocations* subgraph_allocations, int tensor_index, int subgraph_index) { @@ -749,9 +619,9 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor( // This value is allocated from temporary arena space. It is guaranteed to be // around for at least the scope of the calling function. Since this struct // allocation takes place in temp space, no need to own or cleanup. 
- TfLiteTensor* tensor = - reinterpret_cast(memory_allocator_->AllocateTemp( - sizeof(TfLiteTensor), alignof(TfLiteTensor))); + TfLiteTensor* tensor = reinterpret_cast( + non_persistent_buffer_allocator_->AllocateTemp(sizeof(TfLiteTensor), + alignof(TfLiteTensor))); // Populate any fields from the flatbuffer, since this TfLiteTensor struct is // allocated in the temp section of the arena, ensure that additional @@ -780,8 +650,12 @@ TfLiteTensor* MicroAllocator::AllocateTempTfLiteTensor( return tensor; } -void MicroAllocator::ResetTempAllocations() { - memory_allocator_->ResetTempAllocations(); +TfLiteStatus MicroAllocator::ResetTempAllocations() { + return non_persistent_buffer_allocator_->ResetTempAllocations(); +} + +bool MicroAllocator::IsAllTempDeallocated() { + return non_persistent_buffer_allocator_->IsAllTempDeallocated(); } TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors( @@ -794,8 +668,8 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors( TFLITE_DCHECK(subgraph != nullptr); size_t alloc_count = subgraph->tensors()->size(); - TfLiteEvalTensor* tensors = - reinterpret_cast(memory_allocator_->AllocateFromTail( + TfLiteEvalTensor* tensors = reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( sizeof(TfLiteEvalTensor) * alloc_count, alignof(TfLiteEvalTensor))); if (tensors == nullptr) { TF_LITE_REPORT_ERROR( @@ -808,8 +682,8 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors( for (size_t i = 0; i < alloc_count; ++i) { TfLiteStatus status = internal::InitializeTfLiteEvalTensorFromFlatbuffer( - memory_allocator_, *subgraph->tensors()->Get(i), model->buffers(), - error_reporter_, &tensors[i]); + *subgraph->tensors()->Get(i), model->buffers(), error_reporter_, + &tensors[i]); if (status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d", i); @@ -820,6 +694,7 @@ TfLiteStatus MicroAllocator::AllocateTfLiteEvalTensors( } return kTfLiteOk; } + TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph, TfLiteEvalTensor* eval_tensors) { for (size_t i = 0; i < subgraph->tensors()->size(); ++i) { @@ -829,8 +704,9 @@ TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph, TF_LITE_ENSURE_STATUS( TfLiteEvalTensorByteLength(&eval_tensors[i], &buffer_size)); - eval_tensors[i].data.data = memory_allocator_->AllocateFromTail( - buffer_size, MicroArenaBufferAlignment()); + eval_tensors[i].data.data = + persistent_buffer_allocator_->AllocatePersistentBuffer( + buffer_size, MicroArenaBufferAlignment()); if (eval_tensors[i].data.data == nullptr) { TF_LITE_REPORT_ERROR(error_reporter_, @@ -844,8 +720,9 @@ TfLiteStatus MicroAllocator::AllocateVariables(const SubGraph* subgraph, } TfLiteTensor* MicroAllocator::AllocatePersistentTfLiteTensorInternal() { - return reinterpret_cast(memory_allocator_->AllocateFromTail( - sizeof(TfLiteTensor), alignof(TfLiteTensor))); + return reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(TfLiteTensor), alignof(TfLiteTensor))); } TfLiteStatus MicroAllocator::PopulateTfLiteTensorFromFlatbuffer( @@ -855,7 +732,8 @@ TfLiteStatus MicroAllocator::PopulateTfLiteTensorFromFlatbuffer( // allocations in the tail can be recorded. Once the interpreter has APIs for // accessing buffers on TfLiteEvalTensor this method can be dropped. 
return internal::InitializeTfLiteTensorFromFlatbuffer( - memory_allocator_, allocate_temp, + persistent_buffer_allocator_, non_persistent_buffer_allocator_, + allocate_temp, *model->subgraphs()->Get(subgraph_idx)->tensors()->Get(tensor_index), model->buffers(), error_reporter_, tensor); } @@ -865,8 +743,8 @@ ErrorReporter* MicroAllocator::error_reporter() const { } TfLiteStatus MicroAllocator::CommitStaticMemoryPlan( - const Model* model, TfLiteEvalTensor* eval_tensors, - ScratchBufferHandle* scratch_buffer_handles, int subgraph_idx) { + const Model* model, SubgraphAllocations* allocations, + ScratchBufferHandle* scratch_buffer_handles) { size_t head_usage = 0; // Create static memory plan // 1. Calculate AllocationInfo to know the lifetime of each tensor/buffer. @@ -878,69 +756,52 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan( // allocated from the temp section and cleaned up at the bottom of this // function. - const SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx); - size_t allocation_info_count = - subgraph->tensors()->size() + scratch_buffer_request_count_; - size_t bytes = sizeof(AllocationInfo) * allocation_info_count; - - // Allocate an array of AllocationInfo structs from the temp section. This - // struct will be used by AllocationInfoBuilder to find buffer usage. - AllocationInfo* allocation_info = reinterpret_cast( - memory_allocator_->AllocateTemp(bytes, alignof(AllocationInfo))); - if (allocation_info == nullptr) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Failed to allocate memory for allocation_info, %d bytes required", - bytes); - return kTfLiteError; - } - // Use the AllocationInfoBuilder class to help determine where buffers are // used in the subgraph. - AllocationInfoBuilder builder(allocation_info, subgraph->tensors()->size(), - scratch_buffer_request_count_, error_reporter_); + AllocationInfoBuilder builder(model, non_persistent_buffer_allocator_, + error_reporter_); + TF_LITE_ENSURE_STATUS( + builder.CreateAllocationInfo(scratch_buffer_request_count_)); const int32_t* offline_planner_offsets = nullptr; TF_LITE_ENSURE_STATUS( - builder.GetOfflinePlannedOffsets(model, &offline_planner_offsets)); + builder.GetOfflinePlannedOffsets(&offline_planner_offsets)); TF_LITE_ENSURE_STATUS( - builder.AddTensors(subgraph, offline_planner_offsets, eval_tensors)); + builder.InitializeAllocationInfo(offline_planner_offsets, allocations)); internal::ScratchBufferRequest* scratch_buffer_requests = GetScratchBufferRequests(); - - TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_requests, - scratch_buffer_handles)); + TF_LITE_ENSURE_STATUS(builder.MarkAllocationLifetimes( + 0, scratch_buffer_requests, scratch_buffer_handles, allocations)); + int allocation_info_count = builder.AllocationCount(); + AllocationInfo* allocation_info = builder.Finish(); // Remaining arena size that memory planner can use for calculating offsets. 
size_t remaining_arena_size = - memory_allocator_->GetAvailableMemory(MicroArenaBufferAlignment()); - uint8_t* planner_arena = memory_allocator_->AllocateTemp( + non_persistent_buffer_allocator_->GetAvailableMemory( + MicroArenaBufferAlignment()); + uint8_t* planner_arena = non_persistent_buffer_allocator_->AllocateTemp( remaining_arena_size, MicroArenaBufferAlignment()); TF_LITE_ENSURE(error_reporter_, planner_arena != nullptr); memory_planner_->Init(planner_arena, remaining_arena_size); TF_LITE_ENSURE_STATUS(CreatePlan(error_reporter_, memory_planner_, allocation_info, allocation_info_count)); - // Reset all temp allocations used above: - memory_allocator_->ResetTempAllocations(); - - size_t actual_available_arena_size = - memory_allocator_->GetAvailableMemory(MicroArenaBufferAlignment()); - - // Make sure we have enough arena size. - if (memory_planner_->GetMaximumMemorySize() > actual_available_arena_size) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Arena size is too small for all buffers. Needed %u but only " - "%u was available.", - memory_planner_->GetMaximumMemorySize(), actual_available_arena_size); - return kTfLiteError; - } // Commit the plan. - TF_LITE_ENSURE_STATUS(CommitPlan(error_reporter_, memory_planner_, - memory_allocator_->GetHeadBuffer(), - allocation_info, allocation_info_count)); + TF_LITE_ENSURE_STATUS( + CommitPlan(error_reporter_, memory_planner_, + non_persistent_buffer_allocator_->GetOverlayMemoryAddress(), + allocation_info, allocation_info_count)); + + // Reset all temp allocations used above: + builder.FreeAllocationInfo(); + non_persistent_buffer_allocator_->DeallocateTemp(planner_arena); + TF_LITE_ENSURE_STATUS( + non_persistent_buffer_allocator_->ResetTempAllocations()); + TF_LITE_ENSURE_STATUS( + non_persistent_buffer_allocator_->DeallocateResizableBuffer( + scratch_buffer_head_)); + #ifdef TF_LITE_SHOW_MEMORY_USE memory_planner_->PrintMemoryPlan(); #endif @@ -958,8 +819,9 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan( // The head is used for storing scratch buffer allocations before finalizing a // memory plan in this function. Ensure that the head is set to the largest // memory plan sent through the allocator: - TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize( - max_head_buffer_usage_, MicroArenaBufferAlignment())); + TF_LITE_ENSURE_STATUS( + non_persistent_buffer_allocator_->ReserveNonPersistentOverlayMemory( + max_head_buffer_usage_, MicroArenaBufferAlignment())); return kTfLiteOk; } @@ -975,7 +837,7 @@ TfLiteStatus MicroAllocator::AllocateScratchBufferHandles( // Allocate a consecutive block of memory store the scratch buffer handles. // This alignment ensures quick lookup during inference time for the model: *scratch_buffer_handles = reinterpret_cast( - memory_allocator_->AllocateFromTail( + persistent_buffer_allocator_->AllocatePersistentBuffer( sizeof(ScratchBufferHandle) * handle_count, alignof(ScratchBufferHandle))); @@ -990,17 +852,20 @@ TfLiteStatus MicroAllocator::InitScratchBufferData() { // All requests will be stored in the head section. Each kernel is allowed at // most kMaxScratchBuffersPerOp requests. 
Adjust the head to reserve at most // that many requests to begin: - TF_LITE_ENSURE_STATUS(memory_allocator_->SetHeadBufferSize( - sizeof(internal::ScratchBufferRequest) * kMaxScratchBuffersPerOp, - alignof(internal::ScratchBufferRequest))); + scratch_buffer_head_ = + non_persistent_buffer_allocator_->AllocateResizableBuffer( + sizeof(internal::ScratchBufferRequest) * kMaxScratchBuffersPerOp, + alignof(internal::ScratchBufferRequest)); + if (scratch_buffer_head_ == nullptr) { + return kTfLiteError; + } return kTfLiteOk; } internal::ScratchBufferRequest* MicroAllocator::GetScratchBufferRequests() { - return reinterpret_cast( - AlignPointerUp(memory_allocator_->GetHeadBuffer(), - alignof(internal::ScratchBufferRequest))); + return reinterpret_cast(AlignPointerUp( + scratch_buffer_head_, alignof(internal::ScratchBufferRequest))); } BuiltinDataAllocator* MicroAllocator::GetBuiltinDataAllocator() { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h index f76fe29f..d2967c21 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_allocator.h @@ -38,8 +38,9 @@ namespace internal { // TODO(b/162311891): Drop this method when the interpreter has an API for // returning buffers on TfLiteEvalTensor. TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( - SimpleMemoryAllocator* allocator, bool allocate_temp, - const tflite::Tensor& flatbuffer_tensor, + IPersistentBufferAllocator* persistent_buffer_allocator, + INonPersistentBufferAllocator* non_persistent_buffer_allocator, + bool allocate_temp, const tflite::Tensor& flatbuffer_tensor, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result); @@ -61,6 +62,7 @@ typedef struct { // determine the lifetime of the buffer. In AllocationInfo, this buffer will // have `before` = node_idx and `after` = node_idx. int node_idx; + int subgraph_idx; } ScratchBufferRequest; } // namespace internal @@ -185,10 +187,16 @@ class MicroAllocator { const Model* model, const SubgraphAllocations* subgraph_allocations, int tensor_index, int subgraph_index); + virtual void DeallocateTempTfLiteTensor(TfLiteTensor*); + // Resets all temporary allocations. This method should be called after a // chain of temp allocations (e.g. chain of TfLiteTensor objects via // AllocateTfLiteTensor()). - virtual void ResetTempAllocations(); + virtual TfLiteStatus ResetTempAllocations(); + + // Returns true if all temporary buffers including temp TfLiteTensor are + // already deallocated. + virtual bool IsAllTempDeallocated(); // Allocates persistent buffer which has the same life time as the allocator. // The memory is immediately available and is allocated from the tail of the @@ -260,8 +268,8 @@ class MicroAllocator { // ScratchBufferHandle structs that will point to allocated buffers also in // the head section. virtual TfLiteStatus CommitStaticMemoryPlan( - const Model* model, TfLiteEvalTensor* eval_tensors, - ScratchBufferHandle* scratch_buffer_handles, int subgraph_idx); + const Model* model, SubgraphAllocations* allocations, + ScratchBufferHandle* scratch_buffer_handles); // Allocates an array of ScratchBufferHandle structs in the tail section for a // given number of handles. @@ -278,7 +286,8 @@ class MicroAllocator { internal::ScratchBufferRequest* GetScratchBufferRequests(); // A simple memory allocator that always allocate from the arena tail or head. 
- SimpleMemoryAllocator* memory_allocator_; + INonPersistentBufferAllocator* non_persistent_buffer_allocator_; + IPersistentBufferAllocator* persistent_buffer_allocator_; // Allocator used to allocate persistent builtin data. BuiltinDataAllocator* builtin_data_allocator_; @@ -293,6 +302,9 @@ class MicroAllocator { // section when a model is allocating. size_t scratch_buffer_request_count_ = 0; + // Holds ScratchBufferRequest when a model is allocating + uint8_t* scratch_buffer_head_ = nullptr; + // Holds the byte length of the memory plan with the largest head usage. Used // to ensure that multi-tenant allocations can share the head for buffers. size_t max_head_buffer_usage_ = 0; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc new file mode 100644 index 00000000..1526b976 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.cc @@ -0,0 +1,119 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/micro_context.h" + +#include +#include +#include + +#include "tensorflow/lite/micro/micro_error_reporter.h" + +namespace tflite { +MicroContext::MicroContext(MicroAllocator* allocator, const Model* model, + MicroGraph* graph) + : allocator_(*allocator), graph_(*graph), model_(model) {} + +MicroContext::~MicroContext() {} + +void* MicroContext::AllocatePersistentBuffer(size_t bytes) { + return allocator_.AllocatePersistentBuffer(bytes); +} + +TfLiteStatus MicroContext::RequestScratchBufferInArena(size_t bytes, + int* buffer_idx) { + return allocator_.RequestScratchBufferInArena( + bytes, graph_.GetCurrentSubgraphIndex(), buffer_idx); +} + +void* MicroContext::GetScratchBuffer(int buffer_idx) { + ScratchBufferHandle* handle = scratch_buffer_handles_ + buffer_idx; + return handle->data; +} + +TfLiteTensor* MicroContext::AllocateTempTfLiteTensor(int tensor_idx) { + return allocator_.AllocateTempTfLiteTensor(model_, graph_.GetAllocations(), + tensor_idx, + graph_.GetCurrentSubgraphIndex()); +} + +int MicroContext::GetTensorIndex(int index, int max_size, + const int* tensor_indices) { + if (index >= 0 && index < max_size) { + const int tensor_index = tensor_indices[index]; + if (tensor_index != kTfLiteOptionalTensor) { + return tensor_index; + } + } + return -1; +} + +TfLiteTensor* MicroContext::AllocateTempInputTensor(const TfLiteNode* node, + int index) { + const int tensor_index = + GetTensorIndex(index, node->inputs->size, node->inputs->data); + if (tensor_index < 0) { + return nullptr; + } + return AllocateTempTfLiteTensor(tensor_index); +} + +TfLiteTensor* MicroContext::AllocateTempOutputTensor(const TfLiteNode* node, + int index) { + const int tensor_index = + GetTensorIndex(index, node->outputs->size, node->outputs->data); + if (tensor_index < 0) { + return nullptr; + } + return AllocateTempTfLiteTensor(tensor_index); 
+} + +void MicroContext::DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { + return allocator_.DeallocateTempTfLiteTensor(tensor); +} + +TfLiteEvalTensor* MicroContext::GetEvalTensor(int tensor_idx) { + return &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()] + .tensors[tensor_idx]; +} + +void MicroContext::SetScratchBufferHandles( + ScratchBufferHandle* scratch_buffer_handles) { + scratch_buffer_handles_ = scratch_buffer_handles; +} + +TfLiteStatus MicroContext::set_external_context( + void* external_context_payload) { + if (external_context_payload == nullptr || + external_context_payload_ != nullptr) { + MicroPrintf( + "Attempting to set external context to %x but it was %x already", + external_context_payload, external_context_payload_); + return kTfLiteError; + } + + external_context_payload_ = external_context_payload; + return kTfLiteOk; +} + +void MicroContextReportOpError(struct TfLiteContext* context, + const char* format, ...) { + va_list args; + va_start(args, format); + GetMicroErrorReporter()->Report(format, args); + va_end(args); +} + +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h new file mode 100644 index 00000000..1db2575e --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_context.h @@ -0,0 +1,154 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_ + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_graph.h" + +namespace tflite { +// MicroContext is eventually going to become the API between TFLM and the +// kernels, replacing all the functions in TfLiteContext. The end state is code +// kernels to have code like: +// +// MicroContext* micro_context = GetMicroContext(context); +// micro_context-> +class MicroContext { + public: + // Does not take any ownership, and all pointers must refer to valid objects + // that outlive the one constructed. + explicit MicroContext(MicroAllocator* allocator, const Model* model, + MicroGraph* graph); + virtual ~MicroContext(); + + // Allocate persistent buffer which has the same life time as the interpreter. + // Returns nullptr on failure. + // The memory is allocated from the tail. + // This method is only available in Init or Prepare stage. + // Virtual so that it can be faked for kernel tests. + virtual void* AllocatePersistentBuffer(size_t bytes); + + // Request a scratch buffer in the arena through static memory planning. + // This method is only available in Prepare stage and the buffer is allocated + // by the interpreter between Prepare and Eval stage. In Eval stage, + // GetScratchBuffer API can be used to fetch the address. + // Virtual so that it can be faked for kernel tests. 
+  virtual TfLiteStatus RequestScratchBufferInArena(size_t bytes,
+                                                   int* buffer_idx);
+
+  // Get the scratch buffer pointer.
+  // This method is only available in Eval stage.
+  // Virtual so that it can be faked for kernel tests.
+  virtual void* GetScratchBuffer(int buffer_idx);
+
+  // Returns a temporary TfLiteTensor struct for a given index.
+  // Virtual so that it can be faked for kernel tests.
+  virtual TfLiteTensor* AllocateTempTfLiteTensor(int tensor_idx);
+
+  // Returns a temporary TfLiteTensor struct for the specified input tensor of
+  // a given node. This is the recommended API over the deprecated
+  // GetInput/GetInputSafe to get a temp input tensor. The returned tensor
+  // shall be freed via calling DeallocateTempTfLiteTensor.
+  virtual TfLiteTensor* AllocateTempInputTensor(const TfLiteNode* node,
+                                                int index);
+
+  // Returns a temporary TfLiteTensor struct for the specified output tensor of
+  // a given node. This is the recommended API over the deprecated
+  // GetOutput/GetOutputSafe to get a temp output tensor. The returned tensor
+  // shall be freed via calling DeallocateTempTfLiteTensor.
+  virtual TfLiteTensor* AllocateTempOutputTensor(const TfLiteNode* node,
+                                                 int index);
+
+  // Deallocates a temp TfLiteTensor.
+  // Virtual so that it can be faked for kernel tests.
+  virtual void DeallocateTempTfLiteTensor(TfLiteTensor* tensor);
+
+  // Returns a TfLiteEvalTensor struct for a given index.
+  // Virtual so that it can be faked for kernel tests.
+  virtual TfLiteEvalTensor* GetEvalTensor(int tensor_idx);
+
+  // Does not take ownership of the pointer, and the pointer must refer to a
+  // valid object that outlives this class instance.
+  // This can only be called once to set one external context.
+  TfLiteStatus set_external_context(void* external_context_payload);
+
+  void* external_context() { return external_context_payload_; }
+
+  MicroGraph& graph() { return graph_; }
+
+  // Sets the pointer to a list of ScratchBufferHandle instances.
+  // Not part of the API between TFLM and kernels. Primarily used by the
+  // framework for housekeeping in MicroContext.
+  void SetScratchBufferHandles(ScratchBufferHandle* scratch_buffer_handles);
+
+ private:
+  // Return the tensor index as tensor_indices[index]. tensor_indices is of
+  // max_size. Return -1 if index is not in the valid range of tensor_indices.
+  int GetTensorIndex(int index, int max_size, const int* tensor_indices);
+
+  MicroAllocator& allocator_;
+  MicroGraph& graph_;
+  const Model* model_;
+
+  ScratchBufferHandle* scratch_buffer_handles_ = nullptr;
+  void* external_context_payload_ = nullptr;
+
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+inline MicroContext* GetMicroContext(const struct TfLiteContext* context) {
+  return reinterpret_cast<MicroContext*>(context->impl_);
+}
+
+// Deprecated API. Prefer using the MicroContext API directly from the
+// kernels.
+// TODO(b/213010668): migrate all existing kernels to use MicroContext, delete
+// these functions, and remove corresponding members from the TfLiteContext
+// struct for TFLM.
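For kernels that follow the comment above and call MicroContext directly, the scratch-buffer flow described in the class comments (request during Prepare, fetch during Eval) looks roughly like the sketch below. This is illustrative only and not code from the patch; the OpData struct, its scratch_index field, and the 256-byte size are made-up placeholders.

// Prepare stage: reserve arena space; only the returned index is kept.
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  MicroContext* micro_context = GetMicroContext(context);
  OpData* data = static_cast<OpData*>(node->user_data);
  TF_LITE_ENSURE_OK(context, micro_context->RequestScratchBufferInArena(
                                 /*bytes=*/256, &data->scratch_index));
  return kTfLiteOk;
}

// Eval stage: the planned buffer is fetched by index.
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = static_cast<OpData*>(node->user_data);
  void* scratch =
      GetMicroContext(context)->GetScratchBuffer(data->scratch_index);
  // ... use `scratch` as working memory for this invocation ...
  return kTfLiteOk;
}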
+inline void* MicroContextAllocatePersistentBuffer(TfLiteContext* ctx, + size_t bytes) { + return GetMicroContext(ctx)->AllocatePersistentBuffer(bytes); +} +inline TfLiteStatus MicroContextRequestScratchBufferInArena(TfLiteContext* ctx, + size_t bytes, + int* buffer_idx) { + return GetMicroContext(ctx)->RequestScratchBufferInArena(bytes, buffer_idx); +} +inline void* MicroContextGetScratchBuffer(TfLiteContext* ctx, int buffer_idx) { + return GetMicroContext(ctx)->GetScratchBuffer(buffer_idx); +} +inline TfLiteTensor* MicroContextGetTensor(const struct TfLiteContext* context, + int tensor_idx) { + return GetMicroContext(context)->AllocateTempTfLiteTensor(tensor_idx); +} +inline TfLiteEvalTensor* MicroContextGetEvalTensor( + const struct TfLiteContext* context, int tensor_idx) { + return GetMicroContext(context)->GetEvalTensor(tensor_idx); +} +inline TfLiteExternalContext* MicroContextGetExternalContext( + TfLiteContext* context, TfLiteExternalContextType unused) { + return reinterpret_cast( + GetMicroContext(context)->external_context()); +} + +// Requests that an error be reported with format string msg. +void MicroContextReportOpError(struct TfLiteContext* context, + const char* format, ...); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_CONTEXT_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc index 0abe0173..d9b2176e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_graph.cc @@ -209,6 +209,9 @@ TfLiteStatus MicroGraph::ResetVariableTensors() { } } } + if (resource_variables_ != nullptr) { + resource_variables_->ResetAll(); + } return kTfLiteOk; } diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc index 3f8c44ad..f726a5f3 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.cc @@ -51,7 +51,8 @@ MicroInterpreter::MicroInterpreter(const Model* model, tensors_allocated_(false), initialization_status_(kTfLiteError), input_tensors_(nullptr), - output_tensors_(nullptr) { + output_tensors_(nullptr), + micro_context_(&allocator_, model_, &graph_) { Init(profiler); } @@ -69,7 +70,8 @@ MicroInterpreter::MicroInterpreter(const Model* model, tensors_allocated_(false), initialization_status_(kTfLiteError), input_tensors_(nullptr), - output_tensors_(nullptr) { + output_tensors_(nullptr), + micro_context_(&allocator_, model_, &graph_) { Init(profiler); } @@ -80,12 +82,10 @@ MicroInterpreter::~MicroInterpreter() { } void MicroInterpreter::Init(MicroProfiler* profiler) { - context_.impl_ = static_cast(this); - context_.ReportError = ReportOpError; - context_.GetTensor = GetTensor; - context_.ReportError = ReportOpError; - context_.GetTensor = GetTensor; - context_.GetEvalTensor = GetEvalTensor; + context_.impl_ = static_cast(µ_context_); + context_.ReportError = MicroContextReportOpError; + context_.GetTensor = MicroContextGetTensor; + context_.GetEvalTensor = MicroContextGetEvalTensor; context_.profiler = profiler; initialization_status_ = kTfLiteOk; @@ -200,18 +200,18 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TF_LITE_ENSURE_STATUS(PrepareNodeAndRegistrationDataFromFlatbuffer()); // Only allow AllocatePersistentBuffer in Init stage. 
- context_.AllocatePersistentBuffer = AllocatePersistentBuffer; + context_.AllocatePersistentBuffer = MicroContextAllocatePersistentBuffer; context_.RequestScratchBufferInArena = nullptr; context_.GetScratchBuffer = nullptr; - context_.GetExecutionPlan = GetGraph; context_.GetExternalContext = nullptr; TF_LITE_ENSURE_STATUS(graph_.InitSubgraphs()); // Both AllocatePersistentBuffer and RequestScratchBufferInArena is // available in Prepare stage. - context_.RequestScratchBufferInArena = RequestScratchBufferInArena; - // GetExternalContext become available in Prepare stage. - context_.GetExternalContext = GetExternalContext; + context_.RequestScratchBufferInArena = + MicroContextRequestScratchBufferInArena; + // external_context become available in Prepare stage. + context_.GetExternalContext = MicroContextGetExternalContext; TF_LITE_ENSURE_STATUS(graph_.PrepareSubgraphs()); @@ -219,12 +219,14 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { // allowed. Kernels can only fetch scratch buffers via GetScratchBuffer. context_.AllocatePersistentBuffer = nullptr; context_.RequestScratchBufferInArena = nullptr; - context_.GetScratchBuffer = GetScratchBuffer; + context_.GetScratchBuffer = MicroContextGetScratchBuffer; TF_LITE_ENSURE_OK(&context_, allocator_.FinishModelAllocation( model_, graph_.GetAllocations(), &scratch_buffer_handles_)); + micro_context_.SetScratchBufferHandles(scratch_buffer_handles_); + // TODO(b/162311891): Drop these allocations when the interpreter supports // handling buffers from TfLiteEvalTensor. input_tensors_ = @@ -320,97 +322,9 @@ TfLiteStatus MicroInterpreter::ResetVariableTensors() { return graph_.ResetVariableTensors(); } -void* MicroInterpreter::AllocatePersistentBuffer(TfLiteContext* ctx, - size_t bytes) { - return reinterpret_cast(ctx->impl_) - ->allocator_.AllocatePersistentBuffer(bytes); -} - -TfLiteStatus MicroInterpreter::RequestScratchBufferInArena(TfLiteContext* ctx, - size_t bytes, - int* buffer_idx) { - MicroInterpreter* interpreter = - reinterpret_cast(ctx->impl_); - return interpreter->allocator_.RequestScratchBufferInArena( - bytes, interpreter->graph_.GetCurrentSubgraphIndex(), buffer_idx); -} - -void* MicroInterpreter::GetScratchBuffer(TfLiteContext* ctx, int buffer_idx) { - MicroInterpreter* interpreter = - reinterpret_cast(ctx->impl_); - ScratchBufferHandle* handle = - interpreter->scratch_buffer_handles_ + buffer_idx; - return handle->data; -} - -void MicroInterpreter::ReportOpError(struct TfLiteContext* context, - const char* format, ...) 
{ -#ifndef TF_LITE_STRIP_ERROR_STRINGS - MicroInterpreter* interpreter = - static_cast(context->impl_); - va_list args; - va_start(args, format); - TF_LITE_REPORT_ERROR(interpreter->error_reporter_, format, args); - va_end(args); -#endif -} - -TfLiteTensor* MicroInterpreter::GetTensor(const struct TfLiteContext* context, - int tensor_idx) { - MicroInterpreter* interpreter = - static_cast(context->impl_); - return interpreter->allocator_.AllocateTempTfLiteTensor( - interpreter->model_, interpreter->graph_.GetAllocations(), tensor_idx, - interpreter->get_subgraph_index()); -} - -TfLiteEvalTensor* MicroInterpreter::GetEvalTensor( - const struct TfLiteContext* context, int tensor_idx) { - MicroInterpreter* interpreter = - reinterpret_cast(context->impl_); - return &interpreter->graph_ - .GetAllocations()[interpreter->get_subgraph_index()] - .tensors[tensor_idx]; -} - -TfLiteStatus MicroInterpreter::GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args) { - MicroInterpreter* interpreter = - reinterpret_cast(context->impl_); - *args = reinterpret_cast(&interpreter->graph_); - return kTfLiteOk; -} - TfLiteStatus MicroInterpreter::SetMicroExternalContext( void* external_context_payload) { - if (external_context_payload == nullptr || - external_context_payload_ != nullptr) { - MicroPrintf( - "Attempting to set external context to %x but it was %x already", - external_context_payload, external_context_payload_); - return kTfLiteError; - } - - external_context_payload_ = external_context_payload; - return kTfLiteOk; -} - -void* MicroInterpreter::GetMicroExternalContext() { - return external_context_payload_; -} - -// This callback is an implementation for TfLiteContext::GetExternalContext -// interface. -TfLiteExternalContext* MicroInterpreter::GetExternalContext( - TfLiteContext* context, TfLiteExternalContextType unused) { - // TODO(b/205754757): TfLiteExternalContextType is unused in TFLM. This - // function is only called by the framework as a way to conform to existing - // interface. Users should use GetMicroExternalContext api in kernel_util.h to - // get context and shall not directly use this function. - MicroInterpreter* interpreter = - reinterpret_cast(context->impl_); - return reinterpret_cast( - interpreter->GetMicroExternalContext()); + return micro_context_.set_external_context(external_context_payload); } } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h index 33123af6..aa409386 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_interpreter.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_context.h" #include "tensorflow/lite/micro/micro_graph.h" #include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/micro_profiler.h" @@ -79,10 +80,6 @@ class MicroInterpreter { // one external context. TfLiteStatus SetMicroExternalContext(void* external_context_payload); - // This function is used by the TfLiteContext::GetExternalContext() to get the - // external context. 
- void* GetMicroExternalContext(); - TfLiteTensor* input(size_t index); size_t inputs_size() const { return model_->subgraphs()->Get(0)->inputs()->size(); @@ -150,26 +147,6 @@ class MicroInterpreter { // Gets the current subgraph index used from within context methods. int get_subgraph_index() { return graph_.GetCurrentSubgraphIndex(); } - // Static functions that are bound to the TfLiteContext instance: - static void* AllocatePersistentBuffer(TfLiteContext* ctx, size_t bytes); - static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* ctx, - size_t bytes, - int* buffer_idx); - static void* GetScratchBuffer(TfLiteContext* ctx, int buffer_idx); - static void ReportOpError(struct TfLiteContext* context, const char* format, - ...); - static TfLiteTensor* GetTensor(const struct TfLiteContext* context, - int tensor_idx); - static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context, - int tensor_idx); - static TfLiteStatus GetGraph(struct TfLiteContext* context, - TfLiteIntArray** args); - - // This callback is an implementation for TfLiteContext::GetExternalContext - // interface. - static TfLiteExternalContext* GetExternalContext( - TfLiteContext* context, TfLiteExternalContextType unused); - const Model* model_; const MicroOpResolver& op_resolver_; ErrorReporter* error_reporter_; @@ -181,12 +158,13 @@ class MicroInterpreter { TfLiteStatus initialization_status_; ScratchBufferHandle* scratch_buffer_handles_ = nullptr; - void* external_context_payload_ = nullptr; // TODO(b/162311891): Clean these pointers up when this class supports buffers // from TfLiteEvalTensor. TfLiteTensor** input_tensors_; TfLiteTensor** output_tensors_; + + MicroContext micro_context_; }; } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h index 8784f29a..8676189d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -153,6 +153,16 @@ class MicroMutableOpResolver : public MicroOpResolver { Register_BATCH_TO_SPACE_ND(), ParseBatchToSpaceNd); } + TfLiteStatus AddBroadcastArgs() { + return AddBuiltin(BuiltinOperator_BROADCAST_ARGS, Register_BROADCAST_ARGS(), + ParseBroadcastArgs); + } + + TfLiteStatus AddBroadcastTo() { + return AddBuiltin(BuiltinOperator_BROADCAST_TO, Register_BROADCAST_TO(), + ParseBroadcastTo); + } + TfLiteStatus AddCallOnce() { return AddBuiltin(BuiltinOperator_CALL_ONCE, Register_CALL_ONCE(), ParseCallOnce); @@ -356,6 +366,11 @@ class MicroMutableOpResolver : public MicroOpResolver { tflite::Register_MAX_POOL_2D(), ParsePool); } + TfLiteStatus AddMirrorPad() { + return AddBuiltin(BuiltinOperator_MIRROR_PAD, tflite::Register_MIRROR_PAD(), + ParseMirrorPad); + } + TfLiteStatus AddMean() { return AddBuiltin(BuiltinOperator_MEAN, tflite::ops::micro::Register_MEAN(), ParseReducer); @@ -550,6 +565,10 @@ class MicroMutableOpResolver : public MicroOpResolver { ParseVarHandle); } + TfLiteStatus AddWhile() { + return AddBuiltin(BuiltinOperator_WHILE, Register_WHILE(), ParseWhile); + } + TfLiteStatus AddZerosLike() { return AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE(), ParseZerosLike); diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc index e7232f47..c4577773 100644 --- 
a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.cc @@ -124,6 +124,14 @@ TfLiteStatus MicroResourceVariables::Assign(int id, return kTfLiteOk; } +TfLiteStatus MicroResourceVariables::ResetAll() { + for (int i = 0; i < num_resource_variables_; i++) { + MicroResourceVariable variable = resource_variables_[i]; + memset(variable.resource_buffer, 0, variable.bytes); + } + return kTfLiteOk; +} + int MicroResourceVariables::FindId(const char* container, const char* shared_name) { for (int i = 0; i < num_resource_variables_; i++) { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h index 0bab8e13..e8df991c 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/micro_resource_variable.h @@ -51,6 +51,9 @@ class MicroResourceVariables { // in order to allocate the resource buffer. TfLiteStatus Assign(int id, const TfLiteEvalTensor* tensor); + // Zeros out all resource buffers. + TfLiteStatus ResetAll(); + private: int FindId(const char* container, const char* shared_name); diff --git a/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc b/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc index 951664b5..bfd7605d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/mock_micro_graph.cc @@ -27,12 +27,12 @@ MockMicroGraph::MockMicroGraph(SimpleMemoryAllocator* allocator) free_count_(0) { memset(invoke_counts_, 0, sizeof(invoke_counts_)); mock_tensor_ = - reinterpret_cast(allocator_->AllocateFromTail( + reinterpret_cast(allocator_->AllocatePersistentBuffer( sizeof(TfLiteEvalTensor), alignof(TfLiteEvalTensor))); int* dims_array = reinterpret_cast( - allocator_->AllocateFromTail(3 * sizeof(int), alignof(int))); + allocator_->AllocatePersistentBuffer(3 * sizeof(int), alignof(int))); float* data_array = reinterpret_cast( - allocator_->AllocateFromTail(2 * sizeof(float), alignof(float))); + allocator_->AllocatePersistentBuffer(2 * sizeof(float), alignof(float))); int dims[] = {2, 1, 2}; memcpy(dims_array, dims, 3 * sizeof(int)); mock_tensor_->dims = testing::IntArrayFromInts(dims_array); diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc index 349018d4..53b3806d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/recording_micro_allocator.cc @@ -59,12 +59,13 @@ RecordingMicroAllocator* RecordingMicroAllocator::Create( arena_size); TFLITE_DCHECK(simple_memory_allocator != nullptr); - uint8_t* memory_planner_buffer = simple_memory_allocator->AllocateFromTail( - sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner)); + uint8_t* memory_planner_buffer = + simple_memory_allocator->AllocatePersistentBuffer( + sizeof(GreedyMemoryPlanner), alignof(GreedyMemoryPlanner)); GreedyMemoryPlanner* memory_planner = new (memory_planner_buffer) GreedyMemoryPlanner(); - uint8_t* allocator_buffer = simple_memory_allocator->AllocateFromTail( + uint8_t* allocator_buffer = simple_memory_allocator->AllocatePersistentBuffer( sizeof(RecordingMicroAllocator), alignof(RecordingMicroAllocator)); 
RecordingMicroAllocator* allocator = new (allocator_buffer) RecordingMicroAllocator( @@ -108,11 +109,11 @@ void RecordingMicroAllocator::PrintAllocations() const { TF_LITE_REPORT_ERROR( error_reporter(), "[RecordingMicroAllocator] Arena allocation head %d bytes", - recording_memory_allocator_->GetHeadUsedBytes()); + recording_memory_allocator_->GetNonPersistentUsedBytes()); TF_LITE_REPORT_ERROR( error_reporter(), "[RecordingMicroAllocator] Arena allocation tail %d bytes", - recording_memory_allocator_->GetTailUsedBytes()); + recording_memory_allocator_->GetPersistentUsedBytes()); PrintRecordedAllocation(RecordedAllocationType::kTfLiteEvalTensorData, "TfLiteEvalTensor data", "allocations"); PrintRecordedAllocation(RecordedAllocationType::kPersistentTfLiteTensorData, diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc index ef30aca4..6d3e72bd 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.cc @@ -39,8 +39,8 @@ RecordingSimpleMemoryAllocator* RecordingSimpleMemoryAllocator::Create( RecordingSimpleMemoryAllocator(error_reporter, buffer_head, buffer_size); uint8_t* allocator_buffer = - tmp.AllocateFromTail(sizeof(RecordingSimpleMemoryAllocator), - alignof(RecordingSimpleMemoryAllocator)); + tmp.AllocatePersistentBuffer(sizeof(RecordingSimpleMemoryAllocator), + alignof(RecordingSimpleMemoryAllocator)); // Use the default copy constructor to populate internal states. return new (allocator_buffer) RecordingSimpleMemoryAllocator(tmp); } @@ -57,11 +57,11 @@ size_t RecordingSimpleMemoryAllocator::GetAllocatedCount() const { return alloc_count_; } -TfLiteStatus RecordingSimpleMemoryAllocator::SetHeadBufferSize( - size_t size, size_t alignment) { +TfLiteStatus RecordingSimpleMemoryAllocator::ResizeBuffer( + uint8_t* resizable_buf, size_t size, size_t alignment) { const uint8_t* previous_head = head(); TfLiteStatus status = - SimpleMemoryAllocator::SetHeadBufferSize(size, alignment); + SimpleMemoryAllocator::ResizeBuffer(resizable_buf, size, alignment); if (status == kTfLiteOk) { used_bytes_ += head() - previous_head; requested_head_bytes_ = size; @@ -69,10 +69,11 @@ TfLiteStatus RecordingSimpleMemoryAllocator::SetHeadBufferSize( return status; } -uint8_t* RecordingSimpleMemoryAllocator::AllocateFromTail(size_t size, - size_t alignment) { +uint8_t* RecordingSimpleMemoryAllocator::AllocatePersistentBuffer( + size_t size, size_t alignment) { const uint8_t* previous_tail = tail(); - uint8_t* result = SimpleMemoryAllocator::AllocateFromTail(size, alignment); + uint8_t* result = + SimpleMemoryAllocator::AllocatePersistentBuffer(size, alignment); if (result != nullptr) { used_bytes_ += previous_tail - tail(); requested_tail_bytes_ += size; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h index 3526716e..a251e940 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/recording_simple_memory_allocator.h @@ -47,8 +47,9 @@ class RecordingSimpleMemoryAllocator : public SimpleMemoryAllocator { // Returns the number of alloc calls from the head or tail. 
size_t GetAllocatedCount() const; - TfLiteStatus SetHeadBufferSize(size_t size, size_t alignment) override; - uint8_t* AllocateFromTail(size_t size, size_t alignment) override; + TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size, + size_t alignment) override; + uint8_t* AllocatePersistentBuffer(size_t size, size_t alignment) override; private: size_t requested_head_bytes_; diff --git a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc b/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc index 08b6789e..e5d87afb 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.cc @@ -19,10 +19,13 @@ limitations under the License. #include #include +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" namespace tflite { @@ -52,7 +55,7 @@ SimpleMemoryAllocator* SimpleMemoryAllocator::Create( // Allocate enough bytes from the buffer to create a SimpleMemoryAllocator. // The new instance will use the current adjusted tail buffer from the tmp // allocator instance. - uint8_t* allocator_buffer = tmp.AllocateFromTail( + uint8_t* allocator_buffer = tmp.AllocatePersistentBuffer( sizeof(SimpleMemoryAllocator), alignof(SimpleMemoryAllocator)); // Use the default copy constructor to populate internal states. return new (allocator_buffer) SimpleMemoryAllocator(tmp); @@ -60,13 +63,37 @@ SimpleMemoryAllocator* SimpleMemoryAllocator::Create( SimpleMemoryAllocator::~SimpleMemoryAllocator() {} -TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, - size_t alignment) { - if (head_ != temp_) { +uint8_t* SimpleMemoryAllocator::AllocateResizableBuffer(size_t size, + size_t alignment) { + // Only supports one resizable buffer, which starts at the buffer head. + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + if (ResizeBuffer(expect_resizable_buf, size, alignment) == kTfLiteOk) { + return expect_resizable_buf; + } + return nullptr; +} + +TfLiteStatus SimpleMemoryAllocator::DeallocateResizableBuffer( + uint8_t* resizable_buf) { + return ResizeBuffer(resizable_buf, 0, 1); +} + +TfLiteStatus SimpleMemoryAllocator::ReserveNonPersistentOverlayMemory( + size_t size, size_t alignment) { + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + return ResizeBuffer(expect_resizable_buf, size, alignment); +} + +TfLiteStatus SimpleMemoryAllocator::ResizeBuffer(uint8_t* resizable_buf, + size_t size, + size_t alignment) { + // Only supports one resizable buffer, which starts at the buffer head. + uint8_t* expect_resizable_buf = AlignPointerUp(buffer_head_, alignment); + if (head_ != temp_ || resizable_buf != expect_resizable_buf) { TF_LITE_REPORT_ERROR( error_reporter_, - "Internal error: SetHeadBufferSize() needs to be called " - "after ResetTempAllocations()."); + "Internal error: either buffer is not resizable or " + "ResetTempAllocations() is not called before ResizeBuffer()."); return kTfLiteError; } @@ -75,7 +102,7 @@ TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, if (available_memory < size) { TF_LITE_REPORT_ERROR( error_reporter_, - "Failed to set head size. 
Requested: %u, available %u, missing: %u", + "Failed to resize buffer. Requested: %u, available %u, missing: %u", size, available_memory, size - available_memory); return kTfLiteError; } @@ -85,8 +112,8 @@ TfLiteStatus SimpleMemoryAllocator::SetHeadBufferSize(size_t size, return kTfLiteOk; } -uint8_t* SimpleMemoryAllocator::AllocateFromTail(size_t size, - size_t alignment) { +uint8_t* SimpleMemoryAllocator::AllocatePersistentBuffer(size_t size, + size_t alignment) { uint8_t* const aligned_result = AlignPointerDown(tail_ - size, alignment); if (aligned_result < head_) { #ifndef TF_LITE_STRIP_ERROR_STRINGS @@ -113,18 +140,47 @@ uint8_t* SimpleMemoryAllocator::AllocateTemp(size_t size, size_t alignment) { return nullptr; } temp_ = aligned_result + size; + temp_buffer_ptr_check_sum_ ^= (reinterpret_cast(aligned_result)); + temp_buffer_count_++; return aligned_result; } -void SimpleMemoryAllocator::ResetTempAllocations() { temp_ = head_; } - -uint8_t* SimpleMemoryAllocator::GetHeadBuffer() const { return buffer_head_; } - -size_t SimpleMemoryAllocator::GetHeadUsedBytes() const { - return head_ - buffer_head_; +void SimpleMemoryAllocator::DeallocateTemp(uint8_t* temp_buf) { + temp_buffer_ptr_check_sum_ ^= (reinterpret_cast(temp_buf)); + temp_buffer_count_--; } -size_t SimpleMemoryAllocator::GetTailUsedBytes() const { +bool SimpleMemoryAllocator::IsAllTempDeallocated() { + if (temp_buffer_count_ != 0 || temp_buffer_ptr_check_sum_ != 0) { + MicroPrintf( + "Number of allocated temp buffers: %d. Checksum passing status: %d", + temp_buffer_count_, !temp_buffer_ptr_check_sum_); + return false; + } + return true; +} + +TfLiteStatus SimpleMemoryAllocator::ResetTempAllocations() { + // TODO(b/209453859): enable error check based on IsAllTempDeallocated after + // all AllocateTemp have been paird with DeallocateTemp + if (!IsAllTempDeallocated()) { + MicroPrintf( + "All temp buffers must be freed before calling ResetTempAllocations()"); + return kTfLiteError; + } + temp_ = head_; + return kTfLiteOk; +} + +uint8_t* SimpleMemoryAllocator::GetOverlayMemoryAddress() const { + return buffer_head_; +} + +size_t SimpleMemoryAllocator::GetNonPersistentUsedBytes() const { + return std::max(head_ - buffer_head_, temp_ - buffer_head_); +} + +size_t SimpleMemoryAllocator::GetPersistentUsedBytes() const { return buffer_tail_ - tail_; } @@ -135,7 +191,7 @@ size_t SimpleMemoryAllocator::GetAvailableMemory(size_t alignment) const { } size_t SimpleMemoryAllocator::GetUsedBytes() const { - return GetBufferSize() - (tail_ - temp_); + return GetPersistentUsedBytes() + GetNonPersistentUsedBytes(); } size_t SimpleMemoryAllocator::GetBufferSize() const { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h b/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h index 36ab80b3..d88c4a3d 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/simple_memory_allocator.h @@ -22,13 +22,15 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/ibuffer_allocator.h" namespace tflite { // TODO(petewarden): This allocator never frees up or reuses any memory, even // though we have enough information about lifetimes of the tensors to do so. // This makes it pretty wasteful, so we should use a more intelligent method. 
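
For orientation alongside these renames, a rough usage sketch of the reworked allocator surface follows (the arena, sizes and alignments are made-up placeholders): persistent allocations still come from the arena tail, a single resizable buffer replaces the old head-size API, and each `AllocateTemp()` is now expected to be paired with `DeallocateTemp()` so that `ResetTempAllocations()` can verify nothing is outstanding.

```cpp
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/simple_memory_allocator.h"

// Illustrative only; sizes and alignments are placeholders.
void ExampleArenaUsage() {
  static uint8_t arena[1024];
  tflite::SimpleMemoryAllocator* allocator =
      tflite::SimpleMemoryAllocator::Create(tflite::GetMicroErrorReporter(),
                                            arena, sizeof(arena));

  // Tail allocation (previously AllocateFromTail); lives for the arena lifetime.
  uint8_t* persistent = allocator->AllocatePersistentBuffer(64, 16);

  // The one resizable buffer at the head (previously SetHeadBufferSize).
  uint8_t* resizable = allocator->AllocateResizableBuffer(256, 16);

  // Temp allocations are now tracked by count and XOR checksum; each must be
  // handed back before ResetTempAllocations() succeeds.
  uint8_t* temp = allocator->AllocateTemp(32, 16);
  allocator->DeallocateTemp(temp);
  allocator->ResetTempAllocations();

  (void)persistent;
  (void)resizable;
}
```
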
-class SimpleMemoryAllocator { +class SimpleMemoryAllocator : public INonPersistentBufferAllocator, + public IPersistentBufferAllocator { public: // TODO(b/157615197): Cleanup constructors/destructor and use factory // functions. @@ -43,17 +45,33 @@ class SimpleMemoryAllocator { uint8_t* buffer_head, size_t buffer_size); - // Adjusts the head (lowest address and moving upwards) memory allocation to a - // given size. Calls to this method will also invalidate all temporary - // allocation values (it sets the location of temp space at the end of the - // head section). This call will fail if a chain of allocations through - // AllocateTemp() have not been cleaned up with a call to - // ResetTempAllocations(). - virtual TfLiteStatus SetHeadBufferSize(size_t size, size_t alignment); + // Resizes a buffer that is previously returned by the + // AllocateResizableBuffer. In current implementation, it Adjusts the head + // (lowest address and moving upwards) memory allocation to a given size. + // Calls to this method will also invalidate all temporary allocation values + // (it sets the location of temp space at the end of the head section). This + // call will fail if a chain of allocations through AllocateTemp() have not + // been cleaned up with a call to ResetTempAllocations(). + virtual TfLiteStatus ResizeBuffer(uint8_t* resizable_buf, size_t size, + size_t alignment) override; - // Allocates memory starting at the tail of the arena (highest address and - // moving downwards). - virtual uint8_t* AllocateFromTail(size_t size, size_t alignment); + // Returns a buffer that is resizable viable ResizeBuffer(). Only one + // resizable buffer is currently supported. + virtual uint8_t* AllocateResizableBuffer(size_t size, + size_t alignment) override; + + // Frees up the memory occupied by the resizable buffer + virtual TfLiteStatus DeallocateResizableBuffer( + uint8_t* resizable_buf) override; + + // Reserves the non-persistent memory that is planned by the memory planner. + virtual TfLiteStatus ReserveNonPersistentOverlayMemory( + size_t size, size_t alignment) override; + + // Allocates persistent memory starting at the tail of the arena (highest + // address and moving downwards). + virtual uint8_t* AllocatePersistentBuffer(size_t size, + size_t alignment) override; // Allocates a temporary buffer from the head of the arena (lowest address and // moving upwards) but does not update the actual head allocation size or @@ -63,25 +81,34 @@ class SimpleMemoryAllocator { // calls to AllocateTemp() must end with a call to ResetTempAllocations(). If // AllocateFromHead() is called before a call to ResetTempAllocations(), it // will fail with an error message. - virtual uint8_t* AllocateTemp(size_t size, size_t alignment); + virtual uint8_t* AllocateTemp(size_t size, size_t alignment) override; + + // Signals that a temporary buffer is no longer needed. This is currently for + // book-keeping purpose and the memory region are not immediately available + // for re-use. The deallocated memory region are only reclaimed after + // ResetTempAllocations is called as it is right now. + virtual void DeallocateTemp(uint8_t* buf) override; + + // Returns true if all temporary buffers are already deallocated. + virtual bool IsAllTempDeallocated() override; // Resets a chain of temporary allocations back to the current head of the // arena (lowest address). 
- virtual void ResetTempAllocations(); + virtual TfLiteStatus ResetTempAllocations() override; // Returns a pointer to the buffer currently assigned to the head section. // This buffer is set by calling SetHeadSize(). - uint8_t* GetHeadBuffer() const; + uint8_t* GetOverlayMemoryAddress() const override; // Returns the size of the head section in bytes. - size_t GetHeadUsedBytes() const; + size_t GetNonPersistentUsedBytes() const override; // Returns the size of all allocations in the tail section in bytes. - size_t GetTailUsedBytes() const; + size_t GetPersistentUsedBytes() const override; // Returns the number of bytes available with a given alignment. This number // takes in account any temporary allocations. - size_t GetAvailableMemory(size_t alignment) const; + size_t GetAvailableMemory(size_t alignment) const override; // Returns the number of used bytes in the allocator. This number takes in // account any temporary allocations. @@ -105,6 +132,17 @@ class SimpleMemoryAllocator { uint8_t* head_; uint8_t* tail_; uint8_t* temp_; + + // The combination of the checksum of outstanding temporary buffer pointers + // AND the count of outstanding temporary buffer provide a low cost mechanism + // to audit temporary buffers' allocation and deallocation. + // + // XOR Check sum for outstanding temp buffers. + // If all temp buffers are deallocated OR no temp buffers are allocated, + // temp_buffer_ptr_check_sum_ == nullptr. + intptr_t temp_buffer_ptr_check_sum_ = 0; + // Count of outstanding temp buffers. + int temp_buffer_count_ = 0; }; } // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc new file mode 100644 index 00000000..c89483e1 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.cc @@ -0,0 +1,113 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/test_helper_custom_ops.h" + +#include +#include +#include +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/schema/schema_generated.h" + +// TODO(b/170464050): Use TFLM test only version of schema_utils. 
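
The packer op implemented below is test-only; later in this patch it is registered with the test op resolver under the custom name "custom_packer_op". A minimal sketch of that registration (the helper function name is hypothetical):

```cpp
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/test_helper_custom_ops.h"

// Hypothetical helper mirroring the GetOpResolver() change later in this patch.
tflite::AllOpsResolver BuildTestOpResolver() {
  tflite::AllOpsResolver op_resolver;
  op_resolver.AddCustom("custom_packer_op",
                        tflite::testing::PackerOp::GetMutableRegistration());
  return op_resolver;
}
```
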
+ +namespace tflite { +namespace testing { + +const TfLiteRegistration* PackerOp::getRegistration() { + return GetMutableRegistration(); +} + +TfLiteRegistration* PackerOp::GetMutableRegistration() { + static TfLiteRegistration r; + r.init = Init; + r.prepare = Prepare; + r.invoke = Invoke; + r.free = Free; + return &r; +} + +void* PackerOp::Init(TfLiteContext* context, const char* buffer, + size_t length) { + freed_ = false; + // Do nothing. + return nullptr; +} + +void PackerOp::Free(TfLiteContext* context, void* buffer) { freed_ = true; } + +TfLiteStatus PackerOp::Prepare(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + +TfLiteStatus PackerOp::Invoke(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input1 != nullptr); + const int32_t* input1_data = input1->data.i32; + TF_LITE_ENSURE_EQ(context, input1->dims->size, 1); + const int32_t input1_len = input1->dims->data[0]; + + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, input2 != nullptr); + const int32_t* input2_data = input2->data.i32; + TF_LITE_ENSURE_EQ(context, input2->dims->size, 1); + const int32_t input2_len = input2->dims->data[0]; + + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); + int32_t* output_data = output->data.i32; + int32_t output_len = output->dims->data[0]; + + // Fill output with input: first with the first tensor, then with the second + // tensor up to the size of the output tensor. + int cnt = 0; + int i; + for (i = 0; i < input1_len && cnt < output_len; i++, cnt++) { + output_data[cnt] = input1_data[i]; + } + if (cnt >= output_len) { + return kTfLiteOk; + } + + for (i = 0; i < input2_len && cnt < output_len; i++, cnt++) { + output_data[cnt] = input2_data[i]; + } + if (cnt >= output_len) { + return kTfLiteOk; + } + + for (; cnt < output_len; cnt++) { + output_data[cnt] = 0; + } + return kTfLiteOk; +} + +bool PackerOp::freed_ = false; + +} // namespace testing +} // namespace tflite diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h new file mode 100644 index 00000000..b8c025a7 --- /dev/null +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helper_custom_ops.h @@ -0,0 +1,50 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ +#define TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ + +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/micro/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/portable_type_to_tflitetype.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace testing { + +class PackerOp { + public: + static const TfLiteRegistration* getRegistration(); + static TfLiteRegistration* GetMutableRegistration(); + static void* Init(TfLiteContext* context, const char* buffer, size_t length); + static void Free(TfLiteContext* context, void* buffer); + static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); + static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); + + private: + static bool freed_; +}; + +} // namespace testing +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_TEST_HELPER_CUSTOM_OPS_H_ diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc index 2a5700e6..2adea777 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.cc @@ -29,7 +29,10 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/memory_helpers.h" +#include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/micro/test_helper_custom_ops.h" #include "tensorflow/lite/schema/schema_generated.h" // TODO(b/170464050): Use TFLM test only version of schema_utils. @@ -40,7 +43,9 @@ namespace { class StackAllocator : public flatbuffers::Allocator { public: - StackAllocator() : data_(data_backing_), data_size_(0) {} + StackAllocator(size_t alignment) : data_size_(0) { + data_ = AlignPointerUp(data_backing_, alignment); + } uint8_t* allocate(size_t size) override { TFLITE_DCHECK((data_size_ + size) <= kStackAllocatorSize); @@ -52,10 +57,10 @@ class StackAllocator : public flatbuffers::Allocator { void deallocate(uint8_t* p, size_t) override {} - static StackAllocator& instance() { + static StackAllocator& instance(size_t alignment = 1) { // Avoid using true dynamic memory allocation to be portable to bare metal. 
static char inst_memory[sizeof(StackAllocator)]; - static StackAllocator* inst = new (inst_memory) StackAllocator; + static StackAllocator* inst = new (inst_memory) StackAllocator(alignment); return *inst; } @@ -73,7 +78,8 @@ flatbuffers::FlatBufferBuilder* BuilderInstance() { static char inst_memory[sizeof(flatbuffers::FlatBufferBuilder)]; static flatbuffers::FlatBufferBuilder* inst = new (inst_memory) flatbuffers::FlatBufferBuilder( - StackAllocator::kStackAllocatorSize, &StackAllocator::instance()); + StackAllocator::kStackAllocatorSize, + &StackAllocator::instance(MicroArenaBufferAlignment())); return inst; } @@ -903,6 +909,344 @@ const Model* BuildSimpleModelWithSubgraphsAndIf() { return model; } +const Model* BuildSimpleModelWithSubgraphsAndWhile() { + using flatbuffers::Offset; + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + + constexpr size_t buffers_size = 1; + const Offset buffers[buffers_size] = { + CreateBuffer(*builder), + }; + const int32_t data_tensor_shape[] = {1, 1}; + constexpr size_t while_tensors_size = 4; + constexpr size_t op_tensors_size = 3; + const Offset subgraph0_tensors[while_tensors_size] = { + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor0"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("output_tensor0"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("output_tensor1"), 0, false), + }; + const Offset subgraph1_tensors[op_tensors_size] = { + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor2"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_BOOL, 0, + builder->CreateString("condition_tensor"), 0, false), + }; + const Offset subgraph2_tensors[op_tensors_size] = { + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor0"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(data_tensor_shape, 1), + TensorType_FLOAT32, 0, + builder->CreateString("output_tensor0"), 0, false), + }; + + constexpr size_t inputs_size = 2; + const int32_t inputs[inputs_size] = {0, 1}; + constexpr size_t while_outputs_size = 2; + const int32_t while_outputs[while_outputs_size] = {2, 3}; + constexpr size_t cond_outputs_size = 1; + const int32_t cond_outputs[cond_outputs_size] = {2}; + constexpr size_t add_outputs_size = 1; + const int32_t add_outputs[add_outputs_size] = {2}; + constexpr size_t add_subgraph_outputs_size = 2; + const int32_t add_subgraph_outputs[add_subgraph_outputs_size] = {2, 1}; + constexpr size_t operators_size = 1; + const Offset subgraph0_operators[operators_size] = { + CreateOperator(*builder, 0, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(while_outputs, while_outputs_size), + BuiltinOptions_WhileOptions, + CreateWhileOptions(*builder, 
1, 2).Union()), + }; + const Offset subgraph1_operators[operators_size] = { + CreateOperator(*builder, 1, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(cond_outputs, cond_outputs_size), + BuiltinOptions_NONE), + }; + const Offset subgraph2_operators[operators_size] = { + CreateOperator(*builder, 2, builder->CreateVector(inputs, inputs_size), + builder->CreateVector(add_outputs, add_outputs_size), + BuiltinOptions_NONE), + }; + constexpr size_t subgraphs_size = 3; + const Offset subgraphs[subgraphs_size] = { + CreateSubGraph(*builder, builder->CreateVector(subgraph0_tensors, 4), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(while_outputs, while_outputs_size), + builder->CreateVector(subgraph0_operators, operators_size), + builder->CreateString("while_subgraph")), + CreateSubGraph(*builder, builder->CreateVector(subgraph1_tensors, 3), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(cond_outputs, cond_outputs_size), + builder->CreateVector(subgraph1_operators, operators_size), + builder->CreateString("cond_subgraph")), + CreateSubGraph(*builder, builder->CreateVector(subgraph2_tensors, 3), + builder->CreateVector(inputs, inputs_size), + builder->CreateVector(add_subgraph_outputs, + add_subgraph_outputs_size), + builder->CreateVector(subgraph2_operators, operators_size), + builder->CreateString("body_subgraph")), + }; + constexpr size_t operator_codes_size = 3; + const Offset operator_codes[operator_codes_size] = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_WHILE), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_LESS), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "multiple_inputs_op", + /*version=*/0, BuiltinOperator_ADD), + }; + const Offset model_offset = CreateModel( + *builder, 0, builder->CreateVector(operator_codes, operator_codes_size), + builder->CreateVector(subgraphs, subgraphs_size), + builder->CreateString("test_model"), + builder->CreateVector(buffers, buffers_size)); + FinishModelBuffer(*builder, model_offset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + return model; +} + +// Build a model with If and two subgraphs: two data tensors A1 of size 2, A2 of +// size 4 are first concatenated, then cut to a new tensor A3 of size 3; the new +// tensor A3 of size 3 is then concatenated with A2 tensor of size 4 to produce +// a final output tensor A4. This model is specially crafted to capture the +// corner case outlined in go/avoid-memory-corruption-in-if-operator. 
+// +// Subgraph0 +// A0(1) A2_0(4) A1_0(2) +// | | | ---+ +// v v v | +// +--------------+ | +// | IF | | +// +------+-------+ | +// | A3_0(3) | +// v | +// +--------------+ | +// | CUSTOM |<---+ +// +------+-------+ +// | +// v +// A4_0(8) +// +// Subgraph1/2 +// A1_1(2) A2_1(4) +// | | +// v v +// +---------------+ +// | CUSTOM | +// +-------+-------+ +// | +// v A3_1(3) +// +// And it leads to memory plan as below +// +// Subgraph0 Layout +// +// +// <------------A4_0 -------------> <----- A2_0-------> <----A3_0 ---> +// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ +// | | | | | | | | | 3 | 4 | 5 | 6 | | | | +// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+ +// +// +----+----+----+ +// | 1 | 2 | A0 | +// +----+----+----+ +// <---A1_0--> +// +// Subgraph 1 Layout +// +// +----+----+----+----+----+----+----+----+----+ +// | | | | | | | | | | +// +----+----+----+----+----+----+----+----+----+ +// +// +// <------A2_1 -------><----A3_1 ---><--A1_1---> +// +// +// A1_1 of subgraph 1 will overlap with A2_0 of subgraph 0. +// In a buggy implementation of IF, two overwrite may happen: +// 1. copying input from A1_0 to A1_1 overwrites A2_0 before A2_0 is copied to +// A2_1; thus subgraph 1 produce incorrect output. +// 2. copying output from A3_1 to A4_0 overwrites A1_0, which should remain +// intact so that it can be used by the OP after the IF operator in subgraph 0 +// + +const Model* BuildModelWithIfAndSubgraphInputTensorOverlap() { + using flatbuffers::Offset; + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + + constexpr TensorType kTensorType = TensorType_INT32; + constexpr int kBlockSize = + tflite::MicroArenaBufferAlignment() / sizeof(int32_t); + constexpr size_t kBuffersCount = 1; + const Offset buffers[kBuffersCount] = { + CreateBuffer(*builder), + }; + const int32_t kConditionTensorShape[] = {1}; + const int32_t kIfInput1TensorShape[] = {2 * kBlockSize}; + const int32_t kIfInput2TensorShape[] = {4 * kBlockSize}; + const int32_t kIfOutputTensorShape[] = {3 * kBlockSize}; + const int32_t kFinalOutputTensorShape[] = {8 * kBlockSize}; + constexpr size_t kSubgraph0TensorsCount = 5; + const Offset kSubgraph0Tensors[kSubgraph0TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kConditionTensorShape, 1), + TensorType_BOOL, 0, + builder->CreateString("condition tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor1"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor2"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("if_output_tensor"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kFinalOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("final_output_tensor"), + 0, false), + }; + + // Subgraph 1 is the chosen path if condition tensor in IF is true. 
+ constexpr size_t kSubgraph1TensorsCount = 3; + const Offset kSubgraph1Tensors[kSubgraph1TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_input_tensor1"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_input_tensor2"), 0, false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, + builder->CreateString("subgraph1_output_tensor"), 0, false), + }; + + // Subgraph 2 is the chosen path if condition tensor in IF is false + constexpr size_t kSubgraph2TensorsCount = 3; + const Offset kSubgraph2Tensors[kSubgraph2TensorsCount] = { + CreateTensor(*builder, builder->CreateVector(kIfInput1TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor1"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfInput2TensorShape, 1), + kTensorType, 0, builder->CreateString("if_input_tensor2"), 0, + false), + CreateTensor(*builder, builder->CreateVector(kIfOutputTensorShape, 1), + kTensorType, 0, builder->CreateString("if_output_tensor"), 0, + false), + }; + + constexpr int kIfOpCodeIndex = 0; + constexpr int kCustomOpCodeIndex = 1; + + constexpr size_t kIfInputsCount = 3; + const int32_t kIfInputs[kIfInputsCount] = {0, 1, 2}; + constexpr size_t kOutputsCount = 1; + const int32_t kIfOutputs[kOutputsCount] = {3}; + constexpr size_t kOpAfterIfInputsCount = 2; + const int32_t kOpAfterIfInputs[kOpAfterIfInputsCount] = {3, 2}; + const int32_t kOpAfterIfOutputs[kOutputsCount] = {4}; + constexpr size_t kOperatorsCount = 2; + const Offset kSubgraph0Operators[kOperatorsCount] = { + CreateOperator(*builder, kIfOpCodeIndex, + builder->CreateVector(kIfInputs, kIfInputsCount), + builder->CreateVector(kIfOutputs, kOutputsCount), + BuiltinOptions_IfOptions, + CreateIfOptions(*builder, 1, 2).Union()), + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kOpAfterIfInputs, kOpAfterIfInputsCount), + builder->CreateVector(kOpAfterIfOutputs, kOutputsCount)), + }; + + constexpr size_t kSubgraph1InputsCount = 2; + const int32_t kSubgraph1Inputs[kSubgraph1InputsCount] = {0, 1}; + constexpr size_t kSubgraph1OutputsCount = 1; + const int32_t kSubgraph1Outputs[kSubgraph1OutputsCount] = {2}; + constexpr size_t kSubgraph1OperatorsCount = 1; + const Offset kSubgraph1Operators[kSubgraph1OperatorsCount] = { + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kSubgraph1Inputs, kSubgraph1InputsCount), + builder->CreateVector(kSubgraph1Outputs, kSubgraph1OutputsCount), + BuiltinOptions_NONE), + }; + + constexpr size_t kSubgraph2InputsCount = 2; + const int32_t kSubgraph2Inputs[kSubgraph2InputsCount] = {0, 1}; + constexpr size_t kSubgraph2OutputsCount = 1; + const int32_t kSubgraph2Outputs[kSubgraph2OutputsCount] = {2}; + constexpr size_t kSubgraph2OperatorsCount = 1; + const Offset kSubgraph2Operators[kSubgraph2OperatorsCount] = { + CreateOperator( + *builder, kCustomOpCodeIndex, + builder->CreateVector(kSubgraph2Inputs, kSubgraph2InputsCount), + builder->CreateVector(kSubgraph2Outputs, kSubgraph2OutputsCount), + BuiltinOptions_NONE), + }; + + constexpr size_t kSubgraphsCount = 3; + const Offset kSubgraphs[kSubgraphsCount] = { + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph0Tensors, kSubgraph0TensorsCount), + builder->CreateVector(kIfInputs, kIfInputsCount), + builder->CreateVector(kOpAfterIfOutputs, kOutputsCount), + 
builder->CreateVector(kSubgraph0Operators, kOperatorsCount), + builder->CreateString("if_subgraph")), + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph1Tensors, kSubgraph1TensorsCount), + builder->CreateVector(kSubgraph1Inputs, kSubgraph1InputsCount), + builder->CreateVector(kSubgraph1Outputs, kSubgraph1OutputsCount), + builder->CreateVector(kSubgraph1Operators, kSubgraph1OperatorsCount), + builder->CreateString("then_subgraph")), + CreateSubGraph( + *builder, + builder->CreateVector(kSubgraph2Tensors, kSubgraph2TensorsCount), + builder->CreateVector(kSubgraph2Inputs, kSubgraph2InputsCount), + builder->CreateVector(kSubgraph2Outputs, kSubgraph2OutputsCount), + builder->CreateVector(kSubgraph2Operators, kSubgraph2OperatorsCount), + builder->CreateString("else_subgraph")), + }; + + constexpr size_t kOperatorCodesCount = 2; + const Offset kOperatorCodes[kOperatorCodesCount] = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, "if", + /*version=*/0, BuiltinOperator_IF), + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "custom_packer_op", + /*version=*/0, BuiltinOperator_CUSTOM), + }; + const Offset kModelOffset = CreateModel( + *builder, 0, builder->CreateVector(kOperatorCodes, kOperatorCodesCount), + builder->CreateVector(kSubgraphs, kSubgraphsCount), + builder->CreateString("test_model"), + builder->CreateVector(buffers, kBuffersCount)); + FinishModelBuffer(*builder, kModelOffset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + return model; +} + // Mock model with one main subgraph containing a single CALL_ONCE op (with null // inputs and outputs) which invokes a second subgraph which has null inputs and // outputs. @@ -996,8 +1340,11 @@ TfLiteStatus SimpleStatefulOp::Prepare(TfLiteContext* context, OpData* data = reinterpret_cast(node->user_data); // Make sure that the input is in uint8_t with at least 1 data entry. 
- const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); + MicroContext* micro_context = GetMicroContext(context); + TfLiteTensor* input = + micro_context->AllocateTempInputTensor(node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + if (input->type != kTfLiteInt8) return kTfLiteError; if (NumElements(input->dims) == 0) return kTfLiteError; @@ -1010,6 +1357,7 @@ TfLiteStatus SimpleStatefulOp::Prepare(TfLiteContext* context, context->AllocatePersistentBuffer(context, sizeof(int))); *data->invoke_count = 0; + micro_context->DeallocateTempTfLiteTensor(input); return kTfLiteOk; } @@ -1018,9 +1366,10 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, OpData* data = reinterpret_cast(node->user_data); *data->invoke_count += 1; - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); - const uint8_t* input_data = GetTensorData(input); + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, kInputTensor); + TF_LITE_ENSURE(context, input != nullptr); + const uint8_t* input_data = input->data.uint8; int size = NumElements(input->dims); uint8_t* sorting_buffer = reinterpret_cast( @@ -1038,14 +1387,14 @@ TfLiteStatus SimpleStatefulOp::Invoke(TfLiteContext* context, } } - TfLiteTensor* median; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kMedianTensor, &median)); - uint8_t* median_data = GetTensorData(median); - TfLiteTensor* invoke_count; - TF_LITE_ENSURE_OK(context, - GetOutputSafe(context, node, kInvokeCount, &invoke_count)); - int32_t* invoke_count_data = GetTensorData(invoke_count); + TfLiteEvalTensor* median = + tflite::micro::GetEvalOutput(context, node, kMedianTensor); + TF_LITE_ENSURE(context, median != nullptr); + uint8_t* median_data = median->data.uint8; + TfLiteEvalTensor* invoke_count = + tflite::micro::GetEvalOutput(context, node, kInvokeCount); + TF_LITE_ENSURE(context, invoke_count != nullptr); + int32_t* invoke_count_data = invoke_count->data.i32; median_data[0] = sorting_buffer[size / 2]; invoke_count_data[0] = *data->invoke_count; @@ -1132,18 +1481,20 @@ TfLiteStatus MultipleInputs::Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus MultipleInputs::Invoke(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input != nullptr); const int32_t* input_data = input->data.i32; - const TfLiteTensor* input1; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &input1)); + const TfLiteEvalTensor* input1 = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, input1 != nullptr); const int32_t* input_data1 = input1->data.i32; - const TfLiteTensor* input2; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &input2)); + const TfLiteEvalTensor* input2 = + tflite::micro::GetEvalInput(context, node, 2); + TF_LITE_ENSURE(context, input2 != nullptr); const int32_t* input_data2 = input2->data.i32; - TfLiteTensor* output; - TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); int32_t* output_data = output->data.i32; output_data[0] = 0; // Catch output tensor sharing memory with an input tensor @@ -1195,6 +1546,7 @@ AllOpsResolver GetOpResolver() { 
op_resolver.AddCustom("multiple_inputs_op", MultipleInputs::GetMutableRegistration()); op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration()); + op_resolver.AddCustom("custom_packer_op", PackerOp::GetMutableRegistration()); return op_resolver; } @@ -1243,6 +1595,22 @@ const Model* GetSimpleModelWithSubgraphsAndIf() { return model; } +const Model* GetSimpleModelWithSubgraphsAndWhile() { + static Model* model = nullptr; + if (!model) { + model = const_cast(BuildSimpleModelWithSubgraphsAndWhile()); + } + return model; +} + +const Model* GetModelWithIfAndSubgraphInputTensorOverlap() { + static Model* model = nullptr; + if (!model) { + model = const_cast(BuildModelWithIfAndSubgraphInputTensorOverlap()); + } + return model; +} + const Model* GetSimpleModelWithNullInputsAndOutputs() { static Model* model = nullptr; if (!model) { diff --git a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h index 6ed7efd1..5441ce3e 100644 --- a/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h +++ b/code/components/tflite-lib/tensorflow/lite/micro/test_helpers.h @@ -157,6 +157,13 @@ const Model* GetSimpleStatefulModel(); // Returns a flatbuffer model with "if" and two subgraphs. const Model* GetSimpleModelWithSubgraphsAndIf(); +// Returns a flatbuffer model with "while" and three subgraphs. +const Model* GetSimpleModelWithSubgraphsAndWhile(); + +// Returns a flatbuffer model with "if" and two subgraphs and the input tensor 1 +// of "if" subgraph overlaps with the input tensor 2 of subgraph 1. +const Model* GetModelWithIfAndSubgraphInputTensorOverlap(); + // Returns a flatbuffer model with null subgraph/operator inputs and outputs. const Model* GetSimpleModelWithNullInputsAndOutputs(); diff --git a/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h b/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h index 83a0ac6c..52d7fdef 100644 --- a/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h +++ b/code/components/tflite-lib/tensorflow/lite/portable_type_to_tflitetype.h @@ -61,6 +61,7 @@ struct TfLiteTypeToType {}; // Specializations below MATCH_TYPE_AND_TFLITE_TYPE(int32_t, kTfLiteInt32); MATCH_TYPE_AND_TFLITE_TYPE(uint32_t, kTfLiteUInt32); MATCH_TYPE_AND_TFLITE_TYPE(int16_t, kTfLiteInt16); +MATCH_TYPE_AND_TFLITE_TYPE(uint16_t, kTfLiteUInt16); MATCH_TYPE_AND_TFLITE_TYPE(int64_t, kTfLiteInt64); MATCH_TYPE_AND_TFLITE_TYPE(float, kTfLiteFloat32); MATCH_TYPE_AND_TFLITE_TYPE(unsigned char, kTfLiteUInt8); diff --git a/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h b/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h index 3620145d..e5ce189f 100644 --- a/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h +++ b/code/components/tflite-lib/tensorflow/lite/schema/schema_generated.h @@ -394,6 +394,9 @@ struct BucketizeOptionsT; struct GeluOptions; struct GeluOptionsT; +struct DynamicUpdateSliceOptions; +struct DynamicUpdateSliceOptionsT; + struct OperatorCode; struct OperatorCodeT; @@ -435,11 +438,12 @@ enum TensorType { TensorType_RESOURCE = 13, TensorType_VARIANT = 14, TensorType_UINT32 = 15, + TensorType_UINT16 = 16, TensorType_MIN = TensorType_FLOAT32, - TensorType_MAX = TensorType_UINT32 + TensorType_MAX = TensorType_UINT16 }; -inline const TensorType (&EnumValuesTensorType())[16] { +inline const TensorType (&EnumValuesTensorType())[17] { static const TensorType values[] = { TensorType_FLOAT32, 
TensorType_FLOAT16, @@ -456,13 +460,14 @@ inline const TensorType (&EnumValuesTensorType())[16] { TensorType_UINT64, TensorType_RESOURCE, TensorType_VARIANT, - TensorType_UINT32 + TensorType_UINT32, + TensorType_UINT16 }; return values; } inline const char * const *EnumNamesTensorType() { - static const char * const names[17] = { + static const char * const names[18] = { "FLOAT32", "FLOAT16", "INT32", @@ -479,13 +484,14 @@ inline const char * const *EnumNamesTensorType() { "RESOURCE", "VARIANT", "UINT32", + "UINT16", nullptr }; return names; } inline const char *EnumNameTensorType(TensorType e) { - if (flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_UINT32)) return ""; + if (flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_UINT16)) return ""; const size_t index = static_cast(e); return EnumNamesTensorType()[index]; } @@ -868,11 +874,12 @@ enum BuiltinOperator { BuiltinOperator_RANDOM_UNIFORM = 148, BuiltinOperator_MULTINOMIAL = 149, BuiltinOperator_GELU = 150, + BuiltinOperator_DYNAMIC_UPDATE_SLICE = 151, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_GELU + BuiltinOperator_MAX = BuiltinOperator_DYNAMIC_UPDATE_SLICE }; -inline const BuiltinOperator (&EnumValuesBuiltinOperator())[151] { +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[152] { static const BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -1024,13 +1031,14 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[151] { BuiltinOperator_BUCKETIZE, BuiltinOperator_RANDOM_UNIFORM, BuiltinOperator_MULTINOMIAL, - BuiltinOperator_GELU + BuiltinOperator_GELU, + BuiltinOperator_DYNAMIC_UPDATE_SLICE }; return values; } inline const char * const *EnumNamesBuiltinOperator() { - static const char * const names[152] = { + static const char * const names[153] = { "ADD", "AVERAGE_POOL_2D", "CONCATENATION", @@ -1182,13 +1190,14 @@ inline const char * const *EnumNamesBuiltinOperator() { "RANDOM_UNIFORM", "MULTINOMIAL", "GELU", + "DYNAMIC_UPDATE_SLICE", nullptr }; return names; } inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { - if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_GELU)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOperator_ADD, BuiltinOperator_DYNAMIC_UPDATE_SLICE)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOperator()[index]; } @@ -1311,11 +1320,12 @@ enum BuiltinOptions { BuiltinOptions_RandomOptions = 114, BuiltinOptions_BucketizeOptions = 115, BuiltinOptions_GeluOptions = 116, + BuiltinOptions_DynamicUpdateSliceOptions = 117, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_GeluOptions + BuiltinOptions_MAX = BuiltinOptions_DynamicUpdateSliceOptions }; -inline const BuiltinOptions (&EnumValuesBuiltinOptions())[117] { +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[118] { static const BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -1433,13 +1443,14 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[117] { BuiltinOptions_AssignVariableOptions, BuiltinOptions_RandomOptions, BuiltinOptions_BucketizeOptions, - BuiltinOptions_GeluOptions + BuiltinOptions_GeluOptions, + BuiltinOptions_DynamicUpdateSliceOptions }; return values; } inline const char * const *EnumNamesBuiltinOptions() { - static const char * const names[118] = { + static const char * const names[119] = { "NONE", "Conv2DOptions", "DepthwiseConv2DOptions", @@ -1557,13 +1568,14 @@ inline const char * const 
*EnumNamesBuiltinOptions() { "RandomOptions", "BucketizeOptions", "GeluOptions", + "DynamicUpdateSliceOptions", nullptr }; return names; } inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { - if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_GeluOptions)) return ""; + if (flatbuffers::IsOutRange(e, BuiltinOptions_NONE, BuiltinOptions_DynamicUpdateSliceOptions)) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOptions()[index]; } @@ -2036,6 +2048,10 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_GeluOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_DynamicUpdateSliceOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -2996,6 +3012,14 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_GeluOptions ? reinterpret_cast(value) : nullptr; } + tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? + reinterpret_cast(value) : nullptr; + } + const tflite::DynamicUpdateSliceOptionsT *AsDynamicUpdateSliceOptions() const { + return type == BuiltinOptions_DynamicUpdateSliceOptions ? + reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -10595,6 +10619,46 @@ inline flatbuffers::Offset CreateGeluOptions( flatbuffers::Offset CreateGeluOptions(flatbuffers::FlatBufferBuilder &_fbb, const GeluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct DynamicUpdateSliceOptionsT : public flatbuffers::NativeTable { + typedef DynamicUpdateSliceOptions TableType; + DynamicUpdateSliceOptionsT() { + } +}; + +struct DynamicUpdateSliceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DynamicUpdateSliceOptionsT NativeTableType; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + DynamicUpdateSliceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(DynamicUpdateSliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DynamicUpdateSliceOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit DynamicUpdateSliceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DynamicUpdateSliceOptionsBuilder &operator=(const DynamicUpdateSliceOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDynamicUpdateSliceOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + DynamicUpdateSliceOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateDynamicUpdateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; int8_t deprecated_builtin_code; @@ -11093,6 +11157,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const tflite::GeluOptions 
*builtin_options_as_GeluOptions() const { return builtin_options_type() == tflite::BuiltinOptions_GeluOptions ? static_cast(builtin_options()) : nullptr; } + const tflite::DynamicUpdateSliceOptions *builtin_options_as_DynamicUpdateSliceOptions() const { + return builtin_options_type() == tflite::BuiltinOptions_DynamicUpdateSliceOptions ? static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -11593,6 +11660,10 @@ template<> inline const tflite::GeluOptions *Operator::builtin_options_as inline const tflite::DynamicUpdateSliceOptions *Operator::builtin_options_as() const { + return builtin_options_as_DynamicUpdateSliceOptions(); +} + struct OperatorBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -15679,6 +15750,29 @@ inline flatbuffers::Offset CreateGeluOptions(flatbuffers::FlatBuffe _approximate); } +inline DynamicUpdateSliceOptionsT *DynamicUpdateSliceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new DynamicUpdateSliceOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void DynamicUpdateSliceOptions::UnPackTo(DynamicUpdateSliceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset DynamicUpdateSliceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateDynamicUpdateSliceOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateDynamicUpdateSliceOptions(flatbuffers::FlatBufferBuilder &_fbb, const DynamicUpdateSliceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DynamicUpdateSliceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateDynamicUpdateSliceOptions( + _fbb); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new OperatorCodeT(); UnPackTo(_o, _resolver); @@ -16618,6 +16712,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return true; } } @@ -17100,6 +17198,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -17570,6 +17672,10 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateGeluOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_DynamicUpdateSliceOptions: { + auto ptr = reinterpret_cast(value); + return CreateDynamicUpdateSliceOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -18040,6 +18146,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL value = new tflite::GeluOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_DynamicUpdateSliceOptions: { + value = new 
tflite::DynamicUpdateSliceOptionsT(*reinterpret_cast(u.value));
+      break;
+    }
     default: break;
   }
 
@@ -18627,6 +18737,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_DynamicUpdateSliceOptions: {
+      auto ptr = reinterpret_cast(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/code/components/tflite-lib_20220417.zip b/code/components/tflite-lib_20220417.zip
new file mode 100644
index 00000000..fcc45d22
Binary files /dev/null and b/code/components/tflite-lib_20220417.zip differ
diff --git a/code/components/tfmicro.zip b/code/components/tfmicro.zip
deleted file mode 100644
index 5e57aa05..00000000
Binary files a/code/components/tfmicro.zip and /dev/null differ
diff --git a/code/main/version.cpp b/code/main/version.cpp
index 10eaaa2b..e19ce16a 100644
--- a/code/main/version.cpp
+++ b/code/main/version.cpp
@@ -1,4 +1,4 @@
-const char* GIT_REV="ccb4bd5";
+const char* GIT_REV="cce812f";
 const char* GIT_TAG="";
 const char* GIT_BRANCH="rolling";
-const char* BUILD_TIME="2022-04-17 19:48";
\ No newline at end of file
+const char* BUILD_TIME="2022-05-26 20:03";
\ No newline at end of file
diff --git a/code/platformio.ini b/code/platformio.ini
index 84ee163b..11935158 100644
--- a/code/platformio.ini
+++ b/code/platformio.ini
@@ -15,6 +15,7 @@ src_dir = main
 
 [env:esp32cam]
 ;platform = espressif32@2.1.0
+;platform = espressif32@3.5.0
 platform = espressif32
 board = esp32cam
 framework = espidf
diff --git a/code/sdkconfig.esp32cam b/code/sdkconfig.esp32cam
index 2433bd76..5437b6a8 100644
--- a/code/sdkconfig.esp32cam
+++ b/code/sdkconfig.esp32cam
@@ -140,6 +140,14 @@ CONFIG_EXAMPLE_CONNECT_IPV6_PREF_LOCAL_LINK=y
 # CONFIG_EXAMPLE_CONNECT_IPV6_PREF_UNIQUE_LOCAL is not set
 # end of Example Connection Configuration
 
+#
+# ESP-NN
+#
+# CONFIG_NN_ANSI_C is not set
+CONFIG_NN_OPTIMIZED=y
+CONFIG_NN_OPTIMIZATIONS=1
+# end of ESP-NN
+
 #
 # Compiler options
 #
diff --git a/code/version.cpp b/code/version.cpp
index 10eaaa2b..e19ce16a 100644
--- a/code/version.cpp
+++ b/code/version.cpp
@@ -1,4 +1,4 @@
-const char* GIT_REV="ccb4bd5";
+const char* GIT_REV="cce812f";
 const char* GIT_TAG="";
 const char* GIT_BRANCH="rolling";
-const char* BUILD_TIME="2022-04-17 19:48";
\ No newline at end of file
+const char* BUILD_TIME="2022-05-26 20:03";
\ No newline at end of file
diff --git a/firmware/ana-s2-q-20220213.tflite b/firmware/ana-s2-q-20220213.tflite
deleted file mode 100644
index e666f11e..00000000
Binary files a/firmware/ana-s2-q-20220213.tflite and /dev/null differ
diff --git a/firmware/ana-s3-q-20220105.tflite b/firmware/ana-s3-q-20220105.tflite
deleted file mode 100644
index 5718b1ec..00000000
Binary files a/firmware/ana-s3-q-20220105.tflite and /dev/null differ
diff --git a/firmware/bootloader.bin b/firmware/bootloader.bin
index 1b3dd33b..d9de7ed0 100644
Binary files a/firmware/bootloader.bin and b/firmware/bootloader.bin differ
diff --git a/firmware/dhy0010s3q.tflite b/firmware/dhy0010s3q.tflite
deleted file mode 100644
index 9f3d9acf..00000000
Binary files a/firmware/dhy0010s3q.tflite and /dev/null differ
diff --git a/firmware/dhy0030s3q.tflite b/firmware/dhy0030s3q.tflite
new file mode 100644
index 00000000..23755d63
Binary files /dev/null and b/firmware/dhy0030s3q.tflite differ
diff --git a/firmware/dhy0040s3q.tflite b/firmware/dhy0040s3q.tflite
new file mode 100644
index 00000000..4d39f007
Binary files /dev/null and b/firmware/dhy0040s3q.tflite differ
diff --git a/firmware/dig-s1-q-20220227.tflite b/firmware/dig-s1-q-20220227.tflite
deleted file mode 100644
index 3ced1fcc..00000000
Binary files a/firmware/dig-s1-q-20220227.tflite and /dev/null differ
diff --git a/firmware/dig-s2-q-20220211.tflite b/firmware/dig-s2-q-20220211.tflite
deleted file mode 100644
index 26cb56eb..00000000
Binary files a/firmware/dig-s2-q-20220211.tflite and /dev/null differ
diff --git a/firmware/firmware.bin b/firmware/firmware.bin
index ebfcb971..5c99e32a 100644
Binary files a/firmware/firmware.bin and b/firmware/firmware.bin differ
diff --git a/firmware/html.zip b/firmware/html.zip
index c136949e..85c4c7df 100644
Binary files a/firmware/html.zip and b/firmware/html.zip differ
diff --git a/sd-card/html/edit_config_param.html b/sd-card/html/edit_config_param.html
index 7dea248d..03e98a9f 100644
--- a/sd-card/html/edit_config_param.html
+++ b/sd-card/html/edit_config_param.html
@@ -342,6 +345,8 @@ textarea {
 Time to keep the separated digit images (in days -"0" = forever)
+
@@ -387,6 +390,8 @@ textarea {
 Time to keep the separated digit images (in days -"0" = forever)
+
 
 PostProcessing
 
@@ -1880,11 +1886,11 @@ function UpdateInput() {
     WriteParameter(param, category, "Digits", "CNNGoodThreshold", true);
     WriteParameter(param, category, "Digits", "LogImageLocation", true);
     WriteParameter(param, category, "Digits", "LogfileRetentionInDays", true);
-    WriteParameter(param, category, "Digits", "ModelInputSize", false);
+//    WriteParameter(param, category, "Digits", "ModelInputSize", false);
 
     WriteParameter(param, category, "Analog", "LogImageLocation", true);
     WriteParameter(param, category, "Analog", "LogfileRetentionInDays", true);
-    WriteParameter(param, category, "Analog", "ModelInputSize", false);
+//    WriteParameter(param, category, "Analog", "ModelInputSize", false);
 
     WriteParameter(param, category, "PostProcessing", "PreValueUse", true);
     WriteParameter(param, category, "PostProcessing", "PreValueAgeStartup", true);
@@ -1981,12 +1987,12 @@ function ReadParameterAll()
     ReadParameter(param, "Digits", "CNNGoodThreshold", true);
     ReadParameter(param, "Digits", "LogImageLocation", true);
     ReadParameter(param, "Digits", "LogfileRetentionInDays", true);
-    ReadParameter(param, "Digits", "ModelInputSize", false);
+//    ReadParameter(param, "Digits", "ModelInputSize", false);
 
     ReadParameter(param, "Analog", "Model", false);
     ReadParameter(param, "Analog", "LogImageLocation", true);
     ReadParameter(param, "Analog", "LogfileRetentionInDays", true);
-    ReadParameter(param, "Analog", "ModelInputSize", false);
+//    ReadParameter(param, "Analog", "ModelInputSize", false);
 
     ReadParameter(param, "PostProcessing", "PreValueUse", true);
     ReadParameter(param, "PostProcessing", "PreValueAgeStartup", true);
diff --git a/sd-card/html/ota_page.html b/sd-card/html/ota_page.html
index 7476fd23..e86831fa 100644
--- a/sd-card/html/ota_page.html
+++ b/sd-card/html/ota_page.html
@@ -162,6 +163,7 @@ function init(){
 function doUpdate() {
     if (confirm("Are you sure to update the firmware?")) {
         var stringota = "/ota?file=firmware.bin";
+        document.getElementById("doUpdate").disabled = true;
 
         var xhttp = new XMLHttpRequest();
@@ -188,6 +189,8 @@ function doUpdate() {
 function doUpdatehtml() {
     if (confirm("Are you sure to update the /html content?")) {
         var stringota = "/ota?task=unziphtml";
+        document.getElementById("doUpdatehtml").disabled = true;
+
 
         var xhttp = new XMLHttpRequest();
diff --git a/sd-card/html/readconfigparam.js b/sd-card/html/readconfigparam.js
index 5deb5141..7977fce9 100644
--- a/sd-card/html/readconfigparam.js
+++ b/sd-card/html/readconfigparam.js
@@ -86,7 +86,7 @@ function ParseConfig() {
     ParamAddValue(param, catname, "CNNGoodThreshold", 1);
     ParamAddValue(param, catname, "LogImageLocation");
     ParamAddValue(param, catname, "LogfileRetentionInDays");
-    ParamAddValue(param, catname, "ModelInputSize", 2);
+//    ParamAddValue(param, catname, "ModelInputSize", 2);
 
     var catname = "Analog";
@@ -97,7 +97,7 @@ function ParseConfig() {
     ParamAddValue(param, catname, "Model");
     ParamAddValue(param, catname, "LogImageLocation");
     ParamAddValue(param, catname, "LogfileRetentionInDays");
-    ParamAddValue(param, catname, "ModelInputSize", 2);
+//    ParamAddValue(param, catname, "ModelInputSize", 2);
 
     var catname = "PostProcessing";
     category[catname] = new Object();
diff --git a/sd-card/html/version.txt b/sd-card/html/version.txt
index 2f0f3c13..9714cb35 100644
--- a/sd-card/html/version.txt
+++ b/sd-card/html/version.txt
@@ -1 +1 @@
-14.1.0
\ No newline at end of file
+14.2.0
\ No newline at end of file
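
The kernel changes in the test_helpers.cc hunks above follow the current TFLite Micro pattern: Prepare() obtains full TfLiteTensor metadata only as a temporary allocation from the MicroContext and must hand it back, while Invoke() works exclusively on the lightweight TfLiteEvalTensor view. The following is a minimal sketch of that pattern for a hypothetical pass-through op; the AllocateTempInputTensor/DeallocateTempTfLiteTensor and GetEvalInput/GetEvalOutput calls are taken from the hunks above, whereas the include paths and the PassThrough* names are assumptions for illustration, not part of this patch.

// Sketch only: restates the Prepare()/Invoke() pattern from the hunks above.
// Include paths are assumed from the bundled tflite-lib layout.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_context.h"        // assumed path
#include "tensorflow/lite/micro/kernels/kernel_util.h"  // assumed path

namespace {

constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;

TfLiteStatus PassThroughPrepare(TfLiteContext* context, TfLiteNode* node) {
  // Full TfLiteTensor metadata is only available as a temporary allocation
  // during Prepare() and has to be released before returning.
  tflite::MicroContext* micro_context = tflite::GetMicroContext(context);
  TfLiteTensor* input =
      micro_context->AllocateTempInputTensor(node, kInputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
  micro_context->DeallocateTempTfLiteTensor(input);
  return kTfLiteOk;
}

TfLiteStatus PassThroughInvoke(TfLiteContext* context, TfLiteNode* node) {
  // At invoke time only the lightweight TfLiteEvalTensor view is used; data
  // is reached through the TfLitePtrUnion members (data.int8, data.uint8, ...).
  const TfLiteEvalTensor* input =
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
  TF_LITE_ENSURE(context, input != nullptr);
  TF_LITE_ENSURE(context, output != nullptr);
  output->data.int8[0] = input->data.int8[0];  // trivial pass-through
  return kTfLiteOk;
}

}  // namespace

The constraint the hunks encode is simply that every temp tensor requested in Prepare() is paired with a DeallocateTempTfLiteTensor() call, and that Invoke() never touches TfLiteTensor at all.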