From d225e673c71bc57066fa4260d95316bf73b67f95 Mon Sep 17 00:00:00 2001
From: Vadim Vetrov
Date: Sun, 2 Feb 2025 19:08:47 +0300
Subject: [PATCH] Implement Aho-Corasick algorithm

---
 Kbuild         |   2 +-
 src/trie.c     | 249 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/trie.h     |  99 ++++++++++++++++++++
 src/types.h    |   1 +
 test/main_fn.c |   1 +
 test/trie.c    | 104 +++++++++++++++++++++
 uspace.mk      |   2 +-
 7 files changed, 456 insertions(+), 2 deletions(-)
 create mode 100644 src/trie.c
 create mode 100644 src/trie.h
 create mode 100644 test/trie.c

diff --git a/Kbuild b/Kbuild
index de094d3..adb8dd1 100644
--- a/Kbuild
+++ b/Kbuild
@@ -1,3 +1,3 @@
 obj-m := kyoutubeUnblock.o
-kyoutubeUnblock-objs := src/kytunblock.o src/mangle.o src/quic.o src/quic_crypto.o src/utils.o src/tls.o src/getopt.o src/inet_ntop.o src/args.o deps/cyclone/aes.o deps/cyclone/cpu_endian.o deps/cyclone/ecb.o deps/cyclone/gcm.o deps/cyclone/hkdf.o deps/cyclone/hmac.o deps/cyclone/sha256.o
+kyoutubeUnblock-objs := src/kytunblock.o src/mangle.o src/quic.o src/quic_crypto.o src/utils.o src/tls.o src/getopt.o src/inet_ntop.o src/args.o src/trie.o deps/cyclone/aes.o deps/cyclone/cpu_endian.o deps/cyclone/ecb.o deps/cyclone/gcm.o deps/cyclone/hkdf.o deps/cyclone/hmac.o deps/cyclone/sha256.o
 ccflags-y := -std=gnu99 -DKERNEL_SPACE -Wno-error -Wno-declaration-after-statement -I$(src)/src -I$(src)/deps/cyclone/include
diff --git a/src/trie.c b/src/trie.c
new file mode 100644
index 0000000..1d4af2a
--- /dev/null
+++ b/src/trie.c
@@ -0,0 +1,249 @@
+/*
+  youtubeUnblock - https://github.com/Waujito/youtubeUnblock
+
+  Copyright (C) 2024-2025 Vadim Vetrov
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+/**
+ * This is a slightly optimized Aho-Corasick implementation.
+ *
+ * Big thanks to e-maxx http://e-maxx.ru/algo/aho_corasick
+ * for the best description and reference code samples.
+ */
+
+#include "trie.h"
+
+/**
+ * Initializes the trie with a single root vertex (index 0).
+ *
+ * Returns 0 on success or -ENOMEM if the vertex array cannot be
+ * allocated. On failure the container is left empty, so calling
+ * trie_destroy() on it is harmless.
+ */
+int trie_init(struct trie_container *trie) {
+	struct trie_vertex *root;
+
+	trie->vx = malloc(sizeof(*trie->vx) * TRIE_STARTSZ);
+	if (trie->vx == NULL) {
+		trie->arrsz = 0;
+		trie->sz = 0;
+		return -ENOMEM;
+	}
+	trie->arrsz = TRIE_STARTSZ;
+	trie->sz = 1;
+
+	root = trie->vx;
+	root->p = root->link = -1;
+	root->leaf = 0;
+	root->depth = 0;
+	root->pch = 0;
+	// 0xff in every byte makes each int16_t transition equal to -1
+	memset(root->go, 0xff, sizeof(root->go));
+
+	return 0;
+}
+
+/**
+ * Releases the vertex array and resets the container to the empty state.
+ */
+void trie_destroy(struct trie_container *trie) {
+	free(trie->vx);
+	trie->vx = NULL;
+	trie->arrsz = 0;
+	trie->sz = 0;
+}
+
+/**
+ * Reserves one more vertex, doubling the array when it is full.
+ * The new vertex is NOT initialized; the caller must fill it.
+ *
+ * Returns the index of the new vertex, -EINVAL if the trie
+ * already holds the maximum number of vertexes, or -ENOMEM.
+ */
+int trie_push_vertex(struct trie_container *trie) {
+	// Indexes are stored in int16_t transitions, hence the hard limit
+	if (trie->sz >= NMAX - 1) {
+		return -EINVAL;
+	}
+
+	if (trie->arrsz == trie->sz) {
+		// Keep the old array valid if the reallocation fails
+		void *pt = realloc(trie->vx,
+			sizeof(struct trie_vertex) * trie->arrsz * 2);
+		if (pt == NULL) {
+			return -ENOMEM;
+		}
+
+		trie->arrsz *= 2;
+		trie->vx = pt;
+	}
+
+	return trie->sz++;
+}
+
+/**
+ * Inserts the pattern str of strlen bytes into the trie.
+ *
+ * Returns 0 on success, -EINVAL if the pattern holds a byte outside
+ * of the alphabet or the trie is full, -ENOMEM on allocation failure.
+ * An empty pattern is accepted but never matches. On error the
+ * vertexes created so far stay in the trie without a leaf mark.
+ *
+ * NOTE: all patterns must be added before the first search.
+ * Searching caches fallback transitions in go[], and this function
+ * would take such a cached transition for a real trie edge.
+ */
+int trie_add_string(struct trie_container *trie,
+		    const uint8_t *str, size_t strlen) {
+	int v = 0;
+
+	for (size_t i = 0; i < strlen; ++i) {
+		uint8_t c = str[i];
+		if (c >= TRIE_ALPHABET) {
+			return -EINVAL;
+		}
+
+		if (trie->vx[v].go[c] == -1) {
+			int nv = trie_push_vertex(trie);
+			if (nv < 0) {
+				return nv;
+			}
+			// trie->vx may have been moved by the reallocation,
+			// so take the pointer only after the push
+			struct trie_vertex *tvx = trie->vx + nv;
+
+			memset(tvx->go, 0xff, sizeof(tvx->go));
+			tvx->link = -1;
+			tvx->p = v;
+			tvx->depth = trie->vx[v].depth + 1;
+			tvx->leaf = 0;
+			tvx->pch = c;
+			trie->vx[v].go[c] = nv;
+		}
+		v = trie->vx[v].go[c];
+	}
+
+	if (v != 0) {
+		trie->vx[v].leaf = 1;
+	}
+
+	return 0;
+}
+
+static int trie_go(struct trie_container *trie,
+		   int v, uint8_t c);
+
+/**
+ * Returns the suffix link of vertex v: the vertex of the longest
+ * proper suffix of v's string that is also present in the trie.
+ * The link is computed on first use and cached in the vertex.
+ */
+static int trie_get_link(struct trie_container *trie,
+			 int v) {
+	struct trie_vertex *tvx = trie->vx + v;
+
+	if (tvx->link == -1) {
+		if (v == 0 || tvx->p == 0) {
+			tvx->link = 0;
+		} else {
+			tvx->link = trie_go(trie,
+				trie_get_link(trie, tvx->p), tvx->pch);
+		}
+	}
+
+	return tvx->link;
+}
+
+/**
+ * Returns the automaton transition from vertex v by character c.
+ * A missing trie edge is resolved through the suffix links and
+ * the result is cached in go[c]. c must be below TRIE_ALPHABET.
+ */
+static int trie_go(struct trie_container *trie, int v, uint8_t c) {
+	struct trie_vertex *tvx = trie->vx + v;
+
+	if (tvx->go[c] == -1) {
+		tvx->go[c] = v == 0 ? 0 :
+			trie_go(trie, trie_get_link(trie, v), c);
+	}
+
+	return tvx->go[c];
+}
+
+/**
+ * Returns the deepest leaf among v and the vertexes reachable from
+ * it by suffix links, or -1 if there is none. Without this walk
+ * a pattern that ends inside a longer trie path would be missed.
+ */
+static int trie_find_leaf(struct trie_container *trie, int v) {
+	while (v > 0) {
+		if (trie->vx[v].leaf) {
+			return v;
+		}
+		v = trie_get_link(trie, v);
+	}
+
+	return -1;
+}
+
+/**
+ * Searches str for the first end position at which a pattern ends
+ * and reports the longest pattern ending there.
+ * With TRIE_OPT_MAP_TO_END only a pattern ending at the last
+ * byte of str is accepted.
+ *
+ * On a match stores the start offset in *offset and the pattern
+ * length in *offlen and returns 1; otherwise returns 0 and leaves
+ * both untouched. Bytes outside of the alphabet reset the search.
+ *
+ * The suffix link walk costs up to the longest pattern length
+ * per inspected position.
+ */
+int trie_process_str(
+	struct trie_container *trie,
+	const uint8_t *str, size_t strlen,
+	int flags,
+	size_t *offset, size_t *offlen
+) {
+	int to_end = (flags & TRIE_OPT_MAP_TO_END) == TRIE_OPT_MAP_TO_END;
+	int v = 0;
+
+	for (size_t i = 0; i < strlen; ++i) {
+		uint8_t c = str[i];
+		int m;
+
+		if (c >= TRIE_ALPHABET) {
+			v = 0;
+			continue;
+		}
+
+		v = trie_go(trie, v, c);
+		if (to_end && i != strlen - 1) {
+			continue;
+		}
+
+		m = trie_find_leaf(trie, v);
+		if (m >= 0) {
+			size_t len = trie->vx[m].depth;
+
+			*offset = i + 1 - len;
+			*offlen = len;
+			return 1;
+		}
+	}
+
+	return 0;
+}
diff --git a/src/trie.h b/src/trie.h
new file mode 100644
index 0000000..3bcceea
--- /dev/null
+++ b/src/trie.h
@@ -0,0 +1,99 @@
+/*
+  youtubeUnblock - https://github.com/Waujito/youtubeUnblock
+
+  Copyright (C) 2024-2025 Vadim Vetrov
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+/**
+ * This is a slightly optimized Aho-Corasick implementation.
+ *
+ * Big thanks to e-maxx http://e-maxx.ru/algo/aho_corasick
+ * for the best description and reference code samples.
+ *
+ */
+
+/**
+ *
+ * This algorithm allows us to search a string for a list
+ * of patterns in linear time.
+ *
+ * The automaton initializes itself lazily while youtubeUnblock
+ * works: transitions and suffix links are cached on first use.
+ * The lazy initialization is considered safe for multithreading
+ * and runs without atomics or synchronization primitives.
+ *
+ */
+
+#ifndef TRIE_H
+#define TRIE_H
+
+#include "types.h"
+
+// ASCII alphabet: only bytes below 128 are valid pattern characters
+#define TRIE_ALPHABET 128
+// Maximum number of vertexes in the trie
+#define NMAX ((1 << 15) - 1)
+
+struct trie_vertex {
+	int leaf; // boolean flag: a pattern ends in this vertex
+	int depth; // depth in the tree (length of the vertex string)
+	int p; // parent vertex index, -1 for the root
+	uint8_t pch; // character on the edge from the parent
+	int link; // suffix link, -1 until it is computed
+	int16_t go[TRIE_ALPHABET]; // transitions by character, -1 until filled
+};
+
+struct trie_container {
+	struct trie_vertex *vx; // vertex array, vx[0] is the root
+	size_t arrsz; // allocated capacity of vx, in vertexes
+	size_t sz; // number of vertexes in use
+};
+
+#define TRIE_STARTSZ 32 // initial capacity of the vertex array
+int trie_init(struct trie_container *trie);
+void trie_destroy(struct trie_container *trie);
+
+/**
+ * Reserves one more vertex, growing the array when it is full.
+ * The new vertex is left uninitialized.
+ * Returns new vertex index or ret < 0 on error
+ *
+ */
+int trie_push_vertex(struct trie_container *trie);
+int trie_add_string(struct trie_container *trie,
+		    const uint8_t *str, size_t strlen);
+
+/**
+ * Accepts only a pattern that ends at the end of the string
+ */
+#define TRIE_OPT_MAP_TO_END (1 << 1)
+
+/**
+ * Searches the string for the patterns.
+ * flags is TRIE_OPT binary mask with options for search.
+ * offset, offlen are destination variables with
+ * offset of the given string and length of target.
+ *
+ * returns 1 if target found, 0 otherwise
+ */
+int trie_process_str(
+	struct trie_container *trie,
+	const uint8_t *str, size_t strlen,
+	int flags,
+	size_t *offset, size_t *offlen
+);
+
+#endif
diff --git a/src/types.h b/src/types.h
index 0dc9705..09fba6d 100644
--- a/src/types.h
+++ b/src/types.h
@@ -68,6 +68,7 @@ typedef __s16 int_least16_t; /* integer of >= 16 bits */
 
 #define free kfree
 #define malloc(size) kmalloc((size), GFP_KERNEL)
+#define realloc(pt, size) krealloc((pt), (size), GFP_KERNEL)
 #define calloc(n, size) kcalloc((n), (size), GFP_KERNEL)
 
 #define ip6_hdr ipv6hdr
diff --git a/test/main_fn.c b/test/main_fn.c
index d068a95..48b69cc 100644
--- a/test/main_fn.c
+++ b/test/main_fn.c
@@ -10,6 +10,7 @@ static void RunAllTests(void)
 {
 	RUN_TEST_GROUP(TLSTest)
 	RUN_TEST_GROUP(QuicTest);
+	RUN_TEST_GROUP(TrieTest);
 }
 
 int main(int argc, const char * argv[])
diff --git a/test/trie.c b/test/trie.c
new file mode 100644
index 0000000..642da76
--- /dev/null
+++ b/test/trie.c
@@ -0,0 +1,104 @@
+#include "unity.h"
+#include "unity_fixture.h"
+
+#include "trie.h"
+
+TEST_GROUP(TrieTest);
+
+TEST_SETUP(TrieTest)
+{
+}
+
+TEST_TEAR_DOWN(TrieTest)
+{
+}
+
+static const char ASTR[] = "abacaba";
+static const char BSTR[] = "BABABABA";
+static const char CSTR[] = "abracadabra";
+
+// Haystack: contains CSTR and ASTR but not BSTR; ASTR is also its suffix
+static const char tstr[] = "aBABABABDADAabacabracadabraabbbabacabaaaaaabacaba";
+
+
+TEST(TrieTest, Trie_string_adds)
+{
+	int ret;
+	struct trie_container trie;
+
+	ret = trie_init(&trie);
+	TEST_ASSERT_EQUAL(0, ret);
+	ret = trie_add_string(&trie, (const uint8_t *)ASTR, sizeof(ASTR) - 1);
+	TEST_ASSERT_EQUAL(0, ret);
+	ret = trie_add_string(&trie, (const uint8_t *)BSTR, sizeof(BSTR) - 1);
+	TEST_ASSERT_EQUAL(0, ret);
+	ret = trie_add_string(&trie, (const uint8_t *)CSTR, sizeof(CSTR) - 1);
+	TEST_ASSERT_EQUAL(0, ret);
+
+	// root + 7 (ASTR) + 8 (BSTR) + 9 (CSTR shares "ab" with ASTR)
+	TEST_ASSERT_EQUAL(25, trie.sz);
+
+	trie_destroy(&trie);
+}
+
+TEST(TrieTest, Trie_string_finds)
+{
+	int ret;
+	size_t offset;
+	size_t offlen;
+	struct trie_container trie;
+
+	TEST_ASSERT_EQUAL(0, trie_init(&trie));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)ASTR, sizeof(ASTR) - 1));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)BSTR, sizeof(BSTR) - 1));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)CSTR, sizeof(CSTR) - 1));
+
+	ret = trie_process_str(&trie,
+		(const uint8_t *)tstr, sizeof(tstr) - 1,
+		0, &offset, &offlen
+	);
+	TEST_ASSERT_EQUAL(1, ret);
+	TEST_ASSERT_EQUAL(11, offlen);
+	TEST_ASSERT_EQUAL_STRING_LEN("abracadabra", tstr + offset, offlen);
+
+	trie_destroy(&trie);
+}
+
+TEST(TrieTest, Trie_string_finds_opt_end)
+{
+	int ret;
+	size_t offset;
+	size_t offlen;
+	struct trie_container trie;
+
+	TEST_ASSERT_EQUAL(0, trie_init(&trie));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)ASTR, sizeof(ASTR) - 1));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)BSTR, sizeof(BSTR) - 1));
+	TEST_ASSERT_EQUAL(0, trie_add_string(&trie, (const uint8_t *)CSTR, sizeof(CSTR) - 1));
+
+	ret = trie_process_str(&trie,
+		(const uint8_t *)tstr, sizeof(tstr) - 1,
+		TRIE_OPT_MAP_TO_END,
+		&offset, &offlen
+	);
+	TEST_ASSERT_EQUAL(1, ret);
+	TEST_ASSERT_EQUAL(7, offlen);
+	TEST_ASSERT_EQUAL_STRING_LEN("abacaba", tstr + offset, offlen);
+
+	ret = trie_process_str(&trie,
+		(const uint8_t *)tstr, sizeof(tstr),
+		TRIE_OPT_MAP_TO_END,
+		&offset, &offlen
+	);
+	TEST_ASSERT_EQUAL(0, ret);
+
+	trie_destroy(&trie);
+}
+
+
+TEST_GROUP_RUNNER(TrieTest)
+{
+	RUN_TEST_CASE(TrieTest, Trie_string_adds);
+	RUN_TEST_CASE(TrieTest, Trie_string_finds);
+	RUN_TEST_CASE(TrieTest, Trie_string_finds_opt_end);
+}
diff --git a/uspace.mk b/uspace.mk
index a1f8690..3699cff 100644
--- a/uspace.mk
+++ b/uspace.mk
@@ -34,7 +34,7 @@ export CC CCLD LD CFLAGS LDFLAGS LIBNFNETLINK_CFLAGS LIBNFNETLINK_LIBS LIBMNL_CF
 APP:=$(BUILD_DIR)/youtubeUnblock
 TEST_APP:=$(BUILD_DIR)/testYoutubeUnblock
 
-SRCS := mangle.c args.c utils.c quic.c tls.c getopt.c quic_crypto.c inet_ntop.c
+SRCS := mangle.c args.c utils.c quic.c tls.c getopt.c quic_crypto.c inet_ntop.c trie.c
 OBJS := $(SRCS:%.c=$(BUILD_DIR)/%.o)
 APP_EXEC := youtubeUnblock.c
 APP_OBJ := $(APP_EXEC:%.c=$(BUILD_DIR)/%.o)