Implement Aho-Corasick algorithm

This commit is contained in:
Vadim Vetrov
2025-02-02 19:08:47 +03:00
parent d9c360910b
commit d225e673c7
7 changed files with 390 additions and 2 deletions

2
Kbuild
View File

@@ -1,3 +1,3 @@
obj-m := kyoutubeUnblock.o
kyoutubeUnblock-objs := src/kytunblock.o src/mangle.o src/quic.o src/quic_crypto.o src/utils.o src/tls.o src/getopt.o src/inet_ntop.o src/args.o deps/cyclone/aes.o deps/cyclone/cpu_endian.o deps/cyclone/ecb.o deps/cyclone/gcm.o deps/cyclone/hkdf.o deps/cyclone/hmac.o deps/cyclone/sha256.o
kyoutubeUnblock-objs := src/kytunblock.o src/mangle.o src/quic.o src/quic_crypto.o src/utils.o src/tls.o src/getopt.o src/inet_ntop.o src/args.o src/trie.o deps/cyclone/aes.o deps/cyclone/cpu_endian.o deps/cyclone/ecb.o deps/cyclone/gcm.o deps/cyclone/hkdf.o deps/cyclone/hmac.o deps/cyclone/sha256.o
ccflags-y := -std=gnu99 -DKERNEL_SPACE -Wno-error -Wno-declaration-after-statement -I$(src)/src -I$(src)/deps/cyclone/include

183
src/trie.c Normal file
View File

@@ -0,0 +1,183 @@
/*
youtubeUnblock - https://github.com/Waujito/youtubeUnblock
Copyright (C) 2024-2025 Vadim Vetrov <vetrovvd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
* This is a slightly optimized Aho-Corasick implementation
*
* Big thanks to e-maxx http://e-maxx.ru/algo/aho_corasick
* for the best description and reference code samples
*/
#include "trie.h"
/**
 * Prepares an empty trie: allocates the initial vertex array
 * (TRIE_STARTSZ slots) and sets up the root vertex at index 0.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails; on failure
 * the container is left untouched.
 */
int trie_init(struct trie_container *trie) {
	struct trie_vertex *root = malloc(sizeof(struct trie_vertex) * TRIE_STARTSZ);

	if (!root) {
		return -ENOMEM;
	}

	/* 0xff in every byte turns each transition slot into -1 ("not set"). */
	memset(root->go, 0xff, sizeof(root->go));
	root->pch = 0;
	root->depth = 0;
	root->leaf = 0;
	root->link = -1;
	root->p = -1;

	trie->vx = root;
	trie->sz = 1;
	trie->arrsz = TRIE_STARTSZ;

	return 0;
}
/**
 * Releases the vertex array and resets the container to an empty state
 * (vx == NULL, zero size and zero capacity).
 */
void trie_destroy(struct trie_container *trie) {
	struct trie_vertex *vertices = trie->vx;

	trie->vx = NULL;
	trie->sz = 0;
	trie->arrsz = 0;

	free(vertices);
}
/**
 * Reserves one more vertex slot at the end of the vertex array, doubling
 * the array when it is full. The new slot is NOT initialized; the caller
 * is expected to fill it in.
 *
 * Returns the index of the new vertex on success, -EINVAL when the
 * container is not initialized or the vertex limit is reached, and
 * -ENOMEM when growing the array fails (the existing array stays valid).
 */
int trie_push_vertex(struct trie_container *trie) {
	/*
	 * A destroyed or never-initialized container has nothing to grow:
	 * doubling a zero capacity would hand out a slot with no storage
	 * behind it.
	 */
	if (trie->vx == NULL || trie->arrsz == 0) {
		return -EINVAL;
	}

	/* Vertex indexes are stored in int16_t transitions, so cap the size. */
	if (trie->sz >= NMAX - 1) {
		return -EINVAL;
	}

	if (trie->sz >= trie->arrsz) {
		/* Keep the old pointer until the reallocation is known to succeed. */
		void *grown = realloc(trie->vx,
			sizeof(struct trie_vertex) * trie->arrsz * 2);
		if (grown == NULL) {
			return -ENOMEM;
		}

		trie->vx = grown;
		trie->arrsz *= 2;
	}

	return (int)trie->sz++;
}
/**
 * Inserts a pattern of strlen bytes into the trie and marks its final
 * vertex as a leaf. An empty pattern is accepted and changes nothing.
 *
 * Returns 0 on success, -EINVAL if any byte is outside the alphabet
 * (>= TRIE_ALPHABET), or the negative error reported by
 * trie_push_vertex(). The pattern is validated before the trie is
 * touched, so an invalid pattern leaves the trie unchanged.
 *
 * NOTE: existing children are detected through the go[] table, which
 * trie_go() also fills with lazily computed automaton transitions.
 * Patterns must therefore be added before the trie is searched.
 */
int trie_add_string(struct trie_container *trie,
		    const uint8_t *str, size_t strlen) {
	int v = 0;

	/* Reject bad input up front instead of after a partial insertion. */
	for (size_t i = 0; i < strlen; ++i) {
		if (str[i] >= TRIE_ALPHABET) {
			return -EINVAL;
		}
	}

	for (size_t i = 0; i < strlen; ++i) {
		uint8_t c = str[i];

		if (trie->vx[v].go[c] == -1) {
			int nv = trie_push_vertex(trie);
			if (nv < 0) {
				return nv;
			}

			/*
			 * trie_push_vertex() may move the array, so vertex
			 * pointers are taken only after it returned.
			 */
			struct trie_vertex *tvx = trie->vx + nv;
			memset(tvx->go, 0xff, sizeof(tvx->go));
			tvx->link = -1;
			tvx->p = v;
			tvx->depth = trie->vx[v].depth + 1;
			tvx->leaf = 0;
			tvx->pch = c;

			trie->vx[v].go[c] = nv;
		}

		v = trie->vx[v].go[c];
	}

	/* The root never becomes a leaf: an empty pattern matches nothing. */
	if (v != 0) {
		trie->vx[v].leaf = 1;
	}

	return 0;
}
/* trie_get_link() and trie_go() are mutually recursive. */
static int trie_go(struct trie_container *trie,
		   int v, uint8_t c);

/**
 * Returns the suffix link of vertex v, computing and caching it in the
 * vertex on first use. The root and its direct children link to the
 * root; any other vertex links to the state reached from its parent's
 * suffix link by the vertex's own character.
 */
static int trie_get_link(struct trie_container *trie,
			 int v) {
	struct trie_vertex *node = &trie->vx[v];

	/* Already computed on an earlier call. */
	if (node->link != -1) {
		return node->link;
	}

	if (v == 0 || node->p == 0) {
		node->link = 0;
	} else {
		int parent_link = trie_get_link(trie, node->p);
		node->link = trie_go(trie, parent_link, node->pch);
	}

	return node->link;
}
/**
 * Returns the state reached from vertex v by character c, computing and
 * caching the transition in go[c] on first use. A missing transition
 * from the root stays in the root; elsewhere it is taken from the
 * suffix link of v.
 */
static int trie_go(struct trie_container *trie, int v, uint8_t c) {
	struct trie_vertex *node = &trie->vx[v];

	/* Either a trie edge or a transition cached earlier. */
	if (node->go[c] != -1) {
		return node->go[c];
	}

	if (v == 0) {
		node->go[c] = 0;
	} else {
		node->go[c] = trie_go(trie, trie_get_link(trie, v), c);
	}

	return node->go[c];
}
/**
 * Returns the deepest leaf among state v and the states reachable from
 * it through suffix links, or 0 when none of them is a leaf.
 *
 * A pattern that is a proper suffix of the text read so far ends at the
 * current position even when the current state itself is not a leaf, so
 * checking only v would miss it.
 */
static int trie_find_output(struct trie_container *trie, int v) {
	while (v > 0 && !trie->vx[v].leaf) {
		v = trie_get_link(trie, v);
	}

	return v > 0 ? v : 0;
}

/**
 * Searches str (strlen bytes) for the first position at which one of the
 * stored patterns ends.
 *
 * flags is a TRIE_OPT_* bit mask. With TRIE_OPT_MAP_TO_END only a
 * pattern ending at the last byte of str is accepted.
 * Bytes outside the alphabet (>= TRIE_ALPHABET) reset the search to the
 * root. When several patterns end at the same position the longest one
 * is reported.
 *
 * On a match *offset receives the start of the pattern inside str and
 * *offlen its length; both are left untouched otherwise.
 *
 * Returns 1 if a pattern was found, 0 otherwise (including when the
 * trie is not initialized).
 *
 * Note: the suffix-link walk costs up to the depth of the current state
 * per examined position.
 */
int trie_process_str(
	struct trie_container *trie,
	const uint8_t *str, size_t strlen,
	int flags,
	size_t *offset, size_t *offlen
) {
	const int map_to_end =
		(flags & TRIE_OPT_MAP_TO_END) == TRIE_OPT_MAP_TO_END;
	int v = 0;
	int match = 0;
	size_t end = 0;
	size_t len;

	if (trie->vx == NULL) {
		return 0;
	}

	for (size_t i = 0; i < strlen; ++i) {
		uint8_t c = str[i];

		if (c >= TRIE_ALPHABET) {
			v = 0;
			continue;
		}

		/* Fast path for known transitions, lazy computation otherwise. */
		v = trie->vx[v].go[c] != -1 ? trie->vx[v].go[c] :
			trie_go(trie, v, c);

		/* In map-to-end mode only the final position can match. */
		if (map_to_end && i != strlen - 1) {
			continue;
		}

		match = trie_find_output(trie, v);
		if (match != 0) {
			end = i + 1;
			break;
		}
	}

	if (match == 0) {
		return 0;
	}

	len = (size_t)trie->vx[match].depth;
	if (end < len) {
		return 0;
	}

	*offset = end - len;
	*offlen = len;
	return 1;
}

99
src/trie.h Normal file
View File

@@ -0,0 +1,99 @@
/*
youtubeUnblock - https://github.com/Waujito/youtubeUnblock
Copyright (C) 2024-2025 Vadim Vetrov <vetrovvd@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
* This is a slightly optimized Aho-Corasick implementation
*
* Big thanks to e-maxx http://e-maxx.ru/algo/aho_corasick
* for the best description and reference code samples
*
*/
/**
*
* This algorithm allows us to search inside the string
* for a list of patterns in the linear time.
*
* The algorithm lazily initializes itself while
* youtubeUnblock is running. Lazy initialization is considered
* safe for multithreading and operates without atomic operations
* or synchronization primitives.
*
*/
#ifndef TRIE_H
#define TRIE_H
#include "types.h"
// Alphabet size: only byte values 0..127 (ASCII) have transitions
#define TRIE_ALPHABET 128
// Upper bound on the number of vertices in the trie.
// Vertex indexes are stored in int16_t transition tables,
// so they have to stay below 2^15.
#define NMAX ((1 << 15) - 1)
/**
 * One vertex (state) of the trie.
 * Vertices are referenced by their index in trie_container.vx;
 * index 0 is the root and -1 means "not set / not computed yet".
 */
struct trie_vertex {
int leaf; // boolean flag: a pattern ends at this vertex
int depth; // depth in the tree (length of the prefix spelled by this vertex)
int p; // index of the parent vertex (-1 for the root)
uint8_t pch; // character on the edge from the parent to this vertex
int link; // suffix link, computed lazily (-1 until computed)
int16_t go[TRIE_ALPHABET]; // transitions by character: trie edges plus lazily cached moves (-1 = unknown)
};
/**
 * Growable array of trie vertices. The root lives at index 0.
 */
struct trie_container {
struct trie_vertex *vx; // vertex array (set to NULL by trie_destroy)
size_t arrsz; // allocated capacity of vx, in vertices
size_t sz; // number of vertices currently in use
};
// Initial capacity (in vertices) allocated by trie_init
#define TRIE_STARTSZ 32
/**
 * Allocates the vertex array and creates the root vertex.
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
int trie_init(struct trie_container *trie);
/**
 * Frees the vertex array and resets the container fields.
 */
void trie_destroy(struct trie_container *trie);
/**
 *
 * Increases trie vertex container size by one vertex,
 * growing the underlying array when it is full.
 * The new vertex is not initialized.
 * Returns new vertex index or ret < 0 on error
 *
 */
int trie_push_vertex(struct trie_container *trie);
/**
 * Adds a pattern of strlen bytes to the trie.
 *
 * Returns 0 on success, -EINVAL if the pattern contains a byte
 * outside the alphabet (>= TRIE_ALPHABET), or the negative
 * error code returned by trie_push_vertex.
 */
int trie_add_string(struct trie_container *trie,
const uint8_t *str, size_t strlen);
/**
 * Aligns the pattern to the end: only a pattern that ends
 * at the last byte of the searched string is accepted.
 */
#define TRIE_OPT_MAP_TO_END (1 << 1)
/**
 * Searches the string for the patterns.
 * flags is TRIE_OPT binary mask with options for search.
 * offset, offlen are destination variables with
 * offset of the given string and length of target.
 * They are written only when a target is found.
 * Bytes outside the alphabet (>= TRIE_ALPHABET) restart
 * the search from the root.
 *
 * returns 1 if target found, 0 otherwise
 */
int trie_process_str(
struct trie_container *trie,
const uint8_t *str, size_t strlen,
int flags,
size_t *offset, size_t *offlen
);
#endif

View File

@@ -68,6 +68,7 @@ typedef __s16 int_least16_t; /* integer of >= 16 bits */
#define free kfree
#define malloc(size) kmalloc((size), GFP_KERNEL)
#define realloc(pt, size) krealloc((pt), (size), GFP_KERNEL)
#define calloc(n, size) kcalloc((n), (size), GFP_KERNEL)
#define ip6_hdr ipv6hdr

View File

@@ -10,6 +10,7 @@ static void RunAllTests(void)
{
RUN_TEST_GROUP(TLSTest)
RUN_TEST_GROUP(QuicTest);
RUN_TEST_GROUP(TrieTest);
}
int main(int argc, const char * argv[])

104
test/trie.c Normal file
View File

@@ -0,0 +1,104 @@
#include "unity.h"
#include "unity_fixture.h"
#include "trie.h"
/* Unity fixture group for the trie (Aho-Corasick) tests. */
TEST_GROUP(TrieTest);
/* No shared state: every test builds and destroys its own trie. */
TEST_SETUP(TrieTest)
{
}
TEST_TEAR_DOWN(TrieTest)
{
}
/* Patterns inserted into the trie by the tests. */
const char ASTR[] = "abacaba";
const char BSTR[] = "BABABABA";
const char CSTR[] = "abracadabra";
/* Text searched by the tests: it contains CSTR in the middle and ends with ASTR. */
const char tstr[] = "aBABABABDADAabacabracadabraabbbabacabaaaaaabacaba";
/**
 * Inserting the three patterns must succeed and create the expected
 * number of vertices.
 */
TEST(TrieTest, Trie_string_adds)
{
	int ret;
	struct trie_container trie;

	ret = trie_init(&trie);
	TEST_ASSERT_EQUAL(0, ret);

	ret = trie_add_string(&trie, (uint8_t *)ASTR, sizeof(ASTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)BSTR, sizeof(BSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)CSTR, sizeof(CSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);

	/* root + 7 (ASTR) + 8 (BSTR) + 9 (CSTR shares the "ab" prefix with ASTR) */
	TEST_ASSERT_EQUAL(25, trie.sz);

	trie_destroy(&trie);
}
/**
 * A plain search (no flags) must report the first pattern that ends in
 * the text, which is CSTR.
 */
TEST(TrieTest, Trie_string_finds)
{
	int ret;
	size_t offset;
	size_t offlen;
	struct trie_container trie;

	/* Setup failures must not be masked by the search result. */
	ret = trie_init(&trie);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)ASTR, sizeof(ASTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)BSTR, sizeof(BSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)CSTR, sizeof(CSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);

	ret = trie_process_str(&trie,
		(uint8_t *)tstr, sizeof(tstr) - 1,
		0, &offset, &offlen
	);
	TEST_ASSERT_EQUAL(1, ret);
	TEST_ASSERT_EQUAL(11, offlen);
	/* "abracadabra" starts at byte 16 of tstr. */
	TEST_ASSERT_EQUAL(16, offset);
	TEST_ASSERT_EQUAL_STRING_LEN("abracadabra", tstr + offset, offlen);

	trie_destroy(&trie);
}
/**
 * With TRIE_OPT_MAP_TO_END only a pattern ending at the last byte of the
 * searched buffer counts as a match.
 */
TEST(TrieTest, Trie_string_finds_opt_end)
{
	int ret;
	size_t offset;
	size_t offlen;
	struct trie_container trie;

	/* Setup failures must not be masked by the search result. */
	ret = trie_init(&trie);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)ASTR, sizeof(ASTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)BSTR, sizeof(BSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);
	ret = trie_add_string(&trie, (uint8_t *)CSTR, sizeof(CSTR) - 1);
	TEST_ASSERT_EQUAL(0, ret);

	/* The text ends with ASTR, so the trailing occurrence is reported. */
	ret = trie_process_str(&trie,
		(uint8_t *)tstr, sizeof(tstr) - 1,
		TRIE_OPT_MAP_TO_END,
		&offset, &offlen
	);
	TEST_ASSERT_EQUAL(1, ret);
	TEST_ASSERT_EQUAL(7, offlen);
	TEST_ASSERT_EQUAL_STRING_LEN("abacaba", tstr + offset, offlen);

	/*
	 * sizeof(tstr) includes the terminating NUL byte, so no pattern
	 * ends at the last byte of the buffer any more.
	 */
	ret = trie_process_str(&trie,
		(uint8_t *)tstr, sizeof(tstr),
		TRIE_OPT_MAP_TO_END,
		&offset, &offlen
	);
	TEST_ASSERT_EQUAL(0, ret);

	trie_destroy(&trie);
}
/* Registers every TrieTest case with the Unity fixture runner. */
TEST_GROUP_RUNNER(TrieTest)
{
RUN_TEST_CASE(TrieTest, Trie_string_adds);
RUN_TEST_CASE(TrieTest, Trie_string_finds);
RUN_TEST_CASE(TrieTest, Trie_string_finds_opt_end);
}

View File

@@ -34,7 +34,7 @@ export CC CCLD LD CFLAGS LDFLAGS LIBNFNETLINK_CFLAGS LIBNFNETLINK_LIBS LIBMNL_CF
APP:=$(BUILD_DIR)/youtubeUnblock
TEST_APP:=$(BUILD_DIR)/testYoutubeUnblock
SRCS := mangle.c args.c utils.c quic.c tls.c getopt.c quic_crypto.c inet_ntop.c
SRCS := mangle.c args.c utils.c quic.c tls.c getopt.c quic_crypto.c inet_ntop.c trie.c
OBJS := $(SRCS:%.c=$(BUILD_DIR)/%.o)
APP_EXEC := youtubeUnblock.c
APP_OBJ := $(APP_EXEC:%.c=$(BUILD_DIR)/%.o)