Tokenize keywords

2026-01-31 14:51:13 +03:00 · 2025-09-23 12:45:29 +02:00
parent 3d84152fa4
commit 2370adb8b2
2 changed files with 118 additions and 32 deletions
--- a/src/components/options/AdvancedSkipOptionsComponent.tsx
+++ b/src/components/options/AdvancedSkipOptionsComponent.tsx
@@ -2,7 +2,7 @@ import * as React from "react";
 import * as CompileConfig from "../../../config.json";

 import Config from "../../config";
-import { AdvancedSkipRuleSet, SkipRuleAttribute, SkipRuleOperator } from "../../utils/skipRule";
+import {AdvancedSkipRuleSet, compileConfigNew, SkipRuleAttribute, SkipRuleOperator} from "../../utils/skipRule";
 import { ActionType, ActionTypes, CategorySkipOption } from "../../types";

 let configSaveTimeout: NodeJS.Timeout | null = null;
@@ -65,6 +65,9 @@ export function AdvancedSkipOptionsComponent() {
 }

 function compileConfig(config: string): AdvancedSkipRuleSet[] | null {
+    // Debug
+    compileConfigNew(config);
+
    const ruleSets: AdvancedSkipRuleSet[] = [];

    let ruleSet: AdvancedSkipRuleSet = {
--- a/src/utils/skipRule.ts
+++ b/src/utils/skipRule.ts
@@ -191,6 +191,7 @@ type TokenType =
    | keyof typeof SkipRuleAttribute // Segment attributes
    | keyof typeof SkipRuleOperator // Segment attribute operators
    | "and" | "or" // Expression operators
+    | "(" | ")" // Syntax
    | "string" // Literal values
    | "eof" | "error"; // Sentinel and special tokens

@@ -230,9 +231,9 @@ function nextToken(state: LexerState): Token {

    /**
     * Returns the UTF-16 value at the current position and advances it forward.
-     * If the end of the source string has been reached, returns {@code null}.
+     * If the end of the source string has been reached, returns <code>null</code>.
     *
-     * @return current UTF-16 value, or {@code null} on EOF
+     * @return current UTF-16 value, or <code>null</code> on EOF
     */
    function consume(): string | null {
        if (state.source.length > state.current) {
@@ -258,9 +259,9 @@ function nextToken(state: LexerState): Token {

    /**
     * Returns the UTF-16 value at the current position without advancing it.
-     * If the end of the source string has been reached, returns {@code null}.
+     * If the end of the source string has been reached, returns <code>null</code>.
     *
-     * @return current UTF-16 value, or {@code null} on EOF
+     * @return current UTF-16 value, or <code>null</code> on EOF
     */
    function peek(): string | null {
        if (state.source.length > state.current) {
@@ -272,31 +273,28 @@ function nextToken(state: LexerState): Token {
    }

    /**
-     * Checks the current position against expected UTF-16 values.
-     * If any of them matches, advances the current position and returns
-     * {@code true}, otherwise {@code false}.
+     * Matches the text at the current position against a list of keywords
+     * (each may span several characters). The longest matching keyword wins,
+     * so "<=" is never cut short by "<". On a match, the current position
+     * is advanced by the length of that keyword. No word boundary is checked.
     *
-     * @param expected the expected set of UTF-16 values at the current position
-     * @return whether the actual value matches and whether the position was advanced
+     * @param keywords the expected set of keywords at the current position
+     * @param caseSensitive whether to do a case-sensitive comparison
+     * @return the longest matching keyword as listed, or <code>null</code> if none match
     */
-    function expect(expected: string | readonly string[]): boolean {
-        const actual = peek();
+    function expectKeyword(keywords: readonly string[], caseSensitive: boolean): string | null {
+        for (const keyword of [...keywords].sort((a, b) => b.length - a.length)) {
+            // slice() clamps to string length, so cannot cause out of bounds errors
+            const actual = state.source.slice(state.current, state.current + keyword.length);

-        if (actual === null) {
-            return false;
+            if (caseSensitive && keyword === actual || !caseSensitive && keyword.toLowerCase() === actual.toLowerCase()) {
+                // Only the offset is advanced here, so keywords must not contain line feeds
+                state.current += keyword.length;
+                return keyword;
+            }
        }

-        if (typeof expected === "string") {
-            if (expected === actual) {
-                consume();
-                return true;
-            }
-        } else if (expected.includes(actual)) {
-            consume();
-            return true;
-        }
-
-        return false;
+        return null;
    }

    /**
@@ -306,7 +304,7 @@ function nextToken(state: LexerState): Token {
     */
    function skipWhitespace() {
        let c = peek();
-        const whitespace = /s+/;
+        const whitespace = /\s+/;

        while (c != null) {
            if (!whitespace.test(c)) {
@@ -319,7 +317,7 @@ function nextToken(state: LexerState): Token {
    }

    /**
-     * Skips all characters until the next {@code "\n"} (line feed)
+     * Skips all characters until the next <code>"\n"</code> (line feed)
     * character occurs (inclusive). Will always advance the current position
     * at least once.
     */
@@ -341,23 +339,108 @@ function nextToken(state: LexerState): Token {
    for (;;) {
        skipWhitespace();
        state.start = state.current;
+        state.start_pos = state.current_pos;

        if (isEof()) {
            return makeToken("eof");
        }

+        const keyword = expectKeyword([
+            "if", "and", "or",
+            "(", ")",
+            "//",
+        ].concat(Object.values(SkipRuleAttribute))
+            .concat(Object.values(SkipRuleOperator)), true);
+
+        if (keyword !== null) {
+            switch (keyword) {
+                case "if": return makeToken("if");
+                case "and": return makeToken("and");
+                case "or": return makeToken("or");
+
+                case "(": return makeToken("(");
+                case ")": return makeToken(")");
+
+                case "time.start": return makeToken("StartTime");
+                case "time.end": return makeToken("EndTime");
+                case "time.duration": return makeToken("Duration");
+                case "time.startPercent": return makeToken("StartTimePercent");
+                case "time.endPercent": return makeToken("EndTimePercent");
+                case "time.durationPercent": return makeToken("DurationPercent");
+                case "category": return makeToken("Category");
+                case "actionType": return makeToken("ActionType");
+                case "chapter.name": return makeToken("Description");
+                case "chapter.source": return makeToken("Source");
+                case "channel.id": return makeToken("ChannelID");
+                case "channel.name": return makeToken("ChannelName");
+                case "video.duration": return makeToken("VideoDuration");
+                case "video.title": return makeToken("Title");
+
+                case "<": return makeToken("Less");
+                case "<=": return makeToken("LessOrEqual");
+                case ">": return makeToken("Greater");
+                case ">=": return makeToken("GreaterOrEqual");
+                case "==": return makeToken("Equal");
+                case "!=": return makeToken("NotEqual");
+                case "*=": return makeToken("Contains");
+                case "!*=": return makeToken("NotContains");
+                case "~=": return makeToken("Regex");
+                case "~i=": return makeToken("RegexIgnoreCase");
+                case "!~=": return makeToken("NotRegex");
+                case "!~i=": return makeToken("NotRegexIgnoreCase");
+
+                case "//":
+                    skipLine();
+                    continue;
+
+                default:
+            }
+        }
+
+        const keyword2 = expectKeyword(
+            [ "disabled", "show overlay", "manual skip", "auto skip" ], false);
+
+        if (keyword2 !== null) {
+            switch (keyword2) {
+                case "disabled": return makeToken("disabled");
+                case "show overlay": return makeToken("show overlay");
+                case "manual skip": return makeToken("manual skip");
+                case "auto skip": return makeToken("auto skip");
+                default:
+            }
+        }
+
        const c = consume();

-        switch (c) {
+        if (c === '"') {
+            // TODO
+        } else if (/[0-9.]/.test(c)) {
            // TODO
-            default:
-                return makeToken("error");
        }
+
+        return makeToken("error");
    }
 }

-export function compileConfig(config: string): AdvancedSkipRuleSet[] | null {
+export function compileConfigNew(config: string): AdvancedSkipRuleSet[] | null {
+    // Debug driver: lexes config and logs each token; lexerState is mutated by calls to nextToken()
+    const lexerState: LexerState = {
+        source: config,
+        start: 0,
+        current: 0,

+        start_pos: { line: 1 },
+        current_pos: { line: 1 },
+    };

+    let token = nextToken(lexerState);

+    while (token.type !== "eof") {
+        console.log(token);

+        token = nextToken(lexerState);
+    }

    // TODO
-    const ruleSets: AdvancedSkipRuleSet[] = [];
-    return ruleSets;
+    return null;
 }