Implement lexing strings and numbers

2026-01-29 13:50:50 +03:00 · 2025-09-23 17:59:59 +02:00
parent 2370adb8b2
commit 2004f6bf1b
1 changed files with 137 additions and 7 deletions
--- a/src/utils/skipRule.ts
+++ b/src/utils/skipRule.ts
@@ -192,7 +192,7 @@ type TokenType =
    | keyof typeof SkipRuleOperator // Segment attribute operators
    | "and" | "or" // Expression operators
    | "(" | ")" // Syntax
-    | "string" // Literal values
+    | "string" | "number" // Literal values
    | "eof" | "error"; // Sentinel and special tokens

 interface SourcePos {
@@ -336,10 +336,14 @@ function nextToken(state: LexerState): Token {
        return state.current >= state.source.length;
    }

-    for (;;) {
-        skipWhitespace();
+    function resetToCurrent() {
        state.start = state.current;
        state.start_pos = state.current_pos;
+    }
+
+    for (;;) {
+        skipWhitespace();
+        resetToCurrent();

        if (isEof()) {
            return makeToken("eof");
@@ -410,12 +414,138 @@ function nextToken(state: LexerState): Token {
            }
        }

-        const c = consume();
+        let c = consume();

        if (c === '"') {
-            // TODO
-        } else if (/[0-9.]/.test(c)) {
-            // TODO
+            // Parses string according to ECMA-404 2nd edition (JSON), section 9 “String”
+            let output = "";
+            let c = consume();
+            let error = false;
+
+            while (c !== null && c !== '"') {
+                if (c == '\\') {
+                    c = consume();
+
+                    switch (c) {
+                        case '"':
+                            output = output.concat('"');
+                            break;
+                        case '\\':
+                            output = output.concat('\\');
+                            break;
+                        case '/':
+                            output = output.concat('/');
+                            break;
+                        case 'b':
+                            output = output.concat('\b');
+                            break;
+                        case 'f':
+                            output = output.concat('\f');
+                            break;
+                        case 'n':
+                            output = output.concat('\n');
+                            break;
+                        case 'r':
+                            output = output.concat('\r');
+                            break;
+                        case 't':
+                            output = output.concat('\t');
+                            break;
+                        case 'u': {
+                            // UTF-16 value sequence
+                            const digits = state.source.slice(state.current, state.current + 4);
+
+                            if (digits.length < 4 || !/[0-9a-zA-Z]{4}/.test(digits)) {
+                                error = true;
+                                output = output.concat(`\\u`);
+                                c = consume();
+                                continue;
+                            }
+
+                            const value = parseInt(digits, 16);
+                            // fromCharCode() takes a UTF-16 value without performing validity checks,
+                            // which is exactly what is needed here – in JSON, code units outside the
+                            // BMP are represented by two Unicode escape sequences.
+                            output = output.concat(String.fromCharCode(value));
+                            break;
+                        }
+                        default:
+                            error = true;
+                            output = output.concat(`\\${c}`);
+                            break;
+                    }
+                } else {
+                    output = output.concat(c);
+                }
+
+                c = consume();
+            }
+
+            return {
+                type: error || c !== '"' ? "error" : "string",
+                span: { start: state.start_pos, end: state.current_pos, },
+                value: output,
+            };
+        } else if (/[0-9-]/.test(c)) {
+            // Parses number according to ECMA-404 2nd edition (JSON), section 8 “Numbers”
+            if (c === '-') {
+                c = consume();
+
+                if (!/[0-9]/.test(c)) {
+                    return makeToken("error");
+                }
+            }
+
+            const leadingZero = c === '0';
+            let next = peek();
+            let error = false;
+
+            while (next !== null && /[0-9]/.test(next)) {
+                consume();
+                next = peek();
+
+                if (leadingZero) {
+                    error = true;
+                }
+            }
+
+
+            if (next !== null && next === '.') {
+                consume();
+                next = peek();
+
+                if (next === null || !/[0-9]/.test(next)) {
+                    return makeToken("error");
+                }
+
+                do {
+                    consume();
+                    next = peek();
+                } while (next !== null && /[0-9]/.test(next));
+            }
+
+            next = peek();
+
+            if (next != null && (next === 'e' || next === 'E')) {
+                consume();
+                next = peek();
+
+                if (next === null) {
+                    return makeToken("error");
+                }
+
+                if (next === '+' || next === '-') {
+                    consume();
+                    next = peek();
+                }
+
+                while (next !== null && /[0-9]/.test(next)) {
+                    consume();
+                    next = peek();
+                }
+            }
+
+            return makeToken(error ? "error" : "number");
        }

        return makeToken("error");