Begin implementing lexer

This commit is contained in:
mschae23
2025-09-22 23:22:25 +02:00
parent 7cbb0ec7cf
commit 3d84152fa4
2 changed files with 180 additions and 3 deletions

View File

@@ -184,3 +184,180 @@ export function getCategoryDefaultSelection(category: string): CategorySelection
}
return { name: category, option: CategorySkipOption.Disabled} as CategorySelection;
}
type TokenType =
| "if" // Keywords
| "disabled" | "show overlay" | "manual skip" | "auto skip" // Skip option
| keyof typeof SkipRuleAttribute // Segment attributes
| keyof typeof SkipRuleOperator // Segment attribute operators
| "and" | "or" // Expression operators
| "string" // Literal values
| "eof" | "error"; // Sentinel and special tokens
interface SourcePos {
line: number;
// column: number;
}
interface Span {
start: SourcePos;
end: SourcePos;
}
interface Token {
type: TokenType;
span: Span;
value: string;
}
interface LexerState {
source: string;
start: number;
current: number;
start_pos: SourcePos;
current_pos: SourcePos;
}
function nextToken(state: LexerState): Token {
function makeToken(type: TokenType): Token {
return {
type,
span: { start: state.start_pos, end: state.current_pos, },
value: state.source.slice(state.start, state.current),
};
}
/**
* Returns the UTF-16 value at the current position and advances it forward.
* If the end of the source string has been reached, returns {@code null}.
*
* @return current UTF-16 value, or {@code null} on EOF
*/
function consume(): string | null {
if (state.source.length > state.current) {
// The UTF-16 value at the current position, which could be either a Unicode code point or a lone surrogate.
// The check above this is also based on the UTF-16 value count, so this should not be able to fail on “weird” inputs.
const c = state.source[state.current];
state.current++;
if (c === "\n") {
state.current_pos.line++;
// state.current_pos.column = 1;
} else {
// // TODO This will be wrong on anything involving UTF-16 surrogate pairs or grapheme clusters with multiple code units
// // So just don't show column numbers on errors for now
// state.current_pos.column++;
}
return c;
} else {
return null;
}
}
/**
* Returns the UTF-16 value at the current position without advancing it.
* If the end of the source string has been reached, returns {@code null}.
*
* @return current UTF-16 value, or {@code null} on EOF
*/
function peek(): string | null {
if (state.source.length > state.current) {
// See comment in consume() for Unicode expectations here
return state.source[state.current];
} else {
return null;
}
}
/**
* Checks the current position against expected UTF-16 values.
* If any of them matches, advances the current position and returns
* {@code true}, otherwise {@code false}.
*
* @param expected the expected set of UTF-16 values at the current position
* @return whether the actual value matches and whether the position was advanced
*/
function expect(expected: string | readonly string[]): boolean {
const actual = peek();
if (actual === null) {
return false;
}
if (typeof expected === "string") {
if (expected === actual) {
consume();
return true;
}
} else if (expected.includes(actual)) {
consume();
return true;
}
return false;
}
/**
* Skips a series of whitespace characters starting at the current
* position. May advance the current position multiple times, once,
* or not at all.
*/
function skipWhitespace() {
let c = peek();
const whitespace = /s+/;
while (c != null) {
if (!whitespace.test(c)) {
return;
}
consume();
c = peek();
}
}
/**
* Skips all characters until the next {@code "\n"} (line feed)
* character occurs (inclusive). Will always advance the current position
* at least once.
*/
function skipLine() {
let c = consume();
while (c != null) {
if (c == '\n') {
return;
}
c = consume();
}
}
function isEof(): boolean {
return state.current >= state.source.length;
}
for (;;) {
skipWhitespace();
state.start = state.current;
if (isEof()) {
return makeToken("eof");
}
const c = consume();
switch (c) {
// TODO
default:
return makeToken("error");
}
}
}
export function compileConfig(config: string): AdvancedSkipRuleSet[] | null {
// TODO
const ruleSets: AdvancedSkipRuleSet[] = [];
return ruleSets;
}