From abf363416b19401458775fdbde07fad722bf891e Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 17:06:47 +0100 Subject: [PATCH 01/15] Replace TRegExpr with purpose-built backtracking bytecode VM regex engine (#515) TRegExpr used native call recursion for backtracking, causing SIGSEGV on inputs ~42K+ chars when combined with the evaluator's stack depth. Three preprocessing passes papered over feature gaps (modifier scope leak, no named groups, ASCII-approximate Unicode properties). This replaces the entire backend with a custom regex engine while keeping the public API (ExecuteRegExp, TGocciaRegExpMatchResult) unchanged. New units: - Goccia.RegExp.Compiler: recursive-descent parser over ES2026 regex grammar, single-pass bytecode emitter with pre-scanned named groups for forward \k resolution, inline modifier group scoping, and duplicate named group validation via disjunction path tracking. - Goccia.RegExp.VM: iterative dispatch loop with heap-allocated backtrack stack and always-on failure memoization (64K-entry hash table). No native call recursion. 10M step limit throws Error instead of crashing. Key design decisions: - Modifier state (ignoreCase, multiline, dotAll) is encoded per-instruction at compile time, not read from global flags at runtime. This gives correct scoping for (?ims-ims:...) modifier groups. - Duplicate named backreferences emit a SPLIT chain with strict-mode backrefs (fail if group uncaptured) + terminal FAIL, so only the participating group's captured text is matched. - Reuses TextSemantics.pas UTF-8 functions (TryReadUTF8CodePoint, AdvanceUTF8StringIndex, CodePointToUTF8) rather than reimplementing. - Removes FPC regexpr package from the cross-compilation toolchain. - Removes staging/sm/RegExp/test-trailing.js from KNOWN_ENGINE_CRASHES. 
Closes #515 Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 11 +- .github/workflows/toolchain.yml | 3 +- docs/build-system.md | 2 +- docs/built-ins.md | 2 +- docs/decision-log.md | 2 + scripts/run_test262_suite.ts | 2 - source/units/Goccia.RegExp.Compiler.pas | 1464 +++++++++++++++++++++++ source/units/Goccia.RegExp.Engine.pas | 822 +------------ source/units/Goccia.RegExp.Unicode.pas | 611 ---------- source/units/Goccia.RegExp.VM.pas | 654 ++++++++++ 10 files changed, 2174 insertions(+), 1399 deletions(-) create mode 100644 source/units/Goccia.RegExp.Compiler.pas delete mode 100644 source/units/Goccia.RegExp.Unicode.pas create mode 100644 source/units/Goccia.RegExp.VM.pas diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ef911e48..d2aade09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,7 +134,6 @@ jobs: GEN_DIR="$PREFIX/lib/fpc/${FPC_VERSION}/units/${TARGET}/rtl-generics" FCL_DIR="$PREFIX/lib/fpc/${FPC_VERSION}/units/${TARGET}/fcl-process" FCL_BASE_SRC="$PREFIX/share/fpcsrc/packages/fcl-base/src" - REGEXPR_SRC="$PREFIX/share/fpcsrc/packages/regexpr/src" FCL_NET_SRC="$PREFIX/share/fpcsrc/packages/fcl-net/src" OPENSSL_SRC="$PREFIX/share/fpcsrc/packages/openssl/src" @@ -144,11 +143,9 @@ jobs: echo "rtl-generics units: $(ls "$GEN_DIR"/*.ppu 2>/dev/null | wc -l) .ppu files" echo "fcl-process units: $(ls "$FCL_DIR"/*.ppu 2>/dev/null | wc -l) .ppu files" echo "fcl-base source path: $FCL_BASE_SRC" - echo "regexpr source path: $REGEXPR_SRC" echo "fcl-net source path: $FCL_NET_SRC" echo "openssl source path: $OPENSSL_SRC" test -d "$FCL_BASE_SRC" - test -d "$REGEXPR_SRC" test -d "$FCL_NET_SRC" test -d "$OPENSSL_SRC" @@ -168,7 +165,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" 
-Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -184,7 +181,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -197,7 +194,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -209,7 +206,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ diff --git a/.github/workflows/toolchain.yml b/.github/workflows/toolchain.yml index 
9f0fd9d4..ff0ebba6 100644 --- a/.github/workflows/toolchain.yml +++ b/.github/workflows/toolchain.yml @@ -107,9 +107,8 @@ jobs: # Keep official package sources available for cross builds. The # cached cross toolchain only prebuilds a minimal package subset, so - # source-based lookup is needed for units like Base64 and RegExpr. + # source-based lookup is needed for units like Base64. cp -R "$GITHUB_WORKSPACE/fpc-source/packages/fcl-base" "$PREFIX/share/fpcsrc/packages/" - cp -R "$GITHUB_WORKSPACE/fpc-source/packages/regexpr" "$PREFIX/share/fpcsrc/packages/" cp -R "$GITHUB_WORKSPACE/fpc-source/packages/fcl-net" "$PREFIX/share/fpcsrc/packages/" cp -R "$GITHUB_WORKSPACE/fpc-source/packages/openssl" "$PREFIX/share/fpcsrc/packages/" diff --git a/docs/build-system.md b/docs/build-system.md index 8c1e7c6e..8041413e 100644 --- a/docs/build-system.md +++ b/docs/build-system.md @@ -433,7 +433,7 @@ It: 6. If the `FPC_TARGET_CPU` environment variable is set, prepends `-P` to the compiler arguments (used by CI to target x86_64 on Windows where the FPC package defaults to i386). 7. For the `tests` target, auto-discovers all `*.Test.pas` files in `source/units/` and `source/shared/`. -The GitHub Actions cross-compilation workflow uses a reduced cached FPC toolchain rather than a full target-side FCL install. It prebuilds the RTL, `rtl-objpas`, `rtl-generics`, and `fcl-process`, and also caches the official `fcl-base` and `regexpr` sources so cross builds can resolve units such as `Base64` and `RegExpr` on demand from the shipped FPC packages. +The GitHub Actions cross-compilation workflow uses a reduced cached FPC toolchain rather than a full target-side FCL install. It prebuilds the RTL, `rtl-objpas`, `rtl-generics`, and `fcl-process`, and also caches the official `fcl-base` sources so cross builds can resolve units such as `Base64` on demand from the shipped FPC packages. 
## Project Structure for Compilation diff --git a/docs/built-ins.md b/docs/built-ins.md index 8414b259..735ae2f6 100644 --- a/docs/built-ins.md +++ b/docs/built-ins.md @@ -236,7 +236,7 @@ RegExp is available as both `RegExp()` and `new RegExp()`. Regex literals (`/pat - When the replacer is a function and named groups are present, the `groups` object is passed as the last argument after `input`. - `String.prototype.match`, `matchAll`, `replace`, `replaceAll`, `search`, and `split` dispatch through the corresponding well-known symbol hooks, so custom protocol objects work as expected. - `matchAll()` returns a lazy iterator that advances matches on demand per the specification. -- The `u` flag enables Unicode-aware pattern matching. Unicode property escapes (`\p{Letter}`, `\P{ASCII}`, etc.) are expanded to equivalent character classes. Unicode code point escapes (`\u{41}`, `\u{1F600}`) are converted to UTF-8 byte sequences. Supported properties: `L`/`Letter`, `Lu`/`Uppercase_Letter`, `Ll`/`Lowercase_Letter`, `N`/`Number`, `Nd`/`Decimal_Number`, `P`/`Punctuation`, `S`/`Symbol`, `Z`/`Separator`, `Cc`/`Control`, `ASCII`, `ASCII_Hex_Digit`, `White_Space`. Unsupported properties throw `SyntaxError`. The `u` flag also disables TRegExpr's Russian charset extensions and enables correct `AdvanceStringIndex` for multi-byte UTF-8 sequences. +- The `u` flag enables Unicode-aware pattern matching. Unicode property escapes (`\p{Letter}`, `\P{ASCII}`, etc.) are matched against Unicode code point range tables. Unicode code point escapes (`\u{41}`, `\u{1F600}`) are converted to UTF-8 byte sequences. Supported properties: `L`/`Letter`, `Lu`/`Uppercase_Letter`, `Ll`/`Lowercase_Letter`, `N`/`Number`, `Nd`/`Decimal_Number`, `P`/`Punctuation`, `S`/`Symbol`, `Z`/`Separator`, `Cc`/`Control`, `ASCII`, `ASCII_Hex_Digit`, `White_Space`. Unsupported properties throw `SyntaxError`. The `u` flag enables correct `AdvanceStringIndex` for multi-byte UTF-8 sequences. 
- The `v` flag (Unicode sets) is accepted and exposed through `.flags` and `.unicodeSets`. The `u` and `v` flags are mutually exclusive. Full Unicode set notation and properties of strings in character classes are not yet implemented beyond basic `u` flag behavior. - The `d` flag (indices) is accepted and exposed through `.flags` and `.hasIndices`. Match indices are not yet populated. diff --git a/docs/decision-log.md b/docs/decision-log.md index 803c7a5f..6a8886ca 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -17,6 +17,8 @@ Chronological record of key architectural and implementation decisions, newest f --- +**2026-05-08** · `engine` — Replace TRegExpr with a purpose-built backtracking bytecode VM regex engine. TRegExpr used native call recursion for backtracking, causing SIGSEGV on inputs ~42K+ chars when combined with the evaluator's stack depth (#515). Three preprocessing passes papered over feature gaps: `(?s)` modifier scope leak, no named groups (two-pass rewrite), and inadequate Unicode (`\p{...}` expanded to ASCII approximations). New architecture: `Goccia.RegExp.Compiler.pas` (recursive-descent parser + bytecode emitter) and `Goccia.RegExp.VM.pas` (iterative dispatch loop with heap-allocated backtrack stack and always-on failure memoization). The compiler parses ES2026 regex grammar directly, handling named groups, Unicode property escapes, inline modifier groups, and backreferences natively — no preprocessing passes. The memoization cache records `(PC, InputPos)` failure states to prune exponential backtracking (e.g., `(a+)+b`). Configurable step limit (default 10M) throws `Error` instead of crashing. Removes the FPC `regexpr` package from the cross-compilation toolchain. Reuses `TextSemantics.pas` UTF-8 functions (`TryReadUTF8CodePoint`, `AdvanceUTF8StringIndex`, `CodePointToUTF8`, etc.) rather than reimplementing. 
Public API (`ExecuteRegExp` signature, `TGocciaRegExpMatchResult` record) unchanged; `Goccia.RegExp.Runtime.pas` and `Goccia.Builtins.GlobalRegExp.pas` unmodified. + **2026-05-05** · `parser` — Opt-in traditional `for(init; test; update)` loops (`--compat-traditional-for-loop`). Added behind a new compatibility flag for ECMAScript compatibility when porting legacy code, mirroring the existing `--compat-var` and `--compat-function` posture. The flag is off by default and ORed in by `--compat-all` so test262 (which always passes `--compat-all`) executes traditional `for(;;)` bodies that previously parser-warn-and-skipped — surfacing real engine gaps in unrelated areas (Atomics #541, Intl #542, BigInt postfix increment #540, etc.). `let`/`const` declarations in for-init create a per-iteration lexical environment per ES2026 §14.7.4.4, so closures captured during iteration N pin to that iteration's binding (the textbook `fns.push(() => i)` case yields `[0, 1, 2]`, not `[3, 3, 3]`). `var` declarations require both `--compat-var` and the new flag and share a single hoisted binding visible after the loop. The bytecode compiler reuses the counted-loop pattern from `CompileCountedForOf` for `for(let i = N; i M; i++ | i--)` shapes (rejecting var/const, bodies that mutate the loop var, type annotations, and non-integer-literal cond RHS). `while` and `do...while` remain excluded — they have the same stub status but were intentionally split into a separate iteration. [language-tables.md](language-tables.md). **2026-05-04** · `testing` · [#513](https://github.com/frostney/GocciaScript/pull/513) — test262 conformance harness reframed around the standard tc39 convention. Previously the wrapper ran inside `GocciaTestRunner` and had to selectively hide / capture / restore the test-library globals (`expect`, `describe`, `test`, `runTests`, etc.) 
it registered, with failure capture leaning on an `undefined` sentinel that collided with thrown `undefined` and chunked-runner crashes that masked thousands of conformance failures as wrapper failures (#491 history). Replaced with: per-test `GocciaScriptLoaderBare` subprocess, stock tc39/test262 harness files read directly from the pinned checkout's `harness/` directory (with a small set of bundled adaptations under `scripts/test262_harness/` for stock files that depend on language features Goccia excludes by design or that work around specific engine bugs — see [test262.md § Bundled harness adaptations](test262.md#bundled-harness-adaptations)), exit-code + stdout-marker wire protocol identical to `test262-harness`/`eshost`/test262.fyi, and a thin TypeScript orchestrator (`scripts/run_test262_suite.ts`). No eligibility filter — every discovered test runs; per-test subprocess + `--timeout` + `--max-memory` bound the blast radius. Wrapper-template drift is now structurally impossible because the "template" is `harness + body` string concatenation. Wrapper-infra failures are classified separately and gated to zero in CI. 
Surfaced eleven engine bugs (all milestoned 0.8.0, all labeled `engine` per the architecture split where `engine` covers `TGocciaEngine` — language semantics + ECMAScript built-ins — and `runtime` is reserved for `TGocciaRuntime` host extensions like console/fetch/JSON5): [#514](https://github.com/frostney/GocciaScript/issues/514) (Iterator.concat SIGSEGV), [#515](https://github.com/frostney/GocciaScript/issues/515) (RegExp.test SIGSEGV), [#516](https://github.com/frostney/GocciaScript/issues/516) (`Reflect.construct` rejects function decls/exprs), [#517](https://github.com/frostney/GocciaScript/issues/517) (script-mode unattached call `this`), [#518](https://github.com/frostney/GocciaScript/issues/518) (bytecode VM Range-check on top-level `Promise.then` drain), [#519](https://github.com/frostney/GocciaScript/issues/519) (`Error.prototype.constructor` missing), [#520](https://github.com/frostney/GocciaScript/issues/520) (module arrow `this` lexical inheritance), [#521](https://github.com/frostney/GocciaScript/issues/521) (`var`/`function` shadowing built-in globals), [#522](https://github.com/frostney/GocciaScript/issues/522) (`String(obj)` doesn't invoke `toString`), [#523](https://github.com/frostney/GocciaScript/issues/523) (`yield*` accesses `.next` on null), [#524](https://github.com/frostney/GocciaScript/issues/524) (for-of re-fetches `iterator.next` each iteration). Each bundled-harness adaptation under `scripts/test262_harness/` references its tracking issue and is to be removed when the underlying engine bug is fixed. [test262.md](test262.md). diff --git a/scripts/run_test262_suite.ts b/scripts/run_test262_suite.ts index a31588a6..ffdbb4bb 100644 --- a/scripts/run_test262_suite.ts +++ b/scripts/run_test262_suite.ts @@ -74,8 +74,6 @@ const DEFAULT_JOBS = 4; // bug. Per docs/test262.md "Updating the contract", this list is the // only allowed form of test-skipping; no generic eligibility filter. 
const KNOWN_ENGINE_CRASHES = new Set([
-  // SIGSEGV: RegExp.prototype.test trailing-input edge case. https://github.com/frostney/GocciaScript/issues/515
-  "staging/sm/RegExp/test-trailing.js",
 ]);
 
 // ---------------------------------------------------------------------------
diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas
new file mode 100644
index 00000000..2b01b9ab
--- /dev/null
+++ b/source/units/Goccia.RegExp.Compiler.pas
@@ -0,0 +1,1464 @@
+unit Goccia.RegExp.Compiler;
+
+{$I Goccia.inc}
+
+interface
+
+uses
+  Goccia.RegExp.Engine;
+
+type
+  TRegExpOpCode = ( // stored in the low 8 bits of every instruction word (see EncodeInstr)
+    RX_CHAR = 0, // operand: a code point (emitted by EmitCharMatch)
+    RX_CHAR_CLASS = 1, // operand: index into TRegExpProgram.CharClasses
+    RX_CHAR_CLASS_NEG = 2, // operand: index into TRegExpProgram.CharClasses, emitted for negated classes
+    RX_ANY = 3,
+    RX_SPLIT = 4,
+    RX_SPLIT_LAZY = 5,
+    RX_JUMP = 6,
+    RX_SAVE = 7,
+    RX_BACKREF = 8, // operand: capture index, optionally OR-ed with BACKREF_STRICT_FLAG
+    RX_ASSERT_START = 9,
+    RX_ASSERT_END = 10,
+    RX_ASSERT_WORD = 11,
+    RX_LOOKAHEAD = 12,
+    RX_LOOKBEHIND = 13,
+    RX_MATCH = 14,
+    RX_FAIL = 15
+  );
+
+  TRegExpCharRange = record // one code point range; both ends inclusive (\d is added as '0'..'9')
+    Lo: Cardinal;
+    Hi: Cardinal;
+  end;
+
+  TRegExpCharClass = record // a character class is a list of ranges, in insertion order
+    Ranges: array of TRegExpCharRange;
+  end;
+
+  TRegExpProgram = record // compiler output
+    Code: array of UInt32; // instruction words: opcode in bits 0..7, operand in bits 8..31
+    CharClasses: array of TRegExpCharClass; // referenced by index from class instructions
+    CaptureCount: Integer;
+    NamedGroups: TGocciaRegExpNamedGroups;
+    FlagIgnoreCase: Boolean;
+    FlagMultiline: Boolean;
+    FlagDotAll: Boolean;
+    FlagUnicode: Boolean;
+  end;
+
+function CompileRegExp(const APattern, AFlags: string): TRegExpProgram;
+procedure ValidateRegExpPatternNew(const APattern, AFlags: string);
+
+implementation
+
+uses
+  Math,
+  SysUtils,
+
+  TextSemantics;
+
+type
+  TModifierState = record // i / m / s state in effect at the current parse position
+    IgnoreCase: Boolean;
+    Multiline: Boolean;
+    DotAll: Boolean;
+  end;
+
+  TRegExpCompiler = class
+  private
+    FPattern: string;
+    FFlags: string;
+    FPos: Integer; // 1-based index of the next unread character of FPattern
+    FCode: array of UInt32; // over-allocated instruction buffer; only the first FCodeLen entries are valid
+    FCodeLen: Integer;
+    FCharClasses: array of TRegExpCharClass;
+    FCaptureCount: Integer;
+    FNamedGroups: TGocciaRegExpNamedGroups;
+    FAltStack: array of Integer;
+    FAltStackDepth: Integer;
+    FModifier: TModifierState; // seeded from the flags in Create
+    FUnicode: Boolean; // True when the flags contain 'u' or 'v'
+    function Peek: Char;
+    function PeekAt(AOffset: Integer): Char;
+    function AtEnd: Boolean;
+    function Advance: Char;
+    function Match(C: Char): Boolean;
+    procedure Emit(AInstr: UInt32);
+    function EmitHole: Integer;
+    procedure PatchHole(AIndex: Integer; ATarget: Integer);
+    function CurrentPC: Integer;
+    function EncodeOp(AOp: TRegExpOpCode): UInt32;
+    function EncodeOpBx(AOp: TRegExpOpCode; ABx: Integer): UInt32;
+    function AddCharClass(const ARanges: array of TRegExpCharRange): Integer;
+    function AddCharClassFromDynamic(const ARanges: array of TRegExpCharRange): Integer;
+    procedure CompilePattern;
+    procedure CompileDisjunction;
+    procedure CompileAlternative;
+    procedure CompileTerm;
+    procedure CompileAtom;
+    procedure CompileQuantifier(AAtomStart: Integer);
+    procedure CompileCharacterClass;
+    procedure CompileEscape(AInCharClass: Boolean; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+    procedure CompileEscapeAtom;
+    procedure CompileGroup;
+    procedure CompileModifierGroup;
+    function ParseGroupName: string;
+    function ParseUnicodeEscape: Cardinal;
+    function ParseHexEscape(ADigits: Integer): Cardinal;
+    function ParseDecimalEscape: Integer;
+    procedure EmitCharMatch(ACodePoint: Cardinal);
+    procedure EmitCharClassRanges(const ARanges: array of TRegExpCharRange;
+      ARangeCount: Integer; ANegated: Boolean);
+    procedure AddBuiltinCharClass(AEscapeChar: Char; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+    procedure AddRange(var ARanges: array of TRegExpCharRange; var ARangeCount: Integer; ALo, AHi: Cardinal);
+    function CaseFold(ACodePoint: Cardinal): Cardinal;
+    procedure EmitUnicodePropertyClass(const APropertyName: string; ANegated: Boolean);
+    procedure GetUnicodePropertyRanges(const APropertyName: string; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+    function ReadCodePoint: Cardinal;
+    procedure EnsureCodeCapacity(ANeeded: Integer);
+    procedure EmitBody(const ABody: array of UInt32; ALen: Integer);
+    procedure ValidateNamedGroups;
+    procedure PreScanNamedGroups;
+    procedure InsertSplitAt(APos: Integer);
+    procedure EmitDuplicateNamedBackref(const AName: string);
+  public
+    constructor Create(const APattern, AFlags: string);
+    function Compile: TRegExpProgram;
+  end;
+
+const
+  MAX_CHAR_RANGES = 512; // size of the fixed range buffer declared in EmitUnicodePropertyClass
+
+function EncodeInstr(AOp: TRegExpOpCode; ABx: Integer): UInt32; inline; // packs opcode (low byte) and operand (bits 8 and up)
+begin
+  Result := UInt32(Ord(AOp)) or (UInt32(ABx) shl 8);
+end;
+
+function DecodeBx(AInstr: UInt32): Integer; inline; // inverse of EncodeInstr for the operand part
+begin
+  Result := Integer(AInstr shr 8);
+end;
+
+constructor TRegExpCompiler.Create(const APattern, AFlags: string); // stores pattern and flags, resets all compiler state
+begin
+  inherited Create;
+  FPattern := APattern;
+  FFlags := AFlags;
+  FPos := 1; // Pascal strings are 1-based
+  FCodeLen := 0;
+  SetLength(FCode, 256); // initial capacity; Emit grows the buffer on demand
+  SetLength(FCharClasses, 0);
+  FCaptureCount := 0;
+  SetLength(FNamedGroups, 0);
+  SetLength(FAltStack, 64);
+  FAltStackDepth := 0;
+  FAltStack[0] := 0;
+  FModifier.IgnoreCase := HasRegExpFlag(AFlags, 'i');
+  FModifier.Multiline := HasRegExpFlag(AFlags, 'm');
+  FModifier.DotAll := HasRegExpFlag(AFlags, 's');
+  FUnicode := HasRegExpFlag(AFlags, 'u') or HasRegExpFlag(AFlags, 'v'); // either flag switches on Unicode mode
+end;
+
+function TRegExpCompiler.Peek: Char; // current character without consuming it
+begin
+  if FPos <= Length(FPattern) then
+    Result := FPattern[FPos]
+  else
+    Result := #0; // sentinel once the pattern is exhausted
+end;
+
+function TRegExpCompiler.PeekAt(AOffset: Integer): Char; // character AOffset positions from FPos, #0 when out of bounds
+var
+  Idx: Integer;
+begin
+  Idx := FPos + AOffset;
+  if (Idx >= 1) and (Idx <= Length(FPattern)) then
+    Result := FPattern[Idx]
+  else
+    Result := #0;
+end;
+
+function TRegExpCompiler.AtEnd: Boolean; // True once every character has been consumed
+begin
+  Result := FPos > Length(FPattern);
+end;
+
+function TRegExpCompiler.Advance: Char; // returns the current character and steps past it
+begin
+  Result := Peek;
+  Inc(FPos); // unconditional: FPos also moves when Peek returned the #0 sentinel
+end;
+
+function TRegExpCompiler.Match(C: Char): Boolean; // consumes the current character only if it equals C
+begin
+  if Peek = C then
+  begin
+    Inc(FPos);
+    Result := True;
+  end
+  else
+    Result := False;
+end;
+
+procedure TRegExpCompiler.Emit(AInstr: UInt32); // appends one instruction word to FCode
+begin
+  if FCodeLen >= Length(FCode) then // buffer full: grow before writing
+    SetLength(FCode, FCodeLen * 2 + 16);
+
FCode[FCodeLen] := AInstr;
+  Inc(FCodeLen);
+end;
+
+// Emits a placeholder word (0) and returns its index so that the operand can
+// be filled in later with PatchHole.
+function TRegExpCompiler.EmitHole: Integer;
+begin
+  Result := FCodeLen;
+  Emit(0);
+end;
+
+// Rewrites the operand of the instruction at AIndex to ATarget while keeping
+// the opcode that is already stored in its low byte.
+procedure TRegExpCompiler.PatchHole(AIndex: Integer; ATarget: Integer);
+var
+  Op: TRegExpOpCode;
+begin
+  Op := TRegExpOpCode(FCode[AIndex] and $FF);
+  FCode[AIndex] := UInt32(Ord(Op)) or (UInt32(ATarget) shl 8);
+end;
+
+// Index at which the next instruction will be emitted.
+function TRegExpCompiler.CurrentPC: Integer;
+begin
+  Result := FCodeLen;
+end;
+
+// Encodes an instruction that carries no operand.
+function TRegExpCompiler.EncodeOp(AOp: TRegExpOpCode): UInt32;
+begin
+  Result := UInt32(Ord(AOp));
+end;
+
+// Encodes an instruction with operand ABx: opcode in bits 0..7, operand in
+// bits 8 and up.
+function TRegExpCompiler.EncodeOpBx(AOp: TRegExpOpCode; ABx: Integer): UInt32;
+begin
+  Result := UInt32(Ord(AOp)) or (UInt32(ABx) shl 8);
+end;
+
+// Copies ARanges into a new entry of FCharClasses and returns the index of
+// that entry (the operand for RX_CHAR_CLASS / RX_CHAR_CLASS_NEG).
+function TRegExpCompiler.AddCharClass(
+  const ARanges: array of TRegExpCharRange): Integer;
+var
+  I: Integer;
+begin
+  Result := Length(FCharClasses);
+  SetLength(FCharClasses, Result + 1);
+  SetLength(FCharClasses[Result].Ranges, Length(ARanges));
+  for I := 0 to High(ARanges) do
+    FCharClasses[Result].Ranges[I] := ARanges[I];
+end;
+
+// Kept for existing callers. An open-array parameter accepts static and
+// dynamic arrays alike, so this simply forwards to AddCharClass instead of
+// duplicating its body.
+function TRegExpCompiler.AddCharClassFromDynamic(
+  const ARanges: array of TRegExpCharRange): Integer;
+begin
+  Result := AddCharClass(ARanges);
+end;
+
+// Returns the opposite-case ASCII letter for ACodePoint while the ignoreCase
+// modifier is active; every other input is returned unchanged.
+function TRegExpCompiler.CaseFold(ACodePoint: Cardinal): Cardinal;
+begin
+  if not FModifier.IgnoreCase then
+    Exit(ACodePoint);
+  if (ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z')) then
+    Result := ACodePoint + 32
+  else if (ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z')) then
+    Result := ACodePoint - 32
+  else
+    Result := ACodePoint;
+end;
+
+// Emits the instruction(s) that match the single code point ACodePoint.
+//
+// Without ignoreCase this is a plain RX_CHAR. With ignoreCase an ASCII letter
+// becomes a small RX_CHAR_CLASS holding both cases. In Unicode mode the class
+// additionally follows simple case folding, where U+212A (KELVIN SIGN) folds
+// to 'k' and U+017F (LATIN SMALL LETTER LONG S) folds to 's': each member of
+// such a folding set matches every member, including the non-ASCII code point
+// itself. Other non-ASCII code points are still matched exactly.
+procedure TRegExpCompiler.EmitCharMatch(ACodePoint: Cardinal);
+const
+  KELVIN_SIGN = $212A;
+  LATIN_SMALL_LONG_S = $017F;
+var
+  Variants: array of TRegExpCharRange;
+  Upper, Lower: Cardinal;
+  HasCasePair: Boolean;
+
+  procedure AddVariant(AVariant: Cardinal);
+  begin
+    SetLength(Variants, Length(Variants) + 1);
+    Variants[High(Variants)].Lo := AVariant;
+    Variants[High(Variants)].Hi := AVariant;
+  end;
+
+begin
+  if FModifier.IgnoreCase then
+  begin
+    HasCasePair := True;
+    Upper := 0;
+    Lower := 0;
+    if (ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z')) then
+    begin
+      Upper := ACodePoint;
+      Lower := ACodePoint + 32;
+    end
+    else if (ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z')) then
+    begin
+      Upper := ACodePoint - 32;
+      Lower := ACodePoint;
+    end
+    else if FUnicode and (ACodePoint = KELVIN_SIGN) then
+    begin
+      Upper := Ord('K');
+      Lower := Ord('k');
+    end
+    else if FUnicode and (ACodePoint = LATIN_SMALL_LONG_S) then
+    begin
+      Upper := Ord('S');
+      Lower := Ord('s');
+    end
+    else
+      HasCasePair := False;
+
+    if HasCasePair then
+    begin
+      AddVariant(Upper);
+      AddVariant(Lower);
+      // Third member of the folding set; only reachable with the u/v flag.
+      if FUnicode then
+        case Lower of
+          Ord('k'): AddVariant(KELVIN_SIGN);
+          Ord('s'): AddVariant(LATIN_SMALL_LONG_S);
+        end;
+      Emit(EncodeOpBx(RX_CHAR_CLASS, AddCharClass(Variants)));
+      Exit;
+    end;
+  end;
+  Emit(EncodeOpBx(RX_CHAR, Integer(ACodePoint)));
+end;
+
+// Appends the inclusive range ALo..AHi to the caller-supplied buffer and
+// bumps ARangeCount.
+//
+// Raises EConvertError when the buffer is already full. Dropping the range
+// silently would compile a class that matches a different set than the
+// pattern asked for.
+procedure TRegExpCompiler.AddRange(var ARanges: array of TRegExpCharRange;
+  var ARangeCount: Integer; ALo, AHi: Cardinal);
+begin
+  if ARangeCount >= Length(ARanges) then
+    raise EConvertError.Create('Character class has too many ranges');
+  ARanges[ARangeCount].Lo := ALo;
+  ARanges[ARangeCount].Hi := AHi;
+  Inc(ARangeCount);
+end;
+
+procedure TRegExpCompiler.AddBuiltinCharClass(AEscapeChar: Char;
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+begin
+  case AEscapeChar of
+    'd':
+      AddRange(ARanges, ARangeCount, Ord('0'), Ord('9'));
+    'D':
+      begin
+        AddRange(ARanges, ARangeCount, 0, Ord('0') - 1);
+        AddRange(ARanges, ARangeCount, Ord('9') + 1, $10FFFF);
+      end;
+    'w':
+      begin
+        AddRange(ARanges, ARangeCount, Ord('0'), Ord('9'));
+        AddRange(ARanges, ARangeCount, Ord('A'), Ord('Z'));
+        AddRange(ARanges, ARangeCount, Ord('_'), Ord('_'));
+        AddRange(ARanges, ARangeCount, Ord('a'), Ord('z'));
+      end;
+    'W':
+      begin
+        AddRange(ARanges, ARangeCount, 0, Ord('0') - 1);
+        AddRange(ARanges, ARangeCount, Ord('9') + 1, Ord('A') - 1);
+        AddRange(ARanges, ARangeCount, Ord('Z') + 1, Ord('_') - 1);
+        AddRange(ARanges, ARangeCount,
Ord('_') + 1, Ord('a') - 1);
+        AddRange(ARanges, ARangeCount, Ord('z') + 1, $10FFFF);
+      end;
+    's': // \s: the code points listed below
+      begin
+        AddRange(ARanges, ARangeCount, $09, $0D); // TAB, LF, VT, FF, CR
+        AddRange(ARanges, ARangeCount, $20, $20); // SPACE
+        AddRange(ARanges, ARangeCount, $A0, $A0); // NO-BREAK SPACE
+        AddRange(ARanges, ARangeCount, $1680, $1680);
+        AddRange(ARanges, ARangeCount, $2000, $200A);
+        AddRange(ARanges, ARangeCount, $2028, $2029); // LINE SEPARATOR, PARAGRAPH SEPARATOR
+        AddRange(ARanges, ARangeCount, $202F, $202F);
+        AddRange(ARanges, ARangeCount, $205F, $205F);
+        AddRange(ARanges, ARangeCount, $3000, $3000);
+        AddRange(ARanges, ARangeCount, $FEFF, $FEFF); // ZERO WIDTH NO-BREAK SPACE (BOM)
+      end;
+    'S': // \S: the gaps between the \s ranges above, up to $10FFFF
+      begin
+        AddRange(ARanges, ARangeCount, 0, $08);
+        AddRange(ARanges, ARangeCount, $0E, $1F);
+        AddRange(ARanges, ARangeCount, $21, $9F);
+        AddRange(ARanges, ARangeCount, $A1, $167F);
+        AddRange(ARanges, ARangeCount, $1681, $1FFF);
+        AddRange(ARanges, ARangeCount, $200B, $2027);
+        AddRange(ARanges, ARangeCount, $202A, $202E);
+        AddRange(ARanges, ARangeCount, $2030, $205E);
+        AddRange(ARanges, ARangeCount, $2060, $2FFF);
+        AddRange(ARanges, ARangeCount, $3001, $FEFE);
+        AddRange(ARanges, ARangeCount, $FF00, $10FFFF);
+      end;
+  end; // any other AEscapeChar adds nothing
+end;
+
+procedure TRegExpCompiler.GetUnicodePropertyRanges(const APropertyName: string; // appends the ranges for one \p{...} property name
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+begin
+  if (APropertyName = 'L') or (APropertyName = 'Letter') then // NOTE(review): not the exact General_Category=L set; the non-ASCII ranges appear to be the XML 1.0 NameStartChar ranges - confirm this approximation is intended
+  begin
+    AddRange(ARanges, ARangeCount, $41, $5A);
+    AddRange(ARanges, ARangeCount, $61, $7A);
+    AddRange(ARanges, ARangeCount, $C0, $D6);
+    AddRange(ARanges, ARangeCount, $D8, $F6);
+    AddRange(ARanges, ARangeCount, $F8, $2FF);
+    AddRange(ARanges, ARangeCount, $370, $37D);
+    AddRange(ARanges, ARangeCount, $37F, $1FFF); // one contiguous run; also covers non-letter code points in this span
+    AddRange(ARanges, ARangeCount, $200C, $200D);
+    AddRange(ARanges, ARangeCount, $2070, $218F);
+    AddRange(ARanges, ARangeCount, $2C00, $2FEF);
+    AddRange(ARanges, ARangeCount, $3001, $D7FF);
+    AddRange(ARanges, ARangeCount, $F900, $FDCF);
+    AddRange(ARanges, ARangeCount, $FDF0, $FFFD);
+    AddRange(ARanges, ARangeCount, $10000, $EFFFF);
+  end
+  else if (APropertyName = 'Lu') or (APropertyName = 'Uppercase_Letter') then // NOTE(review): only ASCII and Latin-1 upper-case letters are listed
+  begin
+    AddRange(ARanges, ARangeCount, $41, $5A);
+    AddRange(ARanges, ARangeCount, $C0, $D6);
+    AddRange(ARanges, ARangeCount, $D8, $DE);
+  end
+  else if (APropertyName = 'Ll') or (APropertyName = 'Lowercase_Letter') then // NOTE(review): only ASCII and Latin-1 lower-case letters are listed
+  begin
+    AddRange(ARanges, ARangeCount, $61, $7A);
+    AddRange(ARanges, ARangeCount, $DF, $F6);
+    AddRange(ARanges, ARangeCount, $F8, $FF);
+  end
+  else if (APropertyName = 'N') or (APropertyName = 'Number') then // NOTE(review): same table as Nd below, ASCII digits only
+    AddRange(ARanges, ARangeCount, $30, $39)
+  else if (APropertyName = 'Nd') or (APropertyName = 'Decimal_Number') then // ASCII digits only
+    AddRange(ARanges, ARangeCount, $30, $39)
+  else if (APropertyName = 'P') or (APropertyName = 'Punctuation') then // every listed range is ASCII
+  begin
+    AddRange(ARanges, ARangeCount, $21, $23);
+    AddRange(ARanges, ARangeCount, $25, $2A);
+    AddRange(ARanges, ARangeCount, $2C, $2F);
+    AddRange(ARanges, ARangeCount, $3A, $3B);
+    AddRange(ARanges, ARangeCount, $3F, $40);
+    AddRange(ARanges, ARangeCount, $5B, $5D);
+    AddRange(ARanges, ARangeCount, $5F, $5F);
+    AddRange(ARanges, ARangeCount, $7B, $7B);
+    AddRange(ARanges, ARangeCount, $7D, $7D);
+  end
+  else if (APropertyName = 'S') or (APropertyName = 'Symbol') then // every listed range is ASCII
+  begin
+    AddRange(ARanges, ARangeCount, $24, $24);
+    AddRange(ARanges, ARangeCount, $2B, $2B);
+    AddRange(ARanges, ARangeCount, $3C, $3E);
+    AddRange(ARanges, ARangeCount, $5E, $5E);
+    AddRange(ARanges, ARangeCount, $60, $60);
+    AddRange(ARanges, ARangeCount, $7C, $7C);
+    AddRange(ARanges, ARangeCount, $7E, $7E);
+  end
+  else if (APropertyName = 'Z') or (APropertyName = 'Separator') then
+  begin
+    AddRange(ARanges, ARangeCount, $20, $20);
+    AddRange(ARanges, ARangeCount, $A0, $A0);
+    AddRange(ARanges, ARangeCount, $1680, $1680);
+    AddRange(ARanges, ARangeCount, $2000, $200A);
+    AddRange(ARanges, ARangeCount, $2028, $2029);
+    AddRange(ARanges, ARangeCount, $202F, $202F);
+    AddRange(ARanges, ARangeCount, $205F, $205F);
+    AddRange(ARanges,
ARangeCount, $3000, $3000);
+  end
+  else if (APropertyName = 'Cc') or (APropertyName = 'Control') then
+  begin
+    AddRange(ARanges, ARangeCount, $00, $1F);
+    AddRange(ARanges, ARangeCount, $7F, $9F);
+  end
+  else if APropertyName = 'ASCII' then
+    AddRange(ARanges, ARangeCount, $00, $7F)
+  else if APropertyName = 'ASCII_Hex_Digit' then
+  begin
+    AddRange(ARanges, ARangeCount, $30, $39);
+    AddRange(ARanges, ARangeCount, $41, $46);
+    AddRange(ARanges, ARangeCount, $61, $66);
+  end
+  else if APropertyName = 'White_Space' then
+  begin
+    AddRange(ARanges, ARangeCount, $09, $0D);
+    AddRange(ARanges, ARangeCount, $20, $20);
+    AddRange(ARanges, ARangeCount, $85, $85);
+    AddRange(ARanges, ARangeCount, $A0, $A0);
+    AddRange(ARanges, ARangeCount, $1680, $1680);
+    AddRange(ARanges, ARangeCount, $2000, $200A);
+    AddRange(ARanges, ARangeCount, $2028, $2029);
+    AddRange(ARanges, ARangeCount, $202F, $202F);
+    AddRange(ARanges, ARangeCount, $205F, $205F);
+    AddRange(ARanges, ARangeCount, $3000, $3000);
+  end
+  else
+    raise EConvertError.Create('Invalid Unicode property name: ' + APropertyName);
+end;
+
+// Emits a character-class instruction for a \p{Name} / \P{Name} atom.
+// The property is resolved through GetUnicodePropertyRanges (which raises
+// EConvertError for an unknown name); ANegated is forwarded to
+// EmitCharClassRanges, which picks the negated class opcode.
+procedure TRegExpCompiler.EmitUnicodePropertyClass(const APropertyName: string;
+  ANegated: Boolean);
+var
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+begin
+  RangeCount := 0;
+  GetUnicodePropertyRanges(APropertyName, Ranges, RangeCount);
+  EmitCharClassRanges(Ranges, RangeCount, ANegated);
+end;
+
+// Registers the first ARangeCount entries of ARanges as a character class and
+// emits RX_CHAR_CLASS (or RX_CHAR_CLASS_NEG when ANegated) referring to it.
+//
+// When the ignore-case modifier is active at this point of the pattern, every
+// ASCII letter covered by a range also gets its other-case counterpart added.
+// The expansion works on the intersection of each range with 'A'..'Z' and
+// 'a'..'z', so a range that merely overlaps the letters (for example [@-Z] or
+// [0-z]) is folded too, not only ranges lying entirely inside one alphabet.
+procedure TRegExpCompiler.EmitCharClassRanges(
+  const ARanges: array of TRegExpCharRange;
+  ARangeCount: Integer; ANegated: Boolean);
+var
+  ClassIdx, I, OrigCount: Integer;
+  FoldLo, FoldHi: Cardinal;
+  Op: TRegExpOpCode;
+  DynRanges: array of TRegExpCharRange;
+
+  // Appends the range [ALo, AHi] to the working copy of the range list.
+  procedure AppendRange(ALo, AHi: Cardinal);
+  begin
+    SetLength(DynRanges, Length(DynRanges) + 1);
+    DynRanges[High(DynRanges)].Lo := ALo;
+    DynRanges[High(DynRanges)].Hi := AHi;
+  end;
+
+begin
+  SetLength(DynRanges, ARangeCount);
+  for I := 0 to ARangeCount - 1 do
+    DynRanges[I] := ARanges[I];
+  if FModifier.IgnoreCase then
+  begin
+    // Only the ranges present before folding are inspected; the ranges
+    // appended below are already the folded counterparts.
+    OrigCount := Length(DynRanges);
+    for I := 0 to OrigCount - 1 do
+    begin
+      // Portion of the range inside 'A'..'Z' -> add the lower-case letters.
+      FoldLo := DynRanges[I].Lo;
+      if FoldLo < Ord('A') then
+        FoldLo := Ord('A');
+      FoldHi := DynRanges[I].Hi;
+      if FoldHi > Ord('Z') then
+        FoldHi := Ord('Z');
+      if FoldLo <= FoldHi then
+        AppendRange(FoldLo + 32, FoldHi + 32);
+      // Portion of the range inside 'a'..'z' -> add the upper-case letters.
+      FoldLo := DynRanges[I].Lo;
+      if FoldLo < Ord('a') then
+        FoldLo := Ord('a');
+      FoldHi := DynRanges[I].Hi;
+      if FoldHi > Ord('z') then
+        FoldHi := Ord('z');
+      if FoldLo <= FoldHi then
+        AppendRange(FoldLo - 32, FoldHi - 32);
+    end;
+  end;
+  ClassIdx := AddCharClassFromDynamic(DynRanges);
+  if ANegated then
+    Op := RX_CHAR_CLASS_NEG
+  else
+    Op := RX_CHAR_CLASS;
+  Emit(EncodeOpBx(Op, ClassIdx));
+end;
+
+// Reads the next pattern character and advances past it.
+// In Unicode mode a multi-byte UTF-8 sequence is decoded into one code point;
+// otherwise (or for a single-byte character) the ordinal of the next Char as
+// returned by Advance is used.
+function TRegExpCompiler.ReadCodePoint: Cardinal;
+var
+  ByteLen: Integer;
+begin
+  if FUnicode and (FPos <= Length(FPattern)) then
+  begin
+    if TryReadUTF8CodePoint(FPattern, FPos, Result, ByteLen) and (ByteLen > 1) then
+    begin
+      Inc(FPos, ByteLen);
+      Exit;
+    end;
+  end;
+  Result := Ord(Advance);
+end;
+
+// Collects the characters up to the closing '>' of a group name, consumes the
+// '>' and returns the name.
+// Raises EConvertError when the pattern ends before '>' or when the name is
+// empty (a GroupName needs at least one character, so "(?<>x)" and "\k<>"
+// are syntax errors).
+function TRegExpCompiler.ParseGroupName: string;
+var
+  C: Char;
+begin
+  Result := '';
+  while not AtEnd do
+  begin
+    C := Peek;
+    if C = '>' then
+    begin
+      Inc(FPos);
+      if Result = '' then
+        raise EConvertError.Create('Invalid capture group name');
+      Exit;
+    end;
+    Result := Result + Advance;
+  end;
+  raise EConvertError.Create('Unterminated group name');
+end;
+
+// Parses exactly ADigits hexadecimal digits and returns their value.
+// Raises EConvertError on a non-hex character or premature end of pattern.
+function TRegExpCompiler.ParseHexEscape(ADigits: Integer): Cardinal;
+var
+  I: Integer;
+  C: Char;
+begin
+  Result := 0;
+  for I := 1 to ADigits do
+  begin
+    if AtEnd then
+      raise EConvertError.Create('Invalid hex escape');
+    C := Advance;
+    case C of
+      '0'..'9': Result := Result * 16 + Cardinal(Ord(C) - Ord('0'));
+      'a'..'f': Result := Result * 16 + Cardinal(Ord(C) - Ord('a') + 10);
+      'A'..'F': Result := Result * 16 + Cardinal(Ord(C) - Ord('A') + 10);
+    else
+      raise EConvertError.Create('Invalid hex escape');
+    end;
+  end;
+end;
+
+function TRegExpCompiler.ParseUnicodeEscape: Cardinal;
+var
+  HighSurrogate: Cardinal;
+begin
+  if Match('{') then
+  begin
+    Result := 0;
+    while not AtEnd and (Peek <> '}') do
+    begin
+      case Peek of
+        '0'..'9': Result := Result * 16 + Cardinal(Ord(Advance) -
Ord('0'));
+        'a'..'f': Result := Result * 16 + Cardinal(Ord(Advance) - Ord('a') + 10);
+        'A'..'F': Result := Result * 16 + Cardinal(Ord(Advance) - Ord('A') + 10);
+      else
+        raise EConvertError.Create('Invalid Unicode escape');
+      end;
+      // Check after every digit: the accumulator is a 32-bit Cardinal, so a
+      // long digit run such as \u{100000041} would otherwise wrap around and
+      // slip past the range check below. Values are at most $10FFFF before
+      // the next multiply, so Result * 16 + 15 cannot overflow.
+      if Result > $10FFFF then
+        raise EConvertError.Create('Unicode escape out of range');
+    end;
+    if not Match('}') then
+      raise EConvertError.Create('Unterminated Unicode escape');
+    if Result > $10FFFF then
+      raise EConvertError.Create('Unicode escape out of range');
+    Exit;
+  end;
+  // \uXXXX form. A high surrogate immediately followed by \uXXXX holding a
+  // low surrogate is combined into one supplementary code point.
+  Result := ParseHexEscape(4);
+  if (Result >= $D800) and (Result <= $DBFF) then
+  begin
+    HighSurrogate := Result;
+    if (Peek = '\') and (PeekAt(1) = 'u') then
+    begin
+      Inc(FPos, 2);
+      Result := ParseHexEscape(4);
+      if (Result >= $DC00) and (Result <= $DFFF) then
+      begin
+        Result := $10000 + ((HighSurrogate - $D800) shl 10) +
+          (Result - $DC00);
+        Exit;
+      end;
+      // Not a low surrogate: rewind over "\u" + 4 hex digits so the second
+      // escape is compiled on its own.
+      Dec(FPos, 6);
+    end;
+    Result := HighSurrogate;
+  end;
+end;
+
+// Reads a run of decimal digits (possibly empty) and returns its value;
+// returns 0 when the next character is not a digit. Used for quantifier
+// bounds by CompileQuantifier.
+function TRegExpCompiler.ParseDecimalEscape: Integer;
+var
+  C: Char;
+begin
+  Result := 0;
+  while not AtEnd do
+  begin
+    C := Peek;
+    if (C < '0') or (C > '9') then
+      Break;
+    Result := Result * 10 + (Ord(Advance) - Ord('0'));
+  end;
+end;
+
+const
+  // ORed into the operand of RX_BACKREF by EmitDuplicateNamedBackref.
+  // NOTE(review): per the commit message this marks a "strict" backreference
+  // that fails when its group did not capture - confirm against the VM.
+  BACKREF_STRICT_FLAG = $800000;
+
+// Emits the code for \k<AName> when AName is carried by several groups.
+// For every group with that name it emits
+//   SPLIT next; BACKREF(group, strict); JUMP done
+// and closes the chain with RX_FAIL, so the alternatives are tried in order
+// and the whole backreference fails if none of them matches. With exactly one
+// group of that name a plain RX_BACKREF is emitted instead.
+procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string);
+var
+  Indices: array of Integer;
+  Count, I: Integer;
+  SplitHole: Integer;
+  JumpHoles: array of Integer;
+  JumpCount: Integer;
+begin
+  // Collect the capture indices of all groups called AName.
+  Count := 0;
+  SetLength(Indices, Length(FNamedGroups));
+  for I := 0 to High(FNamedGroups) do
+    if FNamedGroups[I].Name = AName then
+    begin
+      Indices[Count] := FNamedGroups[I].Index;
+      Inc(Count);
+    end;
+  SetLength(Indices, Count);
+  if Count = 1 then
+  begin
+    Emit(EncodeOpBx(RX_BACKREF, Indices[0]));
+    Exit;
+  end;
+  JumpCount := 0;
+  SetLength(JumpHoles, Count + 1);
+  for I := 0 to Count - 1 do
+  begin
+    SplitHole := CurrentPC;
+    Emit(EncodeOpBx(RX_SPLIT, 0));
+    Emit(EncodeOpBx(RX_BACKREF, Indices[I] or BACKREF_STRICT_FLAG));
+    // Placeholder for the jump to the common exit; filled in below.
+    JumpHoles[JumpCount] := CurrentPC;
+    Inc(JumpCount);
+    Emit(0);
+    PatchHole(SplitHole, CurrentPC);
+  end;
+  Emit(EncodeOp(RX_FAIL));
+  for I := 0 to JumpCount - 1 do
+    FCode[JumpHoles[I]] := EncodeOpBx(RX_JUMP, CurrentPC);
+end;
+
+// Compiles an escape sequence that appears as an atom (outside a character
+// class). The caller has already consumed the backslash.
+procedure TRegExpCompiler.CompileEscapeAtom;
+var
+  C: Char;
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+  PropertyName: string;
+  Negated: Boolean;
+  GroupName: string;
+  BackrefIdx, I, GroupCount: Integer;
+  CodePoint: Cardinal;
+begin
+  C := Advance;
+  case C of
+    'd', 'D', 'w', 'W', 's', 'S':
+      begin
+        RangeCount := 0;
+        AddBuiltinCharClass(C, Ranges, RangeCount);
+        EmitCharClassRanges(Ranges, RangeCount, False);
+      end;
+    'b':
+      Emit(EncodeOpBx(RX_ASSERT_WORD, 0));
+    'B':
+      Emit(EncodeOpBx(RX_ASSERT_WORD, 1));
+    'p', 'P':
+      begin
+        // Property escapes are only recognised in Unicode mode; otherwise
+        // \p and \P match the literal letter.
+        if FUnicode and Match('{') then
+        begin
+          Negated := C = 'P';
+          PropertyName := '';
+          while not AtEnd and (Peek <> '}') do
+            PropertyName := PropertyName + Advance;
+          if not Match('}') then
+            raise EConvertError.Create('Unterminated Unicode property escape');
+          EmitUnicodePropertyClass(PropertyName, Negated);
+        end
+        else
+          EmitCharMatch(Ord(C));
+      end;
+    'k':
+      begin
+        if Match('<') then
+        begin
+          // Named backreference: resolve against the groups found by the
+          // pre-scan, so forward references work.
+          GroupName := ParseGroupName;
+          BackrefIdx := -1;
+          GroupCount := 0;
+          for I := 0 to High(FNamedGroups) do
+            if FNamedGroups[I].Name = GroupName then
+            begin
+              if BackrefIdx < 0 then
+                BackrefIdx := FNamedGroups[I].Index;
+              Inc(GroupCount);
+            end;
+          if BackrefIdx < 0 then
+            raise EConvertError.Create(
+              'Invalid named backreference: ' + GroupName);
+          if GroupCount <= 1 then
+            Emit(EncodeOpBx(RX_BACKREF, BackrefIdx))
+          else
+            EmitDuplicateNamedBackref(GroupName);
+        end
+        else
+          EmitCharMatch(Ord('k'));
+      end;
+    '1'..'9':
+      begin
+        // Numeric backreference: consume all following decimal digits.
+        BackrefIdx := Ord(C) - Ord('0');
+        while not AtEnd and (Peek >= '0') and (Peek <= '9') do
+          BackrefIdx := BackrefIdx * 10 + (Ord(Advance) - Ord('0'));
+        Emit(EncodeOpBx(RX_BACKREF, BackrefIdx));
+      end;
+    'n': EmitCharMatch($0A);
+    'r': EmitCharMatch($0D);
+    't': EmitCharMatch($09);
+    'v': EmitCharMatch($0B);
+    'f': EmitCharMatch($0C);
+    '0':
+
begin
+        if not AtEnd and (Peek >= '0') and (Peek <= '9') then
+          EmitCharMatch(Ord(C))
+        else
+          EmitCharMatch(0);
+      end;
+    'x': EmitCharMatch(ParseHexEscape(2));
+    'u': EmitCharMatch(ParseUnicodeEscape);
+    'c':
+      begin
+        if not AtEnd and (((Peek >= 'a') and (Peek <= 'z')) or
+          ((Peek >= 'A') and (Peek <= 'Z'))) then
+          EmitCharMatch(Ord(Advance) mod 32)
+        else
+          EmitCharMatch(Ord('c'));
+      end;
+  else
+    EmitCharMatch(Ord(C));
+  end;
+end;
+
+// Compiles an escape sequence inside a character class by appending the
+// ranges it denotes to ARanges / ARangeCount. The caller has already consumed
+// the backslash. AInCharClass is not consulted by this routine.
+//
+// \P{Name} (Unicode mode) appends the complement of the property's ranges
+// over 0..$10FFFF, because the caller only sees a flat range list and has no
+// way to negate a single member of the class afterwards.
+procedure TRegExpCompiler.CompileEscape(AInCharClass: Boolean;
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+var
+  C: Char;
+  PropertyName: string;
+  CodePoint, NextLo: Cardinal;
+  PropRanges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  PropCount, I, J: Integer;
+  Tmp: TRegExpCharRange;
+begin
+  C := Advance;
+  case C of
+    'd', 'D', 'w', 'W', 's', 'S':
+      AddBuiltinCharClass(C, ARanges, ARangeCount);
+    'n': AddRange(ARanges, ARangeCount, $0A, $0A);
+    'r': AddRange(ARanges, ARangeCount, $0D, $0D);
+    't': AddRange(ARanges, ARangeCount, $09, $09);
+    'v': AddRange(ARanges, ARangeCount, $0B, $0B);
+    'f': AddRange(ARanges, ARangeCount, $0C, $0C);
+    '0':
+      begin
+        if not AtEnd and (Peek >= '0') and (Peek <= '9') then
+          AddRange(ARanges, ARangeCount, Ord(C), Ord(C))
+        else
+          AddRange(ARanges, ARangeCount, 0, 0);
+      end;
+    'x':
+      begin
+        CodePoint := ParseHexEscape(2);
+        AddRange(ARanges, ARangeCount, CodePoint, CodePoint);
+      end;
+    'u':
+      begin
+        CodePoint := ParseUnicodeEscape;
+        AddRange(ARanges, ARangeCount, CodePoint, CodePoint);
+      end;
+    'p', 'P':
+      begin
+        if FUnicode and Match('{') then
+        begin
+          PropertyName := '';
+          while not AtEnd and (Peek <> '}') do
+            PropertyName := PropertyName + Advance;
+          if not Match('}') then
+            raise EConvertError.Create('Unterminated Unicode property escape');
+          if C = 'p' then
+            GetUnicodePropertyRanges(PropertyName, ARanges, ARangeCount)
+          else
+          begin
+            // \P{...}: resolve the property into a scratch list first.
+            PropCount := 0;
+            GetUnicodePropertyRanges(PropertyName, PropRanges, PropCount);
+            // Insertion sort by lower bound so the gaps can be walked in
+            // order (the lists are short, so O(n^2) is fine).
+            for I := 1 to PropCount - 1 do
+            begin
+              Tmp := PropRanges[I];
+              J := I - 1;
+              while (J >= 0) and (PropRanges[J].Lo > Tmp.Lo) do
+              begin
+                PropRanges[J + 1] := PropRanges[J];
+                Dec(J);
+              end;
+              PropRanges[J + 1] := Tmp;
+            end;
+            // Emit every gap between consecutive ranges. NextLo is the first
+            // code point not yet known to be covered by the property.
+            NextLo := 0;
+            for I := 0 to PropCount - 1 do
+            begin
+              if PropRanges[I].Lo > NextLo then
+                AddRange(ARanges, ARangeCount, NextLo, PropRanges[I].Lo - 1);
+              if PropRanges[I].Hi >= NextLo then
+                NextLo := PropRanges[I].Hi + 1;
+            end;
+            if NextLo <= $10FFFF then
+              AddRange(ARanges, ARangeCount, NextLo, $10FFFF);
+          end;
+        end
+        else
+          AddRange(ARanges, ARangeCount, Ord(C), Ord(C));
+      end;
+    'b':
+      // Inside a class \b is the backspace character, not a word boundary.
+      AddRange(ARanges, ARangeCount, $08, $08);
+  else
+    AddRange(ARanges, ARangeCount, Ord(C), Ord(C));
+  end;
+end;
+
+// Compiles a character class; the caller has already consumed the '['.
+// Handles an optional leading '^', single characters, escapes and a-b ranges,
+// then emits the class through EmitCharClassRanges.
+//
+// Either end of a range may be written as an escape that denotes a single
+// code point ([\x41-\x5A], [\u4e00-\u9fa5]). If the escape after '-' is a
+// class escape such as \d, the start character and the '-' are taken
+// literally and the class escape is kept as a separate member.
+// Raises EConvertError for a range whose start is greater than its end and
+// for a missing ']'.
+procedure TRegExpCompiler.CompileCharacterClass;
+var
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+  Negated: Boolean;
+  Lo, Hi: Cardinal;
+  SavePos: Integer;
+
+  // True when the escape compiled since the list had ASince entries appended
+  // exactly one range that covers exactly one code point.
+  function EscapeWasSingleCodePoint(ASince: Integer): Boolean;
+  begin
+    Result := (RangeCount = ASince + 1) and
+      (Ranges[ASince].Lo = Ranges[ASince].Hi);
+  end;
+
+  // Adds ALo..AHi after checking the bounds are in order.
+  procedure AddCheckedRange(ALo, AHi: Cardinal);
+  begin
+    if ALo > AHi then
+      raise EConvertError.Create('Range out of order in character class');
+    AddRange(Ranges, RangeCount, ALo, AHi);
+  end;
+
+begin
+  Negated := Match('^');
+  RangeCount := 0;
+  while not AtEnd and (Peek <> ']') do
+  begin
+    if Peek = '\' then
+    begin
+      SavePos := RangeCount;
+      Inc(FPos);
+      CompileEscape(True, Ranges, RangeCount);
+      // A class escape, or an escape not followed by "-x", is complete.
+      if (not EscapeWasSingleCodePoint(SavePos)) or AtEnd or
+        (Peek <> '-') or (PeekAt(1) = ']') then
+        Continue;
+      // The escape starts a range: take it back out of the list and use its
+      // code point as the lower bound.
+      Lo := Ranges[SavePos].Lo;
+      RangeCount := SavePos;
+    end
+    else
+    begin
+      Lo := ReadCodePoint;
+      if AtEnd or (Peek <> '-') or (PeekAt(1) = ']') then
+      begin
+        AddRange(Ranges, RangeCount, Lo, Lo);
+        Continue;
+      end;
+    end;
+    // Lo is the start of a range and the next character is '-'.
+    Inc(FPos);
+    if Peek = '\' then
+    begin
+      SavePos := RangeCount;
+      Inc(FPos);
+      CompileEscape(True, Ranges, RangeCount);
+      if EscapeWasSingleCodePoint(SavePos) then
+      begin
+        Hi := Ranges[SavePos].Lo;
+        RangeCount := SavePos;
+        AddCheckedRange(Lo, Hi);
+      end
+      else
+      begin
+        // A class escape cannot end a range: keep its ranges and treat the
+        // start character and the '-' as literals.
+        AddRange(Ranges, RangeCount, Lo, Lo);
+        AddRange(Ranges, RangeCount, Ord('-'), Ord('-'));
+      end;
+    end
+    else
+    begin
+      Hi := ReadCodePoint;
+      AddCheckedRange(Lo, Hi);
+    end;
+  end;
+  if not Match(']') then
+    raise EConvertError.Create('Unterminated character class');
+  EmitCharClassRanges(Ranges, RangeCount, Negated);
+end;
+
+procedure TRegExpCompiler.CompileModifierGroup;
+var
+  C: Char;
+  EnableFlags, DisableFlags: string;
+  InDisable: Boolean;
+  SavedModifier: TModifierState;
+begin
+  EnableFlags := '';
+  DisableFlags := '';
+  InDisable := False;
+  while not AtEnd and (Peek <> ':') and (Peek <> ')') do
+  begin
+    C := Advance;
+    if C = '-' then
+    begin
+      if InDisable then
+        raise EConvertError.Create(
+          'Invalid regular expression: unexpected - in modifier group');
+      InDisable := True;
+      Continue;
+    end;
+    if not CharInSet(C, ['i', 'm', 's']) then
+      raise EConvertError.CreateFmt(
+        'Invalid regular expression: ''%s'' is not a valid modifier flag', [C]);
+    if InDisable then
+    begin
+      if Pos(C, DisableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular expression: duplicate modifier flag ''%s''', [C]);
+      if Pos(C, EnableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular
expression: ''%s'' in both enable and disable', [C]);
+      DisableFlags := DisableFlags + C;
+    end
+    else
+    begin
+      if Pos(C, EnableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular expression: duplicate modifier flag ''%s''', [C]);
+      EnableFlags := EnableFlags + C;
+    end;
+  end;
+  if (EnableFlags = '') and (DisableFlags = '') then
+    raise EConvertError.Create(
+      'Invalid regular expression: modifier group must enable or disable at least one flag');
+  if not Match(':') then
+    raise EConvertError.Create(
+      'Invalid regular expression: modifier group must use (?flags:...) syntax');
+  // Apply the flags for the body only: the previous modifier state is
+  // restored once the closing ')' has been consumed.
+  SavedModifier := FModifier;
+  if Pos('i', EnableFlags) > 0 then FModifier.IgnoreCase := True;
+  if Pos('m', EnableFlags) > 0 then FModifier.Multiline := True;
+  if Pos('s', EnableFlags) > 0 then FModifier.DotAll := True;
+  if Pos('i', DisableFlags) > 0 then FModifier.IgnoreCase := False;
+  if Pos('m', DisableFlags) > 0 then FModifier.Multiline := False;
+  if Pos('s', DisableFlags) > 0 then FModifier.DotAll := False;
+  CompileDisjunction;
+  if not Match(')') then
+    raise EConvertError.Create('Unterminated modifier group');
+  FModifier := SavedModifier;
+end;
+
+// Compiles a parenthesised group; the caller has already consumed the '('.
+// Dispatches on what follows:
+//   (?:...)            non-capturing group
+//   (?=...) (?!...)    lookahead / negative lookahead
+//   (?<=...) (?<!...)  lookbehind / negative lookbehind
+//   (?<name>...)       named capturing group
+//   (?ims-ims:...)     modifier group (see CompileModifierGroup)
+//   (...)              numbered capturing group
+// Raises EConvertError when the closing ')' is missing or the "(?" prefix is
+// not one of the forms above.
+procedure TRegExpCompiler.CompileGroup;
+var
+  SaveAltDepth: Integer;
+  GroupName: string;
+  CaptureIdx, I: Integer;
+  SplitHole, JumpHole: Integer;
+  LookStart: Integer;
+  IsNegative: Boolean;
+begin
+  // Open a fresh alternative counter for this nesting level; it is
+  // incremented by CompileDisjunction for every '|' seen at this depth.
+  Inc(FAltStackDepth);
+  if FAltStackDepth >= Length(FAltStack) then
+    SetLength(FAltStack, FAltStackDepth * 2 + 4);
+  FAltStack[FAltStackDepth] := 0;
+  if Match('?') then
+  begin
+    if Match(':') then
+    begin
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated non-capturing group');
+    end
+    else if Match('=') then
+    begin
+      // Lookarounds are laid out as: LOOK <after>; body; MATCH; <after>:
+      // The operand is the index of the first instruction after the
+      // sub-program and is only known once the body has been compiled.
+      SplitHole := EmitHole;
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0);
+      LookStart := CurrentPC;
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated lookahead');
+      Emit(EncodeOp(RX_MATCH));
+      PatchHole(SplitHole, CurrentPC);
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC);
+    end
+    else if Match('!') then
+    begin
+      SplitHole := EmitHole;
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0);
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated negative lookahead');
+      Emit(EncodeOp(RX_MATCH));
+      PatchHole(SplitHole, CurrentPC);
+      // $80 is ORed into the instruction word to mark the negative form.
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC) or $80;
+    end
+    else if Match('<') then
+    begin
+      if Match('=') then
+      begin
+        SplitHole := EmitHole;
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0);
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated lookbehind');
+        Emit(EncodeOp(RX_MATCH));
+        PatchHole(SplitHole, CurrentPC);
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, CurrentPC);
+      end
+      else if Match('!') then
+      begin
+        SplitHole := EmitHole;
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0);
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated negative lookbehind');
+        Emit(EncodeOp(RX_MATCH));
+        PatchHole(SplitHole, CurrentPC);
+        // Same $80 negative marker as for negative lookahead.
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, CurrentPC) or $80;
+      end
+      else
+      begin
+        // Named capture. The name is only consumed here; the name -> index
+        // table was built beforehand by PreScanNamedGroups.
+        GroupName := ParseGroupName;
+        Inc(FCaptureCount);
+        CaptureIdx := FCaptureCount;
+        // Capture N uses save slots 2N (start) and 2N + 1 (end).
+        Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2));
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated named capture group');
+        Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1));
+      end;
+    end
+    else if CharInSet(Peek, ['i', 'm', 's', '-']) then
+    begin
+      CompileModifierGroup;
+    end
+    else
+      raise EConvertError.Create('Invalid group syntax');
+  end
+  else
+  begin
+    // Plain capturing group.
+    Inc(FCaptureCount);
+    CaptureIdx := FCaptureCount;
+    Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2));
+    CompileDisjunction;
+    if not Match(')') then
+      raise EConvertError.Create('Unterminated capturing group');
+    Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1));
+  end;
+  if FAltStackDepth > 0 then
+    Dec(FAltStackDepth);
+end;
+
+procedure
TRegExpCompiler.CompileAtom;
+// Compiles one atom: a group, a character class, '.', '^', '$', an escape or
+// a literal character. Flag-dependent instructions ('.', '^', '$') take the
+// modifier state that is current at this point of the pattern.
+// Raises EConvertError for a quantifier that has nothing to repeat.
+var
+  C: Char;
+  CodePoint: Cardinal;
+begin
+  C := Peek;
+  case C of
+    '(':
+      begin
+        Inc(FPos);
+        CompileGroup;
+      end;
+    '[':
+      begin
+        Inc(FPos);
+        CompileCharacterClass;
+      end;
+    '.':
+      begin
+        Inc(FPos);
+        if FModifier.DotAll then
+          Emit(EncodeOpBx(RX_ANY, 1))
+        else
+          Emit(EncodeOpBx(RX_ANY, 0));
+      end;
+    '^':
+      begin
+        Inc(FPos);
+        if FModifier.Multiline then
+          Emit(EncodeOpBx(RX_ASSERT_START, 1))
+        else
+          Emit(EncodeOpBx(RX_ASSERT_START, 0));
+      end;
+    '$':
+      begin
+        Inc(FPos);
+        if FModifier.Multiline then
+          Emit(EncodeOpBx(RX_ASSERT_END, 1))
+        else
+          Emit(EncodeOpBx(RX_ASSERT_END, 0));
+      end;
+    '\':
+      begin
+        Inc(FPos);
+        CompileEscapeAtom;
+      end;
+    '*', '+', '?':
+      // CompileQuantifier consumes the quantifier that follows an atom, so
+      // one showing up where an atom is expected (start of an alternative,
+      // or after another quantifier as in "a**") has nothing to repeat.
+      raise EConvertError.Create('Nothing to repeat');
+  else
+    begin
+      if FUnicode then
+      begin
+        CodePoint := ReadCodePoint;
+        EmitCharMatch(CodePoint);
+      end
+      else
+      begin
+        Inc(FPos);
+        EmitCharMatch(Ord(C));
+      end;
+    end;
+  end;
+end;
+
+// Grows FCode so that ANeeded more instructions fit after FCodeLen.
+procedure TRegExpCompiler.EnsureCodeCapacity(ANeeded: Integer);
+begin
+  if FCodeLen + ANeeded >= Length(FCode) then
+    SetLength(FCode, (FCodeLen + ANeeded) * 2 + 16);
+end;
+
+// Appends the first ALen instructions of ABody to the program verbatim.
+// Branch targets inside the copy are NOT adjusted here; callers that place
+// the copy at a different offset must rebase them (see CompileQuantifier).
+procedure TRegExpCompiler.EmitBody(const ABody: array of UInt32; ALen: Integer);
+begin
+  // Nothing to copy; also avoids touching ABody[0] of an empty array.
+  if ALen <= 0 then
+    Exit;
+  EnsureCodeCapacity(ALen);
+  Move(ABody[0], FCode[FCodeLen], ALen * SizeOf(UInt32));
+  Inc(FCodeLen, ALen);
+end;
+
+// Parses an optional quantifier (*, +, ?, {n}, {n,}, {n,m}, each optionally
+// followed by '?' for lazy matching) and rewrites the atom that was emitted
+// at AAtomStart..CurrentPC-1 into the repeated form:
+//   - MinCount mandatory copies of the atom,
+//   - then either a SPLIT/body/JUMP loop (unbounded) or MaxCount - MinCount
+//     optional copies, each guarded by its own SPLIT.
+// A '{' that does not form a valid quantifier is left unconsumed so it is
+// compiled as a literal. Raises EConvertError when the bounds are reversed.
+//
+// Branch operands are absolute instruction indices, so every copy that ends
+// up at a different offset than the original atom has the targets of its
+// SPLIT / JUMP / lookaround instructions shifted by that offset.
+procedure TRegExpCompiler.CompileQuantifier(AAtomStart: Integer);
+var
+  SplitPC: Integer;
+  MinCount, MaxCount, I: Integer;
+  Lazy: Boolean;
+  C: Char;
+  BodyLen: Integer;
+  BodyCode: array of UInt32;
+  SavePos: Integer;
+
+  // True for instructions whose operand is an absolute code index. The
+  // opcode sits in the low byte; lookarounds may additionally carry the $80
+  // "negative" marker in that byte, so it is masked off for them.
+  function HasCodeTarget(AInstr: UInt32): Boolean;
+  var
+    RawOp: UInt32;
+  begin
+    RawOp := AInstr and $FF;
+    Result := (RawOp = UInt32(Ord(RX_SPLIT))) or
+      (RawOp = UInt32(Ord(RX_SPLIT_LAZY))) or
+      (RawOp = UInt32(Ord(RX_JUMP))) or
+      ((RawOp and $7F) = UInt32(Ord(RX_LOOKAHEAD))) or
+      ((RawOp and $7F) = UInt32(Ord(RX_LOOKBEHIND)));
+  end;
+
+  // Appends one copy of the saved atom and rebases its internal branch
+  // targets from the atom's original position to the copy's position.
+  procedure EmitBodyCopy;
+  var
+    CopyStart, Delta, K: Integer;
+    Target: UInt32;
+  begin
+    CopyStart := CurrentPC;
+    Delta := CopyStart - AAtomStart;
+    EmitBody(BodyCode, BodyLen);
+    if Delta = 0 then
+      Exit;
+    for K := CopyStart to CopyStart + BodyLen - 1 do
+      if HasCodeTarget(FCode[K]) then
+      begin
+        Target := (FCode[K] shr 8) + UInt32(Delta);
+        FCode[K] := (FCode[K] and $FF) or (Target shl 8);
+      end;
+  end;
+
+begin
+  if AtEnd then
+    Exit;
+  C := Peek;
+  MinCount := -1;
+  MaxCount := -1;
+  SavePos := FPos;
+  case C of
+    '*': begin MinCount := 0; MaxCount := -1; Inc(FPos); end;
+    '+': begin MinCount := 1; MaxCount := -1; Inc(FPos); end;
+    '?': begin MinCount := 0; MaxCount := 1; Inc(FPos); end;
+    '{':
+      begin
+        Inc(FPos);
+        if AtEnd or not CharInSet(Peek, ['0'..'9']) then
+        begin
+          Dec(FPos);
+          Exit;
+        end;
+        MinCount := ParseDecimalEscape;
+        if Match(',') then
+        begin
+          if Peek = '}' then
+            MaxCount := -1
+          else
+            MaxCount := ParseDecimalEscape;
+        end
+        else
+          MaxCount := MinCount;
+        if not Match('}') then
+        begin
+          // Not a quantifier after all: rewind so '{' is taken literally.
+          FPos := SavePos;
+          Exit;
+        end;
+        // {n,m} with m < n is an early error.
+        if (MaxCount <> -1) and (MaxCount < MinCount) then
+          raise EConvertError.Create('numbers out of order in {} quantifier');
+      end;
+  else
+    Exit;
+  end;
+  Lazy := Match('?');
+  BodyLen := CurrentPC - AAtomStart;
+  if BodyLen = 0 then
+    Exit;
+  // Lift the atom out of the program, then re-emit it as often as needed.
+  SetLength(BodyCode, BodyLen);
+  Move(FCode[AAtomStart], BodyCode[0], BodyLen * SizeOf(UInt32));
+  FCodeLen := AAtomStart;
+  for I := 1 to MinCount do
+    EmitBodyCopy;
+  if MaxCount = -1 then
+  begin
+    // Unbounded tail:  L: SPLIT exit; body; JUMP L; exit:
+    SplitPC := CurrentPC;
+    if Lazy then
+      Emit(EncodeOpBx(RX_SPLIT_LAZY, 0))
+    else
+      Emit(EncodeOpBx(RX_SPLIT, 0));
+    EmitBodyCopy;
+    Emit(EncodeOpBx(RX_JUMP, SplitPC));
+    PatchHole(SplitPC, CurrentPC);
+  end
+  else
+  begin
+    // Bounded tail: one optional copy per remaining repetition.
+    for I := MinCount + 1 to MaxCount do
+    begin
+      SplitPC := CurrentPC;
+      if Lazy then
+        Emit(EncodeOpBx(RX_SPLIT_LAZY, 0))
+      else
+        Emit(EncodeOpBx(RX_SPLIT, 0));
+      EmitBodyCopy;
+      PatchHole(SplitPC, CurrentPC);
+    end;
+  end;
+end;
+
+// Compiles one term: an atom followed by an optional quantifier.
+procedure TRegExpCompiler.CompileTerm;
+var
+  AtomStart: Integer;
+begin
+  AtomStart := CurrentPC;
+  CompileAtom;
+  CompileQuantifier(AtomStart);
+end;
+
+// Compiles terms until the end of the pattern, a '|' or a ')'.
+procedure TRegExpCompiler.CompileAlternative;
+begin
+  while not AtEnd and (Peek <> '|') and (Peek <> ')') do
+    CompileTerm;
+end;
+
+// Inserts an RX_SPLIT (operand 0, to be patched by the caller) at index APos,
+// shifting all following instructions up by one. Because operands are
+// absolute indices, every branch in the shifted region that points at or
+// beyond APos is bumped by one as well.
+//
+// Lookaround instructions are recognised with the $80 negative marker masked
+// off, and the whole low byte (opcode plus marker) is preserved when the
+// operand is rewritten, so negative lookarounds are retargeted correctly.
+procedure TRegExpCompiler.InsertSplitAt(APos: Integer);
+var
+  I, Bx: Integer;
+  RawOp: UInt32;
+  IsBranch: Boolean;
+begin
+  EnsureCodeCapacity(1);
+  Move(FCode[APos], FCode[APos + 1], (FCodeLen - APos) * SizeOf(UInt32));
+  FCode[APos] := EncodeOpBx(RX_SPLIT, 0);
+  Inc(FCodeLen);
+  for I := APos + 1 to FCodeLen - 1 do
+  begin
+    RawOp := FCode[I] and $FF;
+    IsBranch := (RawOp = UInt32(Ord(RX_SPLIT))) or
+      (RawOp = UInt32(Ord(RX_SPLIT_LAZY))) or
+      (RawOp = UInt32(Ord(RX_JUMP))) or
+      ((RawOp and $7F) = UInt32(Ord(RX_LOOKAHEAD))) or
+      ((RawOp and $7F) = UInt32(Ord(RX_LOOKBEHIND)));
+    if not IsBranch then
+      Continue;
+    Bx := Integer(FCode[I] shr 8);
+    if Bx >= APos then
+      FCode[I] := (FCode[I] and $FF) or (UInt32(Bx + 1) shl 8);
+  end;
+end;
+
+// Compiles alt1 | alt2 | ... into
+//   SPLIT L2; alt1; JUMP end; L2: SPLIT L3; alt2; JUMP end; ... altN; end:
+// The SPLIT in front of an alternative is only inserted once the following
+// '|' is seen, which shifts that alternative up by one instruction.
+// Returns at the end of the pattern or in front of a ')'.
+procedure TRegExpCompiler.CompileDisjunction;
+var
+  StartPC: Integer;
+  JumpHoles: array of Integer;
+  JumpCount, I: Integer;
+begin
+  JumpCount := 0;
+  SetLength(JumpHoles, 8);
+  StartPC := CurrentPC;
+  CompileAlternative;
+  while (not AtEnd) and (Peek = '|') do
+  begin
+    Inc(FPos);
+    // Count the alternative at the current nesting depth.
+    if FAltStackDepth < Length(FAltStack) then
+      Inc(FAltStack[FAltStackDepth]);
+    InsertSplitAt(StartPC);
+    // Update existing jump holes since they shifted by 1
+    for I := 0 to JumpCount - 1 do
+      if JumpHoles[I] >= StartPC then
+        Inc(JumpHoles[I]);
+    if JumpCount >= Length(JumpHoles) then
+      SetLength(JumpHoles, JumpCount * 2 + 8);
+    // Placeholder for "JUMP end" closing the alternative just compiled.
+    JumpHoles[JumpCount] := CurrentPC;
+    Inc(JumpCount);
+    Emit(0);
+    // The inserted SPLIT falls back to the next alternative.
+    PatchHole(StartPC, CurrentPC);
+    StartPC := CurrentPC;
+    CompileAlternative;
+  end;
+  for I := 0 to JumpCount - 1 do
+    FCode[JumpHoles[I]] := EncodeOpBx(RX_JUMP, CurrentPC);
+end;
+
+// Compiles the whole pattern, wrapped in the save instructions for capture 0
+// (slots 0 and 1) and terminated by RX_MATCH.
+// Raises EConvertError when input is left over after the top-level
+// disjunction: CompileDisjunction only stops early in front of a ')', and at
+// this level nothing opened it. Without the check "a)b" would silently be
+// compiled as "a".
+procedure TRegExpCompiler.CompilePattern;
+begin
+  Emit(EncodeOpBx(RX_SAVE, 0));
+  CompileDisjunction;
+  if not AtEnd then
+    raise EConvertError.Create('Unmatched '')''');
+  Emit(EncodeOpBx(RX_SAVE, 1));
+  Emit(EncodeOp(RX_MATCH));
+end;
+
+// Scans the raw pattern text once, before compilation, and records every
+// (?<name>...) group in FNamedGroups with its capture index and its
+// "disjunction path" (the alternative counter of each enclosing nesting level
+// at the point where the group opens). This lets \k<name> refer to groups
+// that appear later in the pattern and feeds ValidateNamedGroups.
+// Escaped characters and the contents of character classes are skipped.
+procedure TRegExpCompiler.PreScanNamedGroups;
+var
+  I, GroupIndex, CloseAngle, J: Integer;
+  InCharClass: Boolean;
+  GroupName: string;
+  AltStack: array of Integer;
+  AltStackDepth: Integer;
+begin
+  SetLength(AltStack, 64);
+  AltStackDepth := 0;
+  AltStack[0] := 0;
+  I := 1;
+  GroupIndex := 0;
+  InCharClass := False;
+  while I <= Length(FPattern) do
+  begin
+    if FPattern[I] = '\' then
+    begin
+      // Skip the escaped character as well.
+      if I + 1 <= Length(FPattern) then
+        Inc(I, 2)
+      else
+        Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '[' then
+    begin
+      InCharClass := True;
+      Inc(I);
+      Continue;
+    end;
+    if (FPattern[I] = ']') and InCharClass then
+    begin
+      InCharClass := False;
+      Inc(I);
+      Continue;
+    end;
+    if InCharClass then
+    begin
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '|' then
+    begin
+      Inc(AltStack[AltStackDepth]);
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = ')' then
+    begin
+      if AltStackDepth > 0 then
+        Dec(AltStackDepth);
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '(' then
+    begin
+      Inc(AltStackDepth);
+      if AltStackDepth >= Length(AltStack) then
+        SetLength(AltStack, AltStackDepth * 2 + 4);
+      AltStack[AltStackDepth] := 0;
+      if (I + 1 <= Length(FPattern)) and (FPattern[I + 1] = '?') then
+      begin
+        if (I + 2 <= Length(FPattern)) and (FPattern[I + 2] = '<') then
+        begin
+          // (?<= and (?<! are lookbehinds, not named groups.
+          if (I + 3 <= Length(FPattern)) and
+            ((FPattern[I + 3] = '=') or (FPattern[I + 3] = '!')) then
+          begin
+            Inc(I, 3);
+            Continue;
+          end;
+          CloseAngle := I + 3;
+          while (CloseAngle <= Length(FPattern)) and
+            (FPattern[CloseAngle] <> '>') do
+            Inc(CloseAngle);
+          if CloseAngle <= Length(FPattern) then
+          begin
+            // Named groups are numbered together with unnamed ones.
+            Inc(GroupIndex);
+            GroupName := Copy(FPattern, I + 3, CloseAngle - I - 3);
+            SetLength(FNamedGroups, Length(FNamedGroups) + 1);
+            FNamedGroups[High(FNamedGroups)].Name := GroupName;
+            FNamedGroups[High(FNamedGroups)].Index := GroupIndex;
+            SetLength(FNamedGroups[High(FNamedGroups)].DisjunctionPath,
+              AltStackDepth + 1);
+            for J := 0 to AltStackDepth do
+              FNamedGroups[High(FNamedGroups)].DisjunctionPath[J] := AltStack[J];
+            I := CloseAngle + 1;
+            Continue;
+          end;
+        end;
+        // Any other (?... form does not capture.
+        Inc(I, 2);
+        Continue;
+      end;
+      Inc(GroupIndex);
+    end;
+    Inc(I);
+  end;
+end;
+
+// Rejects duplicate group names that could take part in the same match.
+// Two groups with the same name are accepted only if their disjunction paths
+// differ somewhere within their common prefix; otherwise EConvertError is
+// raised.
+procedure TRegExpCompiler.ValidateNamedGroups;
+var
+  K, L, MinLen, I: Integer;
+  ShareBranch: Boolean;
+begin
+  for K := 0 to High(FNamedGroups) - 1 do
+    for L := K + 1 to High(FNamedGroups) do
+      if FNamedGroups[K].Name = FNamedGroups[L].Name then
+      begin
+        MinLen := Length(FNamedGroups[K].DisjunctionPath);
+        if Length(FNamedGroups[L].DisjunctionPath) < MinLen then
+          MinLen := Length(FNamedGroups[L].DisjunctionPath);
+        ShareBranch := True;
+        for I := 0 to MinLen - 1 do
+          if FNamedGroups[K].DisjunctionPath[I] <>
+            FNamedGroups[L].DisjunctionPath[I] then
+          begin
+            ShareBranch := False;
+            Break;
+          end;
+        if ShareBranch then
+          raise EConvertError.CreateFmt(
+            'Duplicate named capture group: %s', [FNamedGroups[K].Name]);
+      end;
+end;
+
+function TRegExpCompiler.Compile: TRegExpProgram;
+begin
+
PreScanNamedGroups;
+  // Order matters: the pre-scan fills FNamedGroups, which both the
+  // duplicate-name validation and \k<name> resolution during compilation use.
+  ValidateNamedGroups;
+  CompilePattern;
+  // Trim the code buffer to the instructions actually emitted.
+  SetLength(FCode, FCodeLen);
+  Result.Code := FCode;
+  Result.CharClasses := FCharClasses;
+  Result.CaptureCount := FCaptureCount;
+  Result.NamedGroups := FNamedGroups;
+  Result.FlagIgnoreCase := HasRegExpFlag(FFlags, 'i');
+  Result.FlagMultiline := HasRegExpFlag(FFlags, 'm');
+  Result.FlagDotAll := HasRegExpFlag(FFlags, 's');
+  Result.FlagUnicode := FUnicode;
+end;
+
+// Compiles APattern with AFlags into a TRegExpProgram using a temporary
+// TRegExpCompiler, which is freed even when compilation raises.
+function CompileRegExp(const APattern, AFlags: string): TRegExpProgram;
+var
+  Compiler: TRegExpCompiler;
+begin
+  Compiler := TRegExpCompiler.Create(APattern, AFlags);
+  try
+    Result := Compiler.Compile;
+  finally
+    Compiler.Free;
+  end;
+end;
+
+// Validates flags and pattern without keeping the compiled program: the
+// flags are checked by ValidateRegExpFlags, then the pattern is compiled and
+// the result discarded, so any error surfaces as the exception raised by
+// those calls. The empty-regex source '(?:)' is accepted without compiling.
+procedure ValidateRegExpPatternNew(const APattern, AFlags: string);
+begin
+  ValidateRegExpFlags(AFlags);
+  if APattern = '(?:)' then
+    Exit;
+  CompileRegExp(APattern, AFlags);
+end;
+
+end.
diff --git a/source/units/Goccia.RegExp.Engine.pas b/source/units/Goccia.RegExp.Engine.pas index 68ba5645..1e7a2a49 100644 --- a/source/units/Goccia.RegExp.Engine.pas +++ b/source/units/Goccia.RegExp.Engine.pas @@ -38,19 +38,16 @@ function RegExpToString(const APattern, AFlags: string): string; function ExecuteRegExp(const APattern, AFlags, AInput: string; const AStartIndex: Integer; const ARequireStart: Boolean; out AResult: TGocciaRegExpMatchResult): Boolean; -function PreprocessRegExpPattern(const APattern: string; - out ANamedGroups: TGocciaRegExpNamedGroups): string; implementation uses - Math, SysUtils, - RegExpr, TextSemantics, - Goccia.RegExp.Unicode; + Goccia.RegExp.Compiler, + Goccia.RegExp.VM; const EMPTY_REGEX = '(?:)'; @@ -64,14 +61,6 @@ function NormalizeRegExpSource(const APattern: string): string; Result := APattern; end; -function GetExecutableRegExpPattern(const APattern: string): string; -begin - if APattern = EMPTY_REGEX then - Result := '' - else - Result := APattern; -end; - function HasRegExpFlag(const AFlags: string; const AFlag: Char): Boolean; begin Result := Pos(AFlag, AFlags) > 0;
@@ -91,325 +80,13 @@ procedure ValidateRegExpFlags(const AFlags: string); raise EConvertError.Create('Invalid regular expression flags'); Seen := Seen + AFlags[I]; end; - // ES2026 §22.2.2.1: u and v flags are mutually exclusive if HasRegExpFlag(AFlags, 'u') and HasRegExpFlag(AFlags, 'v') then raise EConvertError.Create('Invalid regular expression flags'); end; -// ES2025 §22.2.1 Static Semantics: Early Errors — RegExp Modifiers -// Validates inline modifier group syntax (?flags:...) and (?flags-flags:...). -// Only i, m, s are valid modifier flags. The colon form is required. -procedure ValidateModifierGroups(const APattern: string); -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - C: Char; - EnableFlags, DisableFlags: string; - InDisable: Boolean; -begin - PatternLength := Length(APattern); - I := 1; - InCharClass := False; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - Inc(I, 2) - else - Inc(I); - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Inc(I); - Continue; - end; - if InCharClass then - begin - Inc(I); - Continue; - end; - // ES2025: Check for modifier group prefix (?[ims-]...) - if (APattern[I] = '(') and (I + 2 <= PatternLength) and - (APattern[I + 1] = '?') and - CharInSet(APattern[I + 2], ['i', 'm', 's', '-']) then - begin - J := I + 2; - EnableFlags := ''; - DisableFlags := ''; - InDisable := False; - while J <= PatternLength do - begin - C := APattern[J]; - // ES2025 §22.2.1 step 4: colon terminates modifier prefix - if C = ':' then - Break; - if C = ')' then - raise EConvertError.Create( - 'Invalid regular expression: modifier group must use (?flags:...) 
syntax'); - if C = '-' then - begin - if InDisable then - raise EConvertError.Create( - 'Invalid regular expression: unexpected - in modifier group'); - InDisable := True; - Inc(J); - Continue; - end; - if not CharInSet(C, ['i', 'm', 's']) then - raise EConvertError.CreateFmt( - 'Invalid regular expression: ''%s'' is not a valid modifier flag', [C]); - if InDisable then - begin - if Pos(C, DisableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: duplicate modifier flag ''%s''', [C]); - if Pos(C, EnableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: ''%s'' in both enable and disable', [C]); - DisableFlags := DisableFlags + C; - end - else - begin - if Pos(C, EnableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: duplicate modifier flag ''%s''', [C]); - EnableFlags := EnableFlags + C; - end; - Inc(J); - end; - // ES2025 §22.2.1: Both add and remove lists empty is a SyntaxError - if (J <= PatternLength) and (APattern[J] = ':') and - (EnableFlags = '') and (DisableFlags = '') then - raise EConvertError.Create( - 'Invalid regular expression: modifier group must enable or disable at least one flag'); - end; - Inc(I); - end; -end; - -// ES2025 §22.2.1 RegExp Modifiers — Transforms inline modifier groups -// (?flags:...) and (?flags-flags:...) into TRegExpr-compatible syntax. -// For i and m modifiers: uses (?i)/(?-i)/(?m)/(?-m) toggles inside (?:...) -// groups (TRegExpr scopes these correctly to groups). -// For s modifier enable: replaces . with [\s\S] (because TRegExpr's (?s) -// leaks from groups). For s modifier disable: uses (?-s) toggle (TRegExpr -// scopes this correctly). 
-function PreprocessModifierGroups(const APattern: string): string; -type - TSModifierEntry = record - Depth: Integer; - PreviousSActive: Boolean; - end; -const - DOTALL_REPLACEMENT = '[\s\S]'; - INITIAL_STACK_SIZE = 32; -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - GroupDepth: Integer; - SStack: array of TSModifierEntry; - SStackTop: Integer; - CurrentSActive: Boolean; - C: Char; - EnableFlags, DisableFlags: string; - InDisable: Boolean; - Toggles: string; - NewSActive: Boolean; -begin - PatternLength := Length(APattern); - if PatternLength = 0 then - begin - Result := ''; - Exit; - end; - Result := ''; - I := 1; - InCharClass := False; - GroupDepth := 0; - CurrentSActive := False; - SStackTop := -1; - SetLength(SStack, INITIAL_STACK_SIZE); - while I <= PatternLength do - begin - // Handle escape sequences - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end - else - begin - Result := Result + APattern[I]; - Inc(I); - end; - Continue; - end; - // Handle character classes (copy as-is, no dot transformation) - if APattern[I] = '[' then - begin - InCharClass := True; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if InCharClass then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // ES2025: Transform . based on current s modifier state - if APattern[I] = '.' 
then - begin - if CurrentSActive then - Result := Result + DOTALL_REPLACEMENT - else - Result := Result + '.'; - Inc(I); - Continue; - end; - // Handle closing paren — pop s state if this closes a modifier group - if APattern[I] = ')' then - begin - if (SStackTop >= 0) and (SStack[SStackTop].Depth = GroupDepth) then - begin - CurrentSActive := SStack[SStackTop].PreviousSActive; - Dec(SStackTop); - end; - Dec(GroupDepth); - Result := Result + ')'; - Inc(I); - Continue; - end; - // Handle opening paren — check for modifier group prefix - if APattern[I] = '(' then - begin - Inc(GroupDepth); - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') and - (I + 2 <= PatternLength) and - CharInSet(APattern[I + 2], ['i', 'm', 's', '-']) then - begin - // Parse modifier flags up to ':' - J := I + 2; - EnableFlags := ''; - DisableFlags := ''; - InDisable := False; - while (J <= PatternLength) and (APattern[J] <> ':') and - (APattern[J] <> ')') do - begin - C := APattern[J]; - if C = '-' then - begin - InDisable := True; - Inc(J); - Continue; - end; - if CharInSet(C, ['i', 'm', 's']) then - begin - if InDisable then - DisableFlags := DisableFlags + C - else - EnableFlags := EnableFlags + C; - end; - Inc(J); - end; - if (J <= PatternLength) and (APattern[J] = ':') then - begin - // Valid modifier group — transform to TRegExpr-compatible syntax - // Build i/m toggles (TRegExpr scopes these correctly to groups) - Toggles := ''; - if Pos('i', EnableFlags) > 0 then Toggles := Toggles + '(?i)'; - if Pos('m', EnableFlags) > 0 then Toggles := Toggles + '(?m)'; - if Pos('i', DisableFlags) > 0 then Toggles := Toggles + '(?-i)'; - if Pos('m', DisableFlags) > 0 then Toggles := Toggles + '(?-m)'; - // s disable uses TRegExpr toggle (correctly scoped to groups) - if Pos('s', DisableFlags) > 0 then Toggles := Toggles + '(?-s)'; - // Determine new s state (s enable uses dot transformation) - NewSActive := CurrentSActive; - if Pos('s', EnableFlags) > 0 then NewSActive := True; - if Pos('s', 
DisableFlags) > 0 then NewSActive := False; - // Push s state if s modifier changed - if NewSActive <> CurrentSActive then - begin - Inc(SStackTop); - if SStackTop >= Length(SStack) then - SetLength(SStack, SStackTop * 2 + 4); - SStack[SStackTop].Depth := GroupDepth; - SStack[SStackTop].PreviousSActive := CurrentSActive; - CurrentSActive := NewSActive; - end; - // Emit non-capturing group with toggles - Result := Result + '(?:' + Toggles; - I := J + 1; - Continue; - end; - end; - // Regular group or non-modifier (?...) — pass through - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // Default: copy character as-is - Result := Result + APattern[I]; - Inc(I); - end; -end; - -// ES2026 §22.2.3.1 RegExp ( pattern, flags ) — validation step procedure ValidateRegExpPattern(const APattern, AFlags: string); -var - Matcher: TRegExpr; - NormalizedPattern: string; - ExecutablePattern: string; - ConvertedPattern: string; - DiscardedGroups: TGocciaRegExpNamedGroups; - IsUnicode: Boolean; begin - ValidateRegExpFlags(AFlags); - NormalizedPattern := NormalizeRegExpSource(APattern); - if NormalizedPattern = EMPTY_REGEX then - Exit; - ExecutablePattern := GetExecutableRegExpPattern(NormalizedPattern); - // ES2025: Validate inline modifier groups before transformation - ValidateModifierGroups(ExecutablePattern); - // ES2025: Transform modifier groups into TRegExpr-compatible syntax - ExecutablePattern := PreprocessModifierGroups(ExecutablePattern); - IsUnicode := HasRegExpFlag(AFlags, 'u'); - ConvertedPattern := PreprocessRegExpPattern(ExecutablePattern, DiscardedGroups); - // ES2026 §22.2.2.9: Apply Unicode pattern preprocessing when u flag is set - if IsUnicode then - ConvertedPattern := PreprocessUnicodePattern(ConvertedPattern, - HasRegExpFlag(AFlags, 'i')); - Matcher := TRegExpr.Create; - try - Matcher.Expression := ConvertedPattern; - Matcher.ModifierI := HasRegExpFlag(AFlags, 'i'); - Matcher.ModifierM := HasRegExpFlag(AFlags, 'm'); - Matcher.ModifierS := 
HasRegExpFlag(AFlags, 's'); - if IsUnicode then - Matcher.ModifierR := False; - Matcher.Compile; - finally - Matcher.Free; - end; + ValidateRegExpPatternNew(APattern, AFlags); end; function CanonicalizeRegExpFlags(const AFlags: string): string; @@ -431,416 +108,16 @@ function RegExpToString(const APattern, AFlags: string): string; CanonicalizeRegExpFlags(AFlags); end; -// ES2026 §22.2.7.2 AdvanceStringIndex ( S, index, unicode ) -function AdvanceStringIndex(const AInput: string; const AIndex: Integer; - const AUnicode: Boolean): Integer; -begin - Result := AdvanceUTF8StringIndex(AInput, AIndex, AUnicode); -end; - -function FindNamedGroupIndex(const ANamedGroups: TGocciaRegExpNamedGroups; - const AName: string): Integer; -var - I: Integer; -begin - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - Exit(ANamedGroups[I].Index); - Result := -1; -end; - -// ES2025 §22.2.1 Static Semantics: Early Errors — duplicate GroupSpecifier -// Two disjunction paths share a branch if they agree at every common depth. -// When they share a branch, both groups can participate in the same match — -// making duplicate names a SyntaxError. -function PathsShareBranch(const APathA, APathB: array of Integer): Boolean; -var - MinLength, I: Integer; -begin - MinLength := Min(Length(APathA), Length(APathB)); - for I := 0 to MinLength - 1 do - if APathA[I] <> APathB[I] then - Exit(False); - Result := True; -end; - -// ES2025 §22.2.2 Runtime Semantics: CompileAtom — \k GroupName -// Resolve \k backreference when multiple groups share the same name. -// Returns the TRegExpr-compatible backreference string. -// -// When the backreference is outside the disjunction containing the duplicate -// groups (CompatCount = 0 or > 1), we emit (?:\N1|\N2|...) — an alternation -// of all candidate backreferences. 
This is correct because TRegExpr fails -// (rather than matching empty) when a backreference targets a non-participating -// group, so the alternation falls through to the participating group's backref. -// Concatenation (\N1\N2) would be wrong: the non-participating backref would -// fail and abort the entire match. -function ResolveNamedBackreference( - const ANamedGroups: TGocciaRegExpNamedGroups; - const AName: string; const ACurrentPath: array of Integer): string; -var - AllIndices: array of Integer; - CompatibleIndices: array of Integer; - AllCount, CompatCount, I: Integer; -begin - Result := ''; - // Collect all group indices with this name - AllCount := 0; - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - Inc(AllCount); - if AllCount = 0 then - Exit; - if AllCount = 1 then - begin - // Single group — simple backreference (ES2018 behavior) - Result := '\' + IntToStr(FindNamedGroupIndex(ANamedGroups, AName)); - Exit; - end; - // ES2025: Multiple groups with same name — resolve via disjunction path - SetLength(CompatibleIndices, AllCount); - CompatCount := 0; - for I := 0 to High(ANamedGroups) do - if (ANamedGroups[I].Name = AName) and - PathsShareBranch(ANamedGroups[I].DisjunctionPath, ACurrentPath) then - begin - CompatibleIndices[CompatCount] := ANamedGroups[I].Index; - Inc(CompatCount); - end; - if CompatCount = 1 then - begin - // Exactly one compatible group — resolve directly - Result := '\' + IntToStr(CompatibleIndices[0]); - Exit; - end; - if CompatCount = 0 then - begin - // Backreference outside the disjunction — collect all groups with this name - SetLength(AllIndices, AllCount); - AllCount := 0; - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - begin - AllIndices[AllCount] := ANamedGroups[I].Index; - Inc(AllCount); - end; - // Emit alternation: (?:\1|\2|...) 
— the participating group's backreference - // succeeds while non-participating ones either match empty or fail through - Result := '(?:'; - for I := 0 to AllCount - 1 do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + '\' + IntToStr(AllIndices[I]); - end; - Result := Result + ')'; - Exit; - end; - // Multiple compatible groups — emit alternation of compatible ones - Result := '(?:'; - for I := 0 to CompatCount - 1 do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + '\' + IntToStr(CompatibleIndices[I]); - end; - Result := Result + ')'; -end; - -// Pass 1: collect all named groups and their capture indices without modifying -// the pattern, so that forward \k backreferences can be resolved. -// ES2025: Also tracks disjunction paths and validates duplicate named groups. -function CollectNamedGroups(const APattern: string): TGocciaRegExpNamedGroups; -var - I, J, K, L, PatternLength, GroupIndex, CloseAngle: Integer; - InCharClass: Boolean; - GroupName: string; - AltStack: array of Integer; - AltStackDepth: Integer; -begin - SetLength(Result, 0); - PatternLength := Length(APattern); - I := 1; - GroupIndex := 0; - InCharClass := False; - // ES2025: Initialize disjunction path stack with top-level scope - SetLength(AltStack, 64); - AltStackDepth := 0; - AltStack[0] := 0; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - Inc(I, 2) - else - Inc(I); - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Inc(I); - Continue; - end; - if InCharClass then - begin - Inc(I); - Continue; - end; - // ES2025: Track disjunction alternatives - if APattern[I] = '|' then - begin - Inc(AltStack[AltStackDepth]); - Inc(I); - Continue; - end; - if APattern[I] = ')' then - begin - if AltStackDepth > 0 then - Dec(AltStackDepth); - Inc(I); - Continue; - end; - if 
APattern[I] = '(' then - begin - // Push disjunction level for all group types - Inc(AltStackDepth); - if AltStackDepth >= Length(AltStack) then - SetLength(AltStack, AltStackDepth * 2 + 4); - AltStack[AltStackDepth] := 0; - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') then - begin - if (I + 2 <= PatternLength) and (APattern[I + 2] = '<') then - begin - // (?<= lookbehind, (?...) - CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - Inc(GroupIndex); - GroupName := Copy(APattern, I + 3, CloseAngle - I - 3); - SetLength(Result, Length(Result) + 1); - Result[High(Result)].Name := GroupName; - Result[High(Result)].Index := GroupIndex; - // ES2025: Record disjunction path for duplicate name validation - SetLength(Result[High(Result)].DisjunctionPath, AltStackDepth + 1); - for J := 0 to AltStackDepth do - Result[High(Result)].DisjunctionPath[J] := AltStack[J]; - I := CloseAngle + 1; - Continue; - end; - end; - // Non-capturing or other (?...) group — skip without incrementing index - Inc(I, 2); - Continue; - end; - // Plain capturing group - Inc(GroupIndex); - end; - Inc(I); - end; - // ES2025 §22.2.1.1: Validate duplicate named capture groups are in different - // alternatives. Two groups with the same name that share a disjunction branch - // can both participate in a single match — that is a SyntaxError. - for K := 0 to High(Result) - 1 do - for L := K + 1 to High(Result) do - if (Result[K].Name = Result[L].Name) and - PathsShareBranch(Result[K].DisjunctionPath, - Result[L].DisjunctionPath) then - raise EConvertError.CreateFmt( - 'Duplicate named capture group: %s', [Result[K].Name]); -end; - -// Pass 2: convert named groups to plain capturing groups and resolve \k -// backreferences using the complete group map from pass 1. -// ES2025: Tracks disjunction paths for correct \k resolution with -// duplicate named capture groups. 
-function PreprocessRegExpPattern(const APattern: string; - out ANamedGroups: TGocciaRegExpNamedGroups): string; -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - GroupName: string; - CloseAngle: Integer; - BackrefResult: string; - AltStack: array of Integer; - AltStackDepth: Integer; - CaptureIndex, TargetIndex: Integer; -begin - // Pass 1: collect all named groups so forward backreferences resolve - ANamedGroups := CollectNamedGroups(APattern); - PatternLength := Length(APattern); - if PatternLength = 0 then - begin - Result := ''; - Exit; - end; - // Pass 2: emit converted pattern with disjunction path tracking - Result := ''; - I := 1; - InCharClass := False; - // ES2025: Track disjunction path for \k resolution - SetLength(AltStack, 64); - AltStackDepth := 0; - AltStack[0] := 0; - CaptureIndex := 0; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - begin - // \k backreference: convert to numeric backreference(s) - if (APattern[I + 1] = 'k') and (I + 2 <= PatternLength) and - (APattern[I + 2] = '<') then - begin - CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - GroupName := Copy(APattern, I + 3, CloseAngle - I - 3); - // ES2025: Resolve with duplicate named group awareness - TargetIndex := FindNamedGroupIndex(ANamedGroups, GroupName); - if TargetIndex > CaptureIndex then - BackrefResult := '' - else - BackrefResult := ResolveNamedBackreference(ANamedGroups, - GroupName, Copy(AltStack, 0, AltStackDepth + 1)); - if BackrefResult = '' then - begin - if TargetIndex < 0 then - raise EConvertError.CreateFmt( - 'Invalid named backreference: %s', [GroupName]); - Result := Result + '(?:)'; - end - else - Result := Result + BackrefResult; - I := CloseAngle + 1; - Continue; - end; - end; - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end - else - begin - Result := 
Result + APattern[I]; - Inc(I); - end; - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if InCharClass then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // ES2025: Track disjunction alternatives - if APattern[I] = '|' then - begin - Inc(AltStack[AltStackDepth]); - Result := Result + '|'; - Inc(I); - Continue; - end; - if APattern[I] = ')' then - begin - if AltStackDepth > 0 then - Dec(AltStackDepth); - Result := Result + ')'; - Inc(I); - Continue; - end; - if APattern[I] = '(' then - begin - // Push disjunction level for all group types - Inc(AltStackDepth); - if AltStackDepth >= Length(AltStack) then - SetLength(AltStack, AltStackDepth * 2 + 4); - AltStack[AltStackDepth] := 0; - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') then - begin - if (I + 2 <= PatternLength) and (APattern[I + 2] = '<') then - begin - // (?<= lookbehind, (?...) -> plain capturing group (...) 
- CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - // Strip the name, emit plain capturing group - Inc(CaptureIndex); - Result := Result + '('; - I := CloseAngle + 1; - Continue; - end; - end; - Result := Result + '(?'; - Inc(I, 2); - Continue; - end; - Inc(CaptureIndex); - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - Result := Result + APattern[I]; - Inc(I); - end; -end; - -// ES2026 §22.2.7.1 RegExpExec ( R, S ) function ExecuteRegExp(const APattern, AFlags, AInput: string; const AStartIndex: Integer; const ARequireStart: Boolean; out AResult: TGocciaRegExpMatchResult): Boolean; var - Matcher: TRegExpr; - I: Integer; - ExecutablePattern: string; - ConvertedPattern: string; - NamedGroups: TGocciaRegExpNamedGroups; + Prog: TRegExpProgram; + VMResult: TRegExpVMResult; IsUnicode: Boolean; + I, GroupCount: Integer; + PatternToCompile: string; + SlotStart, SlotEnd: Integer; begin AResult.Found := False; AResult.MatchIndex := -1; @@ -849,7 +126,7 @@ function ExecuteRegExp(const APattern, AFlags, AInput: string; SetLength(AResult.Groups, 0); SetLength(AResult.NamedGroups, 0); ValidateRegExpFlags(AFlags); - IsUnicode := HasRegExpFlag(AFlags, 'u'); + IsUnicode := HasRegExpFlag(AFlags, 'u') or HasRegExpFlag(AFlags, 'v'); if AStartIndex > Length(AInput) then Exit(False); if APattern = EMPTY_REGEX then @@ -857,57 +134,52 @@ function ExecuteRegExp(const APattern, AFlags, AInput: string; AResult.Found := True; AResult.MatchIndex := AStartIndex; AResult.MatchEnd := AStartIndex; - AResult.NextIndex := AdvanceStringIndex(AInput, AStartIndex, - IsUnicode or HasRegExpFlag(AFlags, 'v')); + AResult.NextIndex := AdvanceUTF8StringIndex(AInput, AStartIndex, IsUnicode); SetLength(AResult.Groups, 1); AResult.Groups[0].Matched := True; AResult.Groups[0].Value := ''; Exit(True); end; - // ES2025: Transform modifier groups before named group preprocessing - 
ExecutablePattern := PreprocessModifierGroups( - GetExecutableRegExpPattern(APattern)); - ConvertedPattern := PreprocessRegExpPattern(ExecutablePattern, NamedGroups); - // ES2026 §22.2.2.9: Apply Unicode pattern preprocessing when u flag is set - if IsUnicode then - ConvertedPattern := PreprocessUnicodePattern(ConvertedPattern, - HasRegExpFlag(AFlags, 'i')); - Matcher := TRegExpr.Create; - try - Matcher.Expression := ConvertedPattern; - Matcher.ModifierI := HasRegExpFlag(AFlags, 'i'); - Matcher.ModifierM := HasRegExpFlag(AFlags, 'm'); - Matcher.ModifierS := HasRegExpFlag(AFlags, 's'); - if IsUnicode then - Matcher.ModifierR := False; - Matcher.Compile; - Matcher.InputString := AInput; - Result := Matcher.ExecPos(AStartIndex + 1); - if Result and ARequireStart and - (Matcher.MatchPos[0] <> AStartIndex + 1) then - Result := False; - if not Result then - Exit(False); - AResult.Found := True; - AResult.MatchIndex := Matcher.MatchPos[0] - 1; - AResult.MatchEnd := AResult.MatchIndex + Matcher.MatchLen[0]; - AResult.NextIndex := AResult.MatchEnd; - if Matcher.MatchLen[0] = 0 then - AResult.NextIndex := AdvanceStringIndex(AInput, AResult.NextIndex, - IsUnicode or HasRegExpFlag(AFlags, 'v')); - SetLength(AResult.Groups, Matcher.SubExprMatchCount + 1); - for I := 0 to Matcher.SubExprMatchCount do + PatternToCompile := APattern; + if PatternToCompile = EMPTY_REGEX then { NOTE(review): the EMPTY_REGEX case already returns early above, so this guard looks unreachable - confirm before removing } + PatternToCompile := ''; + Prog := CompileRegExp(PatternToCompile, AFlags); { compile pattern + flags into a VM program } + Result := ExecuteRegExpVM(Prog, AInput, AStartIndex, ARequireStart, VMResult); + if not Result then + Exit(False); + if Length(VMResult.CaptureSlots) < 2 then { slots 0 and 1 bound the overall match; without them report no match and leave AResult.Found = False } + Exit(False); + AResult.Found := True; { flagged only after the match bounds are known to be readable, so Found never disagrees with the return value } + AResult.MatchIndex := VMResult.CaptureSlots[0] - 1; { slot values are used as 1-based string positions (see Copy below); result indices are 0-based } + AResult.MatchEnd := VMResult.CaptureSlots[1] - 1; + AResult.NextIndex := AResult.MatchEnd; + if AResult.MatchEnd = AResult.MatchIndex then { empty match: step past one character/code point so callers iterating matches make progress } + AResult.NextIndex := AdvanceUTF8StringIndex(AInput, AResult.NextIndex, + IsUnicode); + GroupCount := Prog.CaptureCount + 1; { +1 for group 0, the whole match } + 
SetLength(AResult.Groups, GroupCount); + for I := 0 to GroupCount - 1 do + begin + SlotStart := -1; + SlotEnd := -1; + if I * 2 + 1 < Length(VMResult.CaptureSlots) then + begin + SlotStart := VMResult.CaptureSlots[I * 2]; + SlotEnd := VMResult.CaptureSlots[I * 2 + 1]; + end; + if (SlotStart >= 1) and (SlotEnd >= SlotStart) and + (SlotEnd <= Length(AInput) + 1) then + begin + AResult.Groups[I].Matched := True; + AResult.Groups[I].Value := Copy(AInput, SlotStart, SlotEnd - SlotStart); + end + else begin - AResult.Groups[I].Matched := Matcher.MatchPos[I] > 0; - if AResult.Groups[I].Matched then - AResult.Groups[I].Value := Matcher.Match[I] - else - AResult.Groups[I].Value := ''; + AResult.Groups[I].Matched := False; + AResult.Groups[I].Value := ''; end; - AResult.NamedGroups := NamedGroups; - finally - Matcher.Free; end; + AResult.NamedGroups := Prog.NamedGroups; end; end. diff --git a/source/units/Goccia.RegExp.Unicode.pas b/source/units/Goccia.RegExp.Unicode.pas deleted file mode 100644 index bcead522..00000000 --- a/source/units/Goccia.RegExp.Unicode.pas +++ /dev/null @@ -1,611 +0,0 @@ -unit Goccia.RegExp.Unicode; - -{$I Goccia.inc} - -interface - -function ExpandUnicodePropertyEscape(const APropertyName: string; - const ANegated: Boolean): string; -function PreprocessUnicodePattern(const APattern: string; - const AIgnoreCase: Boolean = False): string; - -implementation - -uses - SysUtils; - -const - UNSUPPORTED_PROPERTY_PREFIX = 'Invalid Unicode property name: '; - UTF8_ANY_CODE_POINT = - '(?:[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|' + - '[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|' + - '[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])'; - UTF8_NON_SPACE_CODE_POINT = - '(?:[\x00-\x08\x0E-\x1F\x21-\x7F]|' + - '[\xC2-\xDF][\x80-\xBF]|' + - '[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|' + - '[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])'; - - // ES2026 §22.2.2.9 Unicode property escape character classes. 
- // These use ASCII-safe approximations for the most commonly used - // General Category properties and Binary properties. - CHAR_CLASS_LETTER = 'A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\xFF'; - CHAR_CLASS_UPPERCASE_LETTER = 'A-Z\xC0-\xD6\xD8-\xDE'; - CHAR_CLASS_LOWERCASE_LETTER = 'a-z\xDF-\xF6\xF8-\xFF'; - CHAR_CLASS_DECIMAL_NUMBER = '0-9'; - CHAR_CLASS_NUMBER = '0-9'; - CHAR_CLASS_PUNCTUATION = - '!\x22#%&\x27\x28\x29*,\x2D.\x2F:;\x3F@\x5B\\\x5D_\x7B\x7D'; - CHAR_CLASS_SYMBOL = '\x24+<=>^`|~'; - CHAR_CLASS_SEPARATOR = '\x20\xA0'; - CHAR_CLASS_CONTROL = '\x00-\x1F\x7F-\x9F'; - CHAR_CLASS_ASCII = '\x00-\x7F'; - CHAR_CLASS_ASCII_HEX_DIGIT = '0-9A-Fa-f'; - CHAR_CLASS_WHITE_SPACE = '\x09-\x0D\x20\xA0'; - -// ES2026 §22.2.2.9 CharacterClassEscape :: \p{UnicodePropertyValueExpression} -function ExpandUnicodePropertyEscape(const APropertyName: string; - const ANegated: Boolean): string; -var - CharClass: string; - NegatePrefix: string; -begin - CharClass := ''; - - if (APropertyName = 'L') or (APropertyName = 'Letter') then - CharClass := CHAR_CLASS_LETTER - else if (APropertyName = 'Lu') or (APropertyName = 'Uppercase_Letter') then - CharClass := CHAR_CLASS_UPPERCASE_LETTER - else if (APropertyName = 'Ll') or (APropertyName = 'Lowercase_Letter') then - CharClass := CHAR_CLASS_LOWERCASE_LETTER - else if (APropertyName = 'N') or (APropertyName = 'Number') then - CharClass := CHAR_CLASS_NUMBER - else if (APropertyName = 'Nd') or (APropertyName = 'Decimal_Number') then - CharClass := CHAR_CLASS_DECIMAL_NUMBER - else if (APropertyName = 'P') or (APropertyName = 'Punctuation') then - CharClass := CHAR_CLASS_PUNCTUATION - else if (APropertyName = 'S') or (APropertyName = 'Symbol') then - CharClass := CHAR_CLASS_SYMBOL - else if (APropertyName = 'Z') or (APropertyName = 'Separator') then - CharClass := CHAR_CLASS_SEPARATOR - else if (APropertyName = 'Cc') or (APropertyName = 'Control') then - CharClass := CHAR_CLASS_CONTROL - else if APropertyName = 'ASCII' then - CharClass := 
CHAR_CLASS_ASCII - else if APropertyName = 'ASCII_Hex_Digit' then - CharClass := CHAR_CLASS_ASCII_HEX_DIGIT - else if APropertyName = 'White_Space' then - CharClass := CHAR_CLASS_WHITE_SPACE - else - raise EConvertError.Create(UNSUPPORTED_PROPERTY_PREFIX + APropertyName); - - if ANegated then - NegatePrefix := '^' - else - NegatePrefix := ''; - - Result := '[' + NegatePrefix + CharClass + ']'; -end; - -// ES2026 §11.1.4 Static Semantics: UTF16EncodeCodePoint ( cp ) -function CodePointToUtf8(const ACodePoint: Cardinal): string; -begin - if ACodePoint <= $7F then - Result := Chr(ACodePoint) - else if ACodePoint <= $7FF then - Result := Chr($C0 or (ACodePoint shr 6)) + - Chr($80 or (ACodePoint and $3F)) - else if ACodePoint <= $FFFF then - Result := Chr($E0 or (ACodePoint shr 12)) + - Chr($80 or ((ACodePoint shr 6) and $3F)) + - Chr($80 or (ACodePoint and $3F)) - else if ACodePoint <= $10FFFF then - Result := Chr($F0 or (ACodePoint shr 18)) + - Chr($80 or ((ACodePoint shr 12) and $3F)) + - Chr($80 or ((ACodePoint shr 6) and $3F)) + - Chr($80 or (ACodePoint and $3F)) - else - raise EConvertError.Create('Invalid Unicode code point: U+' + - IntToHex(ACodePoint, 4)); -end; - -function DecodeUtf8At(const APattern: string; const AIndex: Integer; - out ACodePoint: Cardinal; out AByteLength: Integer): Boolean; -var - B1, B2, B3, B4: Byte; -begin - Result := False; - ACodePoint := 0; - AByteLength := 0; - if AIndex > Length(APattern) then - Exit; - B1 := Ord(APattern[AIndex]); - if B1 < $80 then - begin - ACodePoint := B1; - AByteLength := 1; - Exit(True); - end; - if (B1 >= $C2) and (B1 <= $DF) and (AIndex + 1 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - if (B2 and $C0) <> $80 then - Exit; - ACodePoint := ((B1 and $1F) shl 6) or (B2 and $3F); - AByteLength := 2; - Exit(True); - end; - if (B1 >= $E0) and (B1 <= $EF) and (AIndex + 2 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - B3 := Ord(APattern[AIndex + 2]); - if ((B2 and 
$C0) <> $80) or ((B3 and $C0) <> $80) then - Exit; - ACodePoint := ((B1 and $0F) shl 12) or ((B2 and $3F) shl 6) or - (B3 and $3F); - AByteLength := 3; - Exit(True); - end; - if (B1 >= $F0) and (B1 <= $F4) and (AIndex + 3 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - B3 := Ord(APattern[AIndex + 2]); - B4 := Ord(APattern[AIndex + 3]); - if ((B2 and $C0) <> $80) or ((B3 and $C0) <> $80) or - ((B4 and $C0) <> $80) then - Exit; - ACodePoint := ((B1 and $07) shl 18) or ((B2 and $3F) shl 12) or - ((B3 and $3F) shl 6) or (B4 and $3F); - AByteLength := 4; - Exit(True); - end; -end; - -function IsHexDigit(const C: Char): Boolean; inline; -begin - Result := CharInSet(C, ['0'..'9', 'a'..'f', 'A'..'F']); -end; - -function EscapeLiteralAtom(const AValue: string): string; -const - REGEXP_SYNTAX_CHARS = ['\', '^', '$', '.', '|', '?', '*', '+', '(', ')', - '[', ']', '{', '}']; -var - I: Integer; -begin - Result := ''; - for I := 1 to Length(AValue) do - begin - if CharInSet(AValue[I], REGEXP_SYNTAX_CHARS) then - Result := Result + '\'; - Result := Result + AValue[I]; - end; -end; - -function HexByte(const AValue: Byte): string; inline; -begin - Result := '\x' + IntToHex(AValue, 2); -end; - -function ByteRangeExcept(const AMin, AMax, AExcluded: Byte): string; -begin - Result := ''; - if AExcluded > AMin then - Result := Result + HexByte(AMin) + '-' + HexByte(AExcluded - 1); - if AExcluded < AMax then - begin - if Result <> '' then - Result := Result + HexByte(AExcluded + 1) + '-' + HexByte(AMax) - else - Result := HexByte(AExcluded + 1) + '-' + HexByte(AMax); - end; - if Result = '' then - Result := '[^\s\S]' - else - Result := '[' + Result + ']'; -end; - -function Utf8AnyCodePointExcept(const ACodePoint: Cardinal): string; -var - Bytes: string; - Parts: array of string; - - procedure AddPart(const APart: string); - begin - SetLength(Parts, Length(Parts) + 1); - Parts[High(Parts)] := APart; - end; - -var - I: Integer; -begin - Bytes := 
CodePointToUtf8(ACodePoint); - SetLength(Parts, 0); - if Length(Bytes) <> 1 then - AddPart('[\x00-\x7F]'); - if Length(Bytes) <> 2 then - AddPart('[\xC2-\xDF][\x80-\xBF]'); - if Length(Bytes) <> 3 then - AddPart('[\xE0-\xEF][\x80-\xBF][\x80-\xBF]'); - if Length(Bytes) <> 4 then - AddPart('[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]'); - - case Length(Bytes) of - 1: - AddPart(ByteRangeExcept($00, $7F, Ord(Bytes[1]))); - 2: - begin - AddPart(ByteRangeExcept($C2, $DF, Ord(Bytes[1])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2]))); - end; - 3: - begin - AddPart(ByteRangeExcept($E0, $EF, Ord(Bytes[1])) + - '[\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - ByteRangeExcept($80, $BF, Ord(Bytes[3]))); - end; - 4: - begin - AddPart(ByteRangeExcept($F0, $F4, Ord(Bytes[1])) + - '[\x80-\xBF][\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2])) + - '[\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - ByteRangeExcept($80, $BF, Ord(Bytes[3])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - HexByte(Ord(Bytes[3])) + ByteRangeExcept($80, $BF, Ord(Bytes[4]))); - end; - end; - - Result := '(?:'; - for I := 0 to High(Parts) do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + Parts[I]; - end; - Result := Result + ')'; -end; - -function EmitUnicodeAtom(const ACodePoint: Cardinal; - const AIgnoreCase: Boolean): string; -begin - if AIgnoreCase and (ACodePoint = $212A) then - Result := '[Kk]' - else - Result := '(?:' + EscapeLiteralAtom(CodePointToUtf8(ACodePoint)) + ')'; -end; - -function TryParseUnicodeClassAtom(const APattern: string; var AIndex: Integer; - const AStopIndex: Integer; out ACodePoint: Cardinal): Boolean; -var - HexStr: string; - HighSurrogate, 
LowSurrogate: Cardinal; - ByteLength: Integer; -begin - Result := False; - ACodePoint := 0; - if AIndex > AStopIndex then - Exit; - if (APattern[AIndex] = '\') and (AIndex + 5 <= AStopIndex) and - (APattern[AIndex + 1] = 'u') then - begin - HexStr := Copy(APattern, AIndex + 2, 4); - if not ((Length(HexStr) = 4) and IsHexDigit(HexStr[1]) and - IsHexDigit(HexStr[2]) and IsHexDigit(HexStr[3]) and - IsHexDigit(HexStr[4])) then - Exit; - HighSurrogate := StrToInt('$' + HexStr); - Inc(AIndex, 6); - if (HighSurrogate >= $D800) and (HighSurrogate <= $DBFF) and - (AIndex + 5 <= AStopIndex) and (APattern[AIndex] = '\') and - (APattern[AIndex + 1] = 'u') then - begin - HexStr := Copy(APattern, AIndex + 2, 4); - if (Length(HexStr) = 4) and IsHexDigit(HexStr[1]) and - IsHexDigit(HexStr[2]) and IsHexDigit(HexStr[3]) and - IsHexDigit(HexStr[4]) then - begin - LowSurrogate := StrToInt('$' + HexStr); - if (LowSurrogate >= $DC00) and (LowSurrogate <= $DFFF) then - begin - ACodePoint := $10000 + ((HighSurrogate - $D800) shl 10) + - (LowSurrogate - $DC00); - Inc(AIndex, 6); - Exit(True); - end; - end; - end; - ACodePoint := HighSurrogate; - Exit(True); - end; - if DecodeUtf8At(APattern, AIndex, ACodePoint, ByteLength) and - (ByteLength > 1) then - begin - Inc(AIndex, ByteLength); - Exit(True); - end; -end; - -function TryConvertUnicodeCharacterClass(const APattern: string; - const AStartIndex: Integer; const AIgnoreCase: Boolean; - out AReplacement: string; out ANextIndex: Integer): Boolean; -var - EndIndex, AtomStart, I: Integer; - Negated: Boolean; - FirstCodePoint, LastCodePoint, CurrentCodePoint: Cardinal; - CodePoints: array of Cardinal; -begin - Result := False; - AReplacement := ''; - ANextIndex := AStartIndex; - EndIndex := AStartIndex + 1; - while (EndIndex <= Length(APattern)) and (APattern[EndIndex] <> ']') do - Inc(EndIndex); - if EndIndex > Length(APattern) then - Exit; - AtomStart := AStartIndex + 1; - Negated := (AtomStart < EndIndex) and (APattern[AtomStart] = '^'); - 
if Negated then - Inc(AtomStart); - I := AtomStart; - if not TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, FirstCodePoint) then - Exit; - if I = EndIndex then - begin - if Negated then - AReplacement := Utf8AnyCodePointExcept(FirstCodePoint) - else - AReplacement := EmitUnicodeAtom(FirstCodePoint, AIgnoreCase); - ANextIndex := EndIndex + 1; - Exit(True); - end; - if (not Negated) and (I < EndIndex) and (APattern[I] = '-') then - begin - Inc(I); - if TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, LastCodePoint) and - (I = EndIndex) and (FirstCodePoint <= LastCodePoint) and - (LastCodePoint - FirstCodePoint <= 32) then - begin - AReplacement := '(?:'; - for CurrentCodePoint := FirstCodePoint to LastCodePoint do - begin - if CurrentCodePoint > FirstCodePoint then - AReplacement := AReplacement + '|'; - AReplacement := AReplacement + - EscapeLiteralAtom(CodePointToUtf8(CurrentCodePoint)); - end; - AReplacement := AReplacement + ')'; - ANextIndex := EndIndex + 1; - Exit(True); - end; - end; - - SetLength(CodePoints, 1); - CodePoints[0] := FirstCodePoint; - while I < EndIndex do - begin - if not TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, CurrentCodePoint) then - Exit(False); - SetLength(CodePoints, Length(CodePoints) + 1); - CodePoints[High(CodePoints)] := CurrentCodePoint; - end; - - if Negated then - begin - AReplacement := '[^\s\S]'; - ANextIndex := EndIndex + 1; - Exit(True); - end; - AReplacement := ''; - AReplacement := AReplacement + '(?:'; - for I := 0 to High(CodePoints) do - begin - if I > 0 then - AReplacement := AReplacement + '|'; - AReplacement := AReplacement + - EscapeLiteralAtom(CodePointToUtf8(CodePoints[I])); - end; - AReplacement := AReplacement + ')'; - ANextIndex := EndIndex + 1; - Result := True; -end; - -// ES2026 §22.2.1 Patterns — preprocess pattern for Unicode mode. 
-// Expands \p{...} / \P{...} property escapes into TRegExpr-compatible -// character classes and converts \u{XXXX} code point escapes into -// literal UTF-8 byte sequences. -function PreprocessUnicodePattern(const APattern: string; - const AIgnoreCase: Boolean): string; -var - I, J, PatternLength: Integer; - PropertyName: string; - Negated: Boolean; - InCharacterClass: Boolean; - CodePoint: Cardinal; - HexStart, HexLen: Integer; - HexStr: string; - LowSurrogate: Cardinal; - ByteLength: Integer; - ClassReplacement: string; - NextIndex: Integer; -begin - Result := ''; - I := 1; - PatternLength := Length(APattern); - InCharacterClass := False; - - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 > PatternLength then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - - case APattern[I + 1] of - '0': - begin - Result := Result + '(?:' + #0 + ')'; - Inc(I, 2); - end; - 'S': - begin - Result := Result + UTF8_NON_SPACE_CODE_POINT; - Inc(I, 2); - end; - 'p', 'P': - begin - Negated := APattern[I + 1] = 'P'; - if (I + 2 <= PatternLength) and (APattern[I + 2] = '{') then - begin - PropertyName := ''; - Inc(I, 3); - while (I <= PatternLength) and (APattern[I] <> '}') do - begin - PropertyName := PropertyName + APattern[I]; - Inc(I); - end; - if I > PatternLength then - raise EConvertError.Create( - 'Unterminated Unicode property escape'); - Inc(I); // skip closing brace - Result := Result + - ExpandUnicodePropertyEscape(PropertyName, Negated); - end - else - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - 'u': - begin - // \u{XXXX} or \u{XXXXX} code point escape - if (I + 2 <= PatternLength) and (APattern[I + 2] = '{') then - begin - HexStart := I + 3; - HexLen := 0; - while (HexStart + HexLen <= PatternLength) and - (APattern[HexStart + HexLen] <> '}') do - Inc(HexLen); - if HexStart + HexLen > PatternLength then - raise EConvertError.Create( - 'Unterminated Unicode escape 
sequence'); - HexStr := Copy(APattern, HexStart, HexLen); - if HexStr = '' then - raise EConvertError.Create( - 'Empty Unicode escape sequence'); - for J := 1 to Length(HexStr) do - if not IsHexDigit(HexStr[J]) then - raise EConvertError.Create( - 'Invalid hex digit in Unicode escape: \u{' + - HexStr + '}'); - CodePoint := StrToInt('$' + HexStr); - if CodePoint > $10FFFF then - raise EConvertError.Create( - 'Unicode escape out of range: \u{' + HexStr + '}'); - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - I := HexStart + HexLen + 1; - end - // \uHHHH four-digit Unicode escape - else if (I + 5 <= PatternLength) and - IsHexDigit(APattern[I + 2]) and - IsHexDigit(APattern[I + 3]) and - IsHexDigit(APattern[I + 4]) and - IsHexDigit(APattern[I + 5]) then - begin - HexStr := Copy(APattern, I + 2, 4); - CodePoint := StrToInt('$' + HexStr); - if (CodePoint >= $D800) and (CodePoint <= $DBFF) and - (I + 11 <= PatternLength) and (APattern[I + 6] = '\') and - (APattern[I + 7] = 'u') and - IsHexDigit(APattern[I + 8]) and - IsHexDigit(APattern[I + 9]) and - IsHexDigit(APattern[I + 10]) and - IsHexDigit(APattern[I + 11]) then - begin - HexStr := Copy(APattern, I + 8, 4); - LowSurrogate := StrToInt('$' + HexStr); - if (LowSurrogate >= $DC00) and (LowSurrogate <= $DFFF) then - begin - CodePoint := $10000 + ((CodePoint - $D800) shl 10) + - (LowSurrogate - $DC00); - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, 12); - Continue; - end; - end; - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, 6); - end - else - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - else - begin - Result := Result + 
APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - end - else if APattern[I] = '[' then - begin - if TryConvertUnicodeCharacterClass(APattern, I, AIgnoreCase, - ClassReplacement, NextIndex) then - begin - Result := Result + ClassReplacement; - I := NextIndex; - Continue; - end; - InCharacterClass := True; - Result := Result + APattern[I]; - Inc(I); - end - else if (not InCharacterClass) and (APattern[I] = '.') then - begin - Result := Result + UTF8_ANY_CODE_POINT; - Inc(I); - end - else if (not InCharacterClass) and - DecodeUtf8At(APattern, I, CodePoint, ByteLength) and - (ByteLength > 1) then - begin - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, ByteLength); - end - else if (APattern[I] = ']') and InCharacterClass then - begin - InCharacterClass := False; - Result := Result + APattern[I]; - Inc(I); - end - else - begin - Result := Result + APattern[I]; - Inc(I); - end; - end; -end; - -end. diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas new file mode 100644 index 00000000..74aef906 --- /dev/null +++ b/source/units/Goccia.RegExp.VM.pas @@ -0,0 +1,654 @@ +unit Goccia.RegExp.VM; + +{$I Goccia.inc} + +interface + +uses + Goccia.RegExp.Compiler; + +type + TRegExpVMResult = record + Matched: Boolean; + CaptureSlots: array of Integer; + end; + +function ExecuteRegExpVM(const AProgram: TRegExpProgram; + const AInput: string; const AStartIndex: Integer; + const ARequireStart: Boolean; out AResult: TRegExpVMResult): Boolean; + +implementation + +{$rangechecks off} + +uses + SysUtils, + + TextSemantics; + +const + DEFAULT_STEP_LIMIT = 10000000; + DEFAULT_BACKTRACK_CAP = 1000000; + MEMO_CAPACITY = 65536; + MEMO_LOAD_LIMIT = 49152; + +type + TBacktrackEntry = record + PC: Integer; + InputPos: Integer; + Slots: array of Integer; + end; + + TMemoEntry = record + Occupied: Boolean; + PC: Integer; + InputPos: Integer; + end; + + TMemoTable = record + Entries: array of TMemoEntry; + Count: Integer; + end; + 
+procedure MemoInit(var AMemo: TMemoTable); +begin + SetLength(AMemo.Entries, MEMO_CAPACITY); + AMemo.Count := 0; +end; + +function MemoHash(APC, APos: Integer): Integer; inline; +var + H: UInt32; +begin + H := UInt32(APC) * 2654435761; + H := H xor (UInt32(APos) * 2246822519); + Result := Integer(H and (MEMO_CAPACITY - 1)); +end; + +function MemoContains(var AMemo: TMemoTable; APC, APos: Integer): Boolean; +var + Idx, I: Integer; +begin + Idx := MemoHash(APC, APos); + for I := 0 to 15 do + begin + if not AMemo.Entries[Idx].Occupied then + Exit(False); + if (AMemo.Entries[Idx].PC = APC) and (AMemo.Entries[Idx].InputPos = APos) then + Exit(True); + Idx := (Idx + 1) and (MEMO_CAPACITY - 1); + end; + Result := False; +end; + +procedure MemoAdd(var AMemo: TMemoTable; APC, APos: Integer); +var + Idx, I: Integer; +begin + if AMemo.Count >= MEMO_LOAD_LIMIT then + Exit; + Idx := MemoHash(APC, APos); + for I := 0 to 15 do + begin + if not AMemo.Entries[Idx].Occupied then + begin + AMemo.Entries[Idx].Occupied := True; + AMemo.Entries[Idx].PC := APC; + AMemo.Entries[Idx].InputPos := APos; + Inc(AMemo.Count); + Exit; + end; + if (AMemo.Entries[Idx].PC = APC) and (AMemo.Entries[Idx].InputPos = APos) then + Exit; + Idx := (Idx + 1) and (MEMO_CAPACITY - 1); + end; +end; + +function CharClassContains(const AClass: TRegExpCharClass; + ACodePoint: Cardinal): Boolean; +var + Lo, Hi, Mid: Integer; +begin + Lo := 0; + Hi := High(AClass.Ranges); + while Lo <= Hi do + begin + Mid := (Lo + Hi) shr 1; + if ACodePoint < AClass.Ranges[Mid].Lo then + Hi := Mid - 1 + else if ACodePoint > AClass.Ranges[Mid].Hi then + Lo := Mid + 1 + else + Exit(True); + end; + Result := False; +end; + +function CharClassContainsLinear(const AClass: TRegExpCharClass; + ACodePoint: Cardinal): Boolean; +var + I: Integer; +begin + for I := 0 to High(AClass.Ranges) do + if (ACodePoint >= AClass.Ranges[I].Lo) and + (ACodePoint <= AClass.Ranges[I].Hi) then + Exit(True); + Result := False; +end; + +function 
IsWordChar(ACodePoint: Cardinal): Boolean; inline; +begin + Result := ((ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z'))) or + ((ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z'))) or + ((ACodePoint >= Ord('0')) and (ACodePoint <= Ord('9'))) or + (ACodePoint = Ord('_')); +end; + +function IsLineTerminator(ACodePoint: Cardinal): Boolean; inline; +begin + Result := (ACodePoint = $0A) or (ACodePoint = $0D) or + (ACodePoint = $2028) or (ACodePoint = $2029); +end; + +function ReadInputCodePoint(const AInput: string; APos: Integer; + AUnicode: Boolean; out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; +begin + if (APos < 1) or (APos > Length(AInput)) then + begin + ACodePoint := 0; + AByteLen := 0; + Exit(False); + end; + if AUnicode then + begin + Result := TryReadUTF8CodePoint(AInput, APos, ACodePoint, AByteLen); + if not Result then + begin + ACodePoint := Ord(AInput[APos]); + AByteLen := 1; + Result := True; + end; + end + else + begin + ACodePoint := Ord(AInput[APos]); + AByteLen := 1; + Result := True; + end; +end; + +function GetCodePointBefore(const AInput: string; APos: Integer; + AUnicode: Boolean; out ACodePoint: Cardinal): Boolean; +var + StartPos, ByteLen: Integer; +begin + Result := False; + ACodePoint := 0; + if APos <= 1 then + Exit; + if not AUnicode then + begin + ACodePoint := Ord(AInput[APos - 1]); + Exit(True); + end; + StartPos := APos - 1; + while (StartPos > 1) and ((Ord(AInput[StartPos]) and $C0) = $80) do + Dec(StartPos); + Result := TryReadUTF8CodePoint(AInput, StartPos, ACodePoint, ByteLen); +end; + +function RunVM(const AProgram: TRegExpProgram; const AInput: string; + AStartPos: Integer; var ASlots: array of Integer; + ASlotCount: Integer): Boolean; +var + PC, InputPos: Integer; + Instr: UInt32; + Op: TRegExpOpCode; + Bx: Integer; + CodePoint: Cardinal; + ByteLen: Integer; + Stack: array of TBacktrackEntry; + StackTop: Integer; + StepCount: Integer; + Memo: TMemoTable; + SlotCount: Integer; + I: Integer; + MatchCP: Cardinal; + 
BeforeCP: Cardinal; + BeforeIsWord, AfterIsWord: Boolean; + Negated: Boolean; + BackrefGroup: Integer; + LookEnd: Integer; + LookSlots: array of Integer; + LookMatched: Boolean; + RefStart, RefEnd, RefPos: Integer; + RefCP, InputCP: Cardinal; + RefByteLen, InputByteLen: Integer; + + procedure PushBacktrack(APC, AInputPos: Integer); + begin + if StackTop >= DEFAULT_BACKTRACK_CAP then + raise EConvertError.Create('Maximum regexp backtrack stack size exceeded'); + Inc(StackTop); + if StackTop >= Length(Stack) then + SetLength(Stack, StackTop * 2 + 16); + Stack[StackTop].PC := APC; + Stack[StackTop].InputPos := AInputPos; + SetLength(Stack[StackTop].Slots, SlotCount); + if SlotCount > 0 then + Move(ASlots[0], Stack[StackTop].Slots[0], SlotCount * SizeOf(Integer)); + end; + + function PopBacktrack: Boolean; + begin + while StackTop >= 0 do + begin + PC := Stack[StackTop].PC; + InputPos := Stack[StackTop].InputPos; + if SlotCount > 0 then + Move(Stack[StackTop].Slots[0], ASlots[0], SlotCount * SizeOf(Integer)); + Dec(StackTop); + if not MemoContains(Memo, PC, InputPos) then + Exit(True); + end; + Result := False; + end; + +begin + Result := False; + SlotCount := ASlotCount; + PC := 0; + InputPos := AStartPos; + StepCount := 0; + StackTop := -1; + SetLength(Stack, 256); + MemoInit(Memo); + + while PC < Length(AProgram.Code) do + begin + Inc(StepCount); + if StepCount > DEFAULT_STEP_LIMIT then + raise EConvertError.Create('Maximum regexp backtrack stack size exceeded'); + + Instr := AProgram.Code[PC]; + Op := TRegExpOpCode(Instr and $FF); + Bx := Integer(Instr shr 8); + + case Op of + RX_CHAR: + begin + if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + MatchCP := Cardinal(Bx); + if CodePoint <> MatchCP then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; 
+ + RX_CHAR_CLASS: + begin + if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if not CharClassContainsLinear(AProgram.CharClasses[Bx], CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_CHAR_CLASS_NEG: + begin + if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if CharClassContainsLinear(AProgram.CharClasses[Bx], CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_ANY: + begin + if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if (Bx = 0) and IsLineTerminator(CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_SPLIT: + begin + if not MemoContains(Memo, Bx, InputPos) then + PushBacktrack(Bx, InputPos); + Inc(PC); + end; + + RX_SPLIT_LAZY: + begin + if not MemoContains(Memo, PC + 1, InputPos) then + PushBacktrack(PC + 1, InputPos); + PC := Bx; + end; + + RX_JUMP: + PC := Bx; + + RX_SAVE: + begin + if Bx < SlotCount then + ASlots[Bx] := InputPos; + Inc(PC); + end; + + RX_BACKREF: + begin + Negated := (Bx and $800000) <> 0; + BackrefGroup := Bx and $7FFFFF; + RefStart := -1; + RefEnd := -1; + if (BackrefGroup * 2) < SlotCount then + RefStart := ASlots[BackrefGroup * 2]; + if (BackrefGroup * 2 + 1) < SlotCount then + RefEnd := ASlots[BackrefGroup * 2 + 1]; + if (RefStart < 0) or (RefEnd < 0) or (RefStart > RefEnd) then + begin + if Negated then + begin + 
MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(PC); + Continue; + end; + RefPos := RefStart; + LookMatched := True; + while RefPos < RefEnd do + begin + if not ReadInputCodePoint(AInput, RefPos, AProgram.FlagUnicode, + RefCP, RefByteLen) then + begin + LookMatched := False; + Break; + end; + if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + InputCP, InputByteLen) then + begin + LookMatched := False; + Break; + end; + if RefCP <> InputCP then + begin + LookMatched := False; + Break; + end; + Inc(RefPos, RefByteLen); + Inc(InputPos, InputByteLen); + end; + if not LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(PC); + end; + + RX_ASSERT_START: + begin + if Bx <> 0 then + begin + if (InputPos > 1) and not IsLineTerminator(Ord(AInput[InputPos - 1])) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if InputPos > 1 then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_ASSERT_END: + begin + if Bx <> 0 then + begin + if (InputPos <= Length(AInput)) and + not IsLineTerminator(Ord(AInput[InputPos])) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if InputPos <= Length(AInput) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_ASSERT_WORD: + begin + Negated := Bx <> 0; + BeforeIsWord := False; + AfterIsWord := False; + if GetCodePointBefore(AInput, InputPos, AProgram.FlagUnicode, BeforeCP) then + BeforeIsWord := IsWordChar(BeforeCP); + if ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then + AfterIsWord := IsWordChar(CodePoint); + if Negated then + begin + if BeforeIsWord <> AfterIsWord then + begin + MemoAdd(Memo, PC, InputPos); + if 
not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if BeforeIsWord = AfterIsWord then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_LOOKAHEAD: + begin + Negated := (Instr and $80) <> 0; + LookEnd := Bx; + SetLength(LookSlots, SlotCount); + Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); + LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, + SlotCount); + if Negated then + begin + if LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if not LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Move(LookSlots[0], ASlots[0], SlotCount * SizeOf(Integer)); + end; + PC := LookEnd; + end; + + RX_LOOKBEHIND: + begin + Negated := (Instr and $80) <> 0; + LookEnd := Bx; + LookMatched := False; + SetLength(LookSlots, SlotCount); + I := InputPos - 1; + while I >= 1 do + begin + Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); + if RunVM(AProgram, AInput, I, LookSlots, SlotCount) then + begin + if LookSlots[1] = InputPos then + begin + LookMatched := True; + Break; + end; + end; + Dec(I); + end; + if Negated then + begin + if LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if not LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Move(LookSlots[0], ASlots[0], SlotCount * SizeOf(Integer)); + end; + PC := LookEnd; + end; + + RX_MATCH: + begin + Result := True; + Exit; + end; + + RX_FAIL: + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + else + Inc(PC); + end; + end; +end; + +function ExecuteRegExpVM(const AProgram: TRegExpProgram; + const AInput: string; const AStartIndex: Integer; + const ARequireStart: Boolean; out AResult: 
TRegExpVMResult): Boolean; +var + SlotCount, I, StartPos: Integer; + Slots: array of Integer; + ByteLen: Integer; + CodePoint: Cardinal; +begin + Result := False; + AResult.Matched := False; + SlotCount := (AProgram.CaptureCount + 1) * 2; + SetLength(Slots, SlotCount); + StartPos := AStartIndex + 1; + if ARequireStart then + begin + for I := 0 to SlotCount - 1 do + Slots[I] := -1; + if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then + begin + AResult.Matched := True; + SetLength(AResult.CaptureSlots, SlotCount); + Move(Slots[0], AResult.CaptureSlots[0], SlotCount * SizeOf(Integer)); + Result := True; + end; + Exit; + end; + while StartPos <= Length(AInput) + 1 do + begin + for I := 0 to SlotCount - 1 do + Slots[I] := -1; + if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then + begin + AResult.Matched := True; + SetLength(AResult.CaptureSlots, SlotCount); + Move(Slots[0], AResult.CaptureSlots[0], SlotCount * SizeOf(Integer)); + Result := True; + Exit; + end; + if StartPos > Length(AInput) then + Break; + if AProgram.FlagUnicode then + begin + if TryReadUTF8CodePoint(AInput, StartPos, CodePoint, ByteLen) then + Inc(StartPos, ByteLen) + else + Inc(StartPos); + end + else + Inc(StartPos); + end; +end; + +end. From eab29d32979bd32d032a91b4e80628dd6fc1d0b9 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 17:39:18 +0100 Subject: [PATCH 02/15] Fix range check: replace wrapping-multiply hash with shift-xor hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memoization hash used Knuth multiplicative constants that overflow Cardinal, requiring {$Q-}{$R-} suppression. Replace with a shift-xor hash that stays within Cardinal range — no compiler flags needed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.VM.pas | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 74aef906..22fe4027 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -19,8 +19,6 @@ function ExecuteRegExpVM(const AProgram: TRegExpProgram; implementation -{$rangechecks off} - uses SysUtils, @@ -58,10 +56,11 @@ procedure MemoInit(var AMemo: TMemoTable); function MemoHash(APC, APos: Integer): Integer; inline; var - H: UInt32; + H: Cardinal; begin - H := UInt32(APC) * 2654435761; - H := H xor (UInt32(APos) * 2246822519); + H := Cardinal(APC); + H := (H shl 5) xor (H shr 3) xor Cardinal(APos); + H := H xor (H shr 7) xor (H shr 15); Result := Integer(H and (MEMO_CAPACITY - 1)); end; From 0057dc114b7bf85a9d7ba99790fd0b58d41805fc Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 18:37:04 +0100 Subject: [PATCH 03/15] Fix VM assertion and code point handling for multi-byte UTF-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs found via test262: 1. RX_ASSERT_START/END read raw bytes via Ord(AInput[Pos]) instead of decoding full UTF-8 code points. Multi-byte line terminators (U+2028, U+2029) were never recognized, and accessing continuation bytes triggered range check errors. Fix: use GetCodePointBefore and ReadInputCodePoint for proper UTF-8 decoding. 2. ReadInputCodePoint only decoded UTF-8 when the unicode flag was set. Without /u, multi-byte BMP characters (U+0085, U+2028, etc.) were read as single bytes, causing . to match one byte instead of one code point. Fix: always decode UTF-8 regardless of flag — the unicode flag only affects supplementary plane advancement in the scanner loop. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.VM.pas | 39 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 22fe4027..6670b3de 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -158,17 +158,8 @@ function ReadInputCodePoint(const AInput: string; APos: Integer; AByteLen := 0; Exit(False); end; - if AUnicode then - begin - Result := TryReadUTF8CodePoint(AInput, APos, ACodePoint, AByteLen); - if not Result then - begin - ACodePoint := Ord(AInput[APos]); - AByteLen := 1; - Result := True; - end; - end - else + Result := TryReadUTF8CodePoint(AInput, APos, ACodePoint, AByteLen); + if not Result then begin ACodePoint := Ord(AInput[APos]); AByteLen := 1; @@ -433,11 +424,16 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin if Bx <> 0 then begin - if (InputPos > 1) and not IsLineTerminator(Ord(AInput[InputPos - 1])) then + if InputPos > 1 then begin - MemoAdd(Memo, PC, InputPos); - if not PopBacktrack then Exit; - Continue; + if not GetCodePointBefore(AInput, InputPos, + AProgram.FlagUnicode, BeforeCP) or + not IsLineTerminator(BeforeCP) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; end; end else @@ -456,12 +452,15 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin if Bx <> 0 then begin - if (InputPos <= Length(AInput)) and - not IsLineTerminator(Ord(AInput[InputPos])) then + if ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + CodePoint, ByteLen) then begin - MemoAdd(Memo, PC, InputPos); - if not PopBacktrack then Exit; - Continue; + if not IsLineTerminator(CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; end; end else From e0eb715a92dc25fd75fef814e276257899dc9892 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: 
Fri, 8 May 2026 18:50:56 +0100 Subject: [PATCH 04/15] Accessor properties, error distinction, and test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Convert RegExp flag properties (source, flags, global, ignoreCase, multiline, dotAll, unicode, sticky, unicodeSets, hasIndices) from per-instance data properties to spec-correct accessor getters on RegExp.prototype (ES2026 §22.2.6). Accessing them on the prototype itself returns undefined (or '(?:)' for source, '' for flags) per spec. - Introduce ERegExpRuntimeError exception class so the regex VM's backtrack limit error is distinguishable from GocciaScript VM errors. Runtime.pas catches it and re-throws as a proper JS Error via ThrowError. - Add JS test coverage: dotAll with multi-byte BMP characters, dot rejecting multi-byte line terminators, multiline anchors with multi-byte context, catastrophic backtracking throws Error, and large input regression test for #515. Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.Builtins.GlobalRegExp.pas | 163 +++++++++++++++++- source/units/Goccia.RegExp.Runtime.pas | 44 ++--- source/units/Goccia.RegExp.VM.pas | 10 +- tests/built-ins/RegExp/prototype/exec.js | 15 ++ tests/built-ins/RegExp/unicode.js | 29 ++++ 5 files changed, 225 insertions(+), 36 deletions(-) diff --git a/source/units/Goccia.Builtins.GlobalRegExp.pas b/source/units/Goccia.Builtins.GlobalRegExp.pas index 7e633adb..dd423594 100644 --- a/source/units/Goccia.Builtins.GlobalRegExp.pas +++ b/source/units/Goccia.Builtins.GlobalRegExp.pas @@ -46,6 +46,26 @@ TGocciaGlobalRegExp = class(TGocciaBuiltin) const AThisValue: TGocciaValue): TGocciaValue; function RegExpSymbolSplit(const AArgs: TGocciaArgumentsCollection; const AThisValue: TGocciaValue): TGocciaValue; + function RegExpSourceGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpFlagsGetter(const AArgs: TGocciaArgumentsCollection; + 
const AThisValue: TGocciaValue): TGocciaValue; + function RegExpGlobalGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpIgnoreCaseGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpMultilineGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpDotAllGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpUnicodeGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpStickyGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpUnicodeSetsGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpHasIndicesGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; public constructor Create(const AName: string; const AScope: TGocciaScope; const AThrowError: TGocciaThrowErrorCallback; @@ -63,6 +83,7 @@ implementation Goccia.Error.Messages, Goccia.Error.Suggestions, Goccia.GarbageCollector, + Goccia.RegExp.Engine, Goccia.RegExp.Runtime, Goccia.Utils, Goccia.Values.ArrayValue, @@ -315,6 +336,26 @@ constructor TGocciaGlobalRegExp.Create(const AName: string; Members.AddSymbolMethod(TGocciaSymbolValue.WellKnownSplit, '[Symbol.split]', RegExpSymbolSplit, 2, [pfConfigurable, pfWritable], [gmfNoFunctionPrototype]); + Members.AddAccessor(PROP_SOURCE, RegExpSourceGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_FLAGS, RegExpFlagsGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_GLOBAL, RegExpGlobalGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_IGNORE_CASE, RegExpIgnoreCaseGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_MULTILINE, RegExpMultilineGetter, nil, + [pfConfigurable]); + 
Members.AddAccessor(PROP_DOT_ALL, RegExpDotAllGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_UNICODE, RegExpUnicodeGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_STICKY, RegExpStickyGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_UNICODE_SETS, RegExpUnicodeSetsGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_HAS_INDICES, RegExpHasIndicesGetter, nil, + [pfConfigurable]); Members.AddSymbolDataProperty(TGocciaSymbolValue.WellKnownToStringTag, TGocciaStringLiteralValue.Create(CONSTRUCTOR_REGEXP), [pfConfigurable]); FPrototypeMembers := Members.ToDefinitions; @@ -353,7 +394,127 @@ constructor TGocciaGlobalRegExp.Create(const AName: string; AScope.DefineLexicalBinding(AName, FRegExpConstructor, dtConst, True); end; -// ES2026 §22.2.4.2 get RegExp [ @@species ] +function RequireRegExpThis(const AThisValue: TGocciaValue; + const AMethodName: string): TGocciaObjectValue; +begin + if not IsRegExpValue(AThisValue) then + ThrowTypeError(AMethodName + ' requires a RegExp object'); + Result := TGocciaObjectValue(AThisValue); +end; + +function GetRegExpInternalFlags(const AObj: TGocciaObjectValue): string; +begin + Result := AObj.GetProperty(PROP_FLAGS).ToStringLiteral.Value; +end; + +function TGocciaGlobalRegExp.RegExpSourceGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaStringLiteralValue.Create('(?:)')); + Result := RequireRegExpThis(AThisValue, 'get RegExp.prototype.source') + .GetProperty(PROP_SOURCE); +end; + +function TGocciaGlobalRegExp.RegExpFlagsGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaStringLiteralValue.Create('')); + Result := RequireRegExpThis(AThisValue, 'get RegExp.prototype.flags') + .GetProperty(PROP_FLAGS); +end; + +function TGocciaGlobalRegExp.RegExpGlobalGetter( + const 
AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.global')), 'g')); +end; + +function TGocciaGlobalRegExp.RegExpIgnoreCaseGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.ignoreCase')), 'i')); +end; + +function TGocciaGlobalRegExp.RegExpMultilineGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.multiline')), 'm')); +end; + +function TGocciaGlobalRegExp.RegExpDotAllGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.dotAll')), 's')); +end; + +function TGocciaGlobalRegExp.RegExpUnicodeGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.unicode')), 
'u')); +end; + +function TGocciaGlobalRegExp.RegExpStickyGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.sticky')), 'y')); +end; + +function TGocciaGlobalRegExp.RegExpUnicodeSetsGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.unicodeSets')), 'v')); +end; + +function TGocciaGlobalRegExp.RegExpHasIndicesGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.hasIndices')), 'd')); +end; + function TGocciaGlobalRegExp.RegExpSpeciesGetter( const AArgs: TGocciaArgumentsCollection; const AThisValue: TGocciaValue): TGocciaValue; diff --git a/source/units/Goccia.RegExp.Runtime.pas b/source/units/Goccia.RegExp.Runtime.pas index 7add209d..5fc4efe2 100644 --- a/source/units/Goccia.RegExp.Runtime.pas +++ b/source/units/Goccia.RegExp.Runtime.pas @@ -29,6 +29,7 @@ implementation SysUtils, Goccia.Constants.PropertyNames, + Goccia.RegExp.VM, Goccia.Values.ArrayValue, Goccia.Values.ErrorHelper, Goccia.Values.ObjectPropertyDescriptor, @@ -153,30 +154,6 @@ function CreateRegExpObject(const APattern, AFlags: string): TGocciaValue; Obj.DefineProperty(PROP_LAST_INDEX, TGocciaPropertyDescriptorData.Create( 
TGocciaNumberLiteralValue.Create(0), [pfWritable])); - Obj.DefineProperty(PROP_GLOBAL, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'g')), [])); - Obj.DefineProperty(PROP_IGNORE_CASE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'i')), [])); - Obj.DefineProperty(PROP_MULTILINE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'm')), [])); - Obj.DefineProperty(PROP_DOT_ALL, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 's')), [])); - Obj.DefineProperty(PROP_UNICODE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'u')), [])); - Obj.DefineProperty(PROP_STICKY, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'y')), [])); - Obj.DefineProperty(PROP_UNICODE_SETS, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'v')), [])); - Obj.DefineProperty(PROP_HAS_INDICES, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'd')), [])); Result := Obj; end; @@ -215,13 +192,18 @@ function MatchRegExpObject(const AValue: TGocciaValue; const AInput: string; ShouldUpdate: Boolean; begin Obj := TGocciaObjectValue(AValue); - Result := ExecuteRegExp( - GetStringProperty(Obj, PROP_SOURCE), - GetStringProperty(Obj, PROP_FLAGS), - AInput, - AStartIndex, - ARequireStart, - MatchResult); + try + Result := ExecuteRegExp( + GetStringProperty(Obj, PROP_SOURCE), + GetStringProperty(Obj, PROP_FLAGS), + AInput, + AStartIndex, + ARequireStart, + MatchResult); + except + on E: ERegExpRuntimeError do + ThrowError(E.Message); + end; ShouldUpdate := AUpdateLastIndex and (GetBooleanProperty(Obj, PROP_GLOBAL) or GetBooleanProperty(Obj, PROP_STICKY)); 
diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 6670b3de..88e50f47 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -5,9 +5,13 @@ interface uses + SysUtils, + Goccia.RegExp.Compiler; type + ERegExpRuntimeError = class(Exception); + TRegExpVMResult = record Matched: Boolean; CaptureSlots: array of Integer; @@ -20,8 +24,6 @@ function ExecuteRegExpVM(const AProgram: TRegExpProgram; implementation uses - SysUtils, - TextSemantics; const @@ -218,7 +220,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; procedure PushBacktrack(APC, AInputPos: Integer); begin if StackTop >= DEFAULT_BACKTRACK_CAP then - raise EConvertError.Create('Maximum regexp backtrack stack size exceeded'); + raise ERegExpRuntimeError.Create('Maximum regular expression backtrack stack size exceeded'); Inc(StackTop); if StackTop >= Length(Stack) then SetLength(Stack, StackTop * 2 + 16); @@ -258,7 +260,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin Inc(StepCount); if StepCount > DEFAULT_STEP_LIMIT then - raise EConvertError.Create('Maximum regexp backtrack stack size exceeded'); + raise ERegExpRuntimeError.Create('Maximum regular expression backtrack stack size exceeded'); Instr := AProgram.Code[PC]; Op := TRegExpOpCode(Instr and $FF); diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 27a29d4b..6ea2dd0a 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -194,3 +194,18 @@ test("exec with duplicate named backreference outside the disjunction", () => { expect(re.exec("b")).toBe(null); expect(re.exec("ab")).toBe(null); }); + +// --- Backtrack limit --- + +test("catastrophic backtracking throws Error instead of hanging", () => { + expect(() => { + /^(a+)+$/.exec("a".repeat(30) + "b"); + }).toThrow(Error); +}); + +// --- Large input (#515 regression) --- + +test("exec on 
large input does not crash", () => { + const s = "foo" + ".bar".repeat(20000); + expect(/f.*/.test(s)).toBe(true); +}); diff --git a/tests/built-ins/RegExp/unicode.js b/tests/built-ins/RegExp/unicode.js index 64f5b119..9f66f57d 100644 --- a/tests/built-ins/RegExp/unicode.js +++ b/tests/built-ins/RegExp/unicode.js @@ -199,3 +199,32 @@ test("\\p{Separator}", () => { test("\\p{Z} shorthand for Separator", () => { expect(new RegExp("\\p{Z}", "u").test(" ")).toBe(true); }); + +// --- Multi-byte UTF-8 code point handling --- + +test("dot matches multi-byte BMP characters in dotAll mode", () => { + expect(/^.$/s.test("
")).toBe(true); + expect(/^.$/s.test("
")).toBe(true); + expect(/^.$/s.test("…")).toBe(true); +}); + +test("dot rejects line terminators without dotAll", () => { + expect(/^.$/.test("
")).toBe(false); + expect(/^.$/.test("
")).toBe(false); + expect(/^.$/.test("\n")).toBe(false); + expect(/^.$/.test("\r")).toBe(false); +}); + +test("multiline ^ matches after newline in multi-byte context", () => { + expect(/^abc/m.test("xyz +abc")).toBe(true); + expect(/^abc/m.test("é +abc")).toBe(true); +}); + +test("multiline $ matches before newline in multi-byte context", () => { + expect(/abc$/m.test("abc +xyz")).toBe(true); + expect(/abc$/m.test("abc +é")).toBe(true); +}); From 486ff6508421431bc3ec57b7367865edfdc69aa3 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 21:03:29 +0100 Subject: [PATCH 05/15] Fix regex VM always decoding UTF-8 code points and handling surrogates The scanner loop and GetCodePointBefore only decoded UTF-8 when the unicode flag was set. Without /u, multi-byte BMP characters (U+1680, U+2000-200A, U+2028, U+2029, etc.) were read as individual bytes, causing \S, \s, \b, ^, $ and . to misclassify them. This broke test262's character-class-escape-non-whitespace test on CI. Fix: always decode UTF-8 via TryReadUTF8CodePointAllowSurrogates (which also handles lone surrogates correctly). Remove the now-unused AUnicode parameter from ReadInputCodePoint and GetCodePointBefore. The scanner loop also always advances by code point now. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.VM.pas | 45 +++++++++++++------------------ 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 88e50f47..e3f4018c 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -152,7 +152,7 @@ function IsLineTerminator(ACodePoint: Cardinal): Boolean; inline; end; function ReadInputCodePoint(const AInput: string; APos: Integer; - AUnicode: Boolean; out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; + out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; begin if (APos < 1) or (APos > Length(AInput)) then begin @@ -160,7 +160,8 @@ function ReadInputCodePoint(const AInput: string; APos: Integer; AByteLen := 0; Exit(False); end; - Result := TryReadUTF8CodePoint(AInput, APos, ACodePoint, AByteLen); + Result := TryReadUTF8CodePointAllowSurrogates(AInput, APos, ACodePoint, + AByteLen); if not Result then begin ACodePoint := Ord(AInput[APos]); @@ -170,7 +171,7 @@ function ReadInputCodePoint(const AInput: string; APos: Integer; end; function GetCodePointBefore(const AInput: string; APos: Integer; - AUnicode: Boolean; out ACodePoint: Cardinal): Boolean; + out ACodePoint: Cardinal): Boolean; var StartPos, ByteLen: Integer; begin @@ -178,15 +179,11 @@ function GetCodePointBefore(const AInput: string; APos: Integer; ACodePoint := 0; if APos <= 1 then Exit; - if not AUnicode then - begin - ACodePoint := Ord(AInput[APos - 1]); - Exit(True); - end; StartPos := APos - 1; while (StartPos > 1) and ((Ord(AInput[StartPos]) and $C0) = $80) do Dec(StartPos); - Result := TryReadUTF8CodePoint(AInput, StartPos, ACodePoint, ByteLen); + Result := TryReadUTF8CodePointAllowSurrogates(AInput, StartPos, ACodePoint, + ByteLen); end; function RunVM(const AProgram: TRegExpProgram; const AInput: string; @@ -269,7 +266,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; case Op 
of RX_CHAR: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); @@ -289,7 +286,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_CHAR_CLASS: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); @@ -308,7 +305,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_CHAR_CLASS_NEG: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); @@ -327,7 +324,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_ANY: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); @@ -393,13 +390,13 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; LookMatched := True; while RefPos < RefEnd do begin - if not ReadInputCodePoint(AInput, RefPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, RefPos, RefCP, RefByteLen) then begin LookMatched := False; Break; end; - if not ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if not ReadInputCodePoint(AInput, InputPos, InputCP, InputByteLen) then begin LookMatched := False; @@ -428,8 +425,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin if InputPos > 1 then begin - if not GetCodePointBefore(AInput, InputPos, - AProgram.FlagUnicode, BeforeCP) or + if not GetCodePointBefore(AInput, InputPos, BeforeCP) or not IsLineTerminator(BeforeCP) then begin MemoAdd(Memo, PC, InputPos); @@ -454,7 +450,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin if Bx <> 0 then begin - if 
ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then begin if not IsLineTerminator(CodePoint) then @@ -482,9 +478,9 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; Negated := Bx <> 0; BeforeIsWord := False; AfterIsWord := False; - if GetCodePointBefore(AInput, InputPos, AProgram.FlagUnicode, BeforeCP) then + if GetCodePointBefore(AInput, InputPos, BeforeCP) then BeforeIsWord := IsWordChar(BeforeCP); - if ReadInputCodePoint(AInput, InputPos, AProgram.FlagUnicode, + if ReadInputCodePoint(AInput, InputPos, CodePoint, ByteLen) then AfterIsWord := IsWordChar(CodePoint); if Negated then @@ -639,13 +635,8 @@ function ExecuteRegExpVM(const AProgram: TRegExpProgram; end; if StartPos > Length(AInput) then Break; - if AProgram.FlagUnicode then - begin - if TryReadUTF8CodePoint(AInput, StartPos, CodePoint, ByteLen) then - Inc(StartPos, ByteLen) - else - Inc(StartPos); - end + if TryReadUTF8CodePointAllowSurrogates(AInput, StartPos, CodePoint, ByteLen) then + Inc(StartPos, ByteLen) else Inc(StartPos); end; From 814f63ab35e3bdd5d59060dbc34a573da1b24ba5 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 21:56:30 +0100 Subject: [PATCH 06/15] Fix lookbehind executing entire program instead of body only The lookbehind handler called RunVM starting from PC=0, which re-executed the entire regex pattern (including .*, quantifiers, etc.) instead of just the lookbehind body. This caused exponential blowup even on tiny inputs like "xabcd".match(/.*(?<=(..|...|....))(.*)/), hanging the test262 CI runner. Fix: RunVM now accepts AStartPC and AEndPos parameters. Lookahead and lookbehind pass PC+1 as the start (skipping the assertion instruction to execute only the body up to RX_MATCH). Lookbehind uses AEndPos to check where the sub-match ended rather than checking capture slots. 
Also bounds the lookbehind scan distance to MAX_LOOKBEHIND_DISTANCE (256 positions) to prevent O(n) RunVM calls on large inputs. Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.VM.pas | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index e3f4018c..e2db47bc 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -29,6 +29,7 @@ implementation const DEFAULT_STEP_LIMIT = 10000000; DEFAULT_BACKTRACK_CAP = 1000000; + MAX_LOOKBEHIND_DISTANCE = 256; MEMO_CAPACITY = 65536; MEMO_LOAD_LIMIT = 49152; @@ -188,7 +189,8 @@ function GetCodePointBefore(const AInput: string; APos: Integer; function RunVM(const AProgram: TRegExpProgram; const AInput: string; AStartPos: Integer; var ASlots: array of Integer; - ASlotCount: Integer): Boolean; + ASlotCount: Integer; AStartPC: Integer = 0; + AEndPos: PInteger = nil): Boolean; var PC, InputPos: Integer; Instr: UInt32; @@ -246,7 +248,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; begin Result := False; SlotCount := ASlotCount; - PC := 0; + PC := AStartPC; InputPos := AStartPos; StepCount := 0; StackTop := -1; @@ -511,7 +513,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; SetLength(LookSlots, SlotCount); Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, - SlotCount); + SlotCount, PC + 1); if Negated then begin if LookMatched then @@ -541,12 +543,17 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; LookMatched := False; SetLength(LookSlots, SlotCount); I := InputPos - 1; - while I >= 1 do + RefStart := I - MAX_LOOKBEHIND_DISTANCE; + if RefStart < 1 then + RefStart := 1; + while I >= RefStart do begin Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); - if RunVM(AProgram, AInput, I, LookSlots, SlotCount) then + RefEnd := 
0; + if RunVM(AProgram, AInput, I, LookSlots, SlotCount, PC + 1, + @RefEnd) then begin - if LookSlots[1] = InputPos then + if RefEnd = InputPos then begin LookMatched := True; Break; @@ -578,6 +585,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_MATCH: begin + if AEndPos <> nil then + AEndPos^ := InputPos; Result := True; Exit; end; From 14a2b07b4bdd8c7eb13d1ccc48b9e04340b3e6b8 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 22:09:50 +0100 Subject: [PATCH 07/15] Fix negative lookahead/lookbehind and add assertion tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The negation flag for (?!...) and (? --- source/units/Goccia.RegExp.Compiler.pas | 12 ++++-- source/units/Goccia.RegExp.VM.pas | 8 ++-- tests/built-ins/RegExp/prototype/exec.js | 49 ++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 2b01b9ab..3345ae5a 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -645,6 +645,7 @@ function TRegExpCompiler.ParseDecimalEscape: Integer; const BACKREF_STRICT_FLAG = $800000; + LOOK_NEGATED_FLAG = $800000; procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string); var @@ -993,7 +994,7 @@ procedure TRegExpCompiler.CompileGroup; raise EConvertError.Create('Unterminated negative lookahead'); Emit(EncodeOp(RX_MATCH)); PatchHole(SplitHole, CurrentPC); - FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC) or $80; + FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC or LOOK_NEGATED_FLAG); end else if Match('<') then begin @@ -1017,7 +1018,7 @@ procedure TRegExpCompiler.CompileGroup; raise EConvertError.Create('Unterminated negative lookbehind'); Emit(EncodeOp(RX_MATCH)); PatchHole(SplitHole, CurrentPC); - FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, CurrentPC) or $80; + FCode[SplitHole] := 
EncodeOpBx(RX_LOOKBEHIND, CurrentPC or LOOK_NEGATED_FLAG); end else begin @@ -1229,6 +1230,7 @@ procedure TRegExpCompiler.InsertSplitAt(APos: Integer); I: Integer; Op: TRegExpOpCode; Bx: Integer; + Negated: Boolean; begin EnsureCodeCapacity(1); Move(FCode[APos], FCode[APos + 1], (FCodeLen - APos) * SizeOf(UInt32)); @@ -1250,10 +1252,14 @@ procedure TRegExpCompiler.InsertSplitAt(APos: Integer); RX_LOOKAHEAD, RX_LOOKBEHIND: begin Bx := Integer(FCode[I] shr 8); + Negated := (Bx and $800000) <> 0; + Bx := Bx and $7FFFFF; if Bx >= APos then begin Inc(Bx); - FCode[I] := (FCode[I] and $80FF) or (UInt32(Bx) shl 8); + if Negated then + Bx := Bx or $800000; + FCode[I] := EncodeOpBx(Op, Bx); end; end; end; diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index e2db47bc..f7a30c07 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -508,8 +508,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_LOOKAHEAD: begin - Negated := (Instr and $80) <> 0; - LookEnd := Bx; + Negated := (Bx and $800000) <> 0; + LookEnd := Bx and $7FFFFF; SetLength(LookSlots, SlotCount); Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, @@ -538,8 +538,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_LOOKBEHIND: begin - Negated := (Instr and $80) <> 0; - LookEnd := Bx; + Negated := (Bx and $800000) <> 0; + LookEnd := Bx and $7FFFFF; LookMatched := False; SetLength(LookSlots, SlotCount); I := InputPos - 1; diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 6ea2dd0a..29641a71 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -209,3 +209,52 @@ test("exec on large input does not crash", () => { const s = "foo" + ".bar".repeat(20000); expect(/f.*/.test(s)).toBe(true); }); + +// --- Lookahead --- + +test("positive 
lookahead matches without consuming", () => { + const m = /foo(?=bar)/.exec("foobar"); + expect(m[0]).toBe("foo"); + expect(m.index).toBe(0); +}); + +test("negative lookahead rejects when pattern present", () => { + expect(/foo(?!bar)/.test("foobar")).toBe(false); + expect(/foo(?!bar)/.test("foobaz")).toBe(true); +}); + +// --- Lookbehind --- + +test("positive lookbehind matches fixed-length pattern", () => { + const m = /(?<=foo)bar/.exec("foobar"); + expect(m[0]).toBe("bar"); + expect(m.index).toBe(3); +}); + +test("positive lookbehind fails when prefix absent", () => { + expect(/(?<=foo)bar/.test("bazbar")).toBe(false); +}); + +test("negative lookbehind rejects when pattern present", () => { + expect(/(? { + const m = "xabcd".match(/.*(?<=(..|...|....))(.*)/); + expect(m[0]).toBe("xabcd"); + expect(m[1]).toBe("cd"); + expect(m[2]).toBe(""); +}); + +test("lookbehind with quantifier in outer pattern", () => { + const m = /(?<=\d+)px/.exec("100px"); + expect(m[0]).toBe("px"); + expect(m.index).toBe(3); +}); + +test("lookbehind does not consume input", () => { + const m = /(?<=a)b/.exec("ab"); + expect(m[0]).toBe("b"); + expect(m.index).toBe(1); +}); From 3b9c58747d7ed8e48fcde83d3ed9266fdaea0683 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 22:25:21 +0100 Subject: [PATCH 08/15] Fix syntax validation, quantifier body relocation, and zero-width loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syntax validation: - Reject dangling quantifiers (a**, ??, +) — nothing to repeat - Reject invalid char class ranges ([z-a], [b-ac-e]) where start > end - Reject {min,max} where min > max ({2,1}) - Reject trailing backslash (\) - Reject \c without letter in unicode mode - Reject invalid identity escapes in unicode mode - Reject quantified assertions ((?=.)*) in unicode mode Quantifier body relocation: - EmitBodyAt adjusts absolute PC targets (SPLIT, JUMP, LOOKAHEAD, LOOKBEHIND) by the offset between original and 
destination positions. Without this, alternation inside * quantifiers had stale SPLIT targets, causing /(aa|aabaac|ba|b|c)*/ to return ["",null] instead of ["aaba","ba"]. Zero-width loop detection: - RX_SPLIT records (PC, InputPos) in the memoization table on each visit. When revisited at the same position (zero-width iteration), takes the exit branch instead of looping. Prevents infinite loops on patterns like /(a*)b\1+/ where the backreference matches empty. Also makes catastrophic patterns like /^(a+)+$/ terminate with null instead of hitting the backtrack limit. Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.Compiler.pas | 86 ++++++++++++++++++++++-- source/units/Goccia.RegExp.VM.pas | 6 ++ tests/built-ins/RegExp/prototype/exec.js | 7 +- 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 3345ae5a..6228f4b4 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -118,6 +118,8 @@ TRegExpCompiler = class function ReadCodePoint: Cardinal; procedure EnsureCodeCapacity(ANeeded: Integer); procedure EmitBody(const ABody: array of UInt32; ALen: Integer); + procedure EmitBodyAt(const ABody: array of UInt32; ALen: Integer; + AOrigStart: Integer); procedure ValidateNamedGroups; procedure PreScanNamedGroups; procedure InsertSplitAt(APos: Integer); @@ -775,11 +777,19 @@ procedure TRegExpCompiler.CompileEscapeAtom; if not AtEnd and (((Peek >= 'a') and (Peek <= 'z')) or ((Peek >= 'A') and (Peek <= 'Z'))) then EmitCharMatch(Ord(Advance) mod 32) + else if FUnicode then + raise EConvertError.Create( + 'Invalid regular expression: invalid control escape in unicode mode') else EmitCharMatch(Ord('c')); end; else - EmitCharMatch(Ord(C)); + if FUnicode and not CharInSet(C, ['/', '^', '$', '\', '.', '*', '+', + '?', '(', ')', '[', ']', '{', '}', '|']) then + raise EConvertError.Create( + 'Invalid regular 
expression: invalid escape in unicode mode') + else + EmitCharMatch(Ord(C)); end; end; @@ -874,12 +884,18 @@ procedure TRegExpCompiler.CompileCharacterClass; begin Hi := Ranges[RangeCount - 1].Lo; Dec(RangeCount); + if Lo > Hi then + raise EConvertError.Create( + 'Invalid regular expression: range out of order in character class'); AddRange(Ranges, RangeCount, Lo, Hi); end; end else begin Hi := ReadCodePoint; + if Lo > Hi then + raise EConvertError.Create( + 'Invalid regular expression: range out of order in character class'); AddRange(Ranges, RangeCount, Lo, Hi); end; end @@ -1097,6 +1113,9 @@ procedure TRegExpCompiler.CompileAtom; '\': begin Inc(FPos); + if AtEnd then + raise EConvertError.Create( + 'Invalid regular expression: \ at end of pattern'); CompileEscapeAtom; end; else @@ -1122,9 +1141,44 @@ procedure TRegExpCompiler.EnsureCodeCapacity(ANeeded: Integer); end; procedure TRegExpCompiler.EmitBody(const ABody: array of UInt32; ALen: Integer); +begin + EmitBodyAt(ABody, ALen, 0); +end; + +procedure TRegExpCompiler.EmitBodyAt(const ABody: array of UInt32; + ALen: Integer; AOrigStart: Integer); +var + DstStart, Delta, J: Integer; + Op: TRegExpOpCode; + Bx: Integer; + NegFlag: Integer; begin EnsureCodeCapacity(ALen); - Move(ABody[0], FCode[FCodeLen], ALen * SizeOf(UInt32)); + DstStart := FCodeLen; + Move(ABody[0], FCode[DstStart], ALen * SizeOf(UInt32)); + Delta := DstStart - AOrigStart; + if Delta <> 0 then + begin + for J := DstStart to DstStart + ALen - 1 do + begin + Op := TRegExpOpCode(FCode[J] and $FF); + case Op of + RX_SPLIT, RX_SPLIT_LAZY, RX_JUMP: + begin + Bx := Integer(FCode[J] shr 8); + Inc(Bx, Delta); + FCode[J] := EncodeOpBx(Op, Bx); + end; + RX_LOOKAHEAD, RX_LOOKBEHIND: + begin + Bx := Integer(FCode[J] shr 8); + NegFlag := Bx and $800000; + Bx := (Bx and $7FFFFF) + Delta; + FCode[J] := EncodeOpBx(Op, Bx or NegFlag); + end; + end; + end; + end; Inc(FCodeLen, ALen); end; @@ -1171,6 +1225,9 @@ procedure TRegExpCompiler.CompileQuantifier(AAtomStart: 
Integer); FPos := SavePos; Exit; end; + if (MaxCount >= 0) and (MinCount > MaxCount) then + raise EConvertError.Create( + 'Invalid regular expression: numbers out of order in quantifier'); end; else Exit; @@ -1183,7 +1240,7 @@ procedure TRegExpCompiler.CompileQuantifier(AAtomStart: Integer); Move(FCode[AAtomStart], BodyCode[0], BodyLen * SizeOf(UInt32)); FCodeLen := AAtomStart; for I := 1 to MinCount do - EmitBody(BodyCode, BodyLen); + EmitBodyAt(BodyCode, BodyLen, AAtomStart); if MaxCount = -1 then begin SplitPC := CurrentPC; @@ -1191,7 +1248,7 @@ procedure TRegExpCompiler.CompileQuantifier(AAtomStart: Integer); Emit(EncodeOpBx(RX_SPLIT_LAZY, 0)) else Emit(EncodeOpBx(RX_SPLIT, 0)); - EmitBody(BodyCode, BodyLen); + EmitBodyAt(BodyCode, BodyLen, AAtomStart); Emit(EncodeOpBx(RX_JUMP, SplitPC)); PatchHole(SplitPC, CurrentPC); end @@ -1204,18 +1261,37 @@ procedure TRegExpCompiler.CompileQuantifier(AAtomStart: Integer); Emit(EncodeOpBx(RX_SPLIT_LAZY, 0)) else Emit(EncodeOpBx(RX_SPLIT, 0)); - EmitBody(BodyCode, BodyLen); + EmitBodyAt(BodyCode, BodyLen, AAtomStart); PatchHole(SplitPC, CurrentPC); end; end; end; +function IsQuantifierChar(C: Char): Boolean; inline; +begin + Result := (C = '*') or (C = '+') or (C = '?') or (C = '{'); +end; + procedure TRegExpCompiler.CompileTerm; var AtomStart: Integer; + C: Char; + IsAssertion: Boolean; begin + C := Peek; + if IsQuantifierChar(C) then + raise EConvertError.Create('Invalid regular expression: nothing to repeat'); + IsAssertion := (C = '^') or (C = '$') or + ((C = '\') and ((PeekAt(1) = 'b') or (PeekAt(1) = 'B'))); + if (C = '(') and (PeekAt(1) = '?') and + ((PeekAt(2) = '=') or (PeekAt(2) = '!') or + ((PeekAt(2) = '<') and ((PeekAt(3) = '=') or (PeekAt(3) = '!')))) then + IsAssertion := True; AtomStart := CurrentPC; CompileAtom; + if (not AtEnd) and IsQuantifierChar(Peek) and IsAssertion and FUnicode then + raise EConvertError.Create( + 'Invalid regular expression: quantifier on assertion in unicode mode'); 
CompileQuantifier(AtomStart); end; diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index f7a30c07..b180a66b 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -345,6 +345,12 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_SPLIT: begin + if MemoContains(Memo, PC, InputPos) then + begin + PC := Bx; + Continue; + end; + MemoAdd(Memo, PC, InputPos); if not MemoContains(Memo, Bx, InputPos) then PushBacktrack(Bx, InputPos); Inc(PC); diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 29641a71..6cd3a79a 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -197,10 +197,9 @@ test("exec with duplicate named backreference outside the disjunction", () => { // --- Backtrack limit --- -test("catastrophic backtracking throws Error instead of hanging", () => { - expect(() => { - /^(a+)+$/.exec("a".repeat(30) + "b"); - }).toThrow(Error); +test("catastrophic backtracking does not hang", () => { + const result = /^(a+)+$/.exec("a".repeat(30) + "b"); + expect(result).toBe(null); }); // --- Large input (#515 regression) --- From 91e2fe9529640d7ec5395416f6419cac8ce6782c Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 22:34:13 +0100 Subject: [PATCH 09/15] Backref case-folding scoped to modifier groups, cap huge quantifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Encode ignoreCase flag per-backref instruction (BACKREF_ICASE_FLAG = $400000) so (?i:\1) case-folds the backreference comparison while \1 outside a modifier group does not. The flag is set at compile time from FModifier.IgnoreCase, giving correct scoping to modifier groups. - Cap ParseDecimalEscape at 1M to prevent integer overflow on huge quantifiers like {2147483648} — avoids range check error on the staging/sm/RegExp/regress-yarr-regexp.js test262 test. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.Compiler.pas | 32 ++++++++++++++++++------- source/units/Goccia.RegExp.VM.pas | 18 +++++++++++--- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 6228f4b4..0dac3dee 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -123,7 +123,8 @@ TRegExpCompiler = class procedure ValidateNamedGroups; procedure PreScanNamedGroups; procedure InsertSplitAt(APos: Integer); - procedure EmitDuplicateNamedBackref(const AName: string); + procedure EmitDuplicateNamedBackref(const AName: string; + AICaseFlag: Integer); public constructor Create(const APattern, AFlags: string); function Compile: TRegExpProgram; @@ -632,6 +633,8 @@ function TRegExpCompiler.ParseUnicodeEscape: Cardinal; end; function TRegExpCompiler.ParseDecimalEscape: Integer; +const + MAX_QUANTIFIER = 1000000; var C: Char; begin @@ -641,15 +644,22 @@ function TRegExpCompiler.ParseDecimalEscape: Integer; C := Peek; if (C < '0') or (C > '9') then Break; - Result := Result * 10 + (Ord(Advance) - Ord('0')); + if Result <= MAX_QUANTIFIER then + Result := Result * 10 + (Ord(Advance) - Ord('0')) + else + Advance; end; + if Result > MAX_QUANTIFIER then + Result := MAX_QUANTIFIER; end; const BACKREF_STRICT_FLAG = $800000; + BACKREF_ICASE_FLAG = $400000; LOOK_NEGATED_FLAG = $800000; -procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string); +procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string; + AICaseFlag: Integer); var Indices: array of Integer; Count, I: Integer; @@ -668,7 +678,7 @@ procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string); SetLength(Indices, Count); if Count = 1 then begin - Emit(EncodeOpBx(RX_BACKREF, Indices[0])); + Emit(EncodeOpBx(RX_BACKREF, Indices[0] or AICaseFlag)); Exit; end; JumpCount := 0; @@ -677,7 +687,7 @@ procedure 
TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string); begin SplitHole := CurrentPC; Emit(EncodeOpBx(RX_SPLIT, 0)); - Emit(EncodeOpBx(RX_BACKREF, Indices[I] or BACKREF_STRICT_FLAG)); + Emit(EncodeOpBx(RX_BACKREF, Indices[I] or BACKREF_STRICT_FLAG or AICaseFlag)); JumpHoles[JumpCount] := CurrentPC; Inc(JumpCount); Emit(0); @@ -696,9 +706,13 @@ procedure TRegExpCompiler.CompileEscapeAtom; PropertyName: string; Negated: Boolean; GroupName: string; - BackrefIdx, I, GroupCount: Integer; + BackrefIdx, I, GroupCount, BackrefICaseFlag: Integer; CodePoint: Cardinal; begin + if FModifier.IgnoreCase then + BackrefICaseFlag := BACKREF_ICASE_FLAG + else + BackrefICaseFlag := 0; C := Advance; case C of 'd', 'D', 'w', 'W', 's', 'S': @@ -744,9 +758,9 @@ procedure TRegExpCompiler.CompileEscapeAtom; raise EConvertError.Create( 'Invalid named backreference: ' + GroupName); if GroupCount <= 1 then - Emit(EncodeOpBx(RX_BACKREF, BackrefIdx)) + Emit(EncodeOpBx(RX_BACKREF, BackrefIdx or BackrefICaseFlag)) else - EmitDuplicateNamedBackref(GroupName); + EmitDuplicateNamedBackref(GroupName, BackrefICaseFlag); end else EmitCharMatch(Ord('k')); @@ -756,7 +770,7 @@ procedure TRegExpCompiler.CompileEscapeAtom; BackrefIdx := Ord(C) - Ord('0'); while not AtEnd and (Peek >= '0') and (Peek <= '9') do BackrefIdx := BackrefIdx * 10 + (Ord(Advance) - Ord('0')); - Emit(EncodeOpBx(RX_BACKREF, BackrefIdx)); + Emit(EncodeOpBx(RX_BACKREF, BackrefIdx or BackrefICaseFlag)); end; 'n': EmitCharMatch($0A); 'r': EmitCharMatch($0D); diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index b180a66b..33a06301 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -209,6 +209,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; BeforeIsWord, AfterIsWord: Boolean; Negated: Boolean; BackrefGroup: Integer; + BackrefICase: Boolean; LookEnd: Integer; LookSlots: array of Integer; LookMatched: Boolean; @@ -376,7 +377,8 @@ 
function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_BACKREF: begin Negated := (Bx and $800000) <> 0; - BackrefGroup := Bx and $7FFFFF; + BackrefICase := (Bx and $400000) <> 0; + BackrefGroup := Bx and $3FFFFF; RefStart := -1; RefEnd := -1; if (BackrefGroup * 2) < SlotCount then @@ -412,8 +414,18 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; end; if RefCP <> InputCP then begin - LookMatched := False; - Break; + if BackrefICase then + begin + if (RefCP >= Ord('A')) and (RefCP <= Ord('Z')) then + RefCP := RefCP + 32; + if (InputCP >= Ord('A')) and (InputCP <= Ord('Z')) then + InputCP := InputCP + 32; + end; + if RefCP <> InputCP then + begin + LookMatched := False; + Break; + end; end; Inc(RefPos, RefByteLen); Inc(InputPos, InputByteLen); From 54e5665b8bc80552ebd99ec224cf33aa005b6a35 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 22:37:45 +0100 Subject: [PATCH 10/15] Add JS tests for syntax validation, backtracking, and modifier backrefs Cover every regression fix with explicit tests: - Dangling quantifiers (a**, ??, +, *) throw SyntaxError - Invalid char class ranges ([z-a], [b-ac-e]) throw SyntaxError - Quantifier min > max ({2,1}) throws SyntaxError - Trailing backslash throws SyntaxError - Huge quantifier ({2147483648}) does not crash - Greedy * with alternation picks correct match path - Greedy * with char class quantifier backtracks correctly - Zero-length backref with + quantifier does not hang - (?i:\1) case-folds backreference, (?-i:\1) disables it - \c without letter in /u throws SyntaxError - Quantified assertions in /u throw SyntaxError Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/built-ins/RegExp/constructor.js | 26 ++++++++++++++++++++++++ tests/built-ins/RegExp/modifiers.js | 15 ++++++++++++++ tests/built-ins/RegExp/prototype/exec.js | 22 ++++++++++++++++++++ tests/built-ins/RegExp/unicode.js | 13 ++++++++++++ 4 files changed, 76 insertions(+) diff --git 
a/tests/built-ins/RegExp/constructor.js b/tests/built-ins/RegExp/constructor.js index 159f315a..3c3bb21b 100644 --- a/tests/built-ins/RegExp/constructor.js +++ b/tests/built-ins/RegExp/constructor.js @@ -98,3 +98,29 @@ test("RegExp canonicalizes new flags in correct order", () => { const regex = new RegExp("a", "yvgdims"); expect(regex.flags).toBe("dgimsvy"); }); + +// --- Syntax validation --- + +test("dangling quantifier throws SyntaxError", () => { + expect(() => { new RegExp("a**"); }).toThrow(SyntaxError); + expect(() => { new RegExp("??"); }).toThrow(SyntaxError); + expect(() => { new RegExp("+"); }).toThrow(SyntaxError); + expect(() => { new RegExp("*"); }).toThrow(SyntaxError); +}); + +test("invalid character class range throws SyntaxError", () => { + expect(() => { new RegExp("[z-a]"); }).toThrow(SyntaxError); + expect(() => { new RegExp("[b-ac-e]"); }).toThrow(SyntaxError); +}); + +test("quantifier min > max throws SyntaxError", () => { + expect(() => { new RegExp("0{2,1}"); }).toThrow(SyntaxError); +}); + +test("trailing backslash throws SyntaxError", () => { + expect(() => { new RegExp("\\"); }).toThrow(SyntaxError); +}); + +test("huge quantifier does not crash", () => { + expect(/x{2147483648}x/.test("1")).toBe(false); +}); diff --git a/tests/built-ins/RegExp/modifiers.js b/tests/built-ins/RegExp/modifiers.js index bb4e3d1b..97459b40 100644 --- a/tests/built-ins/RegExp/modifiers.js +++ b/tests/built-ins/RegExp/modifiers.js @@ -224,6 +224,21 @@ test("(?-:...) 
empty add and remove throws SyntaxError", () => { expect(() => { new RegExp("(?-:abc)"); }).toThrow(SyntaxError); }); +// --- Modifier scoping affects backreferences --- + +test("(?i:\\1) case-folds backreference comparison", () => { + const re = /(a)(?i:\1)/; + expect(re.test("aA")).toBe(true); + expect(re.test("aa")).toBe(true); + expect(re.test("AA")).toBe(false); +}); + +test("(?-i:\\1) disables case-folding for backreference", () => { + const re = new RegExp("(a)(?-i:\\1)", "i"); + expect(re.test("aa")).toBe(true); + expect(re.test("aA")).toBe(false); +}); + // --- Error cases: double dash --- test("(?i--s:...) double dash throws SyntaxError", () => { diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 6cd3a79a..d91470ed 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -195,6 +195,28 @@ test("exec with duplicate named backreference outside the disjunction", () => { expect(re.exec("ab")).toBe(null); }); +// --- Greedy quantifier with alternation --- + +test("greedy star with alternation picks correct match", () => { + const m = /(aa|aabaac|ba|b|c)*/.exec("aabaac"); + expect(m[0]).toBe("aaba"); + expect(m[1]).toBe("ba"); +}); + +test("greedy star with character class quantifier backtracks correctly", () => { + const m = /^([a-z]+)*[a-z]$/.exec("ab"); + expect(m[0]).toBe("ab"); + expect(m[1]).toBe("a"); +}); + +// --- Zero-width backref loop --- + +test("backreference to zero-length capture with + does not hang", () => { + const m = /(a*)b\1+/.exec("baaac"); + expect(m[0]).toBe("b"); + expect(m[1]).toBe(""); +}); + // --- Backtrack limit --- test("catastrophic backtracking does not hang", () => { diff --git a/tests/built-ins/RegExp/unicode.js b/tests/built-ins/RegExp/unicode.js index 9f66f57d..58cba4d3 100644 --- a/tests/built-ins/RegExp/unicode.js +++ b/tests/built-ins/RegExp/unicode.js @@ -228,3 +228,16 @@ xyz")).toBe(true); expect(/abc$/m.test("abc 
é")).toBe(true); }); + +// --- Unicode mode syntax restrictions --- + +test("\\c without letter throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("\\c", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("\\c1", "u"); }).toThrow(SyntaxError); +}); + +test("quantified assertion throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("(?=.)*", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("(?=.)+", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("(?!.){2}", "u"); }).toThrow(SyntaxError); +}); From 14ee3e66b8b9d980ea067360c3db43d56f7157f4 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 23:04:48 +0100 Subject: [PATCH 11/15] Fix backref backtracking, char class \c in unicode, dynamic step limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backref backtracking: - Revert the zero-width memo-on-SPLIT-entry that prevented legitimate backtracking through (a+) in /^(a+)\1*,\1+$/. Instead detect zero-width loops via JUMP: when jumping back to a SPLIT and the top backtrack entry has the same exit target and input position, the iteration consumed nothing — take the exit directly. Char class \c in unicode mode: - CompileEscape (character class variant) now handles \c with the same validation as CompileEscapeAtom: \c without a-zA-Z throws SyntaxError in unicode mode. Also reject invalid identity escapes inside character classes in unicode mode. Dynamic step limit: - Step limit is now max(10M, inputLength * 100) instead of a fixed 10M. This prevents false positives on legitimate large inputs (e.g. test262 property-escapes/generated/ASCII.js tests \P{ASCII} against 1M+ chars) while still catching catastrophic backtracking on small inputs. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.Compiler.pas | 19 +++++++++++++++- source/units/Goccia.RegExp.VM.pas | 29 ++++++++++++++++-------- tests/built-ins/RegExp/prototype/exec.js | 7 +++--- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 0dac3dee..c85acc78 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -861,8 +861,25 @@ procedure TRegExpCompiler.CompileEscape(AInCharClass: Boolean; end; 'b': AddRange(ARanges, ARangeCount, $08, $08); + 'c': + begin + if not AtEnd and (((Peek >= 'a') and (Peek <= 'z')) or + ((Peek >= 'A') and (Peek <= 'Z'))) then + AddRange(ARanges, ARangeCount, Ord(Advance) mod 32, + Ord(FPattern[FPos - 1]) mod 32) + else if FUnicode then + raise EConvertError.Create( + 'Invalid regular expression: invalid control escape in unicode mode') + else + AddRange(ARanges, ARangeCount, Ord('c'), Ord('c')); + end; else - AddRange(ARanges, ARangeCount, Ord(C), Ord(C)); + if FUnicode and not CharInSet(C, ['/', '^', '$', '\', '.', '*', '+', + '?', '(', ')', '[', ']', '{', '}', '|', '-']) then + raise EConvertError.Create( + 'Invalid regular expression: invalid escape in unicode mode') + else + AddRange(ARanges, ARangeCount, Ord(C), Ord(C)); end; end; diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 33a06301..2d3afca7 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -27,7 +27,8 @@ implementation TextSemantics; const - DEFAULT_STEP_LIMIT = 10000000; + MIN_STEP_LIMIT = 10000000; + STEPS_PER_INPUT_BYTE = 100; DEFAULT_BACKTRACK_CAP = 1000000; MAX_LOOKBEHIND_DISTANCE = 256; MEMO_CAPACITY = 65536; @@ -201,6 +202,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; Stack: array of TBacktrackEntry; StackTop: Integer; StepCount: Integer; + StepLimit: Integer; Memo: TMemoTable; 
SlotCount: Integer; I: Integer; @@ -252,6 +254,9 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; PC := AStartPC; InputPos := AStartPos; StepCount := 0; + StepLimit := Length(AInput) * STEPS_PER_INPUT_BYTE; + if StepLimit < MIN_STEP_LIMIT then + StepLimit := MIN_STEP_LIMIT; StackTop := -1; SetLength(Stack, 256); MemoInit(Memo); @@ -259,7 +264,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; while PC < Length(AProgram.Code) do begin Inc(StepCount); - if StepCount > DEFAULT_STEP_LIMIT then + if StepCount > StepLimit then raise ERegExpRuntimeError.Create('Maximum regular expression backtrack stack size exceeded'); Instr := AProgram.Code[PC]; @@ -346,12 +351,6 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_SPLIT: begin - if MemoContains(Memo, PC, InputPos) then - begin - PC := Bx; - Continue; - end; - MemoAdd(Memo, PC, InputPos); if not MemoContains(Memo, Bx, InputPos) then PushBacktrack(Bx, InputPos); Inc(PC); @@ -365,7 +364,19 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; end; RX_JUMP: - PC := Bx; + begin + if (Bx >= 0) and (Bx < Length(AProgram.Code)) and + (TRegExpOpCode(AProgram.Code[Bx] and $FF) = RX_SPLIT) then + begin + if (StackTop >= 0) and (Stack[StackTop].PC = Integer(AProgram.Code[Bx] shr 8)) and + (Stack[StackTop].InputPos = InputPos) then + begin + PC := Integer(AProgram.Code[Bx] shr 8); + Continue; + end; + end; + PC := Bx; + end; RX_SAVE: begin diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index d91470ed..90422fb7 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -219,9 +219,10 @@ test("backreference to zero-length capture with + does not hang", () => { // --- Backtrack limit --- -test("catastrophic backtracking does not hang", () => { - const result = /^(a+)+$/.exec("a".repeat(30) + "b"); - expect(result).toBe(null); +test("catastrophic 
backtracking throws Error instead of hanging", () => { + expect(() => { + /^(a+)+$/.exec("a".repeat(30) + "b"); + }).toThrow(Error); }); // --- Large input (#515 regression) --- From c9784fb66a6d128465061df20418bb7f3a97c07f Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Fri, 8 May 2026 23:25:02 +0100 Subject: [PATCH 12/15] Add missing JS tests for backref backtracking, \c in class, step limit Every fix should ship with its test. These were missing from the previous commit: - Backref backtracking through (a+) in /^(a+)\1*,\1+$/ - String.replace with backreference capture - \c inside character class in unicode mode throws SyntaxError - \p{ASCII} on large input does not hit step limit Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/built-ins/RegExp/prototype/exec.js | 10 ++++++++++ tests/built-ins/RegExp/unicode.js | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 90422fb7..fdf2be21 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -209,6 +209,16 @@ test("greedy star with character class quantifier backtracks correctly", () => { expect(m[1]).toBe("a"); }); +test("backreference backtracking finds correct capture length", () => { + const m = /^(a+)\1*,\1+$/.exec("aaaaaaaaaa,aaaaaaaaaaaaaaa"); + expect(m[0]).toBe("aaaaaaaaaa,aaaaaaaaaaaaaaa"); + expect(m[1]).toBe("aaaaa"); +}); + +test("replace with backreference uses correct capture", () => { + expect("aaaaaaaaaa,aaaaaaaaaaaaaaa".replace(/^(a+)\1*,\1+$/, "$1")).toBe("aaaaa"); +}); + // --- Zero-width backref loop --- test("backreference to zero-length capture with + does not hang", () => { diff --git a/tests/built-ins/RegExp/unicode.js b/tests/built-ins/RegExp/unicode.js index 58cba4d3..f8ed7894 100644 --- a/tests/built-ins/RegExp/unicode.js +++ b/tests/built-ins/RegExp/unicode.js @@ -241,3 +241,13 @@ test("quantified assertion throws SyntaxError in 
unicode mode", () => { expect(() => { new RegExp("(?=.)+", "u"); }).toThrow(SyntaxError); expect(() => { new RegExp("(?!.){2}", "u"); }).toThrow(SyntaxError); }); + +test("\\c inside character class without letter throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("[\\c]", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("[\\c1]", "u"); }).toThrow(SyntaxError); +}); + +test("\\p{ASCII} matches on large input without hitting step limit", () => { + const s = "abcdefghij0123456789".repeat(50); + expect(new RegExp("^\\p{ASCII}+$", "u").test(s)).toBe(true); +}); From 03e2c42904daca546978ff966bf1996f9b4c292e Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 9 May 2026 09:45:23 +0100 Subject: [PATCH 13/15] Inline hot-path code point read, raise backtrack stack cap to 10M The property-escapes/generated/ASCII.js test262 test builds a 1.1M code point non-ASCII string and tests \P{ASCII}+ against it. The greedy + pushes one backtrack entry per code point, exceeding the 1M backtrack stack cap. Fix: raise DEFAULT_BACKTRACK_CAP from 1M to 10M. Also inline ReadInputCodePoint with a fast path for ASCII bytes (< 0x80) to avoid the function call overhead of TryReadUTF8CodePointAllowSurrogates on every character of large inputs. CharClassContainsLinear is also marked inline. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.VM.pas | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 2d3afca7..c07a6d70 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -29,7 +29,7 @@ implementation const MIN_STEP_LIMIT = 10000000; STEPS_PER_INPUT_BYTE = 100; - DEFAULT_BACKTRACK_CAP = 1000000; + DEFAULT_BACKTRACK_CAP = 10000000; MAX_LOOKBEHIND_DISTANCE = 256; MEMO_CAPACITY = 65536; MEMO_LOAD_LIMIT = 49152; @@ -128,7 +128,7 @@ function CharClassContains(const AClass: TRegExpCharClass; end; function CharClassContainsLinear(const AClass: TRegExpCharClass; - ACodePoint: Cardinal): Boolean; + ACodePoint: Cardinal): Boolean; inline; var I: Integer; begin @@ -154,7 +154,9 @@ function IsLineTerminator(ACodePoint: Cardinal): Boolean; inline; end; function ReadInputCodePoint(const AInput: string; APos: Integer; - out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; + out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; inline; +var + B: Byte; begin if (APos < 1) or (APos > Length(AInput)) then begin @@ -162,11 +164,18 @@ function ReadInputCodePoint(const AInput: string; APos: Integer; AByteLen := 0; Exit(False); end; + B := Ord(AInput[APos]); + if B < $80 then + begin + ACodePoint := B; + AByteLen := 1; + Exit(True); + end; Result := TryReadUTF8CodePointAllowSurrogates(AInput, APos, ACodePoint, AByteLen); if not Result then begin - ACodePoint := Ord(AInput[APos]); + ACodePoint := B; AByteLen := 1; Result := True; end; From 72dfbcde41e9119df66d2825804eca648e02822e Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 9 May 2026 10:10:53 +0100 Subject: [PATCH 14/15] Review cleanup: dead code, SIGSEGV fix, memo correctness, lazy alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dead code removal: - Remove 
FlagIgnoreCase/FlagMultiline/FlagDotAll from TRegExpProgram (modifier state is encoded per-instruction, these fields are unread) - Remove EncodeInstr (duplicate of EncodeOpBx), AddCharClassFromDynamic (duplicate of AddCharClass), CharClassContains (unused binary search), CaseFold (unused method) SIGSEGV fix: - IsRegExpValue now checks HasOwnProperty('source') and HasOwnProperty('flags') instead of Symbol.toStringTag. An object created via Object.create(RegExp.prototype) inherits the tag but has no internal regex state — the prototype getters would recurse infinitely trying to read source/flags, causing stack overflow. Memo correctness: - Backref match failure now restores InputPos before calling MemoAdd, so the memo records the correct (PC, pos) pair instead of the partially-advanced position. - Invalid opcodes now raise ERegExpRuntimeError instead of silently skipping via Inc(PC). Lazy memo allocation: - MemoInit no longer allocates the 65K-entry table. MemoAdd allocates on first use, MemoContains returns false if unallocated. This avoids ~1MB allocation per lookbehind sub-call (up to 256 calls per assertion). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.Compiler.pas | 43 +------------------------ source/units/Goccia.RegExp.Runtime.pas | 8 ++--- source/units/Goccia.RegExp.VM.pas | 37 +++++++++------------ tests/built-ins/RegExp/constructor.js | 10 ++++++ 4 files changed, 28 insertions(+), 70 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index c85acc78..8e2af0cd 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -41,9 +41,6 @@ TRegExpProgram = record CharClasses: array of TRegExpCharClass; CaptureCount: Integer; NamedGroups: TGocciaRegExpNamedGroups; - FlagIgnoreCase: Boolean; - FlagMultiline: Boolean; - FlagDotAll: Boolean; FlagUnicode: Boolean; end; @@ -91,7 +88,6 @@ TRegExpCompiler = class function EncodeOp(AOp: TRegExpOpCode): UInt32; function EncodeOpBx(AOp: TRegExpOpCode; ABx: Integer): UInt32; function AddCharClass(const ARanges: array of TRegExpCharRange): Integer; - function AddCharClassFromDynamic(const ARanges: array of TRegExpCharRange): Integer; procedure CompilePattern; procedure CompileDisjunction; procedure CompileAlternative; @@ -112,7 +108,6 @@ TRegExpCompiler = class ARangeCount: Integer; ANegated: Boolean); procedure AddBuiltinCharClass(AEscapeChar: Char; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); procedure AddRange(var ARanges: array of TRegExpCharRange; var ARangeCount: Integer; ALo, AHi: Cardinal); - function CaseFold(ACodePoint: Cardinal): Cardinal; procedure EmitUnicodePropertyClass(const APropertyName: string; ANegated: Boolean); procedure GetUnicodePropertyRanges(const APropertyName: string; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); function ReadCodePoint: Cardinal; @@ -133,16 +128,6 @@ TRegExpCompiler = class const MAX_CHAR_RANGES = 512; -function EncodeInstr(AOp: TRegExpOpCode; ABx: Integer): UInt32; inline; -begin - Result := UInt32(Ord(AOp)) or 
(UInt32(ABx) shl 8); -end; - -function DecodeBx(AInstr: UInt32): Integer; inline; -begin - Result := Integer(AInstr shr 8); -end; - constructor TRegExpCompiler.Create(const APattern, AFlags: string); begin inherited Create; @@ -253,29 +238,6 @@ function TRegExpCompiler.AddCharClass( FCharClasses[Result].Ranges[I] := ARanges[I]; end; -function TRegExpCompiler.AddCharClassFromDynamic( - const ARanges: array of TRegExpCharRange): Integer; -var - I: Integer; -begin - Result := Length(FCharClasses); - SetLength(FCharClasses, Result + 1); - SetLength(FCharClasses[Result].Ranges, Length(ARanges)); - for I := 0 to High(ARanges) do - FCharClasses[Result].Ranges[I] := ARanges[I]; -end; - -function TRegExpCompiler.CaseFold(ACodePoint: Cardinal): Cardinal; -begin - if not FModifier.IgnoreCase then - Exit(ACodePoint); - if (ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z')) then - Result := ACodePoint + 32 - else if (ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z')) then - Result := ACodePoint - 32 - else - Result := ACodePoint; -end; procedure TRegExpCompiler.EmitCharMatch(ACodePoint: Cardinal); var @@ -527,7 +489,7 @@ procedure TRegExpCompiler.EmitCharClassRanges( end; end; end; - ClassIdx := AddCharClassFromDynamic(DynRanges); + ClassIdx := AddCharClass(DynRanges); if ANegated then Op := RX_CHAR_CLASS_NEG else @@ -1548,9 +1510,6 @@ function TRegExpCompiler.Compile: TRegExpProgram; Result.CharClasses := FCharClasses; Result.CaptureCount := FCaptureCount; Result.NamedGroups := FNamedGroups; - Result.FlagIgnoreCase := HasRegExpFlag(FFlags, 'i'); - Result.FlagMultiline := HasRegExpFlag(FFlags, 'm'); - Result.FlagDotAll := HasRegExpFlag(FFlags, 's'); Result.FlagUnicode := FUnicode; end; diff --git a/source/units/Goccia.RegExp.Runtime.pas b/source/units/Goccia.RegExp.Runtime.pas index 5fc4efe2..29d3f48d 100644 --- a/source/units/Goccia.RegExp.Runtime.pas +++ b/source/units/Goccia.RegExp.Runtime.pas @@ -118,15 +118,11 @@ function BuildMatchArray(const AInput: string; end; 
function IsRegExpValue(const AValue: TGocciaValue): Boolean; -var - Tag: TGocciaValue; begin if not (AValue is TGocciaObjectValue) then Exit(False); - Tag := TGocciaObjectValue(AValue).GetSymbolProperty( - TGocciaSymbolValue.WellKnownToStringTag); - Result := (Tag is TGocciaStringLiteralValue) and - (TGocciaStringLiteralValue(Tag).Value = 'RegExp'); + Result := TGocciaObjectValue(AValue).HasOwnProperty(PROP_SOURCE) and + TGocciaObjectValue(AValue).HasOwnProperty(PROP_FLAGS); end; function CreateRegExpObject(const APattern, AFlags: string): TGocciaValue; diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index c07a6d70..6ad87b88 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -54,10 +54,15 @@ TMemoTable = record procedure MemoInit(var AMemo: TMemoTable); begin - SetLength(AMemo.Entries, MEMO_CAPACITY); AMemo.Count := 0; end; +procedure MemoEnsureAllocated(var AMemo: TMemoTable); inline; +begin + if Length(AMemo.Entries) = 0 then + SetLength(AMemo.Entries, MEMO_CAPACITY); +end; + function MemoHash(APC, APos: Integer): Integer; inline; var H: Cardinal; @@ -72,6 +77,8 @@ function MemoContains(var AMemo: TMemoTable; APC, APos: Integer): Boolean; var Idx, I: Integer; begin + if Length(AMemo.Entries) = 0 then + Exit(False); Idx := MemoHash(APC, APos); for I := 0 to 15 do begin @@ -88,6 +95,7 @@ procedure MemoAdd(var AMemo: TMemoTable; APC, APos: Integer); var Idx, I: Integer; begin + MemoEnsureAllocated(AMemo); if AMemo.Count >= MEMO_LOAD_LIMIT then Exit; Idx := MemoHash(APC, APos); @@ -107,26 +115,6 @@ procedure MemoAdd(var AMemo: TMemoTable; APC, APos: Integer); end; end; -function CharClassContains(const AClass: TRegExpCharClass; - ACodePoint: Cardinal): Boolean; -var - Lo, Hi, Mid: Integer; -begin - Lo := 0; - Hi := High(AClass.Ranges); - while Lo <= Hi do - begin - Mid := (Lo + Hi) shr 1; - if ACodePoint < AClass.Ranges[Mid].Lo then - Hi := Mid - 1 - else if ACodePoint > AClass.Ranges[Mid].Hi 
then - Lo := Mid + 1 - else - Exit(True); - end; - Result := False; -end; - function CharClassContainsLinear(const AClass: TRegExpCharClass; ACodePoint: Cardinal): Boolean; inline; var @@ -418,6 +406,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; end; RefPos := RefStart; LookMatched := True; + RefEnd := ASlots[BackrefGroup * 2 + 1]; + I := InputPos; while RefPos < RefEnd do begin if not ReadInputCodePoint(AInput, RefPos, @@ -452,6 +442,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; end; if not LookMatched then begin + InputPos := I; MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; Continue; @@ -636,7 +627,9 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; Continue; end; else - Inc(PC); + raise ERegExpRuntimeError.CreateFmt( + 'Invalid regular expression bytecode: opcode %d at PC %d', + [Ord(Op), PC]); end; end; end; diff --git a/tests/built-ins/RegExp/constructor.js b/tests/built-ins/RegExp/constructor.js index 3c3bb21b..392fb7b7 100644 --- a/tests/built-ins/RegExp/constructor.js +++ b/tests/built-ins/RegExp/constructor.js @@ -124,3 +124,13 @@ test("trailing backslash throws SyntaxError", () => { test("huge quantifier does not crash", () => { expect(/x{2147483648}x/.test("1")).toBe(false); }); + +test("exec on Object.create(RegExp.prototype) throws TypeError", () => { + const obj = Object.create(RegExp.prototype); + expect(() => { RegExp.prototype.exec.call(obj, "test"); }).toThrow(TypeError); +}); + +test("test on Object.create(RegExp.prototype) throws TypeError", () => { + const obj = Object.create(RegExp.prototype); + expect(() => { RegExp.prototype.test.call(obj, "test"); }).toThrow(TypeError); +}); From c640541dbf634bf41f5e0f2bded61249377bfa23 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sat, 9 May 2026 11:10:10 +0100 Subject: [PATCH 15/15] Review cleanup: dead code, SIGSEGV fix, memo correctness, lazy alloc Simplify pass from three-agent review: Dead code: - Remove 
FlagUnicode from TRegExpProgram (unread since per-instruction encoding handles unicode behavior) - Remove FFlags field from TRegExpCompiler (parsed into FModifier/FUnicode in constructor, never read after) - Remove AInCharClass parameter from CompileEscape (never referenced) - Remove dead ClassIdx variable from EmitUnicodePropertyClass - Remove duplicate AddCharClassFromDynamic (identical to AddCharClass) Shared constants: - Move BACKREF_STRICT_FLAG, BACKREF_ICASE_FLAG, LOOK_NEGATED_FLAG and their mask companions to the Compiler interface section so the VM uses named constants instead of raw hex at 8 decode sites. Efficiency: - PushBacktrack reuses existing Slots array when length matches (avoids heap allocation per push on hot path) - FillChar($FF) replaces per-element slot init loop Correctness: - \P{...} inside character classes now throws SyntaxError instead of silently treating it as \p{...} (misleading comment removed) Co-Authored-By: Claude Opus 4.6 (1M context) --- source/units/Goccia.RegExp.Compiler.pas | 45 ++++++++++++------------- source/units/Goccia.RegExp.VM.pas | 23 ++++++------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 8e2af0cd..25978076 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -41,9 +41,15 @@ TRegExpProgram = record CharClasses: array of TRegExpCharClass; CaptureCount: Integer; NamedGroups: TGocciaRegExpNamedGroups; - FlagUnicode: Boolean; end; +const + BACKREF_STRICT_FLAG = $800000; + BACKREF_ICASE_FLAG = $400000; + BACKREF_INDEX_MASK = $3FFFFF; + LOOK_NEGATED_FLAG = $800000; + LOOK_TARGET_MASK = $7FFFFF; + function CompileRegExp(const APattern, AFlags: string): TRegExpProgram; procedure ValidateRegExpPatternNew(const APattern, AFlags: string); @@ -65,7 +71,6 @@ TModifierState = record TRegExpCompiler = class private FPattern: string; - FFlags: string; FPos: Integer; FCode: 
array of UInt32; FCodeLen: Integer; @@ -95,7 +100,7 @@ TRegExpCompiler = class procedure CompileAtom; procedure CompileQuantifier(AAtomStart: Integer); procedure CompileCharacterClass; - procedure CompileEscape(AInCharClass: Boolean; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); + procedure CompileEscape(var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); procedure CompileEscapeAtom; procedure CompileGroup; procedure CompileModifierGroup; @@ -132,7 +137,6 @@ constructor TRegExpCompiler.Create(const APattern, AFlags: string); begin inherited Create; FPattern := APattern; - FFlags := AFlags; FPos := 1; FCodeLen := 0; SetLength(FCode, 256); @@ -452,7 +456,7 @@ procedure TRegExpCompiler.EmitUnicodePropertyClass(const APropertyName: string; ANegated: Boolean); var Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange; - RangeCount, ClassIdx: Integer; + RangeCount: Integer; begin RangeCount := 0; GetUnicodePropertyRanges(APropertyName, Ranges, RangeCount); @@ -615,11 +619,6 @@ function TRegExpCompiler.ParseDecimalEscape: Integer; Result := MAX_QUANTIFIER; end; -const - BACKREF_STRICT_FLAG = $800000; - BACKREF_ICASE_FLAG = $400000; - LOOK_NEGATED_FLAG = $800000; - procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string; AICaseFlag: Integer); var @@ -769,12 +768,11 @@ procedure TRegExpCompiler.CompileEscapeAtom; end; end; -procedure TRegExpCompiler.CompileEscape(AInCharClass: Boolean; +procedure TRegExpCompiler.CompileEscape( var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); var C: Char; PropertyName: string; - Negated: Boolean; CodePoint: Cardinal; begin C := Advance; @@ -812,11 +810,11 @@ procedure TRegExpCompiler.CompileEscape(AInCharClass: Boolean; PropertyName := PropertyName + Advance; if not Match('}') then raise EConvertError.Create('Unterminated Unicode property escape'); - GetUnicodePropertyRanges(PropertyName, ARanges, ARangeCount); if C = 'P' then - begin - // For negated in char class 
context, handled by caller - end; + raise EConvertError.Create( + 'Negated Unicode property escape \\P{...} is not supported inside character classes') + else + GetUnicodePropertyRanges(PropertyName, ARanges, ARangeCount); end else AddRange(ARanges, ARangeCount, Ord(C), Ord(C)); @@ -861,7 +859,7 @@ procedure TRegExpCompiler.CompileCharacterClass; if Peek = '\' then begin Inc(FPos); - CompileEscape(True, Ranges, RangeCount); + CompileEscape(Ranges, RangeCount); Continue; end; Lo := ReadCodePoint; @@ -872,7 +870,7 @@ procedure TRegExpCompiler.CompileCharacterClass; begin SavePos := RangeCount; Inc(FPos); - CompileEscape(True, Ranges, RangeCount); + CompileEscape(Ranges, RangeCount); if RangeCount > SavePos then begin Hi := Ranges[RangeCount - 1].Lo; @@ -1165,8 +1163,8 @@ procedure TRegExpCompiler.EmitBodyAt(const ABody: array of UInt32; RX_LOOKAHEAD, RX_LOOKBEHIND: begin Bx := Integer(FCode[J] shr 8); - NegFlag := Bx and $800000; - Bx := (Bx and $7FFFFF) + Delta; + NegFlag := Bx and LOOK_NEGATED_FLAG; + Bx := (Bx and LOOK_TARGET_MASK) + Delta; FCode[J] := EncodeOpBx(Op, Bx or NegFlag); end; end; @@ -1321,13 +1319,13 @@ procedure TRegExpCompiler.InsertSplitAt(APos: Integer); RX_LOOKAHEAD, RX_LOOKBEHIND: begin Bx := Integer(FCode[I] shr 8); - Negated := (Bx and $800000) <> 0; - Bx := Bx and $7FFFFF; + Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; + Bx := Bx and LOOK_TARGET_MASK; if Bx >= APos then begin Inc(Bx); if Negated then - Bx := Bx or $800000; + Bx := Bx or LOOK_NEGATED_FLAG; FCode[I] := EncodeOpBx(Op, Bx); end; end; @@ -1510,7 +1508,6 @@ function TRegExpCompiler.Compile: TRegExpProgram; Result.CharClasses := FCharClasses; Result.CaptureCount := FCaptureCount; Result.NamedGroups := FNamedGroups; - Result.FlagUnicode := FUnicode; end; function CompileRegExp(const APattern, AFlags: string): TRegExpProgram; diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 6ad87b88..83d843fa 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ 
b/source/units/Goccia.RegExp.VM.pas @@ -225,7 +225,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; SetLength(Stack, StackTop * 2 + 16); Stack[StackTop].PC := APC; Stack[StackTop].InputPos := AInputPos; - SetLength(Stack[StackTop].Slots, SlotCount); + if Length(Stack[StackTop].Slots) <> SlotCount then + SetLength(Stack[StackTop].Slots, SlotCount); if SlotCount > 0 then Move(ASlots[0], Stack[StackTop].Slots[0], SlotCount * SizeOf(Integer)); end; @@ -384,9 +385,9 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_BACKREF: begin - Negated := (Bx and $800000) <> 0; - BackrefICase := (Bx and $400000) <> 0; - BackrefGroup := Bx and $3FFFFF; + Negated := (Bx and BACKREF_STRICT_FLAG) <> 0; + BackrefICase := (Bx and BACKREF_ICASE_FLAG) <> 0; + BackrefGroup := Bx and BACKREF_INDEX_MASK; RefStart := -1; RefEnd := -1; if (BackrefGroup * 2) < SlotCount then @@ -537,8 +538,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_LOOKAHEAD: begin - Negated := (Bx and $800000) <> 0; - LookEnd := Bx and $7FFFFF; + Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; + LookEnd := Bx and LOOK_TARGET_MASK; SetLength(LookSlots, SlotCount); Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, @@ -567,8 +568,8 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: string; RX_LOOKBEHIND: begin - Negated := (Bx and $800000) <> 0; - LookEnd := Bx and $7FFFFF; + Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; + LookEnd := Bx and LOOK_TARGET_MASK; LookMatched := False; SetLength(LookSlots, SlotCount); I := InputPos - 1; @@ -650,8 +651,7 @@ function ExecuteRegExpVM(const AProgram: TRegExpProgram; StartPos := AStartIndex + 1; if ARequireStart then begin - for I := 0 to SlotCount - 1 do - Slots[I] := -1; + FillChar(Slots[0], SlotCount * SizeOf(Integer), $FF); if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then begin AResult.Matched := True; @@ -663,8 +663,7 
@@ function ExecuteRegExpVM(const AProgram: TRegExpProgram; end; while StartPos <= Length(AInput) + 1 do begin - for I := 0 to SlotCount - 1 do - Slots[I] := -1; + FillChar(Slots[0], SlotCount * SizeOf(Integer), $FF); if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then begin AResult.Matched := True;