diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ef911e48..d2aade09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,7 +134,6 @@ jobs: GEN_DIR="$PREFIX/lib/fpc/${FPC_VERSION}/units/${TARGET}/rtl-generics" FCL_DIR="$PREFIX/lib/fpc/${FPC_VERSION}/units/${TARGET}/fcl-process" FCL_BASE_SRC="$PREFIX/share/fpcsrc/packages/fcl-base/src" - REGEXPR_SRC="$PREFIX/share/fpcsrc/packages/regexpr/src" FCL_NET_SRC="$PREFIX/share/fpcsrc/packages/fcl-net/src" OPENSSL_SRC="$PREFIX/share/fpcsrc/packages/openssl/src" @@ -144,11 +143,9 @@ jobs: echo "rtl-generics units: $(ls "$GEN_DIR"/*.ppu 2>/dev/null | wc -l) .ppu files" echo "fcl-process units: $(ls "$FCL_DIR"/*.ppu 2>/dev/null | wc -l) .ppu files" echo "fcl-base source path: $FCL_BASE_SRC" - echo "regexpr source path: $REGEXPR_SRC" echo "fcl-net source path: $FCL_NET_SRC" echo "openssl source path: $OPENSSL_SRC" test -d "$FCL_BASE_SRC" - test -d "$REGEXPR_SRC" test -d "$FCL_NET_SRC" test -d "$OPENSSL_SRC" @@ -168,7 +165,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -184,7 +181,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" 
-Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -197,7 +194,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ @@ -209,7 +206,7 @@ jobs: "$CROSS_FPC" -T"${OS}" -O4 -dPRODUCTION -Xs -CX -XX -B \ -Fu./source/units -Fu./source/generated -Fu./source/shared -Fu./source/app -Fu./souffle \ -Fi./source/units -Fi./source/shared -Fi./souffle \ - -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$REGEXPR_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ + -Fu"$RTL_DIR" -Fu"$OBJPAS_DIR" -Fu"$GEN_DIR" -Fu"$FCL_DIR" -Fu"$FCL_BASE_SRC" -Fu"$FCL_NET_SRC" -Fu"$OPENSSL_SRC" \ -FU"build/compiled" -FE"build" \ $EXTRA_FLAGS \ -dFPC_SOFT_FPUX80 \ diff --git a/.github/workflows/toolchain.yml b/.github/workflows/toolchain.yml index 9f0fd9d4..ff0ebba6 100644 --- a/.github/workflows/toolchain.yml +++ b/.github/workflows/toolchain.yml @@ -107,9 +107,8 @@ jobs: # Keep official package sources available for cross builds. The # cached cross toolchain only prebuilds a minimal package subset, so - # source-based lookup is needed for units like Base64 and RegExpr. + # source-based lookup is needed for units like Base64. 
cp -R "$GITHUB_WORKSPACE/fpc-source/packages/fcl-base" "$PREFIX/share/fpcsrc/packages/" - cp -R "$GITHUB_WORKSPACE/fpc-source/packages/regexpr" "$PREFIX/share/fpcsrc/packages/" cp -R "$GITHUB_WORKSPACE/fpc-source/packages/fcl-net" "$PREFIX/share/fpcsrc/packages/" cp -R "$GITHUB_WORKSPACE/fpc-source/packages/openssl" "$PREFIX/share/fpcsrc/packages/" diff --git a/docs/build-system.md b/docs/build-system.md index 8c1e7c6e..8041413e 100644 --- a/docs/build-system.md +++ b/docs/build-system.md @@ -433,7 +433,7 @@ It: 6. If the `FPC_TARGET_CPU` environment variable is set, prepends `-P` to the compiler arguments (used by CI to target x86_64 on Windows where the FPC package defaults to i386). 7. For the `tests` target, auto-discovers all `*.Test.pas` files in `source/units/` and `source/shared/`. -The GitHub Actions cross-compilation workflow uses a reduced cached FPC toolchain rather than a full target-side FCL install. It prebuilds the RTL, `rtl-objpas`, `rtl-generics`, and `fcl-process`, and also caches the official `fcl-base` and `regexpr` sources so cross builds can resolve units such as `Base64` and `RegExpr` on demand from the shipped FPC packages. +The GitHub Actions cross-compilation workflow uses a reduced cached FPC toolchain rather than a full target-side FCL install. It prebuilds the RTL, `rtl-objpas`, `rtl-generics`, and `fcl-process`, and also caches the official `fcl-base`, `fcl-net`, and `openssl` sources so cross builds can resolve units such as `Base64` on demand from the shipped FPC packages. ## Project Structure for Compilation diff --git a/docs/built-ins.md b/docs/built-ins.md index 8414b259..735ae2f6 100644 --- a/docs/built-ins.md +++ b/docs/built-ins.md @@ -236,7 +236,7 @@ RegExp is available as both `RegExp()` and `new RegExp()`. Regex literals (`/pat - When the replacer is a function and named groups are present, the `groups` object is passed as the last argument after `input`. 
- `String.prototype.match`, `matchAll`, `replace`, `replaceAll`, `search`, and `split` dispatch through the corresponding well-known symbol hooks, so custom protocol objects work as expected. - `matchAll()` returns a lazy iterator that advances matches on demand per the specification. -- The `u` flag enables Unicode-aware pattern matching. Unicode property escapes (`\p{Letter}`, `\P{ASCII}`, etc.) are expanded to equivalent character classes. Unicode code point escapes (`\u{41}`, `\u{1F600}`) are converted to UTF-8 byte sequences. Supported properties: `L`/`Letter`, `Lu`/`Uppercase_Letter`, `Ll`/`Lowercase_Letter`, `N`/`Number`, `Nd`/`Decimal_Number`, `P`/`Punctuation`, `S`/`Symbol`, `Z`/`Separator`, `Cc`/`Control`, `ASCII`, `ASCII_Hex_Digit`, `White_Space`. Unsupported properties throw `SyntaxError`. The `u` flag also disables TRegExpr's Russian charset extensions and enables correct `AdvanceStringIndex` for multi-byte UTF-8 sequences. +- The `u` flag enables Unicode-aware pattern matching. Unicode property escapes (`\p{Letter}`, `\P{ASCII}`, etc.) are matched against Unicode code point range tables. Unicode code point escapes (`\u{41}`, `\u{1F600}`) are converted to UTF-8 byte sequences. Supported properties: `L`/`Letter`, `Lu`/`Uppercase_Letter`, `Ll`/`Lowercase_Letter`, `N`/`Number`, `Nd`/`Decimal_Number`, `P`/`Punctuation`, `S`/`Symbol`, `Z`/`Separator`, `Cc`/`Control`, `ASCII`, `ASCII_Hex_Digit`, `White_Space`. Unsupported properties throw `SyntaxError`. The `u` flag enables correct `AdvanceStringIndex` for multi-byte UTF-8 sequences. - The `v` flag (Unicode sets) is accepted and exposed through `.flags` and `.unicodeSets`. The `u` and `v` flags are mutually exclusive. Full Unicode set notation and properties of strings in character classes are not yet implemented beyond basic `u` flag behavior. - The `d` flag (indices) is accepted and exposed through `.flags` and `.hasIndices`. Match indices are not yet populated. 
diff --git a/docs/decision-log.md b/docs/decision-log.md index 803c7a5f..6a8886ca 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -17,6 +17,8 @@ Chronological record of key architectural and implementation decisions, newest f --- +**2026-05-08** · `engine` — Replace TRegExpr with a purpose-built backtracking bytecode VM regex engine. TRegExpr used native call recursion for backtracking, causing SIGSEGV on inputs ~42K+ chars when combined with the evaluator's stack depth (#515). Three preprocessing passes papered over feature gaps: `(?s)` modifier scope leak, no named groups (two-pass rewrite), and inadequate Unicode (`\p{...}` expanded to ASCII approximations). New architecture: `Goccia.RegExp.Compiler.pas` (recursive-descent parser + bytecode emitter) and `Goccia.RegExp.VM.pas` (iterative dispatch loop with heap-allocated backtrack stack and always-on failure memoization). The compiler parses ES2026 regex grammar directly, handling named groups, Unicode property escapes, inline modifier groups, and backreferences natively — no preprocessing passes. The memoization cache records `(PC, InputPos)` failure states to prune exponential backtracking (e.g., `(a+)+b`). Configurable step limit (default 10M) throws `Error` instead of crashing. Removes the FPC `regexpr` package from the cross-compilation toolchain. Reuses `TextSemantics.pas` UTF-8 functions (`TryReadUTF8CodePoint`, `AdvanceUTF8StringIndex`, `CodePointToUTF8`, etc.) rather than reimplementing. Public API (`ExecuteRegExp` signature, `TGocciaRegExpMatchResult` record) unchanged; `Goccia.RegExp.Runtime.pas` and `Goccia.Builtins.GlobalRegExp.pas` unmodified. + **2026-05-05** · `parser` — Opt-in traditional `for(init; test; update)` loops (`--compat-traditional-for-loop`). Added behind a new compatibility flag for ECMAScript compatibility when porting legacy code, mirroring the existing `--compat-var` and `--compat-function` posture. 
The flag is off by default and ORed in by `--compat-all` so test262 (which always passes `--compat-all`) executes traditional `for(;;)` bodies that previously parser-warn-and-skipped — surfacing real engine gaps in unrelated areas (Atomics #541, Intl #542, BigInt postfix increment #540, etc.). `let`/`const` declarations in for-init create a per-iteration lexical environment per ES2026 §14.7.4.4, so closures captured during iteration N pin to that iteration's binding (the textbook `fns.push(() => i)` case yields `[0, 1, 2]`, not `[3, 3, 3]`). `var` declarations require both `--compat-var` and the new flag and share a single hoisted binding visible after the loop. The bytecode compiler reuses the counted-loop pattern from `CompileCountedForOf` for `for(let i = N; i <op> M; i++ | i--)` shapes (rejecting var/const, bodies that mutate the loop var, type annotations, and non-integer-literal cond RHS). `while` and `do...while` remain excluded — they have the same stub status but were intentionally split into a separate iteration. [language-tables.md](language-tables.md). **2026-05-04** · `testing` · [#513](https://github.com/frostney/GocciaScript/pull/513) — test262 conformance harness reframed around the standard tc39 convention. Previously the wrapper ran inside `GocciaTestRunner` and had to selectively hide / capture / restore the test-library globals (`expect`, `describe`, `test`, `runTests`, etc.) it registered, with failure capture leaning on an `undefined` sentinel that collided with thrown `undefined` and chunked-runner crashes that masked thousands of conformance failures as wrapper failures (#491 history). 
Replaced with: per-test `GocciaScriptLoaderBare` subprocess, stock tc39/test262 harness files read directly from the pinned checkout's `harness/` directory (with a small set of bundled adaptations under `scripts/test262_harness/` for stock files that depend on language features Goccia excludes by design or that work around specific engine bugs — see [test262.md § Bundled harness adaptations](test262.md#bundled-harness-adaptations)), exit-code + stdout-marker wire protocol identical to `test262-harness`/`eshost`/test262.fyi, and a thin TypeScript orchestrator (`scripts/run_test262_suite.ts`). No eligibility filter — every discovered test runs; per-test subprocess + `--timeout` + `--max-memory` bound the blast radius. Wrapper-template drift is now structurally impossible because the "template" is `harness + body` string concatenation. Wrapper-infra failures are classified separately and gated to zero in CI. Surfaced eleven engine bugs (all milestoned 0.8.0, all labeled `engine` per the architecture split where `engine` covers `TGocciaEngine` — language semantics + ECMAScript built-ins — and `runtime` is reserved for `TGocciaRuntime` host extensions like console/fetch/JSON5): [#514](https://github.com/frostney/GocciaScript/issues/514) (Iterator.concat SIGSEGV), [#515](https://github.com/frostney/GocciaScript/issues/515) (RegExp.test SIGSEGV), [#516](https://github.com/frostney/GocciaScript/issues/516) (`Reflect.construct` rejects function decls/exprs), [#517](https://github.com/frostney/GocciaScript/issues/517) (script-mode unattached call `this`), [#518](https://github.com/frostney/GocciaScript/issues/518) (bytecode VM Range-check on top-level `Promise.then` drain), [#519](https://github.com/frostney/GocciaScript/issues/519) (`Error.prototype.constructor` missing), [#520](https://github.com/frostney/GocciaScript/issues/520) (module arrow `this` lexical inheritance), [#521](https://github.com/frostney/GocciaScript/issues/521) (`var`/`function` shadowing built-in 
globals), [#522](https://github.com/frostney/GocciaScript/issues/522) (`String(obj)` doesn't invoke `toString`), [#523](https://github.com/frostney/GocciaScript/issues/523) (`yield*` accesses `.next` on null), [#524](https://github.com/frostney/GocciaScript/issues/524) (for-of re-fetches `iterator.next` each iteration). Each bundled-harness adaptation under `scripts/test262_harness/` references its tracking issue and is to be removed when the underlying engine bug is fixed. [test262.md](test262.md). diff --git a/scripts/run_test262_suite.ts b/scripts/run_test262_suite.ts index a31588a6..ffdbb4bb 100644 --- a/scripts/run_test262_suite.ts +++ b/scripts/run_test262_suite.ts @@ -74,8 +74,6 @@ const DEFAULT_JOBS = 4; // bug. Per docs/test262.md "Updating the contract", this list is the // only allowed form of test-skipping; no generic eligibility filter. const KNOWN_ENGINE_CRASHES = new Set([ - // SIGSEGV: RegExp.prototype.test trailing-input edge case. https://github.com/frostney/GocciaScript/issues/515 - "staging/sm/RegExp/test-trailing.js", ]); // --------------------------------------------------------------------------- diff --git a/source/units/Goccia.Builtins.GlobalRegExp.pas b/source/units/Goccia.Builtins.GlobalRegExp.pas index 7e633adb..dd423594 100644 --- a/source/units/Goccia.Builtins.GlobalRegExp.pas +++ b/source/units/Goccia.Builtins.GlobalRegExp.pas @@ -46,6 +46,26 @@ TGocciaGlobalRegExp = class(TGocciaBuiltin) const AThisValue: TGocciaValue): TGocciaValue; function RegExpSymbolSplit(const AArgs: TGocciaArgumentsCollection; const AThisValue: TGocciaValue): TGocciaValue; + function RegExpSourceGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpFlagsGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpGlobalGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function 
RegExpIgnoreCaseGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpMultilineGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpDotAllGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpUnicodeGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpStickyGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpUnicodeSetsGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; + function RegExpHasIndicesGetter(const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; public constructor Create(const AName: string; const AScope: TGocciaScope; const AThrowError: TGocciaThrowErrorCallback; @@ -63,6 +83,7 @@ implementation Goccia.Error.Messages, Goccia.Error.Suggestions, Goccia.GarbageCollector, + Goccia.RegExp.Engine, Goccia.RegExp.Runtime, Goccia.Utils, Goccia.Values.ArrayValue, @@ -315,6 +336,26 @@ constructor TGocciaGlobalRegExp.Create(const AName: string; Members.AddSymbolMethod(TGocciaSymbolValue.WellKnownSplit, '[Symbol.split]', RegExpSymbolSplit, 2, [pfConfigurable, pfWritable], [gmfNoFunctionPrototype]); + Members.AddAccessor(PROP_SOURCE, RegExpSourceGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_FLAGS, RegExpFlagsGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_GLOBAL, RegExpGlobalGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_IGNORE_CASE, RegExpIgnoreCaseGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_MULTILINE, RegExpMultilineGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_DOT_ALL, RegExpDotAllGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_UNICODE, RegExpUnicodeGetter, nil, + [pfConfigurable]); + 
Members.AddAccessor(PROP_STICKY, RegExpStickyGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_UNICODE_SETS, RegExpUnicodeSetsGetter, nil, + [pfConfigurable]); + Members.AddAccessor(PROP_HAS_INDICES, RegExpHasIndicesGetter, nil, + [pfConfigurable]); Members.AddSymbolDataProperty(TGocciaSymbolValue.WellKnownToStringTag, TGocciaStringLiteralValue.Create(CONSTRUCTOR_REGEXP), [pfConfigurable]); FPrototypeMembers := Members.ToDefinitions; @@ -353,7 +394,127 @@ constructor TGocciaGlobalRegExp.Create(const AName: string; AScope.DefineLexicalBinding(AName, FRegExpConstructor, dtConst, True); end; -// ES2026 §22.2.4.2 get RegExp [ @@species ] +function RequireRegExpThis(const AThisValue: TGocciaValue; + const AMethodName: string): TGocciaObjectValue; +begin + if not IsRegExpValue(AThisValue) then + ThrowTypeError(AMethodName + ' requires a RegExp object'); + Result := TGocciaObjectValue(AThisValue); +end; + +function GetRegExpInternalFlags(const AObj: TGocciaObjectValue): string; +begin + Result := AObj.GetProperty(PROP_FLAGS).ToStringLiteral.Value; +end; + +function TGocciaGlobalRegExp.RegExpSourceGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaStringLiteralValue.Create('(?:)')); + Result := RequireRegExpThis(AThisValue, 'get RegExp.prototype.source') + .GetProperty(PROP_SOURCE); +end; + +function TGocciaGlobalRegExp.RegExpFlagsGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaStringLiteralValue.Create('')); + Result := RequireRegExpThis(AThisValue, 'get RegExp.prototype.flags') + .GetProperty(PROP_FLAGS); +end; + +function TGocciaGlobalRegExp.RegExpGlobalGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + 
Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.global')), 'g')); +end; + +function TGocciaGlobalRegExp.RegExpIgnoreCaseGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.ignoreCase')), 'i')); +end; + +function TGocciaGlobalRegExp.RegExpMultilineGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.multiline')), 'm')); +end; + +function TGocciaGlobalRegExp.RegExpDotAllGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.dotAll')), 's')); +end; + +function TGocciaGlobalRegExp.RegExpUnicodeGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.unicode')), 'u')); +end; + +function TGocciaGlobalRegExp.RegExpStickyGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: 
TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.sticky')), 'y')); +end; + +function TGocciaGlobalRegExp.RegExpUnicodeSetsGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.unicodeSets')), 'v')); +end; + +function TGocciaGlobalRegExp.RegExpHasIndicesGetter( + const AArgs: TGocciaArgumentsCollection; + const AThisValue: TGocciaValue): TGocciaValue; +begin + if AThisValue = GetRegExpPrototype then + Exit(TGocciaUndefinedLiteralValue.UndefinedValue); + Result := TGocciaBooleanLiteralValue.Create( + HasRegExpFlag(GetRegExpInternalFlags( + RequireRegExpThis(AThisValue, 'get RegExp.prototype.hasIndices')), 'd')); +end; + function TGocciaGlobalRegExp.RegExpSpeciesGetter( const AArgs: TGocciaArgumentsCollection; const AThisValue: TGocciaValue): TGocciaValue; diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas new file mode 100644 index 00000000..25978076 --- /dev/null +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -0,0 +1,1533 @@ +unit Goccia.RegExp.Compiler; + +{$I Goccia.inc} + +interface + +uses + Goccia.RegExp.Engine; + +type + TRegExpOpCode = ( + RX_CHAR = 0, + RX_CHAR_CLASS = 1, + RX_CHAR_CLASS_NEG = 2, + RX_ANY = 3, + RX_SPLIT = 4, + RX_SPLIT_LAZY = 5, + RX_JUMP = 6, + RX_SAVE = 7, + RX_BACKREF = 8, + RX_ASSERT_START = 9, + RX_ASSERT_END = 10, + RX_ASSERT_WORD = 11, + RX_LOOKAHEAD = 12, + RX_LOOKBEHIND = 13, + RX_MATCH = 14, + RX_FAIL = 15 + ); + + TRegExpCharRange = record + Lo: Cardinal; + Hi: Cardinal; + end; + + 
TRegExpCharClass = record + Ranges: array of TRegExpCharRange; + end; + + TRegExpProgram = record + Code: array of UInt32; + CharClasses: array of TRegExpCharClass; + CaptureCount: Integer; + NamedGroups: TGocciaRegExpNamedGroups; + end; + +const + BACKREF_STRICT_FLAG = $800000; + BACKREF_ICASE_FLAG = $400000; + BACKREF_INDEX_MASK = $3FFFFF; + LOOK_NEGATED_FLAG = $800000; + LOOK_TARGET_MASK = $7FFFFF; + +function CompileRegExp(const APattern, AFlags: string): TRegExpProgram; +procedure ValidateRegExpPatternNew(const APattern, AFlags: string); + +implementation + +uses + Math, + SysUtils, + + TextSemantics; + +type + TModifierState = record + IgnoreCase: Boolean; + Multiline: Boolean; + DotAll: Boolean; + end; + + TRegExpCompiler = class + private + FPattern: string; + FPos: Integer; + FCode: array of UInt32; + FCodeLen: Integer; + FCharClasses: array of TRegExpCharClass; + FCaptureCount: Integer; + FNamedGroups: TGocciaRegExpNamedGroups; + FAltStack: array of Integer; + FAltStackDepth: Integer; + FModifier: TModifierState; + FUnicode: Boolean; + function Peek: Char; + function PeekAt(AOffset: Integer): Char; + function AtEnd: Boolean; + function Advance: Char; + function Match(C: Char): Boolean; + procedure Emit(AInstr: UInt32); + function EmitHole: Integer; + procedure PatchHole(AIndex: Integer; ATarget: Integer); + function CurrentPC: Integer; + function EncodeOp(AOp: TRegExpOpCode): UInt32; + function EncodeOpBx(AOp: TRegExpOpCode; ABx: Integer): UInt32; + function AddCharClass(const ARanges: array of TRegExpCharRange): Integer; + procedure CompilePattern; + procedure CompileDisjunction; + procedure CompileAlternative; + procedure CompileTerm; + procedure CompileAtom; + procedure CompileQuantifier(AAtomStart: Integer); + procedure CompileCharacterClass; + procedure CompileEscape(var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); + procedure CompileEscapeAtom; + procedure CompileGroup; + procedure CompileModifierGroup; + function ParseGroupName: 
string; + function ParseUnicodeEscape: Cardinal; + function ParseHexEscape(ADigits: Integer): Cardinal; + function ParseDecimalEscape: Integer; + procedure EmitCharMatch(ACodePoint: Cardinal); + procedure EmitCharClassRanges(const ARanges: array of TRegExpCharRange; + ARangeCount: Integer; ANegated: Boolean); + procedure AddBuiltinCharClass(AEscapeChar: Char; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); + procedure AddRange(var ARanges: array of TRegExpCharRange; var ARangeCount: Integer; ALo, AHi: Cardinal); + procedure EmitUnicodePropertyClass(const APropertyName: string; ANegated: Boolean); + procedure GetUnicodePropertyRanges(const APropertyName: string; var ARanges: array of TRegExpCharRange; var ARangeCount: Integer); + function ReadCodePoint: Cardinal; + procedure EnsureCodeCapacity(ANeeded: Integer); + procedure EmitBody(const ABody: array of UInt32; ALen: Integer); + procedure EmitBodyAt(const ABody: array of UInt32; ALen: Integer; + AOrigStart: Integer); + procedure ValidateNamedGroups; + procedure PreScanNamedGroups; + procedure InsertSplitAt(APos: Integer); + procedure EmitDuplicateNamedBackref(const AName: string; + AICaseFlag: Integer); + public + constructor Create(const APattern, AFlags: string); + function Compile: TRegExpProgram; + end; + +const + MAX_CHAR_RANGES = 512; + +constructor TRegExpCompiler.Create(const APattern, AFlags: string); +begin + inherited Create; + FPattern := APattern; + FPos := 1; + FCodeLen := 0; + SetLength(FCode, 256); + SetLength(FCharClasses, 0); + FCaptureCount := 0; + SetLength(FNamedGroups, 0); + SetLength(FAltStack, 64); + FAltStackDepth := 0; + FAltStack[0] := 0; + FModifier.IgnoreCase := HasRegExpFlag(AFlags, 'i'); + FModifier.Multiline := HasRegExpFlag(AFlags, 'm'); + FModifier.DotAll := HasRegExpFlag(AFlags, 's'); + FUnicode := HasRegExpFlag(AFlags, 'u') or HasRegExpFlag(AFlags, 'v'); +end; + +function TRegExpCompiler.Peek: Char; +begin + if FPos <= Length(FPattern) then + Result := 
FPattern[FPos] + else + Result := #0; +end; + +function TRegExpCompiler.PeekAt(AOffset: Integer): Char; +var + Idx: Integer; +begin + Idx := FPos + AOffset; + if (Idx >= 1) and (Idx <= Length(FPattern)) then + Result := FPattern[Idx] + else + Result := #0; +end; + +function TRegExpCompiler.AtEnd: Boolean; +begin + Result := FPos > Length(FPattern); +end; + +function TRegExpCompiler.Advance: Char; +begin + Result := Peek; + Inc(FPos); +end; + +function TRegExpCompiler.Match(C: Char): Boolean; +begin + if Peek = C then + begin + Inc(FPos); + Result := True; + end + else + Result := False; +end; + +procedure TRegExpCompiler.Emit(AInstr: UInt32); +begin + if FCodeLen >= Length(FCode) then + SetLength(FCode, FCodeLen * 2 + 16); + FCode[FCodeLen] := AInstr; + Inc(FCodeLen); +end; + +function TRegExpCompiler.EmitHole: Integer; +begin + Result := FCodeLen; + Emit(0); +end; + +procedure TRegExpCompiler.PatchHole(AIndex: Integer; ATarget: Integer); +var + Op: TRegExpOpCode; +begin + Op := TRegExpOpCode(FCode[AIndex] and $FF); + FCode[AIndex] := UInt32(Ord(Op)) or (UInt32(ATarget) shl 8); +end; + +function TRegExpCompiler.CurrentPC: Integer; +begin + Result := FCodeLen; +end; + +function TRegExpCompiler.EncodeOp(AOp: TRegExpOpCode): UInt32; +begin + Result := UInt32(Ord(AOp)); +end; + +function TRegExpCompiler.EncodeOpBx(AOp: TRegExpOpCode; ABx: Integer): UInt32; +begin + Result := UInt32(Ord(AOp)) or (UInt32(ABx) shl 8); +end; + +function TRegExpCompiler.AddCharClass( + const ARanges: array of TRegExpCharRange): Integer; +var + I: Integer; +begin + Result := Length(FCharClasses); + SetLength(FCharClasses, Result + 1); + SetLength(FCharClasses[Result].Ranges, Length(ARanges)); + for I := 0 to High(ARanges) do + FCharClasses[Result].Ranges[I] := ARanges[I]; +end; + + +procedure TRegExpCompiler.EmitCharMatch(ACodePoint: Cardinal); +var + Ranges: array[0..1] of TRegExpCharRange; + ClassIdx: Integer; + Lower, Upper: Cardinal; +begin + if FModifier.IgnoreCase then + begin + if 
(ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z')) then
+    begin
+      Lower := ACodePoint + 32;
+      Ranges[0].Lo := ACodePoint;
+      Ranges[0].Hi := ACodePoint;
+      Ranges[1].Lo := Lower;
+      Ranges[1].Hi := Lower;
+      ClassIdx := AddCharClass(Ranges);
+      Emit(EncodeOpBx(RX_CHAR_CLASS, ClassIdx));
+      Exit;
+    end;
+    if (ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z')) then
+    begin
+      Upper := ACodePoint - 32;
+      Ranges[0].Lo := Upper;
+      Ranges[0].Hi := Upper;
+      Ranges[1].Lo := ACodePoint;
+      Ranges[1].Hi := ACodePoint;
+      ClassIdx := AddCharClass(Ranges);
+      Emit(EncodeOpBx(RX_CHAR_CLASS, ClassIdx));
+      Exit;
+    end;
+    if FUnicode and (ACodePoint = $212A) then
+    begin
+      Ranges[0].Lo := Ord('K');
+      Ranges[0].Hi := Ord('K');
+      Ranges[1].Lo := Ord('k');
+      Ranges[1].Hi := Ord('k');
+      ClassIdx := AddCharClass(Ranges);
+      Emit(EncodeOpBx(RX_CHAR_CLASS, ClassIdx));
+      Exit;
+    end;
+  end;
+  Emit(EncodeOpBx(RX_CHAR, Integer(ACodePoint)));
+end;
+{ Stores the range ALo..AHi in ARanges[ARangeCount] and increments
+  ARangeCount. When ARangeCount has already reached Length(ARanges) the
+  range is dropped without any error.
+  NOTE(review): callers cannot tell that a range was dropped - confirm
+  that silent truncation is intended. }
+procedure TRegExpCompiler.AddRange(var ARanges: array of TRegExpCharRange;
+  var ARangeCount: Integer; ALo, AHi: Cardinal);
+begin
+  if ARangeCount >= Length(ARanges) then // table full: ignore the request
+    Exit;
+  ARanges[ARangeCount].Lo := ALo;
+  ARanges[ARangeCount].Hi := AHi;
+  Inc(ARangeCount);
+end;
+{ Adds the code point ranges of the class escape letter AEscapeChar
+  (d, D, w, W, s or S) to ARanges through AddRange. The upper-case letters
+  are written out as the complementary ranges up to $10FFFF, so the result
+  is always a positive list. Any other letter adds nothing. }
+procedure TRegExpCompiler.AddBuiltinCharClass(AEscapeChar: Char;
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+begin
+  case AEscapeChar of
+    'd':
+      AddRange(ARanges, ARangeCount, Ord('0'), Ord('9'));
+    'D': // everything except '0'..'9'
+      begin
+        AddRange(ARanges, ARangeCount, 0, Ord('0') - 1);
+        AddRange(ARanges, ARangeCount, Ord('9') + 1, $10FFFF);
+      end;
+    'w':
+      begin
+        AddRange(ARanges, ARangeCount, Ord('0'), Ord('9'));
+        AddRange(ARanges, ARangeCount, Ord('A'), Ord('Z'));
+        AddRange(ARanges, ARangeCount, Ord('_'), Ord('_'));
+        AddRange(ARanges, ARangeCount, Ord('a'), Ord('z'));
+      end;
+    'W': // the gaps between the four 'w' ranges
+      begin
+        AddRange(ARanges, ARangeCount, 0, Ord('0') - 1);
+        AddRange(ARanges, ARangeCount, Ord('9') + 1, Ord('A') - 1);
+        AddRange(ARanges, ARangeCount, Ord('Z') + 1, Ord('_') - 1);
+        AddRange(ARanges, ARangeCount, Ord('_') + 1, Ord('a') - 1);
+        AddRange(ARanges, ARangeCount, Ord('z') + 1, $10FFFF);
+      end;
+    's':
+      begin
+        AddRange(ARanges, ARangeCount, $09, $0D);
+        AddRange(ARanges, ARangeCount, $20, $20);
+        AddRange(ARanges, ARangeCount, $A0, $A0);
+        AddRange(ARanges, ARangeCount, $1680, $1680);
+        AddRange(ARanges, ARangeCount, $2000, $200A);
+        AddRange(ARanges, ARangeCount, $2028, $2029);
+        AddRange(ARanges, ARangeCount, $202F, $202F);
+        AddRange(ARanges, ARangeCount, $205F, $205F);
+        AddRange(ARanges, ARangeCount, $3000, $3000);
+        AddRange(ARanges, ARangeCount, $FEFF, $FEFF);
+      end;
+    'S': // the gaps between the 's' ranges
+      begin
+        AddRange(ARanges, ARangeCount, 0, $08);
+        AddRange(ARanges, ARangeCount, $0E, $1F);
+        AddRange(ARanges, ARangeCount, $21, $9F);
+        AddRange(ARanges, ARangeCount, $A1, $167F);
+        AddRange(ARanges, ARangeCount, $1681, $1FFF);
+        AddRange(ARanges, ARangeCount, $200B, $2027);
+        AddRange(ARanges, ARangeCount, $202A, $202E);
+        AddRange(ARanges, ARangeCount, $2030, $205E);
+        AddRange(ARanges, ARangeCount, $2060, $2FFF);
+        AddRange(ARanges, ARangeCount, $3001, $FEFE);
+        AddRange(ARanges, ARangeCount, $FF00, $10FFFF);
+      end;
+  end;
+end;
+{ Adds the hard-coded ranges for the Unicode property name or alias
+  APropertyName to ARanges. Names that match none of the branches raise
+  EConvertError with the text 'Invalid Unicode property name: ' plus the name.
+  NOTE(review): the tables look like approximations, for example N and Nd
+  both map only to $30..$39 - confirm this coverage is intended. }
+procedure TRegExpCompiler.GetUnicodePropertyRanges(const APropertyName: string;
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+begin
+  if (APropertyName = 'L') or (APropertyName = 'Letter') then
+  begin
+    AddRange(ARanges, ARangeCount, $41, $5A);
+    AddRange(ARanges, ARangeCount, $61, $7A);
+    AddRange(ARanges, ARangeCount, $C0, $D6);
+    AddRange(ARanges, ARangeCount, $D8, $F6);
+    AddRange(ARanges, ARangeCount, $F8, $2FF);
+    AddRange(ARanges, ARangeCount, $370, $37D);
+    AddRange(ARanges, ARangeCount, $37F, $1FFF);
+    AddRange(ARanges, ARangeCount, $200C, $200D);
+    AddRange(ARanges, ARangeCount, $2070, $218F);
+    AddRange(ARanges, ARangeCount, $2C00, $2FEF);
+    AddRange(ARanges, ARangeCount, $3001, $D7FF);
+    AddRange(ARanges, ARangeCount, $F900, $FDCF);
+    AddRange(ARanges, ARangeCount, $FDF0, $FFFD);
+    AddRange(ARanges, ARangeCount, $10000, $EFFFF);
+  end
+  else if (APropertyName = 'Lu') or (APropertyName = 'Uppercase_Letter') then
+  begin
+    AddRange(ARanges, ARangeCount, $41, $5A);
+    AddRange(ARanges, ARangeCount, $C0, $D6);
+    AddRange(ARanges, ARangeCount, $D8, $DE);
+  end
+  else if (APropertyName = 'Ll') or (APropertyName = 'Lowercase_Letter') then
+  begin
+    AddRange(ARanges, ARangeCount, $61, $7A);
+    AddRange(ARanges, ARangeCount, $DF, $F6);
+    AddRange(ARanges, ARangeCount, $F8, $FF);
+  end
+  else if (APropertyName = 'N') or (APropertyName = 'Number') then
+    AddRange(ARanges, ARangeCount, $30, $39)
+  else if (APropertyName = 'Nd') or (APropertyName = 'Decimal_Number') then
+    AddRange(ARanges, ARangeCount, $30, $39)
+  else if (APropertyName = 'P') or (APropertyName = 'Punctuation') then
+  begin
+    AddRange(ARanges, ARangeCount, $21, $23);
+    AddRange(ARanges, ARangeCount, $25, $2A);
+    AddRange(ARanges, ARangeCount, $2C, $2F);
+    AddRange(ARanges, ARangeCount, $3A, $3B);
+    AddRange(ARanges, ARangeCount, $3F, $40);
+    AddRange(ARanges, ARangeCount, $5B, $5D);
+    AddRange(ARanges, ARangeCount, $5F, $5F);
+    AddRange(ARanges, ARangeCount, $7B, $7B);
+    AddRange(ARanges, ARangeCount, $7D, $7D);
+  end
+  else if (APropertyName = 'S') or (APropertyName = 'Symbol') then
+  begin
+    AddRange(ARanges, ARangeCount, $24, $24);
+    AddRange(ARanges, ARangeCount, $2B, $2B);
+    AddRange(ARanges, ARangeCount, $3C, $3E);
+    AddRange(ARanges, ARangeCount, $5E, $5E);
+    AddRange(ARanges, ARangeCount, $60, $60);
+    AddRange(ARanges, ARangeCount, $7C, $7C);
+    AddRange(ARanges, ARangeCount, $7E, $7E);
+  end
+  else if (APropertyName = 'Z') or (APropertyName = 'Separator') then
+  begin
+    AddRange(ARanges, ARangeCount, $20, $20);
+    AddRange(ARanges, ARangeCount, $A0, $A0);
+    AddRange(ARanges, ARangeCount, $1680, $1680);
+    AddRange(ARanges, ARangeCount, $2000, $200A);
+    AddRange(ARanges, ARangeCount, $2028, $2029);
+    AddRange(ARanges, ARangeCount, $202F, $202F);
+    AddRange(ARanges, ARangeCount, $205F,
$205F);
+    AddRange(ARanges, ARangeCount, $3000, $3000);
+  end
+  else if (APropertyName = 'Cc') or (APropertyName = 'Control') then
+  begin
+    AddRange(ARanges, ARangeCount, $00, $1F);
+    AddRange(ARanges, ARangeCount, $7F, $9F);
+  end
+  else if APropertyName = 'ASCII' then
+    AddRange(ARanges, ARangeCount, $00, $7F)
+  else if APropertyName = 'ASCII_Hex_Digit' then
+  begin
+    AddRange(ARanges, ARangeCount, $30, $39);
+    AddRange(ARanges, ARangeCount, $41, $46);
+    AddRange(ARanges, ARangeCount, $61, $66);
+  end
+  else if APropertyName = 'White_Space' then
+  begin
+    AddRange(ARanges, ARangeCount, $09, $0D);
+    AddRange(ARanges, ARangeCount, $20, $20);
+    AddRange(ARanges, ARangeCount, $85, $85);
+    AddRange(ARanges, ARangeCount, $A0, $A0);
+    AddRange(ARanges, ARangeCount, $1680, $1680);
+    AddRange(ARanges, ARangeCount, $2000, $200A);
+    AddRange(ARanges, ARangeCount, $2028, $2029);
+    AddRange(ARanges, ARangeCount, $202F, $202F);
+    AddRange(ARanges, ARangeCount, $205F, $205F);
+    AddRange(ARanges, ARangeCount, $3000, $3000);
+  end
+  else
+    raise EConvertError.Create('Invalid Unicode property name: ' + APropertyName);
+end;
+
+// Looks up the ranges of a Unicode property and emits them as one character
+// class instruction, negated when ANegated is True.
+procedure TRegExpCompiler.EmitUnicodePropertyClass(const APropertyName: string;
+  ANegated: Boolean);
+var
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+begin
+  RangeCount := 0;
+  GetUnicodePropertyRanges(APropertyName, Ranges, RangeCount);
+  EmitCharClassRanges(Ranges, RangeCount, ANegated);
+end;
+
+// Registers the first ARangeCount entries of ARanges as a character class and
+// emits RX_CHAR_CLASS (or RX_CHAR_CLASS_NEG when ANegated is True) for it.
+//
+// When the ignore-case modifier is active, the ASCII letters covered by each
+// range are mirrored into the other case. The range is intersected with A-Z
+// and with a-z separately, so ranges that only partly overlap the letters
+// (for example 0-z or X-c) are folded too; previously only ranges lying
+// completely inside one of the two letter blocks were mirrored.
+procedure TRegExpCompiler.EmitCharClassRanges(
+  const ARanges: array of TRegExpCharRange;
+  ARangeCount: Integer; ANegated: Boolean);
+var
+  ClassIdx, I, OrigCount: Integer;
+  Op: TRegExpOpCode;
+  DynRanges: array of TRegExpCharRange;
+  SliceLo, SliceHi: Cardinal;
+
+  // Appends one range to the working copy.
+  procedure AppendRange(ALo, AHi: Cardinal);
+  begin
+    SetLength(DynRanges, Length(DynRanges) + 1);
+    DynRanges[High(DynRanges)].Lo := ALo;
+    DynRanges[High(DynRanges)].Hi := AHi;
+  end;
+
+begin
+  SetLength(DynRanges, ARangeCount);
+  for I := 0 to ARangeCount - 1 do
+    DynRanges[I] := ARanges[I];
+  if FModifier.IgnoreCase then
+  begin
+    // Only the caller's ranges are folded, not the ones appended below.
+    OrigCount := Length(DynRanges);
+    for I := 0 to OrigCount - 1 do
+    begin
+      // Part of the range inside A-Z -> add the lower-case counterpart.
+      SliceLo := DynRanges[I].Lo;
+      if SliceLo < Ord('A') then
+        SliceLo := Ord('A');
+      SliceHi := DynRanges[I].Hi;
+      if SliceHi > Ord('Z') then
+        SliceHi := Ord('Z');
+      if SliceLo <= SliceHi then
+        AppendRange(SliceLo + 32, SliceHi + 32);
+      // Part of the range inside a-z -> add the upper-case counterpart.
+      SliceLo := DynRanges[I].Lo;
+      if SliceLo < Ord('a') then
+        SliceLo := Ord('a');
+      SliceHi := DynRanges[I].Hi;
+      if SliceHi > Ord('z') then
+        SliceHi := Ord('z');
+      if SliceLo <= SliceHi then
+        AppendRange(SliceLo - 32, SliceHi - 32);
+    end;
+  end;
+  ClassIdx := AddCharClass(DynRanges);
+  if ANegated then
+    Op := RX_CHAR_CLASS_NEG
+  else
+    Op := RX_CHAR_CLASS;
+  Emit(EncodeOpBx(Op, ClassIdx));
+end;
+
+// Reads one pattern character. In unicode mode a multi-byte UTF-8 sequence
+// reported by TryReadUTF8CodePoint is consumed as a single code point; in
+// every other case the next Char is consumed and its ordinal returned.
+function TRegExpCompiler.ReadCodePoint: Cardinal;
+var
+  ByteLen: Integer;
+begin
+  if FUnicode and (FPos <= Length(FPattern)) then
+  begin
+    if TryReadUTF8CodePoint(FPattern, FPos, Result, ByteLen) and (ByteLen > 1) then
+    begin
+      Inc(FPos, ByteLen);
+      Exit;
+    end;
+  end;
+  Result := Ord(Advance);
+end;
+
+// Collects characters up to the closing '>' (which is consumed) and returns
+// them. Raises EConvertError when the pattern ends first.
+function TRegExpCompiler.ParseGroupName: string;
+var
+  C: Char;
+begin
+  Result := '';
+  while not AtEnd do
+  begin
+    C := Peek;
+    if C = '>' then
+    begin
+      Inc(FPos);
+      Exit;
+    end;
+    Result := Result + Advance;
+  end;
+  raise EConvertError.Create('Unterminated group name');
+end;
+
+// Consumes exactly ADigits hexadecimal digits and returns their value.
+// Raises EConvertError on a non-hex character or a premature end of pattern.
+function TRegExpCompiler.ParseHexEscape(ADigits: Integer): Cardinal;
+var
+  I: Integer;
+  C: Char;
+begin
+  Result := 0;
+  for I := 1 to ADigits do
+  begin
+    if AtEnd then
+      raise EConvertError.Create('Invalid hex escape');
+    C := Advance;
+    case C of
+      '0'..'9': Result := Result * 16 + Cardinal(Ord(C) - Ord('0'));
+      'a'..'f': Result := Result * 16 + Cardinal(Ord(C) - Ord('a') + 10);
+      'A'..'F': Result := Result * 16 + Cardinal(Ord(C) - Ord('A') + 10);
+    else
+      raise EConvertError.Create('Invalid hex escape');
+    end;
+  end;
+end;
+
+// Parses what follows a backslash-u: either a braced code point or four hex
+// digits, optionally followed by a second backslash-u low surrogate that is
+// combined with a leading high surrogate into one code point.
+//
+// Raises EConvertError for malformed, empty, unterminated or out-of-range
+// escapes. Changes compared with the previous implementation:
+//  * a braced escape without any digit is rejected instead of yielding 0;
+//  * the $10FFFF limit is checked after every digit, so a long digit string
+//    can no longer wrap around the Cardinal and slip through the final check;
+//  * after a high surrogate the following backslash-u is only consumed when
+//    four hex digits really follow, so a braced escape right after a lone
+//    high surrogate is left for the next atom instead of raising.
+function TRegExpCompiler.ParseUnicodeEscape: Cardinal;
+var
+  HighSurrogate, Digit: Cardinal;
+  DigitCount, SavedPos, I: Integer;
+  TrailIsHex: Boolean;
+begin
+  if Match('{') then
+  begin
+    Result := 0;
+    DigitCount := 0;
+    while not AtEnd and (Peek <> '}') do
+    begin
+      case Peek of
+        '0'..'9': Digit := Cardinal(Ord(Advance) - Ord('0'));
+        'a'..'f': Digit := Cardinal(Ord(Advance) - Ord('a') + 10);
+        'A'..'F': Digit := Cardinal(Ord(Advance) - Ord('A') + 10);
+      else
+        raise EConvertError.Create('Invalid Unicode escape');
+      end;
+      Result := Result * 16 + Digit;
+      Inc(DigitCount);
+      // Checked per digit: the value stays far below High(Cardinal).
+      if Result > $10FFFF then
+        raise EConvertError.Create('Unicode escape out of range');
+    end;
+    if not Match('}') then
+      raise EConvertError.Create('Unterminated Unicode escape');
+    if DigitCount = 0 then
+      raise EConvertError.Create('Invalid Unicode escape');
+    Exit;
+  end;
+  Result := ParseHexEscape(4);
+  if (Result >= $D800) and (Result <= $DBFF) then
+  begin
+    HighSurrogate := Result;
+    if (Peek = '\') and (PeekAt(1) = 'u') then
+    begin
+      // Look ahead before consuming anything.
+      // Assumes PeekAt yields a non-hex character past the end of the
+      // pattern, as the unguarded PeekAt calls in CompileTerm suggest
+      // - TODO confirm.
+      TrailIsHex := True;
+      for I := 2 to 5 do
+        if not CharInSet(PeekAt(I), ['0'..'9', 'a'..'f', 'A'..'F']) then
+        begin
+          TrailIsHex := False;
+          Break;
+        end;
+      if TrailIsHex then
+      begin
+        SavedPos := FPos;
+        Inc(FPos, 2);
+        Result := ParseHexEscape(4);
+        if (Result >= $DC00) and (Result <= $DFFF) then
+        begin
+          Result := $10000 + ((HighSurrogate - $D800) shl 10) +
+            (Result - $DC00);
+          Exit;
+        end;
+        // Not a low surrogate: hand the second escape back to the caller.
+        FPos := SavedPos;
+      end;
+    end;
+    Result := HighSurrogate;
+  end;
+end;
+
+// Parses a run of decimal digits. The value saturates at MAX_QUANTIFIER;
+// excess digits are still consumed so the caller sees the text after them.
+function TRegExpCompiler.ParseDecimalEscape: Integer;
+const
+  MAX_QUANTIFIER = 1000000;
+var
+  C: Char;
+begin
+  Result := 0;
+  while not AtEnd do
+  begin
+    C := Peek;
+    if (C < '0') or (C > '9') then
+      Break;
+    if Result <= MAX_QUANTIFIER then
+      Result := Result * 10 + (Ord(Advance) - Ord('0'))
+    else
+      Advance;
+  end;
+  if Result > MAX_QUANTIFIER then
+    Result := MAX_QUANTIFIER;
+end;
+
+// Emits a backreference to the group name AName. A name registered once
+// becomes a single RX_BACKREF; a name registered several times becomes a
+// chain that tries each candidate index in turn.
+procedure TRegExpCompiler.EmitDuplicateNamedBackref(const AName: string;
+  AICaseFlag: Integer);
+var
+  Indices: array of Integer;
+  Count, I: Integer;
+  SplitHole: Integer;
+  JumpHoles: array of Integer;
+  JumpCount: Integer;
+begin
+  Count := 0;
+  SetLength(Indices, Length(FNamedGroups));
+  for I := 0 to High(FNamedGroups) do
+    if FNamedGroups[I].Name = AName then
+    begin
+      Indices[Count] := FNamedGroups[I].Index;
+      Inc(Count);
+    end;
+  SetLength(Indices, Count);
+  if Count = 1 then
+  begin
+    Emit(EncodeOpBx(RX_BACKREF, Indices[0] or AICaseFlag));
+    Exit;
+  end;
+  JumpCount := 0;
+  SetLength(JumpHoles, Count + 1);
+  for I := 0 to Count - 1 do
+  begin
+    SplitHole := CurrentPC;
+    Emit(EncodeOpBx(RX_SPLIT, 0));
+
Emit(EncodeOpBx(RX_BACKREF, Indices[I] or BACKREF_STRICT_FLAG or AICaseFlag));
+    JumpHoles[JumpCount] := CurrentPC;
+    Inc(JumpCount);
+    Emit(0);
+    PatchHole(SplitHole, CurrentPC);
+  end;
+  Emit(EncodeOp(RX_FAIL));
+  for I := 0 to JumpCount - 1 do
+    FCode[JumpHoles[I]] := EncodeOpBx(RX_JUMP, CurrentPC);
+end;
+
+// Compiles the escape that follows a backslash outside a character class
+// (CompileAtom has already consumed the backslash): class escapes, word
+// boundary assertions, Unicode properties, named and numbered
+// backreferences, control and hex/unicode escapes, and identity escapes.
+// NOTE(review): the numeric backreference accumulates digits without an upper
+// bound and is not checked against the number of groups - confirm the VM
+// tolerates indices beyond the capture count.
+procedure TRegExpCompiler.CompileEscapeAtom;
+var
+  C: Char;
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+  PropertyName: string;
+  Negated: Boolean;
+  GroupName: string;
+  BackrefIdx, I, GroupCount, BackrefICaseFlag: Integer;
+  CodePoint: Cardinal;
+begin
+  if FModifier.IgnoreCase then
+    BackrefICaseFlag := BACKREF_ICASE_FLAG
+  else
+    BackrefICaseFlag := 0;
+  C := Advance;
+  case C of
+    'd', 'D', 'w', 'W', 's', 'S':
+      begin
+        RangeCount := 0;
+        AddBuiltinCharClass(C, Ranges, RangeCount);
+        EmitCharClassRanges(Ranges, RangeCount, False);
+      end;
+    'b':
+      Emit(EncodeOpBx(RX_ASSERT_WORD, 0));
+    'B':
+      Emit(EncodeOpBx(RX_ASSERT_WORD, 1));
+    'p', 'P':
+      begin
+        if FUnicode and Match('{') then
+        begin
+          Negated := C = 'P';
+          PropertyName := '';
+          while not AtEnd and (Peek <> '}') do
+            PropertyName := PropertyName + Advance;
+          if not Match('}') then
+            raise EConvertError.Create('Unterminated Unicode property escape');
+          EmitUnicodePropertyClass(PropertyName, Negated);
+        end
+        else
+          EmitCharMatch(Ord(C));
+      end;
+    'k':
+      begin
+        if Match('<') then
+        begin
+          GroupName := ParseGroupName;
+          BackrefIdx := -1;
+          GroupCount := 0;
+          for I := 0 to High(FNamedGroups) do
+            if FNamedGroups[I].Name = GroupName then
+            begin
+              if BackrefIdx < 0 then
+                BackrefIdx := FNamedGroups[I].Index;
+              Inc(GroupCount);
+            end;
+          if BackrefIdx < 0 then
+            raise EConvertError.Create(
+              'Invalid named backreference: ' + GroupName);
+          if GroupCount <= 1 then
+            Emit(EncodeOpBx(RX_BACKREF, BackrefIdx or BackrefICaseFlag))
+          else
+            EmitDuplicateNamedBackref(GroupName, BackrefICaseFlag);
+        end
+        else
+          EmitCharMatch(Ord('k'));
+      end;
+    '1'..'9':
+      begin
+        BackrefIdx := Ord(C) - Ord('0');
+        while not AtEnd and (Peek >= '0') and (Peek <= '9') do
+          BackrefIdx := BackrefIdx * 10 + (Ord(Advance) - Ord('0'));
+        Emit(EncodeOpBx(RX_BACKREF, BackrefIdx or BackrefICaseFlag));
+      end;
+    'n': EmitCharMatch($0A);
+    'r': EmitCharMatch($0D);
+    't': EmitCharMatch($09);
+    'v': EmitCharMatch($0B);
+    'f': EmitCharMatch($0C);
+    '0':
+      begin
+        if not AtEnd and (Peek >= '0') and (Peek <= '9') then
+          EmitCharMatch(Ord(C))
+        else
+          EmitCharMatch(0);
+      end;
+    'x': EmitCharMatch(ParseHexEscape(2));
+    'u': EmitCharMatch(ParseUnicodeEscape);
+    'c':
+      begin
+        if not AtEnd and (((Peek >= 'a') and (Peek <= 'z')) or
+          ((Peek >= 'A') and (Peek <= 'Z'))) then
+          EmitCharMatch(Ord(Advance) mod 32)
+        else if FUnicode then
+          raise EConvertError.Create(
+            'Invalid regular expression: invalid control escape in unicode mode')
+        else
+          EmitCharMatch(Ord('c'));
+      end;
+    else
+      if FUnicode and not CharInSet(C, ['/', '^', '$', '\', '.', '*', '+',
+        '?', '(', ')', '[', ']', '{', '}', '|']) then
+        raise EConvertError.Create(
+          'Invalid regular expression: invalid escape in unicode mode')
+      else
+        EmitCharMatch(Ord(C));
+  end;
+end;
+
+// Compiles the escape that follows a backslash inside a character class (the
+// caller has already consumed the backslash) and appends the resulting
+// range or ranges to ARanges.
+// Raises EConvertError for malformed escapes, for a negated property escape
+// and, in unicode mode, for escapes that are not allowed there.
+procedure TRegExpCompiler.CompileEscape(
+  var ARanges: array of TRegExpCharRange; var ARangeCount: Integer);
+var
+  C: Char;
+  PropertyName: string;
+  CodePoint: Cardinal;
+begin
+  C := Advance;
+  case C of
+    'd', 'D', 'w', 'W', 's', 'S':
+      AddBuiltinCharClass(C, ARanges, ARangeCount);
+    'n': AddRange(ARanges, ARangeCount, $0A, $0A);
+    'r': AddRange(ARanges, ARangeCount, $0D, $0D);
+    't': AddRange(ARanges, ARangeCount, $09, $09);
+    'v': AddRange(ARanges, ARangeCount, $0B, $0B);
+    'f': AddRange(ARanges, ARangeCount, $0C, $0C);
+    '0':
+      begin
+        if not AtEnd and (Peek >= '0') and (Peek <= '9') then
+          AddRange(ARanges, ARangeCount, Ord(C), Ord(C))
+        else
+          AddRange(ARanges, ARangeCount, 0, 0);
+      end;
+    'x':
+      begin
+        CodePoint := ParseHexEscape(2);
+        AddRange(ARanges, ARangeCount, CodePoint, CodePoint);
+      end;
+    'u':
+      begin
+        CodePoint := ParseUnicodeEscape;
+        AddRange(ARanges, ARangeCount, CodePoint, CodePoint);
+      end;
+    'p', 'P':
+      begin
+        if FUnicode and Match('{') then
+        begin
+          PropertyName := '';
+          while not AtEnd and (Peek <> '}') do
+            PropertyName := PropertyName + Advance;
+          if not Match('}') then
+            raise EConvertError.Create('Unterminated Unicode property escape');
+          if C = 'P' then
+            raise EConvertError.Create(
+              'Negated Unicode property escape \\P{...} is not supported inside character classes')
+          else
+            GetUnicodePropertyRanges(PropertyName, ARanges, ARangeCount);
+        end
+        else
+          AddRange(ARanges, ARangeCount, Ord(C), Ord(C));
+      end;
+    'b':
+      AddRange(ARanges, ARangeCount, $08, $08);
+    'c':
+      begin
+        if not AtEnd and (((Peek >= 'a') and (Peek <= 'z')) or
+          ((Peek >= 'A') and (Peek <= 'Z'))) then
+        begin
+          // Read the control letter exactly once. The old code passed
+          // Ord(Advance) and Ord(FPattern[FPos - 1]) as two arguments of the
+          // same call, which is only correct when arguments happen to be
+          // evaluated left to right.
+          CodePoint := Cardinal(Ord(Advance) mod 32);
+          AddRange(ARanges, ARangeCount, CodePoint, CodePoint);
+        end
+        else if FUnicode then
+          raise EConvertError.Create(
+            'Invalid regular expression: invalid control escape in unicode mode')
+        else
+          AddRange(ARanges, ARangeCount, Ord('c'), Ord('c'));
+      end;
+    else
+      if FUnicode and not CharInSet(C, ['/', '^', '$', '\', '.', '*', '+',
+        '?', '(', ')', '[', ']', '{', '}', '|', '-']) then
+        raise EConvertError.Create(
+          'Invalid regular expression: invalid escape in unicode mode')
+      else
+        AddRange(ARanges, ARangeCount, Ord(C), Ord(C));
+  end;
+end;
+
+// Compiles a bracketed character class (CompileAtom has already consumed the
+// opening bracket) and emits one class instruction for it.
+// Raises EConvertError for an unterminated class, for a range whose bounds
+// are out of order and, in unicode mode, for a class escape used as the
+// upper bound of a range.
+//
+// Changes compared with the previous implementation:
+//  * an escape that denotes a single code point may now open a range, so a
+//    class written as backslash-x41, dash, backslash-x5A covers A to Z
+//    instead of only the three characters 'A', '-' and 'Z';
+//  * a class escape after the dash (for example a, dash, backslash-w) no
+//    longer borrows the Lo of the last added range as upper bound: outside
+//    unicode mode the lower bound and the dash become literals next to the
+//    class, in unicode mode it is an error;
+//  * the unused local C is gone.
+procedure TRegExpCompiler.CompileCharacterClass;
+var
+  Ranges: array[0..MAX_CHAR_RANGES - 1] of TRegExpCharRange;
+  RangeCount: Integer;
+  Negated: Boolean;
+  Lo, Hi: Cardinal;
+  SavePos: Integer;
+  IsSinglePoint: Boolean;
+begin
+  Negated := Match('^');
+  RangeCount := 0;
+  while not AtEnd and (Peek <> ']') do
+  begin
+    if Peek = '\' then
+    begin
+      SavePos := RangeCount;
+      Inc(FPos);
+      CompileEscape(Ranges, RangeCount);
+      // Only an escape that produced exactly one single-code-point entry
+      // can be the lower bound of a range.
+      IsSinglePoint := False;
+      if RangeCount = SavePos + 1 then
+        IsSinglePoint := Ranges[SavePos].Lo = Ranges[SavePos].Hi;
+      if not IsSinglePoint then
+        Continue;
+      if AtEnd or (Peek <> '-') or (PeekAt(1) = ']') then
+        Continue;
+      // Take the provisional entry back; it is re-added as a full range.
+      Lo := Ranges[SavePos].Lo;
+      RangeCount := SavePos;
+    end
+    else
+    begin
+      Lo := ReadCodePoint;
+      if AtEnd or (Peek <> '-') or (PeekAt(1) = ']') then
+      begin
+        AddRange(Ranges, RangeCount, Lo, Lo);
+        Continue;
+      end;
+    end;
+    // Lo is followed by a dash that is not the last character of the class.
+    Inc(FPos);
+    if Peek = '\' then
+    begin
+      SavePos := RangeCount;
+      Inc(FPos);
+      CompileEscape(Ranges, RangeCount);
+      // Nothing recorded means the table is full; keep the historic
+      // behaviour of silently dropping the range.
+      if RangeCount = SavePos then
+        Continue;
+      IsSinglePoint := False;
+      if RangeCount = SavePos + 1 then
+        IsSinglePoint := Ranges[SavePos].Lo = Ranges[SavePos].Hi;
+      if IsSinglePoint then
+      begin
+        Hi := Ranges[SavePos].Lo;
+        RangeCount := SavePos;
+        if Lo > Hi then
+          raise EConvertError.Create(
+            'Invalid regular expression: range out of order in character class');
+        AddRange(Ranges, RangeCount, Lo, Hi);
+      end
+      else if FUnicode then
+        raise EConvertError.Create(
+          'Invalid regular expression: character class escape cannot end a range in unicode mode')
+      else
+      begin
+        // Legacy syntax: keep the class ranges and treat the lower bound
+        // and the dash as two literal characters.
+        AddRange(Ranges, RangeCount, Lo, Lo);
+        AddRange(Ranges, RangeCount, Ord('-'), Ord('-'));
+      end;
+    end
+    else
+    begin
+      Hi := ReadCodePoint;
+      if Lo > Hi then
+        raise EConvertError.Create(
+          'Invalid regular expression: range out of order in character class');
+      AddRange(Ranges, RangeCount, Lo, Hi);
+    end;
+  end;
+  if not Match(']') then
+    raise EConvertError.Create('Unterminated character class');
+  EmitCharClassRanges(Ranges, RangeCount, Negated);
+end;
+
+// Compiles an inline modifier group; CompileGroup calls it with FPos on the
+// first flag character after the opening parenthesis and question mark.
+procedure TRegExpCompiler.CompileModifierGroup;
+var
+  C: Char;
+  EnableFlags, DisableFlags: string;
+  InDisable: Boolean;
+  SavedModifier: TModifierState;
+begin
+  EnableFlags := '';
+  DisableFlags := '';
+  InDisable := False;
+  while not AtEnd and (Peek <> ':') and (Peek <> ')') do
+  begin
+    C := Advance;
+    if C = '-' then
+    begin
+      if InDisable then
+        raise EConvertError.Create(
+          'Invalid regular expression: unexpected - in modifier group');
+      InDisable := True;
+      Continue;
+    end;
+    if not CharInSet(C, ['i', 'm', 's']) then
+      raise EConvertError.CreateFmt(
+        'Invalid regular expression: ''%s'' is not a valid modifier flag', [C]);
+    if InDisable then
+    begin
+      if Pos(C, DisableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular expression: duplicate modifier flag ''%s''', [C]);
+      if Pos(C, EnableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular expression: ''%s'' in both enable and disable', [C]);
+      DisableFlags := DisableFlags + C;
+    end
+    else
+    begin
+      if Pos(C, EnableFlags) > 0 then
+        raise EConvertError.CreateFmt(
+          'Invalid regular expression: duplicate modifier flag ''%s''', [C]);
+      EnableFlags := EnableFlags + C;
+    end;
+  end;
+  if (EnableFlags = '') and (DisableFlags = '') then
+    raise EConvertError.Create(
+      'Invalid regular expression: modifier group must
enable or disable at least one flag');
+  if not Match(':') then
+    raise EConvertError.Create(
+      'Invalid regular expression: modifier group must use (?flags:...) syntax');
+  SavedModifier := FModifier; // remembered so the flags only apply inside this group
+  if Pos('i', EnableFlags) > 0 then FModifier.IgnoreCase := True;
+  if Pos('m', EnableFlags) > 0 then FModifier.Multiline := True;
+  if Pos('s', EnableFlags) > 0 then FModifier.DotAll := True;
+  if Pos('i', DisableFlags) > 0 then FModifier.IgnoreCase := False;
+  if Pos('m', DisableFlags) > 0 then FModifier.Multiline := False;
+  if Pos('s', DisableFlags) > 0 then FModifier.DotAll := False;
+  CompileDisjunction;
+  if not Match(')') then
+    raise EConvertError.Create('Unterminated modifier group');
+  FModifier := SavedModifier; // back to the modifiers of the enclosing scope
+end;
+{ Compiles a group; the caller has already consumed the opening parenthesis.
+  Handles non-capturing groups, positive and negative lookahead and
+  lookbehind, named capture groups, inline modifier groups and plain
+  capturing groups. Every branch compiles the inner disjunction and raises
+  EConvertError when the closing parenthesis is missing; an unknown
+  character after the question mark raises 'Invalid group syntax'.
+  The alternation depth counter FAltStackDepth is incremented on entry and
+  decremented again on normal exit.
+  NOTE(review): SaveAltDepth, I, JumpHole and IsNegative are never used,
+  LookStart and GroupName are only assigned - candidates for removal. }
+procedure TRegExpCompiler.CompileGroup;
+var
+  SaveAltDepth: Integer;
+  GroupName: string;
+  CaptureIdx, I: Integer;
+  SplitHole, JumpHole: Integer;
+  LookStart: Integer;
+  IsNegative: Boolean;
+begin
+  Inc(FAltStackDepth);
+  if FAltStackDepth >= Length(FAltStack) then
+    SetLength(FAltStack, FAltStackDepth * 2 + 4);
+  FAltStack[FAltStackDepth] := 0; // alternative counter for this nesting level starts at zero
+  if Match('?') then
+  begin
+    if Match(':') then
+    begin
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated non-capturing group');
+    end
+    else if Match('=') then
+    begin
+      SplitHole := EmitHole; // reserved slot, rewritten below once the end PC is known
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0);
+      LookStart := CurrentPC;
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated lookahead');
+      Emit(EncodeOp(RX_MATCH)); // terminates the lookaround body
+      PatchHole(SplitHole, CurrentPC);
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC);
+    end
+    else if Match('!') then
+    begin
+      SplitHole := EmitHole;
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0);
+      CompileDisjunction;
+      if not Match(')') then
+        raise EConvertError.Create('Unterminated negative lookahead');
+      Emit(EncodeOp(RX_MATCH));
+      PatchHole(SplitHole, CurrentPC);
+      FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, CurrentPC or LOOK_NEGATED_FLAG);
+    end
+    else if Match('<') then
+    begin
+      if Match('=') then
+      begin
+        SplitHole := EmitHole;
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0);
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated lookbehind');
+        Emit(EncodeOp(RX_MATCH));
+        PatchHole(SplitHole, CurrentPC);
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, CurrentPC);
+      end
+      else if Match('!') then
+      begin
+        SplitHole := EmitHole;
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0);
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated negative lookbehind');
+        Emit(EncodeOp(RX_MATCH));
+        PatchHole(SplitHole, CurrentPC);
+        FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, CurrentPC or LOOK_NEGATED_FLAG);
+      end
+      else
+      begin
+        GroupName := ParseGroupName; // consumes the name and its closing angle bracket
+        Inc(FCaptureCount);
+        CaptureIdx := FCaptureCount;
+        Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)); // slot 2n: start of capture n
+        CompileDisjunction;
+        if not Match(')') then
+          raise EConvertError.Create('Unterminated named capture group');
+        Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)); // slot 2n+1: end of capture n
+      end;
+    end
+    else if CharInSet(Peek, ['i', 'm', 's', '-']) then
+    begin
+      CompileModifierGroup;
+    end
+    else
+      raise EConvertError.Create('Invalid group syntax');
+  end
+  else
+  begin
+    Inc(FCaptureCount);
+    CaptureIdx := FCaptureCount;
+    Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2));
+    CompileDisjunction;
+    if not Match(')') then
+      raise EConvertError.Create('Unterminated capturing group');
+    Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1));
+  end;
+  if FAltStackDepth > 0 then
+    Dec(FAltStackDepth);
+end;
+{ Compiles one atom at the current position: a group, a character class,
+  the dot, a start or end assertion, a backslash escape, or a literal
+  character. }
+procedure TRegExpCompiler.CompileAtom;
+var
+  C: Char;
+  CodePoint: Cardinal;
+begin
+  C := Peek;
+  case C of
+    '(':
+      begin
+        Inc(FPos);
+        CompileGroup;
+      end;
+    '[':
+      begin
+        Inc(FPos);
+        CompileCharacterClass;
+      end;
+    '.':
+      begin
+        Inc(FPos);
+        if FModifier.DotAll then // operand 1 = dot-all modifier active
+          Emit(EncodeOpBx(RX_ANY, 1))
+        else
+          Emit(EncodeOpBx(RX_ANY, 0));
+      end;
+    '^':
+      begin
+        Inc(FPos);
+        if FModifier.Multiline then // operand 1 = multiline modifier active
+
Emit(EncodeOpBx(RX_ASSERT_START, 1))
+        else
+          Emit(EncodeOpBx(RX_ASSERT_START, 0));
+      end;
+    '$':
+      begin
+        Inc(FPos);
+        if FModifier.Multiline then
+          Emit(EncodeOpBx(RX_ASSERT_END, 1))
+        else
+          Emit(EncodeOpBx(RX_ASSERT_END, 0));
+      end;
+    '\':
+      begin
+        Inc(FPos);
+        if AtEnd then
+          raise EConvertError.Create(
+            'Invalid regular expression: \ at end of pattern');
+        CompileEscapeAtom;
+      end;
+    else
+      begin
+        if FUnicode then
+        begin
+          CodePoint := ReadCodePoint;
+          EmitCharMatch(CodePoint);
+        end
+        else
+        begin
+          Inc(FPos);
+          EmitCharMatch(Ord(C));
+        end;
+      end;
+  end;
+end;
+{ Grows FCode when FCodeLen + ANeeded would reach its current length; the
+  new length is twice the required size plus 16. }
+procedure TRegExpCompiler.EnsureCodeCapacity(ANeeded: Integer);
+begin
+  if FCodeLen + ANeeded >= Length(FCode) then
+    SetLength(FCode, (FCodeLen + ANeeded) * 2 + 16);
+end;
+{ Appends ALen instructions of ABody, treating PC 0 as the position the body
+  was originally compiled at. }
+procedure TRegExpCompiler.EmitBody(const ABody: array of UInt32; ALen: Integer);
+begin
+  EmitBodyAt(ABody, ALen, 0);
+end;
+{ Appends ALen instructions of ABody to the end of FCode. AOrigStart is the
+  PC the body was originally compiled at; when the copy lands somewhere else,
+  the targets of split, lazy split and jump instructions and the target bits
+  of lookahead and lookbehind instructions inside the copy are shifted by
+  the same distance. Other instructions are copied unchanged.
+  NOTE(review): ABody[0] is read even when ALen is 0 - the only caller in
+  view, CompileQuantifier, returns early for an empty body. }
+procedure TRegExpCompiler.EmitBodyAt(const ABody: array of UInt32;
+  ALen: Integer; AOrigStart: Integer);
+var
+  DstStart, Delta, J: Integer;
+  Op: TRegExpOpCode;
+  Bx: Integer;
+  NegFlag: Integer;
+begin
+  EnsureCodeCapacity(ALen);
+  DstStart := FCodeLen;
+  Move(ABody[0], FCode[DstStart], ALen * SizeOf(UInt32));
+  Delta := DstStart - AOrigStart;
+  if Delta <> 0 then
+  begin
+    for J := DstStart to DstStart + ALen - 1 do
+    begin
+      Op := TRegExpOpCode(FCode[J] and $FF); // opcode lives in the low byte, operand above it
+      case Op of
+        RX_SPLIT, RX_SPLIT_LAZY, RX_JUMP:
+          begin
+            Bx := Integer(FCode[J] shr 8);
+            Inc(Bx, Delta);
+            FCode[J] := EncodeOpBx(Op, Bx);
+          end;
+        RX_LOOKAHEAD, RX_LOOKBEHIND:
+          begin
+            Bx := Integer(FCode[J] shr 8);
+            NegFlag := Bx and LOOK_NEGATED_FLAG; // keep the negation bit out of the arithmetic
+            Bx := (Bx and LOOK_TARGET_MASK) + Delta;
+            FCode[J] := EncodeOpBx(Op, Bx or NegFlag);
+          end;
+      end;
+    end;
+  end;
+  Inc(FCodeLen, ALen);
+end;
+{ Parses an optional quantifier after the atom whose code starts at
+  AAtomStart: star, plus, question mark or a braced count, each optionally
+  followed by a question mark that selects the lazy split instruction.
+  The atom code is cut out of FCode and re-emitted: MinCount mandatory
+  copies, then either one looping copy (no upper bound) or
+  MaxCount - MinCount optional copies.
+  Returns without touching the code when no quantifier follows or when the
+  atom produced no code. Raises EConvertError when the minimum of a braced
+  count exceeds its maximum.
+  NOTE(review): when an opening brace does not start a well-formed count,
+  FPos is left on the brace; CompileTerm rejects a term that starts with a
+  quantifier character, so the next term raises 'nothing to repeat' -
+  confirm that literal braces are meant to be rejected. }
+procedure TRegExpCompiler.CompileQuantifier(AAtomStart: Integer);
+var
+  SplitPC: Integer;
+  MinCount, MaxCount, I: Integer;
+  Lazy: Boolean;
+  C: Char;
+  BodyLen: Integer;
+  BodyCode: array of UInt32;
+  SavePos: Integer;
+begin
+  if AtEnd then
+    Exit;
+  C := Peek;
+  MinCount := -1;
+  MaxCount := -1;
+  SavePos := FPos;
+  case C of
+    '*': begin MinCount := 0; MaxCount := -1; Inc(FPos); end; // MaxCount -1 means unbounded
+    '+': begin MinCount := 1; MaxCount := -1; Inc(FPos); end;
+    '?': begin MinCount := 0; MaxCount := 1; Inc(FPos); end;
+    '{':
+      begin
+        Inc(FPos);
+        if AtEnd or not CharInSet(Peek, ['0'..'9']) then
+        begin
+          Dec(FPos); // not a count: step back onto the brace
+          Exit;
+        end;
+        MinCount := ParseDecimalEscape;
+        if Match(',') then
+        begin
+          if Peek = '}' then
+            MaxCount := -1
+          else
+            MaxCount := ParseDecimalEscape;
+        end
+        else
+          MaxCount := MinCount;
+        if not Match('}') then
+        begin
+          FPos := SavePos; // malformed count: rewind to the brace
+          Exit;
+        end;
+        if (MaxCount >= 0) and (MinCount > MaxCount) then
+          raise EConvertError.Create(
+            'Invalid regular expression: numbers out of order in quantifier');
+      end;
+    else
+      Exit;
+  end;
+  Lazy := Match('?');
+  BodyLen := CurrentPC - AAtomStart;
+  if BodyLen = 0 then
+    Exit;
+  SetLength(BodyCode, BodyLen);
+  Move(FCode[AAtomStart], BodyCode[0], BodyLen * SizeOf(UInt32));
+  FCodeLen := AAtomStart; // discard the atom code; copies are re-emitted below
+  for I := 1 to MinCount do
+    EmitBodyAt(BodyCode, BodyLen, AAtomStart);
+  if MaxCount = -1 then
+  begin
+    SplitPC := CurrentPC;
+    if Lazy then
+      Emit(EncodeOpBx(RX_SPLIT_LAZY, 0))
+    else
+      Emit(EncodeOpBx(RX_SPLIT, 0));
+    EmitBodyAt(BodyCode, BodyLen, AAtomStart);
+    Emit(EncodeOpBx(RX_JUMP, SplitPC)); // loop back to the split
+    PatchHole(SplitPC, CurrentPC);
+  end
+  else
+  begin
+    for I := MinCount + 1 to MaxCount do
+    begin
+      SplitPC := CurrentPC;
+      if Lazy then
+        Emit(EncodeOpBx(RX_SPLIT_LAZY, 0))
+      else
+        Emit(EncodeOpBx(RX_SPLIT, 0));
+      EmitBodyAt(BodyCode, BodyLen, AAtomStart);
+      PatchHole(SplitPC, CurrentPC);
+    end;
+  end;
+end;
+{ True when C is one of the four characters that can start a quantifier. }
+function IsQuantifierChar(C: Char): Boolean; inline;
+begin
+  Result := (C = '*') or (C = '+') or (C = '?') or (C = '{');
+end;
+{ Compiles one term: an atom followed by an optional quantifier. A term that
+  starts with a quantifier character is rejected, and in unicode mode so is
+  a quantifier directly after an assertion. }
+procedure TRegExpCompiler.CompileTerm;
+var
+  AtomStart: Integer;
+  C: Char;
+  IsAssertion: Boolean;
+begin
+  C := Peek;
+  if IsQuantifierChar(C) then
+    raise EConvertError.Create('Invalid regular expression:
nothing to repeat');
+  IsAssertion := (C = '^') or (C = '$') or
+    ((C = '\') and ((PeekAt(1) = 'b') or (PeekAt(1) = 'B')));
+  if (C = '(') and (PeekAt(1) = '?') and
+    ((PeekAt(2) = '=') or (PeekAt(2) = '!') or
+    ((PeekAt(2) = '<') and ((PeekAt(3) = '=') or (PeekAt(3) = '!')))) then
+    IsAssertion := True;
+  AtomStart := CurrentPC;
+  CompileAtom;
+  if (not AtEnd) and IsQuantifierChar(Peek) and IsAssertion and FUnicode then
+    raise EConvertError.Create(
+      'Invalid regular expression: quantifier on assertion in unicode mode');
+  CompileQuantifier(AtomStart);
+end;
+
+// Compiles terms until the end of the pattern, a '|' or a ')' is reached.
+// The terminating character is left for the caller.
+procedure TRegExpCompiler.CompileAlternative;
+begin
+  while not AtEnd and (Peek <> '|') and (Peek <> ')') do
+    CompileTerm;
+end;
+
+// Inserts an RX_SPLIT with operand 0 at APos, moving the code from APos on
+// one slot up. Split, lazy split, jump, lookahead and lookbehind instructions
+// after the insertion point whose target is APos or later get their target
+// incremented, so they keep pointing at the same instruction.
+procedure TRegExpCompiler.InsertSplitAt(APos: Integer);
+var
+  I: Integer;
+  Op: TRegExpOpCode;
+  Bx: Integer;
+  Negated: Boolean;
+begin
+  EnsureCodeCapacity(1);
+  Move(FCode[APos], FCode[APos + 1], (FCodeLen - APos) * SizeOf(UInt32));
+  FCode[APos] := EncodeOpBx(RX_SPLIT, 0);
+  Inc(FCodeLen);
+  for I := APos + 1 to FCodeLen - 1 do
+  begin
+    Op := TRegExpOpCode(FCode[I] and $FF);
+    case Op of
+      RX_SPLIT, RX_SPLIT_LAZY, RX_JUMP:
+        begin
+          Bx := Integer(FCode[I] shr 8);
+          if Bx >= APos then
+          begin
+            Inc(Bx);
+            FCode[I] := EncodeOpBx(Op, Bx);
+          end;
+        end;
+      RX_LOOKAHEAD, RX_LOOKBEHIND:
+        begin
+          Bx := Integer(FCode[I] shr 8);
+          Negated := (Bx and LOOK_NEGATED_FLAG) <> 0;
+          Bx := Bx and LOOK_TARGET_MASK;
+          if Bx >= APos then
+          begin
+            Inc(Bx);
+            if Negated then
+              Bx := Bx or LOOK_NEGATED_FLAG;
+            FCode[I] := EncodeOpBx(Op, Bx);
+          end;
+        end;
+    end;
+  end;
+end;
+
+// Compiles alternatives separated by '|'. For every '|' a split is inserted
+// in front of the alternative just compiled and a placeholder is emitted
+// behind it; once the last alternative is done all placeholders become jumps
+// to the end of the disjunction.
+procedure TRegExpCompiler.CompileDisjunction;
+var
+  StartPC: Integer;
+  JumpHoles: array of Integer;
+  JumpCount, I: Integer;
+begin
+  JumpCount := 0;
+  SetLength(JumpHoles, 8);
+  StartPC := CurrentPC;
+  CompileAlternative;
+  while (not AtEnd) and (Peek = '|') do
+  begin
+    Inc(FPos);
+    if FAltStackDepth < Length(FAltStack) then
+      Inc(FAltStack[FAltStackDepth]);
+    InsertSplitAt(StartPC);
+    // Update existing jump holes since they shifted by 1
+    for I := 0 to JumpCount - 1 do
+      if JumpHoles[I] >= StartPC then
+        Inc(JumpHoles[I]);
+    if JumpCount >= Length(JumpHoles) then
+      SetLength(JumpHoles, JumpCount * 2 + 8);
+    JumpHoles[JumpCount] := CurrentPC;
+    Inc(JumpCount);
+    Emit(0);
+    PatchHole(StartPC, CurrentPC);
+    StartPC := CurrentPC;
+    CompileAlternative;
+  end;
+  for I := 0 to JumpCount - 1 do
+    FCode[JumpHoles[I]] := EncodeOpBx(RX_JUMP, CurrentPC);
+end;
+
+// Compiles the whole pattern: capture slot 0, the top-level disjunction,
+// capture slot 1 and the final RX_MATCH.
+// Raises EConvertError when the disjunction stops before the end of the
+// pattern. CompileAlternative stops at ')' so that CompileGroup can consume
+// it; at top level there is no group to close, and without this check a
+// pattern such as a)b was silently compiled as just a.
+procedure TRegExpCompiler.CompilePattern;
+begin
+  Emit(EncodeOpBx(RX_SAVE, 0));
+  CompileDisjunction;
+  if not AtEnd then
+    raise EConvertError.Create('Invalid regular expression: unmatched )');
+  Emit(EncodeOpBx(RX_SAVE, 1));
+  Emit(EncodeOp(RX_MATCH));
+end;
+
+// Scans the raw pattern text once before compilation and records every named
+// group in FNamedGroups together with its capture index and the alternative
+// counters of the enclosing nesting levels (DisjunctionPath). Escaped
+// characters and the contents of character classes are skipped.
+procedure TRegExpCompiler.PreScanNamedGroups;
+var
+  I, GroupIndex, CloseAngle, J: Integer;
+  InCharClass: Boolean;
+  GroupName: string;
+  AltStack: array of Integer;
+  AltStackDepth: Integer;
+begin
+  SetLength(AltStack, 64);
+  AltStackDepth := 0;
+  AltStack[0] := 0;
+  I := 1;
+  GroupIndex := 0;
+  InCharClass := False;
+  while I <= Length(FPattern) do
+  begin
+    if FPattern[I] = '\' then
+    begin
+      if I + 1 <= Length(FPattern) then
+        Inc(I, 2)
+      else
+        Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '[' then
+    begin
+      InCharClass := True;
+      Inc(I);
+      Continue;
+    end;
+    if (FPattern[I] = ']') and InCharClass then
+    begin
+      InCharClass := False;
+      Inc(I);
+      Continue;
+    end;
+    if InCharClass then
+    begin
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '|' then
+    begin
+      Inc(AltStack[AltStackDepth]);
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = ')' then
+    begin
+      if AltStackDepth > 0 then
+        Dec(AltStackDepth);
+      Inc(I);
+      Continue;
+    end;
+    if FPattern[I] = '(' then
+    begin
+      Inc(AltStackDepth);
+      if AltStackDepth >= Length(AltStack) then
+        SetLength(AltStack, AltStackDepth * 2 + 4);
+      AltStack[AltStackDepth] := 0;
+      if (I + 1 <= Length(FPattern)) and (FPattern[I + 1] = '?') then
+      begin
+        if (I + 2 <= Length(FPattern)) and (FPattern[I + 2] = '<') then
+        begin
+          if (I + 3 <= Length(FPattern)) and
+            ((FPattern[I + 3] = '=') or (FPattern[I + 3] = '!')) then
+          begin
+            Inc(I, 3); // lookbehind, not a named group
+            Continue;
+          end;
+          CloseAngle := I + 3;
+          while (CloseAngle <= Length(FPattern)) and
+            (FPattern[CloseAngle] <> '>') do
+            Inc(CloseAngle);
+          if CloseAngle <= Length(FPattern) then
+          begin
+            Inc(GroupIndex);
+            GroupName := Copy(FPattern, I + 3, CloseAngle - I - 3);
+            SetLength(FNamedGroups, Length(FNamedGroups) + 1);
+            FNamedGroups[High(FNamedGroups)].Name := GroupName;
+            FNamedGroups[High(FNamedGroups)].Index := GroupIndex;
+            SetLength(FNamedGroups[High(FNamedGroups)].DisjunctionPath,
+              AltStackDepth + 1);
+            for J := 0 to AltStackDepth do // snapshot of the alternative counters of all enclosing levels
+              FNamedGroups[High(FNamedGroups)].DisjunctionPath[J] := AltStack[J];
+            I := CloseAngle + 1;
+            Continue;
+          end;
+        end;
+        Inc(I, 2); // any other group that starts with a question mark gets no capture index
+        Continue;
+      end;
+      Inc(GroupIndex); // plain capturing group
+    end;
+    Inc(I);
+  end;
+end;
+{ Raises EConvertError 'Duplicate named capture group' when two entries of
+  FNamedGroups share a name and their DisjunctionPath arrays are equal over
+  the length of the shorter one. Pairs whose paths differ somewhere in that
+  common prefix are accepted. }
+procedure TRegExpCompiler.ValidateNamedGroups;
+var
+  K, L, MinLen, I: Integer;
+  ShareBranch: Boolean;
+begin
+  for K := 0 to High(FNamedGroups) - 1 do
+    for L := K + 1 to High(FNamedGroups) do
+      if FNamedGroups[K].Name = FNamedGroups[L].Name then
+      begin
+        MinLen := Length(FNamedGroups[K].DisjunctionPath);
+        if Length(FNamedGroups[L].DisjunctionPath) < MinLen then
+          MinLen := Length(FNamedGroups[L].DisjunctionPath);
+        ShareBranch := True;
+        for I := 0 to MinLen - 1 do
+          if FNamedGroups[K].DisjunctionPath[I] <>
+            FNamedGroups[L].DisjunctionPath[I] then
+          begin
+            ShareBranch := False;
+            Break;
+          end;
+        if ShareBranch then
+          raise EConvertError.CreateFmt(
+            'Duplicate named capture group: %s', [FNamedGroups[K].Name]);
+      end;
+end;
+{ Runs the named-group pre-scan and its validation, compiles the pattern and
+  returns the program: the code trimmed to FCodeLen, the character classes,
+  the capture count and the named groups. }
+function TRegExpCompiler.Compile: TRegExpProgram;
+begin
+  PreScanNamedGroups;
+  ValidateNamedGroups;
+  CompilePattern;
+  SetLength(FCode, FCodeLen);
+  Result.Code := FCode;
+  Result.CharClasses := FCharClasses;
+  Result.CaptureCount := FCaptureCount;
+  Result.NamedGroups := FNamedGroups;
+end;
+{ Compiles APattern with AFlags using a temporary TRegExpCompiler, which is
+  freed even when compilation raises. }
+function CompileRegExp(const APattern, AFlags: string): TRegExpProgram;
+var
+  Compiler: TRegExpCompiler;
+begin
+  Compiler := TRegExpCompiler.Create(APattern, AFlags);
+  try
+    Result := Compiler.Compile;
+  finally
+    Compiler.Free;
+  end;
+end;
+{ Validates the flags with ValidateRegExpFlags and then compiles the pattern
+  only for the exceptions that raises; the compiled program is discarded.
+  The literal empty-pattern text is accepted without compiling. }
+procedure ValidateRegExpPatternNew(const APattern, AFlags: string);
+begin
+  ValidateRegExpFlags(AFlags);
+  if APattern = '(?:)' then
+    Exit;
+  CompileRegExp(APattern, AFlags);
+end;
+
+end.
diff --git a/source/units/Goccia.RegExp.Engine.pas b/source/units/Goccia.RegExp.Engine.pas index 68ba5645..1e7a2a49 100644 --- a/source/units/Goccia.RegExp.Engine.pas +++ b/source/units/Goccia.RegExp.Engine.pas @@ -38,19 +38,16 @@ function RegExpToString(const APattern, AFlags: string): string; function ExecuteRegExp(const APattern, AFlags, AInput: string; const AStartIndex: Integer; const ARequireStart: Boolean; out AResult: TGocciaRegExpMatchResult): Boolean; -function PreprocessRegExpPattern(const APattern: string; - out ANamedGroups: TGocciaRegExpNamedGroups): string; implementation uses - Math, SysUtils, - RegExpr, TextSemantics, - Goccia.RegExp.Unicode; + Goccia.RegExp.Compiler, + Goccia.RegExp.VM; const EMPTY_REGEX = '(?:)'; @@ -64,14 +61,6 @@ function NormalizeRegExpSource(const APattern: string): string; Result := APattern; end; -function GetExecutableRegExpPattern(const APattern: string): string; -begin - if APattern = EMPTY_REGEX then - Result := '' - else - Result := APattern; -end; - function HasRegExpFlag(const AFlags: string; const AFlag: Char): Boolean; begin Result := Pos(AFlag, AFlags) > 0; @@ -91,325 +80,13 @@ procedure ValidateRegExpFlags(const AFlags: string); raise EConvertError.Create('Invalid regular expression flags'); Seen := Seen + AFlags[I]; end; - // ES2026 §22.2.2.1: u and v flags are mutually exclusive if HasRegExpFlag(AFlags, 'u') and HasRegExpFlag(AFlags, 'v') then raise EConvertError.Create('Invalid regular expression flags'); end; -// ES2025 §22.2.1 Static Semantics: Early Errors — RegExp Modifiers -// Validates inline modifier group syntax (?flags:...) and (?flags-flags:...). -// Only i, m, s are valid modifier flags. The colon form is required.
-procedure ValidateModifierGroups(const APattern: string); -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - C: Char; - EnableFlags, DisableFlags: string; - InDisable: Boolean; -begin - PatternLength := Length(APattern); - I := 1; - InCharClass := False; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - Inc(I, 2) - else - Inc(I); - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Inc(I); - Continue; - end; - if InCharClass then - begin - Inc(I); - Continue; - end; - // ES2025: Check for modifier group prefix (?[ims-]...) - if (APattern[I] = '(') and (I + 2 <= PatternLength) and - (APattern[I + 1] = '?') and - CharInSet(APattern[I + 2], ['i', 'm', 's', '-']) then - begin - J := I + 2; - EnableFlags := ''; - DisableFlags := ''; - InDisable := False; - while J <= PatternLength do - begin - C := APattern[J]; - // ES2025 §22.2.1 step 4: colon terminates modifier prefix - if C = ':' then - Break; - if C = ')' then - raise EConvertError.Create( - 'Invalid regular expression: modifier group must use (?flags:...) 
syntax'); - if C = '-' then - begin - if InDisable then - raise EConvertError.Create( - 'Invalid regular expression: unexpected - in modifier group'); - InDisable := True; - Inc(J); - Continue; - end; - if not CharInSet(C, ['i', 'm', 's']) then - raise EConvertError.CreateFmt( - 'Invalid regular expression: ''%s'' is not a valid modifier flag', [C]); - if InDisable then - begin - if Pos(C, DisableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: duplicate modifier flag ''%s''', [C]); - if Pos(C, EnableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: ''%s'' in both enable and disable', [C]); - DisableFlags := DisableFlags + C; - end - else - begin - if Pos(C, EnableFlags) > 0 then - raise EConvertError.CreateFmt( - 'Invalid regular expression: duplicate modifier flag ''%s''', [C]); - EnableFlags := EnableFlags + C; - end; - Inc(J); - end; - // ES2025 §22.2.1: Both add and remove lists empty is a SyntaxError - if (J <= PatternLength) and (APattern[J] = ':') and - (EnableFlags = '') and (DisableFlags = '') then - raise EConvertError.Create( - 'Invalid regular expression: modifier group must enable or disable at least one flag'); - end; - Inc(I); - end; -end; - -// ES2025 §22.2.1 RegExp Modifiers — Transforms inline modifier groups -// (?flags:...) and (?flags-flags:...) into TRegExpr-compatible syntax. -// For i and m modifiers: uses (?i)/(?-i)/(?m)/(?-m) toggles inside (?:...) -// groups (TRegExpr scopes these correctly to groups). -// For s modifier enable: replaces . with [\s\S] (because TRegExpr's (?s) -// leaks from groups). For s modifier disable: uses (?-s) toggle (TRegExpr -// scopes this correctly). 
-function PreprocessModifierGroups(const APattern: string): string; -type - TSModifierEntry = record - Depth: Integer; - PreviousSActive: Boolean; - end; -const - DOTALL_REPLACEMENT = '[\s\S]'; - INITIAL_STACK_SIZE = 32; -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - GroupDepth: Integer; - SStack: array of TSModifierEntry; - SStackTop: Integer; - CurrentSActive: Boolean; - C: Char; - EnableFlags, DisableFlags: string; - InDisable: Boolean; - Toggles: string; - NewSActive: Boolean; -begin - PatternLength := Length(APattern); - if PatternLength = 0 then - begin - Result := ''; - Exit; - end; - Result := ''; - I := 1; - InCharClass := False; - GroupDepth := 0; - CurrentSActive := False; - SStackTop := -1; - SetLength(SStack, INITIAL_STACK_SIZE); - while I <= PatternLength do - begin - // Handle escape sequences - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end - else - begin - Result := Result + APattern[I]; - Inc(I); - end; - Continue; - end; - // Handle character classes (copy as-is, no dot transformation) - if APattern[I] = '[' then - begin - InCharClass := True; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if InCharClass then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // ES2025: Transform . based on current s modifier state - if APattern[I] = '.' 
then - begin - if CurrentSActive then - Result := Result + DOTALL_REPLACEMENT - else - Result := Result + '.'; - Inc(I); - Continue; - end; - // Handle closing paren — pop s state if this closes a modifier group - if APattern[I] = ')' then - begin - if (SStackTop >= 0) and (SStack[SStackTop].Depth = GroupDepth) then - begin - CurrentSActive := SStack[SStackTop].PreviousSActive; - Dec(SStackTop); - end; - Dec(GroupDepth); - Result := Result + ')'; - Inc(I); - Continue; - end; - // Handle opening paren — check for modifier group prefix - if APattern[I] = '(' then - begin - Inc(GroupDepth); - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') and - (I + 2 <= PatternLength) and - CharInSet(APattern[I + 2], ['i', 'm', 's', '-']) then - begin - // Parse modifier flags up to ':' - J := I + 2; - EnableFlags := ''; - DisableFlags := ''; - InDisable := False; - while (J <= PatternLength) and (APattern[J] <> ':') and - (APattern[J] <> ')') do - begin - C := APattern[J]; - if C = '-' then - begin - InDisable := True; - Inc(J); - Continue; - end; - if CharInSet(C, ['i', 'm', 's']) then - begin - if InDisable then - DisableFlags := DisableFlags + C - else - EnableFlags := EnableFlags + C; - end; - Inc(J); - end; - if (J <= PatternLength) and (APattern[J] = ':') then - begin - // Valid modifier group — transform to TRegExpr-compatible syntax - // Build i/m toggles (TRegExpr scopes these correctly to groups) - Toggles := ''; - if Pos('i', EnableFlags) > 0 then Toggles := Toggles + '(?i)'; - if Pos('m', EnableFlags) > 0 then Toggles := Toggles + '(?m)'; - if Pos('i', DisableFlags) > 0 then Toggles := Toggles + '(?-i)'; - if Pos('m', DisableFlags) > 0 then Toggles := Toggles + '(?-m)'; - // s disable uses TRegExpr toggle (correctly scoped to groups) - if Pos('s', DisableFlags) > 0 then Toggles := Toggles + '(?-s)'; - // Determine new s state (s enable uses dot transformation) - NewSActive := CurrentSActive; - if Pos('s', EnableFlags) > 0 then NewSActive := True; - if Pos('s', 
DisableFlags) > 0 then NewSActive := False; - // Push s state if s modifier changed - if NewSActive <> CurrentSActive then - begin - Inc(SStackTop); - if SStackTop >= Length(SStack) then - SetLength(SStack, SStackTop * 2 + 4); - SStack[SStackTop].Depth := GroupDepth; - SStack[SStackTop].PreviousSActive := CurrentSActive; - CurrentSActive := NewSActive; - end; - // Emit non-capturing group with toggles - Result := Result + '(?:' + Toggles; - I := J + 1; - Continue; - end; - end; - // Regular group or non-modifier (?...) — pass through - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // Default: copy character as-is - Result := Result + APattern[I]; - Inc(I); - end; -end; - -// ES2026 §22.2.3.1 RegExp ( pattern, flags ) — validation step procedure ValidateRegExpPattern(const APattern, AFlags: string); -var - Matcher: TRegExpr; - NormalizedPattern: string; - ExecutablePattern: string; - ConvertedPattern: string; - DiscardedGroups: TGocciaRegExpNamedGroups; - IsUnicode: Boolean; begin - ValidateRegExpFlags(AFlags); - NormalizedPattern := NormalizeRegExpSource(APattern); - if NormalizedPattern = EMPTY_REGEX then - Exit; - ExecutablePattern := GetExecutableRegExpPattern(NormalizedPattern); - // ES2025: Validate inline modifier groups before transformation - ValidateModifierGroups(ExecutablePattern); - // ES2025: Transform modifier groups into TRegExpr-compatible syntax - ExecutablePattern := PreprocessModifierGroups(ExecutablePattern); - IsUnicode := HasRegExpFlag(AFlags, 'u'); - ConvertedPattern := PreprocessRegExpPattern(ExecutablePattern, DiscardedGroups); - // ES2026 §22.2.2.9: Apply Unicode pattern preprocessing when u flag is set - if IsUnicode then - ConvertedPattern := PreprocessUnicodePattern(ConvertedPattern, - HasRegExpFlag(AFlags, 'i')); - Matcher := TRegExpr.Create; - try - Matcher.Expression := ConvertedPattern; - Matcher.ModifierI := HasRegExpFlag(AFlags, 'i'); - Matcher.ModifierM := HasRegExpFlag(AFlags, 'm'); - Matcher.ModifierS := 
HasRegExpFlag(AFlags, 's'); - if IsUnicode then - Matcher.ModifierR := False; - Matcher.Compile; - finally - Matcher.Free; - end; + ValidateRegExpPatternNew(APattern, AFlags); end; function CanonicalizeRegExpFlags(const AFlags: string): string; @@ -431,416 +108,16 @@ function RegExpToString(const APattern, AFlags: string): string; CanonicalizeRegExpFlags(AFlags); end; -// ES2026 §22.2.7.2 AdvanceStringIndex ( S, index, unicode ) -function AdvanceStringIndex(const AInput: string; const AIndex: Integer; - const AUnicode: Boolean): Integer; -begin - Result := AdvanceUTF8StringIndex(AInput, AIndex, AUnicode); -end; - -function FindNamedGroupIndex(const ANamedGroups: TGocciaRegExpNamedGroups; - const AName: string): Integer; -var - I: Integer; -begin - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - Exit(ANamedGroups[I].Index); - Result := -1; -end; - -// ES2025 §22.2.1 Static Semantics: Early Errors — duplicate GroupSpecifier -// Two disjunction paths share a branch if they agree at every common depth. -// When they share a branch, both groups can participate in the same match — -// making duplicate names a SyntaxError. -function PathsShareBranch(const APathA, APathB: array of Integer): Boolean; -var - MinLength, I: Integer; -begin - MinLength := Min(Length(APathA), Length(APathB)); - for I := 0 to MinLength - 1 do - if APathA[I] <> APathB[I] then - Exit(False); - Result := True; -end; - -// ES2025 §22.2.2 Runtime Semantics: CompileAtom — \k GroupName -// Resolve \k backreference when multiple groups share the same name. -// Returns the TRegExpr-compatible backreference string. -// -// When the backreference is outside the disjunction containing the duplicate -// groups (CompatCount = 0 or > 1), we emit (?:\N1|\N2|...) — an alternation -// of all candidate backreferences. 
This is correct because TRegExpr fails -// (rather than matching empty) when a backreference targets a non-participating -// group, so the alternation falls through to the participating group's backref. -// Concatenation (\N1\N2) would be wrong: the non-participating backref would -// fail and abort the entire match. -function ResolveNamedBackreference( - const ANamedGroups: TGocciaRegExpNamedGroups; - const AName: string; const ACurrentPath: array of Integer): string; -var - AllIndices: array of Integer; - CompatibleIndices: array of Integer; - AllCount, CompatCount, I: Integer; -begin - Result := ''; - // Collect all group indices with this name - AllCount := 0; - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - Inc(AllCount); - if AllCount = 0 then - Exit; - if AllCount = 1 then - begin - // Single group — simple backreference (ES2018 behavior) - Result := '\' + IntToStr(FindNamedGroupIndex(ANamedGroups, AName)); - Exit; - end; - // ES2025: Multiple groups with same name — resolve via disjunction path - SetLength(CompatibleIndices, AllCount); - CompatCount := 0; - for I := 0 to High(ANamedGroups) do - if (ANamedGroups[I].Name = AName) and - PathsShareBranch(ANamedGroups[I].DisjunctionPath, ACurrentPath) then - begin - CompatibleIndices[CompatCount] := ANamedGroups[I].Index; - Inc(CompatCount); - end; - if CompatCount = 1 then - begin - // Exactly one compatible group — resolve directly - Result := '\' + IntToStr(CompatibleIndices[0]); - Exit; - end; - if CompatCount = 0 then - begin - // Backreference outside the disjunction — collect all groups with this name - SetLength(AllIndices, AllCount); - AllCount := 0; - for I := 0 to High(ANamedGroups) do - if ANamedGroups[I].Name = AName then - begin - AllIndices[AllCount] := ANamedGroups[I].Index; - Inc(AllCount); - end; - // Emit alternation: (?:\1|\2|...) 
— the participating group's backreference - // succeeds while non-participating ones either match empty or fail through - Result := '(?:'; - for I := 0 to AllCount - 1 do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + '\' + IntToStr(AllIndices[I]); - end; - Result := Result + ')'; - Exit; - end; - // Multiple compatible groups — emit alternation of compatible ones - Result := '(?:'; - for I := 0 to CompatCount - 1 do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + '\' + IntToStr(CompatibleIndices[I]); - end; - Result := Result + ')'; -end; - -// Pass 1: collect all named groups and their capture indices without modifying -// the pattern, so that forward \k backreferences can be resolved. -// ES2025: Also tracks disjunction paths and validates duplicate named groups. -function CollectNamedGroups(const APattern: string): TGocciaRegExpNamedGroups; -var - I, J, K, L, PatternLength, GroupIndex, CloseAngle: Integer; - InCharClass: Boolean; - GroupName: string; - AltStack: array of Integer; - AltStackDepth: Integer; -begin - SetLength(Result, 0); - PatternLength := Length(APattern); - I := 1; - GroupIndex := 0; - InCharClass := False; - // ES2025: Initialize disjunction path stack with top-level scope - SetLength(AltStack, 64); - AltStackDepth := 0; - AltStack[0] := 0; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - Inc(I, 2) - else - Inc(I); - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Inc(I); - Continue; - end; - if InCharClass then - begin - Inc(I); - Continue; - end; - // ES2025: Track disjunction alternatives - if APattern[I] = '|' then - begin - Inc(AltStack[AltStackDepth]); - Inc(I); - Continue; - end; - if APattern[I] = ')' then - begin - if AltStackDepth > 0 then - Dec(AltStackDepth); - Inc(I); - Continue; - end; - if 
APattern[I] = '(' then - begin - // Push disjunction level for all group types - Inc(AltStackDepth); - if AltStackDepth >= Length(AltStack) then - SetLength(AltStack, AltStackDepth * 2 + 4); - AltStack[AltStackDepth] := 0; - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') then - begin - if (I + 2 <= PatternLength) and (APattern[I + 2] = '<') then - begin - // (?<= lookbehind, (?...) - CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - Inc(GroupIndex); - GroupName := Copy(APattern, I + 3, CloseAngle - I - 3); - SetLength(Result, Length(Result) + 1); - Result[High(Result)].Name := GroupName; - Result[High(Result)].Index := GroupIndex; - // ES2025: Record disjunction path for duplicate name validation - SetLength(Result[High(Result)].DisjunctionPath, AltStackDepth + 1); - for J := 0 to AltStackDepth do - Result[High(Result)].DisjunctionPath[J] := AltStack[J]; - I := CloseAngle + 1; - Continue; - end; - end; - // Non-capturing or other (?...) group — skip without incrementing index - Inc(I, 2); - Continue; - end; - // Plain capturing group - Inc(GroupIndex); - end; - Inc(I); - end; - // ES2025 §22.2.1.1: Validate duplicate named capture groups are in different - // alternatives. Two groups with the same name that share a disjunction branch - // can both participate in a single match — that is a SyntaxError. - for K := 0 to High(Result) - 1 do - for L := K + 1 to High(Result) do - if (Result[K].Name = Result[L].Name) and - PathsShareBranch(Result[K].DisjunctionPath, - Result[L].DisjunctionPath) then - raise EConvertError.CreateFmt( - 'Duplicate named capture group: %s', [Result[K].Name]); -end; - -// Pass 2: convert named groups to plain capturing groups and resolve \k -// backreferences using the complete group map from pass 1. -// ES2025: Tracks disjunction paths for correct \k resolution with -// duplicate named capture groups. 
-function PreprocessRegExpPattern(const APattern: string; - out ANamedGroups: TGocciaRegExpNamedGroups): string; -var - I, J, PatternLength: Integer; - InCharClass: Boolean; - GroupName: string; - CloseAngle: Integer; - BackrefResult: string; - AltStack: array of Integer; - AltStackDepth: Integer; - CaptureIndex, TargetIndex: Integer; -begin - // Pass 1: collect all named groups so forward backreferences resolve - ANamedGroups := CollectNamedGroups(APattern); - PatternLength := Length(APattern); - if PatternLength = 0 then - begin - Result := ''; - Exit; - end; - // Pass 2: emit converted pattern with disjunction path tracking - Result := ''; - I := 1; - InCharClass := False; - // ES2025: Track disjunction path for \k resolution - SetLength(AltStack, 64); - AltStackDepth := 0; - AltStack[0] := 0; - CaptureIndex := 0; - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 <= PatternLength then - begin - // \k backreference: convert to numeric backreference(s) - if (APattern[I + 1] = 'k') and (I + 2 <= PatternLength) and - (APattern[I + 2] = '<') then - begin - CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - GroupName := Copy(APattern, I + 3, CloseAngle - I - 3); - // ES2025: Resolve with duplicate named group awareness - TargetIndex := FindNamedGroupIndex(ANamedGroups, GroupName); - if TargetIndex > CaptureIndex then - BackrefResult := '' - else - BackrefResult := ResolveNamedBackreference(ANamedGroups, - GroupName, Copy(AltStack, 0, AltStackDepth + 1)); - if BackrefResult = '' then - begin - if TargetIndex < 0 then - raise EConvertError.CreateFmt( - 'Invalid named backreference: %s', [GroupName]); - Result := Result + '(?:)'; - end - else - Result := Result + BackrefResult; - I := CloseAngle + 1; - Continue; - end; - end; - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end - else - begin - Result := 
Result + APattern[I]; - Inc(I); - end; - Continue; - end; - if APattern[I] = '[' then - begin - InCharClass := True; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if (APattern[I] = ']') and InCharClass then - begin - InCharClass := False; - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - if InCharClass then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - // ES2025: Track disjunction alternatives - if APattern[I] = '|' then - begin - Inc(AltStack[AltStackDepth]); - Result := Result + '|'; - Inc(I); - Continue; - end; - if APattern[I] = ')' then - begin - if AltStackDepth > 0 then - Dec(AltStackDepth); - Result := Result + ')'; - Inc(I); - Continue; - end; - if APattern[I] = '(' then - begin - // Push disjunction level for all group types - Inc(AltStackDepth); - if AltStackDepth >= Length(AltStack) then - SetLength(AltStack, AltStackDepth * 2 + 4); - AltStack[AltStackDepth] := 0; - if (I + 1 <= PatternLength) and (APattern[I + 1] = '?') then - begin - if (I + 2 <= PatternLength) and (APattern[I + 2] = '<') then - begin - // (?<= lookbehind, (?...) -> plain capturing group (...) 
- CloseAngle := I + 3; - while (CloseAngle <= PatternLength) and - (APattern[CloseAngle] <> '>') do - Inc(CloseAngle); - if CloseAngle <= PatternLength then - begin - // Strip the name, emit plain capturing group - Inc(CaptureIndex); - Result := Result + '('; - I := CloseAngle + 1; - Continue; - end; - end; - Result := Result + '(?'; - Inc(I, 2); - Continue; - end; - Inc(CaptureIndex); - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - Result := Result + APattern[I]; - Inc(I); - end; -end; - -// ES2026 §22.2.7.1 RegExpExec ( R, S ) function ExecuteRegExp(const APattern, AFlags, AInput: string; const AStartIndex: Integer; const ARequireStart: Boolean; out AResult: TGocciaRegExpMatchResult): Boolean; var - Matcher: TRegExpr; - I: Integer; - ExecutablePattern: string; - ConvertedPattern: string; - NamedGroups: TGocciaRegExpNamedGroups; + Prog: TRegExpProgram; + VMResult: TRegExpVMResult; IsUnicode: Boolean; + I, GroupCount: Integer; + PatternToCompile: string; + SlotStart, SlotEnd: Integer; begin AResult.Found := False; AResult.MatchIndex := -1; @@ -849,7 +126,7 @@ function ExecuteRegExp(const APattern, AFlags, AInput: string; SetLength(AResult.Groups, 0); SetLength(AResult.NamedGroups, 0); ValidateRegExpFlags(AFlags); - IsUnicode := HasRegExpFlag(AFlags, 'u'); + IsUnicode := HasRegExpFlag(AFlags, 'u') or HasRegExpFlag(AFlags, 'v'); if AStartIndex > Length(AInput) then Exit(False); if APattern = EMPTY_REGEX then @@ -857,57 +134,52 @@ function ExecuteRegExp(const APattern, AFlags, AInput: string; AResult.Found := True; AResult.MatchIndex := AStartIndex; AResult.MatchEnd := AStartIndex; - AResult.NextIndex := AdvanceStringIndex(AInput, AStartIndex, - IsUnicode or HasRegExpFlag(AFlags, 'v')); + AResult.NextIndex := AdvanceUTF8StringIndex(AInput, AStartIndex, IsUnicode); SetLength(AResult.Groups, 1); AResult.Groups[0].Matched := True; AResult.Groups[0].Value := ''; Exit(True); end; - // ES2025: Transform modifier groups before named group preprocessing - 
ExecutablePattern := PreprocessModifierGroups( - GetExecutableRegExpPattern(APattern)); - ConvertedPattern := PreprocessRegExpPattern(ExecutablePattern, NamedGroups); - // ES2026 §22.2.2.9: Apply Unicode pattern preprocessing when u flag is set - if IsUnicode then - ConvertedPattern := PreprocessUnicodePattern(ConvertedPattern, - HasRegExpFlag(AFlags, 'i')); - Matcher := TRegExpr.Create; - try - Matcher.Expression := ConvertedPattern; - Matcher.ModifierI := HasRegExpFlag(AFlags, 'i'); - Matcher.ModifierM := HasRegExpFlag(AFlags, 'm'); - Matcher.ModifierS := HasRegExpFlag(AFlags, 's'); - if IsUnicode then - Matcher.ModifierR := False; - Matcher.Compile; - Matcher.InputString := AInput; - Result := Matcher.ExecPos(AStartIndex + 1); - if Result and ARequireStart and - (Matcher.MatchPos[0] <> AStartIndex + 1) then - Result := False; - if not Result then - Exit(False); - AResult.Found := True; - AResult.MatchIndex := Matcher.MatchPos[0] - 1; - AResult.MatchEnd := AResult.MatchIndex + Matcher.MatchLen[0]; - AResult.NextIndex := AResult.MatchEnd; - if Matcher.MatchLen[0] = 0 then - AResult.NextIndex := AdvanceStringIndex(AInput, AResult.NextIndex, - IsUnicode or HasRegExpFlag(AFlags, 'v')); - SetLength(AResult.Groups, Matcher.SubExprMatchCount + 1); - for I := 0 to Matcher.SubExprMatchCount do + PatternToCompile := APattern; + if PatternToCompile = EMPTY_REGEX then + PatternToCompile := ''; + Prog := CompileRegExp(PatternToCompile, AFlags); + Result := ExecuteRegExpVM(Prog, AInput, AStartIndex, ARequireStart, VMResult); + if not Result then + Exit(False); + AResult.Found := True; + if Length(VMResult.CaptureSlots) < 2 then + Exit(False); + AResult.MatchIndex := VMResult.CaptureSlots[0] - 1; + AResult.MatchEnd := VMResult.CaptureSlots[1] - 1; + AResult.NextIndex := AResult.MatchEnd; + if AResult.MatchEnd = AResult.MatchIndex then + AResult.NextIndex := AdvanceUTF8StringIndex(AInput, AResult.NextIndex, + IsUnicode); + GroupCount := Prog.CaptureCount + 1; + 
SetLength(AResult.Groups, GroupCount); + for I := 0 to GroupCount - 1 do + begin + SlotStart := -1; + SlotEnd := -1; + if I * 2 + 1 < Length(VMResult.CaptureSlots) then + begin + SlotStart := VMResult.CaptureSlots[I * 2]; + SlotEnd := VMResult.CaptureSlots[I * 2 + 1]; + end; + if (SlotStart >= 1) and (SlotEnd >= SlotStart) and + (SlotEnd <= Length(AInput) + 1) then + begin + AResult.Groups[I].Matched := True; + AResult.Groups[I].Value := Copy(AInput, SlotStart, SlotEnd - SlotStart); + end + else begin - AResult.Groups[I].Matched := Matcher.MatchPos[I] > 0; - if AResult.Groups[I].Matched then - AResult.Groups[I].Value := Matcher.Match[I] - else - AResult.Groups[I].Value := ''; + AResult.Groups[I].Matched := False; + AResult.Groups[I].Value := ''; end; - AResult.NamedGroups := NamedGroups; - finally - Matcher.Free; end; + AResult.NamedGroups := Prog.NamedGroups; end; end. diff --git a/source/units/Goccia.RegExp.Runtime.pas b/source/units/Goccia.RegExp.Runtime.pas index 7add209d..29d3f48d 100644 --- a/source/units/Goccia.RegExp.Runtime.pas +++ b/source/units/Goccia.RegExp.Runtime.pas @@ -29,6 +29,7 @@ implementation SysUtils, Goccia.Constants.PropertyNames, + Goccia.RegExp.VM, Goccia.Values.ArrayValue, Goccia.Values.ErrorHelper, Goccia.Values.ObjectPropertyDescriptor, @@ -117,15 +118,11 @@ function BuildMatchArray(const AInput: string; end; function IsRegExpValue(const AValue: TGocciaValue): Boolean; -var - Tag: TGocciaValue; begin if not (AValue is TGocciaObjectValue) then Exit(False); - Tag := TGocciaObjectValue(AValue).GetSymbolProperty( - TGocciaSymbolValue.WellKnownToStringTag); - Result := (Tag is TGocciaStringLiteralValue) and - (TGocciaStringLiteralValue(Tag).Value = 'RegExp'); + Result := TGocciaObjectValue(AValue).HasOwnProperty(PROP_SOURCE) and + TGocciaObjectValue(AValue).HasOwnProperty(PROP_FLAGS); end; function CreateRegExpObject(const APattern, AFlags: string): TGocciaValue; @@ -153,30 +150,6 @@ function CreateRegExpObject(const APattern, AFlags: 
string): TGocciaValue; Obj.DefineProperty(PROP_LAST_INDEX, TGocciaPropertyDescriptorData.Create( TGocciaNumberLiteralValue.Create(0), [pfWritable])); - Obj.DefineProperty(PROP_GLOBAL, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'g')), [])); - Obj.DefineProperty(PROP_IGNORE_CASE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'i')), [])); - Obj.DefineProperty(PROP_MULTILINE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'm')), [])); - Obj.DefineProperty(PROP_DOT_ALL, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 's')), [])); - Obj.DefineProperty(PROP_UNICODE, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'u')), [])); - Obj.DefineProperty(PROP_STICKY, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'y')), [])); - Obj.DefineProperty(PROP_UNICODE_SETS, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'v')), [])); - Obj.DefineProperty(PROP_HAS_INDICES, - TGocciaPropertyDescriptorData.Create( - TGocciaBooleanLiteralValue.Create(HasRegExpFlag(CanonicalFlags, 'd')), [])); Result := Obj; end; @@ -215,13 +188,18 @@ function MatchRegExpObject(const AValue: TGocciaValue; const AInput: string; ShouldUpdate: Boolean; begin Obj := TGocciaObjectValue(AValue); - Result := ExecuteRegExp( - GetStringProperty(Obj, PROP_SOURCE), - GetStringProperty(Obj, PROP_FLAGS), - AInput, - AStartIndex, - ARequireStart, - MatchResult); + try + Result := ExecuteRegExp( + GetStringProperty(Obj, PROP_SOURCE), + GetStringProperty(Obj, PROP_FLAGS), + AInput, + AStartIndex, + ARequireStart, + MatchResult); + except + on E: ERegExpRuntimeError do + ThrowError(E.Message); + end; ShouldUpdate := 
AUpdateLastIndex and (GetBooleanProperty(Obj, PROP_GLOBAL) or GetBooleanProperty(Obj, PROP_STICKY)); diff --git a/source/units/Goccia.RegExp.Unicode.pas b/source/units/Goccia.RegExp.Unicode.pas deleted file mode 100644 index bcead522..00000000 --- a/source/units/Goccia.RegExp.Unicode.pas +++ /dev/null @@ -1,611 +0,0 @@ -unit Goccia.RegExp.Unicode; - -{$I Goccia.inc} - -interface - -function ExpandUnicodePropertyEscape(const APropertyName: string; - const ANegated: Boolean): string; -function PreprocessUnicodePattern(const APattern: string; - const AIgnoreCase: Boolean = False): string; - -implementation - -uses - SysUtils; - -const - UNSUPPORTED_PROPERTY_PREFIX = 'Invalid Unicode property name: '; - UTF8_ANY_CODE_POINT = - '(?:[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|' + - '[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|' + - '[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])'; - UTF8_NON_SPACE_CODE_POINT = - '(?:[\x00-\x08\x0E-\x1F\x21-\x7F]|' + - '[\xC2-\xDF][\x80-\xBF]|' + - '[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|' + - '[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])'; - - // ES2026 §22.2.2.9 Unicode property escape character classes. - // These use ASCII-safe approximations for the most commonly used - // General Category properties and Binary properties. 
- CHAR_CLASS_LETTER = 'A-Za-z\xC0-\xD6\xD8-\xF6\xF8-\xFF'; - CHAR_CLASS_UPPERCASE_LETTER = 'A-Z\xC0-\xD6\xD8-\xDE'; - CHAR_CLASS_LOWERCASE_LETTER = 'a-z\xDF-\xF6\xF8-\xFF'; - CHAR_CLASS_DECIMAL_NUMBER = '0-9'; - CHAR_CLASS_NUMBER = '0-9'; - CHAR_CLASS_PUNCTUATION = - '!\x22#%&\x27\x28\x29*,\x2D.\x2F:;\x3F@\x5B\\\x5D_\x7B\x7D'; - CHAR_CLASS_SYMBOL = '\x24+<=>^`|~'; - CHAR_CLASS_SEPARATOR = '\x20\xA0'; - CHAR_CLASS_CONTROL = '\x00-\x1F\x7F-\x9F'; - CHAR_CLASS_ASCII = '\x00-\x7F'; - CHAR_CLASS_ASCII_HEX_DIGIT = '0-9A-Fa-f'; - CHAR_CLASS_WHITE_SPACE = '\x09-\x0D\x20\xA0'; - -// ES2026 §22.2.2.9 CharacterClassEscape :: \p{UnicodePropertyValueExpression} -function ExpandUnicodePropertyEscape(const APropertyName: string; - const ANegated: Boolean): string; -var - CharClass: string; - NegatePrefix: string; -begin - CharClass := ''; - - if (APropertyName = 'L') or (APropertyName = 'Letter') then - CharClass := CHAR_CLASS_LETTER - else if (APropertyName = 'Lu') or (APropertyName = 'Uppercase_Letter') then - CharClass := CHAR_CLASS_UPPERCASE_LETTER - else if (APropertyName = 'Ll') or (APropertyName = 'Lowercase_Letter') then - CharClass := CHAR_CLASS_LOWERCASE_LETTER - else if (APropertyName = 'N') or (APropertyName = 'Number') then - CharClass := CHAR_CLASS_NUMBER - else if (APropertyName = 'Nd') or (APropertyName = 'Decimal_Number') then - CharClass := CHAR_CLASS_DECIMAL_NUMBER - else if (APropertyName = 'P') or (APropertyName = 'Punctuation') then - CharClass := CHAR_CLASS_PUNCTUATION - else if (APropertyName = 'S') or (APropertyName = 'Symbol') then - CharClass := CHAR_CLASS_SYMBOL - else if (APropertyName = 'Z') or (APropertyName = 'Separator') then - CharClass := CHAR_CLASS_SEPARATOR - else if (APropertyName = 'Cc') or (APropertyName = 'Control') then - CharClass := CHAR_CLASS_CONTROL - else if APropertyName = 'ASCII' then - CharClass := CHAR_CLASS_ASCII - else if APropertyName = 'ASCII_Hex_Digit' then - CharClass := CHAR_CLASS_ASCII_HEX_DIGIT - else if APropertyName = 
'White_Space' then - CharClass := CHAR_CLASS_WHITE_SPACE - else - raise EConvertError.Create(UNSUPPORTED_PROPERTY_PREFIX + APropertyName); - - if ANegated then - NegatePrefix := '^' - else - NegatePrefix := ''; - - Result := '[' + NegatePrefix + CharClass + ']'; -end; - -// ES2026 §11.1.4 Static Semantics: UTF16EncodeCodePoint ( cp ) -function CodePointToUtf8(const ACodePoint: Cardinal): string; -begin - if ACodePoint <= $7F then - Result := Chr(ACodePoint) - else if ACodePoint <= $7FF then - Result := Chr($C0 or (ACodePoint shr 6)) + - Chr($80 or (ACodePoint and $3F)) - else if ACodePoint <= $FFFF then - Result := Chr($E0 or (ACodePoint shr 12)) + - Chr($80 or ((ACodePoint shr 6) and $3F)) + - Chr($80 or (ACodePoint and $3F)) - else if ACodePoint <= $10FFFF then - Result := Chr($F0 or (ACodePoint shr 18)) + - Chr($80 or ((ACodePoint shr 12) and $3F)) + - Chr($80 or ((ACodePoint shr 6) and $3F)) + - Chr($80 or (ACodePoint and $3F)) - else - raise EConvertError.Create('Invalid Unicode code point: U+' + - IntToHex(ACodePoint, 4)); -end; - -function DecodeUtf8At(const APattern: string; const AIndex: Integer; - out ACodePoint: Cardinal; out AByteLength: Integer): Boolean; -var - B1, B2, B3, B4: Byte; -begin - Result := False; - ACodePoint := 0; - AByteLength := 0; - if AIndex > Length(APattern) then - Exit; - B1 := Ord(APattern[AIndex]); - if B1 < $80 then - begin - ACodePoint := B1; - AByteLength := 1; - Exit(True); - end; - if (B1 >= $C2) and (B1 <= $DF) and (AIndex + 1 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - if (B2 and $C0) <> $80 then - Exit; - ACodePoint := ((B1 and $1F) shl 6) or (B2 and $3F); - AByteLength := 2; - Exit(True); - end; - if (B1 >= $E0) and (B1 <= $EF) and (AIndex + 2 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - B3 := Ord(APattern[AIndex + 2]); - if ((B2 and $C0) <> $80) or ((B3 and $C0) <> $80) then - Exit; - ACodePoint := ((B1 and $0F) shl 12) or ((B2 and $3F) shl 6) or - (B3 and $3F); - 
AByteLength := 3; - Exit(True); - end; - if (B1 >= $F0) and (B1 <= $F4) and (AIndex + 3 <= Length(APattern)) then - begin - B2 := Ord(APattern[AIndex + 1]); - B3 := Ord(APattern[AIndex + 2]); - B4 := Ord(APattern[AIndex + 3]); - if ((B2 and $C0) <> $80) or ((B3 and $C0) <> $80) or - ((B4 and $C0) <> $80) then - Exit; - ACodePoint := ((B1 and $07) shl 18) or ((B2 and $3F) shl 12) or - ((B3 and $3F) shl 6) or (B4 and $3F); - AByteLength := 4; - Exit(True); - end; -end; - -function IsHexDigit(const C: Char): Boolean; inline; -begin - Result := CharInSet(C, ['0'..'9', 'a'..'f', 'A'..'F']); -end; - -function EscapeLiteralAtom(const AValue: string): string; -const - REGEXP_SYNTAX_CHARS = ['\', '^', '$', '.', '|', '?', '*', '+', '(', ')', - '[', ']', '{', '}']; -var - I: Integer; -begin - Result := ''; - for I := 1 to Length(AValue) do - begin - if CharInSet(AValue[I], REGEXP_SYNTAX_CHARS) then - Result := Result + '\'; - Result := Result + AValue[I]; - end; -end; - -function HexByte(const AValue: Byte): string; inline; -begin - Result := '\x' + IntToHex(AValue, 2); -end; - -function ByteRangeExcept(const AMin, AMax, AExcluded: Byte): string; -begin - Result := ''; - if AExcluded > AMin then - Result := Result + HexByte(AMin) + '-' + HexByte(AExcluded - 1); - if AExcluded < AMax then - begin - if Result <> '' then - Result := Result + HexByte(AExcluded + 1) + '-' + HexByte(AMax) - else - Result := HexByte(AExcluded + 1) + '-' + HexByte(AMax); - end; - if Result = '' then - Result := '[^\s\S]' - else - Result := '[' + Result + ']'; -end; - -function Utf8AnyCodePointExcept(const ACodePoint: Cardinal): string; -var - Bytes: string; - Parts: array of string; - - procedure AddPart(const APart: string); - begin - SetLength(Parts, Length(Parts) + 1); - Parts[High(Parts)] := APart; - end; - -var - I: Integer; -begin - Bytes := CodePointToUtf8(ACodePoint); - SetLength(Parts, 0); - if Length(Bytes) <> 1 then - AddPart('[\x00-\x7F]'); - if Length(Bytes) <> 2 then - 
AddPart('[\xC2-\xDF][\x80-\xBF]'); - if Length(Bytes) <> 3 then - AddPart('[\xE0-\xEF][\x80-\xBF][\x80-\xBF]'); - if Length(Bytes) <> 4 then - AddPart('[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]'); - - case Length(Bytes) of - 1: - AddPart(ByteRangeExcept($00, $7F, Ord(Bytes[1]))); - 2: - begin - AddPart(ByteRangeExcept($C2, $DF, Ord(Bytes[1])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2]))); - end; - 3: - begin - AddPart(ByteRangeExcept($E0, $EF, Ord(Bytes[1])) + - '[\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - ByteRangeExcept($80, $BF, Ord(Bytes[3]))); - end; - 4: - begin - AddPart(ByteRangeExcept($F0, $F4, Ord(Bytes[1])) + - '[\x80-\xBF][\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + - ByteRangeExcept($80, $BF, Ord(Bytes[2])) + - '[\x80-\xBF][\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - ByteRangeExcept($80, $BF, Ord(Bytes[3])) + '[\x80-\xBF]'); - AddPart(HexByte(Ord(Bytes[1])) + HexByte(Ord(Bytes[2])) + - HexByte(Ord(Bytes[3])) + ByteRangeExcept($80, $BF, Ord(Bytes[4]))); - end; - end; - - Result := '(?:'; - for I := 0 to High(Parts) do - begin - if I > 0 then - Result := Result + '|'; - Result := Result + Parts[I]; - end; - Result := Result + ')'; -end; - -function EmitUnicodeAtom(const ACodePoint: Cardinal; - const AIgnoreCase: Boolean): string; -begin - if AIgnoreCase and (ACodePoint = $212A) then - Result := '[Kk]' - else - Result := '(?:' + EscapeLiteralAtom(CodePointToUtf8(ACodePoint)) + ')'; -end; - -function TryParseUnicodeClassAtom(const APattern: string; var AIndex: Integer; - const AStopIndex: Integer; out ACodePoint: Cardinal): Boolean; -var - HexStr: string; - HighSurrogate, LowSurrogate: Cardinal; - ByteLength: Integer; -begin - Result := False; - ACodePoint := 0; - if AIndex > AStopIndex then - Exit; - if 
(APattern[AIndex] = '\') and (AIndex + 5 <= AStopIndex) and - (APattern[AIndex + 1] = 'u') then - begin - HexStr := Copy(APattern, AIndex + 2, 4); - if not ((Length(HexStr) = 4) and IsHexDigit(HexStr[1]) and - IsHexDigit(HexStr[2]) and IsHexDigit(HexStr[3]) and - IsHexDigit(HexStr[4])) then - Exit; - HighSurrogate := StrToInt('$' + HexStr); - Inc(AIndex, 6); - if (HighSurrogate >= $D800) and (HighSurrogate <= $DBFF) and - (AIndex + 5 <= AStopIndex) and (APattern[AIndex] = '\') and - (APattern[AIndex + 1] = 'u') then - begin - HexStr := Copy(APattern, AIndex + 2, 4); - if (Length(HexStr) = 4) and IsHexDigit(HexStr[1]) and - IsHexDigit(HexStr[2]) and IsHexDigit(HexStr[3]) and - IsHexDigit(HexStr[4]) then - begin - LowSurrogate := StrToInt('$' + HexStr); - if (LowSurrogate >= $DC00) and (LowSurrogate <= $DFFF) then - begin - ACodePoint := $10000 + ((HighSurrogate - $D800) shl 10) + - (LowSurrogate - $DC00); - Inc(AIndex, 6); - Exit(True); - end; - end; - end; - ACodePoint := HighSurrogate; - Exit(True); - end; - if DecodeUtf8At(APattern, AIndex, ACodePoint, ByteLength) and - (ByteLength > 1) then - begin - Inc(AIndex, ByteLength); - Exit(True); - end; -end; - -function TryConvertUnicodeCharacterClass(const APattern: string; - const AStartIndex: Integer; const AIgnoreCase: Boolean; - out AReplacement: string; out ANextIndex: Integer): Boolean; -var - EndIndex, AtomStart, I: Integer; - Negated: Boolean; - FirstCodePoint, LastCodePoint, CurrentCodePoint: Cardinal; - CodePoints: array of Cardinal; -begin - Result := False; - AReplacement := ''; - ANextIndex := AStartIndex; - EndIndex := AStartIndex + 1; - while (EndIndex <= Length(APattern)) and (APattern[EndIndex] <> ']') do - Inc(EndIndex); - if EndIndex > Length(APattern) then - Exit; - AtomStart := AStartIndex + 1; - Negated := (AtomStart < EndIndex) and (APattern[AtomStart] = '^'); - if Negated then - Inc(AtomStart); - I := AtomStart; - if not TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, FirstCodePoint) then - 
Exit; - if I = EndIndex then - begin - if Negated then - AReplacement := Utf8AnyCodePointExcept(FirstCodePoint) - else - AReplacement := EmitUnicodeAtom(FirstCodePoint, AIgnoreCase); - ANextIndex := EndIndex + 1; - Exit(True); - end; - if (not Negated) and (I < EndIndex) and (APattern[I] = '-') then - begin - Inc(I); - if TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, LastCodePoint) and - (I = EndIndex) and (FirstCodePoint <= LastCodePoint) and - (LastCodePoint - FirstCodePoint <= 32) then - begin - AReplacement := '(?:'; - for CurrentCodePoint := FirstCodePoint to LastCodePoint do - begin - if CurrentCodePoint > FirstCodePoint then - AReplacement := AReplacement + '|'; - AReplacement := AReplacement + - EscapeLiteralAtom(CodePointToUtf8(CurrentCodePoint)); - end; - AReplacement := AReplacement + ')'; - ANextIndex := EndIndex + 1; - Exit(True); - end; - end; - - SetLength(CodePoints, 1); - CodePoints[0] := FirstCodePoint; - while I < EndIndex do - begin - if not TryParseUnicodeClassAtom(APattern, I, EndIndex - 1, CurrentCodePoint) then - Exit(False); - SetLength(CodePoints, Length(CodePoints) + 1); - CodePoints[High(CodePoints)] := CurrentCodePoint; - end; - - if Negated then - begin - AReplacement := '[^\s\S]'; - ANextIndex := EndIndex + 1; - Exit(True); - end; - AReplacement := ''; - AReplacement := AReplacement + '(?:'; - for I := 0 to High(CodePoints) do - begin - if I > 0 then - AReplacement := AReplacement + '|'; - AReplacement := AReplacement + - EscapeLiteralAtom(CodePointToUtf8(CodePoints[I])); - end; - AReplacement := AReplacement + ')'; - ANextIndex := EndIndex + 1; - Result := True; -end; - -// ES2026 §22.2.1 Patterns — preprocess pattern for Unicode mode. -// Expands \p{...} / \P{...} property escapes into TRegExpr-compatible -// character classes and converts \u{XXXX} code point escapes into -// literal UTF-8 byte sequences. 
-function PreprocessUnicodePattern(const APattern: string; - const AIgnoreCase: Boolean): string; -var - I, J, PatternLength: Integer; - PropertyName: string; - Negated: Boolean; - InCharacterClass: Boolean; - CodePoint: Cardinal; - HexStart, HexLen: Integer; - HexStr: string; - LowSurrogate: Cardinal; - ByteLength: Integer; - ClassReplacement: string; - NextIndex: Integer; -begin - Result := ''; - I := 1; - PatternLength := Length(APattern); - InCharacterClass := False; - - while I <= PatternLength do - begin - if APattern[I] = '\' then - begin - if I + 1 > PatternLength then - begin - Result := Result + APattern[I]; - Inc(I); - Continue; - end; - - case APattern[I + 1] of - '0': - begin - Result := Result + '(?:' + #0 + ')'; - Inc(I, 2); - end; - 'S': - begin - Result := Result + UTF8_NON_SPACE_CODE_POINT; - Inc(I, 2); - end; - 'p', 'P': - begin - Negated := APattern[I + 1] = 'P'; - if (I + 2 <= PatternLength) and (APattern[I + 2] = '{') then - begin - PropertyName := ''; - Inc(I, 3); - while (I <= PatternLength) and (APattern[I] <> '}') do - begin - PropertyName := PropertyName + APattern[I]; - Inc(I); - end; - if I > PatternLength then - raise EConvertError.Create( - 'Unterminated Unicode property escape'); - Inc(I); // skip closing brace - Result := Result + - ExpandUnicodePropertyEscape(PropertyName, Negated); - end - else - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - 'u': - begin - // \u{XXXX} or \u{XXXXX} code point escape - if (I + 2 <= PatternLength) and (APattern[I + 2] = '{') then - begin - HexStart := I + 3; - HexLen := 0; - while (HexStart + HexLen <= PatternLength) and - (APattern[HexStart + HexLen] <> '}') do - Inc(HexLen); - if HexStart + HexLen > PatternLength then - raise EConvertError.Create( - 'Unterminated Unicode escape sequence'); - HexStr := Copy(APattern, HexStart, HexLen); - if HexStr = '' then - raise EConvertError.Create( - 'Empty Unicode escape sequence'); - for J := 1 to Length(HexStr) do - 
if not IsHexDigit(HexStr[J]) then - raise EConvertError.Create( - 'Invalid hex digit in Unicode escape: \u{' + - HexStr + '}'); - CodePoint := StrToInt('$' + HexStr); - if CodePoint > $10FFFF then - raise EConvertError.Create( - 'Unicode escape out of range: \u{' + HexStr + '}'); - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - I := HexStart + HexLen + 1; - end - // \uHHHH four-digit Unicode escape - else if (I + 5 <= PatternLength) and - IsHexDigit(APattern[I + 2]) and - IsHexDigit(APattern[I + 3]) and - IsHexDigit(APattern[I + 4]) and - IsHexDigit(APattern[I + 5]) then - begin - HexStr := Copy(APattern, I + 2, 4); - CodePoint := StrToInt('$' + HexStr); - if (CodePoint >= $D800) and (CodePoint <= $DBFF) and - (I + 11 <= PatternLength) and (APattern[I + 6] = '\') and - (APattern[I + 7] = 'u') and - IsHexDigit(APattern[I + 8]) and - IsHexDigit(APattern[I + 9]) and - IsHexDigit(APattern[I + 10]) and - IsHexDigit(APattern[I + 11]) then - begin - HexStr := Copy(APattern, I + 8, 4); - LowSurrogate := StrToInt('$' + HexStr); - if (LowSurrogate >= $DC00) and (LowSurrogate <= $DFFF) then - begin - CodePoint := $10000 + ((CodePoint - $D800) shl 10) + - (LowSurrogate - $DC00); - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, 12); - Continue; - end; - end; - if InCharacterClass then - Result := Result + EscapeLiteralAtom(CodePointToUtf8(CodePoint)) - else - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, 6); - end - else - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - else - begin - Result := Result + APattern[I] + APattern[I + 1]; - Inc(I, 2); - end; - end; - end - else if APattern[I] = '[' then - begin - if TryConvertUnicodeCharacterClass(APattern, I, AIgnoreCase, - 
ClassReplacement, NextIndex) then - begin - Result := Result + ClassReplacement; - I := NextIndex; - Continue; - end; - InCharacterClass := True; - Result := Result + APattern[I]; - Inc(I); - end - else if (not InCharacterClass) and (APattern[I] = '.') then - begin - Result := Result + UTF8_ANY_CODE_POINT; - Inc(I); - end - else if (not InCharacterClass) and - DecodeUtf8At(APattern, I, CodePoint, ByteLength) and - (ByteLength > 1) then - begin - Result := Result + EmitUnicodeAtom(CodePoint, AIgnoreCase); - Inc(I, ByteLength); - end - else if (APattern[I] = ']') and InCharacterClass then - begin - InCharacterClass := False; - Result := Result + APattern[I]; - Inc(I); - end - else - begin - Result := Result + APattern[I]; - Inc(I); - end; - end; -end; - -end. diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas new file mode 100644 index 00000000..83d843fa --- /dev/null +++ b/source/units/Goccia.RegExp.VM.pas @@ -0,0 +1,684 @@ +unit Goccia.RegExp.VM; + +{$I Goccia.inc} + +interface + +uses + SysUtils, + + Goccia.RegExp.Compiler; + +type + ERegExpRuntimeError = class(Exception); + + TRegExpVMResult = record + Matched: Boolean; + CaptureSlots: array of Integer; + end; + +function ExecuteRegExpVM(const AProgram: TRegExpProgram; + const AInput: string; const AStartIndex: Integer; + const ARequireStart: Boolean; out AResult: TRegExpVMResult): Boolean; + +implementation + +uses + TextSemantics; + +const + MIN_STEP_LIMIT = 10000000; + STEPS_PER_INPUT_BYTE = 100; + DEFAULT_BACKTRACK_CAP = 10000000; + MAX_LOOKBEHIND_DISTANCE = 256; + MEMO_CAPACITY = 65536; + MEMO_LOAD_LIMIT = 49152; + +type + TBacktrackEntry = record + PC: Integer; + InputPos: Integer; + Slots: array of Integer; + end; + + TMemoEntry = record + Occupied: Boolean; + PC: Integer; + InputPos: Integer; + end; + + TMemoTable = record + Entries: array of TMemoEntry; + Count: Integer; + end; + +procedure MemoInit(var AMemo: TMemoTable); +begin + AMemo.Count := 0; +end; + +procedure 
MemoEnsureAllocated(var AMemo: TMemoTable); inline; +begin + if Length(AMemo.Entries) = 0 then + SetLength(AMemo.Entries, MEMO_CAPACITY); +end; + +function MemoHash(APC, APos: Integer): Integer; inline; +var + H: Cardinal; +begin + H := Cardinal(APC); + H := (H shl 5) xor (H shr 3) xor Cardinal(APos); + H := H xor (H shr 7) xor (H shr 15); + Result := Integer(H and (MEMO_CAPACITY - 1)); +end; + +function MemoContains(var AMemo: TMemoTable; APC, APos: Integer): Boolean; +var + Idx, I: Integer; +begin + if Length(AMemo.Entries) = 0 then + Exit(False); + Idx := MemoHash(APC, APos); + for I := 0 to 15 do + begin + if not AMemo.Entries[Idx].Occupied then + Exit(False); + if (AMemo.Entries[Idx].PC = APC) and (AMemo.Entries[Idx].InputPos = APos) then + Exit(True); + Idx := (Idx + 1) and (MEMO_CAPACITY - 1); + end; + Result := False; +end; + +procedure MemoAdd(var AMemo: TMemoTable; APC, APos: Integer); +var + Idx, I: Integer; +begin + MemoEnsureAllocated(AMemo); + if AMemo.Count >= MEMO_LOAD_LIMIT then + Exit; + Idx := MemoHash(APC, APos); + for I := 0 to 15 do + begin + if not AMemo.Entries[Idx].Occupied then + begin + AMemo.Entries[Idx].Occupied := True; + AMemo.Entries[Idx].PC := APC; + AMemo.Entries[Idx].InputPos := APos; + Inc(AMemo.Count); + Exit; + end; + if (AMemo.Entries[Idx].PC = APC) and (AMemo.Entries[Idx].InputPos = APos) then + Exit; + Idx := (Idx + 1) and (MEMO_CAPACITY - 1); + end; +end; + +function CharClassContainsLinear(const AClass: TRegExpCharClass; + ACodePoint: Cardinal): Boolean; inline; +var + I: Integer; +begin + for I := 0 to High(AClass.Ranges) do + if (ACodePoint >= AClass.Ranges[I].Lo) and + (ACodePoint <= AClass.Ranges[I].Hi) then + Exit(True); + Result := False; +end; + +function IsWordChar(ACodePoint: Cardinal): Boolean; inline; +begin + Result := ((ACodePoint >= Ord('a')) and (ACodePoint <= Ord('z'))) or + ((ACodePoint >= Ord('A')) and (ACodePoint <= Ord('Z'))) or + ((ACodePoint >= Ord('0')) and (ACodePoint <= Ord('9'))) or + (ACodePoint 
= Ord('_')); +end; + +function IsLineTerminator(ACodePoint: Cardinal): Boolean; inline; +begin + Result := (ACodePoint = $0A) or (ACodePoint = $0D) or + (ACodePoint = $2028) or (ACodePoint = $2029); +end; + +function ReadInputCodePoint(const AInput: string; APos: Integer; + out ACodePoint: Cardinal; out AByteLen: Integer): Boolean; inline; +var + B: Byte; +begin + if (APos < 1) or (APos > Length(AInput)) then + begin + ACodePoint := 0; + AByteLen := 0; + Exit(False); + end; + B := Ord(AInput[APos]); + if B < $80 then + begin + ACodePoint := B; + AByteLen := 1; + Exit(True); + end; + Result := TryReadUTF8CodePointAllowSurrogates(AInput, APos, ACodePoint, + AByteLen); + if not Result then + begin + ACodePoint := B; + AByteLen := 1; + Result := True; + end; +end; + +function GetCodePointBefore(const AInput: string; APos: Integer; + out ACodePoint: Cardinal): Boolean; +var + StartPos, ByteLen: Integer; +begin + Result := False; + ACodePoint := 0; + if APos <= 1 then + Exit; + StartPos := APos - 1; + while (StartPos > 1) and ((Ord(AInput[StartPos]) and $C0) = $80) do + Dec(StartPos); + Result := TryReadUTF8CodePointAllowSurrogates(AInput, StartPos, ACodePoint, + ByteLen); +end; + +function RunVM(const AProgram: TRegExpProgram; const AInput: string; + AStartPos: Integer; var ASlots: array of Integer; + ASlotCount: Integer; AStartPC: Integer = 0; + AEndPos: PInteger = nil): Boolean; +var + PC, InputPos: Integer; + Instr: UInt32; + Op: TRegExpOpCode; + Bx: Integer; + CodePoint: Cardinal; + ByteLen: Integer; + Stack: array of TBacktrackEntry; + StackTop: Integer; + StepCount: Integer; + StepLimit: Integer; + Memo: TMemoTable; + SlotCount: Integer; + I: Integer; + MatchCP: Cardinal; + BeforeCP: Cardinal; + BeforeIsWord, AfterIsWord: Boolean; + Negated: Boolean; + BackrefGroup: Integer; + BackrefICase: Boolean; + LookEnd: Integer; + LookSlots: array of Integer; + LookMatched: Boolean; + RefStart, RefEnd, RefPos: Integer; + RefCP, InputCP: Cardinal; + RefByteLen, InputByteLen: 
Integer; + + procedure PushBacktrack(APC, AInputPos: Integer); + begin + if StackTop >= DEFAULT_BACKTRACK_CAP then + raise ERegExpRuntimeError.Create('Maximum regular expression backtrack stack size exceeded'); + Inc(StackTop); + if StackTop >= Length(Stack) then + SetLength(Stack, StackTop * 2 + 16); + Stack[StackTop].PC := APC; + Stack[StackTop].InputPos := AInputPos; + if Length(Stack[StackTop].Slots) <> SlotCount then + SetLength(Stack[StackTop].Slots, SlotCount); + if SlotCount > 0 then + Move(ASlots[0], Stack[StackTop].Slots[0], SlotCount * SizeOf(Integer)); + end; + + function PopBacktrack: Boolean; + begin + while StackTop >= 0 do + begin + PC := Stack[StackTop].PC; + InputPos := Stack[StackTop].InputPos; + if SlotCount > 0 then + Move(Stack[StackTop].Slots[0], ASlots[0], SlotCount * SizeOf(Integer)); + Dec(StackTop); + if not MemoContains(Memo, PC, InputPos) then + Exit(True); + end; + Result := False; + end; + +begin + Result := False; + SlotCount := ASlotCount; + PC := AStartPC; + InputPos := AStartPos; + StepCount := 0; + StepLimit := Length(AInput) * STEPS_PER_INPUT_BYTE; + if StepLimit < MIN_STEP_LIMIT then + StepLimit := MIN_STEP_LIMIT; + StackTop := -1; + SetLength(Stack, 256); + MemoInit(Memo); + + while PC < Length(AProgram.Code) do + begin + Inc(StepCount); + if StepCount > StepLimit then + raise ERegExpRuntimeError.Create('Maximum regular expression backtrack stack size exceeded'); + + Instr := AProgram.Code[PC]; + Op := TRegExpOpCode(Instr and $FF); + Bx := Integer(Instr shr 8); + + case Op of + RX_CHAR: + begin + if not ReadInputCodePoint(AInput, InputPos, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + MatchCP := Cardinal(Bx); + if CodePoint <> MatchCP then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_CHAR_CLASS: + begin + if not ReadInputCodePoint(AInput, InputPos, + 
CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if not CharClassContainsLinear(AProgram.CharClasses[Bx], CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_CHAR_CLASS_NEG: + begin + if not ReadInputCodePoint(AInput, InputPos, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if CharClassContainsLinear(AProgram.CharClasses[Bx], CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_ANY: + begin + if not ReadInputCodePoint(AInput, InputPos, + CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + if (Bx = 0) and IsLineTerminator(CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(InputPos, ByteLen); + Inc(PC); + end; + + RX_SPLIT: + begin + if not MemoContains(Memo, Bx, InputPos) then + PushBacktrack(Bx, InputPos); + Inc(PC); + end; + + RX_SPLIT_LAZY: + begin + if not MemoContains(Memo, PC + 1, InputPos) then + PushBacktrack(PC + 1, InputPos); + PC := Bx; + end; + + RX_JUMP: + begin + if (Bx >= 0) and (Bx < Length(AProgram.Code)) and + (TRegExpOpCode(AProgram.Code[Bx] and $FF) = RX_SPLIT) then + begin + if (StackTop >= 0) and (Stack[StackTop].PC = Integer(AProgram.Code[Bx] shr 8)) and + (Stack[StackTop].InputPos = InputPos) then + begin + PC := Integer(AProgram.Code[Bx] shr 8); + Continue; + end; + end; + PC := Bx; + end; + + RX_SAVE: + begin + if Bx < SlotCount then + ASlots[Bx] := InputPos; + Inc(PC); + end; + + RX_BACKREF: + begin + Negated := (Bx and BACKREF_STRICT_FLAG) <> 0; + BackrefICase := (Bx and BACKREF_ICASE_FLAG) <> 0; + BackrefGroup := Bx and BACKREF_INDEX_MASK; + RefStart := -1; + RefEnd 
:= -1; + if (BackrefGroup * 2) < SlotCount then + RefStart := ASlots[BackrefGroup * 2]; + if (BackrefGroup * 2 + 1) < SlotCount then + RefEnd := ASlots[BackrefGroup * 2 + 1]; + if (RefStart < 0) or (RefEnd < 0) or (RefStart > RefEnd) then + begin + if Negated then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(PC); + Continue; + end; + RefPos := RefStart; + LookMatched := True; + RefEnd := ASlots[BackrefGroup * 2 + 1]; + I := InputPos; + while RefPos < RefEnd do + begin + if not ReadInputCodePoint(AInput, RefPos, + RefCP, RefByteLen) then + begin + LookMatched := False; + Break; + end; + if not ReadInputCodePoint(AInput, InputPos, + InputCP, InputByteLen) then + begin + LookMatched := False; + Break; + end; + if RefCP <> InputCP then + begin + if BackrefICase then + begin + if (RefCP >= Ord('A')) and (RefCP <= Ord('Z')) then + RefCP := RefCP + 32; + if (InputCP >= Ord('A')) and (InputCP <= Ord('Z')) then + InputCP := InputCP + 32; + end; + if RefCP <> InputCP then + begin + LookMatched := False; + Break; + end; + end; + Inc(RefPos, RefByteLen); + Inc(InputPos, InputByteLen); + end; + if not LookMatched then + begin + InputPos := I; + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Inc(PC); + end; + + RX_ASSERT_START: + begin + if Bx <> 0 then + begin + if InputPos > 1 then + begin + if not GetCodePointBefore(AInput, InputPos, BeforeCP) or + not IsLineTerminator(BeforeCP) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + end + else + begin + if InputPos > 1 then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_ASSERT_END: + begin + if Bx <> 0 then + begin + if ReadInputCodePoint(AInput, InputPos, + CodePoint, ByteLen) then + begin + if not IsLineTerminator(CodePoint) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + 
end; + end; + end + else + begin + if InputPos <= Length(AInput) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_ASSERT_WORD: + begin + Negated := Bx <> 0; + BeforeIsWord := False; + AfterIsWord := False; + if GetCodePointBefore(AInput, InputPos, BeforeCP) then + BeforeIsWord := IsWordChar(BeforeCP); + if ReadInputCodePoint(AInput, InputPos, + CodePoint, ByteLen) then + AfterIsWord := IsWordChar(CodePoint); + if Negated then + begin + if BeforeIsWord <> AfterIsWord then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if BeforeIsWord = AfterIsWord then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; + Inc(PC); + end; + + RX_LOOKAHEAD: + begin + Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; + LookEnd := Bx and LOOK_TARGET_MASK; + SetLength(LookSlots, SlotCount); + Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); + LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, + SlotCount, PC + 1); + if Negated then + begin + if LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if not LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Move(LookSlots[0], ASlots[0], SlotCount * SizeOf(Integer)); + end; + PC := LookEnd; + end; + + RX_LOOKBEHIND: + begin + Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; + LookEnd := Bx and LOOK_TARGET_MASK; + LookMatched := False; + SetLength(LookSlots, SlotCount); + I := InputPos - 1; + RefStart := I - MAX_LOOKBEHIND_DISTANCE; + if RefStart < 1 then + RefStart := 1; + while I >= RefStart do + begin + Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); + RefEnd := 0; + if RunVM(AProgram, AInput, I, LookSlots, SlotCount, PC + 1, + @RefEnd) then + begin + if RefEnd = InputPos then + begin + 
LookMatched := True; + Break; + end; + end; + Dec(I); + end; + if Negated then + begin + if LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else + begin + if not LookMatched then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + Move(LookSlots[0], ASlots[0], SlotCount * SizeOf(Integer)); + end; + PC := LookEnd; + end; + + RX_MATCH: + begin + if AEndPos <> nil then + AEndPos^ := InputPos; + Result := True; + Exit; + end; + + RX_FAIL: + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + else + raise ERegExpRuntimeError.CreateFmt( + 'Invalid regular expression bytecode: opcode %d at PC %d', + [Ord(Op), PC]); + end; + end; +end; + +function ExecuteRegExpVM(const AProgram: TRegExpProgram; + const AInput: string; const AStartIndex: Integer; + const ARequireStart: Boolean; out AResult: TRegExpVMResult): Boolean; +var + SlotCount, I, StartPos: Integer; + Slots: array of Integer; + ByteLen: Integer; + CodePoint: Cardinal; +begin + Result := False; + AResult.Matched := False; + SlotCount := (AProgram.CaptureCount + 1) * 2; + SetLength(Slots, SlotCount); + StartPos := AStartIndex + 1; + if ARequireStart then + begin + FillChar(Slots[0], SlotCount * SizeOf(Integer), $FF); + if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then + begin + AResult.Matched := True; + SetLength(AResult.CaptureSlots, SlotCount); + Move(Slots[0], AResult.CaptureSlots[0], SlotCount * SizeOf(Integer)); + Result := True; + end; + Exit; + end; + while StartPos <= Length(AInput) + 1 do + begin + FillChar(Slots[0], SlotCount * SizeOf(Integer), $FF); + if RunVM(AProgram, AInput, StartPos, Slots, SlotCount) then + begin + AResult.Matched := True; + SetLength(AResult.CaptureSlots, SlotCount); + Move(Slots[0], AResult.CaptureSlots[0], SlotCount * SizeOf(Integer)); + Result := True; + Exit; + end; + if StartPos > Length(AInput) then + Break; + if 
TryReadUTF8CodePointAllowSurrogates(AInput, StartPos, CodePoint, ByteLen) then + Inc(StartPos, ByteLen) + else + Inc(StartPos); + end; +end; + +end. diff --git a/tests/built-ins/RegExp/constructor.js b/tests/built-ins/RegExp/constructor.js index 159f315a..392fb7b7 100644 --- a/tests/built-ins/RegExp/constructor.js +++ b/tests/built-ins/RegExp/constructor.js @@ -98,3 +98,39 @@ test("RegExp canonicalizes new flags in correct order", () => { const regex = new RegExp("a", "yvgdims"); expect(regex.flags).toBe("dgimsvy"); }); + +// --- Syntax validation --- + +test("dangling quantifier throws SyntaxError", () => { + expect(() => { new RegExp("a**"); }).toThrow(SyntaxError); + expect(() => { new RegExp("??"); }).toThrow(SyntaxError); + expect(() => { new RegExp("+"); }).toThrow(SyntaxError); + expect(() => { new RegExp("*"); }).toThrow(SyntaxError); +}); + +test("invalid character class range throws SyntaxError", () => { + expect(() => { new RegExp("[z-a]"); }).toThrow(SyntaxError); + expect(() => { new RegExp("[b-ac-e]"); }).toThrow(SyntaxError); +}); + +test("quantifier min > max throws SyntaxError", () => { + expect(() => { new RegExp("0{2,1}"); }).toThrow(SyntaxError); +}); + +test("trailing backslash throws SyntaxError", () => { + expect(() => { new RegExp("\\"); }).toThrow(SyntaxError); +}); + +test("huge quantifier does not crash", () => { + expect(/x{2147483648}x/.test("1")).toBe(false); +}); + +test("exec on Object.create(RegExp.prototype) throws TypeError", () => { + const obj = Object.create(RegExp.prototype); + expect(() => { RegExp.prototype.exec.call(obj, "test"); }).toThrow(TypeError); +}); + +test("test on Object.create(RegExp.prototype) throws TypeError", () => { + const obj = Object.create(RegExp.prototype); + expect(() => { RegExp.prototype.test.call(obj, "test"); }).toThrow(TypeError); +}); diff --git a/tests/built-ins/RegExp/modifiers.js b/tests/built-ins/RegExp/modifiers.js index bb4e3d1b..97459b40 100644 --- a/tests/built-ins/RegExp/modifiers.js +++ 
b/tests/built-ins/RegExp/modifiers.js @@ -224,6 +224,21 @@ test("(?-:...) empty add and remove throws SyntaxError", () => { expect(() => { new RegExp("(?-:abc)"); }).toThrow(SyntaxError); }); +// --- Modifier scoping affects backreferences --- + +test("(?i:\\1) case-folds backreference comparison", () => { + const re = /(a)(?i:\1)/; + expect(re.test("aA")).toBe(true); + expect(re.test("aa")).toBe(true); + expect(re.test("AA")).toBe(false); +}); + +test("(?-i:\\1) disables case-folding for backreference", () => { + const re = new RegExp("(a)(?-i:\\1)", "i"); + expect(re.test("aa")).toBe(true); + expect(re.test("aA")).toBe(false); +}); + // --- Error cases: double dash --- test("(?i--s:...) double dash throws SyntaxError", () => { diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 27a29d4b..fdf2be21 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -194,3 +194,99 @@ test("exec with duplicate named backreference outside the disjunction", () => { expect(re.exec("b")).toBe(null); expect(re.exec("ab")).toBe(null); }); + +// --- Greedy quantifier with alternation --- + +test("greedy star with alternation picks correct match", () => { + const m = /(aa|aabaac|ba|b|c)*/.exec("aabaac"); + expect(m[0]).toBe("aaba"); + expect(m[1]).toBe("ba"); +}); + +test("greedy star with character class quantifier backtracks correctly", () => { + const m = /^([a-z]+)*[a-z]$/.exec("ab"); + expect(m[0]).toBe("ab"); + expect(m[1]).toBe("a"); +}); + +test("backreference backtracking finds correct capture length", () => { + const m = /^(a+)\1*,\1+$/.exec("aaaaaaaaaa,aaaaaaaaaaaaaaa"); + expect(m[0]).toBe("aaaaaaaaaa,aaaaaaaaaaaaaaa"); + expect(m[1]).toBe("aaaaa"); +}); + +test("replace with backreference uses correct capture", () => { + expect("aaaaaaaaaa,aaaaaaaaaaaaaaa".replace(/^(a+)\1*,\1+$/, "$1")).toBe("aaaaa"); +}); + +// --- Zero-width backref loop --- + +test("backreference to 
zero-length capture with + does not hang", () => { + const m = /(a*)b\1+/.exec("baaac"); + expect(m[0]).toBe("b"); + expect(m[1]).toBe(""); +}); + +// --- Backtrack limit --- + +test("catastrophic backtracking throws Error instead of hanging", () => { + expect(() => { + /^(a+)+$/.exec("a".repeat(30) + "b"); + }).toThrow(Error); +}); + +// --- Large input (#515 regression) --- + +test("exec on large input does not crash", () => { + const s = "foo" + ".bar".repeat(20000); + expect(/f.*/.test(s)).toBe(true); +}); + +// --- Lookahead --- + +test("positive lookahead matches without consuming", () => { + const m = /foo(?=bar)/.exec("foobar"); + expect(m[0]).toBe("foo"); + expect(m.index).toBe(0); +}); + +test("negative lookahead rejects when pattern present", () => { + expect(/foo(?!bar)/.test("foobar")).toBe(false); + expect(/foo(?!bar)/.test("foobaz")).toBe(true); +}); + +// --- Lookbehind --- + +test("positive lookbehind matches fixed-length pattern", () => { + const m = /(?<=foo)bar/.exec("foobar"); + expect(m[0]).toBe("bar"); + expect(m.index).toBe(3); +}); + +test("positive lookbehind fails when prefix absent", () => { + expect(/(?<=foo)bar/.test("bazbar")).toBe(false); +}); + +test("negative lookbehind rejects when pattern present", () => { + expect(/(? 
{ + const m = "xabcd".match(/.*(?<=(..|...|....))(.*)/); + expect(m[0]).toBe("xabcd"); + expect(m[1]).toBe("cd"); + expect(m[2]).toBe(""); +}); + +test("lookbehind with quantifier in outer pattern", () => { + const m = /(?<=\d+)px/.exec("100px"); + expect(m[0]).toBe("px"); + expect(m.index).toBe(3); +}); + +test("lookbehind does not consume input", () => { + const m = /(?<=a)b/.exec("ab"); + expect(m[0]).toBe("b"); + expect(m.index).toBe(1); +}); diff --git a/tests/built-ins/RegExp/unicode.js b/tests/built-ins/RegExp/unicode.js index 64f5b119..f8ed7894 100644 --- a/tests/built-ins/RegExp/unicode.js +++ b/tests/built-ins/RegExp/unicode.js @@ -199,3 +199,55 @@ test("\\p{Separator}", () => { test("\\p{Z} shorthand for Separator", () => { expect(new RegExp("\\p{Z}", "u").test(" ")).toBe(true); }); + +// --- Multi-byte UTF-8 code point handling --- + +test("dot matches multi-byte BMP characters in dotAll mode", () => { + expect(/^.$/s.test("
")).toBe(true); + expect(/^.$/s.test("
")).toBe(true); + expect(/^.$/s.test("…")).toBe(true); +}); + +test("dot rejects line terminators without dotAll", () => { + expect(/^.$/.test("
")).toBe(false); + expect(/^.$/.test("
")).toBe(false); + expect(/^.$/.test("\n")).toBe(false); + expect(/^.$/.test("\r")).toBe(false); +}); + +test("multiline ^ matches after newline in multi-byte context", () => { + expect(/^abc/m.test("xyz +abc")).toBe(true); + expect(/^abc/m.test("é +abc")).toBe(true); +}); + +test("multiline $ matches before newline in multi-byte context", () => { + expect(/abc$/m.test("abc +xyz")).toBe(true); + expect(/abc$/m.test("abc +é")).toBe(true); +}); + +// --- Unicode mode syntax restrictions --- + +test("\\c without letter throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("\\c", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("\\c1", "u"); }).toThrow(SyntaxError); +}); + +test("quantified assertion throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("(?=.)*", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("(?=.)+", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("(?!.){2}", "u"); }).toThrow(SyntaxError); +}); + +test("\\c inside character class without letter throws SyntaxError in unicode mode", () => { + expect(() => { new RegExp("[\\c]", "u"); }).toThrow(SyntaxError); + expect(() => { new RegExp("[\\c1]", "u"); }).toThrow(SyntaxError); +}); + +test("\\p{ASCII} matches on large input without hitting step limit", () => { + const s = "abcdefghij0123456789".repeat(50); + expect(new RegExp("^\\p{ASCII}+$", "u").test(s)).toBe(true); +});