From 88b698982fba71964bb1b482dbf9778391666a15 Mon Sep 17 00:00:00 2001 From: Logan Stokols Date: Mon, 25 May 2026 21:56:42 -0400 Subject: [PATCH 1/3] feat(xfa): extract scripts from nodes not emitted as Questions/Sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, script extraction was driven by walkSubformChildren via the per-node attachFieldScripts helper, so any script whose owner node was suppressed by emitField / emitDraw was silently dropped. The known cases: event-bearing <draw> elements (status indicators), bind="none" non-AddAttachment buttons (Help Text / Show Intro triggers), <pageArea> events, and per-option scripts on <field>s flattened into an <exclGroup>'s Options. The b44a6f9 FormScript comment acknowledged this gap and deferred the fix. Split script extraction from question emission: - New extractAllScripts walks the entire xfaNode tree post-Section walk and emits a FormScript for every event-bearing node, regardless of whether the node was emitted as a Question or Section. OwnerPath is set to the SOM path of the owning node; OwnerID is left empty. - New populateScriptBackRefs indexes the resulting scripts by OwnerPath, fills in OwnerID and Question.Scripts / FormSection.Scripts back-refs whenever the owner was also emitted as a Question/Section, and leaves orphans with empty OwnerID. - exclGroup OptionEvents is a parallel slice to Options that preserves per-option events through the flatten so they reach extractAllScripts. - pageArea and exclGroup are now valid event-stack targets in the parseXFATemplate state machine (previously only subform). The attachFieldScripts / appendScripts helpers and xfaNode.QuestionID field are removed — back-refs are now populated by path lookup in a single post-pass rather than threaded through emission. 
Tests cover all four orphan cases (pageArea, bind=none button, draw with event, exclGroup per-option) and assert that the corresponding nodes are still NOT emitted as Questions — only their scripts are surfaced. FormScript doc comment rewritten: orphan scripts are now first-class (OwnerPath set, OwnerID empty) rather than missing. Co-Authored-By: Claude Opus 4.7 (1M context) --- forms/xfa/xfa_form_translator.go | 180 ++++++++++++++++----- forms/xfa/xfa_script_extraction_test.go | 207 ++++++++++++++++++++++++ types/form_types.go | 11 +- 3 files changed, 348 insertions(+), 50 deletions(-) diff --git a/forms/xfa/xfa_form_translator.go b/forms/xfa/xfa_form_translator.go index b6420e8..0a9b24d 100644 --- a/forms/xfa/xfa_form_translator.go +++ b/forms/xfa/xfa_form_translator.go @@ -420,6 +420,12 @@ type xfaNode struct { Validation *XFAValidation Events []XFAEvent + // OptionEvents is parallel to Options on exclGroup nodes: one []XFAEvent per + // flattened option <field>. Lets per-option event scripts (originally hung off + // the individual <field> inside an <exclGroup>) survive the flatten so they + // can still be surfaced as FormScripts. + OptionEvents [][]XFAEvent + // UI-element-specific constraints AllowNeutral bool // checkButton allowNeutral="1" → tri-state checkbox MaxChars *int // textEdit maxChars → ValidationRules.MaxLength @@ -444,11 +450,6 @@ type xfaNode struct { ExDataHTML string // plain text extracted from text/html exData without xfa:embed markers PageNumber int - - // QuestionID is the ID assigned to this node's emitted Question. - // Set by emitField/emitExclGroup after the question is built; empty otherwise. - // Used by buildFormScripts to populate FormScript.OwnerID for field-attached scripts. - QuestionID string } // xfaTemplateResult bundles the parse tree with top-level metadata. 
@@ -965,7 +966,7 @@ func parseXFATemplate(xfaXML string, verbose bool) (*xfaTemplateResult, error) { } if currentLeaf != nil { currentLeaf.Events = append(currentLeaf.Events, ev) - } else if top := topOfStack(); top.Kind == xfaKindSubform { + } else if top := topOfStack(); top.Kind == xfaKindSubform || top.Kind == xfaKindPageArea || top.Kind == xfaKindExclGroup { top.Events = append(top.Events, ev) } @@ -1002,7 +1003,7 @@ func parseXFATemplate(xfaXML string, verbose bool) (*xfaTemplateResult, error) { putAttr(&last.Properties, attr.Name.Local, attr.Value) } } - } else if top := topOfStack(); top.Kind == xfaKindSubform && len(top.Events) > 0 { + } else if top := topOfStack(); (top.Kind == xfaKindSubform || top.Kind == xfaKindPageArea || top.Kind == xfaKindExclGroup) && len(top.Events) > 0 { inScript = true currentValue.Reset() last := &top.Events[len(top.Events)-1] @@ -1155,6 +1156,9 @@ func parseXFATemplate(xfaXML string, verbose bool) (*xfaTemplateResult, error) { optValue = currentLeaf.Name } top.Options = append(top.Options, XFAOption{Label: optLabel, Value: optValue}) + // Preserve per-option events (parallel slice) so they can still + // surface as FormScripts even though the option is flattened. 
+ top.OptionEvents = append(top.OptionEvents, currentLeaf.Events) // draws inside exclGroup are decorative labels — fall through and discard } else if top.Kind != xfaKindExclGroup { top.Children = append(top.Children, currentLeaf) @@ -1269,7 +1273,7 @@ func parseXFATemplate(xfaXML string, verbose bool) (*xfaTemplateResult, error) { } else if currentLeaf != nil && len(currentLeaf.Events) > 0 { last := &currentLeaf.Events[len(currentLeaf.Events)-1] last.Body = currentValue.String() - } else if top := topOfStack(); top.Kind == xfaKindSubform && len(top.Events) > 0 { + } else if top := topOfStack(); (top.Kind == xfaKindSubform || top.Kind == xfaKindPageArea || top.Kind == xfaKindExclGroup) && len(top.Events) > 0 { last := &top.Events[len(top.Events)-1] last.Body = currentValue.String() } @@ -1402,6 +1406,15 @@ func buildFormSchema(result *xfaTemplateResult, verbose bool) *types.FormSchema var qIdx int schema.Sections = walkSubformChildren(result.Root, nil, schema, &qIdx, false, verbose) + // Extract every + + + + + + +` + + form, err := ParseXFAForm(xfaXML, false) + if err != nil { + t.Fatalf("ParseXFAForm() error = %v", err) + } + s := findScript(form.Scripts, "form1.Master", "ready") + if s == nil { + t.Fatalf("no script with OwnerPath=form1.Master event=ready; got %+v", form.Scripts) + } + if s.Body != body { + t.Errorf("body = %q, want %q", s.Body, body) + } + if s.OwnerID != "" { + t.Errorf("OwnerID = %q, want empty (pageArea is not a Question/Section)", s.OwnerID) + } +} + +// TestBindNoneButtonScriptExtracted verifies that a bind="none" button that is +// not an AddAttachment (e.g. "Help Text" / "Show Intro" UI triggers) still +// contributes its click-handler script to FormSchema.Scripts, even though the +// button itself is not emitted as a Question. 
+func TestBindNoneButtonScriptExtracted(t *testing.T) { + body := `xfa.host.messageBox("?-hint");` + xfaXML := `` + + form, err := ParseXFAForm(xfaXML, false) + if err != nil { + t.Fatalf("ParseXFAForm() error = %v", err) + } + for _, q := range form.Questions { + if q.Name == "helpBtn" { + t.Fatalf("bind=none non-AddAttachment button should not be emitted as Question; got %+v", q) + } + } + s := findScript(form.Scripts, "Page1.helpBtn", "click") + if s == nil { + t.Fatalf("no orphan script for Page1.helpBtn click; got %+v", form.Scripts) + } + if s.Body != body { + t.Errorf("body = %q, want %q", s.Body, body) + } + if s.OwnerID != "" { + t.Errorf("OwnerID = %q, want empty (button is not emitted as Question)", s.OwnerID) + } +} + +// TestDrawEventScriptExtracted verifies that event-bearing elements +// (status indicators with dynamic show/hide handlers) still surface their +// scripts as orphan FormScripts even though the draw itself is suppressed. +func TestDrawEventScriptExtracted(t *testing.T) { + body := `this.presence = "hidden";` + xfaXML := `` + + form, err := ParseXFAForm(xfaXML, false) + if err != nil { + t.Fatalf("ParseXFAForm() error = %v", err) + } + for _, q := range form.Questions { + if q.Name == "statusOk" { + t.Fatalf("event-bearing draw should not be emitted as Question; got %+v", q) + } + } + s := findScript(form.Scripts, "Page1.statusOk", "initialize") + if s == nil { + t.Fatalf("no orphan script for Page1.statusOk initialize; got %+v", form.Scripts) + } + if s.Body != body { + t.Errorf("body = %q, want %q", s.Body, body) + } + if s.OwnerID != "" { + t.Errorf("OwnerID = %q, want empty (draw is not emitted as Question)", s.OwnerID) + } +} + +// TestExclGroupOptionScriptsExtracted verifies that per-option +// blocks (defined on the individual radio-option s inside an +// ) are preserved as distinct FormScripts. Each option's script +// must have its own OwnerPath (exclGroup SOM path + "." 
+ option value) and +// remain an orphan, while the exclGroup itself (which IS a Question) gets +// its own script back-ref via the standard Question.Scripts mechanism. +func TestExclGroupOptionScriptsExtracted(t *testing.T) { + bodyA := `xfa.host.messageBox("A selected");` + bodyB := `xfa.host.messageBox("B selected");` + groupBody := `xfa.host.messageBox("group changed");` + xfaXML := `` + + form, err := ParseXFAForm(xfaXML, false) + if err != nil { + t.Fatalf("ParseXFAForm() error = %v", err) + } + + var choiceQ *types.Question + for i := range form.Questions { + if form.Questions[i].Name == "choice" { + choiceQ = &form.Questions[i] + } + } + if choiceQ == nil { + t.Fatalf("exclGroup 'choice' question missing") + } + + groupScript := findScript(form.Scripts, "Page1.choice", "change") + if groupScript == nil { + t.Fatalf("no script with OwnerPath=Page1.choice event=change; got %+v", form.Scripts) + } + if groupScript.Body != groupBody { + t.Errorf("group body = %q, want %q", groupScript.Body, groupBody) + } + if groupScript.OwnerID != choiceQ.ID { + t.Errorf("group OwnerID = %q, want question ID %q", groupScript.OwnerID, choiceQ.ID) + } + + scriptA := findScript(form.Scripts, "Page1.choice.a", "click") + if scriptA == nil { + t.Fatalf("no per-option script with OwnerPath=Page1.choice.a; got %+v", form.Scripts) + } + if scriptA.Body != bodyA { + t.Errorf("option A body = %q, want %q", scriptA.Body, bodyA) + } + if scriptA.OwnerID != "" { + t.Errorf("option A OwnerID = %q, want empty (orphan)", scriptA.OwnerID) + } + + scriptB := findScript(form.Scripts, "Page1.choice.b", "click") + if scriptB == nil { + t.Fatalf("no per-option script with OwnerPath=Page1.choice.b; got %+v", form.Scripts) + } + if scriptB.Body != bodyB { + t.Errorf("option B body = %q, want %q", scriptB.Body, bodyB) + } + if scriptB.OwnerID != "" { + t.Errorf("option B OwnerID = %q, want empty (orphan)", scriptB.OwnerID) + } +} diff --git a/types/form_types.go b/types/form_types.go index 
a021b04..5319ec9 100644 --- a/types/form_types.go +++ b/types/form_types.go @@ -100,12 +100,11 @@ type ValidationRules struct { // FormScript represents a raw script block extracted from an XFA form. // Bodies are exposed verbatim — pdfer does not interpret script semantics. // -// Limitations: scripts attached to XFA nodes that pdfer does not surface in -// the schema are not extracted. This includes decorative <draw> elements with -// events (e.g. status indicators), buttons with bind="none" other than -// AddAttachment, <pageArea>-level events, and individual <field> radio options -// that are collapsed into an <exclGroup>'s Options. Callers that need full -// event fidelity should walk the raw XFA XML directly. +// Scripts whose owner node is not emitted as a Question or FormSection (e.g. +// event-bearing <draw> elements, bind="none" non-AddAttachment buttons, +// <pageArea> events, individual radio options collapsed into an <exclGroup>) +// appear here with OwnerPath set and OwnerID empty. Callers that need to +// correlate orphan scripts must inspect OwnerPath directly. type FormScript struct { ID string `json:"id"` // stable: SOM owner path + "#" + event + "[" + index + "]" OwnerPath string `json:"owner_path,omitempty"` // SOM path of containing node (e.g. "form1.section.field"); empty for template-level From 7594b78462ea5e38d0d6fe4625f1ccad7272be66 Mon Sep 17 00:00:00 2001 From: Logan Stokols Date: Mon, 25 May 2026 22:59:27 -0400 Subject: [PATCH 2/3] docs(xfa): add scope recommendation design doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the high-level design plan for pdfer's XFA surface: - The scope principle: extract structure and surface logic, don't execute logic. Schema is a projection of the template DOM, not a snapshot of a Form DOM. Runtime model (instance counts, presence toggles, calculation order) is the caller's responsibility. 
- P1 roadmap: orphan-script extraction (done in 88b6989), a parallel Elements collection for non-question template nodes with visual presence or events, and / metadata on Sections and Questions for dynamic XFA. - P2 drafts: SOM path parser and schema resolver, SOM-keyed data-DOM cursor API, and + + + + + +` + + form, err := ParseXFAForm(xfaXML, false) + if err != nil { + t.Fatalf("ParseXFAForm() error = %v", err) + } + + var deepQ *types.Question + for i := range form.Questions { + if form.Questions[i].Name == "deepField" { + deepQ = &form.Questions[i] + } + } + if deepQ == nil { + t.Fatalf("deepField question missing; got %+v", form.Questions) + } + + wantPath := "form1.outer.inner.deepField" + s := findScript(form.Scripts, wantPath, "change") + if s == nil { + t.Fatalf("no script with OwnerPath=%s; got %+v", wantPath, form.Scripts) + } + if s.OwnerID != deepQ.ID { + t.Errorf("OwnerID = %q, want question ID %q (back-ref must resolve through nested sections)", s.OwnerID, deepQ.ID) + } + if len(deepQ.Scripts) != 1 || deepQ.Scripts[0] != s.ID { + t.Errorf("Question.Scripts = %v, want [%q]", deepQ.Scripts, s.ID) + } } diff --git a/types/form_types.go b/types/form_types.go index 5319ec9..062083d 100644 --- a/types/form_types.go +++ b/types/form_types.go @@ -100,11 +100,16 @@ type ValidationRules struct { // FormScript represents a raw script block extracted from an XFA form. // Bodies are exposed verbatim — pdfer does not interpret script semantics. // -// Scripts whose owner node is not emitted as a Question or FormSection (e.g. +// OwnerID is non-empty iff the owning node is surfaced as a typed schema +// entity (today: Question or FormSection) that callers can dereference by ID. 
+// Scripts whose owner is not currently typed in the schema appear with +// OwnerPath set and OwnerID empty — at time of writing, these include // event-bearing <draw> elements, bind="none" non-AddAttachment buttons, -// <pageArea> events, individual radio options collapsed into an <exclGroup>) -// appear here with OwnerPath set and OwnerID empty. Callers that need to -// correlate orphan scripts must inspect OwnerPath directly. +// <pageArea> events, and individual radio options collapsed into an +// <exclGroup>. The set of orphan cases will shrink as more node types become +// typed (see docs/design/xfa-scope.md §2). Callers should treat OwnerID empty +// as "not currently dereferenceable" rather than a permanent classification, +// and rely on OwnerPath when they need owner-keyed addressing in either case. type FormScript struct { ID string `json:"id"` // stable: SOM owner path + "#" + event + "[" + index + "]" OwnerPath string `json:"owner_path,omitempty"` // SOM path of containing node (e.g. "form1.section.field"); empty for template-level