diff --git a/cmd/doc.go b/cmd/doc.go index 27e8faa..6c75d2e 100644 --- a/cmd/doc.go +++ b/cmd/doc.go @@ -251,14 +251,26 @@ var docTextCmd = &cobra.Command{ if err != nil { return fmt.Errorf("failed to extract sections: %w", err) } - for _, s := range sections { + // Pick the candidate with the most text. EDINET filings sometimes + // produce more than one section with the same ID (e.g., a parent + // heading and a child heading that both match KnownSections). + // Returning the first match in document order can yield an empty + // or near-empty parent slice; the longest-match policy avoids this. + var best *extract.Section + for i := range sections { + s := &sections[i] if s.ID == docTextSection || strings.Contains(s.Name, docTextSection) { - return outputResult(cmd.OutOrStdout(), map[string]string{ - "section": s.ID, - "text": s.Text, - }) + if best == nil || len(s.Text) > len(best.Text) { + best = s + } } } + if best != nil { + return outputResult(cmd.OutOrStdout(), map[string]string{ + "section": best.ID, + "text": best.Text, + }) + } // Section not found — return full text with warning _, _ = fmt.Fprintf(cmd.ErrOrStderr(), `{"warning":"section '%s' not found, returning full text"}`+"\n", docTextSection) } diff --git a/internal/extract/html.go b/internal/extract/html.go index dec583f..00d739b 100644 --- a/internal/extract/html.go +++ b/internal/extract/html.go @@ -97,25 +97,53 @@ func extractTextFromNode(n *html.Node, buf *strings.Builder) { } } +// sectionWalkState tracks the currently-open section and the heading depth +// (h-level: 1..6) at which it was opened. Depth lets us flush on non-matching +// sibling/parent headings while keeping sub-headings (deeper h-levels) as part +// of the open section. 
+type sectionWalkState struct { + current *Section + depth int // h-level (1..6) where the current section was opened; 0 = no section open +} + func extractSectionsFromNodes(nodes []*html.Node) []Section { var sections []Section - var currentSection *Section + state := &sectionWalkState{} var currentText strings.Builder for _, doc := range nodes { - walkForSections(doc, &sections, &currentSection, &currentText) + walkForSections(doc, &sections, state, &currentText) } // Flush last section - if currentSection != nil { - currentSection.Text = normalizeWhitespace(currentText.String()) - sections = append(sections, *currentSection) + if state.current != nil { + state.current.Text = normalizeWhitespace(currentText.String()) + sections = append(sections, *state.current) } - return sections + return mergeAdjacentSameIDSections(sections) +} + +// headingLevel returns the heading depth (1..6) for h1..h6 tags, or 0 otherwise. +func headingLevel(tag string) int { + switch tag { + case "h1": + return 1 + case "h2": + return 2 + case "h3": + return 3 + case "h4": + return 4 + case "h5": + return 5 + case "h6": + return 6 + } + return 0 } -func walkForSections(n *html.Node, sections *[]Section, current **Section, text *strings.Builder) { +func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState, text *strings.Builder) { if n == nil { return } @@ -128,23 +156,52 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text // Check if this is a heading element if n.Type == html.ElementNode && isHeadingElement(n.Data) { headingText := getNodeText(n) + level := headingLevel(n.Data) + if secDef := MatchSection(headingText); secDef != nil { - // Flush previous section - if *current != nil { - (*current).Text = normalizeWhitespace(text.String()) - *sections = append(*sections, **current) + // If the current section has the same ID, treat this heading as a + // sub-heading inside the same section: do not flush, do not reset + // text. 
This handles EDINET filings where a parent heading like + // "コーポレート・ガバナンスの状況等" is immediately followed by + // child headings like "コーポレート・ガバナンスの概要" that also + // match the same KnownSections entry. + if state.current != nil && state.current.ID == secDef.ID { + return } - *current = &Section{ + + // Different section: flush previous and start new. + if state.current != nil { + state.current.Text = normalizeWhitespace(text.String()) + *sections = append(*sections, *state.current) + } + state.current = &Section{ ID: secDef.ID, Name: headingText, } + state.depth = level + text.Reset() + return + } + + // Non-matching heading: if it is at the same depth as (or shallower + // than) the heading that opened the current section, treat it as a + // section boundary and flush. Deeper headings are sub-headings of the + // current section (e.g., "(2)役員の状況" inside a governance section + // anchored at h3) and should keep accumulating text. + if state.current != nil && level > 0 && level <= state.depth { + state.current.Text = normalizeWhitespace(text.String()) + *sections = append(*sections, *state.current) + state.current = nil + state.depth = 0 text.Reset() + // Return early so the heading's own text is not collected into + // any section. return } } // Collect text for current section - if n.Type == html.TextNode && *current != nil { + if n.Type == html.TextNode && state.current != nil { t := strings.TrimSpace(n.Data) if t != "" { text.WriteString(t) @@ -153,8 +210,35 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text } for c := n.FirstChild; c != nil; c = c.NextSibling { - walkForSections(c, sections, current, text) + walkForSections(c, sections, state, text) + } +} + +// mergeAdjacentSameIDSections concatenates consecutive sections that share +// the same ID. 
This is a safety net for cases where the same section opens +// twice in a row (e.g., split across HTML files) — the depth-aware walker +// already prevents most occurrences, but merging guards against edge cases. +func mergeAdjacentSameIDSections(in []Section) []Section { + if len(in) <= 1 { + return in + } + out := make([]Section, 0, len(in)) + out = append(out, in[0]) + for i := 1; i < len(in); i++ { + last := &out[len(out)-1] + if last.ID != "" && last.ID == in[i].ID { + if in[i].Text != "" { + if last.Text != "" { + last.Text += " " + in[i].Text + } else { + last.Text = in[i].Text + } + } + continue + } + out = append(out, in[i]) } + return out } func getNodeText(n *html.Node) string { diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go index 1518409..35548f0 100644 --- a/internal/extract/html_test.go +++ b/internal/extract/html_test.go @@ -148,3 +148,130 @@ func TestMatchSection_Unknown(t *testing.T) { t.Errorf("MatchSection should return nil for unknown heading, got %v", sec) } } + +// TestExtractSections_BleedAcrossUnknownHeadings reproduces the bleed-truncated +// pattern observed in EDINET filings such as docID S100XS22 (日本マクドナルド +// HD 第55期): the "従業員の状況" section is followed by unknown headings +// (関係会社の状況, 第2【事業の状況】, 経営方針, サステナビリティ) before the +// next recognised heading (事業等のリスク). The depth-aware walker should flush +// the employees section at the next heading whose h-level is the same as the +// opening heading of employees, keeping employees content from bleeding into +// the following chapters. +func TestExtractSections_BleedAcrossUnknownHeadings(t *testing.T) { + data := createTestZip(t, map[string]string{ + "PublicDoc/main.htm": ` +

5【従業員の状況】

+

従業員数は2,454名です。

+

4【関係会社の状況】

+

関係会社の説明です。

+

第2【事業の状況】

+

1【経営方針、経営環境及び対処すべき課題等】

+

経営方針の説明です。

+

2【サステナビリティに関する考え方及び取組】

+

サステナビリティの説明です。

+

3【事業等のリスク】

+

リスクの説明です。

+ `, }) + + sections, err := ExtractSections(data) + if err != nil { + t.Fatalf("ExtractSections() error = %v", err) + } + + var employees, risk *Section + for i := range sections { + s := &sections[i] + if s.ID == "employees" { + employees = s + } + if s.ID == "risk" { + risk = s + } + } + + if employees == nil { + t.Fatal("missing 'employees' section") + } + if !strings.Contains(employees.Text, "従業員数は2,454名") { + t.Errorf("employees.Text = %q, missing expected content", employees.Text) + } + for _, leak := range []string{"関係会社の説明", "経営方針の説明", "サステナビリティの説明", "リスクの説明"} { + if strings.Contains(employees.Text, leak) { + t.Errorf("employees.Text bled into other chapter (found %q)", leak) + } + } + if risk == nil { + t.Fatal("missing 'risk' section") + } + if !strings.Contains(risk.Text, "リスクの説明") { + t.Errorf("risk.Text = %q, missing expected content", risk.Text) + } +} + +// TestExtractSections_SameIDNestedHeading reproduces the empty-section pattern +// observed for governance: a parent heading "コーポレート・ガバナンスの状況等" +// (h3) is immediately followed by a child heading "コーポレート・ガバナンスの +// 概要" (h4) — both match the governance KnownSections entry. The previous +// implementation flushed and reset the section on the second match, leaving +// the parent section empty. With the same-ID continuation rule, the deeper +// heading is treated as a sub-heading inside the open governance section. +func TestExtractSections_SameIDNestedHeading(t *testing.T) { + data := createTestZip(t, map[string]string{ + "PublicDoc/main.htm": ` +

4【コーポレート・ガバナンスの状況等】

+

(1)【コーポレート・ガバナンスの概要】

+

ガバナンスの概要本文です。

+

(2)【役員の状況】

+

役員の状況本文です。

+

(3)【監査の状況】

+

監査の状況本文です。

+

5【提出会社の株式事務の概要】

+

株式事務の概要本文です。

+ `, }) + + sections, err := ExtractSections(data) + if err != nil { + t.Fatalf("ExtractSections() error = %v", err) + } + + var governance *Section + for i := range sections { + s := &sections[i] + if s.ID == "governance" { + governance = s + } + } + + if governance == nil { + t.Fatal("missing 'governance' section") + } + for _, want := range []string{"ガバナンスの概要本文", "役員の状況本文", "監査の状況本文"} { + if !strings.Contains(governance.Text, want) { + t.Errorf("governance.Text missing expected content %q (text=%q)", want, governance.Text) + } + } + if strings.Contains(governance.Text, "株式事務の概要本文") { + t.Errorf("governance.Text bled into next chapter (株式事務の概要)") + } +} + +// TestMergeAdjacentSameIDSections checks the merge safety net directly. +func TestMergeAdjacentSameIDSections(t *testing.T) { + in := []Section{ + {ID: "governance", Name: "コーポレート・ガバナンスの状況等", Text: ""}, + {ID: "governance", Name: "コーポレート・ガバナンスの概要", Text: "ガバナンス本文"}, + {ID: "financial", Name: "連結財務諸表", Text: "財務諸表本文"}, + } + out := mergeAdjacentSameIDSections(in) + if len(out) != 2 { + t.Fatalf("len = %d, want 2 (governance merged + financial)", len(out)) + } + if out[0].ID != "governance" { + t.Errorf("out[0].ID = %q, want governance", out[0].ID) + } + if !strings.Contains(out[0].Text, "ガバナンス本文") { + t.Errorf("merged governance.Text = %q, missing content", out[0].Text) + } +}