diff --git a/internal/extract/html.go b/internal/extract/html.go
index 00d739b..7cbba46 100644
--- a/internal/extract/html.go
+++ b/internal/extract/html.go
@@ -3,11 +3,51 @@ package extract
import (
"bytes"
"fmt"
+ "regexp"
"strings"
"golang.org/x/net/html"
)
+// chapterHeadingPrefixRe matches headings whose leading numbering identifies
+// them as a top-level / chapter-level marker of an EDINET annual securities
+// report, e.g.:
+//
+//	"4【コーポレート・ガバナンスの状況等】"
+//	"第2【事業の状況】"
+//	"2【沿革】"
+//	"5 【従業員の状況】"
+//
+// The number may be written with ASCII digits, fullwidth digits
+// (U+FF10-U+FF19) or kanji numerals, and may be surrounded by ASCII
+// whitespace or the ideographic space (U+3000); Go's \s alone only covers
+// ASCII whitespace. Non-ASCII digits and spaces are spelled as \x{...}
+// escapes so the pattern survives width normalization of this source file.
+//
+// It deliberately does NOT match parenthesised sub-numbering like "(1)" or
+// circled numbers like "①", which denote sub-sections inside a chapter. Some
+// filers (e.g. SECOM's 64th-term report, S100W3TS) mark up a sub-section such
+// as "(1)事業環境に起因するリスク" inside "事業等のリスク" with the same
+// heading tag as the chapter heading itself, so the tag alone cannot tell the
+// two apart.
+var chapterHeadingPrefixRe = regexp.MustCompile(`^[\s\x{3000}]*第?[\s\x{3000}]*[0-9\x{FF10}-\x{FF19}一二三四五六七八九十百〇]+[\s\x{3000}]*【`)
+
+// filingFooterRe matches the EDINET filing-title footer artifact that appears
+// at the very end of each `_honbun_*.htm` file, e.g.:
+//
+//	"有価証券報告書(通常方式)_20260326141712"
+//
+// This token is metadata, not section content. Stripping it prevents the
+// footer from being appended to whichever section happens to be open at the
+// end of the document (typically the last `KnownSections` match such as
+// governance or financial).
+//
+// The parentheses are matched through character classes on purpose: a bare
+// "(...)" in the pattern would be a capture group and would never match the
+// literal brackets. Both ASCII and fullwidth (U+FF08 / U+FF09) parentheses
+// are accepted. An optional BOM (U+FEFF) and surrounding ASCII whitespace are
+// consumed together with the footer.
+var filingFooterRe = regexp.MustCompile(`\s*(?:\x{FEFF})?\s*有価証券報告書[(\x{FF08}]通常方式[)\x{FF09}]_\d{14}\s*$`)
+
+// isChapterHeading returns true when headingText begins with chapter-level
+// numbering followed by an opening 【 bracket, as decided by
+// chapterHeadingPrefixRe, and false for any other heading text.
+func isChapterHeading(headingText string) bool {
+	matched := chapterHeadingPrefixRe.MatchString(headingText)
+	return matched
+}
+
+// stripFilingFooter deletes every match of filingFooterRe from s and then
+// trims leading and trailing whitespace from what remains. It is safe to call
+// on any string; note that the whitespace trim is applied whether or not a
+// footer was present.
+func stripFilingFooter(s string) string {
+	withoutFooter := filingFooterRe.ReplaceAllString(s, "")
+	return strings.TrimSpace(withoutFooter)
+}
+
// ExtractText extracts plain text from HTML files in a type=1 XBRL ZIP archive.
// Reads all .htm files under PublicDoc/, sorted by name, and concatenates their text.
func ExtractText(zipData []byte) (string, error) {
@@ -117,10 +157,17 @@ func extractSectionsFromNodes(nodes []*html.Node) []Section {
// Flush last section
if state.current != nil {
- state.current.Text = normalizeWhitespace(currentText.String())
+ state.current.Text = stripFilingFooter(normalizeWhitespace(currentText.String()))
sections = append(sections, *state.current)
}
+ // Strip filing-title footer from every section's tail. It is harmless when
+ // absent and avoids contaminating downstream consumers with EDINET metadata
+ // text that legitimately appears at the bottom of each HTML file.
+ for i := range sections {
+ sections[i].Text = stripFilingFooter(sections[i].Text)
+ }
+
return mergeAdjacentSameIDSections(sections)
}
@@ -183,12 +230,20 @@ func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState,
return
}
- // Non-matching heading: if it is at the same depth as (or shallower
- // than) the heading that opened the current section, treat it as a
- // section boundary and flush. Deeper headings are sub-headings of the
- // current section (e.g., "(2)役員の状況" inside a governance section
- // anchored at h3) and should keep accumulating text.
- if state.current != nil && level > 0 && level <= state.depth {
+ // Non-matching heading: flush only if it looks like a chapter-level
+ // EDINET 有報 heading (e.g., "4【関係会社の状況】", "第4【提出会社の
+ // 状況】") at the same depth as (or shallower than) the heading that
+ // opened the current section.
+ //
+ // Why filter by chapter-numbering pattern: some filers (e.g., セコム
+ // 第64期 S100W3TS) mark up *sub-section* headings such as
+ // "(1)事業環境に起因するリスク" with the same tag as the chapter
+ // heading "3【事業等のリスク】". Pure depth-based flushing would close
+ // the risk section at the first sub-heading and drop most of its
+ // content. Restricting the flush to chapter-numbered headings keeps
+ // sub-sections inside the open section while still closing it when a
+ // new chapter (matching or not) begins.
+ if state.current != nil && level > 0 && level <= state.depth && isChapterHeading(headingText) {
state.current.Text = normalizeWhitespace(text.String())
*sections = append(*sections, *state.current)
state.current = nil
diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go
index d800d58..18857cc 100644
--- a/internal/extract/html_test.go
+++ b/internal/extract/html_test.go
@@ -287,6 +287,139 @@ func TestExtractSections_SameIDNestedHeading(t *testing.T) {
}
}
+// TestExtractSections_SubHeadingSameHTag reproduces the pattern seen in
+// SECOM's 64th-term filing (S100W3TS): the chapter heading
+// "3【事業等のリスク】" and the sub-section heading
+// "(1)事業環境に起因するリスク" are marked up with the same heading tag in
+// the source HTML. Pure depth-based flushing would close the risk section at
+// the first sub-heading and drop its content; the chapter-vs-sub-numbering
+// check must keep them together.
+func TestExtractSections_SubHeadingSameHTag(t *testing.T) {
+	// Chapter headings and "(n)" sub-headings deliberately share <h3>; the
+	// circled-number heading sits one level deeper.
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>3【事業等のリスク】</h3>
+			<p>リスクのイントロ段落です。</p>
+			<h3>(1)事業環境に起因するリスク</h3>
+			<p>事業環境リスクの本文です。</p>
+			<h4>①社会・経済</h4>
+			<p>社会・経済の説明です。</p>
+			<h3>(2)経営戦略に関するリスク</h3>
+			<p>経営戦略リスクの本文です。</p>
+			<h3>4【経営者による財政状態、経営成績及びキャッシュ・フローの状況の分析】</h3>
+			<p>MD&amp;A の本文です。</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+
+	// Point into the slice (not at a loop copy) so the sections stay
+	// addressable after the loop.
+	var risk, mda *Section
+	for i := range sections {
+		s := &sections[i]
+		if s.ID == "risk" {
+			risk = s
+		}
+		if s.ID == "mda" {
+			mda = s
+		}
+	}
+
+	if risk == nil {
+		t.Fatal("missing 'risk' section")
+	}
+	// Every paragraph under the sub-headings must still belong to risk.
+	for _, want := range []string{"リスクのイントロ段落", "事業環境リスクの本文", "社会・経済の説明", "経営戦略リスクの本文"} {
+		if !strings.Contains(risk.Text, want) {
+			t.Errorf("risk.Text missing %q (text=%q)", want, risk.Text)
+		}
+	}
+	// The next chapter heading must close the risk section.
+	if strings.Contains(risk.Text, "MD&A の本文") {
+		t.Errorf("risk.Text bled into MD&A chapter (text=%q)", risk.Text)
+	}
+	if mda == nil {
+		t.Fatal("missing 'mda' section")
+	}
+	if !strings.Contains(mda.Text, "MD&A の本文") {
+		t.Errorf("mda.Text missing expected content (text=%q)", mda.Text)
+	}
+}
+
+// TestIsChapterHeading exercises isChapterHeading with chapter-numbered
+// headings (expected true) and sub-section style headings (expected false).
+func TestIsChapterHeading(t *testing.T) {
+	type testCase struct {
+		heading string
+		chapter bool
+	}
+	table := []testCase{
+		{heading: "4【コーポレート・ガバナンスの状況等】", chapter: true},
+		{heading: "第2【事業の状況】", chapter: true},
+		{heading: "2【沿革】", chapter: true},
+		{heading: "5 【従業員の状況】", chapter: true},
+		{heading: "第4 【提出会社の状況】", chapter: true},
+		{heading: "(1)事業環境に起因するリスク", chapter: false},
+		{heading: "(1)【コーポレート・ガバナンスの概要】", chapter: false},
+		{heading: "①社会・経済", chapter: false},
+		{heading: "a. 受注実績", chapter: false},
+		{heading: "②キャッシュ・フローの状況", chapter: false},
+	}
+	for _, tc := range table {
+		if got := isChapterHeading(tc.heading); got != tc.chapter {
+			t.Errorf("isChapterHeading(%q) = %v, want %v", tc.heading, got, tc.chapter)
+		}
+	}
+}
+
+// TestStripFilingFooter verifies that stripFilingFooter drops the EDINET
+// filing-title footer and leaves footer-free and empty input as is.
+func TestStripFilingFooter(t *testing.T) {
+	type testCase struct {
+		input    string
+		expected string
+	}
+	table := []testCase{
+		{input: "ガバナンスの本文です。 有価証券報告書(通常方式)_20260326141712", expected: "ガバナンスの本文です。"},
+		{input: "テキスト 有価証券報告書(通常方式)_20250523094900\n", expected: "テキスト"},
+		{input: "footerなし", expected: "footerなし"},
+		{input: "", expected: ""},
+	}
+	for _, tc := range table {
+		if got := stripFilingFooter(tc.input); got != tc.expected {
+			t.Errorf("stripFilingFooter(%q) = %q, want %q", tc.input, got, tc.expected)
+		}
+	}
+}
+
+// TestExtractSections_FilingFooterStripped confirms that the filing-title
+// footer artifact at the end of an HTML file is not retained in the last
+// section's body text.
+func TestExtractSections_FilingFooterStripped(t *testing.T) {
+	// The footer is the last element of the document, so it lands in the
+	// section that is still open when the walk finishes.
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>4【コーポレート・ガバナンスの状況等】</h3>
+			<p>ガバナンスの本文です。</p>
+			<p>有価証券報告書(通常方式)_20260326141712</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+	// Point into the slice (not at a loop copy) so the section stays
+	// addressable after the loop.
+	var governance *Section
+	for i := range sections {
+		if sections[i].ID == "governance" {
+			governance = &sections[i]
+		}
+	}
+	if governance == nil {
+		t.Fatal("missing 'governance' section")
+	}
+	if strings.Contains(governance.Text, "有価証券報告書(通常方式)_") {
+		t.Errorf("governance.Text still contains filing footer artifact (text=%q)", governance.Text)
+	}
+	if !strings.Contains(governance.Text, "ガバナンスの本文") {
+		t.Errorf("governance.Text missing expected content (text=%q)", governance.Text)
+	}
+}
+
// TestMergeAdjacentSameIDSections checks the merge safety net directly.
func TestMergeAdjacentSameIDSections(t *testing.T) {
in := []Section{