beatinaniwa · beatinaniwa · May 17, 2026 · May 17, 2026
diff --git a/internal/extract/html.go b/internal/extract/html.go
@@ -3,11 +3,51 @@ package extract
 import (
 	"bytes"
 	"fmt"
+	"regexp"
 	"strings"
 
 	"golang.org/x/net/html"
 )
 
+// chapterHeadingPrefixRe matches headings whose leading numbering marks them
+// as a chapter-level EDINET securities-report (有報) heading, e.g.:
+//
+//	"４【コーポレート・ガバナンスの状況等】"
+//	"第２【事業の状況】"
+//	"５ 【従業員の状況】"
+//	"５　【従業員の状況】" (ideographic space, U+3000)
+//
+// Go's \s is ASCII-only, so \p{Zs} is added to also accept U+3000 and NBSP
+// around the numbering. Parenthesised sub-numbering like "(1)" and circled
+// numbers like "①" deliberately do NOT match: some filers (e.g. セコム 第64期,
+// S100W3TS) mark up the sub-section "(1)事業環境に起因するリスク" with the same
+// <h3 class="smt_head2"> tag as its chapter heading "事業等のリスク".
+var chapterHeadingPrefixRe = regexp.MustCompile(`^[\s\p{Zs}]*第?[\s\p{Zs}]*[０-９0-9一二三四五六七八九十百〇]+[\s\p{Zs}]*【`)
+
+// filingFooterRe matches the EDINET filing-title footer artifact found at the
+// end of `_honbun_*.htm` files, e.g.:
+//
+//	"有価証券報告書（通常方式）_20260326141712"
+//
+// The pattern is anchored with `$`, so only an occurrence at the very end of
+// the input matches, together with any preceding whitespace and an optional
+// BOM (U+FEFF). Only the "通常方式" title with a 14-digit suffix is covered.
+// The token is metadata, not section content, so callers strip it.
+var filingFooterRe = regexp.MustCompile(`\s*(?:\x{FEFF})?\s*有価証券報告書（通常方式）_\d{14}\s*$`)
+
+// isChapterHeading reports whether headingText begins with chapter-level
+// numbering followed by 【, as recognised by chapterHeadingPrefixRe.
+func isChapterHeading(headingText string) bool {
+	return chapterHeadingPrefixRe.FindStringIndex(headingText) != nil
+}
+
+// stripFilingFooter deletes a trailing EDINET filing-title footer from s and
+// trims surrounding whitespace; the trim is applied even without a footer.
+func stripFilingFooter(s string) string {
+	withoutFooter := filingFooterRe.ReplaceAllLiteralString(s, "")
+	return strings.TrimSpace(withoutFooter)
+}
+
 // ExtractText extracts plain text from HTML files in a type=1 XBRL ZIP archive.
 // Reads all .htm files under PublicDoc/, sorted by name, and concatenates their text.
 func ExtractText(zipData []byte) (string, error) {
@@ -117,10 +157,17 @@ func extractSectionsFromNodes(nodes []*html.Node) []Section {
 
 	// Flush last section
 	if state.current != nil {
-		state.current.Text = normalizeWhitespace(currentText.String())
+		state.current.Text = stripFilingFooter(normalizeWhitespace(currentText.String()))
 		sections = append(sections, *state.current)
 	}
 
+	// Strip filing-title footer from every section's tail. It is harmless when
+	// absent and avoids contaminating downstream consumers with EDINET metadata
+	// text that legitimately appears at the bottom of each HTML file.
+	for i := range sections {
+		sections[i].Text = stripFilingFooter(sections[i].Text)
+	}
+
 	return mergeAdjacentSameIDSections(sections)
 }
 
@@ -183,12 +230,20 @@ func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState,
 			return
 		}
 
-		// Non-matching heading: if it is at the same depth as (or shallower
-		// than) the heading that opened the current section, treat it as a
-		// section boundary and flush. Deeper headings are sub-headings of the
-		// current section (e.g., "（２）役員の状況" inside a governance section
-		// anchored at h3) and should keep accumulating text.
-		if state.current != nil && level > 0 && level <= state.depth {
+		// Non-matching heading: flush only if it looks like a chapter-level
+		// EDINET 有報 heading (e.g., "４【関係会社の状況】", "第４【提出会社の
+		// 状況】") at the same depth as (or shallower than) the heading that
+		// opened the current section.
+		//
+		// Why filter by chapter-numbering pattern: some filers (e.g., セコム
+		// 第64期 S100W3TS) mark up *sub-section* headings such as
+		// "(1)事業環境に起因するリスク" with the same <h3> tag as the chapter
+		// heading "３【事業等のリスク】". Pure depth-based flushing would close
+		// the risk section at the first sub-heading and drop most of its
+		// content. Restricting the flush to chapter-numbered headings keeps
+		// sub-sections inside the open section while still closing it when a
+		// new chapter (matching or not) begins.
+		if state.current != nil && level > 0 && level <= state.depth && isChapterHeading(headingText) {
 			state.current.Text = normalizeWhitespace(text.String())
 			*sections = append(*sections, *state.current)
 			state.current = nil

diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go
@@ -287,6 +287,139 @@ func TestExtractSections_SameIDNestedHeading(t *testing.T) {
 	}
 }
 
+// TestExtractSections_SubHeadingSameHTag covers the セコム 第64期 (S100W3TS)
+// markup, where the chapter heading "３【事業等のリスク】" and sub-section
+// headings such as "(1)事業環境に起因するリスク" all use <h3> in the source
+// HTML. The risk section has to keep the paragraphs under those sub-headings
+// (a purely depth-based flush would drop them) and must still end at the next
+// chapter-numbered heading.
+func TestExtractSections_SubHeadingSameHTag(t *testing.T) {
+	zipBytes := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>３【事業等のリスク】</h3>
+			<p>リスクのイントロ段落です。</p>
+			<h3>(1)事業環境に起因するリスク</h3>
+			<p>事業環境リスクの本文です。</p>
+			<h3>①社会・経済</h3>
+			<p>社会・経済の説明です。</p>
+			<h3>(2)経営戦略に関するリスク</h3>
+			<p>経営戦略リスクの本文です。</p>
+			<h3>４【経営者による財政状態、経営成績及びキャッシュ・フローの状況の分析】</h3>
+			<p>MD&A の本文です。</p>
+		</body></html>`,
+	})
+
+	got, err := ExtractSections(zipBytes)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+
+	// Keep the last section seen for each ID of interest.
+	var riskSec, mdaSec *Section
+	for idx := range got {
+		switch sec := &got[idx]; sec.ID {
+		case "risk":
+			riskSec = sec
+		case "mda":
+			mdaSec = sec
+		}
+	}
+
+	if riskSec == nil {
+		t.Fatal("missing 'risk' section")
+	}
+	for _, snippet := range []string{"リスクのイントロ段落", "事業環境リスクの本文", "社会・経済の説明", "経営戦略リスクの本文"} {
+		if !strings.Contains(riskSec.Text, snippet) {
+			t.Errorf("risk.Text missing %q (text=%q)", snippet, riskSec.Text)
+		}
+	}
+	if strings.Contains(riskSec.Text, "MD&A の本文") {
+		t.Errorf("risk.Text bled into MD&A chapter (text=%q)", riskSec.Text)
+	}
+	if mdaSec == nil {
+		t.Fatal("missing 'mda' section")
+	}
+	if !strings.Contains(mdaSec.Text, "MD&A の本文") {
+		t.Errorf("mda.Text missing expected content (text=%q)", mdaSec.Text)
+	}
+}
+
+// TestIsChapterHeading table-tests isChapterHeading: chapter numbering in
+// front of 【 is accepted, while sub-section numbering styles are rejected.
+func TestIsChapterHeading(t *testing.T) {
+	tests := []struct {
+		heading string
+		want    bool
+	}{
+		{"４【コーポレート・ガバナンスの状況等】", true},
+		{"第２【事業の状況】", true},
+		{"２【沿革】", true},
+		{"５ 【従業員の状況】", true},
+		{"第４ 【提出会社の状況】", true},
+		{"(1)事業環境に起因するリスク", false},
+		{"（１）【コーポレート・ガバナンスの概要】", false},
+		{"①社会・経済", false},
+		{"a. 受注実績", false},
+		{"②キャッシュ・フローの状況", false},
+	}
+	for _, tc := range tests {
+		if got := isChapterHeading(tc.heading); got != tc.want {
+			t.Errorf("isChapterHeading(%q) = %v, want %v", tc.heading, got, tc.want)
+		}
+	}
+}
+
+// TestStripFilingFooter table-tests stripFilingFooter on inputs with a
+// trailing EDINET filing-title footer, without one, and on the empty string.
+func TestStripFilingFooter(t *testing.T) {
+	tests := []struct {
+		input, want string
+	}{
+		{"ガバナンスの本文です。 有価証券報告書（通常方式）_20260326141712", "ガバナンスの本文です。"},
+		{"テキスト 有価証券報告書（通常方式）_20250523094900\n", "テキスト"},
+		{"footerなし", "footerなし"},
+		{"", ""},
+	}
+	for _, tc := range tests {
+		if got := stripFilingFooter(tc.input); got != tc.want {
+			t.Errorf("stripFilingFooter(%q) = %q, want %q", tc.input, got, tc.want)
+		}
+	}
+}
+
+// TestExtractSections_FilingFooterStripped verifies end to end that a
+// filing-title footer paragraph at the end of an HTML file is absent from the
+// governance section's text while the real body text is kept.
+func TestExtractSections_FilingFooterStripped(t *testing.T) {
+	zipBytes := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>４【コーポレート・ガバナンスの状況等】</h3>
+			<p>ガバナンスの本文です。</p>
+			<p>有価証券報告書（通常方式）_20260326141712</p>
+		</body></html>`,
+	})
+
+	got, err := ExtractSections(zipBytes)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+	var gov *Section
+	for idx := range got {
+		if sec := &got[idx]; sec.ID == "governance" {
+			gov = sec
+		}
+	}
+	if gov == nil {
+		t.Fatal("missing 'governance' section")
+	}
+	if strings.Contains(gov.Text, "有価証券報告書（通常方式）_") {
+		t.Errorf("governance.Text still contains filing footer artifact (text=%q)", gov.Text)
+	}
+	if !strings.Contains(gov.Text, "ガバナンスの本文") {
+		t.Errorf("governance.Text missing expected content (text=%q)", gov.Text)
+	}
+}
+
 // TestMergeAdjacentSameIDSections checks the merge safety net directly.
 func TestMergeAdjacentSameIDSections(t *testing.T) {
 	in := []Section{