From e6f5cd4e7888d83c257ff660fa9bebcb7cd94b95 Mon Sep 17 00:00:00 2001
From: beatinaniwa <beatinaniwa@gmail.com>
Date: Sun, 17 May 2026 19:52:21 +0900
Subject: [PATCH] =?UTF-8?q?fix(extract):=20=E7=AB=A0=20marker=20=E3=81=A8?=
 =?UTF-8?q?=20sub-heading=20=E3=81=AE=E5=8C=BA=E5=88=A5=20+=20EDINET=20?=
 =?UTF-8?q?=E3=83=95=E3=83=83=E3=82=BF=E3=83=BC=E9=99=A4=E5=8E=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #8 で導入した depth-aware section flush は physical h-level (h1〜h6 タグ) のみを見ていたが、 EDINET 提出書類によっては 章見出しと sub-heading に同じ <h3> タグを使っているため、 sub-heading で section が誤 flush される。 具体例:

セコム 第64期 (S100W3TS) の risk 章:
- `<h3>３【事業等のリスク】` ← 章 marker
- `<h3>(1)事業環境に起因するリスク` ← sub-heading だが同じ <h3>
- `<h3>(2)経営戦略に関するリスク` ← 同上
- `<h3>４【経営者による財政状態...】` ← 次の章 marker (mda)

PR #8 の depth-only flush は (1)事業環境に起因するリスク で risk を flush してしまい、 risk セクションが 4,050 chars → 286 chars (intro のみ) に劇的に短縮されていた (= silent data loss)。

## 修正

### chapter vs sub-numbering を識別

`chapterHeadingPrefixRe` で 「N【...】」 「第N【...】」 形式の **章 marker** のみを検出。 「(1)」 「①」 「a.」 などの sub-numbering はマッチしない。

`walkForSections` の非マッチ heading の flush 判定を:

旧: `level > 0 && level <= state.depth` (depth のみ)
新: `level > 0 && level <= state.depth && isChapterHeading(headingText)` (depth + 章 marker)

これで、 sub-heading は同 depth でも章を flush せず sub-section として継続。 章 marker が出てきたら flush する。

### EDINET フッターアーティファクトを除去

各 `_honbun_*.htm` の末尾には `有価証券報告書（通常方式）_<14桁timestamp>` という EDINET メタ情報が書かれている。 walker はこれを最後の section の末尾に取り込んでしまうため、 governance / financial 等の末尾に metadata text が混入していた。

`filingFooterRe` で末尾の `有価証券報告書（通常方式）_<14桁の数字>` を section text から除去 (正規表現は text 末尾にアンカーしているため、 section の途中に現れる同じ文字列は除去対象外)。 各 section の Text に `stripFilingFooter` を適用。

## 検証

| docID | section | before (PR #8+#9) | after (本 PR) |
|---|---|---|---|
| S100W3TS (セコム) | risk | **286 (regression)** | **4,050 (restored)** |
| S100XTNW (楽天) | governance | 8,813 | 38,947 (footer/章 marker 整理) |
| S100XS22 (マクドナルド) | governance | 30,201 | 30,170 (footer -31 chars) |
| S100VT7P (セブン&アイ) | governance | 59,998 | 59,967 (footer -31 chars) |

全 section の text 末尾 100 chars に `有価証券報告書（通常方式）_` を検出しなくなった。

## Tests

- `TestExtractSections_SubHeadingSameHTag`: セコムパターン (同 h3 タグで章 + sub) で section が正しく分割されること
- `TestIsChapterHeading`: 章 marker と sub-numbering の判定が正しいこと
- `TestStripFilingFooter`: footer 除去ロジック単体
- `TestExtractSections_FilingFooterStripped`: section 抽出時に footer が除去されること

`go test ./...` 全件パス。
---
 internal/extract/html.go      |  69 ++++++++++++++++--
 internal/extract/html_test.go | 133 ++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 7 deletions(-)
diff --git a/internal/extract/html.go b/internal/extract/html.go
index 00d739b..7cbba46 100644
--- a/internal/extract/html.go
+++ b/internal/extract/html.go
@@ -3,11 +3,51 @@ package extract
 import (
 	"bytes"
 	"fmt"
+	"regexp"
 	"strings"
 
 	"golang.org/x/net/html"
 )
 
+// chapterHeadingPrefixRe matches headings whose leading numbering marks them
+// as a part- or chapter-level heading of an EDINET securities report, e.g.:
+//
+//	"第一部【企業情報】"
+//	"第２【事業の状況】"
+//	"２【沿革】"
+//	"５ 【従業員の状況】"
+//
+// Spacing may be ASCII or any Unicode space separator (e.g. U+3000), since
+// Go's \s is ASCII-only. Parenthesised ("(1)") or circled ("①") numbering is
+// NOT matched: filers such as Secom (S100W3TS) mark up sub-headings like
+// "(1)事業環境に起因するリスク" with the same <h3 class="smt_head2"> tag as
+// the chapter heading, so the tag level alone cannot tell them apart.
+var chapterHeadingPrefixRe = regexp.MustCompile(`^[\s\p{Zs}]*第?[\s\p{Zs}]*[０-９0-9一二三四五六七八九十百〇]+[\s\p{Zs}]*部?[\s\p{Zs}]*【`)
+
+// filingFooterRe matches the EDINET filing-title footer artifact that appears
+// at the very end of each `_honbun_*.htm` file, e.g.:
+//
+//	"有価証券報告書（通常方式）_20260326141712"
+//
+// The pattern is anchored to the end of the text and requires exactly 14
+// digits after the underscore; surrounding whitespace and an optional U+FEFF
+// before the title are consumed too. The token is metadata, not section
+// content, so it must not be kept in whichever section is open at that point.
+var filingFooterRe = regexp.MustCompile(`\s*(?:\x{FEFF})?\s*有価証券報告書（通常方式）_\d{14}\s*$`)
+
+// isChapterHeading reports whether headingText starts with the chapter-style
+// numbering recognised by chapterHeadingPrefixRe (numbering followed by 【).
+func isChapterHeading(headingText string) bool {
+	return chapterHeadingPrefixRe.MatchString(headingText)
+}
+
+// stripFilingFooter removes the trailing EDINET filing-title footer artifact
+// from a section's accumulated text and trims surrounding whitespace. Safe on
+// any string; without a footer only the whitespace trimming takes effect.
+func stripFilingFooter(s string) string {
+	return strings.TrimSpace(filingFooterRe.ReplaceAllString(s, ""))
+}
+
 // ExtractText extracts plain text from HTML files in a type=1 XBRL ZIP archive.
 // Reads all .htm files under PublicDoc/, sorted by name, and concatenates their text.
 func ExtractText(zipData []byte) (string, error) {
@@ -117,10 +157,17 @@ func extractSectionsFromNodes(nodes []*html.Node) []Section {
 
 	// Flush last section
 	if state.current != nil {
-		state.current.Text = normalizeWhitespace(currentText.String())
+		state.current.Text = stripFilingFooter(normalizeWhitespace(currentText.String()))
 		sections = append(sections, *state.current)
 	}
 
+	// Strip filing-title footer from every section's tail. It is harmless when
+	// absent and avoids contaminating downstream consumers with EDINET metadata
+	// text that legitimately appears at the bottom of each HTML file.
+	for i := range sections {
+		sections[i].Text = stripFilingFooter(sections[i].Text)
+	}
+
 	return mergeAdjacentSameIDSections(sections)
 }
 
@@ -183,12 +230,20 @@ func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState,
 			return
 		}
 
-		// Non-matching heading: if it is at the same depth as (or shallower
-		// than) the heading that opened the current section, treat it as a
-		// section boundary and flush. Deeper headings are sub-headings of the
-		// current section (e.g., "（２）役員の状況" inside a governance section
-		// anchored at h3) and should keep accumulating text.
-		if state.current != nil && level > 0 && level <= state.depth {
+		// Non-matching heading: flush only if it looks like a chapter-level
+		// EDINET 有報 heading (e.g., "４【関係会社の状況】", "第４【提出会社の
+		// 状況】") at the same depth as (or shallower than) the heading that
+		// opened the current section.
+		//
+		// Why filter by chapter-numbering pattern: some filers (e.g., セコム
+		// 第64期 S100W3TS) mark up *sub-section* headings such as
+		// "(1)事業環境に起因するリスク" with the same <h3> tag as the chapter
+		// heading "３【事業等のリスク】". Pure depth-based flushing would close
+		// the risk section at the first sub-heading and drop most of its
+		// content. Restricting the flush to chapter-numbered headings keeps
+		// sub-sections inside the open section while still closing it when a
+		// new chapter (matching or not) begins.
+		if state.current != nil && level > 0 && level <= state.depth && isChapterHeading(headingText) {
 			state.current.Text = normalizeWhitespace(text.String())
 			*sections = append(*sections, *state.current)
 			state.current = nil
diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go
index d800d58..18857cc 100644
--- a/internal/extract/html_test.go
+++ b/internal/extract/html_test.go
@@ -287,6 +287,139 @@ func TestExtractSections_SameIDNestedHeading(t *testing.T) {
 	}
 }
 
+// TestExtractSections_SubHeadingSameHTag reproduces the pattern seen in the
+// Secom 64th-period filing (S100W3TS): the chapter heading "３【事業等のリスク】"
+// and the sub-section heading "(1)事業環境に起因するリスク" are both marked up
+// with <h3 class="smt_head2"> in the source HTML. Pure depth-based flushing
+// would close risk at the first sub-heading and drop its content; the
+// chapter-vs-sub-numbering check must keep them in one risk section.
+func TestExtractSections_SubHeadingSameHTag(t *testing.T) {
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>３【事業等のリスク】</h3>
+			<p>リスクのイントロ段落です。</p>
+			<h3>(1)事業環境に起因するリスク</h3>
+			<p>事業環境リスクの本文です。</p>
+			<h3>①社会・経済</h3>
+			<p>社会・経済の説明です。</p>
+			<h3>(2)経営戦略に関するリスク</h3>
+			<p>経営戦略リスクの本文です。</p>
+			<h3>４【経営者による財政状態、経営成績及びキャッシュ・フローの状況の分析】</h3>
+			<p>MD&A の本文です。</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+
+	var risk, mda *Section
+	for i := range sections {
+		s := &sections[i]
+		if s.ID == "risk" {
+			risk = s
+		}
+		if s.ID == "mda" {
+			mda = s
+		}
+	}
+
+	if risk == nil {
+		t.Fatal("missing 'risk' section")
+	}
+	for _, want := range []string{"リスクのイントロ段落", "事業環境リスクの本文", "社会・経済の説明", "経営戦略リスクの本文"} {
+		if !strings.Contains(risk.Text, want) {
+			t.Errorf("risk.Text missing %q (text=%q)", want, risk.Text)
+		}
+	}
+	if strings.Contains(risk.Text, "MD&A の本文") {
+		t.Errorf("risk.Text bled into MD&A chapter (text=%q)", risk.Text)
+	}
+	if mda == nil {
+		t.Fatal("missing 'mda' section")
+	}
+	if !strings.Contains(mda.Text, "MD&A の本文") {
+		t.Errorf("mda.Text missing expected content (text=%q)", mda.Text)
+	}
+}
+
+// TestIsChapterHeading checks that "N【" / "第N【" prefixes count as chapter headings while "(1)", "①" and "a." numbering does not.
+func TestIsChapterHeading(t *testing.T) {
+	cases := []struct {
+		in   string
+		want bool
+	}{
+		{"４【コーポレート・ガバナンスの状況等】", true},
+		{"第２【事業の状況】", true},
+		{"２【沿革】", true},
+		{"５ 【従業員の状況】", true},
+		{"第４ 【提出会社の状況】", true},
+		{"(1)事業環境に起因するリスク", false},
+		{"（１）【コーポレート・ガバナンスの概要】", false}, // a parenthesised number before 【 is still sub-numbering
+		{"①社会・経済", false},
+		{"a. 受注実績", false},
+		{"②キャッシュ・フローの状況", false},
+	}
+	for _, c := range cases {
+		got := isChapterHeading(c.in)
+		if got != c.want {
+			t.Errorf("isChapterHeading(%q) = %v, want %v", c.in, got, c.want)
+		}
+	}
+}
+
+// TestStripFilingFooter checks footer removal with and without a trailing newline, plus footer-free and empty inputs.
+func TestStripFilingFooter(t *testing.T) {
+	cases := []struct {
+		in, want string
+	}{
+		{"ガバナンスの本文です。 有価証券報告書（通常方式）_20260326141712", "ガバナンスの本文です。"},
+		{"テキスト 有価証券報告書（通常方式）_20250523094900\n", "テキスト"},
+		{"footerなし", "footerなし"},
+		{"", ""},
+	}
+	for _, c := range cases {
+		got := stripFilingFooter(c.in)
+		if got != c.want {
+			t.Errorf("stripFilingFooter(%q) = %q, want %q", c.in, got, c.want)
+		}
+	}
+}
+
+// TestExtractSections_FilingFooterStripped feeds a document whose final <p>
+// is the filing-title footer and asserts that the governance section keeps
+// its body text but no longer contains the footer artifact.
+func TestExtractSections_FilingFooterStripped(t *testing.T) {
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>４【コーポレート・ガバナンスの状況等】</h3>
+			<p>ガバナンスの本文です。</p>
+			<p>有価証券報告書（通常方式）_20260326141712</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+	var governance *Section
+	for i := range sections {
+		if sections[i].ID == "governance" {
+			governance = &sections[i]
+		}
+	}
+	if governance == nil {
+		t.Fatal("missing 'governance' section")
+	}
+	if strings.Contains(governance.Text, "有価証券報告書（通常方式）_") {
+		t.Errorf("governance.Text still contains filing footer artifact (text=%q)", governance.Text)
+	}
+	if !strings.Contains(governance.Text, "ガバナンスの本文") {
+		t.Errorf("governance.Text missing expected content (text=%q)", governance.Text)
+	}
+}
+
 // TestMergeAdjacentSameIDSections checks the merge safety net directly.
 func TestMergeAdjacentSameIDSections(t *testing.T) {
 	in := []Section{