From 447a1ae7a872962505566cc0c1501c4959ea7694 Mon Sep 17 00:00:00 2001
From: beatinaniwa <beatinaniwa@gmail.com>
Date: Tue, 12 May 2026 11:43:34 +0900
Subject: [PATCH] =?UTF-8?q?fix(extract):=20=E3=82=BB=E3=82=AF=E3=82=B7?=
 =?UTF-8?q?=E3=83=A7=E3=83=B3=E5=90=8D=E3=83=9E=E3=83=83=E3=83=81=E3=81=A7?=
 =?UTF-8?q?=20=E4=B8=AD=E9=BB=92=20(=E3=83=BB)=20=E3=81=A8=E7=A9=BA?=
 =?UTF-8?q?=E7=99=BD=E3=82=92=E6=AD=A3=E8=A6=8F=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EDINET 提出企業によって、 章見出しでの 「中黒 ・」 の有無や全角/半角の空白の差し込み方が揺れている。 例:

- 日本マクドナルドホールディングス 第55期 (S100XS22): 「コーポレート・ガバナンスの状況等」 (中黒あり)
- 株式会社セブン＆アイ・ホールディングス 第20期 (S100VT7P): 「コーポレートガバナンスの状況等」 (中黒なし)

旧実装 (`strings.Contains(heading, name)` ベース) は文字一致なので、 KnownSections の Names が中黒入りで定義されている場合 (governance)、 中黒なしの 提出書類の見出しに一致しなかった。 結果として:

- セブン＆アイ governance: cli は section 未検出として warning + full doc を返す
- ingest_edinet.py 側は full doc を section テキストとして保存、 BLEED_MARKER_PATTERN がほぼ任意の位置で truncate → 表紙ページから始まる 19,526 chars の無意味な内容で governance.json が生成されていた (見かけは bleed_truncated)

## 修正

`MatchSection` の比較を、 中黒 (・、 U+30FB) + 半角/全角空白 + tab + 改行 を除いた正規形で行うように変更。

## 検証

S100VT7P (セブン＆アイ HD 第20期) governance:

- before: section 未検出 → warning + full doc 282,362 chars (内容: 表紙ページ〜)
- after: section=governance 59,998 chars (内容: 「①コーポレートガバナンスに関する基本的な考え方」〜)

既存テスト全件パス。 新規テスト `TestMatchSection_NakaguroVariants` で 中黒あり/なし + 半角/全角空白の 6 パターンを担保。

## 影響範囲

- 中黒揺れがある heading を含む EDINET 有報の section 抽出が改善
- 同様の正規化が今後追加される他の section にも自動的に適用される
---
 internal/extract/html_test.go | 30 ++++++++++++++++++++++++++++++
 internal/extract/sections.go  | 25 ++++++++++++++++++++++++-
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go
index 35548f0..d800d58 100644
--- a/internal/extract/html_test.go
+++ b/internal/extract/html_test.go
@@ -149,6 +149,36 @@ func TestMatchSection_Unknown(t *testing.T) {
 	}
 }
 
+// TestMatchSection_NakaguroVariants checks that MatchSection resolves each
+// heading below to the "governance" section whether the words are joined by
+// a middle dot (中黒, ・), nothing, an ASCII space, or a full-width space.
+// Real EDINET filings show both spellings: 株式会社セブン＆アイ・ホールディングス
+// 第20期 (S100VT7P) omits the dot in 「コーポレートガバナンス」, while
+// 日本マクドナルドホールディングス 第55期 (S100XS22) writes 「コーポレート・ガバナンス」.
+func TestMatchSection_NakaguroVariants(t *testing.T) {
+	cases := []struct {
+		heading string
+		wantID  string
+	}{
+		{"４【コーポレート・ガバナンスの状況等】", "governance"},
+		{"４【コーポレートガバナンスの状況等】", "governance"},
+		{"（１）【コーポレート・ガバナンスの概要】", "governance"},
+		{"（１）【コーポレートガバナンスの概要】", "governance"},
+		{"４【コーポレート ガバナンスの状況等】", "governance"}, // ASCII space (U+0020)
+		{"４【コーポレート　ガバナンスの状況等】", "governance"}, // full-width space (U+3000)
+	}
+	for _, c := range cases {
+		got := MatchSection(c.heading)
+		if got == nil {
+			t.Errorf("MatchSection(%q) = nil, want id=%q", c.heading, c.wantID)
+			continue
+		}
+		if got.ID != c.wantID {
+			t.Errorf("MatchSection(%q).ID = %q, want %q", c.heading, got.ID, c.wantID)
+		}
+	}
+}
+
 // TestExtractSections_BleedAcrossUnknownHeadings reproduces the bleed-truncated
 // pattern observed in EDINET filings such as docID S100XS22 (日本マクドナルド
 // HD 第55期): the "従業員の状況" section is followed by unknown headings
diff --git a/internal/extract/sections.go b/internal/extract/sections.go
index a6c2df9..b9a1ca8 100644
--- a/internal/extract/sections.go
+++ b/internal/extract/sections.go
@@ -29,11 +29,34 @@ var KnownSections = []SectionDef{
 	{ID: "dividends", Names: []string{"配当政策"}},
 }
 
+// normalizeForMatch normalizes a string for tolerant heading comparison.
+// EDINET filers differ in whether headings contain the middle dot (・) and
+// in surrounding whitespace, so strings are normalized before matching.
+// Example: 株式会社セブン＆アイ・ホールディングス writes 「コーポレートガバナンス」 (no dot);
+// 日本マクドナルドホールディングス writes 「コーポレート・ガバナンス」 (with dot) for the same chapter.
+func normalizeForMatch(s string) string {
+	// Strip the katakana middle dot (・, U+30FB), ASCII space, full-width
+	// space (U+3000), tab, LF and CR so headings that differ only in these
+	// decorative characters match the same KnownSections entry.
+	r := strings.NewReplacer(
+		"・", "",
+		" ", "",
+		"　", "",
+		"\t", "",
+		"\n", "",
+		"\r", "",
+	)
+	return r.Replace(s)
+}
+
 // MatchSection returns the SectionDef matching the given heading text, or nil if none match.
+// Comparison is performed on a normalized form so that minor variations in
+// 中黒 (・) usage and whitespace do not cause false negatives.
 func MatchSection(heading string) *SectionDef {
+	normHeading := normalizeForMatch(heading)
 	for i := range KnownSections {
 		for _, name := range KnownSections[i].Names {
-			if strings.Contains(heading, name) {
+			if strings.Contains(normHeading, normalizeForMatch(name)) {
 				return &KnownSections[i]
 			}
 		}