Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions internal/extract/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,36 @@ func TestMatchSection_Unknown(t *testing.T) {
}
}

// TestMatchSection_NakaguroVariants checks that MatchSection resolves a
// heading to the "governance" section regardless of how the heading writes
// the katakana middle dot (nakaguro, ・) between コーポレート and ガバナンス.
//
// Real EDINET filings disagree on this: docID S100VT7P (株式会社セブン&アイ・
// ホールディングス, 20th term) uses「コーポレートガバナンス」with no middle dot,
// while docID S100XS22 (日本マクドナルドホールディングス, 55th term) uses
// 「コーポレート・ガバナンス」with one. Headings that put a half-width or
// full-width space in that position are covered as well.
func TestMatchSection_NakaguroVariants(t *testing.T) {
	cases := []struct {
		heading string
		wantID  string
	}{
		{"4【コーポレート・ガバナンスの状況等】", "governance"},
		{"4【コーポレートガバナンスの状況等】", "governance"},
		{"(1)【コーポレート・ガバナンスの概要】", "governance"},
		{"(1)【コーポレートガバナンスの概要】", "governance"},
		// Half-width (ASCII, U+0020) space in place of the middle dot.
		{"4【コーポレート ガバナンスの状況等】", "governance"},
		// Full-width (ideographic, U+3000) space in place of the middle dot.
		// Written as an escape so it cannot be confused with, or silently
		// converted into, the ASCII space of the previous case.
		{"4【コーポレート\u3000ガバナンスの状況等】", "governance"},
	}
	for _, c := range cases {
		got := MatchSection(c.heading)
		if got == nil {
			t.Errorf("MatchSection(%q) = nil, want id=%q", c.heading, c.wantID)
			// Nothing further to check for this case; got.ID would panic.
			continue
		}
		if got.ID != c.wantID {
			t.Errorf("MatchSection(%q).ID = %q, want %q", c.heading, got.ID, c.wantID)
		}
	}
}

// TestExtractSections_BleedAcrossUnknownHeadings reproduces the bleed-truncated
// pattern observed in EDINET filings such as docID S100XS22 (日本マクドナルド
// HD 第55期): the "従業員の状況" section is followed by unknown headings
Expand Down
25 changes: 24 additions & 1 deletion internal/extract/sections.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,34 @@ var KnownSections = []SectionDef{
{ID: "dividends", Names: []string{"配当政策"}},
}

// headingNoiseReplacer deletes the characters that normalizeForMatch treats
// as decorative. It is built once at package initialisation rather than on
// every call; a strings.Replacer is safe for concurrent use.
var headingNoiseReplacer = strings.NewReplacer(
	"・", "", // katakana middle dot (nakaguro), U+30FB
	" ", "", // ASCII space, U+0020
	// Ideographic (full-width) space. Written as an escape because the raw
	// character is visually indistinguishable from an ASCII space.
	"\u3000", "",
	"\t", "",
	"\n", "",
	"\r", "",
)

// normalizeForMatch returns s with decorative separator characters removed so
// that section headings can be compared tolerantly.
//
// EDINET filers are inconsistent about the katakana middle dot (・) and about
// the whitespace around it. For example, one filing writes
// 「コーポレートガバナンス」 (no middle dot) while another writes
// 「コーポレート・ガバナンス」 (with middle dot) for the same chapter. Applying
// this function to both sides of a comparison makes those spellings equal.
//
// The characters removed are the katakana middle dot (U+30FB), the ASCII
// space, the ideographic full-width space (U+3000), tab, line feed and
// carriage return. All other bytes of s are returned unchanged.
func normalizeForMatch(s string) string {
	return headingNoiseReplacer.Replace(s)
}

// MatchSection returns the SectionDef matching the given heading text, or nil if none match.
// Comparison is performed on a normalized form so that minor variations in
// 中黒 (・) usage and whitespace do not cause false negatives.
func MatchSection(heading string) *SectionDef {
normHeading := normalizeForMatch(heading)
for i := range KnownSections {
for _, name := range KnownSections[i].Names {
if strings.Contains(heading, name) {
if strings.Contains(normHeading, normalizeForMatch(name)) {
return &KnownSections[i]
}
}
Expand Down
Loading