Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 62 additions & 7 deletions internal/extract/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,51 @@ package extract
import (
"bytes"
"fmt"
"regexp"
"strings"

"golang.org/x/net/html"
)

// chapterHeadingPrefixRe matches headings whose leading numbering identifies
// them as a top-level / chapter-level heading of an EDINET annual securities
// report, e.g.:
//
//	"4【コーポレート・ガバナンスの状況等】"
//	"第2【事業の状況】"
//	"2【沿革】"
//	"5 【従業員の状況】"
//
// The number may be written with ASCII digits, full-width digits
// (U+FF10..U+FF19) or kanji numerals. Optional whitespace around the number
// may be ASCII whitespace or the ideographic space U+3000; Go's `\s` is
// ASCII-only, so U+3000 is listed explicitly. The non-ASCII digits and the
// ideographic space are spelled as \x{...} escapes so that the pattern
// survives tools that normalise full-width characters to their ASCII forms.
//
// It deliberately does NOT match parenthesised sub-numbering like "(1)" or
// circled numbers like "①", which are used for sub-sections inside a chapter
// (e.g., in Secom's 64th-term filing (S100W3TS), "(1)事業環境に起因するリスク"
// lives inside "事業等のリスク" but is marked up with the same
// <h3 class="smt_head2"> tag as the chapter heading itself).
var chapterHeadingPrefixRe = regexp.MustCompile(`^[\s\x{3000}]*第?[\s\x{3000}]*[0-9\x{FF10}-\x{FF19}一二三四五六七八九十百〇]+[\s\x{3000}]*【`)

// filingFooterRe matches the EDINET filing-title footer artifact that appears
// at the very end of each `_honbun_*.htm` file, e.g.:
//
//	"有価証券報告書(通常方式)_20260326141712"
//
// The parentheses around "通常方式" are matched literally, in either their
// ASCII form or their full-width form (U+FF08 / U+FF09). They are written as
// character classes because a bare "(...)" in a regular expression is a
// capture group, which would make the pattern match the title only when the
// parentheses are missing. An optional byte-order mark (U+FEFF) and
// surrounding ASCII whitespace are consumed together with the footer, and the
// match is anchored to the end of the text.
//
// This token is metadata, not section content. Stripping it prevents the
// footer from being appended to whichever section happens to be open at the
// end of the document (typically the last `KnownSections` match such as
// governance or financial).
var filingFooterRe = regexp.MustCompile(`\s*(?:\x{FEFF})?\s*有価証券報告書[(\x{FF08}]通常方式[)\x{FF09}]_\d{14}\s*$`)

// isChapterHeading tells the caller whether headingText begins with
// chapter-level numbering followed by an opening "【", as recognised by
// chapterHeadingPrefixRe.
func isChapterHeading(headingText string) bool {
	matched := chapterHeadingPrefixRe.MatchString(headingText)
	return matched
}

// stripFilingFooter drops the trailing EDINET filing-title footer artifact
// (as matched by filingFooterRe) from a section's accumulated text and trims
// surrounding whitespace from the result. It is safe to call on any string;
// note that the whitespace trim is applied even when no footer is present.
func stripFilingFooter(s string) string {
	withoutFooter := filingFooterRe.ReplaceAllString(s, "")
	return strings.TrimSpace(withoutFooter)
}

// ExtractText extracts plain text from HTML files in a type=1 XBRL ZIP archive.
// Reads all .htm files under PublicDoc/, sorted by name, and concatenates their text.
func ExtractText(zipData []byte) (string, error) {
Expand Down Expand Up @@ -117,10 +157,17 @@ func extractSectionsFromNodes(nodes []*html.Node) []Section {

// Flush last section
if state.current != nil {
state.current.Text = normalizeWhitespace(currentText.String())
state.current.Text = stripFilingFooter(normalizeWhitespace(currentText.String()))
sections = append(sections, *state.current)
}

// Strip filing-title footer from every section's tail. It is harmless when
// absent and avoids contaminating downstream consumers with EDINET metadata
// text that legitimately appears at the bottom of each HTML file.
for i := range sections {
sections[i].Text = stripFilingFooter(sections[i].Text)
}

return mergeAdjacentSameIDSections(sections)
}

Expand Down Expand Up @@ -183,12 +230,20 @@ func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState,
return
}

// Non-matching heading: if it is at the same depth as (or shallower
// than) the heading that opened the current section, treat it as a
// section boundary and flush. Deeper headings are sub-headings of the
// current section (e.g., "(2)役員の状況" inside a governance section
// anchored at h3) and should keep accumulating text.
if state.current != nil && level > 0 && level <= state.depth {
// Non-matching heading: flush only if it looks like a chapter-level
// EDINET 有報 heading (e.g., "4【関係会社の状況】", "第4【提出会社の
// 状況】") at the same depth as (or shallower than) the heading that
// opened the current section.
//
// Why filter by chapter-numbering pattern: some filers (e.g., セコム
// 第64期 S100W3TS) mark up *sub-section* headings such as
// "(1)事業環境に起因するリスク" with the same <h3> tag as the chapter
// heading "3【事業等のリスク】". Pure depth-based flushing would close
// the risk section at the first sub-heading and drop most of its
// content. Restricting the flush to chapter-numbered headings keeps
// sub-sections inside the open section while still closing it when a
// new chapter (matching or not) begins.
if state.current != nil && level > 0 && level <= state.depth && isChapterHeading(headingText) {
state.current.Text = normalizeWhitespace(text.String())
*sections = append(*sections, *state.current)
state.current = nil
Expand Down
133 changes: 133 additions & 0 deletions internal/extract/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,139 @@ func TestExtractSections_SameIDNestedHeading(t *testing.T) {
}
}

// TestExtractSections_SubHeadingSameHTag covers the pattern reported for
// Secom's 64th-term filing (S100W3TS): the chapter heading
// "3【事業等のリスク】" and sub-section headings such as
// "(1)事業環境に起因するリスク" all use the same <h3> level. The risk section
// must keep the text under every sub-heading and must end where the next
// chapter-numbered heading begins.
func TestExtractSections_SubHeadingSameHTag(t *testing.T) {
	const page = `<html><body>
<h3>3【事業等のリスク】</h3>
<p>リスクのイントロ段落です。</p>
<h3>(1)事業環境に起因するリスク</h3>
<p>事業環境リスクの本文です。</p>
<h3>①社会・経済</h3>
<p>社会・経済の説明です。</p>
<h3>(2)経営戦略に関するリスク</h3>
<p>経営戦略リスクの本文です。</p>
<h3>4【経営者による財政状態、経営成績及びキャッシュ・フローの状況の分析】</h3>
<p>MD&A の本文です。</p>
</body></html>`
	archive := createTestZip(t, map[string]string{"PublicDoc/main.htm": page})

	got, err := ExtractSections(archive)
	if err != nil {
		t.Fatalf("ExtractSections() error = %v", err)
	}

	// Locate the two sections of interest by ID.
	var risk, mda *Section
	for idx := range got {
		switch got[idx].ID {
		case "risk":
			risk = &got[idx]
		case "mda":
			mda = &got[idx]
		}
	}

	if risk == nil {
		t.Fatal("missing 'risk' section")
	}
	// Text under every same-level sub-heading must stay inside the risk section.
	expected := []string{"リスクのイントロ段落", "事業環境リスクの本文", "社会・経済の説明", "経営戦略リスクの本文"}
	for _, fragment := range expected {
		if strings.Contains(risk.Text, fragment) {
			continue
		}
		t.Errorf("risk.Text missing %q (text=%q)", fragment, risk.Text)
	}
	// The following chapter's body must not leak into the risk section.
	if strings.Contains(risk.Text, "MD&A の本文") {
		t.Errorf("risk.Text bled into MD&A chapter (text=%q)", risk.Text)
	}

	if mda == nil {
		t.Fatal("missing 'mda' section")
	}
	if !strings.Contains(mda.Text, "MD&A の本文") {
		t.Errorf("mda.Text missing expected content (text=%q)", mda.Text)
	}
}

// TestIsChapterHeading exercises isChapterHeading with chapter-numbered
// headings (expected true) and sub-section style headings using parenthesised,
// circled or alphabetic numbering (expected false).
func TestIsChapterHeading(t *testing.T) {
	type headingCase struct {
		heading  string
		expected bool
	}
	table := []headingCase{
		{heading: "4【コーポレート・ガバナンスの状況等】", expected: true},
		{heading: "第2【事業の状況】", expected: true},
		{heading: "2【沿革】", expected: true},
		{heading: "5 【従業員の状況】", expected: true},
		{heading: "第4 【提出会社の状況】", expected: true},
		{heading: "(1)事業環境に起因するリスク", expected: false},
		{heading: "(1)【コーポレート・ガバナンスの概要】", expected: false},
		{heading: "①社会・経済", expected: false},
		{heading: "a. 受注実績", expected: false},
		{heading: "②キャッシュ・フローの状況", expected: false},
	}
	for i := range table {
		entry := table[i]
		if actual := isChapterHeading(entry.heading); actual != entry.expected {
			t.Errorf("isChapterHeading(%q) = %v, want %v", entry.heading, actual, entry.expected)
		}
	}
}

// TestStripFilingFooter verifies that stripFilingFooter removes a trailing
// EDINET filing-title footer and leaves footer-free and empty input as is.
func TestStripFilingFooter(t *testing.T) {
	type footerCase struct {
		input    string
		expected string
	}
	table := []footerCase{
		{input: "ガバナンスの本文です。 有価証券報告書(通常方式)_20260326141712", expected: "ガバナンスの本文です。"},
		{input: "テキスト 有価証券報告書(通常方式)_20250523094900\n", expected: "テキスト"},
		{input: "footerなし", expected: "footerなし"},
		{input: "", expected: ""},
	}
	for i := range table {
		entry := table[i]
		if actual := stripFilingFooter(entry.input); actual != entry.expected {
			t.Errorf("stripFilingFooter(%q) = %q, want %q", entry.input, actual, entry.expected)
		}
	}
}

// TestExtractSections_FilingFooterStripped checks that a filing-title footer
// paragraph at the end of an HTML file does not end up in the body text of
// the governance section, while the section's real content is kept.
func TestExtractSections_FilingFooterStripped(t *testing.T) {
	const page = `<html><body>
<h3>4【コーポレート・ガバナンスの状況等】</h3>
<p>ガバナンスの本文です。</p>
<p>有価証券報告書(通常方式)_20260326141712</p>
</body></html>`
	archive := createTestZip(t, map[string]string{"PublicDoc/main.htm": page})

	got, err := ExtractSections(archive)
	if err != nil {
		t.Fatalf("ExtractSections() error = %v", err)
	}

	// Pick out the governance section; the last one with that ID wins.
	var governance *Section
	for idx := range got {
		if got[idx].ID != "governance" {
			continue
		}
		governance = &got[idx]
	}
	if governance == nil {
		t.Fatal("missing 'governance' section")
	}

	body := governance.Text
	if strings.Contains(body, "有価証券報告書(通常方式)_") {
		t.Errorf("governance.Text still contains filing footer artifact (text=%q)", governance.Text)
	}
	if !strings.Contains(body, "ガバナンスの本文") {
		t.Errorf("governance.Text missing expected content (text=%q)", governance.Text)
	}
}

// TestMergeAdjacentSameIDSections checks the merge safety net directly.
func TestMergeAdjacentSameIDSections(t *testing.T) {
in := []Section{
Expand Down
Loading