Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions cmd/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,14 +251,26 @@ var docTextCmd = &cobra.Command{
if err != nil {
return fmt.Errorf("failed to extract sections: %w", err)
}
for _, s := range sections {
// Pick the candidate with the most text. EDINET filings sometimes
// produce more than one section with the same ID (e.g., a parent
// heading and a child heading that both match KnownSections).
// Returning the first match in document order can yield an empty
// or near-empty parent slice; the longest-match policy avoids this.
var best *extract.Section
for i := range sections {
s := &sections[i]
if s.ID == docTextSection || strings.Contains(s.Name, docTextSection) {
return outputResult(cmd.OutOrStdout(), map[string]string{
"section": s.ID,
"text": s.Text,
})
if best == nil || len(s.Text) > len(best.Text) {
best = s
}
}
}
if best != nil {
return outputResult(cmd.OutOrStdout(), map[string]string{
"section": best.ID,
"text": best.Text,
})
}
// Section not found — return full text with warning
_, _ = fmt.Fprintf(cmd.ErrOrStderr(), `{"warning":"section '%s' not found, returning full text"}`+"\n", docTextSection)
}
Expand Down
112 changes: 98 additions & 14 deletions internal/extract/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,25 +97,53 @@ func extractTextFromNode(n *html.Node, buf *strings.Builder) {
}
}

// sectionWalkState carries the section walker's mutable state across its
// recursive calls: the section currently being accumulated and the heading
// level (h1..h6 mapped to 1..6) of the heading that opened it. The walker
// compares that level against later non-matching headings: one at the same or
// a shallower level closes the section, while a deeper one is treated as a
// sub-heading and its content keeps accumulating into the open section.
type sectionWalkState struct {
// current is the section being accumulated; nil means no section is open.
current *Section
depth int // h-level (1..6) where the current section was opened; 0 = no section open
}

func extractSectionsFromNodes(nodes []*html.Node) []Section {
var sections []Section
var currentSection *Section
state := &sectionWalkState{}
var currentText strings.Builder

for _, doc := range nodes {
walkForSections(doc, &sections, &currentSection, &currentText)
walkForSections(doc, &sections, state, &currentText)
}

// Flush last section
if currentSection != nil {
currentSection.Text = normalizeWhitespace(currentText.String())
sections = append(sections, *currentSection)
if state.current != nil {
state.current.Text = normalizeWhitespace(currentText.String())
sections = append(sections, *state.current)
}

return sections
return mergeAdjacentSameIDSections(sections)
}

// headingLevel maps an HTML heading tag name to its numeric depth: "h1"
// yields 1 and so on up to "h6", which yields 6. Every other string yields 0,
// including the empty string, upper-case variants such as "H1", and
// out-of-range names like "h0" or "h7".
func headingLevel(tag string) int {
	// A heading tag is exactly two bytes: a lower-case 'h' and one digit.
	if len(tag) != 2 || tag[0] != 'h' {
		return 0
	}
	if digit := tag[1]; digit >= '1' && digit <= '6' {
		return int(digit - '0')
	}
	return 0
}

func walkForSections(n *html.Node, sections *[]Section, current **Section, text *strings.Builder) {
func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState, text *strings.Builder) {
if n == nil {
return
}
Expand All @@ -128,23 +156,52 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text
// Check if this is a heading element
if n.Type == html.ElementNode && isHeadingElement(n.Data) {
headingText := getNodeText(n)
level := headingLevel(n.Data)

if secDef := MatchSection(headingText); secDef != nil {
// Flush previous section
if *current != nil {
(*current).Text = normalizeWhitespace(text.String())
*sections = append(*sections, **current)
// If the current section has the same ID, treat this heading as a
// sub-heading inside the same section: do not flush, do not reset
// text. This handles EDINET filings where a parent heading like
// "コーポレート・ガバナンスの状況等" is immediately followed by
// child headings like "コーポレート・ガバナンスの概要" that also
// match the same KnownSections entry.
if state.current != nil && state.current.ID == secDef.ID {
return
}
*current = &Section{

// Different section: flush previous and start new.
if state.current != nil {
state.current.Text = normalizeWhitespace(text.String())
*sections = append(*sections, *state.current)
}
state.current = &Section{
ID: secDef.ID,
Name: headingText,
}
state.depth = level
text.Reset()
return
}

// Non-matching heading: if it is at the same depth as (or shallower
// than) the heading that opened the current section, treat it as a
// section boundary and flush. Deeper headings are sub-headings of the
// current section (e.g., "(2)役員の状況" inside a governance section
// anchored at h3) and should keep accumulating text.
if state.current != nil && level > 0 && level <= state.depth {
state.current.Text = normalizeWhitespace(text.String())
*sections = append(*sections, *state.current)
state.current = nil
state.depth = 0
text.Reset()
// Return without descending into the heading's children so its own text
// is not collected into any section.
return
}
}

// Collect text for current section
if n.Type == html.TextNode && *current != nil {
if n.Type == html.TextNode && state.current != nil {
t := strings.TrimSpace(n.Data)
if t != "" {
text.WriteString(t)
Expand All @@ -153,8 +210,35 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text
}

for c := n.FirstChild; c != nil; c = c.NextSibling {
walkForSections(c, sections, current, text)
walkForSections(c, sections, state, text)
}
}

// mergeAdjacentSameIDSections collapses runs of consecutive sections that
// share the same non-empty ID into a single entry. The first section of a run
// is kept (including its Name) and the non-empty Text values of the following
// sections are appended to it, separated by a single space. Sections with an
// empty ID are never merged.
//
// It acts as a safety net for the case where the same section is opened twice
// in a row (e.g., split across HTML files).
//
// A slice with fewer than two elements is returned as-is; otherwise a new
// slice is built and the elements of in are left unmodified.
func mergeAdjacentSameIDSections(in []Section) []Section {
	if len(in) < 2 {
		return in
	}

	merged := make([]Section, 0, len(in))
	for idx, sec := range in {
		if idx > 0 {
			tail := &merged[len(merged)-1]
			if tail.ID != "" && tail.ID == sec.ID {
				switch {
				case sec.Text == "":
					// Nothing to add from an empty continuation.
				case tail.Text == "":
					tail.Text = sec.Text
				default:
					tail.Text += " " + sec.Text
				}
				continue
			}
		}
		merged = append(merged, sec)
	}
	return merged
}

func getNodeText(n *html.Node) string {
Expand Down
127 changes: 127 additions & 0 deletions internal/extract/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,130 @@ func TestMatchSection_Unknown(t *testing.T) {
t.Errorf("MatchSection should return nil for unknown heading, got %v", sec)
}
}

// TestExtractSections_BleedAcrossUnknownHeadings covers the "bleed" pattern
// seen in EDINET filings such as docID S100XS22 (日本マクドナルドHD 第55期):
// the "従業員の状況" heading is followed by several headings that are not
// recognised sections (関係会社の状況, 第2【事業の状況】, 経営方針,
// サステナビリティ) before the next recognised one (事業等のリスク).
//
// The employees section must end at the first following heading whose h-level
// equals that of the heading which opened it, so that none of the later
// chapters' text ends up inside employees.
func TestExtractSections_BleedAcrossUnknownHeadings(t *testing.T) {
	const mainDoc = `<html><body>
<h3>5【従業員の状況】</h3>
<p>従業員数は2,454名です。</p>
<h3>4【関係会社の状況】</h3>
<p>関係会社の説明です。</p>
<h2>第2【事業の状況】</h2>
<h3>1【経営方針、経営環境及び対処すべき課題等】</h3>
<p>経営方針の説明です。</p>
<h3>2【サステナビリティに関する考え方及び取組】</h3>
<p>サステナビリティの説明です。</p>
<h3>3【事業等のリスク】</h3>
<p>リスクの説明です。</p>
</body></html>`

	archive := createTestZip(t, map[string]string{
		"PublicDoc/main.htm": mainDoc,
	})

	got, err := ExtractSections(archive)
	if err != nil {
		t.Fatalf("ExtractSections() error = %v", err)
	}

	// Index the result by ID; if an ID repeats, the last occurrence wins.
	byID := make(map[string]*Section, len(got))
	for i := range got {
		byID[got[i].ID] = &got[i]
	}

	employees := byID["employees"]
	if employees == nil {
		t.Fatal("missing 'employees' section")
	}
	if !strings.Contains(employees.Text, "従業員数は2,454名") {
		t.Errorf("employees.Text = %q, missing expected content", employees.Text)
	}

	// Text from any later chapter must not appear in the employees section.
	foreign := []string{"関係会社の説明", "経営方針の説明", "サステナビリティの説明", "リスクの説明"}
	for _, leak := range foreign {
		if strings.Contains(employees.Text, leak) {
			t.Errorf("employees.Text bled into other chapter (found %q)", leak)
		}
	}

	risk := byID["risk"]
	if risk == nil {
		t.Fatal("missing 'risk' section")
	}
	if !strings.Contains(risk.Text, "リスクの説明") {
		t.Errorf("risk.Text = %q, missing expected content", risk.Text)
	}
}

// TestExtractSections_SameIDNestedHeading covers the empty-section pattern
// seen for governance: a parent heading "コーポレート・ガバナンスの状況等"
// (h3) is directly followed by a child heading "コーポレート・ガバナンスの
// 概要" (h4), and both match the governance KnownSections entry.
//
// Flushing and restarting the section on the second match would leave the
// parent section empty. The same-ID continuation rule instead treats the
// deeper heading as a sub-heading of the governance section that is already
// open, so all of the h4 sub-sections must land in governance and the next h3
// chapter must not.
func TestExtractSections_SameIDNestedHeading(t *testing.T) {
	const mainDoc = `<html><body>
<h3>4【コーポレート・ガバナンスの状況等】</h3>
<h4>(1)【コーポレート・ガバナンスの概要】</h4>
<p>ガバナンスの概要本文です。</p>
<h4>(2)【役員の状況】</h4>
<p>役員の状況本文です。</p>
<h4>(3)【監査の状況】</h4>
<p>監査の状況本文です。</p>
<h3>5【提出会社の株式事務の概要】</h3>
<p>株式事務の概要本文です。</p>
</body></html>`

	archive := createTestZip(t, map[string]string{
		"PublicDoc/main.htm": mainDoc,
	})

	got, err := ExtractSections(archive)
	if err != nil {
		t.Fatalf("ExtractSections() error = %v", err)
	}

	// Keep the last section carrying the governance ID, if any.
	var governance *Section
	for i := range got {
		if got[i].ID == "governance" {
			governance = &got[i]
		}
	}
	if governance == nil {
		t.Fatal("missing 'governance' section")
	}

	expected := []string{"ガバナンスの概要本文", "役員の状況本文", "監査の状況本文"}
	for _, want := range expected {
		if !strings.Contains(governance.Text, want) {
			t.Errorf("governance.Text missing expected content %q (text=%q)", want, governance.Text)
		}
	}

	if strings.Contains(governance.Text, "株式事務の概要本文") {
		t.Errorf("governance.Text bled into next chapter (株式事務の概要)")
	}
}

// TestMergeAdjacentSameIDSections exercises the merge safety net directly:
// two consecutive governance entries (the first with empty text) followed by
// a financial entry must collapse to exactly two sections. The merged entry
// keeps the first entry's Name and takes the second entry's text verbatim,
// the unrelated trailing section passes through untouched, and the input
// slice is not modified.
func TestMergeAdjacentSameIDSections(t *testing.T) {
	in := []Section{
		{ID: "governance", Name: "コーポレート・ガバナンスの状況等", Text: ""},
		{ID: "governance", Name: "コーポレート・ガバナンスの概要", Text: "ガバナンス本文"},
		{ID: "financial", Name: "連結財務諸表", Text: "財務諸表本文"},
	}
	out := mergeAdjacentSameIDSections(in)
	if len(out) != 2 {
		t.Fatalf("len = %d, want 2 (governance merged + financial)", len(out))
	}

	if out[0].ID != "governance" {
		t.Errorf("out[0].ID = %q, want governance", out[0].ID)
	}
	// The run's first entry is the one that survives, so its Name is kept.
	if want := "コーポレート・ガバナンスの状況等"; out[0].Name != want {
		t.Errorf("out[0].Name = %q, want %q", out[0].Name, want)
	}
	// An empty leading text must not introduce a separator before the content.
	if want := "ガバナンス本文"; out[0].Text != want {
		t.Errorf("merged governance.Text = %q, want %q", out[0].Text, want)
	}

	if out[1].ID != "financial" {
		t.Errorf("out[1].ID = %q, want financial", out[1].ID)
	}
	if want := "財務諸表本文"; out[1].Text != want {
		t.Errorf("out[1].Text = %q, want %q", out[1].Text, want)
	}

	// Merging builds a new slice; the caller's elements stay as they were.
	if in[0].Text != "" {
		t.Errorf("input mutated: in[0].Text = %q, want empty", in[0].Text)
	}
}
Loading