beatinaniwa · beatinaniwa · May 12, 2026 · May 11, 2026
diff --git a/cmd/doc.go b/cmd/doc.go
@@ -251,14 +251,26 @@ var docTextCmd = &cobra.Command{
 			if err != nil {
 				return fmt.Errorf("failed to extract sections: %w", err)
 			}
-			for _, s := range sections {
+			// Pick the candidate with the most text. EDINET filings sometimes
+			// produce more than one section with the same ID (e.g., a parent
+			// heading and a child heading that both match KnownSections).
+			// Returning the first match in document order can yield an empty
+			// or near-empty parent slice; the longest-match policy avoids this.
+			var best *extract.Section
+			for i := range sections {
+				s := &sections[i]
 				if s.ID == docTextSection || strings.Contains(s.Name, docTextSection) {
-					return outputResult(cmd.OutOrStdout(), map[string]string{
-						"section": s.ID,
-						"text":    s.Text,
-					})
+					if best == nil || len(s.Text) > len(best.Text) {
+						best = s
+					}
 				}
 			}
+			if best != nil {
+				return outputResult(cmd.OutOrStdout(), map[string]string{
+					"section": best.ID,
+					"text":    best.Text,
+				})
+			}
 			// Section not found — return full text with warning
 			_, _ = fmt.Fprintf(cmd.ErrOrStderr(), `{"warning":"section '%s' not found, returning full text"}`+"\n", docTextSection)
 		}

diff --git a/internal/extract/html.go b/internal/extract/html.go
@@ -97,25 +97,53 @@ func extractTextFromNode(n *html.Node, buf *strings.Builder) {
 	}
 }
 
+// sectionWalkState tracks the currently-open section and the heading depth
+// (h-level: 1..6) at which it was opened. Depth lets us flush on non-matching
+// sibling/parent headings while keeping sub-headings (deeper h-levels) as part
+// of the open section.
+type sectionWalkState struct {
+	current *Section
+	depth   int // h-level (1..6) where the current section was opened; 0 = no section open
+}
+
 func extractSectionsFromNodes(nodes []*html.Node) []Section {
 	var sections []Section
-	var currentSection *Section
+	state := &sectionWalkState{}
 	var currentText strings.Builder
 
 	for _, doc := range nodes {
-		walkForSections(doc, &sections, &currentSection, &currentText)
+		walkForSections(doc, &sections, state, &currentText)
 	}
 
 	// Flush last section
-	if currentSection != nil {
-		currentSection.Text = normalizeWhitespace(currentText.String())
-		sections = append(sections, *currentSection)
+	if state.current != nil {
+		state.current.Text = normalizeWhitespace(currentText.String())
+		sections = append(sections, *state.current)
 	}
 
-	return sections
+	return mergeAdjacentSameIDSections(sections)
+}
+
+// headingLevel returns the heading depth (1..6) for h1..h6 tags, or 0 otherwise.
+func headingLevel(tag string) int {
+	switch tag {
+	case "h1":
+		return 1
+	case "h2":
+		return 2
+	case "h3":
+		return 3
+	case "h4":
+		return 4
+	case "h5":
+		return 5
+	case "h6":
+		return 6
+	}
+	return 0
 }
 
-func walkForSections(n *html.Node, sections *[]Section, current **Section, text *strings.Builder) {
+func walkForSections(n *html.Node, sections *[]Section, state *sectionWalkState, text *strings.Builder) {
 	if n == nil {
 		return
 	}
@@ -128,23 +156,52 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text
 	// Check if this is a heading element
 	if n.Type == html.ElementNode && isHeadingElement(n.Data) {
 		headingText := getNodeText(n)
+		level := headingLevel(n.Data)
+
 		if secDef := MatchSection(headingText); secDef != nil {
-			// Flush previous section
-			if *current != nil {
-				(*current).Text = normalizeWhitespace(text.String())
-				*sections = append(*sections, **current)
+			// If the current section has the same ID, treat this heading as a
+			// sub-heading inside the same section: do not flush, do not reset
+			// text. This handles EDINET filings where a parent heading like
+			// "コーポレート・ガバナンスの状況等" is immediately followed by
+			// child headings like "コーポレート・ガバナンスの概要" that also
+			// match the same KnownSections entry.
+			if state.current != nil && state.current.ID == secDef.ID {
+				return
 			}
-			*current = &Section{
+
+			// Different section: flush previous and start new.
+			if state.current != nil {
+				state.current.Text = normalizeWhitespace(text.String())
+				*sections = append(*sections, *state.current)
+			}
+			state.current = &Section{
 				ID:   secDef.ID,
 				Name: headingText,
 			}
+			state.depth = level
+			text.Reset()
+			return
+		}
+
+		// Non-matching heading: if it is at the same depth as (or shallower
+		// than) the heading that opened the current section, treat it as a
+		// section boundary and flush. Deeper headings are sub-headings of the
+		// current section (e.g., "（２）役員の状況" inside a governance section
+		// anchored at h3) and should keep accumulating text.
+		if state.current != nil && level > 0 && level <= state.depth {
+			state.current.Text = normalizeWhitespace(text.String())
+			*sections = append(*sections, *state.current)
+			state.current = nil
+			state.depth = 0
 			text.Reset()
+			// Return without descending so the heading's own text is not
+			// collected into any section.
 			return
 		}
 	}
 
 	// Collect text for current section
-	if n.Type == html.TextNode && *current != nil {
+	if n.Type == html.TextNode && state.current != nil {
 		t := strings.TrimSpace(n.Data)
 		if t != "" {
 			text.WriteString(t)
@@ -153,8 +210,35 @@ func walkForSections(n *html.Node, sections *[]Section, current **Section, text
 	}
 
 	for c := n.FirstChild; c != nil; c = c.NextSibling {
-		walkForSections(c, sections, current, text)
+		walkForSections(c, sections, state, text)
+	}
+}
+
+// mergeAdjacentSameIDSections concatenates consecutive sections that share
+// the same non-empty ID, keeping the first one's Name. It is a safety net for
+// the same section opening twice in a row (e.g., split across HTML files);
+// the depth-aware walker prevents most occurrences, merging covers the rest.
+func mergeAdjacentSameIDSections(in []Section) []Section {
+	if len(in) <= 1 {
+		return in
+	}
+	out := make([]Section, 0, len(in))
+	out = append(out, in[0])
+	for i := 1; i < len(in); i++ {
+		last := &out[len(out)-1]
+		if last.ID != "" && last.ID == in[i].ID {
+			if in[i].Text != "" {
+				if last.Text != "" {
+					last.Text += " " + in[i].Text
+				} else {
+					last.Text = in[i].Text
+				}
+			}
+			continue
+		}
+		out = append(out, in[i])
 	}
+	return out
 }
 
 func getNodeText(n *html.Node) string {

diff --git a/internal/extract/html_test.go b/internal/extract/html_test.go
@@ -148,3 +148,130 @@ func TestMatchSection_Unknown(t *testing.T) {
 		t.Errorf("MatchSection should return nil for unknown heading, got %v", sec)
 	}
 }
+
+// TestExtractSections_BleedAcrossUnknownHeadings reproduces the bleed-truncated
+// pattern observed in EDINET filings such as docID S100XS22 (日本マクドナルド
+// HD 第55期): the "従業員の状況" section is followed by unknown headings
+// (関係会社の状況, 第２【事業の状況】, 経営方針, サステナビリティ) before the
+// next recognised heading (事業等のリスク). The depth-aware walker should flush
+// the employees section at the next heading whose h-level is the same as (or
+// shallower than) the heading that opened it, keeping employees content from
+// bleeding into the following chapters.
+func TestExtractSections_BleedAcrossUnknownHeadings(t *testing.T) {
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>５【従業員の状況】</h3>
+			<p>従業員数は2,454名です。</p>
+			<h3>４【関係会社の状況】</h3>
+			<p>関係会社の説明です。</p>
+			<h2>第２【事業の状況】</h2>
+			<h3>１【経営方針、経営環境及び対処すべき課題等】</h3>
+			<p>経営方針の説明です。</p>
+			<h3>２【サステナビリティに関する考え方及び取組】</h3>
+			<p>サステナビリティの説明です。</p>
+			<h3>３【事業等のリスク】</h3>
+			<p>リスクの説明です。</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+
+	var employees, risk *Section
+	for i := range sections {
+		s := &sections[i]
+		if s.ID == "employees" {
+			employees = s
+		}
+		if s.ID == "risk" {
+			risk = s
+		}
+	}
+
+	if employees == nil {
+		t.Fatal("missing 'employees' section")
+	}
+	if !strings.Contains(employees.Text, "従業員数は2,454名") {
+		t.Errorf("employees.Text = %q, missing expected content", employees.Text)
+	}
+	for _, leak := range []string{"関係会社の説明", "経営方針の説明", "サステナビリティの説明", "リスクの説明"} {
+		if strings.Contains(employees.Text, leak) {
+			t.Errorf("employees.Text bled into other chapter (found %q)", leak)
+		}
+	}
+	if risk == nil {
+		t.Fatal("missing 'risk' section")
+	}
+	if !strings.Contains(risk.Text, "リスクの説明") {
+		t.Errorf("risk.Text = %q, missing expected content", risk.Text)
+	}
+}
+
+// TestExtractSections_SameIDNestedHeading reproduces the empty-section pattern
+// observed for governance: a parent heading "コーポレート・ガバナンスの状況等"
+// (h3) is immediately followed by a child heading "コーポレート・ガバナンスの
+// 概要" (h4) — both match the governance KnownSections entry. The previous
+// implementation flushed and reset the section on the second match, leaving
+// the parent section empty. With the same-ID continuation rule, the deeper
+// heading is treated as a sub-heading inside the open governance section.
+func TestExtractSections_SameIDNestedHeading(t *testing.T) {
+	data := createTestZip(t, map[string]string{
+		"PublicDoc/main.htm": `<html><body>
+			<h3>４【コーポレート・ガバナンスの状況等】</h3>
+			<h4>（１）【コーポレート・ガバナンスの概要】</h4>
+			<p>ガバナンスの概要本文です。</p>
+			<h4>（２）【役員の状況】</h4>
+			<p>役員の状況本文です。</p>
+			<h4>（３）【監査の状況】</h4>
+			<p>監査の状況本文です。</p>
+			<h3>５【提出会社の株式事務の概要】</h3>
+			<p>株式事務の概要本文です。</p>
+		</body></html>`,
+	})
+
+	sections, err := ExtractSections(data)
+	if err != nil {
+		t.Fatalf("ExtractSections() error = %v", err)
+	}
+
+	var governance *Section
+	for i := range sections {
+		s := &sections[i]
+		if s.ID == "governance" {
+			governance = s
+		}
+	}
+
+	if governance == nil {
+		t.Fatal("missing 'governance' section")
+	}
+	for _, want := range []string{"ガバナンスの概要本文", "役員の状況本文", "監査の状況本文"} {
+		if !strings.Contains(governance.Text, want) {
+			t.Errorf("governance.Text missing expected content %q (text=%q)", want, governance.Text)
+		}
+	}
+	if strings.Contains(governance.Text, "株式事務の概要本文") {
+		t.Errorf("governance.Text bled into next chapter (株式事務の概要)")
+	}
+}
+
+// TestMergeAdjacentSameIDSections checks the merge safety net directly.
+func TestMergeAdjacentSameIDSections(t *testing.T) {
+	in := []Section{
+		{ID: "governance", Name: "コーポレート・ガバナンスの状況等", Text: ""},
+		{ID: "governance", Name: "コーポレート・ガバナンスの概要", Text: "ガバナンス本文"},
+		{ID: "financial", Name: "連結財務諸表", Text: "財務諸表本文"},
+	}
+	out := mergeAdjacentSameIDSections(in)
+	if len(out) != 2 {
+		t.Fatalf("len = %d, want 2 (governance merged + financial)", len(out))
+	}
+	if out[0].ID != "governance" {
+		t.Errorf("out[0].ID = %q, want governance", out[0].ID)
+	}
+	if !strings.Contains(out[0].Text, "ガバナンス本文") {
+		t.Errorf("merged governance.Text = %q, missing content", out[0].Text)
+	}
+}