From 7b6cf52413de2d68a0df6770bacd64e343022962 Mon Sep 17 00:00:00 2001
From: Jack Cushman
Date: Wed, 3 Jun 2026 14:45:16 -0400
Subject: [PATCH] Summarize unclaimed rows in table_splitter.ambiguous suggestion

Instead of enumerating every unclaimed row index into one sentence
(which produced multi-hundred-thousand-character lines on large or
irregular tables), report the count plus a small capped sample.

Closes #88
---
 .../src/transformers/table_splitter.rs | 51 ++++++++++++++++---
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/binoc-stdlib/src/transformers/table_splitter.rs b/binoc-stdlib/src/transformers/table_splitter.rs
index ba939c2..7f99f9d 100644
--- a/binoc-stdlib/src/transformers/table_splitter.rs
+++ b/binoc-stdlib/src/transformers/table_splitter.rs
@@ -259,13 +259,9 @@ fn detect_sections(table: &TabularData) -> Detection {
 
     let ambiguous_reason = if sections.len() >= 2 && !wide_unclaimed.is_empty() {
         Some(format!(
-            "The CSV has stacked table-like regions, but row{} {} outside any clear rectangle; leaving it as one table.",
-            if wide_unclaimed.len() == 1 { "" } else { "s" },
-            wide_unclaimed
-                .iter()
-                .map(|row| row.to_string())
-                .collect::<Vec<_>>()
-                .join(", ")
+            "The CSV has stacked table-like regions, but {} {} outside any clear rectangle; leaving it as one table.",
+            summarize_unclaimed(&wide_unclaimed),
+            if wide_unclaimed.len() == 1 { "falls" } else { "fall" }
         ))
     } else {
         None
@@ -277,6 +273,32 @@ fn detect_sections(table: &TabularData) -> Detection {
     }
 }
 
+/// Summarize the unclaimed row indices as a count plus a small sample, rather
+/// than enumerating every index (which bloats the suggestion on large tables).
+fn summarize_unclaimed(rows: &[usize]) -> String {
+    const SAMPLE: usize = 5;
+
+    let count = rows.len();
+    let noun = if count == 1 { "row" } else { "rows" };
+
+    if count <= SAMPLE {
+        let list = rows
+            .iter()
+            .map(|row| row.to_string())
+            .collect::<Vec<_>>()
+            .join(", ");
+        return format!("{count} {noun} ({list})");
+    }
+
+    let sample = rows[..SAMPLE]
+        .iter()
+        .map(|row| row.to_string())
+        .collect::<Vec<_>>()
+        .join(", ");
+    let remaining = count - SAMPLE;
+    format!("{count} {noun} (e.g. {sample}, … and {remaining} more)")
+}
+
 fn raw_rows(table: &TabularData) -> Vec<Vec<String>> {
     std::iter::once(table.headers.clone())
         .chain(table.rows.clone())
@@ -508,4 +530,19 @@ mod tests {
         assert_eq!(detection.sections.len(), 2);
         assert!(detection.ambiguous_reason.is_some());
     }
+
+    #[test]
+    fn summarizes_unclaimed_rows_without_enumerating_all() {
+        let few = summarize_unclaimed(&[1, 2, 3]);
+        assert_eq!(few, "3 rows (1, 2, 3)");
+
+        let one = summarize_unclaimed(&[7]);
+        assert_eq!(one, "1 row (7)");
+
+        let many: Vec<usize> = (1..=10_000).collect();
+        let summary = summarize_unclaimed(&many);
+        assert_eq!(summary, "10000 rows (e.g. 1, 2, 3, 4, 5, … and 9995 more)");
+        // The whole point: the summary stays short regardless of row count.
+        assert!(summary.len() < 80);
+    }
 }