diff --git a/csv_gp/src/parser.rs b/csv_gp/src/parser.rs index 21b0af8..642e8bc 100644 --- a/csv_gp/src/parser.rs +++ b/csv_gp/src/parser.rs @@ -11,7 +11,7 @@ impl CSVReader { Self { reader, delimiter } } - /// Returns a owned iterator of all the csv lines + /// Returns an owned iterator of all the csv lines fn into_lines(self) -> CSVLineIntoIter { CSVLineIntoIter::new(self) } @@ -31,39 +31,96 @@ impl CSVLineIntoIter { } } -/// Determines if a passed string has fully closed quotes or not -fn has_open_quotes(s: &str, delimiter: char) -> bool { - let mut is_open = false; - let mut prev_char: Option = None; +/// Quote parse state across line boundaries. +#[derive(Clone, Copy)] +struct QuoteState { + is_open: bool, + prev_char: Option, + prev_prev_char: Option, +} + +impl QuoteState { + fn initial() -> Self { + Self { + is_open: false, + prev_char: None, + prev_prev_char: None, + } + } +} - // Remove quoted quotes - Basically any two consecutive quotes can be ignored for the purposes of this - // function. - // After removing quoted quotes, we can also remove any case of "," if delimiter is ,; as those don't count - // as open cell - let s2 = s - .replace("\"\"", "") - .replace(&format!("\"{}\"", delimiter), ""); +/// Returns quote state after processing `s` with the given initial state. +fn quote_state_after(s: &str, delimiter: char, initial: QuoteState) -> QuoteState { + let mut is_open = initial.is_open; + let mut prev_char = initial.prev_char; + let mut prev_prev_char = initial.prev_prev_char; + let mut chars = s.chars().peekable(); - let mut chars = s2.chars().peekable(); while let Some(current_char) = chars.next() { - match (prev_char, current_char, chars.peek()) { - // Quote at beginning of line - (None, '"', _) => is_open = true, - // Quote at the end of the string preceeded by the delimiter (`,"`), unless already open - (Some(c), '"', None) if c == delimiter && !is_open => is_open = true, - // Quote at the end of the string - (_, '"', None) => is_open = false, - // Quote followed by the delimiter (`",`), if already open - (_, '"', Some(n)) if n == &delimiter && is_open => is_open = false, - // Quote preceded by the delimiter (`,"`) - (Some(c), '"', _) if c == delimiter => is_open = true, - _ => (), - } + if current_char == '"' { + if chars.peek() == Some(&'"') { + chars.next(); + prev_prev_char = prev_char; + prev_char = Some('"'); + continue; + } + + // Handle consecutive delimiter-only cells pattern: ,"","," or ,","," + // When closing a cell that contains just the delimiter, and next is delimiter, + // and we just came from pattern that started with ," + // This handles the specific case where prev content was just the delimiter char + if is_open + && prev_char == Some(delimiter) + && prev_prev_char == Some('"') + && chars.peek() == Some(&delimiter) + { + // This quote closes a "," cell; consume the closing quote and following delimiter + chars.next(); + is_open = false; + prev_prev_char = Some('"'); + prev_char = Some(delimiter); + continue; + } + let at_start = prev_char.is_none(); + let after_delimiter = prev_char == Some(delimiter); + let at_end = chars.peek().is_none(); + let before_delimiter = chars.peek() == Some(&delimiter); + let single_quote_after_escaped = at_end && prev_char == Some('"'); + + match ( + at_start, + after_delimiter, + at_end, + before_delimiter, + is_open, + single_quote_after_escaped, + ) { + // closing quote: inside quoted field, next not delimiter, prev was " + (_, _, _, false, true, _) if prev_char == Some('"') => is_open = false, + // at start of cell or after delimiter when not open → opening quote + // This must come before EOL close to handle ," at line end + (true, _, _, _, false, _) | (_, true, _, _, false, _) => is_open = true, + // end of string when open (and not escaped single-quote) or delimiter after open → close + (_, _, true, _, true, false) | (_, _, _, true, true, _) => is_open = false, + // at EOL, was open, single-quote-after-escaped → close (empty quoted at EOL) + (_, _, true, _, true, true) => is_open = false, + // at EOL, was not open, single-quote-after-escaped → open (dangling quote) + (_, _, true, _, false, true) => is_open = true, + // inside content, prev was " (e.g. after escaped quote) → stay in quoted field + (_, false, false, _, _, _) if prev_char == Some('"') => is_open = true, + _ => (), + } + } + prev_prev_char = prev_char; prev_char = Some(current_char); } - is_open + QuoteState { + is_open, + prev_char, + prev_prev_char, + } } impl Iterator for CSVLineIntoIter { @@ -71,6 +128,7 @@ impl Iterator for CSVLineIntoIter { fn next(&mut self) -> Option { let mut current_selection = String::new(); + let mut state = QuoteState::initial(); loop { match self.lines.next() { @@ -89,8 +147,9 @@ impl Iterator for CSVLineIntoIter { let line = line.trim_end_matches('\r'); current_selection.push_str(line); + state = quote_state_after(line, self.delimiter, state); - if has_open_quotes(¤t_selection, self.delimiter) { + if state.is_open { // this newline is escaped, add back to text and continue loop current_selection.push('\n'); } else { @@ -153,84 +212,128 @@ mod has_open_quotes_tests { fn test_empty() { let input = ""; - assert!(!has_open_quotes(input, ',')) + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_no_quotes() { let input = "asdfasdf"; - assert!(!has_open_quotes(input, ',')) + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_with_opened_quote() { let input = "\"asdfasdf"; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_with_closed_quote() { let input = "\"\"asdfasdf"; - assert!(!has_open_quotes(input, ',')) + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_two_quotes_middle() { let input = "\"asdf\"\"asdf"; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_two_quotes_end() { let input = "\"asdfasdf\"\""; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_three_quotes_end() { let input = "\"asdfasdf\"\"\""; - assert!(!has_open_quotes(input, ',')) + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_three_quotes_start() { let input = "\"\"\"asdfasdf"; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_only_three_quotes_start() { let input = "\"\"\""; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_three_quotes_end_of_line() { let input = "X,\"\"\""; - assert!(has_open_quotes(input, ',')) + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open) } #[test] fn test_just_delimiter_quotes() { let input = "d,e,\",\""; - assert!(!has_open_quotes(input, ',')); + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open); } #[test] fn test_just_delimiter_open() { let input = "a,,\","; - assert!(has_open_quotes(input, ',')); + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open); + } + + #[test] + fn test_unclosed_quote_mid_field() { + let input = "a|b|\"text|d|e"; + assert!(quote_state_after(input, '|', QuoteState::initial()).is_open); + } + + #[test] + fn test_unclosed_quote_with_pipe_delimiter() { + let input = "a|b||c|d|e|f|g|h|i|j|k|\"unclosed|m|n|o"; + assert!(quote_state_after(input, '|', QuoteState::initial()).is_open); + } + + #[test] + fn test_pipe_delimiter_basic() { + let input = "a|b|c"; + assert!(!quote_state_after(input, '|', QuoteState::initial()).is_open); + } + + #[test] + fn test_pipe_delimiter_quoted_cell() { + let input = "a|\"b|c\"|d"; + assert!(!quote_state_after(input, '|', QuoteState::initial()).is_open); + } + + #[test] + fn test_pipe_delimiter_only_cell() { + let input = "a|\"|\""; + assert!(!quote_state_after(input, '|', QuoteState::initial()).is_open); + } + + #[test] + fn test_consecutive_delimiter_only_cells() { + // Two consecutive "," cells: ,",",",", (empty, ",", ",", empty) + let input = ",\",\",\",\","; + assert!(!quote_state_after(input, ',', QuoteState::initial()).is_open); + } + + #[test] + fn test_unclosed_delimiter_only_cell() { + // One complete "," cell followed by incomplete: ,",","," (missing closing quote) + let input = ",\",\",\","; + assert!(quote_state_after(input, ',', QuoteState::initial()).is_open); } } @@ -490,6 +593,42 @@ mod parse_rows_tests { ] ) } + + #[test] + fn test_unclosed_quote_accumulates_following_lines() { + let input = "a|b|c\nd|\"unclosed|f\ng|h|i\nj|k|l".as_bytes(); + let result = CSVReader::new(input, '|') + .into_lines() + .collect::, _>>(); + + let rows = result.unwrap(); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0].len(), 3); + let row1_str: String = rows[1] + .iter() + .map(|c| c.to_string()) + .collect::>() + .join("|"); + assert!(row1_str.contains("unclosed")); + } + + #[test] + fn test_escaped_quote_at_line_boundary_stays_open() { + let input = "a|b|c\nd|\"b\n\"\"|c|d\ne|f|g".as_bytes(); + let result = CSVReader::new(input, '|') + .into_lines() + .collect::, _>>(); + + let rows = result.unwrap(); + assert_eq!(rows.len(), 2); + let row1_str: String = rows[1] + .iter() + .map(|c| c.to_string()) + .collect::>() + .join("|"); + assert!(row1_str.contains("|c|d")); + assert!(row1_str.contains("e|f|g")); + } } #[cfg(test)] @@ -540,3 +679,146 @@ mod parse_cells_tests { assert_eq!(parse_cells("", ',').unwrap(), vec![]) } } + +#[cfg(test)] +mod equivalence_tests { + use super::*; + use std::io::BufRead; + + /// Full-buffer reference for equivalence tests. + fn rows_via_full_buffer( + input: &[u8], + delimiter: char, + ) -> Result>, std::io::Error> { + let lines = std::io::BufReader::new(input).lines(); + let mut current_selection = String::new(); + let mut rows = Vec::new(); + + for line in lines { + let line: String = line?.trim_end_matches('\r').to_string(); + current_selection.push_str(&line); + if quote_state_after(¤t_selection, delimiter, QuoteState::initial()).is_open { + current_selection.push('\n'); + } else { + rows.push(parse_cells(¤t_selection, delimiter)?); + current_selection.clear(); + } + } + if !current_selection.is_empty() { + rows.push(parse_cells(¤t_selection, delimiter)?); + } + Ok(rows) + } + + #[test] + fn test_incremental_matches_full_buffer_simple() { + let input = "a|b|c\nd|e|f\ng|h|i".as_bytes(); + let iter_rows = CSVReader::new(input, '|') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, '|').unwrap(); + assert_eq!(iter_rows, ref_rows); + } + + #[test] + fn test_incremental_matches_full_buffer_unclosed_quote() { + let input = "a|b|c\nd|\"unclosed|f\ng|h|i\nj|k|l".as_bytes(); + let iter_rows = CSVReader::new(input, '|') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, '|').unwrap(); + assert_eq!(iter_rows, ref_rows); + } + + #[test] + fn test_incremental_matches_full_buffer_escaped_at_boundary() { + let input = "a|b|c\nd|\"b\n\"\"|c|d\ne|f|g".as_bytes(); + let iter_rows = CSVReader::new(input, '|') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, '|').unwrap(); + assert_eq!(iter_rows, ref_rows); + } + + #[test] + fn test_incremental_matches_full_buffer_quoted_newline() { + let input = "\"test\n\",\"broken\ncolumn\"\nnext,row".as_bytes(); + let iter_rows = CSVReader::new(input, ',') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, ',').unwrap(); + assert_eq!(iter_rows, ref_rows); + } + + #[test] + fn test_incremental_matches_full_buffer_delimiter_only_cell() { + let input = "a,\",\",b\nc,d,e".as_bytes(); + let iter_rows = CSVReader::new(input, ',') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, ',').unwrap(); + assert_eq!(iter_rows, ref_rows); + } + + #[test] + fn test_incremental_matches_full_buffer_delimiter_only_at_boundary() { + let input = "a,\",\"\n\"b\",c".as_bytes(); + let iter_rows = CSVReader::new(input, ',') + .into_lines() + .collect::, _>>() + .unwrap(); + let ref_rows = rows_via_full_buffer(input, ',').unwrap(); + assert_eq!(iter_rows, ref_rows); + } +} + +#[cfg(test)] +mod performance_tests { + use super::*; + use std::time::Instant; + + #[test] + fn test_large_accumulated_string_performance() { + let line = "a|b||c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|1|2|3\n"; + let mut accumulated = String::with_capacity(line.len() * 10_000); + for _ in 0..10_000 { + accumulated.push_str(line); + } + + let start = Instant::now(); + let _ = quote_state_after(&accumulated, '|', QuoteState::initial()).is_open; + let elapsed = start.elapsed(); + + assert!( + elapsed.as_millis() < 100, + "quote_state_after took {}ms for 10k lines, expected <100ms", + elapsed.as_millis() + ); + } + + #[test] + fn test_iterator_on_accumulated_lines_fast() { + let line = "a|\"unclosed|b|c\n"; + let mut input = String::with_capacity(line.len() * 5000); + for _ in 0..5000 { + input.push_str(line); + } + let start = Instant::now(); + let rows = CSVReader::new(input.as_bytes(), '|') + .into_lines() + .collect::, _>>() + .unwrap(); + let elapsed = start.elapsed(); + assert_eq!(rows.len(), 1); + assert!( + elapsed.as_millis() < 500, + "iterator took {}ms", + elapsed.as_millis() + ); + } +} diff --git a/csv_gp_python/tests/fixtures/unclosed_quote.csv b/csv_gp_python/tests/fixtures/unclosed_quote.csv new file mode 100644 index 0000000..0329706 --- /dev/null +++ b/csv_gp_python/tests/fixtures/unclosed_quote.csv @@ -0,0 +1,3 @@ +a|b|c +d|"unclosed|f +g|h|i diff --git a/csv_gp_python/tests/test_integration.py b/csv_gp_python/tests/test_integration.py index d407104..a09044f 100644 --- a/csv_gp_python/tests/test_integration.py +++ b/csv_gp_python/tests/test_integration.py @@ -159,3 +159,14 @@ def test_incorrect_quote(): assert result.column_count_per_line == [3, 3, 2, 1] assert result.valid_rows == {0} assert not result.header_messed_up + + +def test_unclosed_quote_with_pipe(): + """Test that files with unclosed quotes are handled correctly.""" + result = csv_gp.check_file(str(FIXTURES / "unclosed_quote.csv"), "|", encoding="utf-8") + + assert result + assert result.row_count == 2 # Header + merged row + assert result.column_count == 3 + assert result.incorrect_cell_quote == [1] + assert result.header_messed_up # Merged row has wrong column count