From 8eee452beb755d812b4c7bf297d5508430815160 Mon Sep 17 00:00:00 2001
From: Wes Etheredge
Date: Tue, 24 Jun 2025 17:30:22 -0500
Subject: [PATCH] Implement JSON and CSV data processing tools (Phase 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added comprehensive data module with JSON and CSV processing capabilities:

Infrastructure:
- New exception classes: DataError, ValidationError, SerializationError
- Data-specific type definitions: DataDict, NestedData, ValidationResult
- Helper functions for loading tools by category

JSON Tools (json_tools.py):
- safe_json_serialize/deserialize with error handling
- validate_json_string for validation without parsing
- compress/decompress_json_data for efficient storage
- Full Unicode support and comprehensive error handling

CSV Tools (csv_tools.py):
- read/write_csv_file with flexible delimiter and header options
- csv_to_dict_list and dict_list_to_csv for string conversion
- detect_csv_delimiter for auto-detection
- validate_csv_structure for file validation
- clean_csv_data with configurable cleaning rules

Testing:
- 71 comprehensive tests covering all functions
- 91% coverage for CSV tools, 100% for JSON tools
- Edge cases, error conditions, and round-trip validation

Integration:
- Updated main package to export data module
- Added helper functions for selective tool loading
- Maintains project's zero runtime dependencies

πŸ€– Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 src/basic_open_agent_tools/__init__.py        |  10 +-
 src/basic_open_agent_tools/data/__init__.py   |  41 +-
 src/basic_open_agent_tools/data/csv_tools.py  | 376 +++++++++++++++
 src/basic_open_agent_tools/data/json_tools.py | 144 ++++++
 src/basic_open_agent_tools/exceptions.py      |  18 +
 src/basic_open_agent_tools/helpers.py         |  87 +++-
 src/basic_open_agent_tools/types.py           |   7 +-
 tests/test_data_csv_tools.py                  | 454 ++++++++++++++++++
 tests/test_data_json_tools.py                 | 245 ++++++++++
 tests/test_helpers.py                         |  21 +-
 10 files changed, 1388 insertions(+), 15 deletions(-)
 create mode 100644 src/basic_open_agent_tools/data/csv_tools.py
 create mode 100644 src/basic_open_agent_tools/data/json_tools.py
 create mode 100644 tests/test_data_csv_tools.py
 create mode 100644 tests/test_data_json_tools.py
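Usage sketch (reviewer note, not part of the diff): the new helpers are exported
from the package root, so tool loading looks like the following. The `boat`
alias is illustrative only.

    import basic_open_agent_tools as boat

    # Load data tools as plain callables for agent frameworks
    all_data = boat.load_all_data_tools()      # JSON + CSV tools
    json_only = boat.load_data_json_tools()    # 5 functions
    csv_only = boat.load_data_csv_tools()      # 7 functions

    # Combine with other categories; merge_tool_lists deduplicates
    toolkit = boat.merge_tool_lists(boat.load_all_filesystem_tools(), all_data)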
diff --git a/src/basic_open_agent_tools/__init__.py b/src/basic_open_agent_tools/__init__.py
index 6ce766b..549ea29 100644
--- a/src/basic_open_agent_tools/__init__.py
+++ b/src/basic_open_agent_tools/__init__.py
@@ -9,14 +9,17 @@
 __version__ = "0.2.0"

 # Modular structure
-from . import exceptions, file_system, text, types
+from . import data, exceptions, file_system, text, types

 # Helper functions for tool management
 from .helpers import (
     get_tool_info,
     list_all_available_tools,
+    load_all_data_tools,
     load_all_filesystem_tools,
     load_all_text_tools,
+    load_data_csv_tools,
+    load_data_json_tools,
     merge_tool_lists,
 )
@@ -31,10 +34,10 @@
     # Implemented modules
     "file_system",
     "text",
+    "data",
     # Future modules (uncomment when implemented)
     # "system",
     # "network",
-    # "data",
     # "crypto",
     # "utilities",
     # Common infrastructure
@@ -43,6 +46,9 @@
     # Helper functions
     "load_all_filesystem_tools",
     "load_all_text_tools",
+    "load_all_data_tools",
+    "load_data_json_tools",
+    "load_data_csv_tools",
     "merge_tool_lists",
     "get_tool_info",
     "list_all_available_tools",
diff --git a/src/basic_open_agent_tools/data/__init__.py b/src/basic_open_agent_tools/data/__init__.py
index 56feeb7..6246349 100644
--- a/src/basic_open_agent_tools/data/__init__.py
+++ b/src/basic_open_agent_tools/data/__init__.py
@@ -1,8 +1,45 @@
 """Data tools for AI agents.

-This module is not yet implemented. See TODO.md for planned functionality.
+This module provides data processing and manipulation tools organized into logical submodules:
+
+- json_tools: JSON serialization, compression, and validation
+- csv_tools: CSV file processing, parsing, and cleaning
 """

 from typing import List

-__all__: List[str] = []  # No functions available yet
+# Import all functions from submodules
+from .csv_tools import (
+    clean_csv_data,
+    csv_to_dict_list,
+    detect_csv_delimiter,
+    dict_list_to_csv,
+    read_csv_file,
+    validate_csv_structure,
+    write_csv_file,
+)
+from .json_tools import (
+    compress_json_data,
+    decompress_json_data,
+    safe_json_deserialize,
+    safe_json_serialize,
+    validate_json_string,
+)
+
+# Re-export all functions at module level for convenience
+__all__: List[str] = [
+    # JSON processing
+    "safe_json_serialize",
+    "safe_json_deserialize",
+    "validate_json_string",
+    "compress_json_data",
+    "decompress_json_data",
+    # CSV processing
+    "read_csv_file",
+    "write_csv_file",
+    "csv_to_dict_list",
+    "dict_list_to_csv",
+    "detect_csv_delimiter",
+    "validate_csv_structure",
+    "clean_csv_data",
+]
diff --git a/src/basic_open_agent_tools/data/csv_tools.py b/src/basic_open_agent_tools/data/csv_tools.py
new file mode 100644
index 0000000..76bc4a8
--- /dev/null
+++ b/src/basic_open_agent_tools/data/csv_tools.py
@@ -0,0 +1,376 @@
+"""CSV processing utilities for AI agents."""
+
+import csv
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from ..exceptions import DataError
+from ..types import DataDict, PathLike
+
+
+def read_csv_file(
+    file_path: PathLike, delimiter: str = ",", headers: bool = True
+) -> List[DataDict]:
+    """Read CSV file and return as list of dictionaries.
+
+    Args:
+        file_path: Path to the CSV file
+        delimiter: CSV field delimiter
+        headers: Whether first row contains headers
+
+    Returns:
+        List of dictionaries representing CSV rows
+
+    Raises:
+        DataError: If file cannot be read or parsed
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> # Assuming file contains: name,age\\nAlice,25\\nBob,30
+        >>> data = read_csv_file("people.csv")
+        >>> data
+        [{'name': 'Alice', 'age': '25'}, {'name': 'Bob', 'age': '30'}]
+    """
+    if not isinstance(file_path, (str, Path)):
+        raise TypeError("file_path must be a string or Path")
+    if not isinstance(delimiter, str):
+        raise TypeError("delimiter must be a string")
+    if not isinstance(headers, bool):
+        raise TypeError("headers must be a boolean")
+
+    file_path = Path(file_path)
+
+    try:
+        with open(file_path, encoding="utf-8", newline="") as csvfile:
+            if headers:
+                reader = csv.DictReader(csvfile, delimiter=delimiter)
+                return [dict(row) for row in reader]
+            else:
+                reader = csv.reader(csvfile, delimiter=delimiter)  # type: ignore[assignment]
+                rows = list(reader)
+                if not rows:
+                    return []
+                # Create numeric headers for headerless CSV
+                num_cols = len(rows[0]) if rows else 0
+                headers_list = [f"col_{i}" for i in range(num_cols)]
+                return [dict(zip(headers_list, row)) for row in rows]
+    except FileNotFoundError:
+        raise DataError(f"CSV file not found: {file_path}")
+    except UnicodeDecodeError as e:
+        raise DataError(f"Failed to decode CSV file {file_path}: {e}")
+    except csv.Error as e:
+        raise DataError(f"Failed to parse CSV file {file_path}: {e}")
+
+
+def write_csv_file(
+    data: List[DataDict],
+    file_path: PathLike,
+    delimiter: str = ",",
+    headers: bool = True,
+) -> None:
+    """Write list of dictionaries to CSV file.
+
+    Args:
+        data: List of dictionaries to write
+        file_path: Path where CSV file will be created
+        delimiter: CSV field delimiter
+        headers: Whether to write headers as first row
+
+    Raises:
+        DataError: If file cannot be written
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
+        >>> write_csv_file(data, "output.csv")
+    """
+    if not isinstance(data, list):
+        raise TypeError("data must be a list")
+    if not isinstance(file_path, (str, Path)):
+        raise TypeError("file_path must be a string or Path")
+    if not isinstance(delimiter, str):
+        raise TypeError("delimiter must be a string")
+    if not isinstance(headers, bool):
+        raise TypeError("headers must be a boolean")
+
+    if not data:
+        # Write empty file for empty data
+        Path(file_path).write_text("", encoding="utf-8")
+        return
+
+    # Validate all items are dictionaries
+    if not all(isinstance(item, dict) for item in data):
+        raise TypeError("All items in data must be dictionaries")
+
+    file_path = Path(file_path)
+
+    try:
+        # Get all unique fieldnames from all dictionaries
+        fieldnames = []
+        for item in data:
+            for key in item.keys():
+                if key not in fieldnames:
+                    fieldnames.append(key)
+
+        with open(file_path, "w", encoding="utf-8", newline="") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=delimiter)
+            if headers:
+                writer.writeheader()
+            writer.writerows(data)
+    except OSError as e:
+        raise DataError(f"Failed to write CSV file {file_path}: {e}")
+
+
+def csv_to_dict_list(csv_data: str, delimiter: str = ",") -> List[DataDict]:
+    """Convert CSV string to list of dictionaries.
+
+    Args:
+        csv_data: CSV data as string
+        delimiter: CSV field delimiter
+
+    Returns:
+        List of dictionaries representing CSV rows
+
+    Raises:
+        DataError: If CSV data cannot be parsed
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> csv_str = "name,age\\nAlice,25\\nBob,30"
+        >>> csv_to_dict_list(csv_str)
+        [{'name': 'Alice', 'age': '25'}, {'name': 'Bob', 'age': '30'}]
+    """
+    if not isinstance(csv_data, str):
+        raise TypeError("csv_data must be a string")
+    if not isinstance(delimiter, str):
+        raise TypeError("delimiter must be a string")
+
+    try:
+        reader = csv.DictReader(io.StringIO(csv_data), delimiter=delimiter)
+        return [dict(row) for row in reader]
+    except csv.Error as e:
+        raise DataError(f"Failed to parse CSV data: {e}")
+
+
+def dict_list_to_csv(data: List[DataDict], delimiter: str = ",") -> str:
+    """Convert list of dictionaries to CSV string.
+
+    Args:
+        data: List of dictionaries to convert
+        delimiter: CSV field delimiter
+
+    Returns:
+        CSV data as string (rows end in \\r\\n, the csv module's default
+        line terminator)
+
+    Raises:
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
+        >>> dict_list_to_csv(data)
+        'name,age\\r\\nAlice,25\\r\\nBob,30\\r\\n'
+    """
+    if not isinstance(data, list):
+        raise TypeError("data must be a list")
+    if not isinstance(delimiter, str):
+        raise TypeError("delimiter must be a string")
+
+    if not data:
+        return ""
+
+    # Validate all items are dictionaries
+    if not all(isinstance(item, dict) for item in data):
+        raise TypeError("All items in data must be dictionaries")
+
+    # Get all unique fieldnames
+    fieldnames = []
+    for item in data:
+        for key in item.keys():
+            if key not in fieldnames:
+                fieldnames.append(key)
+
+    output = io.StringIO()
+    writer = csv.DictWriter(output, fieldnames=fieldnames, delimiter=delimiter)
+    writer.writeheader()
+    writer.writerows(data)
+    return output.getvalue()
+
+
+def detect_csv_delimiter(file_path: PathLike, sample_size: int = 1024) -> str:
+    """Auto-detect CSV delimiter by analyzing file content.
+
+    Args:
+        file_path: Path to the CSV file
+        sample_size: Number of characters to sample for detection
+
+    Returns:
+        Detected delimiter character
+
+    Raises:
+        DataError: If file cannot be read or delimiter cannot be detected
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> detect_csv_delimiter("data.csv")
+        ','
+        >>> detect_csv_delimiter("data.tsv")
+        '\\t'
+    """
+    if not isinstance(file_path, (str, Path)):
+        raise TypeError("file_path must be a string or Path")
+    if not isinstance(sample_size, int) or sample_size <= 0:
+        raise TypeError("sample_size must be a positive integer")
+
+    file_path = Path(file_path)
+
+    try:
+        with open(file_path, encoding="utf-8") as csvfile:
+            sample = csvfile.read(sample_size)
+
+        if not sample:
+            raise DataError("File is empty, cannot detect delimiter")
+
+        sniffer = csv.Sniffer()
+        delimiter = sniffer.sniff(sample).delimiter
+        return delimiter
+    except FileNotFoundError:
+        raise DataError(f"CSV file not found: {file_path}")
+    except UnicodeDecodeError as e:
+        raise DataError(f"Failed to decode CSV file {file_path}: {e}")
+    except csv.Error as e:
+        raise DataError(f"Failed to detect delimiter in {file_path}: {e}")
+
+
+def validate_csv_structure(
+    file_path: PathLike, expected_columns: Optional[List[str]] = None
+) -> bool:
+    """Validate CSV file structure and column headers.
+
+    Args:
+        file_path: Path to the CSV file
+        expected_columns: List of expected column names (None to skip check)
+
+    Returns:
+        True if CSV structure is valid (invalid files raise DataError rather
+        than returning False)
+
+    Raises:
+        DataError: If file cannot be read or structure is invalid
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> validate_csv_structure("data.csv", ["name", "age", "email"])
+        True
+    """
+    if not isinstance(file_path, (str, Path)):
+        raise TypeError("file_path must be a string or Path")
+    if expected_columns is not None and not isinstance(expected_columns, list):
+        raise TypeError("expected_columns must be a list or None")
+
+    try:
+        # Check if file is empty first
+        file_path = Path(file_path)
+        if file_path.stat().st_size == 0:
+            return True  # Empty file is considered valid
+
+        # Try to detect delimiter first
+        delimiter = detect_csv_delimiter(file_path)
+
+        # Read first few rows to validate structure
+        data = read_csv_file(file_path, delimiter=delimiter, headers=True)
+
+        if not data:
+            return True  # Empty file is considered valid
+
+        # Check if expected columns are present
+        if expected_columns is not None:
+            first_row = data[0]
+            actual_columns = set(first_row.keys())
+            expected_set = set(expected_columns)
+
+            if not expected_set.issubset(actual_columns):
+                missing = expected_set - actual_columns
+                raise DataError(f"Missing expected columns: {missing}")
+
+        return True
+    except DataError:
+        # Re-raise DataError as-is
+        raise
+    except Exception as e:
+        raise DataError(f"Invalid CSV structure in {file_path}: {e}")
+
+
+def clean_csv_data(
+    data: List[DataDict], rules: Optional[Dict[str, Any]] = None
+) -> List[DataDict]:
+    """Clean CSV data according to specified rules.
+
+    Args:
+        data: List of dictionaries to clean
+        rules: Dictionary of cleaning rules (None for default cleaning)
+
+    Returns:
+        Cleaned list of dictionaries
+
+    Raises:
+        TypeError: If arguments have wrong types
+
+    Example:
+        >>> data = [{'name': ' Alice ', 'age': '', 'score': 'N/A'}]
+        >>> rules = {'strip_whitespace': True, 'remove_empty': True, 'na_values': ['N/A']}
+        >>> clean_csv_data(data, rules)
+        [{'name': 'Alice'}]
+    """
+    if not isinstance(data, list):
+        raise TypeError("data must be a list")
+    if rules is not None and not isinstance(rules, dict):
+        raise TypeError("rules must be a dictionary or None")
+
+    if not data:
+        return data
+
+    # Default cleaning rules
+    default_rules = {
+        "strip_whitespace": True,
+        "remove_empty": False,
+        "na_values": ["N/A", "n/a", "NA", "null", "NULL", "None"],
+    }
+
+    # Merge with provided rules
+    if rules:
+        default_rules.update(rules)
+
+    cleaned_data = []
+
+    for row in data:
+        if not isinstance(row, dict):
+            continue  # Skip non-dictionary items
+
+        cleaned_row = {}
+
+        for key, value in row.items():
+            # Convert to string for processing
+            if not isinstance(value, str):
+                value = str(value) if value is not None else ""
+
+            # Strip whitespace
+            if default_rules.get("strip_whitespace", False):
+                value = value.strip()
+
+            # Handle NA values
+            na_values = default_rules.get("na_values", [])
+            if isinstance(na_values, list) and value in na_values:
+                value = None
+
+            # Remove empty fields if requested
+            if default_rules.get("remove_empty", False):
+                if value == "" or value is None:
+                    continue
+
+            cleaned_row[key] = value
+
+        cleaned_data.append(cleaned_row)
+
+    return cleaned_data
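Reviewer note (not part of the patch): a minimal round-trip sketch of the CSV
helpers above; "people.csv" is a hypothetical path.

    from basic_open_agent_tools.data.csv_tools import (
        clean_csv_data,
        detect_csv_delimiter,
        read_csv_file,
        write_csv_file,
    )

    rows = [{"name": " Alice ", "age": "25"}, {"name": "Bob", "age": "N/A"}]
    write_csv_file(rows, "people.csv")  # header row written by default
    delim = detect_csv_delimiter("people.csv")  # expected: ","
    data = read_csv_file("people.csv", delimiter=delim)  # values come back as strings
    cleaned = clean_csv_data(data)  # default rules strip whitespace, map "N/A" -> None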
diff --git a/src/basic_open_agent_tools/data/json_tools.py b/src/basic_open_agent_tools/data/json_tools.py
new file mode 100644
index 0000000..7e404ca
--- /dev/null
+++ b/src/basic_open_agent_tools/data/json_tools.py
@@ -0,0 +1,144 @@
+"""JSON processing utilities for AI agents."""
+
+import gzip
+import json
+from typing import Any, Optional
+
+from ..exceptions import SerializationError
+
+
+def safe_json_serialize(data: Any, indent: Optional[int] = None) -> str:
+    """Safely serialize data to JSON string with error handling.
+
+    Args:
+        data: Data to serialize to JSON
+        indent: Number of spaces for indentation (None for compact)
+
+    Returns:
+        JSON string representation of the data
+
+    Raises:
+        SerializationError: If data cannot be serialized to JSON
+        TypeError: If indent is not an integer or None
+
+    Example:
+        >>> safe_json_serialize({"name": "test", "value": 42})
+        '{"name": "test", "value": 42}'
+        >>> safe_json_serialize({"a": 1, "b": 2}, indent=2)
+        '{\\n  "a": 1,\\n  "b": 2\\n}'
+    """
+    if not isinstance(indent, (int, type(None))):
+        raise TypeError("indent must be an integer or None")
+
+    try:
+        return json.dumps(data, indent=indent, ensure_ascii=False)
+    except (TypeError, ValueError) as e:
+        raise SerializationError(f"Failed to serialize data to JSON: {e}")
+
+
+def safe_json_deserialize(json_str: str) -> Any:
+    """Safely deserialize JSON string to Python object with error handling.
+
+    Args:
+        json_str: JSON string to deserialize
+
+    Returns:
+        Deserialized Python object
+
+    Raises:
+        SerializationError: If JSON string cannot be parsed
+        TypeError: If input is not a string
+
+    Example:
+        >>> safe_json_deserialize('{"name": "test", "value": 42}')
+        {'name': 'test', 'value': 42}
+        >>> safe_json_deserialize('[1, 2, 3]')
+        [1, 2, 3]
+    """
+    if not isinstance(json_str, str):
+        raise TypeError("Input must be a string")
+
+    try:
+        return json.loads(json_str)
+    except (json.JSONDecodeError, ValueError) as e:
+        raise SerializationError(f"Failed to deserialize JSON string: {e}")
+
+
+def validate_json_string(json_str: str) -> bool:
+    """Validate JSON string without deserializing.
+
+    Args:
+        json_str: JSON string to validate
+
+    Returns:
+        True if valid JSON, False otherwise
+
+    Example:
+        >>> validate_json_string('{"valid": true}')
+        True
+        >>> validate_json_string('{"invalid": }')
+        False
+    """
+    if not isinstance(json_str, str):
+        return False
+
+    try:
+        json.loads(json_str)
+        return True
+    except (json.JSONDecodeError, ValueError):
+        return False
+
+
+def compress_json_data(data: Any) -> bytes:
+    """Compress JSON data for storage or transmission.
+
+    Args:
+        data: Data to serialize and compress
+
+    Returns:
+        Compressed JSON data as bytes
+
+    Raises:
+        SerializationError: If data cannot be serialized or compressed
+
+    Example:
+        >>> compressed = compress_json_data({"test": "data"})
+        >>> isinstance(compressed, bytes)
+        True
+    """
+    try:
+        json_str = safe_json_serialize(data)
+        return gzip.compress(json_str.encode("utf-8"))
+    except Exception as e:
+        raise SerializationError(f"Failed to compress JSON data: {e}")
+
+
+def decompress_json_data(compressed_data: bytes) -> Any:
+    """Decompress and deserialize JSON data.
+ + Args: + compressed_data: Compressed JSON data as bytes + + Returns: + Deserialized Python object + + Raises: + SerializationError: If data cannot be decompressed or deserialized + TypeError: If input is not bytes + + Example: + >>> original = {"test": "data"} + >>> compressed = compress_json_data(original) + >>> decompressed = decompress_json_data(compressed) + >>> decompressed == original + True + """ + if not isinstance(compressed_data, bytes): + raise TypeError("Input must be bytes") + + try: + json_str = gzip.decompress(compressed_data).decode("utf-8") + return safe_json_deserialize(json_str) + except Exception as e: + raise SerializationError(f"Failed to decompress JSON data: {e}") diff --git a/src/basic_open_agent_tools/exceptions.py b/src/basic_open_agent_tools/exceptions.py index eca7cd1..1d55731 100644 --- a/src/basic_open_agent_tools/exceptions.py +++ b/src/basic_open_agent_tools/exceptions.py @@ -11,3 +11,21 @@ class FileSystemError(BasicAgentToolsError): """Exception for file system operations.""" pass + + +class DataError(BasicAgentToolsError): + """Exception for data operations.""" + + pass + + +class ValidationError(DataError): + """Exception for data validation operations.""" + + pass + + +class SerializationError(DataError): + """Exception for data serialization/deserialization operations.""" + + pass diff --git a/src/basic_open_agent_tools/helpers.py b/src/basic_open_agent_tools/helpers.py index ddab5a5..b0939ba 100644 --- a/src/basic_open_agent_tools/helpers.py +++ b/src/basic_open_agent_tools/helpers.py @@ -3,7 +3,7 @@ import inspect from typing import Any, Callable, Dict, List, Union -from . import file_system, text +from . import data, file_system, text def load_all_filesystem_tools() -> List[Callable[..., Any]]: @@ -50,6 +50,90 @@ def load_all_text_tools() -> List[Callable[..., Any]]: return tools +def load_all_data_tools() -> List[Callable[..., Any]]: + """Load all data processing tools as a list of callable functions. + + Returns: + List of all data processing tool functions + + Example: + >>> data_tools = load_all_data_tools() + >>> len(data_tools) > 0 + True + """ + tools = [] + + # Get all functions from data module + for name in data.__all__: + func = getattr(data, name) + if callable(func): + tools.append(func) + + return tools + + +def load_data_json_tools() -> List[Callable[..., Any]]: + """Load JSON processing tools as a list of callable functions. + + Returns: + List of JSON processing tool functions + + Example: + >>> json_tools = load_data_json_tools() + >>> len(json_tools) == 5 + True + """ + from .data import json_tools + + tools = [] + json_function_names = [ + "safe_json_serialize", + "safe_json_deserialize", + "validate_json_string", + "compress_json_data", + "decompress_json_data", + ] + + for name in json_function_names: + func = getattr(json_tools, name) + if callable(func): + tools.append(func) + + return tools + + +def load_data_csv_tools() -> List[Callable[..., Any]]: + """Load CSV processing tools as a list of callable functions. 
+ + Returns: + List of CSV processing tool functions + + Example: + >>> csv_tools = load_data_csv_tools() + >>> len(csv_tools) == 7 + True + """ + from .data import csv_tools + + tools = [] + csv_function_names = [ + "read_csv_file", + "write_csv_file", + "csv_to_dict_list", + "dict_list_to_csv", + "detect_csv_delimiter", + "validate_csv_structure", + "clean_csv_data", + ] + + for name in csv_function_names: + func = getattr(csv_tools, name) + if callable(func): + tools.append(func) + + return tools + + def merge_tool_lists( *args: Union[List[Callable[..., Any]], Callable[..., Any]], ) -> List[Callable[..., Any]]: @@ -149,4 +233,5 @@ def list_all_available_tools() -> Dict[str, List[Dict[str, Any]]]: return { "file_system": [get_tool_info(tool) for tool in load_all_filesystem_tools()], "text": [get_tool_info(tool) for tool in load_all_text_tools()], + "data": [get_tool_info(tool) for tool in load_all_data_tools()], } diff --git a/src/basic_open_agent_tools/types.py b/src/basic_open_agent_tools/types.py index 0e21180..e01742a 100644 --- a/src/basic_open_agent_tools/types.py +++ b/src/basic_open_agent_tools/types.py @@ -1,9 +1,14 @@ """Common type definitions for basic-open-agent-tools.""" from pathlib import Path -from typing import Union +from typing import Any, Dict, List, Union # Common type aliases currently in use PathLike = Union[str, Path] +# Data-related type aliases +DataDict = Dict[str, Any] +NestedData = Union[Dict[str, Any], List[Any], str, int, float, bool, None] +ValidationResult = Dict[str, Union[bool, str, List[str]]] + # Additional types will be added as modules are implemented diff --git a/tests/test_data_csv_tools.py b/tests/test_data_csv_tools.py new file mode 100644 index 0000000..a261423 --- /dev/null +++ b/tests/test_data_csv_tools.py @@ -0,0 +1,454 @@ +"""Tests for CSV processing tools.""" + + +import pytest + +from basic_open_agent_tools.data.csv_tools import ( + clean_csv_data, + csv_to_dict_list, + detect_csv_delimiter, + dict_list_to_csv, + read_csv_file, + validate_csv_structure, + write_csv_file, +) +from basic_open_agent_tools.exceptions import DataError + + +class TestReadCsvFile: + """Test read_csv_file function.""" + + def test_read_simple_csv(self, tmp_path): + """Test reading a simple CSV file.""" + csv_content = "name,age\nAlice,25\nBob,30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = read_csv_file(csv_file) + expected = [{"name": "Alice", "age": "25"}, {"name": "Bob", "age": "30"}] + assert result == expected + + def test_read_csv_without_headers(self, tmp_path): + """Test reading CSV without headers.""" + csv_content = "Alice,25\nBob,30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = read_csv_file(csv_file, headers=False) + expected = [{"col_0": "Alice", "col_1": "25"}, {"col_0": "Bob", "col_1": "30"}] + assert result == expected + + def test_read_csv_custom_delimiter(self, tmp_path): + """Test reading CSV with custom delimiter.""" + csv_content = "name;age\nAlice;25\nBob;30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = read_csv_file(csv_file, delimiter=";") + expected = [{"name": "Alice", "age": "25"}, {"name": "Bob", "age": "30"}] + assert result == expected + + def test_read_empty_csv(self, tmp_path): + """Test reading empty CSV file.""" + csv_file = tmp_path / "empty.csv" + csv_file.write_text("") + + result = read_csv_file(csv_file) + assert result == [] + + def test_read_csv_headers_only(self, tmp_path): + """Test reading CSV with 
headers only.""" + csv_content = "name,age" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = read_csv_file(csv_file) + assert result == [] + + def test_read_nonexistent_file(self, tmp_path): + """Test reading non-existent file.""" + nonexistent = tmp_path / "nonexistent.csv" + with pytest.raises(DataError, match="CSV file not found"): + read_csv_file(nonexistent) + + def test_read_csv_invalid_types(self): + """Test with invalid argument types.""" + with pytest.raises(TypeError, match="file_path must be a string or Path"): + read_csv_file(123) + + with pytest.raises(TypeError, match="delimiter must be a string"): + read_csv_file("test.csv", delimiter=123) + + with pytest.raises(TypeError, match="headers must be a boolean"): + read_csv_file("test.csv", headers="yes") + + +class TestWriteCsvFile: + """Test write_csv_file function.""" + + def test_write_simple_csv(self, tmp_path): + """Test writing a simple CSV file.""" + data = [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}] + csv_file = tmp_path / "output.csv" + + write_csv_file(data, csv_file) + + # Verify content + content = csv_file.read_text() + assert "name,age" in content + assert "Alice,25" in content + assert "Bob,30" in content + + def test_write_csv_without_headers(self, tmp_path): + """Test writing CSV without headers.""" + data = [{"name": "Alice", "age": 25}] + csv_file = tmp_path / "output.csv" + + write_csv_file(data, csv_file, headers=False) + + content = csv_file.read_text() + assert "name,age" not in content + assert "Alice,25" in content + + def test_write_csv_custom_delimiter(self, tmp_path): + """Test writing CSV with custom delimiter.""" + data = [{"name": "Alice", "age": 25}] + csv_file = tmp_path / "output.csv" + + write_csv_file(data, csv_file, delimiter=";") + + content = csv_file.read_text() + assert "name;age" in content + assert "Alice;25" in content + + def test_write_empty_data(self, tmp_path): + """Test writing empty data.""" + csv_file = tmp_path / "empty.csv" + write_csv_file([], csv_file) + + assert csv_file.read_text() == "" + + def test_write_csv_mixed_fields(self, tmp_path): + """Test writing CSV with mixed fields across rows.""" + data = [ + {"name": "Alice", "age": 25}, + {"name": "Bob", "city": "NYC"}, + {"age": 30, "country": "USA"}, + ] + csv_file = tmp_path / "output.csv" + + write_csv_file(data, csv_file) + + # Should include all unique fields + content = csv_file.read_text() + assert "name" in content + assert "age" in content + assert "city" in content + assert "country" in content + + def test_write_csv_invalid_types(self, tmp_path): + """Test with invalid argument types.""" + csv_file = tmp_path / "test.csv" + + with pytest.raises(TypeError, match="data must be a list"): + write_csv_file("not a list", csv_file) + + with pytest.raises(TypeError, match="file_path must be a string or Path"): + write_csv_file([], 123) + + with pytest.raises(TypeError, match="All items in data must be dictionaries"): + write_csv_file(["not", "dicts"], csv_file) + + +class TestCsvToDictList: + """Test csv_to_dict_list function.""" + + def test_convert_simple_csv(self): + """Test converting simple CSV string.""" + csv_str = "name,age\nAlice,25\nBob,30" + result = csv_to_dict_list(csv_str) + expected = [{"name": "Alice", "age": "25"}, {"name": "Bob", "age": "30"}] + assert result == expected + + def test_convert_custom_delimiter(self): + """Test converting CSV with custom delimiter.""" + csv_str = "name;age\nAlice;25\nBob;30" + result = csv_to_dict_list(csv_str, 
delimiter=";") + expected = [{"name": "Alice", "age": "25"}, {"name": "Bob", "age": "30"}] + assert result == expected + + def test_convert_empty_csv(self): + """Test converting empty CSV.""" + result = csv_to_dict_list("") + assert result == [] + + def test_convert_headers_only(self): + """Test converting CSV with headers only.""" + result = csv_to_dict_list("name,age") + assert result == [] + + def test_convert_invalid_types(self): + """Test with invalid argument types.""" + with pytest.raises(TypeError, match="csv_data must be a string"): + csv_to_dict_list(123) + + with pytest.raises(TypeError, match="delimiter must be a string"): + csv_to_dict_list("name,age", delimiter=123) + + +class TestDictListToCsv: + """Test dict_list_to_csv function.""" + + def test_convert_simple_data(self): + """Test converting simple data to CSV.""" + data = [{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}] + result = dict_list_to_csv(data) + + assert "name,age" in result + assert "Alice,25" in result + assert "Bob,30" in result + + def test_convert_custom_delimiter(self): + """Test converting with custom delimiter.""" + data = [{"name": "Alice", "age": 25}] + result = dict_list_to_csv(data, delimiter=";") + + assert "name;age" in result + assert "Alice;25" in result + + def test_convert_empty_data(self): + """Test converting empty data.""" + result = dict_list_to_csv([]) + assert result == "" + + def test_convert_mixed_fields(self): + """Test converting data with mixed fields.""" + data = [{"name": "Alice", "age": 25}, {"name": "Bob", "city": "NYC"}] + result = dict_list_to_csv(data) + + lines = result.strip().split("\n") + header = lines[0] + assert "name" in header + assert "age" in header + assert "city" in header + + def test_convert_invalid_types(self): + """Test with invalid argument types.""" + with pytest.raises(TypeError, match="data must be a list"): + dict_list_to_csv("not a list") + + with pytest.raises(TypeError, match="All items in data must be dictionaries"): + dict_list_to_csv(["not", "dicts"]) + + +class TestDetectCsvDelimiter: + """Test detect_csv_delimiter function.""" + + def test_detect_comma_delimiter(self, tmp_path): + """Test detecting comma delimiter.""" + csv_content = "name,age\nAlice,25\nBob,30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = detect_csv_delimiter(csv_file) + assert result == "," + + def test_detect_semicolon_delimiter(self, tmp_path): + """Test detecting semicolon delimiter.""" + csv_content = "name;age\nAlice;25\nBob;30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = detect_csv_delimiter(csv_file) + assert result == ";" + + def test_detect_tab_delimiter(self, tmp_path): + """Test detecting tab delimiter.""" + csv_content = "name\tage\nAlice\t25\nBob\t30" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = detect_csv_delimiter(csv_file) + assert result == "\t" + + def test_detect_custom_sample_size(self, tmp_path): + """Test detection with custom sample size.""" + csv_content = "name,age\n" + "Alice,25\n" * 1000 + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = detect_csv_delimiter(csv_file, sample_size=100) + assert result == "," + + def test_detect_empty_file(self, tmp_path): + """Test detecting delimiter in empty file.""" + csv_file = tmp_path / "empty.csv" + csv_file.write_text("") + + with pytest.raises(DataError, match="File is empty, cannot detect delimiter"): + detect_csv_delimiter(csv_file) + + def 
test_detect_nonexistent_file(self, tmp_path): + """Test detecting delimiter in non-existent file.""" + nonexistent = tmp_path / "nonexistent.csv" + with pytest.raises(DataError, match="CSV file not found"): + detect_csv_delimiter(nonexistent) + + def test_detect_invalid_types(self): + """Test with invalid argument types.""" + with pytest.raises(TypeError, match="file_path must be a string or Path"): + detect_csv_delimiter(123) + + with pytest.raises(TypeError, match="sample_size must be a positive integer"): + detect_csv_delimiter("test.csv", sample_size=0) + + +class TestValidateCsvStructure: + """Test validate_csv_structure function.""" + + def test_validate_valid_structure(self, tmp_path): + """Test validating valid CSV structure.""" + csv_content = "name,age,email\nAlice,25,alice@example.com" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = validate_csv_structure(csv_file, ["name", "age"]) + assert result is True + + def test_validate_missing_columns(self, tmp_path): + """Test validating CSV with missing expected columns.""" + csv_content = "name,age\nAlice,25" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + with pytest.raises(DataError, match="Missing expected columns"): + validate_csv_structure(csv_file, ["name", "age", "email"]) + + def test_validate_no_expected_columns(self, tmp_path): + """Test validating without expected columns.""" + csv_content = "name,age\nAlice,25" + csv_file = tmp_path / "test.csv" + csv_file.write_text(csv_content) + + result = validate_csv_structure(csv_file) + assert result is True + + def test_validate_empty_file(self, tmp_path): + """Test validating empty CSV file.""" + csv_file = tmp_path / "empty.csv" + csv_file.write_text("") + + result = validate_csv_structure(csv_file) + assert result is True + + def test_validate_invalid_types(self): + """Test with invalid argument types.""" + with pytest.raises(TypeError, match="file_path must be a string or Path"): + validate_csv_structure(123) + + with pytest.raises(TypeError, match="expected_columns must be a list or None"): + validate_csv_structure("test.csv", "not a list") + + +class TestCleanCsvData: + """Test clean_csv_data function.""" + + def test_clean_default_rules(self): + """Test cleaning with default rules.""" + data = [ + {"name": " Alice ", "age": "25", "score": ""}, + {"name": "Bob", "age": "N/A", "score": "95"}, + ] + + result = clean_csv_data(data) + expected = [ + {"name": "Alice", "age": "25", "score": ""}, + {"name": "Bob", "age": None, "score": "95"}, + ] + assert result == expected + + def test_clean_custom_rules(self): + """Test cleaning with custom rules.""" + data = [ + {"name": " Alice ", "age": "", "score": "N/A"}, + {"name": "Bob", "age": "30", "score": "95"}, + ] + + rules = { + "strip_whitespace": True, + "remove_empty": True, + "na_values": ["N/A", "", "null"], + } + + result = clean_csv_data(data, rules) + expected = [ + {"name": "Alice"}, # Empty values removed when remove_empty=True + {"name": "Bob", "age": "30", "score": "95"}, + ] + assert result == expected + + def test_clean_no_strip_whitespace(self): + """Test cleaning without stripping whitespace.""" + data = [{"name": " Alice ", "age": "25"}] + rules = {"strip_whitespace": False} + + result = clean_csv_data(data, rules) + assert result[0]["name"] == " Alice " + + def test_clean_empty_data(self): + """Test cleaning empty data.""" + result = clean_csv_data([]) + assert result == [] + + def test_clean_invalid_types(self): + """Test with invalid argument types.""" + 
with pytest.raises(TypeError, match="data must be a list"): + clean_csv_data("not a list") + + with pytest.raises(TypeError, match="rules must be a dictionary or None"): + clean_csv_data([], "not a dict") + + def test_clean_skip_non_dict_items(self): + """Test cleaning skips non-dictionary items.""" + data = [ + {"name": "Alice", "age": "25"}, + "not a dict", + {"name": "Bob", "age": "30"}, + ] + + result = clean_csv_data(data) + assert len(result) == 2 + assert result[0]["name"] == "Alice" + assert result[1]["name"] == "Bob" + + +class TestRoundTripCsvOperations: + """Test round-trip CSV operations.""" + + def test_write_read_roundtrip(self, tmp_path): + """Test that write -> read returns original data.""" + original_data = [ + {"name": "Alice", "age": "25", "city": "NYC"}, + {"name": "Bob", "age": "30", "city": "LA"}, + {"name": "Charlie", "age": "35", "city": "Chicago"}, + ] + + csv_file = tmp_path / "roundtrip.csv" + write_csv_file(original_data, csv_file) + read_data = read_csv_file(csv_file) + + # Convert age back to string for comparison (CSV always returns strings) + expected = [] + for item in original_data: + expected.append({k: str(v) for k, v in item.items()}) + + assert read_data == expected + + def test_dict_to_csv_to_dict_roundtrip(self): + """Test that dict_list -> CSV string -> dict_list returns original.""" + original_data = [{"name": "Alice", "age": "25"}, {"name": "Bob", "age": "30"}] + + csv_string = dict_list_to_csv(original_data) + converted_back = csv_to_dict_list(csv_string) + + assert converted_back == original_data diff --git a/tests/test_data_json_tools.py b/tests/test_data_json_tools.py new file mode 100644 index 0000000..14948f9 --- /dev/null +++ b/tests/test_data_json_tools.py @@ -0,0 +1,245 @@ +"""Tests for JSON processing tools.""" + +import pytest + +from basic_open_agent_tools.data.json_tools import ( + compress_json_data, + decompress_json_data, + safe_json_deserialize, + safe_json_serialize, + validate_json_string, +) +from basic_open_agent_tools.exceptions import SerializationError + + +class TestSafeJsonSerialize: + """Test safe_json_serialize function.""" + + def test_serialize_dict(self): + """Test serializing a dictionary.""" + data = {"name": "test", "value": 42} + result = safe_json_serialize(data) + assert result == '{"name": "test", "value": 42}' + + def test_serialize_list(self): + """Test serializing a list.""" + data = [1, 2, 3] + result = safe_json_serialize(data) + assert result == "[1, 2, 3]" + + def test_serialize_with_indent(self): + """Test serializing with indentation.""" + data = {"a": 1, "b": 2} + result = safe_json_serialize(data, indent=2) + expected = '{\n "a": 1,\n "b": 2\n}' + assert result == expected + + def test_serialize_unicode(self): + """Test serializing Unicode characters.""" + data = {"message": "Hello δΈ–η•Œ"} + result = safe_json_serialize(data) + assert "δΈ–η•Œ" in result + + def test_serialize_none(self): + """Test serializing None.""" + result = safe_json_serialize(None) + assert result == "null" + + def test_serialize_invalid_indent_type(self): + """Test with invalid indent type.""" + with pytest.raises(TypeError, match="indent must be an integer or None"): + safe_json_serialize({"test": "data"}, indent="invalid") + + def test_serialize_non_serializable_object(self): + """Test serializing non-serializable object.""" + + class CustomClass: + pass + + with pytest.raises( + SerializationError, match="Failed to serialize data to JSON" + ): + safe_json_serialize({"obj": CustomClass()}) + + +class 
TestSafeJsonDeserialize: + """Test safe_json_deserialize function.""" + + def test_deserialize_dict(self): + """Test deserializing a dictionary.""" + json_str = '{"name": "test", "value": 42}' + result = safe_json_deserialize(json_str) + assert result == {"name": "test", "value": 42} + + def test_deserialize_list(self): + """Test deserializing a list.""" + json_str = "[1, 2, 3]" + result = safe_json_deserialize(json_str) + assert result == [1, 2, 3] + + def test_deserialize_unicode(self): + """Test deserializing Unicode characters.""" + json_str = '{"message": "Hello δΈ–η•Œ"}' + result = safe_json_deserialize(json_str) + assert result == {"message": "Hello δΈ–η•Œ"} + + def test_deserialize_null(self): + """Test deserializing null.""" + result = safe_json_deserialize("null") + assert result is None + + def test_deserialize_invalid_type(self): + """Test with invalid input type.""" + with pytest.raises(TypeError, match="Input must be a string"): + safe_json_deserialize({"invalid": "input"}) + + def test_deserialize_invalid_json(self): + """Test deserializing invalid JSON.""" + with pytest.raises( + SerializationError, match="Failed to deserialize JSON string" + ): + safe_json_deserialize('{"invalid": }') + + def test_deserialize_empty_string(self): + """Test deserializing empty string.""" + with pytest.raises( + SerializationError, match="Failed to deserialize JSON string" + ): + safe_json_deserialize("") + + +class TestValidateJsonString: + """Test validate_json_string function.""" + + def test_validate_valid_json(self): + """Test validating valid JSON.""" + assert validate_json_string('{"valid": true}') is True + assert validate_json_string("[1, 2, 3]") is True + assert validate_json_string('"string"') is True + assert validate_json_string("null") is True + + def test_validate_invalid_json(self): + """Test validating invalid JSON.""" + assert validate_json_string('{"invalid": }') is False + assert validate_json_string("[1, 2,]") is False + assert validate_json_string("undefined") is False + assert validate_json_string("") is False + + def test_validate_non_string(self): + """Test validating non-string input.""" + assert validate_json_string(None) is False + assert validate_json_string(123) is False + assert validate_json_string({"dict": "input"}) is False + assert validate_json_string([1, 2, 3]) is False + + +class TestCompressJsonData: + """Test compress_json_data function.""" + + def test_compress_simple_data(self): + """Test compressing simple data.""" + data = {"test": "data"} + compressed = compress_json_data(data) + assert isinstance(compressed, bytes) + assert len(compressed) > 0 + + def test_compress_large_data(self): + """Test compressing larger data for better compression.""" + data = {"repeated": "data" * 100, "numbers": list(range(100))} + compressed = compress_json_data(data) + original_json = safe_json_serialize(data) + + # Compressed should be smaller than original for repetitive data + assert len(compressed) < len(original_json.encode("utf-8")) + + def test_compress_unicode_data(self): + """Test compressing Unicode data.""" + data = {"unicode": "Hello δΈ–η•Œ", "emoji": "πŸš€"} + compressed = compress_json_data(data) + assert isinstance(compressed, bytes) + + def test_compress_non_serializable(self): + """Test compressing non-serializable data.""" + + class CustomClass: + pass + + with pytest.raises(SerializationError, match="Failed to compress JSON data"): + compress_json_data({"obj": CustomClass()}) + + +class TestDecompressJsonData: + """Test decompress_json_data 
function.""" + + def test_decompress_simple_data(self): + """Test decompressing simple data.""" + original = {"test": "data"} + compressed = compress_json_data(original) + decompressed = decompress_json_data(compressed) + assert decompressed == original + + def test_decompress_complex_data(self): + """Test decompressing complex data.""" + original = { + "string": "Hello δΈ–η•Œ", + "number": 42, + "list": [1, 2, 3], + "nested": {"key": "value"}, + "null": None, + "boolean": True, + } + compressed = compress_json_data(original) + decompressed = decompress_json_data(compressed) + assert decompressed == original + + def test_decompress_invalid_type(self): + """Test decompressing invalid input type.""" + with pytest.raises(TypeError, match="Input must be bytes"): + decompress_json_data("not bytes") + + def test_decompress_invalid_data(self): + """Test decompressing invalid compressed data.""" + with pytest.raises(SerializationError, match="Failed to decompress JSON data"): + decompress_json_data(b"invalid compressed data") + + def test_decompress_empty_bytes(self): + """Test decompressing empty bytes.""" + with pytest.raises(SerializationError, match="Failed to decompress JSON data"): + decompress_json_data(b"") + + +class TestRoundTripSerialization: + """Test round-trip serialization scenarios.""" + + def test_serialize_deserialize_roundtrip(self): + """Test that serialize -> deserialize returns original data.""" + test_cases = [ + {"simple": "dict"}, + [1, 2, 3, "mixed", {"nested": "list"}], + None, + True, + False, + 42, + 3.14, + "string with unicode δΈ–η•Œ", + {"complex": {"nested": {"deeply": [1, 2, {"more": "nesting"}]}}}, + ] + + for original in test_cases: + serialized = safe_json_serialize(original) + deserialized = safe_json_deserialize(serialized) + assert deserialized == original + + def test_compress_decompress_roundtrip(self): + """Test that compress -> decompress returns original data.""" + test_cases = [ + {"simple": "dict"}, + [1, 2, 3, "mixed", {"nested": "list"}], + {"large": "data" * 1000}, # Test compression benefits + {"unicode": "δΈ–η•Œ 🌍 πŸš€"}, + ] + + for original in test_cases: + compressed = compress_json_data(original) + decompressed = decompress_json_data(compressed) + assert decompressed == original diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 91ffe12..2004e34 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -143,18 +143,20 @@ def test_merge_tool_lists_deduplication(self): # Load the same tools multiple times fs_tools_1 = load_all_filesystem_tools() fs_tools_2 = load_all_filesystem_tools() - + # Merge with duplicates merged = merge_tool_lists(fs_tools_1, fs_tools_2) - + # Should have same length as single load (duplicates removed) assert len(merged) == len(fs_tools_1) - + # Check that no function name appears twice function_names = [tool.__name__ for tool in merged] unique_names = set(function_names) - assert len(function_names) == len(unique_names), "Found duplicate function names" - + assert len(function_names) == len(unique_names), ( + "Found duplicate function names" + ) + # Should still contain all expected functions expected_names = [tool.__name__ for tool in fs_tools_1] for name in expected_names: @@ -162,20 +164,21 @@ def test_merge_tool_lists_deduplication(self): def test_merge_tool_lists_different_modules_same_name(self): """Test handling of functions with same name from different modules.""" + # Create two functions with the same name but different modules def test_function(): return "first" - + def 
another_test_function(): return "second" - + # Manually set different module names to simulate different sources test_function.__module__ = "module1" another_test_function.__module__ = "module2" another_test_function.__name__ = "test_function" # Same name as first - + merged = merge_tool_lists([test_function], [another_test_function]) - + # Should keep both since they're from different modules assert len(merged) == 2 assert test_function in merged
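Reviewer note (not part of the patch): the JSON helpers compose into simple
round trips; a minimal sketch using only the APIs added above:

    from basic_open_agent_tools.data.json_tools import (
        compress_json_data,
        decompress_json_data,
        safe_json_deserialize,
        safe_json_serialize,
        validate_json_string,
    )

    payload = {"name": "test", "values": [1, 2, 3], "note": "Hello δΈ–η•Œ"}

    text = safe_json_serialize(payload, indent=2)  # Unicode preserved (ensure_ascii=False)
    assert validate_json_string(text)
    assert safe_json_deserialize(text) == payload

    blob = compress_json_data(payload)  # gzip-compressed UTF-8 JSON bytes
    assert decompress_json_data(blob) == payload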