From 4b9f83b439a691ee4be661fb304a05e0b918df74 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 4 Feb 2026 14:34:07 +0100 Subject: [PATCH 001/100] Add option to show unique samples dimensions CLI --- .../cli_commands/dimensions_cli.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 0636fa99..b9469ecb 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -52,6 +52,13 @@ def show_dimensions_index( help="If set, will show all unique scaffold names found across all the VCF files in the project.", ) ), + unique_samples: bool = ( + typer.Option( + False, + "--unique-samples", + help="If set, will show all unique sample names found across all the VCF files in the project.", + ) + ), project: str | None = PROJECT_NAME_OPTION, config_file: Path = CONFIG_FILE_OPTION, ) -> None: @@ -87,6 +94,7 @@ def show_dimensions_index( return if unique_scaffolds: + # TODO for scalability: implement this as a separate CRUD instead of parsing all data on the client side unique_scaffold_names = set() for entry in dimensions_info.get("indexed_files", []): unique_scaffold_names.update(entry.get("dimensions", {}).get("scaffolds", [])) @@ -106,6 +114,17 @@ def show_dimensions_index( print(f"Unique scaffold names found across all the VCF files in the project:\n{unique_scaffold_names_sorted}") return + if unique_samples: + # TODO for scalability: implement this as a separate CRUD instead of parsing all data on the client side + unique_sample_names = set() + for entry in dimensions_info.get("indexed_files", []): + unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) + + unique_sample_names_sorted = sorted(unique_sample_names) + + print(f"Unique sample names found across all the VCF files in the 
project:\n{unique_sample_names_sorted}") + return + print(yaml.safe_dump(dimensions_info, sort_keys=False)) From a62d2328147a56d6e3b94ca58b2bb752b390eb6a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 4 Feb 2026 15:00:19 +0100 Subject: [PATCH 002/100] Add CLI to create metadata TSV from dimensions Users can use this to add their own user defined columns for these samples. --- .gitignore | 9 ++- .../cli_commands/dimensions_cli.py | 56 +++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f41c9a3b..867a6d9d 100644 --- a/.gitignore +++ b/.gitignore @@ -17,9 +17,11 @@ sample_metadata_*.tsv *.vcf *.vcf.gz *.vcf.gz.csi +*.vcf.gz.tbi !tests/fixtures/*.vcf.gz tests/fixtures/temp* tests/fixtures/merged* +divbase_metadata_template*.tsv # query job config files bcftools_divbase_job_config.json @@ -28,11 +30,12 @@ bcftools_divbase_job_config.json vcf_dimensions.tsv mock*.tsv task_records*.json +split_scaffold_files.txt +scripts/benchmarking/*.yaml +scripts/benchmarking/results #MacOS artifacts .DS_Store # mkdocs build cache -.cache/ -# pypi -dist/ +.cache/ \ No newline at end of file diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index b9469ecb..0ef18154 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -1,4 +1,6 @@ +import csv import logging +import os from pathlib import Path import typer @@ -160,3 +162,57 @@ def _format_api_response_for_display_in_terminal(api_response: DimensionsShowRes "indexed_files": dimensions_list, "skipped_files": skipped_list, } + + +@dimensions_app.command("create-metadata-template") +def create_metadata_template_with_project_samples_names( + project: str | None = PROJECT_NAME_OPTION, + config_file: Path = CONFIG_FILE_OPTION, +) -> None: + """ + Use the samples index in a 
projects dimensions cache to create a TSV metadata template file + that has the sample names as pre-filled as the first column. + """ + + # TODO this duplicates some code with show_dimensions_index() above. A refactoring should probably include creating a separate CRUD function + # so that the client does not need to parse all data. + + project_config = resolve_project(project_name=project, config_path=config_file) + + response = make_authenticated_request( + method="GET", + divbase_base_url=project_config.divbase_url, + api_route=f"v1/vcf-dimensions/projects/{project_config.name}", + ) + vcf_dimensions_data = DimensionsShowResult(**response.json()) + + dimensions_info = _format_api_response_for_display_in_terminal(vcf_dimensions_data) + + unique_sample_names = set() + for entry in dimensions_info.get("indexed_files", []): + unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) + + unique_sample_names_sorted = sorted(unique_sample_names) + sample_count = len(unique_sample_names_sorted) + print( + f"There were {sample_count} unique samples found in the dimensions file for the {project_config.name} project." + ) + + if sample_count == 0: + # Fallback in case there are no samples in the dimensions index. If no dimensions entry for the project + # VCFDimensionsEntryMissingError will be returned. But for some reason, there are no samples in the VCF, this will catch that. + print("No samples found for this project. 
No file written.") + return + + output_filename = "divbase_metadata_template.tsv" + output_path = os.path.join(os.getcwd(), output_filename) + + with open(output_path, mode="w", newline="") as tsvfile: + writer = csv.writer(tsvfile, delimiter="\t") + writer.writerow(["#Sample_ID"]) + for sample in unique_sample_names_sorted: + writer.writerow([sample]) + + print(f"A sample metadata template with these sample names was written to: {output_path}") + + # TODO perhaps add a message on how to fill in additional columns and how to upload the metadata file to DivBase? From fc8a8b41ce5793dfbbcf25a3f1883c3863f6694d Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 4 Feb 2026 15:27:48 +0100 Subject: [PATCH 003/100] Update user guides with metadata template info --- docs/user-guides/quick-start.md | 48 +++++++++++++++++----------- docs/user-guides/sidecar-metadata.md | 7 +++- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/docs/user-guides/quick-start.md b/docs/user-guides/quick-start.md index 3379fa9c..4678d22b 100644 --- a/docs/user-guides/quick-start.md +++ b/docs/user-guides/quick-start.md @@ -87,23 +87,7 @@ Check your uploaded files: divbase-cli files list ``` -## Step 7: Upload sample metadata - -TODO It might make more sense to have run the dimensions update job before this if we are to use a pre-populated template file - -Sample metadata must be uploaded as follows: - -- In TSV format and be named "sample_metadata.tsv" -- Must contain a column named "sample_id" which matches the sample IDs in your VCF files -- The names and values of all other columns are optional. - -TODO update this after `sidecar-metadata.md` docs are done, there are changes planned for some details. 
- -```bash -divbase-cli files upload path/to/your/sample_metadata.tsv -``` - -## Step 8: Dimensions update +## Step 7: Dimensions update For DivBase to be able to efficiently handle the VCF files in the the project, some key information about each VCF files is fetched from the files. In DivBase, this is refered to as "VCF dimensions". These include for instance which samples and scaffolds that a VCF file contains. @@ -120,7 +104,7 @@ This submits a task to the DivBase task management system. The task will wait in 2. Please also note that the `divbase-cli dimensions update` command needs to be done every time a new VCF or a new version of a VCF file is uploaded. -## Step 9: Confirm dimensions update job completion +## Step 8: Confirm dimensions update job completion Check the task history to confirm the dimensions update job has completed: @@ -136,6 +120,34 @@ It is possible to inspect the cached VCF dimensions data for the project at any divbase-cli dimensions show ``` +## Step 9: Upload sample metadata + +DivBase can checkout data based the VCF files themselves, but can also take an optional sidecar sample metadata file into account. The metadata file must be a TSV (tab-separated variables) file. The metadata contents of the file is defined by the users. If the VCF dimensions command has been run for the project, the cached dimensions data can be used create a template where the samples of the project have been pre-filled: + +```bash +divbase-cli dimensions create-metadata-template +``` + +Details on how to write this file are given in [Sidecar Metadata TSV files: creating and querying sample metadata files](sidecar-metadata.md). In short, the first row starts with `#` and contains the headers for different metadata columns. The first column (`Sample_ID`) is mandatory and can be created by the system as just described; if created manually just make. The rest of the columns are free for the user to define. 
+ +Example of a sidecar metadata TSV file with the mandatory `Sample_ID` column and two user defined columns. + +``` +#Sample_ID Population Area +129P2 1 North +129S1 2 East +129S5 3 South +``` + +!!! note + Please use a text editor than preserves the tabs when the file is saved. Incorrect tabs can lead to issues with running metadata queries in DivBase. + +The sample metadata file should then be uploaded the the DivBase project with follows: + +```bash +divbase-cli files upload path/to/your/sample_metadata.tsv +``` + ## Step 10: Run your queries There are three types of queries in DivBase: diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 89cfe441..34eed30c 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -6,12 +6,17 @@ TODO ## Creating a sidecar TSV for a DivBase project +If the dimensions VCF files in the project have been cached in DivBase, a template metadata file with the sample names pre-filled can be created with: + +```bash +divbase-cli dimensions create-metadata-template +``` + Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. TODOs: - [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` -- [TO BE IMPLEMENTED] CLI command to generate template (empty template and template with the samples from the DivBase project pre-filled). Pre-filling the template will require that dimensions update has been run - [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? - [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? can this be checked against the dimensions show? 
From db50f853f64a9fe259d7b75e91a2b9ef9b7c8e56 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 4 Feb 2026 15:52:49 +0100 Subject: [PATCH 004/100] Enable range filtering for numerical TSV columns For example: divbase-cli query tsv "Population:2-3" --metadata-tsv-name \ tutorial_mock_metadata_mgpv3snps.tsv Pandas read_csv() infers whether columns are numerical or string based on the data, so we already had the correct type in the df. See: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html --- .../src/divbase_api/services/queries.py | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index ea45447d..98a91006 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -7,6 +7,7 @@ import gzip import logging import os +import re import subprocess from dataclasses import dataclass from pathlib import Path @@ -528,7 +529,9 @@ def load_file(self) -> "SidecarQueryManager": # TODO: pandas will likely read all plain files to df, so perhaps there should be a check that the file is a TSV file? or at least has properly formatted tabular columns and rows? 
try: logger.info(f"Loading sidecar metadata file: {self.file}") - self.df = pd.read_csv(self.file, sep="\t") + self.df = pd.read_csv( + self.file, sep="\t" + ) # Pandas has Type Inference and will detect numberic and string columns automatically self.df.columns = self.df.columns.str.lstrip("#") if "Sample_ID" not in self.df.columns: raise SidecarColumnNotFoundError("The 'Sample_ID' column is required in the metadata file.") @@ -580,18 +583,55 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": continue try: key, values = key_value.split(":", 1) - values_list = values.split(",") + key = key.strip() + values = values.strip() + + if key not in self.df.columns: + logger.warning(f"Column '{key}' not found in the TSV file. Skipping this filter condition.") + continue + + # Check if column is numeric + is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) + + # Handle numeric range filtering + range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", values) + + if is_numeric and range_match: + min_val = float(range_match.group(1)) + max_val = float(range_match.group(2)) + condition = (self.df[key] >= min_val) & (self.df[key] <= max_val) + if not condition.any(): + logger.warning(f"No values in column '{key}' fall within range {min_val}-{max_val}") + filter_conditions.append(condition) + logger.debug(f"Applied range filter on '{key}': {min_val} to {max_val}") + else: + values_list = values.split(",") + + # Convert query str values to numeric if the column is numeric + if is_numeric: + # User input in the CLI query is always string (e.g. "Group:1,3,8") + converted_values = [] + for v in values_list: + try: + converted_values.append(float(v) if "." in v else int(v)) + except ValueError: + # Handle cases such as "Group:1,three,8" where "three" cannot be converted to numeric, however unlikely their occurance may be. + logger.warning( + f"Cannot convert '{v}' to numeric for column '{key}'. Skipping this value." 
+ ) + if not converted_values: + logger.warning( + f"No valid numeric values provided for the numeric column '{key}'. Filter condition will not match any rows." + ) + values_list = converted_values - if key in self.df.columns: condition = self.df[key].isin(values_list) if not condition.any(): logger.warning(f"None of the values {values_list} were found in column '{key}'") filter_conditions.append(condition) - else: - logger.warning(f"Column '{key}' not found in the TSV file. Skipping this filter condition.") except Exception as e: raise SidecarInvalidFilterError( - f"Invalid filter format: '{key_value}'. Expected format 'key:value1,value2'" + f"Invalid filter format: '{key_value}'. Expected format 'key:value1,value2' or 'key:min-max' for numeric ranges" ) from e if filter_conditions: From b36d09ac948abaed83f0a8f71a9905bf700ca745 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 4 Feb 2026 16:24:08 +0100 Subject: [PATCH 005/100] Enable inequality filtering on numerical columns Might need some more testing and docs, but the basics are in place. --- docs/user-guides/sidecar-metadata.md | 10 ++++ .../src/divbase_api/services/queries.py | 53 +++++++++++++++++-- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 34eed30c..e1fcc925 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -42,6 +42,16 @@ String consisting of keys:values in the tsv file to filter on. The syntax is 'Ke - [TO BE IMPLEMENTED] filtering based on ranges (ints and maybe floats) and not just on strings, e.g. in range 31 - 50. etc... - [TO BE IMPLEMENTED] add more Set Operations: union, intersection, difference, symmetric difference. Be clear on the default behaviour +Please do not mix numerical and string values in the same column! + +For numeric columns, you can filter on: + +- Inequalities: 'Weight:>25' or "Weight:>=20,<=40" or "Weight:<100". 
The inequality operator must be expressed relative to the Key, i.e. for 'Weight:>25' the reverse notation 'Weight:25<' is not supported. +- Range (inclusive): 'Weight:20-40' +- Discrete values: 'Weight:25,30,35' + +The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms =< and => are not accepted and will return an error. + ## Trying out a query ```bash diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 98a91006..bb31e150 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -593,7 +593,51 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # Check if column is numeric is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) - # Handle numeric range filtering + # Handle numeric inequality filtering (e.g., "Weight:>25", "Weight:>=20,<=40") + if is_numeric and re.search(r"[<>]=?", values): + inequality_parts = values.split(",") + conditions = [] + + for part in inequality_parts: + part = part.strip() + + # Check for common mistakes: =< or => instead of <= or >= + if re.match(r"^=<\d+\.?\d*$", part) or re.match(r"^=>\d+\.?\d*$", part): + raise SidecarInvalidFilterError( + f"Invalid operator format '{part[:2]}' in filter '{key}:{values}'. 
" + f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" + ) + + inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", part) + if inequality_match: + operator = inequality_match.group(1) + value = float(inequality_match.group(2)) + + if operator == ">": + conditions.append(self.df[key] > value) + elif operator == ">=": + conditions.append(self.df[key] >= value) + elif operator == "<": + conditions.append(self.df[key] < value) + elif operator == "<=": + conditions.append(self.df[key] <= value) + + logger.debug(f"Applied inequality filter on '{key}': {operator} {value}") + else: + logger.warning(f"Invalid inequality format: '{part}' for column '{key}'. Skipping.") + + if conditions: + # Combine multiple conditions with AND + combined = conditions[0] + for cond in conditions[1:]: + combined = combined & cond + + if not combined.any(): + logger.warning(f"No values in column '{key}' satisfy the inequality conditions: {values}") + filter_conditions.append(combined) + continue + + # Handle numeric range filtering (e.g., "Weight:20-40") range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", values) if is_numeric and range_match: @@ -607,15 +651,16 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": else: values_list = values.split(",") - # Convert query str values to numeric if the column is numeric + # Handle discrete numberical values if is_numeric: - # User input in the CLI query is always string (e.g. "Group:1,3,8") + # Convert query str values to numeric if the column is numeric + # User input in the CLI query is always string (e.g. "Weight:1,3,8") converted_values = [] for v in values_list: try: converted_values.append(float(v) if "." in v else int(v)) except ValueError: - # Handle cases such as "Group:1,three,8" where "three" cannot be converted to numeric, however unlikely their occurance may be. + # Handle cases such as "Weight:1,three,8" where "three" cannot be converted to numeric, however unlikely their occurance may be. 
logger.warning( f"Cannot convert '{v}' to numeric for column '{key}'. Skipping this value." ) From be16423dda9c0523179a3c958cdb9b659521fc22 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 5 Feb 2026 10:03:22 +0100 Subject: [PATCH 006/100] Enable OR filtering for numerical metadata I.e. inequality OR range OR discrete value example "Weight": ">50,30-40,25,45" --- docs/user-guides/sidecar-metadata.md | 9 ++ .../src/divbase_api/services/queries.py | 118 ++++++++++-------- 2 files changed, 75 insertions(+), 52 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index e1fcc925..01211741 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -52,6 +52,15 @@ For numeric columns, you can filter on: The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms =< and => are not accepted and will return an error. +It is possible to combine filters on inequalities, ranges, and discrete values to an OR logic if desired. For example: + +Weight:<2,4 → values less than 2 OR equal to 4 +Weight:1-2,4 → values in range 1-2 OR equal to 4 +Weight:>5,1-2,4 → values greater than 5 OR in range 1-2 OR equal to 4 +Weight:>10,<2,5-7 → values >10 OR <2 OR in range 5-7 + +TODO write pytests that ensure that these numerical filters work + ## Trying out a query ```bash diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index bb31e150..b324d0f0 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -590,86 +590,100 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": logger.warning(f"Column '{key}' not found in the TSV file. 
Skipping this filter condition.") continue - # Check if column is numeric is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) - # Handle numeric inequality filtering (e.g., "Weight:>25", "Weight:>=20,<=40") - if is_numeric and re.search(r"[<>]=?", values): - inequality_parts = values.split(",") - conditions = [] + # Handle numeric filtering: inequalities, ranges, and discrete values (all with OR logic) + # e.g., "Weight:>25,<30,50" or "Weight:20-40,50,>100" + if is_numeric: + values_list = values.split(",") + inequality_conditions = [] + range_conditions = [] + discrete_values = [] - for part in inequality_parts: - part = part.strip() + for v in values_list: + v = v.strip() # Check for common mistakes: =< or => instead of <= or >= - if re.match(r"^=<\d+\.?\d*$", part) or re.match(r"^=>\d+\.?\d*$", part): + if re.match(r"^=<\d+\.?\d*$", v) or re.match(r"^=>\d+\.?\d*$", v): raise SidecarInvalidFilterError( - f"Invalid operator format '{part[:2]}' in filter '{key}:{values}'. " + f"Invalid operator format '{v[:2]}' in filter '{key}:{values}'. 
" f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" ) - inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", part) + # Check if it's an inequality (e.g., ">25", "<=40") + inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", v) if inequality_match: operator = inequality_match.group(1) value = float(inequality_match.group(2)) if operator == ">": - conditions.append(self.df[key] > value) + inequality_conditions.append(self.df[key] > value) elif operator == ">=": - conditions.append(self.df[key] >= value) + inequality_conditions.append(self.df[key] >= value) elif operator == "<": - conditions.append(self.df[key] < value) + inequality_conditions.append(self.df[key] < value) elif operator == "<=": - conditions.append(self.df[key] <= value) + inequality_conditions.append(self.df[key] <= value) logger.debug(f"Applied inequality filter on '{key}': {operator} {value}") - else: - logger.warning(f"Invalid inequality format: '{part}' for column '{key}'. Skipping.") + continue + + # Check if it's a range (e.g., "20-40") + range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", v) + if range_match: + min_val = float(range_match.group(1)) + max_val = float(range_match.group(2)) + range_condition = (self.df[key] >= min_val) & (self.df[key] <= max_val) + range_conditions.append(range_condition) + logger.debug(f"Applied range filter on '{key}': {min_val} to {max_val}") + continue + + # Otherwise, treat as discrete value + try: + discrete_values.append(float(v) if "." in v else int(v)) + except ValueError: + logger.warning(f"Cannot convert '{v}' to numeric for column '{key}'. 
Skipping this value.") + + # If multiple conditions (inequality, range, discrete values), combine them with with OR logic + # In short, this builds a boolean filter that Pandas will apply to the dataframe column + conditions = [] + + if inequality_conditions: + # Seperately combine multiple inequalities with OR + combined_inequalities = inequality_conditions[0] + for cond in inequality_conditions[1:]: + # Compare bools pairwise. As long as one of the two are true, set bool filter to true. The bar (|)is pandas syntax for element-wise OR between boolean Series. + combined_inequalities = combined_inequalities | cond + conditions.append(combined_inequalities) + + if range_conditions: + # Seperately combine multiple ranges with OR + combined_ranges = range_conditions[0] + for cond in range_conditions[1:]: + combined_ranges = combined_ranges | cond + conditions.append(combined_ranges) + + if discrete_values: + discrete_condition = self.df[key].isin(discrete_values) + conditions.append(discrete_condition) if conditions: - # Combine multiple conditions with AND + # Combine inequalities, ranges, and discrete values with OR combined = conditions[0] for cond in conditions[1:]: - combined = combined & cond + combined = combined | cond if not combined.any(): - logger.warning(f"No values in column '{key}' satisfy the inequality conditions: {values}") + logger.warning(f"No values in column '{key}' match the filter: {values}") filter_conditions.append(combined) - continue - - # Handle numeric range filtering (e.g., "Weight:20-40") - range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", values) - - if is_numeric and range_match: - min_val = float(range_match.group(1)) - max_val = float(range_match.group(2)) - condition = (self.df[key] >= min_val) & (self.df[key] <= max_val) - if not condition.any(): - logger.warning(f"No values in column '{key}' fall within range {min_val}-{max_val}") - filter_conditions.append(condition) - logger.debug(f"Applied range filter on '{key}': {min_val} 
to {max_val}") + logger.info("filter_conditions: " + str(filter_conditions)) # debug + else: + logger.warning( + f"No valid numeric values, ranges, or inequalities provided for column '{key}'. Filter condition will not match any rows." + ) else: + # Non-numeric column: handle as discrete string values values_list = values.split(",") - - # Handle discrete numberical values - if is_numeric: - # Convert query str values to numeric if the column is numeric - # User input in the CLI query is always string (e.g. "Weight:1,3,8") - converted_values = [] - for v in values_list: - try: - converted_values.append(float(v) if "." in v else int(v)) - except ValueError: - # Handle cases such as "Weight:1,three,8" where "three" cannot be converted to numeric, however unlikely their occurance may be. - logger.warning( - f"Cannot convert '{v}' to numeric for column '{key}'. Skipping this value." - ) - if not converted_values: - logger.warning( - f"No valid numeric values provided for the numeric column '{key}'. Filter condition will not match any rows." - ) - values_list = converted_values - condition = self.df[key].isin(values_list) if not condition.any(): logger.warning(f"None of the values {values_list} were found in column '{key}'") From 4c5f15d40eba293862aaa81692d3852211ab8273 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 08:25:16 +0100 Subject: [PATCH 007/100] Propagate metadata query warnings to terminal Up until now, warnings were only displayed in the worker logs. 
--- .../src/divbase_api/services/queries.py | 29 +++++++++++++------ .../src/divbase_cli/cli_commands/query_cli.py | 13 ++++++++- .../src/divbase_lib/api_schemas/queries.py | 1 + 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index f499159e..b50ba8b2 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -56,6 +56,7 @@ def run_sidecar_metadata_query( sidecar_manager = SidecarQueryManager(file=file).run_query(filter_string=filter_string) query_message = sidecar_manager.query_message + warnings = sidecar_manager.warnings unique_sample_ids = sidecar_manager.get_unique_values("Sample_ID") logger.info(f"Metadata query returned {len(unique_sample_ids)} unique sample IDs") @@ -82,6 +83,7 @@ def run_sidecar_metadata_query( unique_sample_ids=list(unique_sample_ids), unique_filenames=list(unique_filenames), query_message=query_message, + warnings=warnings, ) @@ -685,6 +687,7 @@ def __init__(self, file: Path): self.df = None self.query_result = None self.query_message: str = "" + self.warnings: list[str] = [] self.load_file() def load_file(self) -> "SidecarQueryManager": @@ -699,7 +702,7 @@ def load_file(self) -> "SidecarQueryManager": logger.info(f"Loading sidecar metadata file: {self.file}") self.df = pd.read_csv( self.file, sep="\t" - ) # Pandas has Type Inference and will detect numberic and string columns automatically + ) # Pandas has Type Inference and will detect numeric and string columns automatically self.df.columns = self.df.columns.str.lstrip("#") if "Sample_ID" not in self.df.columns: raise SidecarColumnNotFoundError("The 'Sample_ID' column is required in the metadata file.") @@ -755,7 +758,9 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": values = values.strip() if key not in self.df.columns: - logger.warning(f"Column 
'{key}' not found in the TSV file. Skipping this filter condition.") + warning_msg = f"Column '{key}' not found in the TSV file. Skipping this filter condition." + logger.warning(warning_msg) + self.warnings.append(warning_msg) continue is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) @@ -842,19 +847,23 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": combined = combined | cond if not combined.any(): - logger.warning(f"No values in column '{key}' match the filter: {values}") + warning_msg = f"No values in column '{key}' match the filter: {values}" + logger.warning(warning_msg) + self.warnings.append(warning_msg) filter_conditions.append(combined) logger.info("filter_conditions: " + str(filter_conditions)) # debug else: - logger.warning( - f"No valid numeric values, ranges, or inequalities provided for column '{key}'. Filter condition will not match any rows." - ) + warning_msg = f"No valid numeric values, ranges, or inequalities provided for column '{key}'. Filter condition will not match any rows." + logger.warning(warning_msg) + self.warnings.append(warning_msg) else: # Non-numeric column: handle as discrete string values values_list = values.split(",") condition = self.df[key].isin(values_list) if not condition.any(): - logger.warning(f"None of the values {values_list} were found in column '{key}'") + warning_msg = f"None of the values {values_list} were found in column '{key}'" + logger.warning(warning_msg) + self.warnings.append(warning_msg) filter_conditions.append(condition) except Exception as e: raise SidecarInvalidFilterError( @@ -869,9 +878,11 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": self.query_result = self.df[combined_condition].copy() self.query_message = self.filter_string else: - logger.warning("Invalid filter conditions found - returning ALL records. This may be a large result set.") + warning_msg = "Invalid filter conditions: none of the filters matched any records. 
Returning ALL records. This may be a large result set. Please check your filter keys, value spelling, and syntax." + logger.warning(warning_msg) + self.warnings.append(warning_msg) self.query_result = self.df - self.query_message = "Invalid filter conditions - returning ALL records" + self.query_message = f"Invalid filter conditions ({self.filter_string}) - returning ALL records" return self diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py index bce892fc..60c8af72 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py @@ -100,12 +100,23 @@ def sample_metadata_query( results = SampleMetadataQueryTaskResult(**response.json()) + if results.warnings: + print("[yellow]⚠ Warnings:[/yellow]") + for warning in results.warnings: + print(f" • {warning}") + print() + if show_sample_results: print("[bright_blue]Name and file for each sample in query results:[/bright_blue]") for sample in results.sample_and_filename_subset: print(f"Sample ID: '{sample['Sample_ID']}', Filename: '{sample['Filename']}'") - print(f"The results for the query ([bright_blue]{results.query_message}[/bright_blue]):") + invalid_query_prefix = "Invalid filter conditions" + if results.query_message and results.query_message.startswith(invalid_query_prefix): + color = "red" + else: + color = "bright_blue" + print(f"The results for the query ([{color}]{results.query_message}[/{color}]):") print(f"Unique Sample IDs: {results.unique_sample_ids}") print(f"Unique filenames: {results.unique_filenames}\n") diff --git a/packages/divbase-lib/src/divbase_lib/api_schemas/queries.py b/packages/divbase-lib/src/divbase_lib/api_schemas/queries.py index de827d88..1db6e51b 100644 --- a/packages/divbase-lib/src/divbase_lib/api_schemas/queries.py +++ b/packages/divbase-lib/src/divbase_lib/api_schemas/queries.py @@ -55,6 +55,7 @@ class 
SampleMetadataQueryTaskResult(BaseModel): unique_sample_ids: list[str] unique_filenames: list[str] query_message: str + warnings: list[str] = [] status: Optional[str] = None From 678977ebd42d0cca6e68388742a5b3b13c611a95 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 08:28:18 +0100 Subject: [PATCH 008/100] Improve formatting of metadata query warnings Use textwrap to ensure that bulleted warnings keep bullet indentation when warnings string is too long to fit within the terminal width. --- .../divbase-cli/src/divbase_cli/cli_commands/query_cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py index 60c8af72..a60ea3f1 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py @@ -17,6 +17,7 @@ """ import logging +import textwrap from pathlib import Path import typer @@ -103,7 +104,8 @@ def sample_metadata_query( if results.warnings: print("[yellow]⚠ Warnings:[/yellow]") for warning in results.warnings: - print(f" • {warning}") + wrapped = textwrap.fill(warning, width=100, initial_indent=" • ", subsequent_indent=" ") + print(wrapped) print() if show_sample_results: From f1e798fb33cf82909b725703dba1a06a8f2f661f Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 09:07:46 +0100 Subject: [PATCH 009/100] Support semicolon-separated values in TSV columns Use lambda functions and pandas .apply() to split values on semicolons and apply filters accordingly. This allows users to define sample metadata TSVs where cells can contain multiple values separated by semicolons, and filter strings can match any of those values. 
Sample_ID Group S1 1 S2 1;2 S3 2 --- .../src/divbase_api/services/queries.py | 152 ++++++++++++++---- 1 file changed, 117 insertions(+), 35 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index b50ba8b2..e5cd48fa 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -714,15 +714,31 @@ def load_file(self) -> "SidecarQueryManager": def run_query(self, filter_string: str = None) -> "SidecarQueryManager": """ Method to run a query against the loaded data. The filter_string should be a semicolon-separated list of key:value pairs, - where key is a column name and value is a comma-separated list of values to filter by. + where key is a column name and value is a comma-separated list of filter values. For example: "key1:value1,value2;key2:value3,value4". + The TSV that is loaded into the pandas DataFrame can have both string and numeric columns. + - String columns are matched to filter string values with OR logic: if ANY value in a cell matches ANY filter value, the row matches. + - Numeric columns support: + - Inequalities: ">25", "<=40" (checks if any cell value satisfies the condition) + - Ranges: "20-40" (checks if any cell value is within the range) + - Discrete values: "25,30,50" (checks if any cell value matches any filter value) + - All are combined with OR logic + + Filter string values in the query vs. cell values in the TSV: + - Filter strings are handled per semicolon-separated key-value pair: in "key1:value1,value2;key2:value3,value4" + "key1:value1,value2" is handled separately from "key2:value3,value4". + - Filter string values can be comma-separated, e.g. "value1,value2" in "key1:value1,value2" and each filter string value is handled separately. + - Cell values can be semicolon-separated, e.g. 
"25;30;35" in a TSV cell + - Matching of filter string to cell values uses OR logic: if ANY value in a cell matches ANY filter value, the row matches. + E.g. "key2:value3,value4" means that TSV cells in the "key2" column that contain "value3" will match, but also cells that contain "value3;value4" or "value4;value3" will match. + Summary of how different input filter values are handled: - If the filter_string is empty, all records are returned. - If the filter_string is None, an error is raised. - - If the filter_string is not empty, the method filters the DataFrame based on the provided filter_string. - - If any of the keys in the filter_string are not found in the DataFrame columns, a warning is logged and those conditions are skipped. - - If none of the values in the filter_string are found in the DataFrame, a warning is logged and all records are returned. + - If the filter_string is not empty, the method filters the dataframe based on the provided filter_string. + - If any of the keys in the filter_string are not found in the dataframe columns, a warning is logged and those conditions are skipped. + - If none of the filter string values match any cell values in the dataframe, a warning is logged and all records are returned. - If the filter_string is invalid, a SidecarInvalidFilterError is raised. The method returns the SidecarQueryManager instance with the query_result and query_message. The former is the filtered DataFrame results, @@ -749,13 +765,14 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": key_values = self.filter_string.split(";") filter_conditions = [] + # 1. 
Parse the input filter string and build a list of boolean conditions to apply to the dataframe for key_value in key_values: if not key_value.strip(): continue try: - key, values = key_value.split(":", 1) + key, filter_string_values = key_value.split(":", 1) key = key.strip() - values = values.strip() + filter_string_values = filter_string_values.strip() if key not in self.df.columns: warning_msg = f"Column '{key}' not found in the TSV file. Skipping this filter condition." @@ -767,58 +784,99 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # Handle numeric filtering: inequalities, ranges, and discrete values (all with OR logic) # e.g., "Weight:>25,<30,50" or "Weight:20-40,50,>100" + # Supports filtering on semicolon-separated values in cells in the TSV: e.g. "25;30;35" if is_numeric: - values_list = values.split(",") + filter_string_values_list = filter_string_values.split(",") inequality_conditions = [] range_conditions = [] discrete_values = [] - for v in values_list: - v = v.strip() + for filter_string_value in filter_string_values_list: + filter_string_value = filter_string_value.strip() # Check for common mistakes: =< or => instead of <= or >= - if re.match(r"^=<\d+\.?\d*$", v) or re.match(r"^=>\d+\.?\d*$", v): + if re.match(r"^=<\d+\.?\d*$", filter_string_value) or re.match( + r"^=>\d+\.?\d*$", filter_string_value + ): raise SidecarInvalidFilterError( - f"Invalid operator format '{v[:2]}' in filter '{key}:{values}'. " + f"Invalid operator format '{filter_string_value[:2]}' in filter '{key}:{filter_string_values}'." 
f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" ) # Check if it's an inequality (e.g., ">25", "<=40") - inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", v) + inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", filter_string_value) if inequality_match: operator = inequality_match.group(1) - value = float(inequality_match.group(2)) - - if operator == ">": - inequality_conditions.append(self.df[key] > value) - elif operator == ">=": - inequality_conditions.append(self.df[key] >= value) - elif operator == "<": - inequality_conditions.append(self.df[key] < value) - elif operator == "<=": - inequality_conditions.append(self.df[key] <= value) - - logger.debug(f"Applied inequality filter on '{key}': {operator} {value}") + threshold = float(inequality_match.group(2)) + + condition = self.df[key].apply( + lambda cell_value, op=operator, thresh=threshold: ( + False + if pd.isna(cell_value) + else any( + ( + cell_value_num > thresh + if op == ">" + else cell_value_num >= thresh + if op == ">=" + else cell_value_num < thresh + if op == "<" + else cell_value_num <= thresh + ) + for cell_value_num in ( + float(cell_value_str) if "." 
in cell_value_str else int(cell_value_str) + for cell_value_str in str(cell_value).split(";") + if cell_value_str.strip() + ) + ) + if not pd.isna(cell_value) + else False + ) + if not pd.isna(cell_value) + else False + ) + inequality_conditions.append(condition) + logger.debug(f"Applied inequality filter on '{key}': {operator} {threshold}") continue # Check if it's a range (e.g., "20-40") - range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", v) + range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", filter_string_value) if range_match: min_val = float(range_match.group(1)) max_val = float(range_match.group(2)) - range_condition = (self.df[key] >= min_val) & (self.df[key] <= max_val) - range_conditions.append(range_condition) + + condition = self.df[key].apply( + lambda cell_value, min_v=min_val, max_v=max_val: ( + False + if pd.isna(cell_value) + else any( + min_v <= cell_value_num <= max_v + for cell_value_num in ( + float(cell_value_str) if "." in cell_value_str else int(cell_value_str) + for cell_value_str in str(cell_value).split(";") + if cell_value_str.strip() + ) + ) + ) + ) + range_conditions.append(condition) logger.debug(f"Applied range filter on '{key}': {min_val} to {max_val}") continue # Otherwise, treat as discrete value try: - discrete_values.append(float(v) if "." in v else int(v)) + # Collect all discrete values from the filter string as the "for filter_string_value in filter_string_values_list" loop progresses. + # The lambda function for discrete values is below and will run once for all collected discrete values. + discrete_values.append( + float(filter_string_value) if "." in filter_string_value else int(filter_string_value) + ) except ValueError: - logger.warning(f"Cannot convert '{v}' to numeric for column '{key}'. Skipping this value.") + logger.warning( + f"Cannot convert '{filter_string_value}' to numeric for column '{key}'. Skipping this value." 
+ ) # If multiple conditions (inequality, range, discrete values), combine them with with OR logic - # In short, this builds a boolean filter that Pandas will apply to the dataframe column + # In short, this builds a new boolean filter from the combined bools that Pandas will apply to the dataframe column conditions = [] if inequality_conditions: @@ -837,7 +895,20 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": conditions.append(combined_ranges) if discrete_values: - discrete_condition = self.df[key].isin(discrete_values) + discrete_condition = self.df[key].apply( + lambda cell_value, target_filter_values=discrete_values: ( + False + if pd.isna(cell_value) + else any( + cell_value_num in target_filter_values + for cell_value_num in ( + float(cell_value_str) if "." in cell_value_str else int(cell_value_str) + for cell_value_str in str(cell_value).split(";") + if cell_value_str.strip() + ) + ) + ) + ) conditions.append(discrete_condition) if conditions: @@ -847,7 +918,7 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": combined = combined | cond if not combined.any(): - warning_msg = f"No values in column '{key}' match the filter: {values}" + warning_msg = f"No values in column '{key}' match the filter: {filter_string_values}" logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(combined) @@ -858,10 +929,20 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": self.warnings.append(warning_msg) else: # Non-numeric column: handle as discrete string values - values_list = values.split(",") - condition = self.df[key].isin(values_list) + # Supports filtering on semicolon-separated values in cells in the TSV: e.g. 
"North;West" + filter_string_values_list = filter_string_values.split(",") + + condition = self.df[key].apply( + lambda cell_value, target_filter_values=filter_string_values_list: ( + False + if pd.isna(cell_value) + else any( + cell_value_str in target_filter_values for cell_value_str in str(cell_value).split(";") + ) + ) + ) if not condition.any(): - warning_msg = f"None of the values {values_list} were found in column '{key}'" + warning_msg = f"None of the values {filter_string_values_list} were found in column '{key}'" logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(condition) @@ -870,6 +951,7 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": f"Invalid filter format: '{key_value}'. Expected format 'key:value1,value2' or 'key:min-max' for numeric ranges" ) from e + # 2. Apply the final boolean filters on the dataframe if filter_conditions: combined_condition = pd.Series(True, index=self.df.index) for condition in filter_conditions: From c5b5a1cd21ebdbae5e227d9516e4ece688a3520e Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 10:45:45 +0100 Subject: [PATCH 010/100] Add text on TSV format requirements to user guide --- docs/user-guides/quick-start.md | 2 +- docs/user-guides/sidecar-metadata.md | 45 +++++++++++++++++++++------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/docs/user-guides/quick-start.md b/docs/user-guides/quick-start.md index 4678d22b..cb3c7f05 100644 --- a/docs/user-guides/quick-start.md +++ b/docs/user-guides/quick-start.md @@ -128,7 +128,7 @@ DivBase can checkout data based the VCF files themselves, but can also take an o divbase-cli dimensions create-metadata-template ``` -Details on how to write this file are given in [Sidecar Metadata TSV files: creating and querying sample metadata files](sidecar-metadata.md). In short, the first row starts with `#` and contains the headers for different metadata columns. 
The first column (`Sample_ID`) is mandatory and can be created by the system as just described; if created manually just make. The rest of the columns are free for the user to define. +Details on how to write this file are given in [Sidecar Metadata TSV files: creating and querying sample metadata files](sidecar-metadata.md). In short, the first row starts with `#` and contains the headers for different metadata columns. The first column (`Sample_ID`) is mandatory and can be created by the system as just described; if created manually just make sure that each sample name is spelled exactly as in the VCF files. The rest of the columns are free for the user to define. Example of a sidecar metadata TSV file with the mandatory `Sample_ID` column and two user defined columns. diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 01211741..c26827b5 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -14,26 +14,47 @@ divbase-cli dimensions create-metadata-template Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. -TODOs: +### Sidecar TSV format requirements -- [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` -- [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? -- [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? can this be checked against the dimensions show? +**Mandatory content:** -- [TO BE IMPLEMENTED] what happens if a sample is duplicated in the file. what happens if the sample name is duplicated but not the values (diverging duplicate)? +1. The first row must be a header row and the first column must be named `Sample_ID`. +2. 
The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` +3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column. +4. Every column need to be tab separated for all rows. -1 mandatory column: sample name. +**User-defined columns:** -any other columns are optional and user defined +After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. -it is possible to have more than one sidecar sample metadata file in each DivBase project. +1. It is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. +2. The user-defined columns can be either numeric or string type. Try to avoid mixing string and numeric values in the same column if possible. If a mix of string and numerical data is used in the same column, the system will treat them all as strings, which might lead to unexpected filtering results when running queries. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. +3. Use English decimal notation (.) and not comma (,) when entering decimals. This ensures that the data is correctly loaded by `Pandas`. +4. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data. 
-example +**Example:** -``` -TODO add example here +This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. + +```text +#Sample_ID Population Area +S1 1 North +S2 2;4 East +S3 3 West;South +S4 4 West +S5 5 North +S6 6 East +S7 1;3;5 South +S8 2 West ``` +TODOs: + +- [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` +- [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? +- [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? can this be checked against the dimensions show? +- [TO BE IMPLEMENTED] what happens if a sample is duplicated in the file. what happens if the sample name is duplicated but not the values (diverging duplicate)? + ## Query Syntax for sidecar metadata from `divbase-cli query tsv -h` docstring: @@ -59,6 +80,8 @@ Weight:1-2,4 → values in range 1-2 OR equal to 4 Weight:>5,1-2,4 → values greater than 5 OR in range 1-2 OR equal to 4 Weight:>10,<2,5-7 → values >10 OR <2 OR in range 5-7 +note that semicolon is allowed in in cells in the TSV, but have another meaning in the query syntax! 
+ TODO write pytests that ensure that these numerical filters work ## Trying out a query From 227d4c481fda7799c0cafb75d6afe3f376da3833 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 11:32:50 +0100 Subject: [PATCH 011/100] Add text on TSV query syntax --- docs/user-guides/sidecar-metadata.md | 80 +++++++++++++++++++++------- 1 file changed, 60 insertions(+), 20 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index c26827b5..30978eea 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -16,14 +16,14 @@ Note! there can be multiple TSVs in the same project and it is possible to call ### Sidecar TSV format requirements -**Mandatory content:** +#### Mandatory content 1. The first row must be a header row and the first column must be named `Sample_ID`. 2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` 3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column. 4. Every column need to be tab separated for all rows. -**User-defined columns:** +#### User-defined columns After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. @@ -32,7 +32,7 @@ After the `Sample_ID` column has been populated, users can add any columns and v 3. Use English decimal notation (.) and not comma (,) when entering decimals. This ensures that the data is correctly loaded by `Pandas`. 4. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. 
For instance belong to two different groups or categories. This works with both numerical and string data. -**Example:** +#### Example This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. @@ -57,32 +57,72 @@ TODOs: ## Query Syntax for sidecar metadata -from `divbase-cli query tsv -h` docstring: -String consisting of keys:values in the tsv file to filter on. The syntax is 'Key1:Value1,Value2;Key2:Value3,Value4', where the key are the column header names in the tsv, and values are the column values. Multiple values for a key are separated by commas, and multiple keys are separated by semicolons. When multple keys are provided, an intersect query will be performed. E.g. 'Area:West of Ireland,Northern Portugal;Sex:F'. +### Overview: queries are applied as filters on columns in the TSV -- [TO BE IMPLEMENTED] filtering based on ranges (ints and maybe floats) and not just on strings, e.g. in range 31 - 50. etc... -- [TO BE IMPLEMENTED] add more Set Operations: union, intersection, difference, symmetric difference. Be clear on the default behaviour +Queries on the sidecar sample metadata TSV can be done with the `divbase-cli query tsv` command. The filters that the user wants to query on need to be entered as a string (i.e. enclosed in quotes, `""`). -Please do not mix numerical and string values in the same column! +The TSV query syntax is `"Key1:Value1,Value2;Key2:Value3,Value4"`, where `Key1:`...`Key2:` are the column header names in the TSV, and `Value1`...`Value4` are the values. Multiple filter values for a key are separated by commas, and multiple keys are separated by semicolons. 
There can be any number of keys and values to filter on, but it is up to the user to write queries that return useful results. -For numeric columns, you can filter on: +!!! note + Please note that semicolon (`;`) is used for different purposes in the TSV (multi-value cells) and in the query syntax (perform queries on multiple columns)! -- Inequalities: 'Weight:>25' or "Weight:>=20,<=40" or "Weight:<100". The inequality operator must be expressed relative to the Key, i.e. for 'Weight:>25' the reverse notation 'Weight:25<' is not supported. -- Range (inclusive): 'Weight:20-40' -- Discrete values: 'Weight:25,30,35' +Filtering is inclusive by default. This applies both for the filter values and the cell values: -The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms =< and => are not accepted and will return an error. +- If a filter contains multiple values, e.g. `"Area:North,West"`, the row is included if at least one of the filter values matches any value in the cell. I.e. a row with `North`, and a row with `West` will both be returned from this filter. +- If a cell in the TSV contains multiple values separated by a semicolon as explained in [User-defined columns](#user-defined-columns) (e.g., `North;West`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row with the array value `North;West`. -It is possible to combine filters on inequalities, ranges, and discrete values to an OR logic if desired. 
For example: +For example, if the user wants to query the TSV on column `Area` for all samples that contain the value `North`: -Weight:<2,4 → values less than 2 OR equal to 4 -Weight:1-2,4 → values in range 1-2 OR equal to 4 -Weight:>5,1-2,4 → values greater than 5 OR in range 1-2 OR equal to 4 -Weight:>10,<2,5-7 → values >10 OR <2 OR in range 5-7 +```bash +divbase-cli query tsv "Area:North" +``` + +It is also possible to run a sidecar sample metadata query as part of a VCF query by adding the query as a string to the flag `--tsv-filter`: + +```bash +divbase-cli query bcftools-pipe --tsv-filter "Area:North" --command "view -s SAMPLES" +``` + +Please also see the documentation on [DivBase Query Syntax for VCF data](query-syntax.md) for more details on how that command works. + +!!! note + To reiterate what was written in the [User-defined columns](#user-defined-columns) section above: it is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. + +### Filtering on string columns + +Queries on string columns are straight-forward in the sense that each semicolon-separated value in the TSV is treated as a discrete value. + +As explained above, commas can be used to write multi-value filters. For instance, the query: + +```bash +divbase-cli query tsv "Area:North,South,East" +``` + +will return all samples where **at least one** of the semicolon-separated values in the Area column matches any of the filter values (`North`, `South`, or `East`). 
+ +Comma-separated: "Area:North,South,East" +OR logic: if ANY cell value matches ANY filter value, the row matches +Example: "key2:value3,value4" matches cells containing value3, value4, value3;value4, or value4;value3 + +### Filtering on numerical columns + +For numerical columns, it is possible to filter on the following operations: + +- Inequalities: + - Examples: `"Weight:>25"` or `"Weight:>=20,<=40"` or `"Weight:<100"`. + - Note: The inequality operator must be expressed relative to the key, i.e. `"Weight:>25"`. The reverse notation `"Weight:25<"` is not supported. + - The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms =< and => are not accepted and will return an error. +- Range (inclusive): + - Example: `"Weight:20-40"` +- Discrete values: + - Example: `"Weight:25,30,35"` -note that semicolon is allowed in in cells in the TSV, but have another meaning in the query syntax! +Furthermore, it is possible to combine filters on inequalities, ranges, and discrete values using inclusive OR logic. This means that if any one of the specified conditions is satisfied for a cell, the row will be included in the results. For example: -TODO write pytests that ensure that these numerical filters work +- `"Weight:<2,4"` returns rows where the value is less than 2 **or** equal to 4 +- `"Weight:1-2,4"` returns rows where the value is in the range 1–2 **or** equal to 4 +- `"Weight:>5,1-2,4"` returns rows where the value is greater than 5 **or** in the range 1–2 **or** equal to 4 +- `"Weight:>10,<2,5-7"` returns rows where the value is greater than 10 **or** less than 2 **or** in the range 5–7 ## Trying out a query From cef5a38adee0b3c2b6721fa48b50714dcdf4c025 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 13:20:36 +0100 Subject: [PATCH 012/100] Add unit tests for numerical filtering There might be an issue with semicolon separated numerical values... 
--- .../test_sample_metadata_queries.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 tests/unit/divbase_api/test_sample_metadata_queries.py diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py new file mode 100644 index 00000000..731d8855 --- /dev/null +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -0,0 +1,208 @@ +""" +Unit tests for SidecarQueryManager filtering +""" + +import pytest + +from divbase_api.services.queries import SidecarQueryManager + + +@pytest.fixture +def sample_tsv_with_numeric_data(tmp_path): + """ + Create a temporary TSV file with numeric and string columns for testing. + Includes semicolon-separated values in some cells. + Note: Weight and Age columns have NO semicolons to ensure pandas infers them as numeric. + Population column has semicolons but should still be numeric. + """ + tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea +S1\t1\t20.0\t5.0\tNorth +S2\t2;4\t25.0\t10\tEast +S3\t3\t30.0\t15\tWest;South +S4\t4\t35.0\t20\tWest +S5\t5\t40.0\t25\tNorth +S6\t6\t45.0\t30\tEast +S7\t1;3;5\t50.0\t35\tSouth +S8\t2\t55.0\t40\tWest +S9\t7\t62.0\t45\tNorth +S10\t8\t70.0\t52\tEast +""" + tsv_file = tmp_path / "test_metadata.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +class TestNumericalFilteringInequalities: + """Test inequality operators on numerical columns.""" + + def test_greater_than(self, sample_tsv_with_numeric_data): + """Test > operator returns correct samples.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:>50") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S8" in sample_ids # Weight: 55.0 + assert "S9" in sample_ids # Weight: 62.0 + assert "S10" in sample_ids # Weight: 70.0 + + def test_greater_than_or_equal(self, sample_tsv_with_numeric_data): + """Test >= operator 
returns correct samples.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:>=50") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S7" in sample_ids # Weight: 50.0 + assert "S8" in sample_ids # Weight: 55.0 + assert "S9" in sample_ids # Weight: 62.0 + assert "S10" in sample_ids # Weight: 70.0 + + def test_less_than(self, sample_tsv_with_numeric_data): + """Test < operator returns correct samples.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:<15") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S1" in sample_ids # Age: 5 + assert "S2" in sample_ids # Age: 10 + + def test_less_than_or_equal(self, sample_tsv_with_numeric_data): + """Test <= operator returns correct samples.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:<=15") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S1" in sample_ids # Age: 5 + assert "S2" in sample_ids # Age: 10 + assert "S3" in sample_ids # Age: 15 + + def test_inequality_on_weight_column(self, sample_tsv_with_numeric_data): + """Test inequality on Weight column (no semicolons, pure numeric).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:>60") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S9" in sample_ids # Weight: 62.0 + assert "S10" in sample_ids # Weight: 70.0 + + def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): + """Test inequality on Age column (no semicolons, pure numeric).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:>=40") + + sample_ids = 
result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S8" in sample_ids # Age: 40.0 + assert "S9" in sample_ids # Age: 45.0 + assert "S10" in sample_ids # Age: 52.0 + + +class TestNumericalFilteringRanges: + """Test range filtering on numerical columns.""" + + def test_simple_range(self, sample_tsv_with_numeric_data): + """Test inclusive range filtering.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:30-45") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S3" in sample_ids # Weight: 30 + assert "S4" in sample_ids # Weight: 35 + assert "S5" in sample_ids # Weight: 40 + assert "S6" in sample_ids # Weight: 45 + + def test_range_boundaries_inclusive(self, sample_tsv_with_numeric_data): + """Test that range boundaries are inclusive.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:20-30") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S4" in sample_ids # Age: 20 (lower boundary) + assert "S5" in sample_ids # Age: 25 + assert "S6" in sample_ids # Age: 30 (upper boundary) + + def test_range_on_weight_column(self, sample_tsv_with_numeric_data): + """Test range filtering on Weight column (no semicolons, pure numeric).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:40-60") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S5" in sample_ids # Weight: 40.0 + assert "S6" in sample_ids # Weight: 45.0 + assert "S7" in sample_ids # Weight: 50.0 + assert "S8" in sample_ids # Weight: 55.0 + + def test_narrow_range(self, sample_tsv_with_numeric_data): + """Test a narrow range returns only matching samples.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = 
manager.run_query(filter_string="Age:20-30") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S4" in sample_ids # Age: 20.0 + assert "S5" in sample_ids # Age: 25.0 + assert "S6" in sample_ids # Age: 30.0 + + +class TestNumericalFilteringDiscreteValues: + """Test discrete value filtering on numerical columns.""" + + def test_single_discrete_value(self, sample_tsv_with_numeric_data): + """Test filtering with a single discrete value.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:50") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S7" in sample_ids # Weight: 50 + + def test_multiple_discrete_values(self, sample_tsv_with_numeric_data): + """Test filtering with multiple discrete values (OR logic).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:20,30,50") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S1" in sample_ids # Weight: 20 + assert "S3" in sample_ids # Weight: 30 + assert "S7" in sample_ids # Weight: 50 + + def test_discrete_values_with_semicolon_separated_cells(self, sample_tsv_with_numeric_data): + """Test discrete value filtering on Population column (string column with semicolons).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:1,3,5") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S1" in sample_ids # Population: 1 + assert "S3" in sample_ids # Population: 3 + assert "S5" in sample_ids # Population: 5 + assert "S7" in sample_ids # Population: 1;3;5 (string matches "1", "3", and "5") + + def test_discrete_values_match_any_semicolon_value(self, sample_tsv_with_numeric_data): + """Test that discrete filtering matches if ANY semicolon value matches 
(string matching on Population).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:4") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S2" in sample_ids # Population: 2;4 (string "4" matches) + assert "S4" in sample_ids # Population: 4 + + def test_discrete_age_values(self, sample_tsv_with_numeric_data): + """Test discrete filtering on Age column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:10,25,40") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S2" in sample_ids # Age: 10 + assert "S5" in sample_ids # Age: 25 + assert "S8" in sample_ids # Age: 40 From ffbbf56b52ea3d4f62eda7227ae4bbfd7e48c16b Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 6 Feb 2026 13:35:24 +0100 Subject: [PATCH 013/100] Handle semicolon-separated numeric values Pandas will not infer a column as numeric if it contains semicolon-separated values, even if those values are numeric. This means that the inequalities and range logic will not work for such values. This adds a helper method that checks if a column contains semicolon-separated numeric values and raises an error if the values are of mixed types (e.g. "1;two;3"). The lambda functions in the run_query() logic will convert each semicolon-separated numerical value to float or int before applying the inequality or range logic. 
--- .../src/divbase_api/services/queries.py | 99 ++++++++--- .../test_sample_metadata_queries.py | 161 +++++++++++++----- 2 files changed, 194 insertions(+), 66 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index e5cd48fa..7b1d4d5f 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -711,6 +711,58 @@ def load_file(self) -> "SidecarQueryManager": raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self + def _is_semicolon_separated_numeric_column(self, key: str) -> bool: + """ + Helper method to detect if a column contains semicolon-separated numeric values. + Pandas correctly infers type from single-value columns. But for columns with semicolon-separated values + (e.g.: "1;2;3"), it infers them as strings (object dtype). This is an issue since numeric operations + (inequalities, ranges) cannot be performed on string values. + + This helper method checks ALL non-null values in the column to determine if they can be parsed as numeric + after splitting by semicolon. Note that it only detects if a column value is semicolon-separated numeric, it does not + convert the column to numeric type. It also validates that the colum value is not of mixed string-numerical type, which + is invalid input for the query system. The actual parsing and handling of the semicolon-separated numeric values is + done in the lamda functions in the run_query() method. + + Returns True if the column contains ONLY numeric values (with or without semicolons). + Returns False if the column contains ONLY non-numeric values (regular string column), or if the column is empty. + Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column). 
+ """ + if key not in self.df.columns: + return False + + non_null_values = self.df[key].dropna() + if len(non_null_values) == 0: + return False + + has_numeric_type = False + has_non_numeric_type = False + + for row_index, cell_value in enumerate(non_null_values): + cell_str = str(cell_value).strip() + if not cell_str: + continue + + parts = cell_str.split(";") + for part in parts: + part = part.strip() + if not part: + continue + try: + float(part) + has_numeric_type = True + except ValueError: + has_non_numeric_type = True + + if has_numeric_type and has_non_numeric_type: + raise SidecarInvalidFilterError( + f"Column '{key}' contains mixed types. Value '{cell_str}' at row {row_index} " + f"has both numeric and non-numeric parts. All values in a column must be consistently " + f"numeric or string for filtering to work correctly." + ) + + return has_numeric_type and not has_non_numeric_type + def run_query(self, filter_string: str = None) -> "SidecarQueryManager": """ Method to run a query against the loaded data. The filter_string should be a semicolon-separated list of key:value pairs, @@ -781,11 +833,13 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": continue is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) + is_semicolon_numeric = self._is_semicolon_separated_numeric_column(key) if not is_numeric else False # Handle numeric filtering: inequalities, ranges, and discrete values (all with OR logic) # e.g., "Weight:>25,<30,50" or "Weight:20-40,50,>100" # Supports filtering on semicolon-separated values in cells in the TSV: e.g. 
"25;30;35" - if is_numeric: + # Also handles columns that pandas infers as strings but contain numeric values with semicolons (e.g., "1;2;3") + if is_numeric or is_semicolon_numeric: filter_string_values_list = filter_string_values.split(",") inequality_conditions = [] range_conditions = [] @@ -811,29 +865,32 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": condition = self.df[key].apply( lambda cell_value, op=operator, thresh=threshold: ( - False - if pd.isna(cell_value) - else any( - ( - cell_value_num > thresh - if op == ">" - else cell_value_num >= thresh - if op == ">=" - else cell_value_num < thresh - if op == "<" - else cell_value_num <= thresh - ) - for cell_value_num in ( - float(cell_value_str) if "." in cell_value_str else int(cell_value_str) - for cell_value_str in str(cell_value).split(";") - if cell_value_str.strip() + ( + False + if pd.isna(cell_value) + else any( + ( + cell_value_num > thresh + if op == ">" + else cell_value_num >= thresh + if op == ">=" + else cell_value_num < thresh + if op == "<" + else cell_value_num <= thresh + ) + for cell_value_num in ( + # convert to numeric type after splitting by semicolon + float(cell_value_str) if "." in cell_value_str else int(cell_value_str) + for cell_value_str in str(cell_value).split(";") + if cell_value_str.strip() + ) ) + if not pd.isna(cell_value) + else False ) if not pd.isna(cell_value) else False ) - if not pd.isna(cell_value) - else False ) inequality_conditions.append(condition) logger.debug(f"Applied inequality filter on '{key}': {operator} {threshold}") @@ -852,6 +909,7 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": else any( min_v <= cell_value_num <= max_v for cell_value_num in ( + # convert to numeric type after splitting by semicolon float(cell_value_str) if "." 
in cell_value_str else int(cell_value_str) for cell_value_str in str(cell_value).split(";") if cell_value_str.strip() @@ -946,6 +1004,9 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(condition) + except SidecarInvalidFilterError: + # Re-raise our custom exceptions without wrapping them + raise except Exception as e: raise SidecarInvalidFilterError( f"Invalid filter format: '{key_value}'. Expected format 'key:value1,value2' or 'key:min-max' for numeric ranges" diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 731d8855..8713b745 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -11,9 +11,8 @@ def sample_tsv_with_numeric_data(tmp_path): """ Create a temporary TSV file with numeric and string columns for testing. - Includes semicolon-separated values in some cells. - Note: Weight and Age columns have NO semicolons to ensure pandas infers them as numeric. - Population column has semicolons but should still be numeric. + Includes semicolon-separated values in some cells. Includes both int and float + numeric values to test that both are detected as numeric. 
""" tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea S1\t1\t20.0\t5.0\tNorth @@ -42,9 +41,9 @@ def test_greater_than(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S8" in sample_ids # Weight: 55.0 - assert "S9" in sample_ids # Weight: 62.0 - assert "S10" in sample_ids # Weight: 70.0 + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids def test_greater_than_or_equal(self, sample_tsv_with_numeric_data): """Test >= operator returns correct samples.""" @@ -53,10 +52,10 @@ def test_greater_than_or_equal(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S7" in sample_ids # Weight: 50.0 - assert "S8" in sample_ids # Weight: 55.0 - assert "S9" in sample_ids # Weight: 62.0 - assert "S10" in sample_ids # Weight: 70.0 + assert "S7" in sample_ids + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids def test_less_than(self, sample_tsv_with_numeric_data): """Test < operator returns correct samples.""" @@ -65,8 +64,8 @@ def test_less_than(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 2 - assert "S1" in sample_ids # Age: 5 - assert "S2" in sample_ids # Age: 10 + assert "S1" in sample_ids + assert "S2" in sample_ids def test_less_than_or_equal(self, sample_tsv_with_numeric_data): """Test <= operator returns correct samples.""" @@ -75,9 +74,9 @@ def test_less_than_or_equal(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S1" in sample_ids # Age: 5 - assert "S2" in sample_ids # Age: 10 - assert "S3" in sample_ids # Age: 15 + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S3" in sample_ids def test_inequality_on_weight_column(self, sample_tsv_with_numeric_data): """Test inequality on Weight column (no 
semicolons, pure numeric).""" @@ -86,8 +85,8 @@ def test_inequality_on_weight_column(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 2 - assert "S9" in sample_ids # Weight: 62.0 - assert "S10" in sample_ids # Weight: 70.0 + assert "S9" in sample_ids + assert "S10" in sample_ids def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): """Test inequality on Age column (no semicolons, pure numeric).""" @@ -96,9 +95,9 @@ def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S8" in sample_ids # Age: 40.0 - assert "S9" in sample_ids # Age: 45.0 - assert "S10" in sample_ids # Age: 52.0 + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids class TestNumericalFilteringRanges: @@ -111,10 +110,10 @@ def test_simple_range(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S3" in sample_ids # Weight: 30 - assert "S4" in sample_ids # Weight: 35 - assert "S5" in sample_ids # Weight: 40 - assert "S6" in sample_ids # Weight: 45 + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids def test_range_boundaries_inclusive(self, sample_tsv_with_numeric_data): """Test that range boundaries are inclusive.""" @@ -123,9 +122,9 @@ def test_range_boundaries_inclusive(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S4" in sample_ids # Age: 20 (lower boundary) - assert "S5" in sample_ids # Age: 25 - assert "S6" in sample_ids # Age: 30 (upper boundary) + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids def test_range_on_weight_column(self, sample_tsv_with_numeric_data): """Test range filtering on Weight column (no semicolons, pure numeric).""" 
@@ -134,10 +133,10 @@ def test_range_on_weight_column(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S5" in sample_ids # Weight: 40.0 - assert "S6" in sample_ids # Weight: 45.0 - assert "S7" in sample_ids # Weight: 50.0 - assert "S8" in sample_ids # Weight: 55.0 + assert "S5" in sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids def test_narrow_range(self, sample_tsv_with_numeric_data): """Test a narrow range returns only matching samples.""" @@ -146,9 +145,9 @@ def test_narrow_range(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S4" in sample_ids # Age: 20.0 - assert "S5" in sample_ids # Age: 25.0 - assert "S6" in sample_ids # Age: 30.0 + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids class TestNumericalFilteringDiscreteValues: @@ -161,7 +160,7 @@ def test_single_discrete_value(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 1 - assert "S7" in sample_ids # Weight: 50 + assert "S7" in sample_ids def test_multiple_discrete_values(self, sample_tsv_with_numeric_data): """Test filtering with multiple discrete values (OR logic).""" @@ -170,9 +169,9 @@ def test_multiple_discrete_values(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S1" in sample_ids # Weight: 20 - assert "S3" in sample_ids # Weight: 30 - assert "S7" in sample_ids # Weight: 50 + assert "S1" in sample_ids + assert "S3" in sample_ids + assert "S7" in sample_ids def test_discrete_values_with_semicolon_separated_cells(self, sample_tsv_with_numeric_data): """Test discrete value filtering on Population column (string column with semicolons).""" @@ -181,10 +180,10 @@ def test_discrete_values_with_semicolon_separated_cells(self, 
sample_tsv_with_nu sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S1" in sample_ids # Population: 1 - assert "S3" in sample_ids # Population: 3 - assert "S5" in sample_ids # Population: 5 - assert "S7" in sample_ids # Population: 1;3;5 (string matches "1", "3", and "5") + assert "S1" in sample_ids + assert "S3" in sample_ids + assert "S5" in sample_ids + assert "S7" in sample_ids def test_discrete_values_match_any_semicolon_value(self, sample_tsv_with_numeric_data): """Test that discrete filtering matches if ANY semicolon value matches (string matching on Population).""" @@ -193,8 +192,8 @@ def test_discrete_values_match_any_semicolon_value(self, sample_tsv_with_numeric sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 2 - assert "S2" in sample_ids # Population: 2;4 (string "4" matches) - assert "S4" in sample_ids # Population: 4 + assert "S2" in sample_ids + assert "S4" in sample_ids def test_discrete_age_values(self, sample_tsv_with_numeric_data): """Test discrete filtering on Age column.""" @@ -203,6 +202,74 @@ def test_discrete_age_values(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S2" in sample_ids # Age: 10 - assert "S5" in sample_ids # Age: 25 - assert "S8" in sample_ids # Age: 40 + assert "S2" in sample_ids + assert "S5" in sample_ids + assert "S8" in sample_ids + + +class TestSemicolonSeparatedNumericFiltering: + """Test that inequalities and ranges work on columns with semicolon-separated numeric values.""" + + def test_inequality_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): + """Test that > operator works on Population column (semicolon-separated numbers).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:>4") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 5 + assert "S5" in 
sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + + def test_inequality_less_than_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): + """Test that < operator works on Population column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:<3") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids + + def test_range_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): + """Test that range filtering works on Population column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:3-6") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 6 + assert "S2" in sample_ids + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + + def test_combined_inequality_and_discrete_on_semicolon_separated(self, sample_tsv_with_numeric_data): + """Test combining inequality and discrete values on Population column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:>6,2") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S2" in sample_ids + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + + def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric_data): + """Test that range boundaries work correctly with semicolon-separated values.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:1-3") + + sample_ids = 
result.get_unique_values("Sample_ID") + assert len(sample_ids) == 5 + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S3" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids From b64ebdbf738e9167c8a5d99711e370db8ec69f75 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 11:00:49 +0100 Subject: [PATCH 014/100] Add test to assert that mixed-type error is raised For semicolon-separated columns. --- .../test_sample_metadata_queries.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 8713b745..229bbdeb 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -5,6 +5,7 @@ import pytest from divbase_api.services.queries import SidecarQueryManager +from divbase_lib.exceptions import SidecarInvalidFilterError @pytest.fixture @@ -31,6 +32,23 @@ def sample_tsv_with_numeric_data(tmp_path): return tsv_file +@pytest.fixture +def sample_tsv_with_unsupported_mixed_type_data(tmp_path): + """ + Create a temporary TSV file with a column that containes + mixed numeric and non-numeric values to test that this correctly raises an error. 
+ """ + tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea +S1\t1\t20.0\t5.0\tNorth +S2\t2;four;5\t25.0\t10\tEast +S3\t3\t30.0\t15\tWest;South + +""" + tsv_file = tmp_path / "test_metadata_unsupported_values.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + class TestNumericalFilteringInequalities: """Test inequality operators on numerical columns.""" @@ -273,3 +291,9 @@ def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric assert "S3" in sample_ids assert "S7" in sample_ids assert "S8" in sample_ids + + def test_raise_exception_on_mixed_type_column_value(self, sample_tsv_with_unsupported_mixed_type_data): + """Test that a column with mixed numeric and non-numeric values raises an error.""" + manager = SidecarQueryManager(file=sample_tsv_with_unsupported_mixed_type_data) + with pytest.raises(SidecarInvalidFilterError, match="Column 'Population' contains mixed types"): + manager.run_query(filter_string="Population:>2") From 963acdd7c2afb19acb08a3f1bef3e778213a310b Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 11:18:38 +0100 Subject: [PATCH 015/100] Ensure that mixed-type error is propagated to user Return 400 and print error message with details about the mixed numeric and string types in the column. 
--- packages/divbase-api/src/divbase_api/routes/queries.py | 5 +++++ packages/divbase-api/src/divbase_api/services/queries.py | 2 +- tests/unit/divbase_api/test_sample_metadata_queries.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/routes/queries.py b/packages/divbase-api/src/divbase_api/routes/queries.py index 1d1c8481..0b2c580d 100644 --- a/packages/divbase-api/src/divbase_api/routes/queries.py +++ b/packages/divbase-api/src/divbase_api/routes/queries.py @@ -29,6 +29,7 @@ SampleMetadataQueryRequest, SampleMetadataQueryTaskResult, ) +from divbase_lib.exceptions import SidecarInvalidFilterError logging.basicConfig(level=settings.api.log_level, handlers=[logging.StreamHandler(sys.stderr)]) @@ -81,6 +82,10 @@ async def sample_metadata_query( # TODO - consider if we split this into 2 routes to handle time out issues on CLI side. # Route 1, create job and get back job id. # Route 2, get job result by id (with status etc), CLI can poll until done. + except SidecarInvalidFilterError as e: + # Catch invalid filter errors (e.g., mixed types in columns) and return 400 + error_message = str(e) + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=error_message) from None except VCFDimensionsEntryMissingError: # Catch and raise anew to avoid duplications in the error message raise VCFDimensionsEntryMissingError(project_name=project.name) from None diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 7b1d4d5f..6e570728 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -756,7 +756,7 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: if has_numeric_type and has_non_numeric_type: raise SidecarInvalidFilterError( - f"Column '{key}' contains mixed types. 
Value '{cell_str}' at row {row_index} " + f"Column '{key}' in the metadata file contains mixed types. Value '{cell_str}' at row {row_index} " f"has both numeric and non-numeric parts. All values in a column must be consistently " f"numeric or string for filtering to work correctly." ) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 229bbdeb..57cea555 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -295,5 +295,7 @@ def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric def test_raise_exception_on_mixed_type_column_value(self, sample_tsv_with_unsupported_mixed_type_data): """Test that a column with mixed numeric and non-numeric values raises an error.""" manager = SidecarQueryManager(file=sample_tsv_with_unsupported_mixed_type_data) - with pytest.raises(SidecarInvalidFilterError, match="Column 'Population' contains mixed types"): + with pytest.raises( + SidecarInvalidFilterError, match="Column 'Population' in the metadata file contains mixed types" + ): manager.run_query(filter_string="Population:>2") From e58486c6ee1cf6185fe7857d7337cca023d5546a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 11:34:27 +0100 Subject: [PATCH 016/100] Add tests for string value filters String filtering has fewer set operations than the numerical filters (inequalities, range, discrete) since it only takes discrete values. It will thus need less test coverage for now. 
--- .../test_sample_metadata_queries.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 57cea555..61d2409f 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -299,3 +299,47 @@ def test_raise_exception_on_mixed_type_column_value(self, sample_tsv_with_unsupp SidecarInvalidFilterError, match="Column 'Population' in the metadata file contains mixed types" ): manager.run_query(filter_string="Population:>2") + + +class TestStringColumnFiltering: + """Test string column filtering with single and semicolon-separated values.""" + + def test_single_string_value_column(self, sample_tsv_with_edge_cases): + """Test filtering on a string column with single values (no semicolons).""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleString:West") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S1" in sample_ids + + def test_single_string_value_column_multiple_filters(self, sample_tsv_with_edge_cases): + """Test filtering on a single-value string column with multiple filter values (OR logic).""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleString:West,North") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S1" in sample_ids + assert "S2" in sample_ids + + def test_semicolon_separated_string_column(self, sample_tsv_with_numeric_data): + """Test filtering on string column with semicolon-separated values (Area column).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Area:West") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S3" 
in sample_ids + assert "S4" in sample_ids + assert "S8" in sample_ids + + def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numeric_data): + """Test that filtering matches if ANY semicolon-separated value matches.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Area:South") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S3" in sample_ids + assert "S7" in sample_ids From 70ea81d6133eb0dfa7ef24ef49b0c79a5e5d5938 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 11:49:55 +0100 Subject: [PATCH 017/100] Add fixture column that only has single values And tests that act on that column. This ensures that the query logic can handle columns that contain some semicolon-separated values and columns with no semicolon-separated values --- .../test_sample_metadata_queries.py | 58 +++++++++++++++---- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 61d2409f..74e018e7 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -15,17 +15,18 @@ def sample_tsv_with_numeric_data(tmp_path): Includes semicolon-separated values in some cells. Includes both int and float numeric values to test that both are detected as numeric. """ - tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea -S1\t1\t20.0\t5.0\tNorth -S2\t2;4\t25.0\t10\tEast -S3\t3\t30.0\t15\tWest;South -S4\t4\t35.0\t20\tWest -S5\t5\t40.0\t25\tNorth -S6\t6\t45.0\t30\tEast -S7\t1;3;5\t50.0\t35\tSouth -S8\t2\t55.0\t40\tWest -S9\t7\t62.0\t45\tNorth -S10\t8\t70.0\t52\tEast + # Keep indentation like this to ensure that leading spaces in column 1 does not cause issues. 
+ tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber +S1\t1\t20.0\t5.0\tNorth\t100 +S2\t2;4\t25.0\t10\tEast\t200 +S3\t3\t30.0\t15\tWest;South\t300 +S4\t4\t35.0\t20\tWest\t400 +S5\t5\t40.0\t25\tNorth\t500 +S6\t6\t45.0\t30\tEast\t600 +S7\t1;3;5\t50.0\t35\tSouth\t700 +S8\t2\t55.0\t40\tWest\t800 +S9\t7\t62.0\t45\tNorth\t900 +S10\t8\t70.0\t52\tEast\t1000 """ tsv_file = tmp_path / "test_metadata.tsv" tsv_file.write_text(tsv_content) @@ -117,6 +118,18 @@ def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): assert "S9" in sample_ids assert "S10" in sample_ids + def test_inequality_on_single_numeric_value_column(self, sample_tsv_with_numeric_data): + """Test inequality on single-value numeric column (that does not have semicolon separated values).""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="SingleNumber:>600") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S7" in sample_ids + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + class TestNumericalFilteringRanges: """Test range filtering on numerical columns.""" @@ -167,6 +180,19 @@ def test_narrow_range(self, sample_tsv_with_numeric_data): assert "S5" in sample_ids assert "S6" in sample_ids + def test_range_on_single_numeric_value_column(self, sample_tsv_with_numeric_data): + """Test range filtering on single-value numeric column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="SingleNumber:350-850") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 5 + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids + class TestNumericalFilteringDiscreteValues: """Test discrete value filtering on numerical columns.""" @@ -224,6 +250,16 @@ def 
test_discrete_age_values(self, sample_tsv_with_numeric_data): assert "S5" in sample_ids assert "S8" in sample_ids + def test_discrete_on_single_numeric_value_column_(self, sample_tsv_with_numeric_data): + """Test discrete value filtering on single-value numeric column.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="SingleNumber:100,600") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S1" in sample_ids + assert "S6" in sample_ids + class TestSemicolonSeparatedNumericFiltering: """Test that inequalities and ranges work on columns with semicolon-separated numeric values.""" From 27196116e5dbcc97785875b675a91a82cea43b32 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 14:32:43 +0100 Subject: [PATCH 018/100] Update fixture with SingleString and adapt tests --- .../test_sample_metadata_queries.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 74e018e7..b7516a65 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -16,17 +16,17 @@ def sample_tsv_with_numeric_data(tmp_path): numeric values to test that both are detected as numeric. """ # Keep indentation like this to ensure that leading spaces in column 1 does not cause issues. 
- tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber -S1\t1\t20.0\t5.0\tNorth\t100 -S2\t2;4\t25.0\t10\tEast\t200 -S3\t3\t30.0\t15\tWest;South\t300 -S4\t4\t35.0\t20\tWest\t400 -S5\t5\t40.0\t25\tNorth\t500 -S6\t6\t45.0\t30\tEast\t600 -S7\t1;3;5\t50.0\t35\tSouth\t700 -S8\t2\t55.0\t40\tWest\t800 -S9\t7\t62.0\t45\tNorth\t900 -S10\t8\t70.0\t52\tEast\t1000 + tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString +S1\t1\t20.0\t5.0\tNorth\t100\tString +S2\t2;4\t25.0\t10\tEast\t200\tStrings +S3\t3\t30.0\t15\tWest;South;East\t300\tSting +S4\t4\t35.0\t20\tWest\t400\tStings +S5\t5\t40.0\t25\tNorth\t500\tThing +S6\t6\t45.0\t30\tEast\t600\tThings +S7\t1;3;5\t50.0\t35\tSouth\t700\tStrong +S8\t2\t55.0\t40\tWest\t800\tStrung +S9\t7\t62.0\t45\tNorth\t900\tStang +S10\t8\t70.0\t52\tEast\t1000\tSong """ tsv_file = tmp_path / "test_metadata.tsv" tsv_file.write_text(tsv_content) @@ -340,19 +340,19 @@ def test_raise_exception_on_mixed_type_column_value(self, sample_tsv_with_unsupp class TestStringColumnFiltering: """Test string column filtering with single and semicolon-separated values.""" - def test_single_string_value_column(self, sample_tsv_with_edge_cases): + def test_single_string_value_column(self, sample_tsv_with_numeric_data): """Test filtering on a string column with single values (no semicolons).""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="SingleString:West") + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="SingleString:String") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 1 assert "S1" in sample_ids - def test_single_string_value_column_multiple_filters(self, sample_tsv_with_edge_cases): + def test_single_string_value_column_multiple_filters(self, sample_tsv_with_numeric_data): """Test filtering on a single-value string column with multiple filter values (OR 
logic).""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="SingleString:West,North") + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="SingleString:String,Strings") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 2 @@ -366,9 +366,9 @@ def test_semicolon_separated_string_column(self, sample_tsv_with_numeric_data): sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 - assert "S3" in sample_ids assert "S4" in sample_ids assert "S8" in sample_ids + assert "S3" in sample_ids def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numeric_data): """Test that filtering matches if ANY semicolon-separated value matches.""" From 3730a5c1d6c29ba2ad1e342f2bdd36ec2c67544e Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 14:49:19 +0100 Subject: [PATCH 019/100] Add edge-case fixture and tests Some of these tests fail at the moment, which shows spots where the SidecarQueryManager's type inference and handling should either be improved or clarified in the docs. --- .../test_sample_metadata_queries.py | 119 +++++++++++++++--- 1 file changed, 102 insertions(+), 17 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index b7516a65..8337e80f 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -34,18 +34,23 @@ def sample_tsv_with_numeric_data(tmp_path): @pytest.fixture -def sample_tsv_with_unsupported_mixed_type_data(tmp_path): +def sample_tsv_with_edge_cases(tmp_path): """ - Create a temporary TSV file with a column that containes - mixed numeric and non-numeric values to test that this correctly raises an error. 
- """ - tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea -S1\t1\t20.0\t5.0\tNorth -S2\t2;four;5\t25.0\t10\tEast -S3\t3\t30.0\t15\tWest;South + Create a temporary TSV file to test the 4 specific edge cases: + 1. "string;string;string" - OK (pure strings) + 2. "string,string;string;string" - OK (strings with commas are allowed) + 3. "1;3,5" - FAIL (mixed type: numbers with comma are treated as string since comma is not a numeric character) + 4. "1;two;5" - FAIL (mixed numeric and non-numeric should raise exception) + Also includes cases for string values containing numbers. + """ + tsv_content = """#Sample_ID\tPureStrings\tStringsWithCommas\tNumbersWithComma\tMixedTypes\tSingleString\tSingleNumber +S1\tNorth;South;East\tRegion1,Area1;Region2,Area2;Region3,Area3\t1;3,5\t1;two;5\tWest\t100 +S2\tWest;East;North\tZone1,Subzone1;Zone2,Subzone2\t2;4,6\t2;three;6\tNorth\t200 +S3\tSouth\tCity1,District1\t3\t3\tEast\t300 +S4\t1string\tstring2string\tstring3\tstring4\tString5\t400 """ - tsv_file = tmp_path / "test_metadata_unsupported_values.tsv" + tsv_file = tmp_path / "test_metadata_edge_cases.tsv" tsv_file.write_text(tsv_content) return tsv_file @@ -328,14 +333,6 @@ def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric assert "S7" in sample_ids assert "S8" in sample_ids - def test_raise_exception_on_mixed_type_column_value(self, sample_tsv_with_unsupported_mixed_type_data): - """Test that a column with mixed numeric and non-numeric values raises an error.""" - manager = SidecarQueryManager(file=sample_tsv_with_unsupported_mixed_type_data) - with pytest.raises( - SidecarInvalidFilterError, match="Column 'Population' in the metadata file contains mixed types" - ): - manager.run_query(filter_string="Population:>2") - class TestStringColumnFiltering: """Test string column filtering with single and semicolon-separated values.""" @@ -379,3 +376,91 @@ def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numer assert 
len(sample_ids) == 2 assert "S3" in sample_ids assert "S7" in sample_ids + + +class TestEdgeCases: + """Edge case tests for SidecarQueryManager filtering.""" + + def test_mixed_types_should_fail(self, sample_tsv_with_edge_cases): + """Test that a column with mixed numeric and non-numeric values raises an error.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="MixedTypes:1") + + def test_strings_with_commas(self, sample_tsv_with_edge_cases): + """Test that a column with strings containing commas is correctly handled.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="StringsWithCommas:Region1") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + + def test_numbers_with_comma(self, sample_tsv_with_edge_cases): + """Test that a column with numeric values containing commas is treated as string type (since comma is not a numeric character).""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="NumbersWithComma:1") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + + def test_string_with_numbers_in_value(self, sample_tsv_with_edge_cases): + """Test that values like "1string", "string2string", "string3" are correctly inferred as strings.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="PureStrings:1string") + sample_ids = result.get_unique_values("Sample_ID") + assert "S4" in sample_ids + + def test_string_with_numbers_in_other_columns(self, sample_tsv_with_edge_cases): + """Test that values with numbers in other columns do not affect type inference of a string column.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="StringsWithCommas:string2string") + sample_ids = 
result.get_unique_values("Sample_ID") + assert "S4" in sample_ids + result2 = manager.run_query(filter_string="NumbersWithComma:string3") + sample_ids2 = result2.get_unique_values("Sample_ID") + assert "S4" in sample_ids2 + result3 = manager.run_query(filter_string="MixedTypes:string4") + sample_ids3 = result3.get_unique_values("Sample_ID") + assert "S4" in sample_ids3 + + def test_multi_column_single_string_and_single_number(self, sample_tsv_with_edge_cases): + """Test that filtering on two valid single-value columns (SingleString and SingleNumber) will pass.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleString:String5;SingleNumber:400") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids + + def test_multi_column_single_string_and_mixed_types_should_fail(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleString (valid) and MixedTypes (invalid) will fail due to mixed types.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="SingleString:String5;MixedTypes:string4") + + def test_multi_column_single_number_and_mixed_types_should_fail(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleNumber (valid) and MixedTypes (invalid) will fail due to mixed types.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="SingleNumber:400;MixedTypes:string4") + + def test_multi_column_single_string_and_pure_strings(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleString and PureStrings (both valid string columns) will pass.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleString:String5;PureStrings:1string") + sample_ids = 
result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids + + def test_multi_column_single_number_and_numbers_with_comma(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleNumber (numeric) and NumbersWithComma (treated as string due to comma) will pass.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleNumber:400;NumbersWithComma:string3") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids + + def test_multi_column_single_string_and_strings_with_commas(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleString and StringsWithCommas (both string columns) will pass.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="SingleString:String5;StringsWithCommas:string2string") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids From 628dea6e5018cb71a9ea3bf7e2c0b5b9b27129d4 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 9 Feb 2026 17:08:57 +0100 Subject: [PATCH 020/100] Drop support of commas in TSV values Add helper methods to keep the checks DRY, and update fixture and tests. 
--- .../src/divbase_api/services/queries.py | 19 ++++++- .../test_sample_metadata_queries.py | 56 +++++++++---------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 6e570728..73a2a244 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -711,6 +711,19 @@ def load_file(self) -> "SidecarQueryManager": raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self + def _validate_no_commas_in_column(self, key: str) -> None: + """ + Helper method to validate that column values in the imported TSV does not contain commas. + Raises SidecarInvalidFilterError if any comma is found in the column values. + """ + for row_index, cell_value in enumerate(self.df[key].dropna()): + cell_str = str(cell_value).strip() + if cell_str and "," in cell_str: + raise SidecarInvalidFilterError( + f"Column '{key}' contains commas in value '{cell_str}' at row {row_index}. " + f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." + ) + def _is_semicolon_separated_numeric_column(self, key: str) -> bool: """ Helper method to detect if a column contains semicolon-separated numeric values. @@ -726,7 +739,7 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: Returns True if the column contains ONLY numeric values (with or without semicolons). Returns False if the column contains ONLY non-numeric values (regular string column), or if the column is empty. - Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column). + Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column) or if commas are found. 
""" if key not in self.df.columns: return False @@ -735,6 +748,8 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: if len(non_null_values) == 0: return False + self._validate_no_commas_in_column(key) + has_numeric_type = False has_non_numeric_type = False @@ -990,6 +1005,8 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # Supports filtering on semicolon-separated values in cells in the TSV: e.g. "North;West" filter_string_values_list = filter_string_values.split(",") + self._validate_no_commas_in_column(key) + condition = self.df[key].apply( lambda cell_value, target_filter_values=filter_string_values_list: ( False diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 8337e80f..79409950 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -36,19 +36,19 @@ def sample_tsv_with_numeric_data(tmp_path): @pytest.fixture def sample_tsv_with_edge_cases(tmp_path): """ - Create a temporary TSV file to test the 4 specific edge cases: + Create a temporary TSV file to test edge cases: 1. "string;string;string" - OK (pure strings) - 2. "string,string;string;string" - OK (strings with commas are allowed) - 3. "1;3,5" - FAIL (mixed type: numbers with comma are treated as string since comma is not a numeric character) - 4. "1;two;5" - FAIL (mixed numeric and non-numeric should raise exception) + 2. "1;two;5" - FAIL (mixed numeric and non-numeric should raise exception) + 3. String values containing numbers like "1string" - OK (inferred as string) + 4. Column with commas should raise SidecarInvalidFilterError - Also includes cases for string values containing numbers. + Commas are NOT allowed in divbase TSV format. 
""" - tsv_content = """#Sample_ID\tPureStrings\tStringsWithCommas\tNumbersWithComma\tMixedTypes\tSingleString\tSingleNumber -S1\tNorth;South;East\tRegion1,Area1;Region2,Area2;Region3,Area3\t1;3,5\t1;two;5\tWest\t100 -S2\tWest;East;North\tZone1,Subzone1;Zone2,Subzone2\t2;4,6\t2;three;6\tNorth\t200 -S3\tSouth\tCity1,District1\t3\t3\tEast\t300 -S4\t1string\tstring2string\tstring3\tstring4\tString5\t400 + tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tWithCommas +S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth,South +S2\tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\West,East +S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth, +S4\t1string\tstring4\tString5\t400\tHumlebæk\t,South """ tsv_file = tmp_path / "test_metadata_edge_cases.tsv" tsv_file.write_text(tsv_content) @@ -381,6 +381,13 @@ def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numer class TestEdgeCases: """Edge case tests for SidecarQueryManager filtering.""" + def test_column_with_commas_raises(self, sample_tsv_with_edge_cases): + """Test that a column containing commas raises SidecarInvalidFilterError.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + with pytest.raises(SidecarInvalidFilterError) as excinfo: + manager.run_query(filter_string="WithCommas:foo") + assert "contains commas" in str(excinfo.value) + def test_mixed_types_should_fail(self, sample_tsv_with_edge_cases): """Test that a column with mixed numeric and non-numeric values raises an error.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) @@ -402,24 +409,19 @@ def test_numbers_with_comma(self, sample_tsv_with_edge_cases): assert "S1" in sample_ids def test_string_with_numbers_in_value(self, sample_tsv_with_edge_cases): - """Test that values like "1string", "string2string", "string3" are correctly inferred as strings.""" + """Test that values like '1string' are correctly inferred as strings.""" manager 
= SidecarQueryManager(file=sample_tsv_with_edge_cases) result = manager.run_query(filter_string="PureStrings:1string") sample_ids = result.get_unique_values("Sample_ID") assert "S4" in sample_ids - def test_string_with_numbers_in_other_columns(self, sample_tsv_with_edge_cases): - """Test that values with numbers in other columns do not affect type inference of a string column.""" + def test_unicode_string_filtering(self, sample_tsv_with_edge_cases): + """Test that any query on UnicodeStrings raises SidecarInvalidFilterError if any cell contains a comma.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="StringsWithCommas:string2string") - sample_ids = result.get_unique_values("Sample_ID") - assert "S4" in sample_ids - result2 = manager.run_query(filter_string="NumbersWithComma:string3") - sample_ids2 = result2.get_unique_values("Sample_ID") - assert "S4" in sample_ids2 - result3 = manager.run_query(filter_string="MixedTypes:string4") - sample_ids3 = result3.get_unique_values("Sample_ID") - assert "S4" in sample_ids3 + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="UnicodeStrings:Göteborg") + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="UnicodeStrings:Malmö") def test_multi_column_single_string_and_single_number(self, sample_tsv_with_edge_cases): """Test that filtering on two valid single-value columns (SingleString and SingleNumber) will pass.""" @@ -457,10 +459,8 @@ def test_multi_column_single_number_and_numbers_with_comma(self, sample_tsv_with assert len(sample_ids) == 1 assert "S4" in sample_ids - def test_multi_column_single_string_and_strings_with_commas(self, sample_tsv_with_edge_cases): - """Test that filtering on SingleString and StringsWithCommas (both string columns) will pass.""" + def test_multi_column_with_unicode(self, sample_tsv_with_edge_cases): + """Test that multi-column filtering works with unicode strings, but 
raises error if commas are present.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="SingleString:String5;StringsWithCommas:string2string") - sample_ids = result.get_unique_values("Sample_ID") - assert len(sample_ids) == 1 - assert "S4" in sample_ids + with pytest.raises(SidecarInvalidFilterError): + manager.run_query(filter_string="UnicodeStrings:København;SingleNumber:300") From 88fdc26192732897441d05891cdffc643ae354c5 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 10:29:54 +0100 Subject: [PATCH 021/100] Refactor run_query() into helper methods To make the code easier to read, and a little more DRY. Also took the opportunity to change the lambda functions used to create filters for numerical operations on the dataframe into named nested functions. This should hopefully be a little more readable and still comply with the Ruff linter. --- .../src/divbase_api/services/queries.py | 368 ++++++++++-------- 1 file changed, 204 insertions(+), 164 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 73a2a244..3e4637e2 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -711,72 +711,18 @@ def load_file(self) -> "SidecarQueryManager": raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self - def _validate_no_commas_in_column(self, key: str) -> None: - """ - Helper method to validate that column values in the imported TSV does not contain commas. - Raises SidecarInvalidFilterError if any comma is found in the column values. - """ - for row_index, cell_value in enumerate(self.df[key].dropna()): - cell_str = str(cell_value).strip() - if cell_str and "," in cell_str: - raise SidecarInvalidFilterError( - f"Column '{key}' contains commas in value '{cell_str}' at row {row_index}. 
" - f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." - ) - - def _is_semicolon_separated_numeric_column(self, key: str) -> bool: + def get_unique_values(self, column: str) -> list: """ - Helper method to detect if a column contains semicolon-separated numeric values. - Pandas correctly infers type from single-value columns. But for columns with semicolon-separated values - (e.g.: "1;2;3"), it infers them as strings (object dtype). This is an issue since numeric operations - (inequalities, ranges) cannot be performed on string values. - - This helper method checks ALL non-null values in the column to determine if they can be parsed as numeric - after splitting by semicolon. Note that it only detects if a column value is semicolon-separated numeric, it does not - convert the column to numeric type. It also validates that the colum value is not of mixed string-numerical type, which - is invalid input for the query system. The actual parsing and handling of the semicolon-separated numeric values is - done in the lamda functions in the run_query() method. - - Returns True if the column contains ONLY numeric values (with or without semicolons). - Returns False if the column contains ONLY non-numeric values (regular string column), or if the column is empty. - Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column) or if commas are found. + Method to fetch unique values from a specific column in the query result. Intended to be invoked on a SidecarQueryManager + instance after a query has been run with run_query(). 
""" - if key not in self.df.columns: - return False - - non_null_values = self.df[key].dropna() - if len(non_null_values) == 0: - return False - - self._validate_no_commas_in_column(key) - - has_numeric_type = False - has_non_numeric_type = False - - for row_index, cell_value in enumerate(non_null_values): - cell_str = str(cell_value).strip() - if not cell_str: - continue - - parts = cell_str.split(";") - for part in parts: - part = part.strip() - if not part: - continue - try: - float(part) - has_numeric_type = True - except ValueError: - has_non_numeric_type = True - - if has_numeric_type and has_non_numeric_type: - raise SidecarInvalidFilterError( - f"Column '{key}' in the metadata file contains mixed types. Value '{cell_str}' at row {row_index} " - f"has both numeric and non-numeric parts. All values in a column must be consistently " - f"numeric or string for filtering to work correctly." - ) + if self.query_result is None: + raise SidecarColumnNotFoundError("No query result available. Run run_query() first.") - return has_numeric_type and not has_non_numeric_type + if column in self.query_result.columns: + return self.query_result[column].unique().tolist() + else: + raise SidecarColumnNotFoundError(f"Column '{column}' not found in query result") def run_query(self, filter_string: str = None) -> "SidecarQueryManager": """ @@ -877,36 +823,7 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": if inequality_match: operator = inequality_match.group(1) threshold = float(inequality_match.group(2)) - - condition = self.df[key].apply( - lambda cell_value, op=operator, thresh=threshold: ( - ( - False - if pd.isna(cell_value) - else any( - ( - cell_value_num > thresh - if op == ">" - else cell_value_num >= thresh - if op == ">=" - else cell_value_num < thresh - if op == "<" - else cell_value_num <= thresh - ) - for cell_value_num in ( - # convert to numeric type after splitting by semicolon - float(cell_value_str) if "." 
in cell_value_str else int(cell_value_str) - for cell_value_str in str(cell_value).split(";") - if cell_value_str.strip() - ) - ) - if not pd.isna(cell_value) - else False - ) - if not pd.isna(cell_value) - else False - ) - ) + condition = self._create_inequality_condition(key, operator, threshold) inequality_conditions.append(condition) logger.debug(f"Applied inequality filter on '{key}': {operator} {threshold}") continue @@ -916,30 +833,14 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": if range_match: min_val = float(range_match.group(1)) max_val = float(range_match.group(2)) - - condition = self.df[key].apply( - lambda cell_value, min_v=min_val, max_v=max_val: ( - False - if pd.isna(cell_value) - else any( - min_v <= cell_value_num <= max_v - for cell_value_num in ( - # convert to numeric type after splitting by semicolon - float(cell_value_str) if "." in cell_value_str else int(cell_value_str) - for cell_value_str in str(cell_value).split(";") - if cell_value_str.strip() - ) - ) - ) - ) + condition = self._create_range_condition(key, min_val, max_val) range_conditions.append(condition) logger.debug(f"Applied range filter on '{key}': {min_val} to {max_val}") continue # Otherwise, treat as discrete value try: - # Collect all discrete values from the filter string as the "for filter_string_value in filter_string_values_list" loop progresses. - # The lambda function for discrete values is below and will run once for all collected discrete values. + # Collect all discrete values from the filter string as the loop progresses. discrete_values.append( float(filter_string_value) if "." in filter_string_value else int(filter_string_value) ) @@ -948,48 +849,21 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": f"Cannot convert '{filter_string_value}' to numeric for column '{key}'. Skipping this value." 
) - # If multiple conditions (inequality, range, discrete values), combine them with with OR logic - # In short, this builds a new boolean filter from the combined bools that Pandas will apply to the dataframe column + # Combine multiple conditions (inequality, range, discrete values) with OR logic conditions = [] if inequality_conditions: - # Seperately combine multiple inequalities with OR - combined_inequalities = inequality_conditions[0] - for cond in inequality_conditions[1:]: - # Compare bools pairwise. As long as one of the two are true, set bool filter to true. The bar (|)is pandas syntax for element-wise OR between boolean Series. - combined_inequalities = combined_inequalities | cond - conditions.append(combined_inequalities) + conditions.append(self._combine_conditions_with_or(inequality_conditions)) if range_conditions: - # Seperately combine multiple ranges with OR - combined_ranges = range_conditions[0] - for cond in range_conditions[1:]: - combined_ranges = combined_ranges | cond - conditions.append(combined_ranges) + conditions.append(self._combine_conditions_with_or(range_conditions)) if discrete_values: - discrete_condition = self.df[key].apply( - lambda cell_value, target_filter_values=discrete_values: ( - False - if pd.isna(cell_value) - else any( - cell_value_num in target_filter_values - for cell_value_num in ( - float(cell_value_str) if "." 
in cell_value_str else int(cell_value_str) - for cell_value_str in str(cell_value).split(";") - if cell_value_str.strip() - ) - ) - ) - ) + discrete_condition = self._create_discrete_numeric_condition(key, discrete_values) conditions.append(discrete_condition) if conditions: - # Combine inequalities, ranges, and discrete values with OR - combined = conditions[0] - for cond in conditions[1:]: - combined = combined | cond - + combined = self._combine_conditions_with_or(conditions) if not combined.any(): warning_msg = f"No values in column '{key}' match the filter: {filter_string_values}" logger.warning(warning_msg) @@ -1002,27 +876,18 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": self.warnings.append(warning_msg) else: # Non-numeric column: handle as discrete string values - # Supports filtering on semicolon-separated values in cells in the TSV: e.g. "North;West" filter_string_values_list = filter_string_values.split(",") - self._validate_no_commas_in_column(key) - condition = self.df[key].apply( - lambda cell_value, target_filter_values=filter_string_values_list: ( - False - if pd.isna(cell_value) - else any( - cell_value_str in target_filter_values for cell_value_str in str(cell_value).split(";") - ) - ) - ) + condition = self._create_string_condition(key, filter_string_values_list) if not condition.any(): warning_msg = f"None of the values {filter_string_values_list} were found in column '{key}'" logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(condition) except SidecarInvalidFilterError: - # Re-raise our custom exceptions without wrapping them + # Allow specific validation errors (like "contains commas") to propagate unchanged. + # This preserves detailed error messages for user-facing exceptions. raise except Exception as e: raise SidecarInvalidFilterError( @@ -1032,7 +897,9 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # 2. 
Apply the final boolean filters on the dataframe if filter_conditions: combined_condition = pd.Series(True, index=self.df.index) + # Iteratively combine each condition in filter_conditions to create a final combined condition where each row must satisfy all filter conditions to be included. for condition in filter_conditions: + # The ampersand (&) is pandas syntax for element-wise AND between boolean Series. combined_condition = combined_condition & condition self.query_result = self.df[combined_condition].copy() @@ -1046,15 +913,188 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": return self - def get_unique_values(self, column: str) -> list: + def _validate_no_commas_in_column(self, key: str) -> None: """ - Method to fetch unique values from a specific column in the query result. Intended to be invoked on a SidecarQueryManager - instance after a query has been run with run_query(). + Helper method to validate that column values in the imported TSV does not contain commas. + Raises SidecarInvalidFilterError if any comma is found in the column values. """ - if self.query_result is None: - raise SidecarColumnNotFoundError("No query result available. Run run_query() first.") + for row_index, cell_value in enumerate(self.df[key].dropna()): + cell_str = str(cell_value).strip() + if cell_str and "," in cell_str: + raise SidecarInvalidFilterError( + f"Column '{key}' contains commas in value '{cell_str}' at row {row_index}. " + f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." + ) - if column in self.query_result.columns: - return self.query_result[column].unique().tolist() - else: - raise SidecarColumnNotFoundError(f"Column '{column}' not found in query result") + def _is_semicolon_separated_numeric_column(self, key: str) -> bool: + """ + Helper method to detect if a column contains semicolon-separated numeric values. + Pandas correctly infers type from single-value columns. 
But for columns with semicolon-separated values + (e.g.: "1;2;3"), it infers them as strings (object dtype). This is an issue since numeric operations + (inequalities, ranges) cannot be performed on string values. + + This helper method checks ALL non-null values in the column to determine if they can be parsed as numeric + after splitting by semicolon. Note that it only detects if a column value is semicolon-separated numeric, it does not + convert the column to numeric type. It also validates that the colum value is not of mixed string-numerical type, which + is invalid input for the query system. The actual parsing and handling of the semicolon-separated numeric values is + done in the lamda functions in the run_query() method. + + Returns True if the column contains ONLY numeric values (with or without semicolons). + Returns False if the column contains ONLY non-numeric values (regular string column), or if the column is empty. + Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column) or if commas are found. + """ + if key not in self.df.columns: + return False + + non_null_values = self.df[key].dropna() + if len(non_null_values) == 0: + return False + + self._validate_no_commas_in_column(key) + + has_numeric_type = False + has_non_numeric_type = False + + for row_index, cell_value in enumerate(non_null_values): + cell_str = str(cell_value).strip() + if not cell_str: + continue + + parts = cell_str.split(";") + for part in parts: + part = part.strip() + if not part: + continue + try: + float(part) + has_numeric_type = True + except ValueError: + has_non_numeric_type = True + + if has_numeric_type and has_non_numeric_type: + raise SidecarInvalidFilterError( + f"Column '{key}' in the metadata file contains mixed types. Value '{cell_str}' at row {row_index} " + f"has both numeric and non-numeric parts. All values in a column must be consistently " + f"numeric or string for filtering to work correctly." 
+ ) + + return has_numeric_type and not has_non_numeric_type + + def _split_cell_values(self, cell_value: Any) -> list[str]: + """ + Helper method to split cell value by semicolon and return list of non-empty values. + If the cell contains a single value without semicolon, it will return a list with that single value. + If the cell is empty or NaN, it will return an empty list. + """ + if pd.isna(cell_value): + return [] + return [val.strip() for val in str(cell_value).split(";") if val.strip()] + + def _parse_numeric_value(self, value_str: str) -> float | int: + """Helper method to parse a string value to int or float. To be used when other checks have already confirmed that the value can be parsed as numeric.""" + return float(value_str) if "." in value_str else int(value_str) + + def _create_inequality_condition(self, key: str, operator: str, threshold: float) -> pd.Series: + """ + Helper method to create a condition for inequality filtering on a column. + Uses a named nested function instead of a lambda to improve readability to defined the + logic that will be applied to the Pandas dataframe. + """ + + def check_inequality(cell_value): + if pd.isna(cell_value): + return False + cell_values = self._split_cell_values(cell_value) + for val_str in cell_values: + try: + val_num = self._parse_numeric_value(val_str) + if ( + operator == ">" + and val_num > threshold + or operator == ">=" + and val_num >= threshold + or operator == "<" + and val_num < threshold + or operator == "<=" + and val_num <= threshold + ): + return True + except ValueError: + continue + return False + + return self.df[key].apply(check_inequality) + + def _create_range_condition(self, key: str, min_val: float, max_val: float) -> pd.Series: + """ + Helper method to create a condition for range filtering on a column. + Uses a named nested function instead of a lambda to improve readability to define the + logic that will be applied to the Pandas dataframe. 
+ """ + + def check_range(cell_value): + if pd.isna(cell_value): + return False + cell_values = self._split_cell_values(cell_value) + for val_str in cell_values: + try: + val_num = self._parse_numeric_value(val_str) + if min_val <= val_num <= max_val: + return True + except ValueError: + continue + return False + + return self.df[key].apply(check_range) + + def _create_discrete_numeric_condition(self, key: str, target_values: list[float | int]) -> pd.Series: + """ + Helper method to create a condition for discrete numeric value filtering on a column. + Uses a named nested function instead of a lambda to improve readability to define the + logic that will be applied to the Pandas dataframe. + """ + + def check_discrete(cell_value): + if pd.isna(cell_value): + return False + cell_values = self._split_cell_values(cell_value) + for val_str in cell_values: + try: + val_num = self._parse_numeric_value(val_str) + if val_num in target_values: + return True + except ValueError: + continue + return False + + return self.df[key].apply(check_discrete) + + def _create_string_condition(self, key: str, target_values: list[str]) -> pd.Series: + """ + Helper method to create a condition for string value filtering on a column. + Uses a named nested function instead of a lambda to improve readability to define the + logic that will be applied to the Pandas dataframe. + """ + + def check_string(cell_value): + if pd.isna(cell_value): + return False + cell_values = self._split_cell_values(cell_value) + return any(val in target_values for val in cell_values) + + return self.df[key].apply(check_string) + + def _combine_conditions_with_or(self, conditions: list[pd.Series]) -> pd.Series: + """ + Helper method to combine multiple Pandas boolean Series with OR logic. + Returns a single boolean Series that is True if any of the input conditions is True for each row. + + The resulting Series is used at the end of self.run_query() to filter the DataFrame values. 
+ """ + if not conditions: + return pd.Series(False, index=self.df.index) + combined = conditions[0] + for cond in conditions[1:]: + # The bar (|) is pandas syntax for element-wise OR between boolean Series. + combined = combined | cond + return combined From 15e320501e55c865ce95fd384dba74d73af447f8 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 10:42:40 +0100 Subject: [PATCH 022/100] Allow hyphens in str but not numeric TSV values Could be a common use case to have hyphens in string values, e.g. "North-East", "South-West". Numerical values are not supported and will raise an exception. --- .../src/divbase_api/services/queries.py | 9 +++++ .../test_sample_metadata_queries.py | 36 +++++++++++++------ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 3e4637e2..305a3b9e 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -965,6 +965,15 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: part = part.strip() if not part: continue + + # Check if the value contains a hyphen and looks like it could be numeric (e.g., "1-2", "3-4") + if "-" in part and any(p.isdigit() for p in part): + raise SidecarInvalidFilterError( + f"Column '{key}' contains value '{part}' with a hyphen at row {row_index}. " + f"Hyphens are not allowed in numeric column values (only in string columns). " + f"If this is meant to be a string column, all values should be non-numeric strings." 
+ ) + try: float(part) has_numeric_type = True diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 79409950..e18d83e4 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -44,11 +44,11 @@ def sample_tsv_with_edge_cases(tmp_path): Commas are NOT allowed in divbase TSV format. """ - tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tWithCommas -S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth,South -S2\tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\West,East -S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth, -S4\t1string\tstring4\tString5\t400\tHumlebæk\t,South + tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tWithCommas\tStringWithHyphen\tNumericalWithHyphen +S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth,South\tNorth-East\t1-2 +S2\tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tWest,East\tSouth-West\t2-3 +S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth,\tNorth-North-West\t3-4 +S4\t1string\tstring4\tString5\t400\tHumlebæk\t,South\tEast-South-East\t4-5 """ tsv_file = tmp_path / "test_metadata_edge_cases.tsv" tsv_file.write_text(tsv_content) @@ -416,12 +416,11 @@ def test_string_with_numbers_in_value(self, sample_tsv_with_edge_cases): assert "S4" in sample_ids def test_unicode_string_filtering(self, sample_tsv_with_edge_cases): - """Test that any query on UnicodeStrings raises SidecarInvalidFilterError if any cell contains a comma.""" + """Test that filtering for Unicode values like 'Göteborg' and 'Malmö' works and returns correct samples.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="UnicodeStrings:Göteborg") - with 
pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="UnicodeStrings:Malmö") + result = manager.run_query(filter_string="UnicodeStrings:Göteborg") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids def test_multi_column_single_string_and_single_number(self, sample_tsv_with_edge_cases): """Test that filtering on two valid single-value columns (SingleString and SingleNumber) will pass.""" @@ -463,4 +462,19 @@ def test_multi_column_with_unicode(self, sample_tsv_with_edge_cases): """Test that multi-column filtering works with unicode strings, but raises error if commas are present.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="UnicodeStrings:København;SingleNumber:300") + manager.run_query(filter_string="UnicodeStrings:København;WithCommas:North") + + def test_hyphens_allowed_in_string_values(self, sample_tsv_with_edge_cases): + """Test that hyphens are allowed in string values and can be filtered correctly.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="StringWithHyphen:South-West") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S2" in sample_ids + + def test_hyphens_in_numerical_column_raises(self, sample_tsv_with_edge_cases): + """Test that hyphens are allowed in string columns but not in numerical columns.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + with pytest.raises(SidecarInvalidFilterError) as excinfo: + manager.run_query(filter_string="NumericalWithHyphen:2") + assert "Column 'NumericalWithHyphen' contains value '1-2' with a hyphen at row 0." in str(excinfo.value) From 746a3b470340416187db44d191526c250bf21ce2 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 12:02:23 +0100 Subject: [PATCH 023/100] Support NOT filters with ! Big refactoring. 
The same processing needs to be applied to positive and to negative user-inputted filter values. NOT conditions are applied with AND logic after positive conditions: in the end, the rows must NOT match any negated values. To try to keep the code DRY and manageable, several helper methods have been added. --- .../src/divbase_api/services/queries.py | 262 +++++++++++++----- .../test_sample_metadata_queries.py | 141 ++++++++++ 2 files changed, 340 insertions(+), 63 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 305a3b9e..87f4c556 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -738,6 +738,12 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": - Discrete values: "25,30,50" (checks if any cell value matches any filter value) - All are combined with OR logic + Filtering using the ! (NOT) operator: + - "!" must prefix the filter value, e.g. "key:!value" means that rows with "value" in the "key" column should be excluded. + - Numeric examples: "Population:!2" (exclude 2), "Age:<30,!25" (less than 30 but not 25), "Weight:!20-40" (exclude range 20-40) + - String examples: "Area:!North" (exclude North), "Region:East,West,!South" (East or West but not South) + - NOT conditions are applied with AND logic after positive conditions have been applied: rows must NOT match any negated value + Filter string values in the query vs. cell values in the TSV: - Filter strings are handled per semicolon-separated key-value pair: in "key1:value1,value2;key2:value3,value4" "key1:value1,value2" is handled separately from "key2:value3,value4". @@ -800,70 +806,57 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # e.g., "Weight:>25,<30,50" or "Weight:20-40,50,>100" # Supports filtering on semicolon-separated values in cells in the TSV: e.g. 
"25;30;35" # Also handles columns that pandas infers as strings but contain numeric values with semicolons (e.g., "1;2;3") + # Also supports NOT operator with ! prefix: e.g., "Weight:!25" or "Weight:<4,!2" if is_numeric or is_semicolon_numeric: filter_string_values_list = filter_string_values.split(",") - inequality_conditions = [] - range_conditions = [] - discrete_values = [] - - for filter_string_value in filter_string_values_list: - filter_string_value = filter_string_value.strip() - - # Check for common mistakes: =< or => instead of <= or >= - if re.match(r"^=<\d+\.?\d*$", filter_string_value) or re.match( - r"^=>\d+\.?\d*$", filter_string_value - ): - raise SidecarInvalidFilterError( - f"Invalid operator format '{filter_string_value[:2]}' in filter '{key}:{filter_string_values}'." - f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" - ) - - # Check if it's an inequality (e.g., ">25", "<=40") - inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", filter_string_value) - if inequality_match: - operator = inequality_match.group(1) - threshold = float(inequality_match.group(2)) - condition = self._create_inequality_condition(key, operator, threshold) - inequality_conditions.append(condition) - logger.debug(f"Applied inequality filter on '{key}': {operator} {threshold}") - continue - - # Check if it's a range (e.g., "20-40") - range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", filter_string_value) - if range_match: - min_val = float(range_match.group(1)) - max_val = float(range_match.group(2)) - condition = self._create_range_condition(key, min_val, max_val) - range_conditions.append(condition) - logger.debug(f"Applied range filter on '{key}': {min_val} to {max_val}") - continue - - # Otherwise, treat as discrete value - try: - # Collect all discrete values from the filter string as the loop progresses. - discrete_values.append( - float(filter_string_value) if "." 
in filter_string_value else int(filter_string_value) - ) - except ValueError: - logger.warning( - f"Cannot convert '{filter_string_value}' to numeric for column '{key}'. Skipping this value." - ) - # Combine multiple conditions (inequality, range, discrete values) with OR logic - conditions = [] + # Negated values are those that start with "!" in the filter string + positive_values, negated_values = self._separate_positive_and_negated_values( + filter_values=filter_string_values_list + ) + + filter_context = { + "key": key, + "filter_string_values": filter_string_values, + "is_negated": False, + } + + inequality_conditions, range_conditions, discrete_values = self._parse_numeric_filter_values( + values_to_process=positive_values, + context=filter_context, + ) + + filter_context["is_negated"] = True + negated_inequality_conditions, negated_range_conditions, negated_discrete_values = ( + self._parse_numeric_filter_values( + values_to_process=negated_values, + context=filter_context, + ) + ) - if inequality_conditions: - conditions.append(self._combine_conditions_with_or(inequality_conditions)) + # Combine multiple conditions (inequality, range, discrete values) with OR logic + conditions = self._build_condition_list( + inequality_conditions=inequality_conditions, + range_conditions=range_conditions, + discrete_values=discrete_values, + key=key, + ) - if range_conditions: - conditions.append(self._combine_conditions_with_or(range_conditions)) + negated_conditions = self._build_condition_list( + inequality_conditions=negated_inequality_conditions, + range_conditions=negated_range_conditions, + discrete_values=negated_discrete_values, + key=key, + ) - if discrete_values: - discrete_condition = self._create_discrete_numeric_condition(key, discrete_values) - conditions.append(discrete_condition) + if conditions or negated_conditions: + # First combine all positive conditions with OR logic. Can be None if there are no positive conditions, e.g. 
if the filter string only contains negated conditions like "Weight:!20-40" + base_condition = self._combine_conditions_with_or(conditions=conditions) if conditions else None + # Then apply negated conditions with AND logic: rows must NOT match any negated condition. + combined = self._apply_not_conditions( + base_condition=base_condition, negated_conditions=negated_conditions + ) - if conditions: - combined = self._combine_conditions_with_or(conditions) if not combined.any(): warning_msg = f"No values in column '{key}' match the filter: {filter_string_values}" logger.warning(warning_msg) @@ -876,15 +869,36 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": self.warnings.append(warning_msg) else: # Non-numeric column: handle as discrete string values + # Supports NOT operator with ! prefix: e.g., "Area:!North" or "Area:North,!South" filter_string_values_list = filter_string_values.split(",") self._validate_no_commas_in_column(key) - condition = self._create_string_condition(key, filter_string_values_list) - if not condition.any(): - warning_msg = f"None of the values {filter_string_values_list} were found in column '{key}'" - logger.warning(warning_msg) - self.warnings.append(warning_msg) - filter_conditions.append(condition) + positive_values, negated_values = self._separate_positive_and_negated_values( + filter_values=filter_string_values_list + ) + + # Build condition + if positive_values or negated_values: + base_condition = ( + self._create_string_condition(key=key, target_values=positive_values) + if positive_values + else None + ) + negated_conditions = ( + [self._create_string_condition(key=key, target_values=negated_values)] + if negated_values + else [] + ) + + condition = self._apply_not_conditions( + base_condition=base_condition, negated_conditions=negated_conditions + ) + + if not condition.any(): + warning_msg = f"None of the values {filter_string_values_list} were found in column '{key}'" + logger.warning(warning_msg) + 
self.warnings.append(warning_msg) + filter_conditions.append(condition) except SidecarInvalidFilterError: # Allow specific validation errors (like "contains commas") to propagate unchanged. # This preserves detailed error messages for user-facing exceptions. @@ -1107,3 +1121,125 @@ def _combine_conditions_with_or(self, conditions: list[pd.Series]) -> pd.Series: # The bar (|) is pandas syntax for element-wise OR between boolean Series. combined = combined | cond return combined + + def _build_condition_list( + self, + inequality_conditions: list[pd.Series], + range_conditions: list[pd.Series], + discrete_values: list[float | int], + key: str, + ) -> list[pd.Series]: + """ + Helper method to build a list of conditions from inequality, range, and discrete value filters. + """ + conditions = [] + + if inequality_conditions: + conditions.append(self._combine_conditions_with_or(conditions=inequality_conditions)) + + if range_conditions: + conditions.append(self._combine_conditions_with_or(conditions=range_conditions)) + + if discrete_values: + discrete_condition = self._create_discrete_numeric_condition(key=key, target_values=discrete_values) + conditions.append(discrete_condition) + + return conditions + + def _parse_numeric_filter_values( + self, values_to_process: list[str], context: dict[str, str | bool] + ) -> tuple[list[pd.Series], list[pd.Series], list[float | int]]: + """ + Helper method to identify if a numeric filter values is an inequality, range, or discrete value and process it accordingly. 
+ + The context dict is intended to keep the kwargs manageable when passing positive and negative values back-to-back: + - key: Column name being filtered + - filter_string_values: Original filter string (for error messages) + - is_negated: Whether these are negated (NOT) conditions + """ + key = context["key"] + filter_string_values = context["filter_string_values"] + is_negated = context["is_negated"] + + inequality_conditions = [] + range_conditions = [] + discrete_values = [] + + for filter_string_value in values_to_process: + # Check for common mistakes: =< or => instead of <= or >= + if re.match(r"^=<\d+\.?\d*$", filter_string_value) or re.match(r"^=>\d+\.?\d*$", filter_string_value): + raise SidecarInvalidFilterError( + f"Invalid operator format '{filter_string_value[:2]}' in filter '{key}:{filter_string_values}'." + f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" + ) + + # Check if it's an inequality (e.g., ">25", "<=40") + inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", filter_string_value) + if inequality_match: + operator = inequality_match.group(1) + threshold = float(inequality_match.group(2)) + condition = self._create_inequality_condition(key, operator, threshold) + inequality_conditions.append(condition) + prefix = "NOT " if is_negated else "" + logger.debug( + f"Applied {'negated ' if is_negated else ''}inequality filter on '{key}': {prefix}{operator} {threshold}" + ) + continue + + # Check if it's a range (e.g., "20-40") + range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", filter_string_value) + if range_match: + min_val = float(range_match.group(1)) + max_val = float(range_match.group(2)) + condition = self._create_range_condition(key, min_val, max_val) + range_conditions.append(condition) + prefix = "NOT " if is_negated else "" + logger.debug( + f"Applied {'negated ' if is_negated else ''}range filter on '{key}': {prefix}{min_val} to {max_val}" + ) + continue + + # Otherwise, treat as discrete value + try: + 
numeric_value = float(filter_string_value) if "." in filter_string_value else int(filter_string_value) + discrete_values.append(numeric_value) + except ValueError: + logger.warning( + f"Cannot convert '{filter_string_value}' to numeric for column '{key}'. Skipping this value." + ) + + return inequality_conditions, range_conditions, discrete_values + + def _separate_positive_and_negated_values(self, filter_values: list[str]) -> tuple[list[str], list[str]]: + """ + Helper method to separate filter values into positive and negated lists. + Values prefixed with '!' are negated (NOT conditions). + """ + positive_values = [] + negated_values = [] + + for value in filter_values: + value = value.strip() + if value.startswith("!"): + negated_values.append(value[1:].strip()) + else: + positive_values.append(value) + + return positive_values, negated_values + + def _apply_not_conditions(self, base_condition: pd.Series | None, negated_conditions: list[pd.Series]) -> pd.Series: + """ + Helper method to apply NOT conditions to a base condition. The base condition contains positive filters combined with OR, or None if there were only negations + in the input filter string from the CLI. 
Returns a combined condition where rows must match base_condition AND NOT match any negated condition + """ + if base_condition is None: + # If only negated conditions (no positive conditions), start with all True + combined = pd.Series(True, index=self.df.index) + else: + combined = base_condition + + # Apply negated conditions (must NOT match any negated condition) + for negated_condition in negated_conditions: + combined = combined & ~negated_condition + + return combined diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index e18d83e4..fd1ee20c 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -135,6 +135,36 @@ def test_inequality_on_single_numeric_value_column(self, sample_tsv_with_numeric assert "S9" in sample_ids assert "S10" in sample_ids + def test_not_operator_with_inequality(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) with inequality: Population:<4,!2 should return 1 and 3 but not 2.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:<4,!2") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S1" in sample_ids + assert "S3" in sample_ids + assert "S7" in sample_ids + assert "S2" not in sample_ids + assert "S8" not in sample_ids + + def test_not_operator_standalone(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) 
standalone: Population:!2 should return all except 2.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:!2") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + assert "S2" not in sample_ids + assert "S8" not in sample_ids + class TestNumericalFilteringRanges: """Test range filtering on numerical columns.""" @@ -198,6 +228,23 @@ def test_range_on_single_numeric_value_column(self, sample_tsv_with_numeric_data assert "S7" in sample_ids assert "S8" in sample_ids + def test_not_operator_with_range(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) with range: Age:!20-30 should exclude values in range 20-30.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Age:!20-30") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S3" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + assert "S4" not in sample_ids + assert "S5" not in sample_ids + assert "S6" not in sample_ids + class TestNumericalFilteringDiscreteValues: """Test discrete value filtering on numerical columns.""" @@ -265,6 +312,33 @@ def test_discrete_on_single_numeric_value_column_(self, sample_tsv_with_numeric_ assert "S1" in sample_ids assert "S6" in sample_ids + def test_not_operator_with_discrete_values(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) 
with discrete values: Population:1,3,!2 should return 1 and 3 but not 2.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:1,3,!2") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 3 + assert "S1" in sample_ids + assert "S3" in sample_ids + assert "S7" in sample_ids + assert "S2" not in sample_ids + assert "S8" not in sample_ids + + def test_not_operator_multiple_negations(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) with multiple negations: Weight:>30,!50,!55 should exclude 50 and 55.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Weight:>30,!50,!55") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S4" in sample_ids + assert "S5" in sample_ids + assert "S6" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + assert "S7" not in sample_ids + assert "S8" not in sample_ids + class TestSemicolonSeparatedNumericFiltering: """Test that inequalities and ranges work on columns with semicolon-separated numeric values.""" @@ -333,6 +407,20 @@ def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric assert "S7" in sample_ids assert "S8" in sample_ids + def test_not_operator_with_semicolon_separated(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) 
with semicolon-separated values: Population:>3,!5 should exclude 5.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Population:>3,!5") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S2" in sample_ids + assert "S4" in sample_ids + assert "S6" in sample_ids + assert "S9" in sample_ids + assert "S10" in sample_ids + assert "S5" not in sample_ids + assert "S7" not in sample_ids + class TestStringColumnFiltering: """Test string column filtering with single and semicolon-separated values.""" @@ -377,6 +465,38 @@ def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numer assert "S3" in sample_ids assert "S7" in sample_ids + def test_not_operator_with_string_values(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) with string values: Area:!North should exclude North.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Area:!North") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 7 + assert "S2" in sample_ids + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S6" in sample_ids + assert "S7" in sample_ids + assert "S8" in sample_ids + assert "S10" in sample_ids + assert "S1" not in sample_ids + assert "S5" not in sample_ids + assert "S9" not in sample_ids + + def test_not_operator_with_string_positive_and_negative(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) 
combined with positive values: Area:East,West,!South.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Area:East,West,!South") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S2" in sample_ids + assert "S4" in sample_ids + assert "S6" in sample_ids + assert "S8" in sample_ids + assert "S10" in sample_ids + assert "S3" not in sample_ids + assert "S7" not in sample_ids + class TestEdgeCases: """Edge case tests for SidecarQueryManager filtering.""" @@ -478,3 +598,24 @@ def test_hyphens_in_numerical_column_raises(self, sample_tsv_with_edge_cases): with pytest.raises(SidecarInvalidFilterError) as excinfo: manager.run_query(filter_string="NumericalWithHyphen:2") assert "Column 'NumericalWithHyphen' contains value '1-2' with a hyphen at row 0." in str(excinfo.value) + + def test_not_operator_edge_case_with_unicode(self, sample_tsv_with_edge_cases): + """Test NOT operator (!) with unicode string values.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="UnicodeStrings:!København") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S4" in sample_ids + assert "S3" not in sample_ids + + def test_not_operator_only_negations(self, sample_tsv_with_edge_cases): + """Test NOT operator (!) 
with only negations (no positive values): PureStrings:!North should return all except North.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="PureStrings:!North") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S1" not in sample_ids + assert "S2" not in sample_ids From aaffc1f865848dfabb65349c312fdd77d7a77581 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 12:57:14 +0100 Subject: [PATCH 024/100] Strip leading/trailing spaces when loading TSV Whitespaces inside values are preserved --- packages/divbase-api/src/divbase_api/services/queries.py | 7 ++++++- tests/unit/divbase_api/test_sample_metadata_queries.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 87f4c556..7c3da302 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -701,9 +701,14 @@ def load_file(self) -> "SidecarQueryManager": try: logger.info(f"Loading sidecar metadata file: {self.file}") self.df = pd.read_csv( - self.file, sep="\t" + self.file, sep="\t", skipinitialspace=True ) # Pandas has Type Inference and will detect numeric and string columns automatically self.df.columns = self.df.columns.str.lstrip("#") + + # Strip leading and trailing whitespace from all columns + for col in self.df.columns: + self.df[col] = self.df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) + if "Sample_ID" not in self.df.columns: raise SidecarColumnNotFoundError("The 'Sample_ID' column is required in the metadata file.") diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index fd1ee20c..be1e0eda 100644 --- 
a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -43,11 +43,12 @@ def sample_tsv_with_edge_cases(tmp_path): 4. Column with commas should raise SidecarInvalidFilterError Commas are NOT allowed in divbase TSV format. + Note that S2 and S3 have leading/trailing whitespace in the Sample_ID and the code should handle that by stripping whitespace. """ tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tWithCommas\tStringWithHyphen\tNumericalWithHyphen S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth,South\tNorth-East\t1-2 -S2\tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tWest,East\tSouth-West\t2-3 -S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth,\tNorth-North-West\t3-4 +S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tWest,East\tSouth-West\t2-3 + S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth,\tNorth-North-West\t3-4 S4\t1string\tstring4\tString5\t400\tHumlebæk\t,South\tEast-South-East\t4-5 """ tsv_file = tmp_path / "test_metadata_edge_cases.tsv" From a0ab8371943bdbc97229a7de537edc70d8cd6c14 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 14:16:25 +0100 Subject: [PATCH 025/100] Update metadata user guide after refactoring --- docs/user-guides/sidecar-metadata.md | 86 +++++++++++++++++++--------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 30978eea..41ecd558 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -27,25 +27,31 @@ Note! there can be multiple TSVs in the same project and it is possible to call After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. -1. It is the user's responsibility to ensure that the spelling of column headers and values is consistent. 
When filtering on the sidecar metadata, the exact spelling must be used for the filters. -2. The user-defined columns can be either numeric or string type. Try to avoid mixing string and numeric values in the same column is possible. If a mix of string and numerical data is used in the same column, the system will treat them all as strings, which might lead to unexpected filtering results when running queries. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values is numerical or string. -3. Use English decimal notation (.) and not comma (,) when entering decimals. This ensures that the data is correctly loaded by `Pandas`. -4. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data. +!!! Warning + It is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. This includes matching upper and lower case letters. + +To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: + +1. The user-defined columns can be either numeric or string type. Mixing string and numeric values in the same column is not allowed; if a mix is detected, DivBase will raise an error and reject the file. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. +2. Commas are not supported for the TSV and the DivBase system will send an error message if it detects any TSV cells with commas in them. 
Commas can have different meanings in different notation systems and to avoid confusion and to keep it simple, DivBase will simply not handle commas. Note that commas are used in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose. For decimals, use English decimal notation (.) and not comma (,). DivBase allows one single delimiter for enumerations in the TSV files and that is the semicolon (;) as will be described in the bullet. +3. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries on the more complex than if just a single value is use for each cell. +4. As outlined above, the only characters with special meaning or restrictions in the TSV are `#`, `,`, `;`, and `\t` (tab). Other special characters should be supported, but please be aware that Your Milage May Vary. Some common cases that have been tested and are supported include hyphens (`-`), e.g.`North-West`), diacritic unicodecharacters like `å`,`ä`,`ö`. +5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". #### Example This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. 
```text -#Sample_ID Population Area -S1 1 North -S2 2;4 East -S3 3 West;South -S4 4 West -S5 5 North -S6 6 East -S7 1;3;5 South -S8 2 West +#Sample_ID Population Area Weight +S1 1 North 12.1 +S2 2;4 East 18.8 +S3 3 West;South 15.0 +S4 4 West 20.2 +S5 5 North 16.1 +S6 6 East 25.2 +S7 1;3;5 South 22.6 +S8 2 West 19.5 ``` TODOs: @@ -57,15 +63,21 @@ TODOs: ## Query Syntax for sidecar metadata +- TODO: explain warnings +- TODO: explain when empty results or all results are returned + ### Overview: querys are applied as filters on columns in the TSV Queries on the sidecar sample metadata TSV can be done with the `divbase-cli query tsv` command. The filters that the user want to query on needs entered as a string (i.e. enclosed in quotes, `""`). The TSV query syntax is `"Key1:Value1,Value2;Key2:Value3,Value4"`, where `Key1:`...`Key2:` are the column header names in the TSV, and `Value1`...`Value4` are the values. Multiple filter values for a key are separated by commas, and multiple keys are separated by semicolons. There can be any number keys and values to filter on, but it is up to the user to write queries that return useful results. +It is possible to exclude a value by prefixing it with a `!` (NOT) operator: `"Key:!Value"`. When mixing inclusive and exclusive filters (e.g. `"Key1:Value1,Value2; Key2:!Value3"`), only the rows that match the positive filters and do not match any of the excluded values will be returned. This can be used to write complex queries. !!! note Please note that semicolon (`;`) is used for different purposes in the TSV (multi-value cells) and in the query syntax (perform queries on multiple columns)! + Also note that commas are allowed in the query syntax, but are not allowed in the cells in the TSV. + Filtering is inclusive by default. This applies both for the filter values and the cell values: - If a filter contains multiple values, e.g. 
`"Area:North,West"`, the row is included if at least one of the filter values matches any value in the cell. I.e. a row with `North`, and a row with `West` will both be returned from this filter. @@ -100,22 +112,30 @@ divbase-cli query tsv "Area:North,South,East" will return all samples where **at least one** of the semicolon-separated values in the Area column matches any of the filter values (`North`, `South`, or `East`). -Comma-separated: "Area:North,South,East" -OR logic: if ANY cell value matches ANY filter value, the row matches -Example: "key2:value3,value4" matches cells containing value3, value4, value3;value4, or value4;value3 +The `!`(NOT) operator can be used to exclude specific cell values from a column. When a `!` is used on its own, such as in the command: + +```bash +divbase-cli query tsv "Area:!North" +``` + +it will return all rows that do not contain `North` in the `Area`. Multi-column values that contain `North`, such as a row with e.g. `North;South` will also be excluded by this query. + +Note that when inclusive and exclusive are combined (e.g. `"Area:East,!South"`), only rows that match both filters (include `East`, exclude `South`) will be returned in the results. ### Filtering on numerical columns For numerical columns, it is possible to filter on the following operations: -- Inequalities: - - Examples: `"Weight:>25"` or `"Weight:>=20,<=40"` or `"Weight:<100"`. - - Note" The inequality operator must be expressed relative to the key, i.e. `"Weight:>25"`. The reverse notation `"Weight:25<"` is not supported. - - The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms =< and => are not accepted and will return an error. -- Range (inclusive): - - Example: `"Weight:20-40"` -- Discrete values: - - Example: `"Weight:25,30,35"` +- **Inequalities** + Examples: `"Weight:>25"` or `"Weight:>=20,<=40"` or `"Weight:<100"` + Note: The inequality operator must be expressed relative to the key, i.e. `"Weight:>25"`. 
The reverse notation `"Weight:25<"` is not supported. + The syntax only accepts `<=` and `>=` since this is the syntax of Python. The forms `=<` and `=>` are not accepted and will return an error. + +- **Range (inclusive)** + Example: `"Weight:20-40"` + +- **Discrete values** + Example: `"Weight:25,30,35"` Furthermore, it is possible to combine filters on inequalities, ranges, and discrete values using inclusive OR logic. This means that if any one of the specified conditions is satisfied for a cell, the row will be included in the results. For example: @@ -124,12 +144,26 @@ Furthermore, it is possible to combine filters on inequalities, ranges, and disc - `"Weight:>5,1-2,4"` returns rows where the value is greater than 5 **or** in the range 1–2 **or** equal to 4 - `"Weight:>10,<2,5-7"` returns rows where the value is greater than 10 **or** less than 2 **or** in the range 5–7 -## Trying out a query +The `!` (NOT) operator can really come to good use for numerical filters: + +- `"Weight:!25"` returns rows where the value is not 25. +- `"Weight:>5,!10-15"` returns rows where the value is greater than 5, but not in the range 10–15. +- `"Weight:!1-2,4"` returns rows where the value is not in the range 1–2, or is 4. + +## Examples of complex queries + +Assuming that the sidecar metadata TSV file looks like in the [Example](#example) above, a query like will: ```bash -divbase-cli query tsv "Area:Northern Portugal" +divbase-cli query tsv "Area:North,West,!South;Weight:>10,<=20,!15,18-22" ``` +- include rows where the `Area` column contains either `North` or `West` (also applied to semicolon-separated multi-value cells), **but excludes** any row where `South` is present in the `Area` column—even if `North` or `West` is also present. + +- include rows where the `Weight` column is greater than 10, **or** less than or equal to 20, **or** in the range 18–22 (inclusive), **but excludes** any row where Weight is exactly 15 **or** any value in the range 18–22. 
+ +There are three samples (rows) that fulfill this, and this is what the query results will return: `S1`, `S4`, and `S5`. + - [TO BE IMPLEMENTED] what to do if a query references a column that does not exist. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Area does not exist? This should probably give a warning and not just return nothing - [TO BE IMPLEMENTED] what to do if a query references a column value. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Northern Portugal does not exist in the column? This should probably also give a warning and not just return nothing, but nothing is a result here and not a syntax problem... From 7bca2873e3fe540a2932cda505bf91c762a665a5 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 14:20:14 +0100 Subject: [PATCH 026/100] Update fixture to use other floats than just X.0 --- .../test_sample_metadata_queries.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index be1e0eda..782dc577 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -17,16 +17,16 @@ def sample_tsv_with_numeric_data(tmp_path): """ # Keep indentation like this to ensure that leading spaces in column 1 does not cause issues. 
tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString -S1\t1\t20.0\t5.0\tNorth\t100\tString +S1\t1\t20.2\t5.0\tNorth\t100\tString S2\t2;4\t25.0\t10\tEast\t200\tStrings -S3\t3\t30.0\t15\tWest;South;East\t300\tSting -S4\t4\t35.0\t20\tWest\t400\tStings +S3\t3\t30.8\t15\tWest;South;East\t300\tSting +S4\t4\t35.1\t20\tWest\t400\tStings S5\t5\t40.0\t25\tNorth\t500\tThing -S6\t6\t45.0\t30\tEast\t600\tThings -S7\t1;3;5\t50.0\t35\tSouth\t700\tStrong -S8\t2\t55.0\t40\tWest\t800\tStrung -S9\t7\t62.0\t45\tNorth\t900\tStang -S10\t8\t70.0\t52\tEast\t1000\tSong +S6\t6\t45.4\t30\tEast\t600\tThings +S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong +S8\t2\t55.2\t40\tWest\t800\tStrung +S9\t7\t62.6\t45\tNorth\t900\tStang +S10\t8\t70.7\t52\tEast\t1000\tSong """ tsv_file = tmp_path / "test_metadata.tsv" tsv_file.write_text(tsv_content) @@ -65,7 +65,9 @@ def test_greater_than(self, sample_tsv_with_numeric_data): result = manager.run_query(filter_string="Weight:>50") sample_ids = result.get_unique_values("Sample_ID") - assert len(sample_ids) == 3 + # Weight > 50: S7 (50.9), S8 (55.2), S9 (62.6), S10 (70.7) + assert len(sample_ids) == 4 + assert "S7" in sample_ids assert "S8" in sample_ids assert "S9" in sample_ids assert "S10" in sample_ids @@ -176,11 +178,10 @@ def test_simple_range(self, sample_tsv_with_numeric_data): result = manager.run_query(filter_string="Weight:30-45") sample_ids = result.get_unique_values("Sample_ID") - assert len(sample_ids) == 4 + assert len(sample_ids) == 3 assert "S3" in sample_ids assert "S4" in sample_ids assert "S5" in sample_ids - assert "S6" in sample_ids def test_range_boundaries_inclusive(self, sample_tsv_with_numeric_data): """Test that range boundaries are inclusive.""" @@ -253,7 +254,7 @@ class TestNumericalFilteringDiscreteValues: def test_single_discrete_value(self, sample_tsv_with_numeric_data): """Test filtering with a single discrete value.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) - result = 
manager.run_query(filter_string="Weight:50") + result = manager.run_query(filter_string="Weight:50.9") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 1 @@ -262,7 +263,7 @@ def test_single_discrete_value(self, sample_tsv_with_numeric_data): def test_multiple_discrete_values(self, sample_tsv_with_numeric_data): """Test filtering with multiple discrete values (OR logic).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) - result = manager.run_query(filter_string="Weight:20,30,50") + result = manager.run_query(filter_string="Weight:20.2,30.8,50.9") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 3 @@ -327,9 +328,9 @@ def test_not_operator_with_discrete_values(self, sample_tsv_with_numeric_data): assert "S8" not in sample_ids def test_not_operator_multiple_negations(self, sample_tsv_with_numeric_data): - """Test NOT operator (!) with multiple negations: Weight:>30,!50,!55 should exclude 50 and 55.""" + """Test NOT operator (!) 
with multiple negations: Weight:>30,!50.9,!55.2 should exclude 50.9 and 55.2.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) - result = manager.run_query(filter_string="Weight:>30,!50,!55") + result = manager.run_query(filter_string="Weight:>30,!50.9,!55.2") sample_ids = result.get_unique_values("Sample_ID") assert "S4" in sample_ids @@ -339,6 +340,7 @@ def test_not_operator_multiple_negations(self, sample_tsv_with_numeric_data): assert "S10" in sample_ids assert "S7" not in sample_ids assert "S8" not in sample_ids + assert "S8" not in sample_ids class TestSemicolonSeparatedNumericFiltering: From 94c223bed2e68ed0857e6970f1ca43ca5fcadba3 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 15:05:03 +0100 Subject: [PATCH 027/100] Update test assertions for sample metadata queries --- tests/e2e_integration/queries/test_SidecarQueryManager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_integration/queries/test_SidecarQueryManager.py b/tests/e2e_integration/queries/test_SidecarQueryManager.py index 585087da..86b4c289 100644 --- a/tests/e2e_integration/queries/test_SidecarQueryManager.py +++ b/tests/e2e_integration/queries/test_SidecarQueryManager.py @@ -146,7 +146,7 @@ def test_tsv_query_column_not_found(sample_tsv_file, caplog, create_sidecar_mana assert len(query_result) == 5, "Should return all records when column not found" assert "Column 'NonExistentColumn' not found in the TSV file" in caplog.text - assert query_message == "Invalid filter conditions - returning ALL records" + assert query_message == "Invalid filter conditions (NonExistentColumn:Value) - returning ALL records" @pytest.mark.unit From bc39329c152eb040d64ab43bb2cee56a183b0ea7 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 15:07:19 +0100 Subject: [PATCH 028/100] Update metadata template CLI to use pathlib --- .../src/divbase_cli/cli_commands/dimensions_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 7cb85d8d..c3df6e24 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -1,6 +1,6 @@ import csv import logging -import os +from pathlib import Path import typer import yaml @@ -200,7 +200,7 @@ def create_metadata_template_with_project_samples_names( return output_filename = "divbase_metadata_template.tsv" - output_path = os.path.join(os.getcwd(), output_filename) + output_path = Path.cwd() / output_filename with open(output_path, mode="w", newline="") as tsvfile: writer = csv.writer(tsvfile, delimiter="\t") From cc74df6e18d160ee40b07d6b8d05c29a1f6d1451 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 15:12:56 +0100 Subject: [PATCH 029/100] Allow custom name for sample metadata templates Change default to sample_metadata_.tsv and added warning and prompt if file already exists --- .../divbase_cli/cli_commands/dimensions_cli.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index c3df6e24..4563e702 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -162,6 +162,12 @@ def _format_api_response_for_display_in_terminal(api_response: DimensionsShowRes @dimensions_app.command("create-metadata-template") def create_metadata_template_with_project_samples_names( + output_filename: str | None = typer.Option( + None, + "--output", + "-o", + help="Name of the output TSV file to create. Defaults to sample_metadata_.tsv. 
If a file with the same name already exists in the current directory, you will be prompted to confirm if you want to overwrite it.", + ), project: str | None = PROJECT_NAME_OPTION, ) -> None: """ @@ -174,6 +180,9 @@ def create_metadata_template_with_project_samples_names( project_config = resolve_project(project_name=project) + if output_filename is None: + output_filename = f"sample_metadata_{project_config.name}.tsv" + response = make_authenticated_request( method="GET", divbase_base_url=project_config.divbase_url, @@ -199,9 +208,15 @@ def create_metadata_template_with_project_samples_names( print("No samples found for this project. No file written.") return - output_filename = "divbase_metadata_template.tsv" output_path = Path.cwd() / output_filename + # Check if file exists and prompt user for confirmation + if output_path.exists(): + overwrite = typer.confirm(f"File '{output_path}' already exists. Do you want to overwrite it?") + if not overwrite: + print("File not written. Exiting.") + return + with open(output_path, mode="w", newline="") as tsvfile: writer = csv.writer(tsvfile, delimiter="\t") writer.writerow(["#Sample_ID"]) From 8d648ff612e6575832230c4cbaa0f2e73f13935a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 10 Feb 2026 16:23:00 +0100 Subject: [PATCH 030/100] Add draft sample metadata validator command Runs client side validation checks on sample metadata TSV files before upload to catch common issues and provide more helpful error messages to users. Requires a server call to fetch the dimensions index for the project, but all of the validation on the TSV is run on the client side. Client side was chosen to avoid handling API requests with large TSV payloads or to avoid several uploads of metadata files to S3 for they should be linted from the there. 
--- .../cli_commands/dimensions_cli.py | 112 ++++++++ .../services/sample_metadata_tsv_validator.py | 244 ++++++++++++++++++ 2 files changed, 356 insertions(+) create mode 100644 packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 4563e702..5d47470d 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -4,9 +4,11 @@ import typer import yaml +from rich import print from divbase_cli.cli_commands.shared_args_options import PROJECT_NAME_OPTION from divbase_cli.config_resolver import resolve_project +from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator from divbase_cli.user_auth import make_authenticated_request from divbase_lib.api_schemas.vcf_dimensions import DimensionsShowResult @@ -226,3 +228,113 @@ def create_metadata_template_with_project_samples_names( print(f"A sample metadata template with these sample names was written to: {output_path}") # TODO perhaps add a message on how to fill in additional columns and how to upload the metadata file to DivBase? + + +@dimensions_app.command("validate-metadata-file") +def validate_metadata_template_versus_dimensions_and_formatting_constraints( + input_filename: str = typer.Argument( + ..., + help="Name of the input TSV file to validate.", + ), + project: str | None = PROJECT_NAME_OPTION, +) -> None: + """ + Validate a sidecar metadata TSV file against DivBase formatting requirements and project dimensions. + + Validation is run client-side to keep sensitive metadata local during validation. 
+ + Validation checks: + - File is properly tab-delimited + - First column is named '#Sample_ID' + - No commas in cells + - Sample_ID has only one value per row (no semicolons) + - No duplicate sample IDs + - Invalid characters + - Basic type consistency in user-defined columns. But not Pandas type inference, + as we want to avoid having the user install Pandas just for validation. So just check that numeric columns have only numeric values (excluding header). + - All samples in the TSV exist in the project's dimensions index + + Returns errors for critical issues and warnings for non-critical issues. + """ + + project_config = resolve_project(project_name=project) + + input_path = Path.cwd() / input_filename + + if not input_path.exists(): + print(f"Error: File '{input_path}' not found.") + raise typer.Exit(code=1) + + print(f"Validating local metadata file: {input_path}") + print(f"Project: {project_config.name}\n") + + response = make_authenticated_request( + method="GET", + divbase_base_url=project_config.divbase_url, + api_route=f"v1/vcf-dimensions/projects/{project_config.name}", + ) + vcf_dimensions_data = DimensionsShowResult(**response.json()) + dimensions_info = _format_api_response_for_display_in_terminal(vcf_dimensions_data) + + # TODO there is duplication here with other commands in this file. Could be made DRY with a helper function or a separate CRUD function to get dimensions info without needing to parse API response on client side. 
+ + unique_sample_names = set() + for entry in dimensions_info.get("indexed_files", []): + unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) + + validator = MetadataTSVValidator(file_path=input_path, project_samples=unique_sample_names) + stats, errors, warnings = validator.validate() + + if stats: + print("[bold cyan]VALIDATION SUMMARY:[/bold cyan]") + print(f" Total columns: {stats.get('total_columns', 0)} ({stats.get('user_defined_columns', 0)} user-defined)") + + samples_in_tsv = stats.get("samples_in_tsv", 0) + samples_matching = stats.get("samples_matching_project", 0) + total_project = stats.get("total_project_samples", 0) + + print( + f" Samples matching project VCF dimensions: {samples_matching}/{samples_in_tsv} (project has {total_project} total)" + ) + + numeric_cols = stats.get("numeric_columns", []) + string_cols = stats.get("string_columns", []) + mixed_cols = stats.get("mixed_type_columns", []) + + if numeric_cols: + print(f" Numeric columns ({len(numeric_cols)}): {', '.join(numeric_cols)}") + if string_cols: + print(f" String columns ({len(string_cols)}): {', '.join(string_cols)}") + if mixed_cols: + print( + f" [red]Mixed-type columns ({len(mixed_cols)}): {', '.join(mixed_cols)} - NOT ALLOWED, see errors below[/red]" + ) + + if stats.get("has_multi_values", False): + print(" Multi-value cells: Yes (semicolon-separated values detected)") + else: + print(" Multi-value cells: No") + + print() + + if errors: + print("[red bold]ERRORS (must be fixed):[/red bold]") + for error in errors: + print(f" - {error}") + print() + + if warnings: + print("[yellow bold]WARNINGS (should be reviewed):[/yellow bold]") + for warning in warnings: + print(f" - {warning}") + print() + + if not errors and not warnings: + print("[green bold]Validation passed![/green bold] The metadata file meets all DivBase requirements.") + elif errors: + print("[red bold]Validation failed![/red bold] Please fix the errors above before uploading.") + raise 
typer.Exit(code=1) + else: + print("[yellow bold]Validation passed with warnings![/yellow bold] Review the warnings above.") + + # TODO: Add information about how to upload the validated metadata file to DivBase diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py new file mode 100644 index 00000000..b09ce2a7 --- /dev/null +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -0,0 +1,244 @@ +""" +Client-side validator for DivBase sidecar metadata TSV files. + +Requires that the dimensions index for the project is up to date and is fetched from the server. +Validates formatting requirements without sending data to the server. +""" + +import csv +from pathlib import Path + + +class MetadataTSVValidator: + """Validates sidecar metadata TSV files against DivBase requirements.""" + + FORBIDDEN_CHARS = [","] + + def __init__(self, file_path: Path, project_samples: set[str]): + """ + Initialize the validator. File path is the path to the TSV file to validate, + and project_samples is a set of unique sample names from the project's dimensions index. + """ + self.file_path = file_path + self.project_samples = project_samples + self.errors: list[str] = [] + self.warnings: list[str] = [] + self.stats: dict = {} + + def validate(self) -> tuple[dict, list[str], list[str]]: + """ + Run all validation checks on the TSV file. + Returns a tuple of (stats, errors, warnings) where stats is a dictionary of collected statistics about the TSV file, + errors is a list of error messages, and warnings is a list of warning messages. 
+ """ + self.errors = [] + self.warnings = [] + self.stats = {} + + try: + with open(self.file_path, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t") + rows = list(reader) + except Exception as e: + self.errors.append(f"Failed to read file: {e}") + return self.stats, self.errors, self.warnings + + if not rows: + self.errors.append("File is empty") + return self.stats, self.errors, self.warnings + + self._validate_header(rows[0]) + + if len(rows) > 1: + self._validate_data_rows(rows) + + return self.stats, self.errors, self.warnings + + def _validate_header(self, header: list[str]) -> None: + """Validate the header row.""" + if not header: + self.errors.append("Header row is missing") + return + + if header[0] != "#Sample_ID": + self.errors.append(f"First column must be named '#Sample_ID', found: '{header[0]}'") + + if len(header) != len(set(header)): + duplicates = [col for col in header if header.count(col) > 1] + self.errors.append(f"Duplicate column names found: {set(duplicates)}") + + for i, col in enumerate(header): + if not col.strip(): + self.errors.append(f"Empty column name at position {i + 1}") + + def _validate_data_rows(self, rows: list[list[str]]) -> None: + """Validate all data rows.""" + header = rows[0] + data_rows = rows[1:] + + num_columns = len(header) + sample_ids_seen = set() + tsv_samples = set() + + column_types: dict[int, set[str]] = {i: set() for i in range(1, num_columns)} + + has_multi_values = False + + for row_num, row in enumerate(data_rows, start=2): # Start at row 2 (after header) + if len(row) != num_columns: + sample_hint = f" (Sample_ID: '{row[0]}')" if row else "" + self.errors.append( + f"Row {row_num}: Expected {num_columns} tab-separated columns from reading the header, found {len(row)}{sample_hint}. " + "Check that all values are separated by tabs (not spaces)." 
+ ) + continue + + sample_id = row[0].strip() if row else "" + + if not sample_id: + self.errors.append(f"Row {row_num}: Sample_ID is empty") + continue + + if ";" in sample_id: + self.errors.append( + f"Row {row_num}: Sample_ID '{sample_id}' contains semicolon. Sample_ID must contain only one value." + ) + + if sample_id in sample_ids_seen: + self.errors.append(f"Row {row_num}: Duplicate Sample_ID '{sample_id}'") + else: + sample_ids_seen.add(sample_id) + tsv_samples.add(sample_id) + + for col_idx, cell in enumerate(row): + self._validate_cell(row_num, col_idx, header[col_idx], cell) + + # Track column types for user-defined columns (skip col 0, i.e. Sample_ID) + if col_idx > 0: + if ";" in cell: + has_multi_values = True + self._infer_column_type(row_num, col_idx, header[col_idx], cell, column_types) + + self._check_mixed_types(header, column_types) + + self._validate_sample_names(tsv_samples) + + self._collect_statistics(header, tsv_samples, column_types, has_multi_values) + + def _validate_cell(self, row_num: int, col_idx: int, col_name: str, cell: str) -> None: + """Validate an individual cell.""" + + for char in self.FORBIDDEN_CHARS: + if char in cell: + self.errors.append(f"Row {row_num}, Column '{col_name}': Cell contains forbidden character '{char}'") + + if cell != cell.strip(): + self.warnings.append( + f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " + "(will be stripped by server)" + ) + + def _infer_column_type( + self, row_num: int, col_idx: int, col_name: str, cell: str, column_types: dict[int, set[str]] + ) -> None: + """ + Infer the type of values in a column and validate type consistency. 
+ Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column + """ + values = [v.strip() for v in cell.split(";") if v.strip()] + + cell_has_numeric = False + cell_has_string = False + + for value in values: + # Check for hyphens in values that might be numeric + # The server-side logic rejects hyphens in numeric columns (e.g., "1-2" could be confused with range syntax) + if ( + "-" in value + and any(c.isdigit() for c in value) + and ("numeric" in column_types[col_idx] or all(t == "numeric" for t in column_types[col_idx] if t)) + ): + self.errors.append( + f"Row {row_num}, Column '{col_name}': Value '{value}' contains a hyphen. " + f"Hyphens are not allowed in numeric column values (only in string columns). " + f"If this is meant to be a string column, all values should be non-numeric strings." + ) + + # Try to determine if numeric or string. Note! The queries used Pandas for this, so there could potentially be a discrepency here. + try: + float(value) + cell_has_numeric = True + column_types[col_idx].add("numeric") + except ValueError: + cell_has_string = True + column_types[col_idx].add("string") + + # Check for mixed types within the same cell (e.g., "1;abc") + if cell_has_numeric and cell_has_string: + self.errors.append( + f"Row {row_num}, Column '{col_name}': Cell '{cell}' contains mixed types. " + f"All cell values in the same column must be consistently numeric or string." + ) + + def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str]]) -> None: + """ + Check for mixed types in columns and raise errors. + Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column + """ + for col_idx, types in column_types.items(): + if len(types) > 1: + col_name = header[col_idx] + self.errors.append( + f"Column '{col_name}': Contains mixed types (both numeric and string values). " + f"All values in a column must be consistently numeric or string for DivBase sidecar metadata queries to work correctly." 
+ ) + + def _validate_sample_names(self, tsv_samples: set[str]) -> None: + """Validate sample names against project dimensions.""" + + missing_from_project = tsv_samples - self.project_samples + if missing_from_project: + examples = sorted(list(missing_from_project)) + self.errors.append( + f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. " + "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries." + ) + + missing_from_tsv = self.project_samples - tsv_samples + if missing_from_tsv: + examples = sorted(list(missing_from_tsv)) + self.warnings.append( + f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " + "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file." + ) + + def _collect_statistics( + self, header: list[str], tsv_samples: set[str], column_types: dict[int, set[str]], has_multi_values: bool + ) -> None: + """Collect statistics about the TSV file.""" + + self.stats["total_columns"] = len(header) + self.stats["user_defined_columns"] = len(header) - 1 # Exclude Sample_ID + + matching_samples = tsv_samples & self.project_samples + self.stats["samples_in_tsv"] = len(tsv_samples) + self.stats["samples_matching_project"] = len(matching_samples) + self.stats["total_project_samples"] = len(self.project_samples) + + numeric_cols = [] + string_cols = [] + mixed_cols = [] + + for col_idx, types in column_types.items(): + col_name = header[col_idx] + if len(types) > 1: + mixed_cols.append(col_name) + elif "numeric" in types: + numeric_cols.append(col_name) + elif "string" in types: + string_cols.append(col_name) + + self.stats["numeric_columns"] = numeric_cols + self.stats["string_columns"] = string_cols + self.stats["mixed_type_columns"] = mixed_cols + self.stats["has_multi_values"] = 
has_multi_values From 1cd9d66f7c0ab87fe5ad969fecb868d12647a6c1 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 11:01:22 +0100 Subject: [PATCH 031/100] Add fixture TSV with incorrect formatting For testing of the TSV validator --- .gitignore | 2 +- ...rrect_formatting_to_test_tsv_validator.tsv | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/sample_metadata_incorrect_formatting_to_test_tsv_validator.tsv diff --git a/.gitignore b/.gitignore index 867a6d9d..7b0ec9b2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ __pycache__/ # project specific files /sample_metadata.tsv -sample_metadata_*.tsv +/sample_metadata_*.tsv *.vcf *.vcf.gz *.vcf.gz.csi diff --git a/tests/fixtures/sample_metadata_incorrect_formatting_to_test_tsv_validator.tsv b/tests/fixtures/sample_metadata_incorrect_formatting_to_test_tsv_validator.tsv new file mode 100644 index 00000000..3de7e04f --- /dev/null +++ b/tests/fixtures/sample_metadata_incorrect_formatting_to_test_tsv_validator.tsv @@ -0,0 +1,19 @@ +#Sample_ID Population Area Test +129P2 1 North 2 +129S1 2;4 East,West 3 +129S5 3 South all +AJ 4 West 1 +AKRJ 5 North 2 +BALBcJ 6 East 2 +C3HHeJ 1;three;5 South 3 +C57BL6NJ 2 West 3 +CASTEiJ 3 North 4 +CBAJ 4 East 4 +DBA2J 5 South 5 +FVBNJ 6 West 5 +LPJ 1 North 1 +NODShiLtJ 2 East 1 +NZOHlLtJ 3 South 2 +PWKPhJ 4 West 1 +test 0 South 1 +noTab 0 vscode_spaces 1 \ No newline at end of file From a9bfa9fb039a82271342dd42ef09b1d0a62553ee Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 11:07:00 +0100 Subject: [PATCH 032/100] Collect all mixed-type cols for a single error Before, this long error msg was repeated for each mixed-type column, which was a lot when there was multiple such columns. Could perhaps be worth it to treat more errors like this, but some that are related to a specific cell value might be better suited to be reported seperatelly. 
--- .../services/sample_metadata_tsv_validator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index b09ce2a7..45b0862c 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -185,13 +185,17 @@ def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str] Check for mixed types in columns and raise errors. Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column """ + mixed_columns = [] for col_idx, types in column_types.items(): if len(types) > 1: col_name = header[col_idx] - self.errors.append( - f"Column '{col_name}': Contains mixed types (both numeric and string values). " - f"All values in a column must be consistently numeric or string for DivBase sidecar metadata queries to work correctly." - ) + mixed_columns.append(col_name) + + if mixed_columns: + self.errors.append( + f"The following columns contain mixed types (both numeric and string values): {mixed_columns}. " + "All values in a column must be consistently numeric or string for DivBase sidecar metadata queries to work correctly." 
+ ) def _validate_sample_names(self, tsv_samples: set[str]) -> None: """Validate sample names against project dimensions.""" From 9d9848e3935f6dfafc2053385ecc43c1376fabf7 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 11:27:30 +0100 Subject: [PATCH 033/100] Add unit tests for the TSV validator --- .../test_sample_metadata_tsv_validator.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py new file mode 100644 index 00000000..3b9370fb --- /dev/null +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -0,0 +1,208 @@ +""" +Unit tests for the MetadataTSVValidator class. +""" + +import pytest + +from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator + + +@pytest.fixture +def project_samples(): + """Standard set of project samples for testing.""" + return {"S1", "S2", "S3", "S4", "S5"} + + +@pytest.fixture +def valid_tsv(tmp_path): + """Create a valid TSV file that passes all validation checks and includes all project samples.""" + tsv_content = """#Sample_ID\tPopulation\tArea\tWeight +S1\t1\tNorth\t12.5 +S2\t2;4\tEast\t18.8 +S3\t3\tWest;South\t15.0 +S4\t3;5\tSouth\t20.0 +S5\t4\tNorth\t22.1 +""" + tsv_file = tmp_path / "valid.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def header_errors_tsv(tmp_path): + """Create TSV with header errors: wrong first column, duplicate columns, empty column.""" + tsv_content = """SampleID\tPopulation\tArea\tArea\t +S1\t1\tNorth\tEast\tValue +""" + tsv_file = tmp_path / "header_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_errors_tsv(tmp_path): + """Create TSV with Sample_ID errors: empty, semicolons, duplicates.""" + tsv_content = """#Sample_ID\tPopulation +S1\t1 
+\t2 +S3;S4\t3 +S1\t4 +""" + tsv_file = tmp_path / "sample_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def format_errors_tsv(tmp_path): + """Create TSV with formatting errors: wrong column count, commas, whitespace.""" + tsv_content = """#Sample_ID\tPopulation\tArea +S1\t1\tNorth +S2\t2,3\tEast +S3 \t 4 \t West +S4\t5 +""" + tsv_file = tmp_path / "format_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): + """Valid TSV should pass with no errors or warnings.""" + validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert len(errors) == 0 + assert len(warnings) == 0 + assert stats["total_columns"] == 4 + assert stats["user_defined_columns"] == 3 + assert stats["samples_in_tsv"] == 5 + assert stats["samples_matching_project"] == 5 + assert stats["has_multi_values"] is True + assert "Population" in stats["numeric_columns"] + assert "Area" in stats["string_columns"] + assert "Weight" in stats["numeric_columns"] + + +class TestHeaderValidation: + """Test validation of header row.""" + + def test_wrong_first_column_name(self, header_errors_tsv, project_samples): + """First column must be '#Sample_ID'.""" + validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("First column must be named '#Sample_ID'" in e for e in errors) + + def test_duplicate_column_names(self, header_errors_tsv, project_samples): + """Duplicate column names should be detected.""" + validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Duplicate column names" in e and "Area" in e for e in errors) + + def test_empty_column_name(self, header_errors_tsv, project_samples): + """Empty 
column names should be detected.""" + validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Empty column name" in e for e in errors) + + +class TestSampleIDValidation: + """Test validation of Sample_ID column.""" + + def test_empty_sample_id(self, sample_errors_tsv, project_samples): + """Empty Sample_ID should be detected.""" + validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Sample_ID is empty" in e for e in errors) + + def test_semicolon_in_sample_id(self, sample_errors_tsv, project_samples): + """Sample_ID containing semicolon should be detected.""" + validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("contains semicolon" in e and "S3;S4" in e for e in errors) + + def test_duplicate_sample_id(self, sample_errors_tsv, project_samples): + """Duplicate Sample_IDs should be detected.""" + validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) + + +class TestFormattingValidation: + """Test validation of TSV formatting.""" + + def test_wrong_column_count(self, format_errors_tsv, project_samples): + """Rows with wrong number of columns should be detected.""" + validator = MetadataTSVValidator(file_path=format_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) + + def test_comma_in_cell(self, format_errors_tsv, project_samples): + """Commas in cells should be detected.""" + validator = MetadataTSVValidator(file_path=format_errors_tsv, 
project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("forbidden character ','" in e for e in errors) + + def test_whitespace_warning(self, format_errors_tsv, project_samples): + """Leading/trailing whitespace should generate warnings.""" + validator = MetadataTSVValidator(file_path=format_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("leading or trailing whitespace" in w for w in warnings) + + +class TestDimensionMatching: + """Test validation against project dimensions.""" + + def test_samples_not_in_project(self, valid_tsv): + """Samples in TSV but not in project should be errors.""" + project_samples = {"S1", "S2"} + validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any( + "following samples in the TSV were not found in the DivBase project's dimensions index" in e and "S3" in e + for e in errors + ) + + def test_samples_not_in_tsv(self, valid_tsv): + """Samples in project but not in TSV should be warnings.""" + project_samples = {"S1", "S2", "S3", "S10", "S20"} + validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any( + "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S10" in w + for w in warnings + ) + assert any( + "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S20" in w + for w in warnings + ) + + +class TestStatistics: + """Test statistics collection.""" + + def test_statistics_collection(self, valid_tsv, project_samples): + """Verify statistics are correctly collected.""" + validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert stats["total_columns"] == 4 + assert 
stats["user_defined_columns"] == 3 + assert stats["samples_in_tsv"] == 5 + assert stats["samples_matching_project"] == 5 + assert stats["total_project_samples"] == 5 + assert len(stats["numeric_columns"]) == 2 + assert len(stats["string_columns"]) == 1 + assert len(stats["mixed_type_columns"]) == 0 + assert stats["has_multi_values"] is True From 6090e9709ddb7ca762b03293d72d80fd3665cc6d Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 11:39:36 +0100 Subject: [PATCH 034/100] Add more tests to validator to cover more cases --- .../test_sample_metadata_tsv_validator.py | 136 +++++++++++++++++- 1 file changed, 134 insertions(+), 2 deletions(-) diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 3b9370fb..5eb17763 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -2,6 +2,8 @@ Unit tests for the MetadataTSVValidator class. """ +from pathlib import Path + import pytest from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator @@ -67,6 +69,49 @@ def format_errors_tsv(tmp_path): return tsv_file +@pytest.fixture +def type_errors_tsv(tmp_path): + """Create TSV with type errors: mixed types in column and cell, hyphen in numeric. 
+ + Population: Has both cell-level error (1;three;5) and column-level mixed types (numeric + string) + Test: Has column-level mixed types (all numeric values + string 'all') + Code: String column with hyphen in one value + """ + tsv_content = """#Sample_ID\tPopulation\tTest\tCode +S1\t1\t2\tA100 +S2\tabc\t3\tB200 +S3\t1;three;5\tall\tC300 +S4\t3-5\t4\tD400 +""" + tsv_file = tmp_path / "type_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def no_multi_values_tsv(tmp_path): + """Create a TSV file with no semicolon-separated values in any cell.""" + tsv_content = """#Sample_ID\tPopulation\nS1\t1\nS2\t2\n""" + tsv_file = tmp_path / "no_multi_values.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def numeric_multi_values_tsv(tmp_path): + """Create a TSV file with multi-value numeric cells to verify they're classified as numeric.""" + tsv_content = """#Sample_ID\tScores\tValues +S1\t1;2;3\t10;20 +S2\t4;5\t30;40;50 +S3\t6\t60 +S4\t7;8;9;10\t70 +S5\t11\t80;90 +""" + tsv_file = tmp_path / "numeric_multi_values.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): """Valid TSV should pass with no errors or warnings.""" validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) @@ -159,12 +204,72 @@ def test_whitespace_warning(self, format_errors_tsv, project_samples): assert any("leading or trailing whitespace" in w for w in warnings) +class TestTypeValidation: + """Test validation of column types.""" + + def test_mixed_types_in_column(self, type_errors_tsv, project_samples): + """Columns with mixed numeric and string types should be detected.""" + validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("mixed types" in e.lower() and "Population" in e for e in errors) + + def 
test_mixed_types_in_cell(self, type_errors_tsv, project_samples): + """Cells with mixed types (e.g., '1;three;5') should be detected.""" + validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Cell '1;three;5' contains mixed types" in e for e in errors) + + def test_hyphen_in_numeric_column(self, type_errors_tsv, project_samples): + """Hyphens in numeric columns should be detected.""" + validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("hyphen" in e.lower() and "3-5" in e for e in errors) + + def test_cell_and_column_level_mixed_types(self, type_errors_tsv, project_samples): + """When a column has both cell-level and column-level mixed types, both errors should be reported.""" + validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Cell '1;three;5' contains mixed types" in e for e in errors) + assert any("following columns contain mixed types" in e and "Population" in e for e in errors) + assert "Population" in stats["mixed_type_columns"] + assert "Test" in stats["mixed_type_columns"] + + def test_stats_show_mixed_type_columns_with_cell_errors(self, type_errors_tsv, project_samples): + """ + Stats should show columns as mixed-type even when they have cell-level errors. + The type_errors_tsv fixture used here has two columns with mixed types. 
+ """ + validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert "Population" in stats["mixed_type_columns"] + assert "Test" in stats["mixed_type_columns"] + assert len(stats["mixed_type_columns"]) == 2 + + def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, project_samples): + """Multi-value numeric cells (e.g., '2;4') should be correctly classified as numeric, not string or mixed-type.""" + validator = MetadataTSVValidator(file_path=numeric_multi_values_tsv, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert "Scores" in stats["numeric_columns"] + assert "Values" in stats["numeric_columns"] + assert "Scores" not in stats["string_columns"] + assert "Values" not in stats["string_columns"] + assert "Scores" not in stats["mixed_type_columns"] + assert "Values" not in stats["mixed_type_columns"] + assert not any("mixed types" in e.lower() and ("Scores" in e or "Values" in e) for e in errors) + + class TestDimensionMatching: """Test validation against project dimensions.""" def test_samples_not_in_project(self, valid_tsv): """Samples in TSV but not in project should be errors.""" - project_samples = {"S1", "S2"} + project_samples = {"S1", "S2"} # Only S1 and S2 exist in project dimensions validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) stats, errors, warnings = validator.validate() @@ -175,7 +280,7 @@ def test_samples_not_in_project(self, valid_tsv): def test_samples_not_in_tsv(self, valid_tsv): """Samples in project but not in TSV should be warnings.""" - project_samples = {"S1", "S2", "S3", "S10", "S20"} + project_samples = {"S1", "S2", "S3", "S10", "S20"} # S10 and S20 not in TSV validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) stats, errors, warnings = validator.validate() @@ -206,3 +311,30 @@ def 
test_statistics_collection(self, valid_tsv, project_samples): assert len(stats["string_columns"]) == 1 assert len(stats["mixed_type_columns"]) == 0 assert stats["has_multi_values"] is True + + def test_no_multi_values_detected(self, no_multi_values_tsv): + """Test detection when no semicolon-separated values present.""" + validator = MetadataTSVValidator(file_path=no_multi_values_tsv, project_samples={"S1", "S2"}) + stats, errors, warnings = validator.validate() + assert stats["has_multi_values"] is False + + +class TestEdgeCases: + """Test edge cases and error conditions.""" + + def test_empty_file(self, project_samples, tmp_path): + """Empty file should be detected.""" + empty_file = tmp_path / "empty.tsv" + empty_file.write_text("") + + validator = MetadataTSVValidator(file_path=empty_file, project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("File is empty" in e for e in errors) + + def test_nonexistent_file(self, project_samples): + """Nonexistent file should be handled gracefully.""" + validator = MetadataTSVValidator(file_path=Path("/nonexistent/file.tsv"), project_samples=project_samples) + stats, errors, warnings = validator.validate() + + assert any("Failed to read file" in e for e in errors) From 6b2b6cda254e5613e5b4abbe3e0931986723ec5d Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 11:44:44 +0100 Subject: [PATCH 035/100] Add section on TSV validator to quick start guide --- docs/user-guides/quick-start.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/user-guides/quick-start.md b/docs/user-guides/quick-start.md index 77d8218c..ba68194f 100644 --- a/docs/user-guides/quick-start.md +++ b/docs/user-guides/quick-start.md @@ -134,7 +134,13 @@ Example of a sidecar metadata TSV file with the mandatory `Sample_ID` column and !!! note Please use a text editor than preserves the tabs when the file is saved. 
Incorrect tabs can lead to issues with running metadata queries in DivBase. -The sample metadata file should then be uploaded the the DivBase project with follows: +There is a command to help check that the sidecar metadata TSV is correctly formatted for use with DivBase. Running it is optional: + +```bash +divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv +``` + +When you are happy with the sample metadata file, it should be uploaded to the DivBase project with the following: ```bash divbase-cli files upload path/to/your/sample_metadata.tsv From d6795c37c80525f7180aa3ee61196c5f8718ccc3 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 13:42:07 +0100 Subject: [PATCH 036/100] Add section on validator to sidecar metadata docs --- docs/user-guides/sidecar-metadata.md | 59 +++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 41ecd558..5d5e2348 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -4,6 +4,15 @@ TODO - rationale: what is this and how can it be used +assumes that update dimensions has been run for the latest data + +!!! Notes + There is a CLI command to help check that a user-defined sample metadata TSV file aligns with the requirements described on this page. This validator tool will be [described in its own section below](#validating-a-sidecar-metadata-tsv-with-divbase-cli), but, in short, it can be run with: + + ```bash + divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv + ``` + ## Creating a sidecar TSV for a DivBase project If the dimensions VCF files in the project have been cached in DivBase, a template metadata file with the sample names pre-filled can be created with: @@ -23,6 +32,8 @@ Note! there can be multiple TSVs in the same project and it is possible to call 3.
The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column. 4. Every column need to be tab separated for all rows. +TODO non empty, unique, no duplication, no semicolones in Sample_ID + #### User-defined columns After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. @@ -38,6 +49,8 @@ To ensure that user-defined metadata can be used in DivBase, we ask you follow t 4. As outlined above, the only characters with special meaning or restrictions in the TSV are `#`, `,`, `;`, and `\t` (tab). Other special characters should be supported, but please be aware that Your Milage May Vary. Some common cases that have been tested and are supported include hyphens (`-`), e.g.`North-West`), diacritic unicodecharacters like `å`,`ä`,`ö`. 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". +TODO - add info on No duplicate column names, no empty column names + #### Example This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. @@ -54,16 +67,43 @@ S7 1;3;5 South 22.6 S8 2 West 19.5 ``` -TODOs: +### Validating a sidecar metadata TSV with `divbase-cli` -- [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` -- [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? 
-- [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? can this be checked against the dimensions show? -- [TO BE IMPLEMENTED] what happens if a sample is duplicated in the file. what happens if the sample name is duplicated but not the values (diverging duplicate)? +Manually checking that a TSV fulfills the DivBase requirement can be tedious. To help users validate their sidecar TSV files, the following CLI command has been implemented: + +```bash +divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv +``` + +The validation runs on the user's local computer and not as a job on the DivBase server. It is intended to be used on sidecar metadata TSV files before they are uploaded to the DivBase project. The validator will check the formatting requirements as described in [Mandatory contents](#mandatory-content) and [User-defined columns](#user-defined-columns). + +The command requires that the project's dimensions index is up-to-date with the VCF files in the project, and that is why it is sorted under `divbase-cli dimensions` in the CLI command tree. If you are unsure if the dimensions index is up-to-date, just run `divbase-cli dimensions update` and wait until that job has completed by checking `divbase-cli task-history user`. + +The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneous or even misleading query results, and the validator will help with spotting that. + +The following will return errors.
These must be fixed if the sidecar TSV should be used in DivBase queries: + +- Header formatting: Header row is missing or first column is not #Sample_ID, Duplicate or empty column names + +- Tab separation: Row has the wrong number of columns + +- `Sample_ID` : Empty Sample_ID,Sample_ID contains a semicolon,Duplicate Sample_ID + +- Unsupported characters: no commas in cell values; no hyphens in numerical columns + +- Type consistency (numeric and string values): no Mixed types in a column or in a cell in a cell (e.g., 1;abc) + +- All samples listed in in TSV must exist in the dimensions index + +The validator will also raise Warnings. DivBase queries can still be run with these, but the user should review them, and possible address them if so desired: + +- Cell value has leading or trailing whitespace (will be stripped by server) + +- Samples in the project’s dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, espcially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. ## Query Syntax for sidecar metadata -- TODO: explain warnings +- TODO: explain warnings, these should be the same as the validator, but this needs to be checked - TODO: explain when empty results or all results are returned ### Overview: querys are applied as filters on columns in the TSV @@ -164,6 +204,13 @@ divbase-cli query tsv "Area:North,West,!South;Weight:>10,<=20,!15,18-22" There are three samples (rows) that fulfill this, and this is what the query results will return: `S1`, `S4`, and `S5`. +TODOs: + +- [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` +- [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? +- [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? 
can this be checked against the dimensions show? +- [TO BE IMPLEMENTED] what happens if a sample is duplicated in the file. what happens if the sample name is duplicated but not the values (diverging duplicate)? + - [TO BE IMPLEMENTED] what to do if a query references a column that does not exist. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Area does not exist? This should probably give a warning and not just return nothing - [TO BE IMPLEMENTED] what to do if a query references a column value. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Northern Portugal does not exist in the column? This should probably also give a warning and not just return nothing, but nothing is a result here and not a syntax problem... From 82b8d2188c3e07b0b2c54f68f4c6cf1a94784a39 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 13:51:18 +0100 Subject: [PATCH 037/100] Use classmethod in MetadataTSVValidator Mainly a point of style, but this allows us to avoid instantiating a MetadataTSVValidator object when running the validate method. The object will not be reused after instantiation so this feels cleaner. Could of course use a post-init strategy instead. 
--- .../cli_commands/dimensions_cli.py | 3 +- .../services/sample_metadata_tsv_validator.py | 28 ++++---- .../test_sample_metadata_tsv_validator.py | 66 +++++++------------ 3 files changed, 37 insertions(+), 60 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 5d47470d..e9d0800a 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -282,8 +282,7 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( for entry in dimensions_info.get("indexed_files", []): unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) - validator = MetadataTSVValidator(file_path=input_path, project_samples=unique_sample_names) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(file_path=input_path, project_samples=unique_sample_names) if stats: print("[bold cyan]VALIDATION SUMMARY:[/bold cyan]") diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 45b0862c..852cc696 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -25,34 +25,34 @@ def __init__(self, file_path: Path, project_samples: set[str]): self.warnings: list[str] = [] self.stats: dict = {} - def validate(self) -> tuple[dict, list[str], list[str]]: + @classmethod + def validate(cls, file_path: Path, project_samples: set[str]) -> tuple[dict, list[str], list[str]]: """ - Run all validation checks on the TSV file. 
- Returns a tuple of (stats, errors, warnings) where stats is a dictionary of collected statistics about the TSV file, + Validate a TSV file and return results. + + Returns a tuple of (stats, errors, warnings) where stats is a dictionary of collected statistics, errors is a list of error messages, and warnings is a list of warning messages. """ - self.errors = [] - self.warnings = [] - self.stats = {} + validator = cls(file_path, project_samples) try: - with open(self.file_path, "r", newline="", encoding="utf-8") as f: + with open(validator.file_path, "r", newline="", encoding="utf-8") as f: reader = csv.reader(f, delimiter="\t") rows = list(reader) except Exception as e: - self.errors.append(f"Failed to read file: {e}") - return self.stats, self.errors, self.warnings + validator.errors.append(f"Failed to read file: {e}") + return validator.stats, validator.errors, validator.warnings if not rows: - self.errors.append("File is empty") - return self.stats, self.errors, self.warnings + validator.errors.append("File is empty") + return validator.stats, validator.errors, validator.warnings - self._validate_header(rows[0]) + validator._validate_header(rows[0]) if len(rows) > 1: - self._validate_data_rows(rows) + validator._validate_data_rows(rows) - return self.stats, self.errors, self.warnings + return validator.stats, validator.errors, validator.warnings def _validate_header(self, header: list[str]) -> None: """Validate the header row.""" diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 5eb17763..172386e7 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -114,8 +114,7 @@ def numeric_multi_values_tsv(tmp_path): def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): """Valid TSV should pass with no errors or warnings.""" - validator = MetadataTSVValidator(file_path=valid_tsv, 
project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert len(errors) == 0 assert len(warnings) == 0 @@ -134,22 +133,19 @@ class TestHeaderValidation: def test_wrong_first_column_name(self, header_errors_tsv, project_samples): """First column must be '#Sample_ID'.""" - validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("First column must be named '#Sample_ID'" in e for e in errors) def test_duplicate_column_names(self, header_errors_tsv, project_samples): """Duplicate column names should be detected.""" - validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("Duplicate column names" in e and "Area" in e for e in errors) def test_empty_column_name(self, header_errors_tsv, project_samples): """Empty column names should be detected.""" - validator = MetadataTSVValidator(file_path=header_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("Empty column name" in e for e in errors) @@ -159,22 +155,19 @@ class TestSampleIDValidation: def test_empty_sample_id(self, sample_errors_tsv, project_samples): """Empty Sample_ID should be detected.""" - validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("Sample_ID is empty" in e for e in 
errors) def test_semicolon_in_sample_id(self, sample_errors_tsv, project_samples): """Sample_ID containing semicolon should be detected.""" - validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("contains semicolon" in e and "S3;S4" in e for e in errors) def test_duplicate_sample_id(self, sample_errors_tsv, project_samples): """Duplicate Sample_IDs should be detected.""" - validator = MetadataTSVValidator(file_path=sample_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) @@ -184,22 +177,19 @@ class TestFormattingValidation: def test_wrong_column_count(self, format_errors_tsv, project_samples): """Rows with wrong number of columns should be detected.""" - validator = MetadataTSVValidator(file_path=format_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) def test_comma_in_cell(self, format_errors_tsv, project_samples): """Commas in cells should be detected.""" - validator = MetadataTSVValidator(file_path=format_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("forbidden character ','" in e for e in errors) def test_whitespace_warning(self, format_errors_tsv, project_samples): """Leading/trailing whitespace should generate warnings.""" - validator = MetadataTSVValidator(file_path=format_errors_tsv, 
project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("leading or trailing whitespace" in w for w in warnings) @@ -209,29 +199,25 @@ class TestTypeValidation: def test_mixed_types_in_column(self, type_errors_tsv, project_samples): """Columns with mixed numeric and string types should be detected.""" - validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("mixed types" in e.lower() and "Population" in e for e in errors) def test_mixed_types_in_cell(self, type_errors_tsv, project_samples): """Cells with mixed types (e.g., '1;three;5') should be detected.""" - validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("Cell '1;three;5' contains mixed types" in e for e in errors) def test_hyphen_in_numeric_column(self, type_errors_tsv, project_samples): """Hyphens in numeric columns should be detected.""" - validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("hyphen" in e.lower() and "3-5" in e for e in errors) def test_cell_and_column_level_mixed_types(self, type_errors_tsv, project_samples): """When a column has both cell-level and column-level mixed types, both errors should be reported.""" - validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = 
MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("Cell '1;three;5' contains mixed types" in e for e in errors) assert any("following columns contain mixed types" in e and "Population" in e for e in errors) @@ -243,8 +229,7 @@ def test_stats_show_mixed_type_columns_with_cell_errors(self, type_errors_tsv, p Stats should show columns as mixed-type even when they have cell-level errors. The type_errors_tsv fixture used here has two columns with mixed types. """ - validator = MetadataTSVValidator(file_path=type_errors_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert "Population" in stats["mixed_type_columns"] assert "Test" in stats["mixed_type_columns"] @@ -252,8 +237,7 @@ def test_stats_show_mixed_type_columns_with_cell_errors(self, type_errors_tsv, p def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, project_samples): """Multi-value numeric cells (e.g., '2;4') should be correctly classified as numeric, not string or mixed-type.""" - validator = MetadataTSVValidator(file_path=numeric_multi_values_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(numeric_multi_values_tsv, project_samples) assert "Scores" in stats["numeric_columns"] assert "Values" in stats["numeric_columns"] @@ -270,8 +254,7 @@ class TestDimensionMatching: def test_samples_not_in_project(self, valid_tsv): """Samples in TSV but not in project should be errors.""" project_samples = {"S1", "S2"} # Only S1 and S2 exist in project dimensions - validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert any( "following samples in the TSV were not found in the 
DivBase project's dimensions index" in e and "S3" in e @@ -281,8 +264,7 @@ def test_samples_not_in_project(self, valid_tsv): def test_samples_not_in_tsv(self, valid_tsv): """Samples in project but not in TSV should be warnings.""" project_samples = {"S1", "S2", "S3", "S10", "S20"} # S10 and S20 not in TSV - validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert any( "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S10" in w @@ -299,8 +281,7 @@ class TestStatistics: def test_statistics_collection(self, valid_tsv, project_samples): """Verify statistics are correctly collected.""" - validator = MetadataTSVValidator(file_path=valid_tsv, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert stats["total_columns"] == 4 assert stats["user_defined_columns"] == 3 @@ -314,8 +295,7 @@ def test_statistics_collection(self, valid_tsv, project_samples): def test_no_multi_values_detected(self, no_multi_values_tsv): """Test detection when no semicolon-separated values present.""" - validator = MetadataTSVValidator(file_path=no_multi_values_tsv, project_samples={"S1", "S2"}) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(no_multi_values_tsv, {"S1", "S2"}) assert stats["has_multi_values"] is False @@ -327,14 +307,12 @@ def test_empty_file(self, project_samples, tmp_path): empty_file = tmp_path / "empty.tsv" empty_file.write_text("") - validator = MetadataTSVValidator(file_path=empty_file, project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(empty_file, project_samples) assert any("File is empty" in e 
for e in errors) def test_nonexistent_file(self, project_samples): """Nonexistent file should be handled gracefully.""" - validator = MetadataTSVValidator(file_path=Path("/nonexistent/file.tsv"), project_samples=project_samples) - stats, errors, warnings = validator.validate() + stats, errors, warnings = MetadataTSVValidator.validate(Path("/nonexistent/file.tsv"), project_samples) assert any("Failed to read file" in e for e in errors) From 3e26c25ad6fe61af58bf41d2184c6d6860fb609a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 13:55:16 +0100 Subject: [PATCH 038/100] Add mkdocs autogen docs for dimensions CLI command Contains the new template and validator subcommands --- docs/cli/_auto_generated/dimensions.md | 54 ++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/cli/_auto_generated/dimensions.md b/docs/cli/_auto_generated/dimensions.md index 37e63bd8..874187a7 100644 --- a/docs/cli/_auto_generated/dimensions.md +++ b/docs/cli/_auto_generated/dimensions.md @@ -18,6 +18,8 @@ $ divbase-cli dimensions [OPTIONS] COMMAND [ARGS]... * `update`: Calculate and add the dimensions of a VCF... * `show`: Show the dimensions index file for a project. +* `create-metadata-template`: Use the samples index in a projects... +* `validate-metadata-file`: Validate a sidecar metadata TSV file... ## `divbase-cli dimensions update` @@ -49,5 +51,57 @@ $ divbase-cli dimensions show [OPTIONS] * `--filename TEXT`: If set, will show only the entry for this VCF filename. * `--unique-scaffolds`: If set, will show all unique scaffold names found across all the VCF files in the project. +* `--unique-samples`: If set, will show all unique sample names found across all the VCF files in the project. +* `--project TEXT`: Name of the DivBase project, if not provided uses the default in your DivBase config file +* `--help`: Show this message and exit. 
+ +## `divbase-cli dimensions create-metadata-template` + +Use the samples index in a projects dimensions cache to create a TSV metadata template file +that has the sample names as pre-filled as the first column. + +**Usage**: + +```console +$ divbase-cli dimensions create-metadata-template [OPTIONS] +``` + +**Options**: + +* `-o, --output TEXT`: Name of the output TSV file to create. Defaults to sample_metadata_<project_name>.tsv. If a file with the same name already exists in the current directory, you will be prompted to confirm if you want to overwrite it. +* `--project TEXT`: Name of the DivBase project, if not provided uses the default in your DivBase config file +* `--help`: Show this message and exit. + +## `divbase-cli dimensions validate-metadata-file` + +Validate a sidecar metadata TSV file against DivBase formatting requirements and project dimensions. + +Validation is run client-side to keep sensitive metadata local during validation. + +Validation checks: +- File is properly tab-delimited +- First column is named '#Sample_ID' +- No commas in cells +- Sample_ID has only one value per row (no semicolons) +- No duplicate sample IDs +- Invalid characters +- Basic type consistency in user-defined columns. But not Pandas type inference, + as we want to avoid having the user install Pandas just for validation. So just check that numeric columns have only numeric values (excluding header). +- All samples in the TSV exist in the project's dimensions index + +Returns errors for critical issues and warnings for non-critical issues. + +**Usage**: + +```console +$ divbase-cli dimensions validate-metadata-file [OPTIONS] INPUT_FILENAME +``` + +**Arguments**: + +* `INPUT_FILENAME`: Name of the input TSV file to validate. [required] + +**Options**: + * `--project TEXT`: Name of the DivBase project, if not provided uses the default in your DivBase config file * `--help`: Show this message and exit. 
From 1ba014b58bdab890a44a5e49983272c5a73c501d Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 16:01:47 +0100 Subject: [PATCH 039/100] Add text to migration dev docs on branch switching Just some notes on how I handled an awkward migrations mismatch after reviewing another branch with migrations that caused errors in my local Docker Compose stack after switching back to my branch. --- docs/development/database-migrations.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/development/database-migrations.md b/docs/development/database-migrations.md index ac94eab7..4c5c46d2 100644 --- a/docs/development/database-migrations.md +++ b/docs/development/database-migrations.md @@ -95,6 +95,26 @@ You can also run the `pytest-alembic` tests to further validate the newly create pytest tests/migrations ``` +### Local Dev with Docker Compose: Handling Alembic Migration Errors When Switching Branches + +When working in local development, there are cases where you might to temporarily switch to a different git branch that has a different set of Alembic migration files. For instance, you are working on one branch and switch to review another branch with additional migration files that, for some reason, you do not want to merge into your branch yet. + +After switching back to your branch, you might find that the Docker Compose stack no longer will be able to start. The reason for this is: if the database's migration history (the `alembic_version` table) in the Docker Compose environment expects a migration revision that is missing in your current branch, the `divbase-db-migrator-1` container will fail to start and log an error like: `FAILED: Can't locate revision identified by ''` + +If you **are sure** your database schema matches the migrations in your current branch, you can manually update the `alembic_version` table in your local `divbase-postgres-1` container to point to the latest migration in your branch. 
Find the revision ID for the latest migration file in your branch (``) and run: + +```bash +docker exec -it divbase-postgres-1 psql -U divbase_user -d divbase_db -c "UPDATE alembic_version SET version_num = '';" +``` + +After this, restart the stack with: + +```bash +docker compose -f docker/divbase_compose.yaml down && docker compose -f docker/divbase_compose.yaml watch +``` + +**Warning**: Only do this for local Docker Compose environments and for cases where you know that you can recover/rebuild/afford to lose the data in the local postgres instance. If you are not sure about this, it might actually be safer to merge in the other branch to yours (assuming that you know that both branches will eventually be merged to main after review). + ## Production Deployment Documentation on how to run migrations in production/deployed environments is covered in our [private repository, argocd-divbase](https://github.com/ScilifelabDataCentre/argocd-divbase). From 9c3c4da4ec6f4beae699dfdf3272c36d87890fad Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 16:39:21 +0100 Subject: [PATCH 040/100] Add Sample_ID exceptions in SidecarQueryManager Check already on load_file(). Sample_ID column must exist, have no empty/missing values, and all values must be unique (no duplicates). Add new exception to handle missing and duplicate Sample_IDs. Ensure that the exceptions are correctly propagated to the API route and returned to the user. 
--- .../divbase-api/src/divbase_api/routes/queries.py | 11 ++++++++--- .../divbase-api/src/divbase_api/services/queries.py | 13 +++++++++++++ packages/divbase-lib/src/divbase_lib/exceptions.py | 6 ++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/routes/queries.py b/packages/divbase-api/src/divbase_api/routes/queries.py index 0b2c580d..7c9d9637 100644 --- a/packages/divbase-api/src/divbase_api/routes/queries.py +++ b/packages/divbase-api/src/divbase_api/routes/queries.py @@ -29,7 +29,11 @@ SampleMetadataQueryRequest, SampleMetadataQueryTaskResult, ) -from divbase_lib.exceptions import SidecarInvalidFilterError +from divbase_lib.exceptions import ( + SidecarColumnNotFoundError, + SidecarInvalidFilterError, + SidecarSampleIDError, +) logging.basicConfig(level=settings.api.log_level, handlers=[logging.StreamHandler(sys.stderr)]) @@ -82,8 +86,9 @@ async def sample_metadata_query( # TODO - consider if we split this into 2 routes to handle time out issues on CLI side. # Route 1, create job and get back job id. # Route 2, get job result by id (with status etc), CLI can poll until done. 
- except SidecarInvalidFilterError as e: - # Catch invalid filter errors (e.g., mixed types in columns) and return 400 + + except (SidecarInvalidFilterError, SidecarColumnNotFoundError, SidecarSampleIDError) as e: + # Catch validation errors (mixed types, missing columns, invalid Sample_IDs) and return 400 error_message = str(e) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=error_message) from None except VCFDimensionsEntryMissingError: diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 76f167e2..2c7c937a 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -26,6 +26,7 @@ SidecarColumnNotFoundError, SidecarInvalidFilterError, SidecarNoDataLoadedError, + SidecarSampleIDError, ) logger = logging.getLogger(__name__) @@ -713,7 +714,19 @@ def load_file(self) -> "SidecarQueryManager": if "Sample_ID" not in self.df.columns: raise SidecarColumnNotFoundError("The 'Sample_ID' column is required in the metadata file.") + if self.df["Sample_ID"].isna().any() or (self.df["Sample_ID"] == "").any(): + raise SidecarSampleIDError( + "Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID." + ) + if self.df["Sample_ID"].duplicated().any(): + duplicates = self.df[self.df["Sample_ID"].duplicated()]["Sample_ID"].tolist() + raise SidecarSampleIDError(f"Duplicate Sample_IDs found: {duplicates}. Each Sample_ID must be unique.") + + except (SidecarSampleIDError, SidecarColumnNotFoundError, SidecarInvalidFilterError): + # Let validation errors propagate directly to user with specific error messages + raise except Exception as e: + # Only wrap unexpected errors (file I/O, pandas errors, etc.) 
raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self diff --git a/packages/divbase-lib/src/divbase_lib/exceptions.py b/packages/divbase-lib/src/divbase_lib/exceptions.py index e06bd764..5b72981a 100644 --- a/packages/divbase-lib/src/divbase_lib/exceptions.py +++ b/packages/divbase-lib/src/divbase_lib/exceptions.py @@ -101,6 +101,12 @@ class SidecarColumnNotFoundError(Exception): pass +class SidecarSampleIDError(Exception): + """Raised when a Sample_ID is invalid or duplicated when loading sidecar data.""" + + pass + + class NoVCFFilesFoundError(Exception): """Raised when no VCF files are found in the project bucket.""" From 7aeda1b5db11f13f6bebef07bf648aa735e6b8ea Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 16:58:19 +0100 Subject: [PATCH 041/100] Add unit tests for the Sample_ID exceptions --- .../test_sample_metadata_queries.py | 75 ++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 782dc577..c18085da 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -5,7 +5,7 @@ import pytest from divbase_api.services.queries import SidecarQueryManager -from divbase_lib.exceptions import SidecarInvalidFilterError +from divbase_lib.exceptions import SidecarColumnNotFoundError, SidecarInvalidFilterError, SidecarSampleIDError @pytest.fixture @@ -56,6 +56,56 @@ def sample_tsv_with_edge_cases(tmp_path): return tsv_file +@pytest.fixture +def sample_tsv_with_invalid_sample_ids(tmp_path): + """ + Create a temporary TSV file to test Sample_ID validation: + Has empty and duplicate Sample_IDs that both should raise error during load + """ + tsv_content = """#Sample_ID\tPopulation\tWeight +S1\t1\t20.2 +\t2\t25.0 +S3\t3\t30.8 +S3\t4\t35.1 +""" + tsv_file = tmp_path / 
"test_metadata_invalid_sample_ids.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_missing_sample_id_column(tmp_path): + """ + Create a temporary TSV file that omits the Sample_ID column. + Should trigger SidecarColumnNotFoundError during file load. + """ + tsv_content = """Population\tWeight\tAge\tArea +1\t20.2\t5.0\tNorth +2\t25.0\t10\tEast +3\t30.8\t15\tWest +""" + tsv_file = tmp_path / "test_metadata_missing_sample_id.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_duplicate_sample_ids(tmp_path): + """ + Create a temporary TSV file to test duplicate Sample_IDs (should raise error during load). + """ + tsv_content = """#Sample_ID\tPopulation\tWeight +S1\t1\t20.2 +S2\t2\t25.0 +S3\t3\t30.8 +S3\t4\t35.1 +S4\t5\t40.0 +""" + tsv_file = tmp_path / "test_duplicate_sample_ids.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + class TestNumericalFilteringInequalities: """Test inequality operators on numerical columns.""" @@ -622,3 +672,26 @@ def test_not_operator_only_negations(self, sample_tsv_with_edge_cases): assert "S4" in sample_ids assert "S1" not in sample_ids assert "S2" not in sample_ids + + +class TestSampleIDValidation: + """Test Sample_ID validation during file loading.""" + + def test_empty_sample_id_raises_error(self, sample_tsv_with_invalid_sample_ids): + """Test that empty Sample_ID values raise SidecarSampleIDError directly during file load.""" + with pytest.raises(SidecarSampleIDError) as excinfo: + SidecarQueryManager(file=sample_tsv_with_invalid_sample_ids) + assert "Sample_ID column contains empty or missing values" in str(excinfo.value) + + def test_duplicate_sample_id_raises_error(self, sample_tsv_with_duplicate_sample_ids): + """Test that duplicate Sample_ID values raise SidecarSampleIDError directly during file load.""" + with pytest.raises(SidecarSampleIDError) as excinfo: + SidecarQueryManager(file=sample_tsv_with_duplicate_sample_ids) + 
assert "Duplicate Sample_IDs found" in str(excinfo.value) + assert "S3" in str(excinfo.value) + + def test_missing_sample_id_column_raises_error(self, sample_tsv_missing_sample_id_column): + """Test that missing Sample_ID column raises SidecarColumnNotFoundError during file load.""" + with pytest.raises(SidecarColumnNotFoundError) as excinfo: + SidecarQueryManager(file=sample_tsv_missing_sample_id_column) + assert "The 'Sample_ID' column is required in the metadata file." in str(excinfo.value) From 8b2a1e918e74a52719e7ab7e41b293718fec87cb Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 17:05:26 +0100 Subject: [PATCH 042/100] Update error/warning handling in validator Should now hopefully handle everything equivalent to the query logic in SidecarQueryManager. --- .../cli_commands/dimensions_cli.py | 6 +++++ .../services/sample_metadata_tsv_validator.py | 26 ++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index e9d0800a..c964e344 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -314,6 +314,12 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( else: print(" Multi-value cells: No") + empty_cells = stats.get("empty_cells_per_column", {}) + if empty_cells: + print( + f" User-defined columns with empty cells ({len(empty_cells)}): {', '.join(f'{col} ({count})' for col, count in empty_cells.items())}" + ) + print() if errors: diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 852cc696..6ea1268a 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ 
b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -81,6 +81,7 @@ def _validate_data_rows(self, rows: list[list[str]]) -> None: tsv_samples = set() column_types: dict[int, set[str]] = {i: set() for i in range(1, num_columns)} + empty_cells_per_column: dict[str, int] = {header[i]: 0 for i in range(1, num_columns)} has_multi_values = False @@ -113,8 +114,11 @@ def _validate_data_rows(self, rows: list[list[str]]) -> None: for col_idx, cell in enumerate(row): self._validate_cell(row_num, col_idx, header[col_idx], cell) - # Track column types for user-defined columns (skip col 0, i.e. Sample_ID) + # Track column types and empty-cells for user-defined columns (skip col 0, i.e. Sample_ID) if col_idx > 0: + if not cell.strip(): + empty_cells_per_column[header[col_idx]] += 1 + if ";" in cell: has_multi_values = True self._infer_column_type(row_num, col_idx, header[col_idx], cell, column_types) @@ -123,7 +127,7 @@ def _validate_data_rows(self, rows: list[list[str]]) -> None: self._validate_sample_names(tsv_samples) - self._collect_statistics(header, tsv_samples, column_types, has_multi_values) + self._collect_statistics(header, tsv_samples, column_types, has_multi_values, empty_cells_per_column) def _validate_cell(self, row_num: int, col_idx: int, col_name: str, cell: str) -> None: """Validate an individual cell.""" @@ -138,6 +142,12 @@ def _validate_cell(self, row_num: int, col_idx: int, col_name: str, cell: str) - "(will be stripped by server)" ) + if col_idx > 0 and not cell.strip(): + self.warnings.append( + f"Row {row_num}, Column '{col_name}': Cell is empty. " + "Empty values will be treated as missing by the server and will not match any filter conditions in queries." 
+ ) + def _infer_column_type( self, row_num: int, col_idx: int, col_name: str, cell: str, column_types: dict[int, set[str]] ) -> None: @@ -217,7 +227,12 @@ def _validate_sample_names(self, tsv_samples: set[str]) -> None: ) def _collect_statistics( - self, header: list[str], tsv_samples: set[str], column_types: dict[int, set[str]], has_multi_values: bool + self, + header: list[str], + tsv_samples: set[str], + column_types: dict[int, set[str]], + has_multi_values: bool, + empty_cells_per_column: dict[str, int], ) -> None: """Collect statistics about the TSV file.""" @@ -246,3 +261,8 @@ def _collect_statistics( self.stats["string_columns"] = string_cols self.stats["mixed_type_columns"] = mixed_cols self.stats["has_multi_values"] = has_multi_values + + # Only include columns with empty cells in stats + columns_with_empty_cells = {col: count for col, count in empty_cells_per_column.items() if count > 0} + if columns_with_empty_cells: + self.stats["empty_cells_per_column"] = columns_with_empty_cells From 8e9d29b5f1d511b2f59e5c33a45d11cb976f36c4 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 17:32:27 +0100 Subject: [PATCH 043/100] Support negative numbers The checks to disallow hyphens for ranges in numeric columns were also disallowing negative numbers, which was not intended. This update fixes this in the SidecarQueryManager and the TSV validator.
--- .../src/divbase_api/services/queries.py | 26 ++++++++--------- .../services/sample_metadata_tsv_validator.py | 29 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 2c7c937a..ab90db5e 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -999,19 +999,19 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: if not part: continue - # Check if the value contains a hyphen and looks like it could be numeric (e.g., "1-2", "3-4") - if "-" in part and any(p.isdigit() for p in part): - raise SidecarInvalidFilterError( - f"Column '{key}' contains value '{part}' with a hyphen at row {row_index}. " - f"Hyphens are not allowed in numeric column values (only in string columns). " - f"If this is meant to be a string column, all values should be non-numeric strings." - ) - try: float(part) has_numeric_type = True except ValueError: has_non_numeric_type = True + # If not numeric and contains a hyphen with digits, it's likely a range notation like "1-2". Negative numbers should already have been classified as numeric with the float() check. + if "-" in part and any(c.isdigit() for c in part): + raise SidecarInvalidFilterError( + f"Column '{key}' contains value '{part}' with a hyphen at row {row_index}. " + f"This appears to be range notation (e.g., '1-2'), which is not allowed in data values. " + f"If this is meant to be a numeric column, use semicolons to separate values (e.g., '1;2'). " + f"If this is meant to be a string column, all values should be non-numeric strings." 
+ ) from None if has_numeric_type and has_non_numeric_type: raise SidecarInvalidFilterError( @@ -1186,14 +1186,14 @@ def _parse_numeric_filter_values( for filter_string_value in values_to_process: # Check for common mistakes: =< or => instead of <= or >= - if re.match(r"^=<\d+\.?\d*$", filter_string_value) or re.match(r"^=>\d+\.?\d*$", filter_string_value): + if re.match(r"^=<-?\d+\.?\d*$", filter_string_value) or re.match(r"^=>-?\d+\.?\d*$", filter_string_value): raise SidecarInvalidFilterError( f"Invalid operator format '{filter_string_value[:2]}' in filter '{key}:{filter_string_values}'." f"Use standard operators: '<=' (not '=<') or '>=' (not '=>')" ) - # Check if it's an inequality (e.g., ">25", "<=40") - inequality_match = re.match(r"^(>=|<=|>|<)(\d+\.?\d*)$", filter_string_value) + # Check if it's an inequality (e.g., ">25", "<=40", "<-5") + inequality_match = re.match(r"^(>=|<=|>|<)(-?\d+\.?\d*)$", filter_string_value) if inequality_match: operator = inequality_match.group(1) threshold = float(inequality_match.group(2)) @@ -1205,8 +1205,8 @@ def _parse_numeric_filter_values( ) continue - # Check if it's a range (e.g., "20-40") - range_match = re.match(r"^(\d+\.?\d*)-(\d+\.?\d*)$", filter_string_value) + # Check if it's a range (e.g., "20-40", "-100--50", "10-20") + range_match = re.match(r"^(-?\d+\.?\d*)-(-?\d+\.?\d*)$", filter_string_value) if range_match: min_val = float(range_match.group(1)) max_val = float(range_match.group(2)) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 6ea1268a..d3ca557c 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -161,20 +161,8 @@ def _infer_column_type( cell_has_string = False for value in values: - # Check for hyphens in values that might be numeric - # The 
server-side logic rejects hyphens in numeric columns (e.g., "1-2" could be confused with range syntax) - if ( - "-" in value - and any(c.isdigit() for c in value) - and ("numeric" in column_types[col_idx] or all(t == "numeric" for t in column_types[col_idx] if t)) - ): - self.errors.append( - f"Row {row_num}, Column '{col_name}': Value '{value}' contains a hyphen. " - f"Hyphens are not allowed in numeric column values (only in string columns). " - f"If this is meant to be a string column, all values should be non-numeric strings." - ) - - # Try to determine if numeric or string. Note! The queries used Pandas for this, so there could potentially be a discrepency here. + # Try to determine if numeric or string first + # Note! The queries use Pandas for this which is not used here due to different dependencies in the packages. There could potentially be a discrepancy here. try: float(value) cell_has_numeric = True @@ -183,6 +171,19 @@ def _infer_column_type( cell_has_string = True column_types[col_idx].add("string") + # Check for hyphens in non-numeric values that might indicate range notation. Negative numbers should already have been classified as numeric with the float() check. + if ( + "-" in value + and any(c.isdigit() for c in value) + and ("numeric" in column_types[col_idx] or all(t == "numeric" for t in column_types[col_idx] if t)) + ): + self.errors.append( + f"Row {row_num}, Column '{col_name}': Value '{value}' contains a hyphen. " + f"This appears to be range notation (e.g., '1-2'), which is not allowed in data values. " + f"If this is meant to be a numeric column, use semicolons to separate values (e.g., '1;2'). " + f"If this is meant to be a string column, all values should be non-numeric strings." 
+ ) + # Check for mixed types within the same cell (e.g., "1;abc") if cell_has_numeric and cell_has_string: self.errors.append( From 6d450053ba35b40c0a6a61bc5bffa3c5b9a43b57 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 11 Feb 2026 17:51:23 +0100 Subject: [PATCH 044/100] Update unit tests with negative number cases --- .../test_sample_metadata_queries.py | 76 ++++++++++++++++--- .../test_sample_metadata_tsv_validator.py | 67 ++++++++++++---- 2 files changed, 117 insertions(+), 26 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index c18085da..3cbd8836 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -13,20 +13,21 @@ def sample_tsv_with_numeric_data(tmp_path): """ Create a temporary TSV file with numeric and string columns for testing. Includes semicolon-separated values in some cells. Includes both int and float - numeric values to test that both are detected as numeric. + numeric values to test that both are detected as numeric. Also includes negative + numbers to verify they are properly handled as numeric values. """ # Keep indentation like this to ensure that leading spaces in column 1 does not cause issues. 
- tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString -S1\t1\t20.2\t5.0\tNorth\t100\tString -S2\t2;4\t25.0\t10\tEast\t200\tStrings -S3\t3\t30.8\t15\tWest;South;East\t300\tSting -S4\t4\t35.1\t20\tWest\t400\tStings -S5\t5\t40.0\t25\tNorth\t500\tThing -S6\t6\t45.4\t30\tEast\t600\tThings -S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong -S8\t2\t55.2\t40\tWest\t800\tStrung -S9\t7\t62.6\t45\tNorth\t900\tStang -S10\t8\t70.7\t52\tEast\t1000\tSong + tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString\tTemperature\tLongitude\tLatitude\tElevation +S1\t1\t20.2\t5.0\tNorth\t100\tString\t-5.5\t-2.78305556\t51.5\t100 +S2\t2;4\t25.0\t10\tEast\t200\tStrings\t-10.2\t-0.12765\t52.2\t-50 +S3\t3\t30.8\t15\tWest;South;East\t300\tSting\t0\t1.25\t50.8\t-100.5 +S4\t4\t35.1\t20\tWest\t400\tStings\t15.5\t-3.5;-2.1\t49.5\t200 +S5\t5\t40.0\t25\tNorth\t500\tThing\t-20\t0\t48.2\t-25 +S6\t6\t45.4\t30\tEast\t600\tThings\t10\t2.5\t53.1\t150 +S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong\t5\t-1.5\t52.8\t50 +S8\t2\t55.2\t40\tWest\t800\tStrung\t20\t3.0\t51.0\t75 +S9\t7\t62.6\t45\tNorth\t900\tStang\t-15\t-2.0\t54.5\t-10 +S10\t8\t70.7\t52\tEast\t1000\tSong\t25\t1.5\t50.5\t200 """ tsv_file = tmp_path / "test_metadata.tsv" tsv_file.write_text(tsv_content) @@ -695,3 +696,54 @@ def test_missing_sample_id_column_raises_error(self, sample_tsv_missing_sample_i with pytest.raises(SidecarColumnNotFoundError) as excinfo: SidecarQueryManager(file=sample_tsv_missing_sample_id_column) assert "The 'Sample_ID' column is required in the metadata file." 
in str(excinfo.value) + + +class TestNegativeNumbers: + """Test that negative numbers are properly handled as numeric values.""" + + def test_negative_numbers_in_single_value_column(self, sample_tsv_with_numeric_data): + """Test that negative numbers in single-value columns are treated as numeric and can be filtered with inequalities.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Temperature:<0") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S1" in sample_ids # -5.5 + assert "S2" in sample_ids # -10.2 + assert "S5" in sample_ids # -20 + assert "S9" in sample_ids # -15 + + def test_negative_numbers_discrete_values(self, sample_tsv_with_numeric_data): + """Test that negative numbers can be used as discrete filter values.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + result = manager.run_query(filter_string="Temperature:-5.5,-20") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 2 + assert "S1" in sample_ids + assert "S5" in sample_ids + + def test_negative_numbers_greater_than_inequality(self, sample_tsv_with_numeric_data): + """Test greater than with negative numbers.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + # Temperature: -5.5, -10.2, 0, 15.5, -20, 10, 5, 20, -15, 25 + # Less than -5 should match: -5.5, -10.2, -20, -15 + result = manager.run_query(filter_string="Temperature:<-5") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 4 + assert "S1" in sample_ids # -5.5 < -5 + assert "S2" in sample_ids # -10.2 < -5 + assert "S5" in sample_ids # -20 < -5 + assert "S9" in sample_ids # -15 < -5 + + def test_negative_numbers_in_semicolon_cells(self, sample_tsv_with_numeric_data): + """Test that negative numbers in semicolon-separated cells work correctly.""" + manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) + # Longitude 
values: -2.78305556, -0.12765, 1.25, -3.5;-2.1, 0, 2.5, -1.5, 3.0, -2.0, 1.5 + # Discrete value -3.5 should only match S4 which has -3.5;-2.1 + result = manager.run_query(filter_string="Longitude:-3.5") + + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 172386e7..50a85d66 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -71,17 +71,18 @@ def format_errors_tsv(tmp_path): @pytest.fixture def type_errors_tsv(tmp_path): - """Create TSV with type errors: mixed types in column and cell, hyphen in numeric. + """Create TSV with type errors: mixed types in column and cell, hyphen in numeric, and range notation. Population: Has both cell-level error (1;three;5) and column-level mixed types (numeric + string) Test: Has column-level mixed types (all numeric values + string 'all') Code: String column with hyphen in one value + Range: Contains range notation (e.g., '1-2') which should be rejected in numeric columns """ - tsv_content = """#Sample_ID\tPopulation\tTest\tCode -S1\t1\t2\tA100 -S2\tabc\t3\tB200 -S3\t1;three;5\tall\tC300 -S4\t3-5\t4\tD400 + tsv_content = """#Sample_ID\tPopulation\tTest\tCode\tRange +S1\t1\t2\tA100\t1-2 +S2\tabc\t3\tB200\t3 +S3\t1;three;5\tall\tC300\t4 +S4\t3-5\t4\tD400\t5 """ tsv_file = tmp_path / "type_errors.tsv" tsv_file.write_text(tsv_content) @@ -99,19 +100,25 @@ def no_multi_values_tsv(tmp_path): @pytest.fixture def numeric_multi_values_tsv(tmp_path): - """Create a TSV file with multi-value numeric cells to verify they're classified as numeric.""" - tsv_content = """#Sample_ID\tScores\tValues -S1\t1;2;3\t10;20 -S2\t4;5\t30;40;50 -S3\t6\t60 -S4\t7;8;9;10\t70 -S5\t11\t80;90 + """Create a TSV file with multi-value numeric cells and negative numbers to 
verify they're classified as numeric.""" + tsv_content = """#Sample_ID\tScores\tValues\tTemperature\tLongitude\tLatitude\tElevation +S1\t1;2;3\t10;20\t-5.5\t-2.78305556\t51.5\t100 +S2\t4;5\t30;40;50\t-10.2\t-0.12765\t52.2\t-50 +S3\t6\t60\t0\t1.25\t50.8\t-100.5 +S4\t7;8;9;10\t70\t15.5\t-3.5;-2.1\t49.5\t200 +S5\t11\t80;90\t-20\t0\t48.2\t-25 """ tsv_file = tmp_path / "numeric_multi_values.tsv" tsv_file.write_text(tsv_content) return tsv_file +@pytest.fixture +def negative_numeric_columns(): + """Columns in the numeric_multi_values_tsv fixture that should be classified as numeric (including negative values).""" + return ["Temperature", "Longitude", "Latitude", "Elevation"] + + def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): """Valid TSV should pass with no errors or warnings.""" stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) @@ -233,7 +240,7 @@ def test_stats_show_mixed_type_columns_with_cell_errors(self, type_errors_tsv, p assert "Population" in stats["mixed_type_columns"] assert "Test" in stats["mixed_type_columns"] - assert len(stats["mixed_type_columns"]) == 2 + assert len(stats["mixed_type_columns"]) == 3 def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, project_samples): """Multi-value numeric cells (e.g., '2;4') should be correctly classified as numeric, not string or mixed-type.""" @@ -316,3 +323,35 @@ def test_nonexistent_file(self, project_samples): stats, errors, warnings = MetadataTSVValidator.validate(Path("/nonexistent/file.tsv"), project_samples) assert any("Failed to read file" in e for e in errors) + + +class TestNegativeNumbers: + """Test that negative numbers are properly handled as numeric values.""" + + def test_negative_numbers_are_numeric(self, numeric_multi_values_tsv, negative_numeric_columns): + """Test that negative numbers are correctly classified as numeric, not flagged as errors due to hyphen check for ranges in numeric cells.""" + stats, errors, warnings = 
MetadataTSVValidator.validate( + numeric_multi_values_tsv, {"S1", "S2", "S3", "S4", "S5"} + ) + + for col in negative_numeric_columns: + assert not any("hyphen" in e.lower() and col in e for e in errors) + assert col in stats["numeric_columns"] + + assert len(stats["mixed_type_columns"]) == 0 + + def test_negative_numbers_with_semicolons(self, numeric_multi_values_tsv, negative_numeric_columns): + """Test that negative numbers in semicolon-separated cells are handled correctly.""" + stats, errors, warnings = MetadataTSVValidator.validate( + numeric_multi_values_tsv, {"S1", "S2", "S3", "S4", "S5"} + ) + + assert "Longitude" in negative_numeric_columns + assert "Longitude" in stats["numeric_columns"] + assert "Longitude" not in stats["mixed_type_columns"] + assert not any("Longitude" in e and "mixed" in e.lower() for e in errors) + + def test_range_notation_still_rejected(self, type_errors_tsv): + """Test that range notation like '1-2' is still rejected in numeric columns.""" + stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, {"S1", "S2", "S3", "S4"}) + assert any("mixed" in e.lower() and "Range" in e for e in errors) From 058aedc00d19c49466358e521d6ae6f1aec234c2 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 09:14:00 +0100 Subject: [PATCH 045/100] Add intro to sidecar metadata user guide --- docs/user-guides/query-syntax.md | 4 +++ docs/user-guides/sidecar-metadata.md | 38 ++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/user-guides/query-syntax.md b/docs/user-guides/query-syntax.md index d516f8bc..fb68ffca 100644 --- a/docs/user-guides/query-syntax.md +++ b/docs/user-guides/query-syntax.md @@ -1,3 +1,7 @@ # DivBase Query Syntax for VCF data TODO + +## combined sample metadata and VCF queries + +TODO - there is a link to here from the sample metadata guide, so the combined queries should be described in detail here diff --git a/docs/user-guides/sidecar-metadata.md 
b/docs/user-guides/sidecar-metadata.md index 5d5e2348..3b00c724 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -1,10 +1,12 @@ # Sidecar Metadata TSV files: creating and querying sample metadata files -TODO +DivBase lets users supply a sidecar TSV (tab-separated values) file with metadata on the samples contained within the VCF files in the DivBase project. -- rationale: what is this and how can it be used +There are ways for sample metadata to be stored in the VCF itself (see [The Variant Call Format Specification](https://samtools.github.io/hts-specs/VCFv4.5.pdf)). For instance, it can be stored in a global `##SAMPLE` header (once per sample) or in a custom per-variant genotype `FORMAT` field for each variant and sample. The downside of the former is that common tools like `bcftools view` do not filter on the headers; the downside of the latter is that writing the metadata once per variant will result in a lot of repeated data, which in turn leads to elevated file size and processing times as the VCF file scales. -assumes that update dimensions has been run for the latest data +DivBase takes a different approach by decoupling the sample metadata from the VCF data by storing it in a sidecar file. The sidecar TSV can be queried on its own, or together with the VCF files in the DivBase project. The TSV is lightweight and highly extendable (essentially a plain-text form of a spreadsheet). This approach avoids having to read, write, and rewrite metadata to the VCF files and therefore keeps the resource overhead low for the sample metadata. + +To be able to accommodate metadata needs for any research project that deals with VCF files, the sidecar sample metadata TSV and filtering in DivBase have been designed to be very open-ended and user-defined. As long as a few format and filter syntax requirements are met, the user is free to design their metadata TSV as they like.
Column names in the TSV represent metadata categories and rows represent the samples found in the VCF files in the DivBase project. However, this flexibility puts the responsibility on the user to ensure that spelling and values in columns and rows are correct: if not, the sample metadata filters will return incomplete or unintended results. !!! Notes There is a CLI command to help check that a user-defined sample metadata TSV file aligns with the requirements described on this page. This validator tool will be [described in its own section below](#validating-a-sidecar-metadata-tsv-with-divbase-cli), but, in short, it can be run with: @@ -13,7 +15,22 @@ assumes that update dimensions has been run for the latest data divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv ``` -## Creating a sidecar TSV for a DivBase project +This guide will describe how to [Create a sample metadata TSV](#creating-a-sidecar-sample-metadata-tsv-for-a-divbase-project), and [How to run queries on sample metadata TSV files](#query-syntax-for-sidecar-metadata). Instructions on how to run combined sample metadata and VCF data queries are found in [DivBase Query Syntax for VCF data](query-syntax.md). + +!!! Warning + All instructions regarding running DivBase queries, generating sample metadata templates, and validating sample metadata TSV files require that the project's VCF dimensions index is updated against the current versions of the VCF files in the project's data store. This can be ensured by running the command: + + ```bash + divbase-cli dimensions update + ``` + + Depending on the number and sizes of the VCF files, this can take a little time.
To check the status of the dimensions update job, use the command: + + ```bash + divbase-cli task-history user + ``` + +## Creating a sidecar sample metadata TSV for a DivBase project If the dimensions VCF files in the project have been cached in DivBase, a template metadata file with the sample names pre-filled can be created with: @@ -23,8 +40,12 @@ divbase-cli dimensions create-metadata-template Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. +TODO - give more example of how and when it can be relevant to have multiple tsv files. they can have sample subsets + ### Sidecar TSV format requirements +TODO - There is no fixed schema but some mandatory requirements + #### Mandatory content 1. The first row must be a header row and the first column must be named `Sample_ID`. @@ -103,6 +124,8 @@ The validator will also raise Warnings. DivBase queries can still be run with th ## Query Syntax for sidecar metadata +This section describes how to query on the sample metadata file itself. The same syntax used here will also be used when running combined sample metadata and VCF data queries; how to do that is covered in [DivBase Query Syntax for VCF data](query-syntax.md). + - TODO: explain warnings, these should be the same as the validator, but this needs to be checked - TODO: explain when empty results or all results are returned @@ -190,7 +213,7 @@ The `!` (NOT) operator can really come to good use for numerical filters: - `"Weight:>5,!10-15"` returns rows where the value is greater than 5, but not in the range 10–15. - `"Weight:!1-2,4"` returns rows where the value is not in the range 1–2, or is 4. 
-## Examples of complex queries +### Examples of complex queries Assuming that the sidecar metadata TSV file looks like in the [Example](#example) above, a query like will: @@ -206,11 +229,6 @@ There are three samples (rows) that fulfill this, and this is what the query res TODOs: -- [TO BE IMPLEMENTED] consider changing the mandatory column name from `Sample_ID` to `Sample` -- [TO BE IMPLEMENTED] what happens if a TSV does not contain all the samples in the DivBase project? There should probably be a warning, but not an error? -- [TO BE IMPLEMENTED] what happens if a sample name is misspelled in the TSV? a warning? can this be checked against the dimensions show? -- [TO BE IMPLEMENTED] what happens if a sample is duplicated in the file. what happens if the sample name is duplicated but not the values (diverging duplicate)? - - [TO BE IMPLEMENTED] what to do if a query references a column that does not exist. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Area does not exist? This should probably give a warning and not just return nothing - [TO BE IMPLEMENTED] what to do if a query references a column value. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Northern Portugal does not exist in the column? This should probably also give a warning and not just return nothing, but nothing is a result here and not a syntax problem... 
From ea2bcc3b28eb70ebad7e9ea6803d56b544319537 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 09:14:36 +0100 Subject: [PATCH 046/100] Move dimensions user guide up in the query tree Since it is a prerequisite for everything else, it makes more sense for it to come after the query overview --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 5d200d37..4dc8bf28 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,8 +75,8 @@ nav: - Working with VCF Files in DivBase: user-guides/vcf-files.md - Running Queries: - Overview: user-guides/running-queries.md - - "Sidecar Metadata TSV files: creating and querying sample metadata files": user-guides/sidecar-metadata.md - VCF Dimensions caching: user-guides/vcf-dimensions.md + - "Sidecar Metadata TSV files: creating and querying sample metadata files": user-guides/sidecar-metadata.md - DivBase Query Syntax for VCF data: user-guides/query-syntax.md - How to create efficient DivBase queries: user-guides/how-to-create-efficient-divbase-queries.md - "Tutorial: Running a query on a public dataset": user-guides/tutorial-query-on-public-data.md From 9c7baec4d241b4f6567d93a837bcc51dea7fc7fe Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 10:12:02 +0100 Subject: [PATCH 047/100] Add CRUD, pydantic model, route for unique samples To avoid using the endpoint that returns all VCF metadata from the project's dimensions entries. 
--- .../src/divbase_api/crud/vcf_dimensions.py | 15 +++++++++++- .../src/divbase_api/routes/vcf_dimensions.py | 23 ++++++++++++++++++- .../divbase_lib/api_schemas/vcf_dimensions.py | 6 +++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py index 69731364..9b8e137f 100644 --- a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py @@ -4,7 +4,7 @@ import logging -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession from divbase_api.models.vcf_dimensions import SkippedVCFDB, VCFMetadataDB @@ -49,3 +49,16 @@ async def get_skipped_vcfs_by_project_async(db: AsyncSession, project_id: int) - stmt = select(SkippedVCFDB).where(SkippedVCFDB.project_id == project_id) result = await db.execute(stmt) return list(result.scalars().all()) + + +async def get_unique_samples_by_project_async(db: AsyncSession, project_id: int) -> list[str]: + """ + Get unique sample names across all VCF files from a project's dimensions entries. + + Samples are stored as ARRAY(String) in the VCFMetadataDB model and need to be flattened before finding the unique values. + To do all operations on the PostgreSQL side (to avoid having to do it here on the FastAPI side), we first need to use unnest() to flatten the arrays.
+ """ + + stmt = select(func.unnest(VCFMetadataDB.samples)).where(VCFMetadataDB.project_id == project_id).distinct() + result = await db.execute(stmt) + return sorted([row[0] for row in result]) diff --git a/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py index c35dd81d..a93158e4 100644 --- a/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py @@ -11,6 +11,7 @@ from divbase_api.crud.task_history import create_task_history_entry from divbase_api.crud.vcf_dimensions import ( get_skipped_vcfs_by_project_async, + get_unique_samples_by_project_async, get_vcf_metadata_by_project_async, ) from divbase_api.db import get_db @@ -19,7 +20,7 @@ from divbase_api.models.projects import ProjectDB, ProjectRoles from divbase_api.models.users import UserDB from divbase_api.worker.tasks import update_vcf_dimensions_task -from divbase_lib.api_schemas.vcf_dimensions import DimensionsShowResult, DimensionUpdateKwargs +from divbase_lib.api_schemas.vcf_dimensions import DimensionsSamplesResult, DimensionsShowResult, DimensionUpdateKwargs logger = logging.getLogger(__name__) @@ -102,3 +103,23 @@ async def update_vcf_dimensions_endpoint( ) return job_id + + +@vcf_dimensions_router.get( + "/projects/{project_name}/samples", status_code=status.HTTP_200_OK, response_model=DimensionsSamplesResult +) +async def list_unique_samples_endpoint( + project_name: str, + project_and_user_and_role: tuple[ProjectDB, UserDB, ProjectRoles] = Depends(get_project_member), + db: AsyncSession = Depends(get_db), +) -> DimensionsSamplesResult: + """Get all unique sample names across project VCFs.""" + + project, current_user, role = project_and_user_and_role + + if not has_required_role(role, ProjectRoles.READ): + raise AuthorizationError("You don't have permission to view VCF dimensions for this project.") + + result = await 
get_unique_samples_by_project_async(db, project.id) + + return DimensionsSamplesResult(unique_samples=result) diff --git a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py index 53441412..20c57d71 100644 --- a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py +++ b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py @@ -40,3 +40,9 @@ class DimensionsShowResult(BaseModel): vcf_files: list[dict] skipped_file_count: int skipped_files: list[dict] + + +class DimensionsSamplesResult(BaseModel): + """Result model for showing unique samples across project VCFs.""" + + unique_samples: list[str] From ae8928cc247662e8e8565302cd69f3e2f6133496 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 10:25:15 +0100 Subject: [PATCH 048/100] Update template and validator command for new CRUD Now calls the endpoint that only returns unique sample names across project VCFs, and not the endpoint that returns all dimensions metadata. 
--- .../cli_commands/dimensions_cli.py | 25 +++++-------------- .../services/sample_metadata_tsv_validator.py | 8 +++--- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index c964e344..12d049d2 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -10,7 +10,7 @@ from divbase_cli.config_resolver import resolve_project from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator from divbase_cli.user_auth import make_authenticated_request -from divbase_lib.api_schemas.vcf_dimensions import DimensionsShowResult +from divbase_lib.api_schemas.vcf_dimensions import DimensionsSamplesResult, DimensionsShowResult logger = logging.getLogger(__name__) @@ -188,17 +188,11 @@ def create_metadata_template_with_project_samples_names( response = make_authenticated_request( method="GET", divbase_base_url=project_config.divbase_url, - api_route=f"v1/vcf-dimensions/projects/{project_config.name}", + api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", ) - vcf_dimensions_data = DimensionsShowResult(**response.json()) - - dimensions_info = _format_api_response_for_display_in_terminal(vcf_dimensions_data) - - unique_sample_names = set() - for entry in dimensions_info.get("indexed_files", []): - unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) + vcf_dimensions_data = DimensionsSamplesResult(**response.json()) - unique_sample_names_sorted = sorted(unique_sample_names) + unique_sample_names_sorted = sorted(vcf_dimensions_data.unique_samples) sample_count = len(unique_sample_names_sorted) print( f"There were {sample_count} unique samples found in the dimensions file for the {project_config.name} project." 
@@ -271,16 +265,9 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( response = make_authenticated_request( method="GET", divbase_base_url=project_config.divbase_url, - api_route=f"v1/vcf-dimensions/projects/{project_config.name}", + api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", ) - vcf_dimensions_data = DimensionsShowResult(**response.json()) - dimensions_info = _format_api_response_for_display_in_terminal(vcf_dimensions_data) - - # TODO there is duplication here with other commands in this file. Could be made DRY with a helper function or a separate CRUD function to get dimensions info without needing to parse API response on client side. - - unique_sample_names = set() - for entry in dimensions_info.get("indexed_files", []): - unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) + unique_sample_names = DimensionsSamplesResult(**response.json()).unique_samples stats, errors, warnings = MetadataTSVValidator.validate(file_path=input_path, project_samples=unique_sample_names) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index d3ca557c..f3ddea70 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -14,19 +14,19 @@ class MetadataTSVValidator: FORBIDDEN_CHARS = [","] - def __init__(self, file_path: Path, project_samples: set[str]): + def __init__(self, file_path: Path, project_samples: list[str] | set[str]): """ Initialize the validator. File path is the path to the TSV file to validate, - and project_samples is a set of unique sample names from the project's dimensions index. + and project_samples is a list or set of unique sample names from the project's dimensions index. 
""" self.file_path = file_path - self.project_samples = project_samples + self.project_samples = set(project_samples) if isinstance(project_samples, list) else project_samples self.errors: list[str] = [] self.warnings: list[str] = [] self.stats: dict = {} @classmethod - def validate(cls, file_path: Path, project_samples: set[str]) -> tuple[dict, list[str], list[str]]: + def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tuple[dict, list[str], list[str]]: """ Validate a TSV file and return results. From cd09ec7f6c82ff8a35ee653e4afa6eb01814ed82 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 10:31:49 +0100 Subject: [PATCH 049/100] Use the samples endpoint in dimensions show CLI divbase-cli dimensions show --unique-samples now calls the dedicated endpoint for fetching unique sample names across project VCFs, which is more efficient than fetching full dimensions and parsing on the client side. --- .../cli_commands/dimensions_cli.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 12d049d2..513b1b2d 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -69,6 +69,19 @@ def show_dimensions_index( project_config = resolve_project(project_name=project) + if unique_samples: + response = make_authenticated_request( + method="GET", + divbase_base_url=project_config.divbase_url, + api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", + ) + unique_sample_names_sorted = sorted(DimensionsSamplesResult(**response.json()).unique_samples) + sample_count = len(unique_sample_names_sorted) + print( + f"Unique sample names found across all the VCF files in the project (count: {sample_count}):\n{unique_sample_names_sorted}" + ) + return + 
response = make_authenticated_request( method="GET", divbase_base_url=project_config.divbase_url, @@ -114,17 +127,6 @@ def show_dimensions_index( print(f"Unique scaffold names found across all the VCF files in the project:\n{unique_scaffold_names_sorted}") return - if unique_samples: - # TODO for scalability: implement this as a separate CRUD instead of parsing all data on the client side - unique_sample_names = set() - for entry in dimensions_info.get("indexed_files", []): - unique_sample_names.update(entry.get("dimensions", {}).get("sample_names", [])) - - unique_sample_names_sorted = sorted(unique_sample_names) - - print(f"Unique sample names found across all the VCF files in the project:\n{unique_sample_names_sorted}") - return - print(yaml.safe_dump(dimensions_info, sort_keys=False)) From 74c3aeff813951e35c009dc8db888c5284cbbb81 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 10:44:28 +0100 Subject: [PATCH 050/100] Drop duplicate sorting step, already done in CRUD --- .../src/divbase_cli/cli_commands/dimensions_cli.py | 8 ++------ .../src/divbase_lib/api_schemas/vcf_dimensions.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 513b1b2d..7ff88dfd 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -75,7 +75,7 @@ def show_dimensions_index( divbase_base_url=project_config.divbase_url, api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", ) - unique_sample_names_sorted = sorted(DimensionsSamplesResult(**response.json()).unique_samples) + unique_sample_names_sorted = DimensionsSamplesResult(**response.json()).unique_samples sample_count = len(unique_sample_names_sorted) print( f"Unique sample names found across all the VCF files in the project (count: 
{sample_count}):\n{unique_sample_names_sorted}" @@ -179,9 +179,6 @@ def create_metadata_template_with_project_samples_names( that has the sample names as pre-filled as the first column. """ - # TODO this duplicates some code with show_dimensions_index() above. A refactoring should probably include creating a separate CRUD function - # so that the client does not need to parse all data. - project_config = resolve_project(project_name=project) if output_filename is None: @@ -192,9 +189,8 @@ def create_metadata_template_with_project_samples_names( divbase_base_url=project_config.divbase_url, api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", ) - vcf_dimensions_data = DimensionsSamplesResult(**response.json()) + unique_sample_names_sorted = DimensionsSamplesResult(**response.json()) - unique_sample_names_sorted = sorted(vcf_dimensions_data.unique_samples) sample_count = len(unique_sample_names_sorted) print( f"There were {sample_count} unique samples found in the dimensions file for the {project_config.name} project." 
diff --git a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py index 20c57d71..b3ac268a 100644 --- a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py +++ b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py @@ -45,4 +45,4 @@ class DimensionsShowResult(BaseModel): class DimensionsSamplesResult(BaseModel): """Result model for showing unique samples across project VCFs.""" - unique_samples: list[str] + unique_samples: list[str] # Already sorted, by the CRUD function get_unique_samples_by_project_async() From d366456c93ae7ac1b88c460397919d8a534b95ad Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 10:58:28 +0100 Subject: [PATCH 051/100] Add separate endpoint also for --unique-scaffolds Not strictly related to the TSV validation work, but the pattern is the same as for the unique samples endpoint so might just as well do it now. --- .../src/divbase_api/crud/vcf_dimensions.py | 18 +++++++ .../src/divbase_api/routes/vcf_dimensions.py | 28 +++++++++- .../cli_commands/dimensions_cli.py | 51 ++++++++++--------- .../divbase_lib/api_schemas/vcf_dimensions.py | 6 +++ 4 files changed, 79 insertions(+), 24 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py index 9b8e137f..b5da054a 100644 --- a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py @@ -62,3 +62,21 @@ async def get_unique_samples_by_project_async(db: AsyncSession, project_id: int) stmt = select(func.unnest(VCFMetadataDB.samples)).where(VCFMetadataDB.project_id == project_id).distinct() result = await db.execute(stmt) return sorted([row[0] for row in result]) + + +async def get_unique_scaffolds_by_project_async(db: AsyncSession, project_id: int) -> list[str]: + """ + Get unique scaffold names across all VCF files 
for a project. + + Like samples, scaffolds are stored in as ARRAY(String) in the VCFMetadataDB model and need to be flattened with the unnest() PostgreSQL function. + """ + + stmt = select(func.unnest(VCFMetadataDB.scaffolds)).where(VCFMetadataDB.project_id == project_id).distinct() + result = await db.execute(stmt) + scaffolds = [row[0] for row in result] + + # Sort scaffold names in the same way as the dimensions show CLI does when returning all dimensions data: numeric first, then alphabetic + # Numeric sorting of name strings results means that 10 comes after 2 for scaffolds that have numeric names. + numeric = sorted([int(s) for s in scaffolds if s.isdigit()]) + non_numeric = sorted([s for s in scaffolds if not s.isdigit()]) + return [str(n) for n in numeric] + non_numeric diff --git a/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py index a93158e4..7e6209a8 100644 --- a/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/routes/vcf_dimensions.py @@ -12,6 +12,7 @@ from divbase_api.crud.vcf_dimensions import ( get_skipped_vcfs_by_project_async, get_unique_samples_by_project_async, + get_unique_scaffolds_by_project_async, get_vcf_metadata_by_project_async, ) from divbase_api.db import get_db @@ -20,7 +21,12 @@ from divbase_api.models.projects import ProjectDB, ProjectRoles from divbase_api.models.users import UserDB from divbase_api.worker.tasks import update_vcf_dimensions_task -from divbase_lib.api_schemas.vcf_dimensions import DimensionsSamplesResult, DimensionsShowResult, DimensionUpdateKwargs +from divbase_lib.api_schemas.vcf_dimensions import ( + DimensionsSamplesResult, + DimensionsScaffoldsResult, + DimensionsShowResult, + DimensionUpdateKwargs, +) logger = logging.getLogger(__name__) @@ -123,3 +129,23 @@ async def list_unique_samples_endpoint( result = await get_unique_samples_by_project_async(db, project.id) return 
DimensionsSamplesResult(unique_samples=result) + + +@vcf_dimensions_router.get( + "/projects/{project_name}/scaffolds", status_code=status.HTTP_200_OK, response_model=DimensionsScaffoldsResult +) +async def list_unique_scaffolds_endpoint( + project_name: str, + project_and_user_and_role: tuple[ProjectDB, UserDB, ProjectRoles] = Depends(get_project_member), + db: AsyncSession = Depends(get_db), +) -> DimensionsScaffoldsResult: + """Get all unique scaffold names across project VCFs.""" + + project, current_user, role = project_and_user_and_role + + if not has_required_role(role, ProjectRoles.READ): + raise AuthorizationError("You don't have permission to view VCF dimensions for this project.") + + result = await get_unique_scaffolds_by_project_async(db, project.id) + + return DimensionsScaffoldsResult(unique_scaffolds=result) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 7ff88dfd..ae26b30c 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -10,7 +10,11 @@ from divbase_cli.config_resolver import resolve_project from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator from divbase_cli.user_auth import make_authenticated_request -from divbase_lib.api_schemas.vcf_dimensions import DimensionsSamplesResult, DimensionsShowResult +from divbase_lib.api_schemas.vcf_dimensions import ( + DimensionsSamplesResult, + DimensionsScaffoldsResult, + DimensionsShowResult, +) logger = logging.getLogger(__name__) @@ -82,6 +86,19 @@ def show_dimensions_index( ) return + if unique_scaffolds: + response = make_authenticated_request( + method="GET", + divbase_base_url=project_config.divbase_url, + api_route=f"v1/vcf-dimensions/projects/{project_config.name}/scaffolds", + ) + unique_scaffold_names_sorted = 
DimensionsScaffoldsResult(**response.json()).unique_scaffolds + scaffold_count = len(unique_scaffold_names_sorted) + print( + f"Unique scaffold names found across all the VCF files in the project (count: {scaffold_count}):\n{unique_scaffold_names_sorted}" + ) + return + response = make_authenticated_request( method="GET", divbase_base_url=project_config.divbase_url, @@ -106,27 +123,6 @@ def show_dimensions_index( ) return - if unique_scaffolds: - # TODO for scalability: implement this as a separate CRUD instead of parsing all data on the client side - unique_scaffold_names = set() - for entry in dimensions_info.get("indexed_files", []): - unique_scaffold_names.update(entry.get("dimensions", {}).get("scaffolds", [])) - - numeric_scaffold_names = [] - non_numeric_scaffold_names = [] - for scaffold in unique_scaffold_names: - if scaffold.isdigit(): - numeric_scaffold_names.append(int(scaffold)) - else: - non_numeric_scaffold_names.append(scaffold) - - unique_scaffold_names_sorted = [str(n) for n in sorted(numeric_scaffold_names)] + sorted( - non_numeric_scaffold_names - ) - - print(f"Unique scaffold names found across all the VCF files in the project:\n{unique_scaffold_names_sorted}") - return - print(yaml.safe_dump(dimensions_info, sort_keys=False)) @@ -134,14 +130,23 @@ def _format_api_response_for_display_in_terminal(api_response: DimensionsShowRes """ Convert the API response to a YAML-like format for display in the user's terminal. 
""" + + def sort_scaffolds(scaffolds): + numeric = sorted([int(s) for s in scaffolds if s.isdigit()]) + non_numeric = sorted([s for s in scaffolds if not s.isdigit()]) + return [str(n) for n in numeric] + non_numeric + dimensions_list = [] for entry in api_response.vcf_files: + scaffolds = entry.get("scaffolds", []) + sorted_scaffolds = sort_scaffolds(scaffolds) dimensions_entry = { "filename": entry["vcf_file_s3_key"], "file_version_ID_in_bucket": entry["s3_version_id"], "last_updated": entry.get("updated_at"), "dimensions": { - "scaffolds": entry.get("scaffolds", []), + "scaffold_count": len(sorted_scaffolds), + "scaffolds": sorted_scaffolds, "sample_count": entry.get("sample_count", 0), "sample_names": entry.get("samples", []), "variants": entry.get("variant_count", 0), diff --git a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py index b3ac268a..23a3d376 100644 --- a/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py +++ b/packages/divbase-lib/src/divbase_lib/api_schemas/vcf_dimensions.py @@ -46,3 +46,9 @@ class DimensionsSamplesResult(BaseModel): """Result model for showing unique samples across project VCFs.""" unique_samples: list[str] # Already sorted, by the CRUD function get_unique_samples_by_project_async() + + +class DimensionsScaffoldsResult(BaseModel): + """Result model for showing unique scaffolds across project VCFs.""" + + unique_scaffolds: list[str] # Already sorted, by the CRUD function get_unique_scaffolds_by_project_async() From b3b9f5c332abd3af2fb31a297eb85694db1cdade Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 11:07:47 +0100 Subject: [PATCH 052/100] Clarify difference between API and worker crud The dimensions file has separate CRUDs depending on external and internal use. Added comments in the files to clarify the difference. 
--- .../src/divbase_api/crud/vcf_dimensions.py | 14 +++++++-- .../src/divbase_api/worker/crud_dimensions.py | 31 +++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py index b5da054a..9bcde473 100644 --- a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py @@ -1,5 +1,9 @@ """ CRUD operations for VCF dimensions. + +The functions in this file are intended to be used with API endpoints. +There are separate VCF dimensions CRUD functions for the Celery workers in +packages/divbase-api/src/divbase_api/worker/crud_dimensions.py """ import logging @@ -14,7 +18,8 @@ async def get_vcf_metadata_by_project_async(db: AsyncSession, project_id: int) -> dict: """ - FOR USER INTERACTIONS WITH API ONLY + FOR USER INTERACTIONS WITH API ONLY, Celery workers have their own dimensions CRUD functions. + Get all VCF metadata entries for a given project ID. """ stmt = select(VCFMetadataDB).where(VCFMetadataDB.project_id == project_id) @@ -43,7 +48,8 @@ async def get_vcf_metadata_by_project_async(db: AsyncSession, project_id: int) - async def get_skipped_vcfs_by_project_async(db: AsyncSession, project_id: int) -> list[SkippedVCFDB]: """ - FOR USER INTERACTIONS WITH API ONLY + FOR USER INTERACTIONS WITH API ONLY, Celery workers have their own dimensions CRUD functions. + Get all skipped VCF entries for a given project. """ stmt = select(SkippedVCFDB).where(SkippedVCFDB.project_id == project_id) @@ -53,6 +59,8 @@ async def get_skipped_vcfs_by_project_async(db: AsyncSession, project_id: int) - async def get_unique_samples_by_project_async(db: AsyncSession, project_id: int) -> list[str]: """ + FOR USER INTERACTIONS WITH API ONLY, Celery workers have their own dimensions CRUD functions. + Get unique sample names across all VCF files from a project's dimensions entries. 
Samples are stored in as ARRAY(String) in the VCFMetadataDB model and need to be flattened before finding the unqiue values. @@ -66,6 +74,8 @@ async def get_unique_samples_by_project_async(db: AsyncSession, project_id: int) async def get_unique_scaffolds_by_project_async(db: AsyncSession, project_id: int) -> list[str]: """ + FOR USER INTERACTIONS WITH API ONLY, Celery workers have their own dimensions CRUD functions. + Get unique scaffold names across all VCF files for a project. Like samples, scaffolds are stored in as ARRAY(String) in the VCFMetadataDB model and need to be flattened with the unnest() PostgreSQL function. diff --git a/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py b/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py index 8103476c..b75cf81e 100644 --- a/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py +++ b/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py @@ -1,3 +1,10 @@ +""" +CRUD operations for VCF dimensions for the Celery workers. +. +There are separate VCF dimensions CRUD functions for used with API endpoints in +packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +""" + import logging from sqlalchemy import delete, select @@ -11,6 +18,8 @@ def get_vcf_metadata_by_project(db: Session, project_id: int) -> dict: """ + FOR CELERY WORKERS, not for user interactions with API. + Get all VCF metadata entries for a given project ID. """ stmt = select(VCFMetadataDB).where(VCFMetadataDB.project_id == project_id) @@ -42,6 +51,8 @@ def get_vcf_metadata_by_project(db: Session, project_id: int) -> dict: def get_skipped_vcfs_by_project_worker(db: Session, project_id: int) -> dict[str, str]: """ + FOR CELERY WORKERS, not for user interactions with API. + Get all skipped VCF entries for a given project. 
""" stmt = select(SkippedVCFDB).where(SkippedVCFDB.project_id == project_id) @@ -57,6 +68,8 @@ def get_skipped_vcfs_by_project_worker(db: Session, project_id: int) -> dict[str def get_vcf_metadata_by_keys(db: Session, vcf_file_s3_key: str, project_id: int) -> VCFMetadataDB | None: """ + FOR CELERY WORKERS, not for user interactions with API. + Get VCF metadata by S3 key AND project ID (unique constraint). """ stmt = select(VCFMetadataDB).where( @@ -68,6 +81,8 @@ def get_vcf_metadata_by_keys(db: Session, vcf_file_s3_key: str, project_id: int) def create_or_update_vcf_metadata(db: Session, vcf_metadata_data: dict) -> None: """ + FOR CELERY WORKERS, not for user interactions with API. + Upsert (insert or update) a VCF metadata entry in the database. This function uses PostgreSQL's ON CONFLICT DO UPDATE to ensure that if a VCF metadata entry @@ -96,6 +111,8 @@ def create_or_update_vcf_metadata(db: Session, vcf_metadata_data: dict) -> None: def delete_vcf_metadata(db: Session, vcf_file_s3_key: str, project_id: int) -> None: """ + FOR CELERY WORKERS, not for user interactions with API. + Delete a VCF metadata entry by S3 key and project ID. Called when a VCF file is removed from the project bucket. @@ -110,7 +127,11 @@ def delete_vcf_metadata(db: Session, vcf_file_s3_key: str, project_id: int) -> N def get_skipped_vcf_by_keys(db: Session, vcf_file_s3_key: str, project_id: int) -> SkippedVCFDB | None: - """Get skipped VCF entry by S3 key and project ID.""" + """ + FOR CELERY WORKERS, not for user interactions with API. + + Get skipped VCF entry by S3 key and project ID. + """ stmt = select(SkippedVCFDB).where( SkippedVCFDB.vcf_file_s3_key == vcf_file_s3_key, SkippedVCFDB.project_id == project_id ) @@ -120,6 +141,8 @@ def get_skipped_vcf_by_keys(db: Session, vcf_file_s3_key: str, project_id: int) def create_or_update_skipped_vcf(db: Session, skipped_vcf_data: dict) -> SkippedVCFDB: """ + FOR CELERY WORKERS, not for user interactions with API. 
+ Upsert (update or insert) skipped VCF entry. Similar to create_or_update_vcf_metadata but for tracking the skipped VCF files (=old divbase results VCF files). """ stmt = insert(SkippedVCFDB).values(**skipped_vcf_data) @@ -143,7 +166,11 @@ def create_or_update_skipped_vcf(db: Session, skipped_vcf_data: dict) -> Skipped def delete_skipped_vcf(db: Session, vcf_file_s3_key: str, project_id: int) -> None: - """Delete a skipped VCF entry by S3 key and project ID.""" + """ + FOR CELERY WORKERS, not for user interactions with API. + + Delete a skipped VCF entry by S3 key and project ID. + """ # TODO - add test for deleting a non existant entry, does it raise an error? stmt = delete(SkippedVCFDB).where( SkippedVCFDB.vcf_file_s3_key == vcf_file_s3_key, SkippedVCFDB.project_id == project_id From e80f5b88ffa83a063496e3cc7c87e067ab7f9552 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 11:42:13 +0100 Subject: [PATCH 053/100] Fix mistake where property call was dropped --- .../divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index ae26b30c..cc9bfe88 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -194,7 +194,7 @@ def create_metadata_template_with_project_samples_names( divbase_base_url=project_config.divbase_url, api_route=f"v1/vcf-dimensions/projects/{project_config.name}/samples", ) - unique_sample_names_sorted = DimensionsSamplesResult(**response.json()) + unique_sample_names_sorted = DimensionsSamplesResult(**response.json()).unique_samples sample_count = len(unique_sample_names_sorted) print( From f9f12691dcfb01738d2219c83abbe9059bce246b Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 12:15:49 +0100 Subject: 
[PATCH 054/100] Add e2e tests for updated dimensions CLI commands One test with the validator fails, which highlighted a design issue with not allowing mixed-type columns that needs to be considered. It was discovered by using an older tsv fixture, which shows the importance of test data coverage... --- .../cli_commands/test_dimensions_cli.py | 324 ++++++++++++++++++ .../test_sample_metadata_queries.py | 20 +- 2 files changed, 332 insertions(+), 12 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_dimensions_cli.py b/tests/e2e_integration/cli_commands/test_dimensions_cli.py index 49209bab..83a7255e 100644 --- a/tests/e2e_integration/cli_commands/test_dimensions_cli.py +++ b/tests/e2e_integration/cli_commands/test_dimensions_cli.py @@ -4,6 +4,7 @@ import ast import gzip +import os import re from unittest.mock import patch @@ -27,6 +28,29 @@ def auto_clean_dimensions_entries_for_all_projects(clean_all_projects_dimensions yield +def _parse_list_from_cli_output(stdout: str) -> list: + """ + Helper function to parse a Python list from CLI output that may span multiple lines. 
+ """ + lines = stdout.splitlines() + list_text = "" + collecting = False + + for line in lines: + if "[" in line and "count:" not in line: + collecting = True + if collecting: + list_text += line + if "]" in line: + break + + assert list_text, f"List not found in output:\n{stdout}" + + list_start = list_text.find("[") + list_end = list_text.rfind("]") + 1 + return ast.literal_eval(list_text[list_start:list_end]) + + def test_update_vcf_dimensions_task_directly( CONSTANTS, run_update_dimensions, @@ -276,3 +300,303 @@ def test_update_dimensions_twice_with_no_new_VCF_added_inbetween( assert result_second_run.get("VCF_files_added") is None or result_second_run.get("VCF_files_added") == [], ( f"Expected no new files indexed, got: {result_second_run.get('VCF_files_added')}" ) + + +def test_show_unique_samples( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test the CLI 'dimensions show --unique-samples' command. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + command = f"dimensions show --project {project_name} --unique-samples" + cli_result = runner.invoke(app, command) + assert cli_result.exit_code == 0, f"Command failed with: {cli_result.stdout}" + + assert "count:" in cli_result.stdout, "Expected count to be displayed in output" + assert "Unique sample names found" in cli_result.stdout, "Expected header message" + assert "[" in cli_result.stdout and "]" in cli_result.stdout, "Expected list output" + + sample_names = _parse_list_from_cli_output(cli_result.stdout) + + assert isinstance(sample_names, list), f"Expected list, got {type(sample_names)}" + assert len(sample_names) > 0, "Expected at least one sample" + + assert sample_names == sorted(sample_names), f"Samples should be sorted: {sample_names}" + + +def test_show_unique_scaffolds_dedicated_endpoint( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test the CLI 'dimensions show --unique-scaffolds' command using the dedicated endpoint. + This tests both the CRUD function and the CLI integration. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + command = f"dimensions show --project {project_name} --unique-scaffolds" + cli_result = runner.invoke(app, command) + assert cli_result.exit_code == 0, f"Command failed with: {cli_result.stdout}" + assert "count:" in cli_result.stdout, "Expected count to be displayed in output" + + scaffold_names = _parse_list_from_cli_output(cli_result.stdout) + + expected_scaffolds = ["1", "4", "5", "6", "7", "8", "13", "18", "20", "21", "22", "24"] + assert scaffold_names == expected_scaffolds, f"Expected {expected_scaffolds}, got {scaffold_names}" + + # Verify numeric scaffolds come first, sorted numerically + numeric_scaffolds = [s for s in scaffold_names if s.isdigit()] + assert numeric_scaffolds == sorted(numeric_scaffolds, key=int), "Numeric scaffolds should be sorted numerically" + + +@pytest.mark.parametrize( + "option_flag,expected_message,expected_items,verify_sorting", + [ + ( + "--unique-samples", + "Unique sample names found", + [ + "1a_HOM-G34", + "1b_HOM-G55", + "1b_HOM-G58", + "1b_HOM-G83", + "4_HOM-P25", + "5a_HOM-I13", + "5a_HOM-I14", + "5a_HOM-I20", + "5a_HOM-I21", + "5a_HOM-I7", + "5b_HOM-H17", + "5b_HOM-H23", + "5b_HOM-H25", + "5b_HOM-H7", + "7_HOM-J21", + "8_HOM-E57", + "8_HOM-E59", + "8_HOM-E64", + "8_HOM-E74", + "8_HOM-E78", + ], + True, # Should be sorted alphabetically + ), + ( + "--unique-scaffolds", + "Unique scaffold names found", + ["1", "4", "5", "6", "7", "8", "13", "18", "20", "21", "22", "24"], + True, # Should be sorted numerically then alphabetically + ), + ], +) +def test_show_unique_items_parametrized( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + option_flag, + 
expected_message, + expected_items, + verify_sorting, +): + """ + Parametrized test for --unique-samples and --unique-scaffolds options. + Tests both the CRUD functions and CLI integration. + """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + command = f"dimensions show --project {project_name} {option_flag}" + cli_result = runner.invoke(app, command) + + assert cli_result.exit_code == 0, f"Command failed with: {cli_result.stdout}" + assert "count:" in cli_result.stdout, "Expected count to be displayed in output" + assert expected_message in cli_result.stdout, f"Expected message '{expected_message}' in output" + assert "[" in cli_result.stdout and "]" in cli_result.stdout, "Expected list output" + + items = _parse_list_from_cli_output(cli_result.stdout) + + assert isinstance(items, list), f"Expected list, got {type(items)}" + assert len(items) > 0, f"Expected at least one item in {option_flag} output" + + if expected_items is not None: + assert items == expected_items, f"Expected {expected_items}, got {items}" + + if verify_sorting and option_flag == "--unique-samples": + assert items == sorted(items), f"Samples should be sorted alphabetically: {items}" + elif verify_sorting and option_flag == "--unique-scaffolds": + numeric_items = [s for s in items if s.isdigit()] + assert numeric_items == sorted(numeric_items, key=int), "Numeric scaffolds should be sorted numerically" + + +def test_create_metadata_template( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, +): + """ + Test the CLI 'dimensions create-metadata-template' command. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + output_filename = f"test_metadata_{project_name}.tsv" + output_path = tmp_path / output_filename + + command = f"dimensions create-metadata-template --project {project_name} --output {output_path}" + cli_result = runner.invoke(app, command) + assert cli_result.exit_code == 0, f"Command failed with: {cli_result.stdout}" + assert output_path.exists(), f"Expected output file {output_path} to exist" + + with open(output_path, "r") as f: + lines = f.readlines() + assert lines[0].strip() == "#Sample_ID", f"Expected header '#Sample_ID', got {lines[0].strip()}" + assert len(lines) > 1, "Expected at least one sample in the template" + stdout_lower = cli_result.stdout.lower() + assert "unique samples" in stdout_lower or "samples found" in stdout_lower, ( + f"Expected message about unique samples, got: {cli_result.stdout}" + ) + assert str(output_path) in cli_result.stdout or "written" in stdout_lower, ( + "Expected output filename or confirmation message" + ) + + +def test_create_metadata_template_with_overwrite_prompt( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, +): + """ + Test that create-metadata-template prompts when file exists. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + output_filename = f"test_metadata_{project_name}.tsv" + output_path = tmp_path / output_filename + + # Create template file (does not exist since before) + command = f"dimensions create-metadata-template --project {project_name} --output {output_path}" + cli_result = runner.invoke(app, command) + assert cli_result.exit_code == 0, f"First creation failed: {cli_result.stdout}" + + # Try to create template file again and decline overwrite + cli_result = runner.invoke(app, command, input="n\n") + assert "already exists" in cli_result.stdout, "Expected overwrite prompt" + assert "not written" in cli_result.stdout.lower() or cli_result.exit_code != 0, ( + "Expected message about file not written or non-zero exit" + ) + + # Try to create template file again and accept overwrite + cli_result = runner.invoke(app, command, input="y\n") + assert cli_result.exit_code == 0, f"Expected exit code 0, got {cli_result.exit_code}. Output: {cli_result.stdout}" + assert "already exists" in cli_result.stdout, "Expected overwrite prompt" + + +def test_validate_metadata_file_valid( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test the CLI 'dimensions validate-metadata-file' command with a valid TSV file. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + fixture_path = os.path.join( + os.path.dirname(__file__), "../..", "fixtures", "sample_metadata_HOM_chr_split_version.tsv" + ) + + command = f"dimensions validate-metadata-file {fixture_path} --project {project_name}" + cli_result = runner.invoke(app, command) + + assert cli_result.exit_code == 0, f"Expected validation to succeed with exit code 0, got {cli_result.exit_code}" + assert "VALIDATION SUMMARY" in cli_result.stdout, "Expected validation summary" + assert "ERRORS" not in cli_result.stdout, f"Did not expect errors, got: {cli_result.stdout}" + + +def test_validate_metadata_file_with_errors( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test the CLI 'dimensions validate-metadata-file' command with an invalid TSV file. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + fixture_path = os.path.join( + os.path.dirname(__file__), "../..", "fixtures", "sample_metadata_incorrect_formatting_to_test_tsv_validator.tsv" + ) + + command = f"dimensions validate-metadata-file {fixture_path} --project {project_name}" + cli_result = runner.invoke(app, command) + + assert cli_result.exit_code == 1, f"Expected validation to fail but it passed: {cli_result.stdout}" + assert "VALIDATION SUMMARY" in cli_result.stdout, "Expected validation summary" + assert "ERRORS" in cli_result.stdout or "WARNINGS" in cli_result.stdout, "Expected errors or warnings" + + +def test_validate_metadata_file_nonexistent( + CONSTANTS, + logged_in_edit_user_with_existing_config, +): + """ + Test that validate-metadata-file handles nonexistent files gracefully. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + + command = f"dimensions validate-metadata-file nonexistent_file.tsv --project {project_name}" + cli_result = runner.invoke(app, command) + + assert cli_result.exit_code == 1, "Expected exit code 1 for nonexistent file" + assert "not found" in cli_result.stdout.lower(), "Expected error message about file not found" diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 3cbd8836..ba8ee987 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -708,10 +708,10 @@ def test_negative_numbers_in_single_value_column(self, sample_tsv_with_numeric_d sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S1" in sample_ids # -5.5 - assert "S2" in sample_ids # -10.2 - assert "S5" in sample_ids # -20 - assert "S9" in sample_ids # -15 + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S5" in sample_ids + assert "S9" in sample_ids def test_negative_numbers_discrete_values(self, sample_tsv_with_numeric_data): """Test that negative numbers can be used as discrete filter values.""" @@ -726,22 +726,18 @@ def test_negative_numbers_discrete_values(self, sample_tsv_with_numeric_data): def test_negative_numbers_greater_than_inequality(self, sample_tsv_with_numeric_data): """Test greater than with negative numbers.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) - # Temperature: -5.5, -10.2, 0, 15.5, -20, 10, 5, 20, -15, 25 - # Less than -5 should match: -5.5, -10.2, -20, -15 result = manager.run_query(filter_string="Temperature:<-5") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 4 - assert "S1" in sample_ids # -5.5 < -5 - assert "S2" in sample_ids # -10.2 < -5 - assert "S5" in sample_ids # -20 < -5 - assert "S9" in sample_ids # -15 < -5 + assert "S1" in sample_ids + assert "S2" in 
sample_ids + assert "S5" in sample_ids + assert "S9" in sample_ids def test_negative_numbers_in_semicolon_cells(self, sample_tsv_with_numeric_data): """Test that negative numbers in semicolon-separated cells work correctly.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) - # Longitude values: -2.78305556, -0.12765, 1.25, -3.5;-2.1, 0, 2.5, -1.5, 3.0, -2.0, 1.5 - # Discrete value -3.5 should only match S4 which has -3.5;-2.1 result = manager.run_query(filter_string="Longitude:-3.5") sample_ids = result.get_unique_values("Sample_ID") From 66ea161453fba55afa3c3fca95fcd73226471811 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 14:44:43 +0100 Subject: [PATCH 055/100] Refactor query logic to relax constraints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous constraints on mixed-type columns were convenient to develop with, but as the e2e tests in the previous commit revealed, there are edge cases that would fail when the constraints raise errors instead of warnings. This refactoring changes how mixed-type columns are handled: a column is classified as numeric only if all values — including each individual part within semicolon-separated cells — can be parsed as a number. If any value in a column is non-numeric, the entire column is treated as a string column. This is now communicated to the user as warnings rather than errors, both in the validator and in the queries. The query manager also now warns when the user attempts numeric filter operations (inequalities, ranges) on a string column, explaining why the column is not numeric and suggesting exact string matching instead. 
--- .../src/divbase_api/routes/queries.py | 8 +- .../src/divbase_api/services/queries.py | 206 +++++++++++++++--- .../cli_commands/dimensions_cli.py | 4 +- .../src/divbase_cli/cli_commands/query_cli.py | 6 +- .../services/sample_metadata_tsv_validator.py | 39 ++-- .../divbase-lib/src/divbase_lib/exceptions.py | 6 + 6 files changed, 216 insertions(+), 53 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/routes/queries.py b/packages/divbase-api/src/divbase_api/routes/queries.py index 7c9d9637..5a9d4bc3 100644 --- a/packages/divbase-api/src/divbase_api/routes/queries.py +++ b/packages/divbase-api/src/divbase_api/routes/queries.py @@ -32,6 +32,7 @@ from divbase_lib.exceptions import ( SidecarColumnNotFoundError, SidecarInvalidFilterError, + SidecarMetadataFormatError, SidecarSampleIDError, ) @@ -87,7 +88,12 @@ async def sample_metadata_query( # Route 1, create job and get back job id. # Route 2, get job result by id (with status etc), CLI can poll until done. - except (SidecarInvalidFilterError, SidecarColumnNotFoundError, SidecarSampleIDError) as e: + except ( + SidecarInvalidFilterError, + SidecarColumnNotFoundError, + SidecarSampleIDError, + SidecarMetadataFormatError, + ) as e: # Catch validation errors (mixed types, missing columns, invalid Sample_IDs) and return 400 error_message = str(e) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=error_message) from None diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index ab90db5e..2a817168 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -25,6 +25,7 @@ BcftoolsPipeUnsupportedCommandError, SidecarColumnNotFoundError, SidecarInvalidFilterError, + SidecarMetadataFormatError, SidecarNoDataLoadedError, SidecarSampleIDError, ) @@ -695,19 +696,25 @@ def __init__(self, file: Path): def load_file(self) -> "SidecarQueryManager": """ 
Method that loads the TSV file into a pandas DataFrame. Assumes that the first row is a header row, and that the file is tab-separated. - Also removes any leading '#' characters from the column names + Also removes any leading '#' characters from the column names. - Strip empty filenames if there e.g. are typos with trailing commas + Validates the same errors as the client-side MetadataTSVValidator: + - Header: first column must be #Sample_ID, no duplicate or empty column names + - Sample_ID: no empty values, no duplicates, no semicolons + - Data: no commas in any cell values """ - # TODO: pandas will likely read all plain files to df, so perhaps there should be a check that the file is a TSV file? or at least has properly formatted tabular columns and rows? try: logger.info(f"Loading sidecar metadata file: {self.file}") + + # Read the raw header before loading with pandas to catch issues that pandas would silently fix, such as duplicate column names or empty column names. + self._read_and_validate_raw_header() + self.df = pd.read_csv( self.file, sep="\t", skipinitialspace=True ) # Pandas has Type Inference and will detect numeric and string columns automatically self.df.columns = self.df.columns.str.lstrip("#") - # Strip leading and trailing whitespace from all columns + # Strip leading and trailing whitespace from all string columns for col in self.df.columns: self.df[col] = self.df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) @@ -722,7 +729,21 @@ def load_file(self) -> "SidecarQueryManager": duplicates = self.df[self.df["Sample_ID"].duplicated()]["Sample_ID"].tolist() raise SidecarSampleIDError(f"Duplicate Sample_IDs found: {duplicates}. 
Each Sample_ID must be unique.") - except (SidecarSampleIDError, SidecarColumnNotFoundError, SidecarInvalidFilterError): + semicolon_samples = self.df[self.df["Sample_ID"].str.contains(";", na=False)]["Sample_ID"].tolist() + if semicolon_samples: + raise SidecarSampleIDError( + f"Sample_ID column contains semicolons in values: {semicolon_samples}. " + "Sample_ID must contain only one value per row (semicolons are not allowed)." + ) + + self._validate_no_commas_in_data() + + except ( + SidecarSampleIDError, + SidecarColumnNotFoundError, + SidecarInvalidFilterError, + SidecarMetadataFormatError, + ): # Let validation errors propagate directly to user with specific error messages raise except Exception as e: @@ -730,6 +751,53 @@ def load_file(self) -> "SidecarQueryManager": raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self + def _read_and_validate_raw_header(self) -> None: + """ + Read the first line of the TSV file and validate column names. This is intended to be run + before pandas loads the file, to catch issues that pandas would silently fix. Pandas for instance + rename duplicate columns (e.g., "Area", "Area.1") and empty column names. + """ + with open(self.file, "r", encoding="utf-8") as f: + first_line = f.readline().rstrip("\n\r") + + raw_columns = first_line.split("\t") + cleaned_columns = [col.lstrip("#") for col in raw_columns] + + empty_columns = [i + 1 for i, col in enumerate(cleaned_columns) if not col.strip()] + if empty_columns: + raise SidecarMetadataFormatError( + f"Empty column name(s) found at position(s): {empty_columns}. All columns must have a non-empty name." 
+ ) + + seen = {} + duplicate_columns = [] + for col in cleaned_columns: + col_stripped = col.strip() + if col_stripped in seen: + if col_stripped not in duplicate_columns: + duplicate_columns.append(col_stripped) + else: + seen[col_stripped] = True + + if duplicate_columns: + raise SidecarMetadataFormatError( + f"Duplicate column names found: {duplicate_columns}. Each column name must be unique in the metadata file." + ) + + def _validate_no_commas_in_data(self) -> None: + """ + Validate that no cells in the entire DataFrame contain commas. + Matches the client-side validator's comma check. + """ + for col in self.df.columns: + for row_index, cell_value in enumerate(self.df[col].dropna()): + cell_str = str(cell_value).strip() + if cell_str and "," in cell_str: + raise SidecarMetadataFormatError( + f"Column '{col}' contains commas in value '{cell_str}' at row {row_index + 1}. " + f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." + ) + def get_unique_values(self, column: str) -> list: """ Method to fetch unique values from a specific column in the query result. Intended to be invoked on a SidecarQueryManager @@ -821,6 +889,30 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) is_semicolon_numeric = self._is_semicolon_separated_numeric_column(key) if not is_numeric else False + # Check if type consistency and return warnings to users if applicable. + # If the column is treated as string, check for potential user mistakes (e.g. using numeric filter syntax on a string column that contains numeric-looking values): + # 1. Warn if the column has mixed types (some values look numeric) and that the column will be treat as string type. + # 2. Warn if the filter uses numeric syntax on this string column. Do not raise error. 
+ if not is_numeric and not is_semicolon_numeric: + is_mixed = self._is_mixed_type_column(key) + + problematic_filter_values = self._detect_numeric_filter_syntax_on_string_column( + key, filter_string_values + ) + + if is_mixed and problematic_filter_values: + warning_msg = f"Column '{key}' has mixed types (including semicolon-separated values) and is treated as string by DivBase. Your filter {problematic_filter_values} uses numeric operations which won't work. Use exact string matching instead (e.g., '{key}:value1,value2')." + logger.warning(warning_msg) + self.warnings.append(warning_msg) + elif problematic_filter_values: + warning_msg = f"Column '{key}' is a string column. Your filter {problematic_filter_values} uses numeric operations which won't work on strings. Use exact string matching instead (e.g., '{key}:value1,value2')." + logger.warning(warning_msg) + self.warnings.append(warning_msg) + elif is_mixed: + warning_msg = f"Column '{key}' has mixed types (including semicolon-separated values) and is treated as string by DivBase. Numeric operations are not available for this column." + logger.warning(warning_msg) + self.warnings.append(warning_msg) + # Handle numeric filtering: inequalities, ranges, and discrete values (all with OR logic) # e.g., "Weight:>25,<30,50" or "Weight:20-40,50,>100" # Supports filtering on semicolon-separated values in cells in the TSV: e.g. "25;30;35" @@ -914,7 +1006,9 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": ) if not condition.any(): - warning_msg = f"None of the values {filter_string_values_list} were found in column '{key}'" + warning_msg = ( + f"No results for the filter {filter_string_values_list} were found in column '{key}'." 
+ ) logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(condition) @@ -968,13 +1062,15 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: This helper method checks ALL non-null values in the column to determine if they can be parsed as numeric after splitting by semicolon. Note that it only detects if a column value is semicolon-separated numeric, it does not - convert the column to numeric type. It also validates that the colum value is not of mixed string-numerical type, which - is invalid input for the query system. The actual parsing and handling of the semicolon-separated numeric values is - done in the lamda functions in the run_query() method. + convert the column to numeric type. The actual parsing and handling of the semicolon-separated numeric values is + done in the helper functions in the run_query() method. Returns True if the column contains ONLY numeric values (with or without semicolons). - Returns False if the column contains ONLY non-numeric values (regular string column), or if the column is empty. - Raises SidecarInvalidFilterError if mixed types detected (e.g., "1;2" and "abc;def" in same column) or if commas are found. + Returns False if the column contains ANY non-numeric values (treated as a regular string column). + Returns False if the column is empty. + + This method does not raise errors for mixed types. Columns with a mix of numeric-looking and + non-numeric values (e.g., "8", "1a", "5a") are simply treated as string columns. 
""" if key not in self.df.columns: return False @@ -985,10 +1081,7 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: self._validate_no_commas_in_column(key) - has_numeric_type = False - has_non_numeric_type = False - - for row_index, cell_value in enumerate(non_null_values): + for cell_value in non_null_values: cell_str = str(cell_value).strip() if not cell_str: continue @@ -1001,26 +1094,75 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: try: float(part) - has_numeric_type = True except ValueError: - has_non_numeric_type = True - # If not numeric and contains a hyphen with digits, it's likely a range notation like "1-2". Negative numbers should already have been classified as numeric with the float() check. - if "-" in part and any(c.isdigit() for c in part): - raise SidecarInvalidFilterError( - f"Column '{key}' contains value '{part}' with a hyphen at row {row_index}. " - f"This appears to be range notation (e.g., '1-2'), which is not allowed in data values. " - f"If this is meant to be a numeric column, use semicolons to separate values (e.g., '1;2'). " - f"If this is meant to be a string column, all values should be non-numeric strings." - ) from None + # Any non-numeric value means this column is not a numeric column. + # It will be treated as a string column instead. + return False - if has_numeric_type and has_non_numeric_type: - raise SidecarInvalidFilterError( - f"Column '{key}' in the metadata file contains mixed types. Value '{cell_str}' at row {row_index} " - f"has both numeric and non-numeric parts. All values in a column must be consistently " - f"numeric or string for filtering to work correctly." - ) + return True + + def _is_mixed_type_column(self, key: str) -> bool: + """ + Helper method to detect if a non-numeric column contains a mix of numeric-looking + and non-numeric values (e.g., Population_code with "8", "1a", "5a"). 
+ + This is called only for columns where pandas infers object dtype AND + _is_semicolon_separated_numeric_column returned False. It determines whether + the column has SOME numeric-looking values (mixed) vs. being purely string. + + This is used to provide a targeted warning to the user at query time: + the validator warns about mixed types at validation time, but users may + skip validation or the column may be intentionally mixed. + """ + if key not in self.df.columns: + return False + + non_null_values = self.df[key].dropna() + if len(non_null_values) == 0: + return False + + has_numeric = False + has_non_numeric = False + + for cell_value in non_null_values: + cell_str = str(cell_value).strip() + if not cell_str: + continue + parts = cell_str.split(";") + for part in parts: + part = part.strip() + if not part: + continue + try: + float(part) + has_numeric = True + except ValueError: + has_non_numeric = True + + if has_numeric and has_non_numeric: + return True + + return False - return has_numeric_type and not has_non_numeric_type + def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string_values: str) -> list[str]: + """ + Helper method to detect when a user's filter string contains numeric inequality syntax + (like ">25", ">=10", "<30", "<=5") on a column that is treated as string. Returns a list of the + problematic filter values for use in a warning message. + + Doesn't flag range-like filter values like "1-2" since these are common string values + (e.g., hyphenated IDs or names) and will correctly match via string matching. 
+ """ + problematic_filter_values = [] + values = filter_string_values.split(",") + for filter_value in values: + filter_value = filter_value.strip().lstrip("!") # strip negation prefix for checking + if not filter_value: + continue + # Check for inequality operators + if re.match(r"^(>=|<=|>|<)-?\d+\.?\d*$", filter_value): + problematic_filter_values.append(filter_value) + return problematic_filter_values def _split_cell_values(self, cell_value: Any) -> list[str]: """ diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index cc9bfe88..610998b6 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -295,9 +295,7 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( if string_cols: print(f" String columns ({len(string_cols)}): {', '.join(string_cols)}") if mixed_cols: - print( - f" [red]Mixed-type columns ({len(mixed_cols)}): {', '.join(mixed_cols)} - NOT ALLOWED, see errors below[/red]" - ) + print(f" Mixed-type columns treated as string ({len(mixed_cols)}): {', '.join(mixed_cols)}") if stats.get("has_multi_values", False): print(" Multi-value cells: Yes (semicolon-separated values detected)") diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py index 30c54b24..a46efea1 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py @@ -17,7 +17,6 @@ """ import logging -import textwrap import typer from rich import print @@ -99,10 +98,9 @@ def sample_metadata_query( results = SampleMetadataQueryTaskResult(**response.json()) if results.warnings: - print("[yellow]⚠ Warnings:[/yellow]") + print("[yellow]Warnings:[/yellow]") for warning in results.warnings: - wrapped = 
textwrap.fill(warning, width=100, initial_indent=" • ", subsequent_indent=" ") - print(wrapped) + print(f" • {warning}") print() if show_sample_results: diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index f3ddea70..3453db6d 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -152,8 +152,11 @@ def _infer_column_type( self, row_num: int, col_idx: int, col_name: str, cell: str, column_types: dict[int, set[str]] ) -> None: """ - Infer the type of values in a column and validate type consistency. + Infer the type of values in a column and track type information. Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column + + Columns with a mix of numeric-looking and non-numeric values (e.g., "8", "1a", "5a") + are treated as string columns. Mixed types are reported as warnings, not errors. """ values = [v.strip() for v in cell.split(";") if v.strip()] @@ -171,30 +174,38 @@ def _infer_column_type( cell_has_string = True column_types[col_idx].add("string") - # Check for hyphens in non-numeric values that might indicate range notation. Negative numbers should already have been classified as numeric with the float() check. + # Check for hyphens in non-numeric values that might indicate range notation. + # Negative numbers should already have been classified as numeric with the float() check. + # This is a warning to help users who may have used range notation (e.g., "1-2") instead of + # semicolons (e.g., "1;2") in their data values. if ( "-" in value and any(c.isdigit() for c in value) and ("numeric" in column_types[col_idx] or all(t == "numeric" for t in column_types[col_idx] if t)) ): - self.errors.append( + self.warnings.append( f"Row {row_num}, Column '{col_name}': Value '{value}' contains a hyphen. 
" f"This appears to be range notation (e.g., '1-2'), which is not allowed in data values. " - f"If this is meant to be a numeric column, use semicolons to separate values (e.g., '1;2'). " - f"If this is meant to be a string column, all values should be non-numeric strings." + f"If this is meant to be a numeric multi-value column, use semicolons to separate values (e.g., '1;2'). " + f"This column will be treated as a string column." ) - # Check for mixed types within the same cell (e.g., "1;abc") + # Check for mixed types within the same cell (e.g., "1;abc") and warn the user if applicable if cell_has_numeric and cell_has_string: - self.errors.append( - f"Row {row_num}, Column '{col_name}': Cell '{cell}' contains mixed types. " - f"All cell values in the same column must be consistently numeric or string." + self.warnings.append( + f"Row {row_num}, Column '{col_name}': Cell '{cell}' contains mixed types " + f"(both numeric and non-numeric values in semicolon-separated cell). " + f"This column will be treated as a string column." ) def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str]]) -> None: """ - Check for mixed types in columns and raise errors. + Check for mixed types in columns and report as informational warnings. Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column + + Columns with mixed types are treated as string columns by the DivBase query engine. + This happen for values such as e.g., "8", "1a", "5a" that happen to look numeric but + are semantically a strings (e.g. names, IDs).. """ mixed_columns = [] for col_idx, types in column_types.items(): @@ -203,9 +214,11 @@ def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str] mixed_columns.append(col_name) if mixed_columns: - self.errors.append( - f"The following columns contain mixed types (both numeric and string values): {mixed_columns}. 
" - "All values in a column must be consistently numeric or string for DivBase sidecar metadata queries to work correctly." + self.warnings.append( + f"The following columns contain mixed types (both numeric-looking and string values): {mixed_columns}. " + "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " + "These columns will be treated as string columns by DivBase. Numeric query operations " + "(ranges, inequalities) will not be applicable to these columns." ) def _validate_sample_names(self, tsv_samples: set[str]) -> None: diff --git a/packages/divbase-lib/src/divbase_lib/exceptions.py b/packages/divbase-lib/src/divbase_lib/exceptions.py index 5b72981a..bf330d75 100644 --- a/packages/divbase-lib/src/divbase_lib/exceptions.py +++ b/packages/divbase-lib/src/divbase_lib/exceptions.py @@ -107,6 +107,12 @@ class SidecarSampleIDError(Exception): pass +class SidecarMetadataFormatError(Exception): + """Raised when the sidecar metadata TSV file has formatting issues (duplicate columns, empty columns, commas in values, etc.).""" + + pass + + class NoVCFFilesFoundError(Exception): """Raised when no VCF files are found in the project bucket.""" From b86d73ff4e8c9ecf961a8dc5931883b2a1c5addc Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 15:27:01 +0100 Subject: [PATCH 056/100] Update e2e and validator unit tests after refactor --- .../queries/test_SidecarQueryManager.py | 2 +- .../test_sample_metadata_tsv_validator.py | 103 ++++++++++++++---- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/tests/e2e_integration/queries/test_SidecarQueryManager.py b/tests/e2e_integration/queries/test_SidecarQueryManager.py index 86b4c289..2d7be203 100644 --- a/tests/e2e_integration/queries/test_SidecarQueryManager.py +++ b/tests/e2e_integration/queries/test_SidecarQueryManager.py @@ -158,7 +158,7 @@ def test_tsv_query_value_not_found(sample_tsv_file, caplog, create_sidecar_manag query_result = 
manager.query_result assert len(query_result) == 0, "Should return empty DataFrame when no values match" - assert "None of the values ['NonExistentPop'] were found in column 'Population'" in caplog.text + assert "No results for the filter ['NonExistentPop'] were found in column 'Population'" in caplog.text @pytest.mark.unit diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 50a85d66..dbbaceba 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -202,39 +202,46 @@ def test_whitespace_warning(self, format_errors_tsv, project_samples): class TestTypeValidation: - """Test validation of column types.""" + """Test validation of column types. - def test_mixed_types_in_column(self, type_errors_tsv, project_samples): - """Columns with mixed numeric and string types should be detected.""" + Mixed types (columns with both numeric-looking and non-numeric values) are treated + as string columns and reported as warnings, not errors. 
+ """ + + def test_mixed_types_in_column_is_warning(self, type_errors_tsv, project_samples): + """Columns with mixed numeric and string types should produce a warning (not error) and be classified as mixed_type.""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) - assert any("mixed types" in e.lower() and "Population" in e for e in errors) + assert any("mixed" in w.lower() and "Population" in w for w in warnings) + assert not any("mixed types" in e.lower() and "Population" in e for e in errors) - def test_mixed_types_in_cell(self, type_errors_tsv, project_samples): - """Cells with mixed types (e.g., '1;three;5') should be detected.""" + def test_mixed_types_in_cell_is_warning(self, type_errors_tsv, project_samples): + """Cells with mixed types (e.g., '1;three;5') should produce a warning (not error).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) - assert any("Cell '1;three;5' contains mixed types" in e for e in errors) + assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) + assert not any("1;three;5" in e and "mixed types" in e.lower() for e in errors) - def test_hyphen_in_numeric_column(self, type_errors_tsv, project_samples): - """Hyphens in numeric columns should be detected.""" + def test_hyphen_in_numeric_looking_column_is_warning(self, type_errors_tsv, project_samples): + """Hyphens in values that look like range notation should produce a warning (not error).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) - assert any("hyphen" in e.lower() and "3-5" in e for e in errors) + assert any("hyphen" in w.lower() and "3-5" in w for w in warnings) + assert not any("hyphen" in e.lower() and "3-5" in e for e in errors) - def test_cell_and_column_level_mixed_types(self, type_errors_tsv, project_samples): - """When a column has both cell-level and column-level mixed types, both errors should be reported.""" + def 
test_cell_and_column_level_mixed_types_are_warnings(self, type_errors_tsv, project_samples): + """When a column has both cell-level and column-level mixed types, both should produce warnings (not errors).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) - assert any("Cell '1;three;5' contains mixed types" in e for e in errors) - assert any("following columns contain mixed types" in e and "Population" in e for e in errors) + assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) + assert any("mixed" in w.lower() and "Population" in w for w in warnings) assert "Population" in stats["mixed_type_columns"] assert "Test" in stats["mixed_type_columns"] - def test_stats_show_mixed_type_columns_with_cell_errors(self, type_errors_tsv, project_samples): + def test_stats_show_mixed_type_columns(self, type_errors_tsv, project_samples): """ - Stats should show columns as mixed-type even when they have cell-level errors. - The type_errors_tsv fixture used here has two columns with mixed types. + Stats should show columns as mixed-type for informational purposes. + The type_errors_tsv fixture used here has columns with mixed types. 
""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) @@ -252,7 +259,7 @@ def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, p assert "Values" not in stats["string_columns"] assert "Scores" not in stats["mixed_type_columns"] assert "Values" not in stats["mixed_type_columns"] - assert not any("mixed types" in e.lower() and ("Scores" in e or "Values" in e) for e in errors) + assert not any("mixed" in w.lower() and ("Scores" in w or "Values" in w) for w in warnings) class TestDimensionMatching: @@ -260,7 +267,7 @@ class TestDimensionMatching: def test_samples_not_in_project(self, valid_tsv): """Samples in TSV but not in project should be errors.""" - project_samples = {"S1", "S2"} # Only S1 and S2 exist in project dimensions + project_samples = {"S1", "S2"} stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert any( @@ -270,7 +277,7 @@ def test_samples_not_in_project(self, valid_tsv): def test_samples_not_in_tsv(self, valid_tsv): """Samples in project but not in TSV should be warnings.""" - project_samples = {"S1", "S2", "S3", "S10", "S20"} # S10 and S20 not in TSV + project_samples = {"S1", "S2", "S3", "S10", "S20"} stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) assert any( @@ -351,7 +358,57 @@ def test_negative_numbers_with_semicolons(self, numeric_multi_values_tsv, negati assert "Longitude" not in stats["mixed_type_columns"] assert not any("Longitude" in e and "mixed" in e.lower() for e in errors) - def test_range_notation_still_rejected(self, type_errors_tsv): - """Test that range notation like '1-2' is still rejected in numeric columns.""" + def test_range_notation_produces_warning(self, type_errors_tsv): + """Test that range notation like '1-2' in a mixed-type column produces a warning (column treated as string).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, {"S1", "S2", "S3", "S4"}) - assert 
any("mixed" in e.lower() and "Range" in e for e in errors) + assert any("mixed" in w.lower() and "Range" in w for w in warnings) + + +class TestSemicolonColumnTypeClassification: + """Test that the validator correctly classifies columns when semicolon-separated + cells contain a mix of numeric and non-numeric parts.""" + + @pytest.fixture + def semicolon_mixed_tsv(self, tmp_path): + """TSV where a column has '1;1-2' - a cell with one numeric and one non-numeric part.""" + content = "#Sample_ID\tCode\tPureNumSemicolon\n" + content += "S1\t1;1-2\t10;20;30\n" + content += "S2\t3\t40\n" + content += "S3\t5\t50;60\n" + tsv_file = tmp_path / "semicolon_mixed.tsv" + tsv_file.write_text(content) + return tsv_file + + def test_semicolon_cell_with_non_numeric_part_is_mixed(self, semicolon_mixed_tsv): + """A column with cell '1;1-2' should be classified as mixed-type because '1-2' is not a number.""" + stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) + + assert "Code" in stats["mixed_type_columns"] + assert "Code" not in stats["numeric_columns"] + assert "Code" not in stats["string_columns"] + + def test_semicolon_cell_mixed_produces_cell_level_warning(self, semicolon_mixed_tsv): + """A cell '1;1-2' should produce a cell-level mixed-type warning.""" + stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) + + assert any("1;1-2" in w and "mixed types" in w.lower() for w in warnings) + + def test_semicolon_cell_mixed_produces_column_level_warning(self, semicolon_mixed_tsv): + """The column-level mixed-type warning should mention the semicolon classification rule.""" + stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) + + assert any("semicolon-separated" in w and "Code" in w for w in warnings) + + def test_semicolon_cell_mixed_is_not_error(self, semicolon_mixed_tsv): + """Mixed types from semicolon cells should NOT produce errors.""" + stats, 
errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) + + assert not any("mixed" in e.lower() for e in errors) + + def test_purely_numeric_semicolon_column_stays_numeric(self, semicolon_mixed_tsv): + """A column with only numeric values in semicolons (e.g., '10;20;30') should be numeric.""" + stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) + + assert "PureNumSemicolon" in stats["numeric_columns"] + assert "PureNumSemicolon" not in stats["mixed_type_columns"] + assert "PureNumSemicolon" not in stats["string_columns"] From acfffd8337671eb9c5648f7f8e167bf88b4fc6b9 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 15:40:37 +0100 Subject: [PATCH 057/100] Update unit tests for SidecarQueryManager --- .../test_sample_metadata_queries.py | 371 +++++++++++++++--- 1 file changed, 316 insertions(+), 55 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index ba8ee987..b6c5ff0b 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -2,10 +2,15 @@ Unit tests for SidecarQueryManager filtering """ +import pandas as pd import pytest from divbase_api.services.queries import SidecarQueryManager -from divbase_lib.exceptions import SidecarColumnNotFoundError, SidecarInvalidFilterError, SidecarSampleIDError +from divbase_lib.exceptions import ( + SidecarColumnNotFoundError, + SidecarMetadataFormatError, + SidecarSampleIDError, +) @pytest.fixture @@ -39,18 +44,17 @@ def sample_tsv_with_edge_cases(tmp_path): """ Create a temporary TSV file to test edge cases: 1. "string;string;string" - OK (pure strings) - 2. "1;two;5" - FAIL (mixed numeric and non-numeric should raise exception) + 2. "1;two;5" - Mixed numeric and non-numeric: treated as string column (not an error) 3. 
String values containing numbers like "1string" - OK (inferred as string) - 4. Column with commas should raise SidecarInvalidFilterError + 4. Unicode strings with diacritics - OK - Commas are NOT allowed in divbase TSV format. Note that S2 and S3 have leading/trailing whitespace in the Sample_ID and the code should handle that by stripping whitespace. """ - tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tWithCommas\tStringWithHyphen\tNumericalWithHyphen -S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth,South\tNorth-East\t1-2 -S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tWest,East\tSouth-West\t2-3 - S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth,\tNorth-North-West\t3-4 -S4\t1string\tstring4\tString5\t400\tHumlebæk\t,South\tEast-South-East\t4-5 + tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tStringWithHyphen\tNumericalWithHyphen +S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth-East\t1-2 +S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tSouth-West\t2-3 + S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth-North-West\t3-4 +S4\t1string\tstring4\tString5\t400\tHumlebæk\tEast-South-East\t4-5 """ tsv_file = tmp_path / "test_metadata_edge_cases.tsv" tsv_file.write_text(tsv_content) @@ -107,6 +111,43 @@ def sample_tsv_with_duplicate_sample_ids(tmp_path): return tsv_file +@pytest.fixture +def sample_tsv_with_mixed_type_column(tmp_path): + """ + Create a TSV with a column that has mixed numeric-looking and non-numeric values, + similar to Population_code with values like "8", "1a", "5a". + For testing query warnings for mixed-type columns. 
+ """ + tsv_content = """#Sample_ID\tPopulation_code\tArea\tWeight +S1\t8\tNorth\t12.5 +S2\t1a\tEast\t18.8 +S3\t5a\tWest\t15.0 +S4\t1b\tSouth\t20.0 +S5\t4\tNorth\t22.1 +""" + tsv_file = tmp_path / "test_mixed_type.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_semicolon_mixed_type_column(tmp_path): + """ + Create a TSV where a column has semicolon-separated values with a non-numeric part, + e.g. '1;1-2'. This makes the column a string column because '1-2' is not a number. + Tests that semicolon-split values are individually checked for numeric parsing. + """ + tsv_content = """#Sample_ID\tCode\tPureNumericSemicolon\tWeight +S1\t1;1-2\t10;20;30\t12.5 +S2\t3\t40\t18.8 +S3\t5\t50;60\t15.0 +S4\t7\t70;80;90\t20.0 +""" + tsv_file = tmp_path / "test_semicolon_mixed.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + class TestNumericalFilteringInequalities: """Test inequality operators on numerical columns.""" @@ -555,30 +596,12 @@ def test_not_operator_with_string_positive_and_negative(self, sample_tsv_with_nu class TestEdgeCases: """Edge case tests for SidecarQueryManager filtering.""" - def test_column_with_commas_raises(self, sample_tsv_with_edge_cases): - """Test that a column containing commas raises SidecarInvalidFilterError.""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError) as excinfo: - manager.run_query(filter_string="WithCommas:foo") - assert "contains commas" in str(excinfo.value) - - def test_mixed_types_should_fail(self, sample_tsv_with_edge_cases): - """Test that a column with mixed numeric and non-numeric values raises an error.""" + def test_mixed_types_treated_as_string(self, sample_tsv_with_edge_cases): + """Test that a column with mixed numeric and non-numeric values is treated as a string column. + The MixedTypes column has values like '1;two;5', '2;three;6', '3', 'string4'. 
+ When treated as string, filtering for '1' should match cells containing '1' as a semicolon-separated value.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="MixedTypes:1") - - def test_strings_with_commas(self, sample_tsv_with_edge_cases): - """Test that a column with strings containing commas is correctly handled.""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="StringsWithCommas:Region1") - sample_ids = result.get_unique_values("Sample_ID") - assert "S1" in sample_ids - - def test_numbers_with_comma(self, sample_tsv_with_edge_cases): - """Test that a column with numeric values containing commas is treated as string type (since comma is not a numeric character).""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="NumbersWithComma:1") + result = manager.run_query(filter_string="MixedTypes:1") sample_ids = result.get_unique_values("Sample_ID") assert "S1" in sample_ids @@ -604,17 +627,21 @@ def test_multi_column_single_string_and_single_number(self, sample_tsv_with_edge assert len(sample_ids) == 1 assert "S4" in sample_ids - def test_multi_column_single_string_and_mixed_types_should_fail(self, sample_tsv_with_edge_cases): - """Test that filtering on SingleString (valid) and MixedTypes (invalid) will fail due to mixed types.""" + def test_multi_column_single_string_and_mixed_types_treated_as_string(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleString and MixedTypes (treated as string) works with string matching.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="SingleString:String5;MixedTypes:string4") + result = manager.run_query(filter_string="SingleString:String5;MixedTypes:string4") + sample_ids = 
result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids - def test_multi_column_single_number_and_mixed_types_should_fail(self, sample_tsv_with_edge_cases): - """Test that filtering on SingleNumber (valid) and MixedTypes (invalid) will fail due to mixed types.""" + def test_multi_column_single_number_and_mixed_types_treated_as_string(self, sample_tsv_with_edge_cases): + """Test that filtering on SingleNumber (numeric) and MixedTypes (treated as string) works correctly.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="SingleNumber:400;MixedTypes:string4") + result = manager.run_query(filter_string="SingleNumber:400;MixedTypes:string4") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S4" in sample_ids def test_multi_column_single_string_and_pure_strings(self, sample_tsv_with_edge_cases): """Test that filtering on SingleString and PureStrings (both valid string columns) will pass.""" @@ -624,19 +651,13 @@ def test_multi_column_single_string_and_pure_strings(self, sample_tsv_with_edge_ assert len(sample_ids) == 1 assert "S4" in sample_ids - def test_multi_column_single_number_and_numbers_with_comma(self, sample_tsv_with_edge_cases): - """Test that filtering on SingleNumber (numeric) and NumbersWithComma (treated as string due to comma) will pass.""" + def test_multi_column_with_unicode(self, sample_tsv_with_edge_cases): + """Test that multi-column filtering works with unicode strings.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - result = manager.run_query(filter_string="SingleNumber:400;NumbersWithComma:string3") + result = manager.run_query(filter_string="UnicodeStrings:København;SingleString:East") sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 1 - assert "S4" in sample_ids - - def test_multi_column_with_unicode(self, 
sample_tsv_with_edge_cases): - """Test that multi-column filtering works with unicode strings, but raises error if commas are present.""" - manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError): - manager.run_query(filter_string="UnicodeStrings:København;WithCommas:North") + assert "S3" in sample_ids def test_hyphens_allowed_in_string_values(self, sample_tsv_with_edge_cases): """Test that hyphens are allowed in string values and can be filtered correctly.""" @@ -646,12 +667,15 @@ def test_hyphens_allowed_in_string_values(self, sample_tsv_with_edge_cases): assert len(sample_ids) == 1 assert "S2" in sample_ids - def test_hyphens_in_numerical_column_raises(self, sample_tsv_with_edge_cases): - """Test that hyphens are allowed in string columns but not in numerical columns.""" + def test_hyphens_in_column_treated_as_string(self, sample_tsv_with_edge_cases): + """Test that a column with hyphenated values like '1-2', '2-3' is treated as a string column. + The values are not parseable as floats, so the column is inferred as string. Querying for + the exact string value '2-3' should return the matching row.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) - with pytest.raises(SidecarInvalidFilterError) as excinfo: - manager.run_query(filter_string="NumericalWithHyphen:2") - assert "Column 'NumericalWithHyphen' contains value '1-2' with a hyphen at row 0." in str(excinfo.value) + result = manager.run_query(filter_string="NumericalWithHyphen:2-3") + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 1 + assert "S2" in sample_ids def test_not_operator_edge_case_with_unicode(self, sample_tsv_with_edge_cases): """Test NOT operator (!) 
with unicode string values.""" @@ -743,3 +767,240 @@ def test_negative_numbers_in_semicolon_cells(self, sample_tsv_with_numeric_data) sample_ids = result.get_unique_values("Sample_ID") assert len(sample_ids) == 1 assert "S4" in sample_ids + + +class TestQueryWarnings: + """Test that the query engine produces helpful warnings when users + filter on mixed-type or string columns with numeric syntax.""" + + def test_mixed_type_column_warns_on_inequality_filter(self, sample_tsv_with_mixed_type_column): + """Filtering with inequality syntax on a mixed-type column should produce a warning + that mentions both mixed types and the numeric operations.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Population_code:>5") + + assert any( + "mixed types" in w.lower() and "numeric operations" in w.lower() and "Population_code" in w + for w in result.warnings + ) + + def test_mixed_type_column_range_syntax_does_string_match(self, sample_tsv_with_mixed_type_column): + """Range-like patterns (e.g., '1-5') on a mixed-type column are treated as literal string + matches, not numeric ranges. 
A general mixed-type warning is expected, but not a + 'numeric operations won't work' warning since hyphenated values are common in strings.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Population_code:1-5") + + assert any("mixed types" in w.lower() and "Population_code" in w for w in result.warnings) + assert not any("will not work" in w for w in result.warnings) + + def test_mixed_type_column_no_warning_on_string_filter(self, sample_tsv_with_mixed_type_column): + """Filtering with plain string values on a mixed-type column should produce a + general mixed-type info warning but not a numeric-syntax warning.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Population_code:8,1a") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert any("mixed types" in w.lower() and "Population_code" in w for w in result.warnings) + assert not any("will not work" in w for w in result.warnings) + + def test_pure_string_column_warns_on_numeric_inequality_filter(self, sample_tsv_with_mixed_type_column): + """Filtering with numeric inequality syntax (>5) on a pure string column (Area) should result in a warning.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Area:>5") + + assert any( + "string column" in w.lower() and "Area" in w and "numeric operations" in w.lower() for w in result.warnings + ) + assert not any("mixed types" in w.lower() and "Area" in w for w in result.warnings) + + def test_pure_string_column_no_warning_on_normal_filter(self, sample_tsv_with_mixed_type_column): + """Filtering with plain string values on a pure string column should produce no warnings.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Area:North,East") + + 
sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert len(result.warnings) == 0 + + def test_numeric_column_no_false_warning(self, sample_tsv_with_mixed_type_column): + """Filtering with numeric syntax on a numeric column should NOT produce a warning.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Weight:>15") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S2" in sample_ids + assert "S4" in sample_ids + assert "S5" in sample_ids + assert not any("string column" in w.lower() for w in result.warnings) + assert not any("numeric operations" in w.lower() for w in result.warnings) + + def test_warning_mentions_semicolon_rule(self, sample_tsv_with_mixed_type_column): + """Query warnings should explain the semicolon classification rule.""" + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string="Population_code:>5") + + assert any("semicolon-separated" in w for w in result.warnings) + + +class TestSemicolonColumnTypeClassification: + """Test that column type classification correctly handles semicolon-separated values. + A column is numeric only if all parts of all semicolon-separated cells are valid numbers. 
+ If any part is non-numeric (e.g., '1-2' in '1;1-2'), the entire column is string.""" + + def test_semicolon_cell_with_non_numeric_part_makes_column_string( + self, sample_tsv_with_semicolon_mixed_type_column + ): + """A column with a cell '1;1-2' should be treated as string because '1-2' is not a number.""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + + assert not pd.api.types.is_numeric_dtype(manager.df["Code"]) + assert not manager._is_semicolon_separated_numeric_column("Code") + assert manager._is_mixed_type_column("Code") + + def test_semicolon_cell_with_non_numeric_part_warns_on_inequality( + self, sample_tsv_with_semicolon_mixed_type_column + ): + """Inequality filter on a column broken by '1;1-2' should produce a warning.""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + result = manager.run_query(filter_string="Code:>2") + + assert any("mixed types" in w.lower() and "Code" in w for w in result.warnings) + assert any("numeric operations" in w.lower() for w in result.warnings) + + def test_semicolon_cell_with_non_numeric_part_string_matching_works( + self, sample_tsv_with_semicolon_mixed_type_column + ): + """String matching should still work on the mixed column. 
+ Filtering for '1-2' should match cell value '1;1-2'.""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + result = manager.run_query(filter_string="Code:1-2") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert len(sample_ids) == 1 + + def test_semicolon_cell_with_non_numeric_part_single_numeric_match( + self, sample_tsv_with_semicolon_mixed_type_column + ): + """String matching for '3' on the mixed column should return S2 (exact string match).""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + result = manager.run_query(filter_string="Code:3") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S2" in sample_ids + assert len(sample_ids) == 1 + + def test_purely_numeric_semicolon_column_supports_numeric_ops(self, sample_tsv_with_semicolon_mixed_type_column): + """A column with only numeric semicolon values (e.g., '10;20;30') should support numeric operations.""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + + assert manager._is_semicolon_separated_numeric_column("PureNumericSemicolon") + + result = manager.run_query(filter_string="PureNumericSemicolon:>55") + sample_ids = result.get_unique_values("Sample_ID") + assert "S3" in sample_ids + assert "S4" in sample_ids + assert "S1" not in sample_ids + assert "S2" not in sample_ids + assert not any("string column" in w.lower() for w in result.warnings) + + def test_purely_numeric_semicolon_column_range_filter(self, sample_tsv_with_semicolon_mixed_type_column): + """A purely numeric semicolon column should support range operations.""" + manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + result = manager.run_query(filter_string="PureNumericSemicolon:25-45") + + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S3" not in sample_ids + assert "S4" not in sample_ids +
assert not any("string column" in w.lower() for w in result.warnings) + + +class TestLoadFileValidation: + """Test that SidecarQueryManager validates the same errors as the client-side + MetadataTSVValidator in load_file(), before any queries are run. + + This ensures that even if a user skips the CLI validator, the server-side + query engine catches the same formatting issues with clear error messages.""" + + def test_commas_in_data_raises_at_tsv_load(self, tmp_path): + """Commas in any cell value should raise SidecarMetadataFormatError during load_file().""" + tsv_content = "#Sample_ID\tArea\tWeight\nS1\tNorth,South\t12.5\nS2\tEast\t18.0\n" + tsv_file = tmp_path / "commas.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "commas" in str(excinfo.value).lower() + + def test_commas_in_non_queried_column_raises_at_tsv_load(self, tmp_path): + """Commas should be caught in all columns when the tsv is loaded, not just the column being queried.""" + tsv_content = "#Sample_ID\tArea\tBadCol\nS1\tNorth\thas,comma\nS2\tEast\tclean\n" + tsv_file = tmp_path / "commas_other_col.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "commas" in str(excinfo.value).lower() + + def test_duplicate_column_names_raises(self, tmp_path): + """Duplicate column names should raise SidecarMetadataFormatError during load_file().
+ Without this check, pandas might silently rename them (e.g., 'Area', 'Area.1').""" + tsv_content = "#Sample_ID\tArea\tArea\nS1\tNorth\tSouth\nS2\tEast\tWest\n" + tsv_file = tmp_path / "duplicate_cols.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "duplicate" in str(excinfo.value).lower() + + def test_empty_column_name_raises(self, tmp_path): + """Empty column names should raise SidecarMetadataFormatError during load_file().""" + tsv_content = "#Sample_ID\t\tWeight\nS1\tNorth\t12.5\nS2\tEast\t18.0\n" + tsv_file = tmp_path / "empty_col.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "empty" in str(excinfo.value).lower() + + def test_semicolon_in_sample_id_raises(self, tmp_path): + """Semicolons in Sample_ID values should raise SidecarSampleIDError during load_file(). + Sample_ID must contain exactly one value per row.""" + tsv_content = "#Sample_ID\tArea\nS1;S2\tNorth\nS3\tEast\n" + tsv_file = tmp_path / "semicolon_sample_id.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarSampleIDError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "semicolon" in str(excinfo.value).lower() + + def test_missing_sample_id_column_raises(self, sample_tsv_missing_sample_id_column): + """Missing Sample_ID column should raise SidecarColumnNotFoundError.""" + with pytest.raises(SidecarColumnNotFoundError): + SidecarQueryManager(file=sample_tsv_missing_sample_id_column) + + def test_empty_sample_id_raises(self, sample_tsv_with_invalid_sample_ids): + """Empty Sample_ID values should raise SidecarSampleIDError.""" + with pytest.raises(SidecarSampleIDError): + SidecarQueryManager(file=sample_tsv_with_invalid_sample_ids) + + def test_duplicate_sample_id_raises(self, sample_tsv_with_duplicate_sample_ids): + """Duplicate Sample_ID values should raise 
SidecarSampleIDError.""" + with pytest.raises(SidecarSampleIDError) as excinfo: + SidecarQueryManager(file=sample_tsv_with_duplicate_sample_ids) + assert "duplicate" in str(excinfo.value).lower() + + def test_valid_file_loads_successfully(self, sample_tsv_with_edge_cases): + """ + A TSV that follows DivBase requirements should load without errors. + Use the edge case fixture to assert that these are fine as long as they all + follow the DivBase requirements. + """ + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + assert manager.df is not None + assert "Sample_ID" in manager.df.columns From f82a6cc679ab2bc11461988975783b73820f59d0 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 16:00:03 +0100 Subject: [PATCH 058/100] Handle the case of filters like Area:>North Should give clear warnings that inequality operators are not supported on string columns. --- .../src/divbase_api/services/queries.py | 41 +++++++++++++++---- .../test_sample_metadata_queries.py | 39 +++++++++++++++--- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 2a817168..ffdc9b3b 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -900,16 +900,36 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": key, filter_string_values ) + comparison_operator_message = ( + f"DivBase comparison operators (>, <, >=, <=) only work on numeric columns. " + f"Use exact string matching instead (e.g., '{key}:value1,value2')." + ) if is_mixed and problematic_filter_values: - warning_msg = f"Column '{key}' has mixed types (including semicolon-separated values) and is treated as string by DivBase. Your filter {problematic_filter_values} uses numeric operations which won't work. Use exact string matching instead (e.g., '{key}:value1,value2')." 
+ warning_msg = ( + f"Column '{key}' has mixed types (both numeric-looking and non-numeric values) " + f"and is treated as a string column. A column is only numeric if all values " + f"(including each part in semicolon-separated cells) are valid numbers. " + f"Your filter contains comparison operators {problematic_filter_values} which are not " + f"supported on string columns. " + f"{comparison_operator_message}" + ) logger.warning(warning_msg) self.warnings.append(warning_msg) elif problematic_filter_values: - warning_msg = f"Column '{key}' is a string column. Your filter {problematic_filter_values} uses numeric operations which won't work on strings. Use exact string matching instead (e.g., '{key}:value1,value2')." + warning_msg = ( + f"Column '{key}' is a string column but your filter contains comparison operators " + f"{problematic_filter_values} which are not supported on string columns. " + f"{comparison_operator_message}" + ) logger.warning(warning_msg) self.warnings.append(warning_msg) elif is_mixed: - warning_msg = f"Column '{key}' has mixed types (including semicolon-separated values) and is treated as string by DivBase. Numeric operations are not available for this column." + warning_msg = ( + f"Column '{key}' has mixed types (both numeric-looking and non-numeric values) " + f"and is treated as a string column. A column is only numeric if all values " + f"(including each part in semicolon-separated cells) are valid numbers. " + f"Comparison operators (>, <, >=, <=) are not available for this column." + ) logger.warning(warning_msg) self.warnings.append(warning_msg) @@ -1146,12 +1166,17 @@ def _is_mixed_type_column(self, key: str) -> bool: def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string_values: str) -> list[str]: """ - Helper method to detect when a user's filter string contains numeric inequality syntax - (like ">25", ">=10", "<30", "<=5") on a column that is treated as string. 
Returns a list of the - problematic filter values for use in a warning message. + Helper method to detect when a user's filter string contains inequality operators + (e.g. ">25", ">=10", ", <, >=, <=) as a prefix, regardless of whether the + value after the operator is numeric or not. E.g. ">5" and ">North". + Doesn't flag range-like filter values like "1-2" since these are common string values (e.g., hyphenated IDs or names) and will correctly match via string matching. + + Returns a list of the problematic filter values for use in a warning message. """ problematic_filter_values = [] values = filter_string_values.split(",") @@ -1159,8 +1184,8 @@ def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string filter_value = filter_value.strip().lstrip("!") # strip negation prefix for checking if not filter_value: continue - # Check for inequality operators - if re.match(r"^(>=|<=|>|<)-?\d+\.?\d*$", filter_value): + # Check for inequality operators: + if re.match(r"^(>=|<=|>|<).+$", filter_value): problematic_filter_values.append(filter_value) return problematic_filter_values diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index b6c5ff0b..ad0a4136 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -775,12 +775,12 @@ class TestQueryWarnings: def test_mixed_type_column_warns_on_inequality_filter(self, sample_tsv_with_mixed_type_column): """Filtering with inequality syntax on a mixed-type column should produce a warning - that mentions both mixed types and the numeric operations.""" + that mentions both mixed types and the comparison operators.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Population_code:>5") assert any( - "mixed types" in w.lower() and "numeric operations" in w.lower() and "Population_code" in w + "mixed
types" in w.lower() and "comparison operators" in w.lower() and "Population_code" in w for w in result.warnings ) @@ -812,7 +812,8 @@ def test_pure_string_column_warns_on_numeric_inequality_filter(self, sample_tsv_ result = manager.run_query(filter_string="Area:>5") assert any( - "string column" in w.lower() and "Area" in w and "numeric operations" in w.lower() for w in result.warnings + "string column" in w.lower() and "Area" in w and "comparison operators" in w.lower() + for w in result.warnings ) assert not any("mixed types" in w.lower() and "Area" in w for w in result.warnings) @@ -836,7 +837,7 @@ def test_numeric_column_no_false_warning(self, sample_tsv_with_mixed_type_column assert "S4" in sample_ids assert "S5" in sample_ids assert not any("string column" in w.lower() for w in result.warnings) - assert not any("numeric operations" in w.lower() for w in result.warnings) + assert not any("comparison operators" in w.lower() for w in result.warnings) def test_warning_mentions_semicolon_rule(self, sample_tsv_with_mixed_type_column): """Query warnings should explain the semicolon classification rule.""" @@ -845,6 +846,34 @@ def test_warning_mentions_semicolon_rule(self, sample_tsv_with_mixed_type_column assert any("semicolon-separated" in w for w in result.warnings) + @pytest.mark.parametrize( + "column,filter_string,expected_warning,expected_sample_ids", + [ + ("Area", ">North", ["comparison operators", "Area", "only work on numeric"], []), + ("Area", "=North", ["comparison operators", ">=North"], None), + ("Population_code", ">8", ["mixed types", "comparison operators"], None), + ], + ) + def test_comparison_operator_parametrized( + self, sample_tsv_with_mixed_type_column, column, filter_string, expected_warning, expected_sample_ids + ): + """ + Parametrized test for comparison operator warnings and result length on string and mixed-type columns. 
+ """ + manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) + result = manager.run_query(filter_string=f"{column}:{filter_string}") + + for warning_substring in expected_warning: + assert any(warning_substring.lower() in w.lower() for w in result.warnings) + + if expected_sample_ids is not None: + sample_ids = result.get_unique_values("Sample_ID") + assert sample_ids == expected_sample_ids or len(sample_ids) == len(expected_sample_ids) + elif column == "Area" and filter_string == ">North": + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 0 + class TestSemicolonColumnTypeClassification: """Test that column type classification correctly handles semicolon-separated values. @@ -869,7 +898,7 @@ def test_semicolon_cell_with_non_numeric_part_warns_on_inequality( result = manager.run_query(filter_string="Code:>2") assert any("mixed types" in w.lower() and "Code" in w for w in result.warnings) - assert any("numeric operations" in w.lower() for w in result.warnings) + assert any("comparison operators" in w.lower() for w in result.warnings) def test_semicolon_cell_with_non_numeric_part_string_matching_works( self, sample_tsv_with_semicolon_mixed_type_column From b827b816ea20fb8871a3ef06e1851b0edec5eca8 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 16:14:27 +0100 Subject: [PATCH 059/100] Print None instead in CLI of [] when no result --- .../divbase-cli/src/divbase_cli/cli_commands/query_cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py index a46efea1..618a137c 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/query_cli.py @@ -113,9 +113,13 @@ def sample_metadata_query( color = "red" else: color = "bright_blue" + print(f"The results for the query 
([{color}]{results.query_message}[/{color}]):") - print(f"Unique Sample IDs: {results.unique_sample_ids}") - print(f"Unique filenames: {results.unique_filenames}\n") + + unique_sample_ids = results.unique_sample_ids if results.unique_sample_ids else None + unique_filenames = results.unique_filenames if results.unique_filenames else None + print(f"Unique Sample IDs: {unique_sample_ids}") + print(f"Unique filenames: {unique_filenames}\n") @query_app.command("bcftools-pipe") From 57b82d877adc8db40882a117865533105b77fbc1 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 12 Feb 2026 17:32:55 +0100 Subject: [PATCH 060/100] Update metadata user guide after refactoring --- docs/user-guides/sidecar-metadata.md | 70 ++++++++++++++++++---------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 3b00c724..3eb3e5a2 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -44,16 +44,15 @@ TODO - give more example of how and when it can be relevant to have multiple tsv ### Sidecar TSV format requirements -TODO - There is no fixed schema but some mandatory requirements +TODO - write a section on how there is no fixed schema but some mandatory requirements #### Mandatory content 1. The first row must be a header row and the first column must be named `Sample_ID`. -2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` -3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column. -4. Every column need to be tab separated for all rows. +2. 
The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. One entry per sample name; duplicates are not allowed. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` +3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column using semicolons (;) as delimiters. `Sample_ID` values can also not be empty. -TODO non empty, unique, no duplication, no semicolones in Sample_ID +4. Every column needs to be tab separated for all rows. #### User-defined columns @@ -64,7 +63,7 @@ After the `Sample_ID` column has been populated, users can add any columns and v To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: -1. The user-defined columns can be either numeric or string type. Mixing string and numeric values in the same column is not allowed; if a mix is detected, DivBase will raise an error and reject the file. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. +1. The user-defined columns can be either numeric or string type. A column is classified as numeric only if all values (including individual parts in semicolon-separated cells) can be parsed as numbers. If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will be treated as a string column even though some values look numeric.
The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. 2. Commas are not supported for the TSV and the DivBase system will send an error message if it detects any TSV cells with commas in them. Commas can have different meanings in different notation systems and to avoid confusion and to keep it simple, DivBase will simply not handle commas. Note that commas are used in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose. For decimals, use English decimal notation (.) and not comma (,). DivBase allows one single delimiter for enumerations in the TSV files and that is the semicolon (;) as will be described in the bullet. 3. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries on the more complex than if just a single value is use for each cell. 4. As outlined above, the only characters with special meaning or restrictions in the TSV are `#`, `,`, `;`, and `\t` (tab). Other special characters should be supported, but please be aware that Your Milage May Vary. Some common cases that have been tested and are supported include hyphens (`-`), e.g.`North-West`), diacritic unicodecharacters like `å`,`ä`,`ö`. @@ -100,35 +99,36 @@ The validation runs on the users local computer and not as a job on the DivBase The command requires that the project's dimensions index is up-to-date with the VCF files in the project, and that is why is sort under `divbase-cli dimensions` in the CLI command tree. 
If you are unsure if the dimensions index is up-to-date, just run `divbase-cli dimensions update` and wait until that job has completed by checking `divbase-cli task-history user`. -The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneus or even misleading query results, and the validator will help with spotting that. +The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneus or even misleading query results, and the validator will help with spotting that. Several of the checks that the validator performs are also done at the start of a sample metadata query, but this sample name check is currently only done by the validator. -The following will return errors. These must be fixed if the sidecar TSV should be used in DivBase queries: +The following will return **Errors**. These must be fixed if the sidecar TSV should be used in DivBase queries: -- Header formatting: Header row is missing or first column is not #Sample_ID, Duplicate or empty column names +- Header formatting: Header row is missing or first column is not `#Sample_ID`, duplicate or empty column names -- Tab separation: Row has the wrong number of columns +- Tab separation: Row has the wrong number of columns (Note that check is only done in the validator! 
It is currently not part of the checks at the start of a sample metadata query) -- `Sample_ID` : Empty Sample_ID,Sample_ID contains a semicolon,Duplicate Sample_ID +- `Sample_ID`: Empty Sample_ID, Sample_ID contains a semicolon, duplicate Sample_ID -- Unsupported characters: no commas in cell values; no hyphens in numerical columns +- Unsupported characters: no commas in cell values -- Type consistency (numeric and string values): no Mixed types in a column or in a cell in a cell (e.g., 1;abc) +- All samples listed in the TSV must exist in the dimensions index -- All samples listed in in TSV must exist in the dimensions index +!!! Note + The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation and that samples match the dimensions file, which are validator-specific checks). This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. -The validator will also raise Warnings. DivBase queries can still be run with these, but the user should review them, and possible address them if so desired: +The validator will also raise **Warnings**. DivBase queries can still be run with Warnings, but the user should review them, and possible address them if so desired: - Cell value has leading or trailing whitespace (will be stripped by server) - Samples in the project’s dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, espcially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. +- Mixed-type columns (e.g. a column with "8", "1a", "5a") and Semicolon-separated cells with mixed types (e.g., "1;abc"). 
They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. + +- Hyphens in values that look like range notation (e.g., "1-2") in columns that also contain numeric values. The warning message will ask the user if they intended this to be a multicolumn value which should use semicolons as delimters. ## Query Syntax for sidecar metadata This section describes how to query on the sample metadata file itself. The same syntax used here will also be used when running combined sample metadata and VCF data queries; how to do that is covered in [DivBase Query Syntax for VCF data](query-syntax.md). -- TODO: explain warnings, these should be the same as the validator, but this needs to be checked -- TODO: explain when empty results or all results are returned - ### Overview: querys are applied as filters on columns in the TSV Queries on the sidecar sample metadata TSV can be done with the `divbase-cli query tsv` command. The filters that the user want to query on needs entered as a string (i.e. enclosed in quotes, `""`). @@ -187,6 +187,14 @@ Note that when inclusive and exclusive are combined (e.g. `"Area:East,!South"`), ### Filtering on numerical columns +A TSV column is considered as numeric in DivBase only if all cell values — including each individual part within semicolon-separated cells (e.g. `1;3;5`) — can be parsed as a number. For example: + +- A column with values `1`, `2;4`, `3`, `1;3;5` is considered numeric since all elements are numbers. All numeric operations below (inequalities, ranges, discrete) are fully supported on this column. + +- A column with values `1;1-2`, `3`, `5` is considered a string column since the part `1-2` cannot be parsed as a number. Only exact string matching is supported for this column. 
+ +- A column with values `8`, `1a`, `5a` is considered a string column since it has mixed types (`8` is numeric, the others are strings). Only exact string matching is supported for this column. + For numerical columns, it is possible to filter on the following operations: - **Inequalities** @@ -213,6 +221,26 @@ The `!` (NOT) operator can really come to good use for numerical filters: - `"Weight:>5,!10-15"` returns rows where the value is greater than 5, but not in the range 10–15. - `"Weight:!1-2,4"` returns rows where the value is not in the range 1–2, or is 4. +### Query Warnings: spotting potential issues with the TSV or the query filter + +When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be returned to the user. Warnings, however, will not stop queries from running, but indicate that the user should carefully review the results. + +Reviewing the Warnings to judge if they are relevant or not is key to avoiding unintended query results. The following are treated as Warnings by DivBase queries (and by the TSV validator). + +- **Comparison operators on string/mixed-type columns**: DivBase comparison operators (`>`, `<`, `>=`, `<=`) only work on numeric columns. If you use them on a string or mixed-type column — whether with a numeric operand (e.g., `Population:>5`) or a string operand (e.g., `Area:>North`) — DivBase will warn that comparison operators are not supported on string columns. Use exact string matching instead (e.g., `Area:North` or `Population:8,1a`). + +- **Mixed-type column information**: +When filtering on a mixed-type column with valid string matching, DivBase will inform you that the column is treated as string and comparison operators are not available. This is mainly to make the user aware of this.
+ +- **Column not found**: +If the filter references a column that does not exist in the TSV, DivBase will warn and skip that filter condition. + +- **No matching values**: +If none of the filter values match any values in the column, DivBase prints a warning. This can indicate a typo in the filter value, or just that the specific filter combination filtered away all samples. + +!!! Tip + Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for semicolon-separated numeric columns as long as every semicolon-separated part (`part;part`) in every cell in the column is a valid number. For instance: a `Population` column with values `1`, `2;4`, `1;3;5`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `2;4` and `1;3;5`. + ### Examples of complex queries Assuming that the sidecar metadata TSV file looks like in the [Example](#example) above, a query like will: @@ -226,9 +254,3 @@ divbase-cli query tsv "Area:North,West,!South;Weight:>10,<=20,!15,18-22" - include rows where the `Weight` column is greater than 10, **or** less than or equal to 20, **or** in the range 18–22 (inclusive), **but excludes** any row where Weight is exactly 15 **or** any value in the range 18–22. There are three samples (rows) that fulfill this, and this is what the query results will return: `S1`, `S4`, and `S5`. - -TODOs: - -- [TO BE IMPLEMENTED] what to do if a query references a column that does not exist. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Area does not exist? This should probably give a warning and not just return nothing - -- [TO BE IMPLEMENTED] what to do if a query references a column value. E.g. `divbase-cli query tsv "Area:Northern Portugal"` when Northern Portugal does not exist in the column? This should probably also give a warning and not just return nothing, but nothing is a result here and not a syntax problem...
From 7c588c62b0e703f000537c25df8cfa0aa8e37dbb Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 12:05:37 +0100 Subject: [PATCH 061/100] Ensure all errors from Celery task are propagated Turns out that a source of error here was that the Celery JSON serialization/deserialization will raise UnpicklableExceptionWrapper when the exception definitions are more complex: all custom exceptions inheriting from DivBaseAPIException (which has __init__ with custom args and kwargs) triggered UnpicklableExceptionWrapper and the errors were not printed properly in the terminal. Simple exceptions that inherit directly from Exception, such as SidecarInvalidFilterError, did not trigger UnpicklableExceptionWrapper and could easily be caught and printed in the terminal in the API route. The work-around implemented here is to wrap, in tasks.py, complex exceptions that would trigger UnpicklableExceptionWrapper in a simple wrapper exception (TaskUserError) that inherits directly from Exception. The wrapped error can then pass through the serialization/deserialization process without being wrapped in UnpicklableExceptionWrapper, which would cause loss of the original error type and message. This allows the API to return more specific error messages to the user when a task fails due to known issues (e.g., missing TSV file, missing VCF dimensions entry) rather than a generic error message. 
--- .../divbase-api/src/divbase_api/exceptions.py | 39 ++++++++++++++++--- .../src/divbase_api/routes/queries.py | 16 +++----- .../src/divbase_api/worker/tasks.py | 22 ++++++++--- .../divbase-lib/src/divbase_lib/exceptions.py | 17 ++++++++ 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/exceptions.py b/packages/divbase-api/src/divbase_api/exceptions.py index 44ab233e..b67163cf 100644 --- a/packages/divbase-api/src/divbase_api/exceptions.py +++ b/packages/divbase-api/src/divbase_api/exceptions.py @@ -91,12 +91,19 @@ def __init__(self, message: str = "Could not find the specified project version" class VCFDimensionsEntryMissingError(DivBaseAPIException): """Raised when there are no entries in the VCF dimensions db table.""" - def __init__(self, project_name: str): - message = ( - f"The VCF dimensions index in project '{project_name}' is missing or empty. " - "Please ensure that there are VCF files in the project and run:\n" - "'divbase-cli dimensions update --project '\n" - ) + def __init__(self, project_name: str | None = None): + if project_name: + message = ( + f"The VCF dimensions index in project '{project_name}' is missing or empty. " + "Please ensure that there are VCF files in the project and run:\n" + "'divbase-cli dimensions update --project '\n" + ) + else: + message = ( + "The VCF dimensions index is missing or empty. " + "Please ensure that there are VCF files in the project and run:\n" + "'divbase-cli dimensions update --project '\n" + ) super().__init__(message, status_code=status.HTTP_404_NOT_FOUND) @@ -128,3 +135,23 @@ class ObjectDoesNotExistError(DivBaseAPIException): def __init__(self, key: str, bucket_name: str): message = f"The file/object '{key}' does not exist in the project '{bucket_name}'. 
" super().__init__(message=message, status_code=status.HTTP_404_NOT_FOUND) + + +class TSVFileNotFoundInProjectError(DivBaseAPIException): + """Raised when a TSV doesn't exist in project storage.""" + + def __init__(self, filename: str | None = None, project_name: str | None = None): + self.filename = filename + self.project_name = project_name + if filename and project_name: + message = ( + f"The sample metadata TSV file '{filename}' " + f"was not found in your project '{project_name}'.\n" + "Please check that you have spelled the file name correctly and that the file has been uploaded to the project." + ) + else: + message = ( + "A sample metadata TSV file was not found.\n" + "Please check that you have spelled the file name correctly and that the file has been uploaded to the project." + ) + super().__init__(message=message, status_code=status.HTTP_404_NOT_FOUND) diff --git a/packages/divbase-api/src/divbase_api/routes/queries.py b/packages/divbase-api/src/divbase_api/routes/queries.py index 5a9d4bc3..769c0915 100644 --- a/packages/divbase-api/src/divbase_api/routes/queries.py +++ b/packages/divbase-api/src/divbase_api/routes/queries.py @@ -15,7 +15,7 @@ from divbase_api.crud.task_history import create_task_history_entry, update_task_history_entry_with_celery_task_id from divbase_api.db import get_db from divbase_api.deps import get_project_member -from divbase_api.exceptions import AuthorizationError, VCFDimensionsEntryMissingError +from divbase_api.exceptions import AuthorizationError from divbase_api.models.projects import ProjectDB, ProjectRoles from divbase_api.models.users import UserDB from divbase_api.worker.tasks import ( @@ -34,6 +34,7 @@ SidecarInvalidFilterError, SidecarMetadataFormatError, SidecarSampleIDError, + TaskUserError, ) logging.basicConfig(level=settings.api.log_level, handlers=[logging.StreamHandler(sys.stderr)]) @@ -93,13 +94,12 @@ async def sample_metadata_query( SidecarColumnNotFoundError, SidecarSampleIDError, 
SidecarMetadataFormatError, + TaskUserError, ) as e: - # Catch validation errors (mixed types, missing columns, invalid Sample_IDs) and return 400 + # These are simple exceptions (that inherit from base Exception) that are able to pass through Celery's JSON serialization/deserialization without becoming UnpicklableExceptionWrapper. + # TaskUserError is a wrapper exception that allow to nest more complex exceptions that would normally trigger UnpicklableExceptionWrapper, and still be able to pass through the serialization/deserialization. error_message = str(e) raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=error_message) from None - except VCFDimensionsEntryMissingError: - # Catch and raise anew to avoid duplications in the error message - raise VCFDimensionsEntryMissingError(project_name=project.name) from None except celery.exceptions.TimeoutError: # type: ignore error_message = ( f"The query is still being processed and has Task ID: {results.id}. \n" @@ -108,12 +108,6 @@ async def sample_metadata_query( f"divbase-cli task-history id {results.id}" ) raise HTTPException(status_code=status.HTTP_408_REQUEST_TIMEOUT, detail=error_message) from None - except FileNotFoundError: - error_message = ( - f"The sample metadata TSV file named: {sample_metadata_query_request.metadata_tsv_name} was not found in your project {project.name} \n" - "Please make sure to upload it first ('divbase-cli files upload ...') and try again." 
- ) - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=error_message) from None return SampleMetadataQueryTaskResult(**result_dict) diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index 4dd4e265..227c333c 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -15,7 +15,11 @@ worker_process_init, ) -from divbase_api.exceptions import VCFDimensionsEntryMissingError +from divbase_api.exceptions import ( + ObjectDoesNotExistError, + TSVFileNotFoundInProjectError, + VCFDimensionsEntryMissingError, +) from divbase_api.models.task_history import TaskHistoryDB, TaskStartedAtDB from divbase_api.services.queries import BCFToolsInput, BcftoolsQueryManager, run_sidecar_metadata_query from divbase_api.services.s3_client import S3FileManager, create_s3_file_manager @@ -40,7 +44,7 @@ ) from divbase_api.worker.worker_db import SyncSessionLocal from divbase_lib.api_schemas.vcf_dimensions import DimensionUpdateTaskResult -from divbase_lib.exceptions import NoVCFFilesFoundError +from divbase_lib.exceptions import NoVCFFilesFoundError, TaskUserError logger = logging.getLogger(__name__) @@ -168,15 +172,21 @@ def sample_metadata_query_task( s3_file_manager = create_s3_file_manager(url=S3_ENDPOINT_URL) - metadata_path = _download_sample_metadata( - metadata_tsv_name=metadata_tsv_name, bucket_name=bucket_name, s3_file_manager=s3_file_manager - ) + try: + metadata_path = _download_sample_metadata( + metadata_tsv_name=metadata_tsv_name, bucket_name=bucket_name, s3_file_manager=s3_file_manager + ) + except ObjectDoesNotExistError: + # If ObjectDoesNotExistError, propagage the more specific TSVFileNotFoundInProjectError upwards. 
+ # Wrap exeception in TaskUserError () to avoid Celery serilization UnpicklableExceptionWrapper issue + raise TaskUserError(str(TSVFileNotFoundInProjectError(metadata_tsv_name, project_name))) from None with SyncSessionLocal() as db: vcf_dimensions_data = get_vcf_metadata_by_project(project_id=project_id, db=db) if not vcf_dimensions_data.get("vcf_files"): - raise VCFDimensionsEntryMissingError(project_name=project_name) + # Wrap exeception in TaskUserError () to avoid Celery serilization UnpicklableExceptionWrapper issue + raise TaskUserError(str(VCFDimensionsEntryMissingError(project_name=project_name))) from None metadata_result = run_sidecar_metadata_query( file=metadata_path, diff --git a/packages/divbase-lib/src/divbase_lib/exceptions.py b/packages/divbase-lib/src/divbase_lib/exceptions.py index bf330d75..6930f3b7 100644 --- a/packages/divbase-lib/src/divbase_lib/exceptions.py +++ b/packages/divbase-lib/src/divbase_lib/exceptions.py @@ -113,6 +113,23 @@ class SidecarMetadataFormatError(Exception): pass +class TaskUserError(Exception): + """ + Raised in Celery tasks when an error needs to propagate back to the CLI user. + + This is intentionally kept as a simple Exception subclass (no custom __init__) + to avoid UnpicklableExceptionWrapper when passing through Celery's JSON serialization/deserialization. + Complex exception types such as those inheriting from DivBaseAPIException seem to trigger UnpicklableExceptionWrapper. + + This class is essentially a wrapper to allow to use more complex exceptions in Celery tasks and catch them + in the API route handlers to return user-friendly error messages to the CLI. 
+ In the Celery task use it like: + raise TaskUserError(str(SomeComplexError(...))) from None + """ + + pass + + class NoVCFFilesFoundError(Exception): """Raised when no VCF files are found in the project bucket.""" From 667c56daa9f70cd8c34060886488c16f4cba840f Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 13:55:40 +0100 Subject: [PATCH 062/100] Add e2e test to assert latest errors in terminal For VCFDimensionsEntryMissingError TSVFileNotFoundInProjectError adressed in the previous commit --- .../cli_commands/test_query_cli.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index b5d3fada..ee9e97e2 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -355,6 +355,76 @@ def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): ) +def test_error_in_terminal_for_sample_metadata_query_on_tsv_not_in_bucket( + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test that the sample metadata query raises the correct error when the specified TSV file is not found in the project bucket. + + Indirectly covers the case of the user misspells the TSV filename. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + + filename = "filename_that_does_not_exist_in_bucket.tsv" + command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name}' + cli_result = runner.invoke(app, command) + + assert f"The sample metadata TSV file '{filename}' was not found in your project '{project_name}'" in str( + cli_result.exception + ), "Expected error message about missing TSV file in project bucket" + + +def test_error_in_terminal_for_sample_metadata_query_when_no_dimensions_file( + CONSTANTS, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test that the sample metadata query raises the correct error when there is no dimensions file in the project bucket. + """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + # To test for this, do not run the update_dimensions fixture + + filename = "sample_metadata_HOM_chr_split_version.tsv" + command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name} ' + cli_result = runner.invoke(app, command) + + assert f"The VCF dimensions index in project '{project_name}' is missing or empty" in str(cli_result.exception), ( + "Expected error message about missing VCF dimensions file in project bucket" + ) + + +def test_error_in_terminal_for_sample_metadata_query_tsv_missing_should_be_raised_before_dimensions_check( + CONSTANTS, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, +): + """ + Test that the missing TSV in bucket error is raised before the dimensions file check for a case when both are incorrect. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + # To test for this, do not run the update_dimensions fixture + + filename = "sample_metadata.tsv" # not in the split-scaffold-project bucket + command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name} ' + cli_result = runner.invoke(app, command) + + assert f"The sample metadata TSV file '{filename}' was not found in your project '{project_name}'" in str( + cli_result.exception + ), "Expected error message about missing TSV file in project bucket" + + @pytest.mark.integration @pytest.mark.parametrize( "params,expect_success,ensure_dimensions_file,expected_logs,expected_error_msgs", From a3c31e3a6e694b908bfd35f3a54f1dac77ba46a0 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 14:26:20 +0100 Subject: [PATCH 063/100] Add e2e tests for other metadata task exceptions --- .../cli_commands/test_query_cli.py | 250 ++++++++++++++---- 1 file changed, 194 insertions(+), 56 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index ee9e97e2..f4789d36 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -355,74 +355,212 @@ def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): ) -def test_error_in_terminal_for_sample_metadata_query_on_tsv_not_in_bucket( - CONSTANTS, - run_update_dimensions, - db_session_sync, - project_map, - logged_in_edit_user_with_existing_config, -): - """ - Test that the sample metadata query raises the correct error when the specified TSV file is not found in the project bucket. 
+class TestSidecarQueryTaskErrorsPropagation: + """Test that errors in sidecar query tasks are propagated correctly to the CLI.""" + + def test_error_in_terminal_for_sample_metadata_query_on_tsv_not_in_bucket( + self, + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + ): + """ + Test that the sample metadata query raises the correct error when the specified TSV file is not found in the project bucket. - Indirectly covers the case of the user misspells the TSV filename. - """ - project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] - bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] - project_id = project_map[project_name] - user_id = 1 + Indirectly covers the case of the user misspells the TSV filename. + """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 - run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + run_update_dimensions( + bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id + ) - filename = "filename_that_does_not_exist_in_bucket.tsv" - command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name}' - cli_result = runner.invoke(app, command) + tsv_filename = "filename_that_does_not_exist_in_bucket.tsv" + command = f'query tsv "Area:North" --metadata-tsv-name {tsv_filename} --project {project_name}' + cli_result = runner.invoke(app, command) - assert f"The sample metadata TSV file '{filename}' was not found in your project '{project_name}'" in str( - cli_result.exception - ), "Expected error message about missing TSV file in project bucket" + assert f"The sample metadata TSV file '{tsv_filename}' was not found in your project '{project_name}'" in str( + cli_result.exception + ), "Expected error message about missing TSV file in 
project bucket" + def test_error_in_terminal_for_sample_metadata_query_when_no_dimensions_file( + self, + CONSTANTS, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + ): + """ + Test that the sample metadata query raises the correct error when there is no dimensions file in the project bucket. + """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + # To test for this, do not run the update_dimensions fixture + + tsv_filename = "sample_metadata_HOM_chr_split_version.tsv" + command = f'query tsv "Area:North" --metadata-tsv-name {tsv_filename} --project {project_name} ' + cli_result = runner.invoke(app, command) + + assert f"The VCF dimensions index in project '{project_name}' is missing or empty" in str( + cli_result.exception + ), "Expected error message about missing VCF dimensions file in project bucket" + + def test_error_in_terminal_for_sample_metadata_query_tsv_missing_should_be_raised_before_dimensions_check( + self, + CONSTANTS, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + ): + """ + Test that the missing TSV in bucket error is raised before the dimensions file check for a case when both are incorrect. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + # To test for this, do not run the update_dimensions fixture + + filename = "sample_metadata.tsv" # not in the split-scaffold-project bucket + command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name} ' + cli_result = runner.invoke(app, command) + + assert f"The sample metadata TSV file '{filename}' was not found in your project '{project_name}'" in str( + cli_result.exception + ), "Expected error message about missing TSV file in project bucket" + + def test_error_in_terminal_for_invalid_filter_syntax( + self, + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, + ): + """ + Test that SidecarInvalidFilterError is raised and propagated to terminal when filter syntax is invalid. + """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 -def test_error_in_terminal_for_sample_metadata_query_when_no_dimensions_file( - CONSTANTS, - db_session_sync, - project_map, - logged_in_edit_user_with_existing_config, -): - """ - Test that the sample metadata query raises the correct error when there is no dimensions file in the project bucket. 
- """ - project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] - # To test for this, do not run the update_dimensions fixture + run_update_dimensions( + bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id + ) - filename = "sample_metadata_HOM_chr_split_version.tsv" - command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name} ' - cli_result = runner.invoke(app, command) + tsv_filename = "sample_metadata_HOM_chr_split_version.tsv" - assert f"The VCF dimensions index in project '{project_name}' is missing or empty" in str(cli_result.exception), ( - "Expected error message about missing VCF dimensions file in project bucket" - ) + invalid_filter = "Area North" # Use invalid filter syntax (missing colon) + command = f'query tsv "{invalid_filter}" --metadata-tsv-name {tsv_filename} --project {project_name}' + cli_result = runner.invoke(app, command) + assert f"Invalid filter format: '{invalid_filter}'. Expected format 'key:value1,value2' or" in str( + cli_result.exception + ), "Expected error message about invalid filter syntax" -def test_error_in_terminal_for_sample_metadata_query_tsv_missing_should_be_raised_before_dimensions_check( - CONSTANTS, - db_session_sync, - project_map, - logged_in_edit_user_with_existing_config, -): - """ - Test that the missing TSV in bucket error is raised before the dimensions file check for a case when both are incorrect. - """ - project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] - # To test for this, do not run the update_dimensions fixture + def test_error_in_terminal_when_querying_nonexistent_column_in_tsv( + self, + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, + ): + """ + Test that SidecarColumnNotFoundError is raised and propagated to terminal when querying non-existent column. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 - filename = "sample_metadata.tsv" # not in the split-scaffold-project bucket - command = f'query tsv "Area:North" --metadata-tsv-name {filename} --project {project_name} ' - cli_result = runner.invoke(app, command) + run_update_dimensions( + bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id + ) + + tsv_filename = "sample_metadata_HOM_chr_split_version.tsv" + + invalid_filter = "NonExistentColumn:value" # Query a column that doesn't exist in the TSV + command = f'query tsv "{invalid_filter}" --metadata-tsv-name {tsv_filename} --project {project_name}' + cli_result = runner.invoke(app, command) + + output = cli_result.stdout + (str(cli_result.exception) if cli_result.exception else "") + assert "Column 'NonExistentColumn' not found in the TSV file. Skipping this filter condition." in output + assert "Invalid filter conditions: none of the filters matched any records. Returning ALL records." in output + assert "This may be a large result set. Please check your filter keys, value spelling, and syntax." in output + + def test_error_in_terminal_when_duplicate_sample_IDs_in_tsv( + self, + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, + ): + """ + Test that SidecarSampleIDError is raised and propagated to terminal when TSV has duplicate Sample_IDs. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions( + bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id + ) - assert f"The sample metadata TSV file '{filename}' was not found in your project '{project_name}'" in str( - cli_result.exception - ), "Expected error message about missing TSV file in project bucket" + tsv_file = tmp_path / "test_duplicate_sample_ids.tsv" + tsv_file.write_text("Sample_ID\tArea\nS1\tNorth\nS1\tSouth\nS2\tEast\n") + command = f"files upload {tsv_file} --project {project_name}" + result = runner.invoke(app, command) + assert result.exit_code == 0 + + command = f'query tsv "Area:North" --metadata-tsv-name {tsv_file.name} --project {project_name}' + cli_result = runner.invoke(app, command) + + assert "Duplicate Sample_IDs found: ['S1']. Each Sample_ID must be unique." in str(cli_result.exception), ( + "Expected error message about duplicate Sample_IDs" + ) + + def test_error_in_terminal_for_comma_in_metadata( + self, + CONSTANTS, + run_update_dimensions, + db_session_sync, + project_map, + logged_in_edit_user_with_existing_config, + tmp_path, + ): + """ + Test that SidecarMetadataFormatError is raised and propagated to terminal when TSV has commas in data. 
+ """ + project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + project_id = project_map[project_name] + user_id = 1 + + run_update_dimensions( + bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id + ) + + tsv_file = tmp_path / "test_comma_in_data.tsv" + tsv_file.write_text("Sample_ID\tArea\nS1\tNorth,West\nS2\tSouth\n") + command = f"files upload {tsv_file} --project {project_name}" + result = runner.invoke(app, command) + assert result.exit_code == 0 + + command = f'query tsv "Area:North" --metadata-tsv-name {tsv_file.name} --project {project_name}' + cli_result = runner.invoke(app, command) + + assert ( + "Column 'Area' contains commas in value 'North,West' at row 1. Commas are not allowed in DivBase metadata files." + in str(cli_result.exception) + ), "Expected error message about comma in metadata value" @pytest.mark.integration From cea27789d002897ee28fb054ff8de068e8b276bc Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 14:35:29 +0100 Subject: [PATCH 064/100] Harmonize test docstrings --- .../test_sample_metadata_queries.py | 46 ++++++++-------- .../test_sample_metadata_tsv_validator.py | 52 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index ad0a4136..19292e9d 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -774,7 +774,7 @@ class TestQueryWarnings: filter on mixed-type or string columns with numeric syntax.""" def test_mixed_type_column_warns_on_inequality_filter(self, sample_tsv_with_mixed_type_column): - """Filtering with inequality syntax on a mixed-type column should produce a warning + """Test that filtering with inequality syntax on a mixed-type column should produce a warning that mentions both mixed 
types and the comparison operators.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Population_code:>5") @@ -785,7 +785,7 @@ def test_mixed_type_column_warns_on_inequality_filter(self, sample_tsv_with_mixe ) def test_mixed_type_column_range_syntax_does_string_match(self, sample_tsv_with_mixed_type_column): - """Range-like patterns (e.g., '1-5') on a mixed-type column are treated as literal string + """Test that range-like patterns (e.g., '1-5') on a mixed-type column are treated as literal string matches, not numeric ranges. A general mixed-type warning is expected, but not a 'numeric operations won't work' warning since hyphenated values are common in strings.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) @@ -795,7 +795,7 @@ def test_mixed_type_column_range_syntax_does_string_match(self, sample_tsv_with_ assert not any("will not work" in w for w in result.warnings) def test_mixed_type_column_no_warning_on_string_filter(self, sample_tsv_with_mixed_type_column): - """Filtering with plain string values on a mixed-type column should produce a + """Test that filtering with plain string values on a mixed-type column should produce a general mixed-type info warning but not a numeric-syntax warning.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Population_code:8,1a") @@ -807,7 +807,7 @@ def test_mixed_type_column_no_warning_on_string_filter(self, sample_tsv_with_mix assert not any("will not work" in w for w in result.warnings) def test_pure_string_column_warns_on_numeric_inequality_filter(self, sample_tsv_with_mixed_type_column): - """Filtering with numeric inequality syntax (>5) on a pure string column (Area) should result in a warning.""" + """Test that filtering with numeric inequality syntax (>5) on a pure string column (Area) should result in a warning.""" manager = 
SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Area:>5") @@ -818,7 +818,7 @@ def test_pure_string_column_warns_on_numeric_inequality_filter(self, sample_tsv_ assert not any("mixed types" in w.lower() and "Area" in w for w in result.warnings) def test_pure_string_column_no_warning_on_normal_filter(self, sample_tsv_with_mixed_type_column): - """Filtering with plain string values on a pure string column should produce no warnings.""" + """Test that filtering with plain string values on a pure string column should produce no warnings.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Area:North,East") @@ -828,7 +828,7 @@ def test_pure_string_column_no_warning_on_normal_filter(self, sample_tsv_with_mi assert len(result.warnings) == 0 def test_numeric_column_no_false_warning(self, sample_tsv_with_mixed_type_column): - """Filtering with numeric syntax on a numeric column should NOT produce a warning.""" + """Test that filtering with numeric syntax on a numeric column should NOT produce a warning.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Weight:>15") @@ -840,7 +840,7 @@ def test_numeric_column_no_false_warning(self, sample_tsv_with_mixed_type_column assert not any("comparison operators" in w.lower() for w in result.warnings) def test_warning_mentions_semicolon_rule(self, sample_tsv_with_mixed_type_column): - """Query warnings should explain the semicolon classification rule.""" + """Test that query warnings explain the semicolon classification rule.""" manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string="Population_code:>5") @@ -859,7 +859,7 @@ def test_comparison_operator_parametrized( self, sample_tsv_with_mixed_type_column, column, filter_string, expected_warning, expected_sample_ids ): """ - Parametrized test for 
comparison operator warnings and result length on string and mixed-type columns. + Test for comparison operator warnings and result length on string and mixed-type columns. """ manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) result = manager.run_query(filter_string=f"{column}:{filter_string}") @@ -883,7 +883,7 @@ class TestSemicolonColumnTypeClassification: def test_semicolon_cell_with_non_numeric_part_makes_column_string( self, sample_tsv_with_semicolon_mixed_type_column ): - """A column with a cell '1;1-2' should be treated as string because '1-2' is not a number.""" + """Test that a column with a cell '1;1-2' should be treated as string because '1-2' is not a number.""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) assert not pd.api.types.is_numeric_dtype(manager.df["Code"]) @@ -893,7 +893,7 @@ def test_semicolon_cell_with_non_numeric_part_makes_column_string( def test_semicolon_cell_with_non_numeric_part_warns_on_inequality( self, sample_tsv_with_semicolon_mixed_type_column ): - """Inequality filter on a column broken by '1;1-2' should produce a warning.""" + """Test that inequality filter on a column broken by '1;1-2' should produce a warning.""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) result = manager.run_query(filter_string="Code:>2") @@ -903,7 +903,7 @@ def test_semicolon_cell_with_non_numeric_part_warns_on_inequality( def test_semicolon_cell_with_non_numeric_part_string_matching_works( self, sample_tsv_with_semicolon_mixed_type_column ): - """String matching should still work on the mixed column. Filtering for '1-2' should matches cell value '1;1-2'.""" + """Test that string matching should still work on the mixed column. 
Filtering for '1-2' should matches cell value '1;1-2'.""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) result = manager.run_query(filter_string="Code:1-2") @@ -914,7 +914,7 @@ def test_semicolon_cell_with_non_numeric_part_string_matching_works( def test_semicolon_cell_with_non_numeric_part_single_numeric_match( self, sample_tsv_with_semicolon_mixed_type_column ): - """String matching for '3' on the mixed column should return S2 (exact string match).""" + """Test that string matching for '3' on the mixed column should return S2 (exact string match).""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) result = manager.run_query(filter_string="Code:3") @@ -923,7 +923,7 @@ def test_semicolon_cell_with_non_numeric_part_single_numeric_match( assert len(sample_ids) == 1 def test_purely_numeric_semicolon_column_supports_numeric_ops(self, sample_tsv_with_semicolon_mixed_type_column): - """A column with only numeric semicolon values (e.g., '10;20;30') should support numeric operations.""" + """Test that a column with only numeric semicolon values (e.g., '10;20;30') should support numeric operations.""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) assert manager._is_semicolon_separated_numeric_column("PureNumericSemicolon") @@ -937,7 +937,7 @@ def test_purely_numeric_semicolon_column_supports_numeric_ops(self, sample_tsv_w assert not any("string column" in w.lower() for w in result.warnings) def test_purely_numeric_semicolon_column_range_filter(self, sample_tsv_with_semicolon_mixed_type_column): - """A purely numeric semicolon column should support range operations.""" + """Test that a purely numeric semicolon column should support range operations.""" manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) result = manager.run_query(filter_string="PureNumericSemicolon:25-45") @@ -957,7 +957,7 @@ class TestLoadFileValidation: query engine catches the 
same formatting issues with clear error messages.""" def test_commas_in_data_raises_at_tsv_load(self, tmp_path): - """Commas in any cell value should raise SidecarMetadataFormatError during load_file().""" + """Test that commas in any cell value raises SidecarMetadataFormatError during load_file().""" tsv_content = "#Sample_ID\tArea\tWeight\nS1\tNorth,South\t12.5\nS2\tEast\t18.0\n" tsv_file = tmp_path / "commas.tsv" tsv_file.write_text(tsv_content) @@ -967,7 +967,7 @@ def test_commas_in_data_raises_at_tsv_load(self, tmp_path): assert "commas" in str(excinfo.value).lower() def test_commas_in_non_queried_column_raises_at_tsv_load(self, tmp_path): - """Commas should be caught in all columns when the tsv is loaded, not just the column being queried.""" + """Test that commas are caught in all columns when the tsv is loaded, not just the column being queried.""" tsv_content = "#Sample_ID\tArea\tBadCol\nS1\tNorth\thas,comma\nS2\tEast\tclean\n" tsv_file = tmp_path / "commas_other_col.tsv" tsv_file.write_text(tsv_content) @@ -977,7 +977,7 @@ def test_commas_in_non_queried_column_raises_at_tsv_load(self, tmp_path): assert "commas" in str(excinfo.value).lower() def test_duplicate_column_names_raises(self, tmp_path): - """Duplicate column names should raise SidecarMetadataFormatError during load_file(). + """Test that duplicate column names raise SidecarMetadataFormatError during load_file(). 
Without this check, pandas might silently rename them (e.g., 'Area', 'Area.1').""" tsv_content = "#Sample_ID\tArea\tArea\nS1\tNorth\tSouth\nS2\tEast\tWest\n" tsv_file = tmp_path / "duplicate_cols.tsv" @@ -988,7 +988,7 @@ def test_duplicate_column_names_raises(self, tmp_path): assert "duplicate" in str(excinfo.value).lower() def test_empty_column_name_raises(self, tmp_path): - """Empty column names should raise SidecarMetadataFormatError during load_file().""" + """Test that empty column names raise SidecarMetadataFormatError during load_file().""" tsv_content = "#Sample_ID\t\tWeight\nS1\tNorth\t12.5\nS2\tEast\t18.0\n" tsv_file = tmp_path / "empty_col.tsv" tsv_file.write_text(tsv_content) @@ -998,7 +998,7 @@ def test_empty_column_name_raises(self, tmp_path): assert "empty" in str(excinfo.value).lower() def test_semicolon_in_sample_id_raises(self, tmp_path): - """Semicolons in Sample_ID values should raise SidecarSampleIDError during load_file(). + """Test that semicolons in Sample_ID values raise SidecarSampleIDError during load_file(). 
Sample_ID must contain exactly one value per row.""" tsv_content = "#Sample_ID\tArea\nS1;S2\tNorth\nS3\tEast\n" tsv_file = tmp_path / "semicolon_sample_id.tsv" @@ -1009,24 +1009,24 @@ def test_semicolon_in_sample_id_raises(self, tmp_path): assert "semicolon" in str(excinfo.value).lower() def test_missing_sample_id_column_raises(self, sample_tsv_missing_sample_id_column): - """Missing Sample_ID column should raise SidecarColumnNotFoundError.""" + """Test that missing Sample_ID column raise SidecarColumnNotFoundError.""" with pytest.raises(SidecarColumnNotFoundError): SidecarQueryManager(file=sample_tsv_missing_sample_id_column) def test_empty_sample_id_raises(self, sample_tsv_with_invalid_sample_ids): - """Empty Sample_ID values should raise SidecarSampleIDError.""" + """Test that empty Sample_ID values raise SidecarSampleIDError.""" with pytest.raises(SidecarSampleIDError): SidecarQueryManager(file=sample_tsv_with_invalid_sample_ids) def test_duplicate_sample_id_raises(self, sample_tsv_with_duplicate_sample_ids): - """Duplicate Sample_ID values should raise SidecarSampleIDError.""" + """Test that duplicate Sample_ID values raise SidecarSampleIDError.""" with pytest.raises(SidecarSampleIDError) as excinfo: SidecarQueryManager(file=sample_tsv_with_duplicate_sample_ids) assert "duplicate" in str(excinfo.value).lower() def test_valid_file_loads_successfully(self, sample_tsv_with_edge_cases): """ - A TSV that follows DivBase requirements should load without errors. + Test that a TSV that follows DivBase requirements loads without errors. Use the edge case fixture to assert that these are fine as long as they all follow the DivBase requirements. 
""" diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index dbbaceba..06ebf1d9 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -139,19 +139,19 @@ class TestHeaderValidation: """Test validation of header row.""" def test_wrong_first_column_name(self, header_errors_tsv, project_samples): - """First column must be '#Sample_ID'.""" + """Test that first column is '#Sample_ID'.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("First column must be named '#Sample_ID'" in e for e in errors) def test_duplicate_column_names(self, header_errors_tsv, project_samples): - """Duplicate column names should be detected.""" + """Test that duplicate column names are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("Duplicate column names" in e and "Area" in e for e in errors) def test_empty_column_name(self, header_errors_tsv, project_samples): - """Empty column names should be detected.""" + """Test that empty column names are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) assert any("Empty column name" in e for e in errors) @@ -161,19 +161,19 @@ class TestSampleIDValidation: """Test validation of Sample_ID column.""" def test_empty_sample_id(self, sample_errors_tsv, project_samples): - """Empty Sample_ID should be detected.""" + """Test that empty Sample_ID are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("Sample_ID is empty" in e for e in errors) def test_semicolon_in_sample_id(self, sample_errors_tsv, project_samples): - """Sample_ID containing semicolon should be detected.""" + """Test that Sample_ID containing semicolon are detected.""" stats, 
errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("contains semicolon" in e and "S3;S4" in e for e in errors) def test_duplicate_sample_id(self, sample_errors_tsv, project_samples): - """Duplicate Sample_IDs should be detected.""" + """Test that duplicate Sample_IDs are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) @@ -183,19 +183,19 @@ class TestFormattingValidation: """Test validation of TSV formatting.""" def test_wrong_column_count(self, format_errors_tsv, project_samples): - """Rows with wrong number of columns should be detected.""" + """Test that rows with wrong number of columns are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) def test_comma_in_cell(self, format_errors_tsv, project_samples): - """Commas in cells should be detected.""" + """Test that commas in cells are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("forbidden character ','" in e for e in errors) def test_whitespace_warning(self, format_errors_tsv, project_samples): - """Leading/trailing whitespace should generate warnings.""" + """Test that leading/trailing whitespace generate warnings.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) assert any("leading or trailing whitespace" in w for w in warnings) @@ -209,28 +209,28 @@ class TestTypeValidation: """ def test_mixed_types_in_column_is_warning(self, type_errors_tsv, project_samples): - """Columns with mixed numeric and string types should produce a warning (not error) and be classified as mixed_type.""" + """Test that columns with mixed numeric and string types produce a warning (not error) and be classified as 
mixed_type.""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("mixed" in w.lower() and "Population" in w for w in warnings) assert not any("mixed types" in e.lower() and "Population" in e for e in errors) def test_mixed_types_in_cell_is_warning(self, type_errors_tsv, project_samples): - """Cells with mixed types (e.g., '1;three;5') should produce a warning (not error).""" + """Test that cells with mixed types (e.g., '1;three;5') produce a warning (not error).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) assert not any("1;three;5" in e and "mixed types" in e.lower() for e in errors) def test_hyphen_in_numeric_looking_column_is_warning(self, type_errors_tsv, project_samples): - """Hyphens in values that look like range notation should produce a warning (not error).""" + """Test that hyphens in values that look like range notation produce a warning (not error).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("hyphen" in w.lower() and "3-5" in w for w in warnings) assert not any("hyphen" in e.lower() and "3-5" in e for e in errors) def test_cell_and_column_level_mixed_types_are_warnings(self, type_errors_tsv, project_samples): - """When a column has both cell-level and column-level mixed types, both should produce warnings (not errors).""" + """Test that when a column has both cell-level and column-level mixed types, both produce warnings (not errors).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) @@ -240,7 +240,7 @@ def test_cell_and_column_level_mixed_types_are_warnings(self, type_errors_tsv, p def test_stats_show_mixed_type_columns(self, type_errors_tsv, project_samples): """ - Stats should show columns as 
mixed-type for informational purposes. + Test that stats show columns as mixed-type information to user. The type_errors_tsv fixture used here has columns with mixed types. """ stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) @@ -250,7 +250,7 @@ def test_stats_show_mixed_type_columns(self, type_errors_tsv, project_samples): assert len(stats["mixed_type_columns"]) == 3 def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, project_samples): - """Multi-value numeric cells (e.g., '2;4') should be correctly classified as numeric, not string or mixed-type.""" + """Test that multi-value numeric cells (e.g., '2;4') are correctly classified as numeric, not string or mixed-type.""" stats, errors, warnings = MetadataTSVValidator.validate(numeric_multi_values_tsv, project_samples) assert "Scores" in stats["numeric_columns"] @@ -266,7 +266,7 @@ class TestDimensionMatching: """Test validation against project dimensions.""" def test_samples_not_in_project(self, valid_tsv): - """Samples in TSV but not in project should be errors.""" + """Test that samples in TSV but not in project raise error.""" project_samples = {"S1", "S2"} stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) @@ -276,7 +276,7 @@ def test_samples_not_in_project(self, valid_tsv): ) def test_samples_not_in_tsv(self, valid_tsv): - """Samples in project but not in TSV should be warnings.""" + """Test that samples in project but not in TSV produce warnings.""" project_samples = {"S1", "S2", "S3", "S10", "S20"} stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) @@ -294,7 +294,7 @@ class TestStatistics: """Test statistics collection.""" def test_statistics_collection(self, valid_tsv, project_samples): - """Verify statistics are correctly collected.""" + """Test that statistics are correctly collected.""" stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) 
assert stats["total_columns"] == 4 @@ -308,7 +308,7 @@ def test_statistics_collection(self, valid_tsv, project_samples): assert stats["has_multi_values"] is True def test_no_multi_values_detected(self, no_multi_values_tsv): - """Test detection when no semicolon-separated values present.""" + """Test multi-value detection when no semicolon-separated values are present.""" stats, errors, warnings = MetadataTSVValidator.validate(no_multi_values_tsv, {"S1", "S2"}) assert stats["has_multi_values"] is False @@ -317,7 +317,7 @@ class TestEdgeCases: """Test edge cases and error conditions.""" def test_empty_file(self, project_samples, tmp_path): - """Empty file should be detected.""" + """Test that empty files are detected.""" empty_file = tmp_path / "empty.tsv" empty_file.write_text("") @@ -326,7 +326,7 @@ def test_empty_file(self, project_samples, tmp_path): assert any("File is empty" in e for e in errors) def test_nonexistent_file(self, project_samples): - """Nonexistent file should be handled gracefully.""" + """Test that nonexistent files are handled gracefully.""" stats, errors, warnings = MetadataTSVValidator.validate(Path("/nonexistent/file.tsv"), project_samples) assert any("Failed to read file" in e for e in errors) @@ -380,7 +380,7 @@ def semicolon_mixed_tsv(self, tmp_path): return tsv_file def test_semicolon_cell_with_non_numeric_part_is_mixed(self, semicolon_mixed_tsv): - """A column with cell '1;1-2' should be classified as mixed-type because '1-2' is not a number.""" + """Test that a column with cell '1;1-2' is classified as mixed-type because '1-2' is not a number.""" stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) assert "Code" in stats["mixed_type_columns"] @@ -388,25 +388,25 @@ def test_semicolon_cell_with_non_numeric_part_is_mixed(self, semicolon_mixed_tsv assert "Code" not in stats["string_columns"] def test_semicolon_cell_mixed_produces_cell_level_warning(self, semicolon_mixed_tsv): - """A cell '1;1-2' 
should produce a cell-level mixed-type warning.""" + """Test that a cell '1;1-2' produces a cell-level mixed-type warning.""" stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) assert any("1;1-2" in w and "mixed types" in w.lower() for w in warnings) def test_semicolon_cell_mixed_produces_column_level_warning(self, semicolon_mixed_tsv): - """The column-level mixed-type warning should mention the semicolon classification rule.""" + """Test that the column-level mixed-type warning mentions the semicolon classification rule.""" stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) assert any("semicolon-separated" in w and "Code" in w for w in warnings) def test_semicolon_cell_mixed_is_not_error(self, semicolon_mixed_tsv): - """Mixed types from semicolon cells should NOT produce errors.""" + """Test that mixed types from semicolon cells doesn't produce errors.""" stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) assert not any("mixed" in e.lower() for e in errors) def test_purely_numeric_semicolon_column_stays_numeric(self, semicolon_mixed_tsv): - """A column with only numeric values in semicolons (e.g., '10;20;30') should be numeric.""" + """Test that a column with only numeric values in semicolons (e.g., '10;20;30') should be numeric.""" stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) assert "PureNumSemicolon" in stats["numeric_columns"] From 75d03ab8296e77fa4edcdec328549014819ddf0b Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 15:37:36 +0100 Subject: [PATCH 065/100] Revert dev doc comment on migrations After discussion, this is an error prone way of doing it. Should use docker compose and db-migrator instead. 
--- docs/development/database-migrations.md | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/docs/development/database-migrations.md b/docs/development/database-migrations.md index 4c5c46d2..ac94eab7 100644 --- a/docs/development/database-migrations.md +++ b/docs/development/database-migrations.md @@ -95,26 +95,6 @@ You can also run the `pytest-alembic` tests to further validate the newly create pytest tests/migrations ``` -### Local Dev with Docker Compose: Handling Alembic Migration Errors When Switching Branches - -When working in local development, there are cases where you might to temporarily switch to a different git branch that has a different set of Alembic migration files. For instance, you are working on one branch and switch to review another branch with additional migration files that, for some reason, you do not want to merge into your branch yet. - -After switching back to your branch, you might find that the Docker Compose stack no longer will be able to start. The reason for this is: if the database's migration history (the `alembic_version` table) in the Docker Compose environment expects a migration revision that is missing in your current branch, the `divbase-db-migrator-1` container will fail to start and log an error like: `FAILED: Can't locate revision identified by ''` - -If you **are sure** your database schema matches the migrations in your current branch, you can manually update the `alembic_version` table in your local `divbase-postgres-1` container to point to the latest migration in your branch. 
Find the revision ID for the latest migration file in your branch (``) and run: - -```bash -docker exec -it divbase-postgres-1 psql -U divbase_user -d divbase_db -c "UPDATE alembic_version SET version_num = '';" -``` - -After this, restart the stack with: - -```bash -docker compose -f docker/divbase_compose.yaml down && docker compose -f docker/divbase_compose.yaml watch -``` - -**Warning**: Only do this for local Docker Compose environments and for cases where you know that you can recover/rebuild/afford to lose the data in the local postgres instance. If you are not sure about this, it might actually be safer to merge in the other branch to yours (assuming that you know that both branches will eventually be merged to main after review). - ## Production Deployment Documentation on how to run migrations in production/deployed environments is covered in our [private repository, argocd-divbase](https://github.com/ScilifelabDataCentre/argocd-divbase). From f20fc433570fa5bf2606543d1027ecafb9817d3a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 16:57:15 +0100 Subject: [PATCH 066/100] Fix test assertions sensitive to linebreaks --- .../cli_commands/test_query_cli.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index f4789d36..ba3060b0 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -489,9 +489,19 @@ def test_error_in_terminal_when_querying_nonexistent_column_in_tsv( cli_result = runner.invoke(app, command) output = cli_result.stdout + (str(cli_result.exception) if cli_result.exception else "") - assert "Column 'NonExistentColumn' not found in the TSV file. Skipping this filter condition." in output - assert "Invalid filter conditions: none of the filters matched any records. Returning ALL records." 
in output - assert "This may be a large result set. Please check your filter keys, value spelling, and syntax." in output + # Normalize whitespace to handle line wrapping + normalized_output = " ".join(output.split()) + assert ( + "Column 'NonExistentColumn' not found in the TSV file. Skipping this filter condition." in normalized_output + ) + assert ( + "Invalid filter conditions: none of the filters matched any records. Returning ALL records." + in normalized_output + ) + assert ( + "This may be a large result set. Please check your filter keys, value spelling, and syntax." + in normalized_output + ) def test_error_in_terminal_when_duplicate_sample_IDs_in_tsv( self, From 574d45935375fffdd07355a246b6d67dbca50d57 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 17:16:55 +0100 Subject: [PATCH 067/100] Refactor dimensions-bucket check to be more strict Before it was only used for bcftools queries, but now also used for sample metadata. The previous version of the helper function required metadata query results and thus only checked for files needed from that query. But this creates a catch-22 where files not in dimensions will not be in the results and thus not checked by the helper. The new helper is run earlier in the query tasks and only checks VCF files in the bucket vs the dimensions file. This should hopefully lead to fewer false positives/negatives. 
--- .../src/divbase_api/routes/queries.py | 2 + .../src/divbase_api/worker/tasks.py | 48 +++++++++++-------- .../divbase-lib/src/divbase_lib/exceptions.py | 6 +++ 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/routes/queries.py b/packages/divbase-api/src/divbase_api/routes/queries.py index 769c0915..e7e430bd 100644 --- a/packages/divbase-api/src/divbase_api/routes/queries.py +++ b/packages/divbase-api/src/divbase_api/routes/queries.py @@ -30,6 +30,7 @@ SampleMetadataQueryTaskResult, ) from divbase_lib.exceptions import ( + DimensionsNotUpToDateWithBucketError, SidecarColumnNotFoundError, SidecarInvalidFilterError, SidecarMetadataFormatError, @@ -95,6 +96,7 @@ async def sample_metadata_query( SidecarSampleIDError, SidecarMetadataFormatError, TaskUserError, + DimensionsNotUpToDateWithBucketError, ) as e: # These are simple exceptions (that inherit from base Exception) that are able to pass through Celery's JSON serialization/deserialization without becoming UnpicklableExceptionWrapper. # TaskUserError is a wrapper exception that allow to nest more complex exceptions that would normally trigger UnpicklableExceptionWrapper, and still be able to pass through the serialization/deserialization. 
diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index 227c333c..f75e738a 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -44,7 +44,7 @@ ) from divbase_api.worker.worker_db import SyncSessionLocal from divbase_lib.api_schemas.vcf_dimensions import DimensionUpdateTaskResult -from divbase_lib.exceptions import NoVCFFilesFoundError, TaskUserError +from divbase_lib.exceptions import DimensionsNotUpToDateWithBucketError, NoVCFFilesFoundError, TaskUserError logger = logging.getLogger(__name__) @@ -188,6 +188,13 @@ def sample_metadata_query_task( # Wrap exeception in TaskUserError () to avoid Celery serilization UnpicklableExceptionWrapper issue raise TaskUserError(str(VCFDimensionsEntryMissingError(project_name=project_name))) from None + latest_versions_of_bucket_files = s3_file_manager.latest_version_of_all_files(bucket_name=bucket_name) + + _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( + vcf_dimensions_data=vcf_dimensions_data, + latest_versions_of_bucket_files=latest_versions_of_bucket_files, + ) + metadata_result = run_sidecar_metadata_query( file=metadata_path, filter_string=tsv_filter, @@ -248,6 +255,11 @@ def bcftools_pipe_task( latest_versions_of_bucket_files = s3_file_manager.latest_version_of_all_files(bucket_name=bucket_name) + _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( + vcf_dimensions_data=vcf_dimensions_data, + latest_versions_of_bucket_files=latest_versions_of_bucket_files, + ) + metadata_path = _download_sample_metadata( metadata_tsv_name=metadata_tsv_name, bucket_name=bucket_name, s3_file_manager=s3_file_manager ) @@ -259,12 +271,6 @@ def bcftools_pipe_task( vcf_dimensions_data=vcf_dimensions_data, ) - _check_that_file_versions_match_dimensions_index( - vcf_dimensions_data=vcf_dimensions_data, - latest_versions_of_bucket_files=latest_versions_of_bucket_files, - 
metadata_result=metadata_result, - ) - files_to_download = metadata_result.unique_filenames sample_and_filename_subset = metadata_result.sample_and_filename_subset @@ -715,26 +721,28 @@ def _calculate_pairwise_overlap_types_for_sample_sets(sample_sets_dict: dict[tup return sample_set_overlap_results -def _check_that_file_versions_match_dimensions_index( +def _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( vcf_dimensions_data: dict, latest_versions_of_bucket_files: dict[str, str], - metadata_result, ) -> None: """ - Ensure that the VCF dimensions index is up to date with the latest versions of the VCF files. + Check that all VCF files in the bucket are indexed in the dimensions file and raise and error with filenames if not. """ - # Build lookup dict: filename -> s3_version_id - vcf_lookup = {entry["vcf_file_s3_key"]: entry["s3_version_id"] for entry in vcf_dimensions_data["vcf_files"]} - for file in metadata_result.unique_filenames: - file_version_ID = latest_versions_of_bucket_files.get(file, "null") + indexed_vcf_files = {entry["vcf_file_s3_key"] for entry in vcf_dimensions_data.get("vcf_files", [])} - if file not in vcf_lookup or vcf_lookup[file] != file_version_ID: - logger.error(f"VCF dimensions are outdated for file: {file}") - raise ValueError( - "The VCF dimensions file is not up to date with the VCF files in the project. " - "Please run 'divbase-cli dimensions update --project ' and then submit the query again." - ) + vcf_files_in_bucket = { + file for file in latest_versions_of_bucket_files if file.endswith(".vcf") or file.endswith(".vcf.gz") + } + + unindexed_files = vcf_files_in_bucket - indexed_vcf_files + + if unindexed_files: + logger.error(f"Found {len(unindexed_files)} unindexed VCF file(s): {sorted(unindexed_files)}") + raise DimensionsNotUpToDateWithBucketError( + f"The following VCF files or file versions in the project are not part of the project's VCF dimensions: '{', '.join(sorted(unindexed_files))}'. 
" + "\nPlease run 'divbase-cli dimensions update --project ' and then submit the query again." + ) def _record_task_metrics(task_metrics: TaskMetrics) -> None: diff --git a/packages/divbase-lib/src/divbase_lib/exceptions.py b/packages/divbase-lib/src/divbase_lib/exceptions.py index 6930f3b7..06ff01cc 100644 --- a/packages/divbase-lib/src/divbase_lib/exceptions.py +++ b/packages/divbase-lib/src/divbase_lib/exceptions.py @@ -113,6 +113,12 @@ class SidecarMetadataFormatError(Exception): pass +class DimensionsNotUpToDateWithBucketError(Exception): + """Raised when there are VCF files in the bucket that are not in the database VCF dimensions.""" + + pass + + class TaskUserError(Exception): """ Raised in Celery tasks when an error needs to propagate back to the CLI user. From eb86f5eb6a192ef132fdf301e64834612e783aa8 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 18:43:25 +0100 Subject: [PATCH 068/100] Ensure DivBase results VCF not in updated check Now that the helper function runs earlier in the tasks and is checking all VCFs, the DivBase results VCFs should be omitted from the check. 
--- packages/divbase-api/src/divbase_api/worker/tasks.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index f75e738a..adc1ce17 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -193,6 +193,7 @@ def sample_metadata_query_task( _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( vcf_dimensions_data=vcf_dimensions_data, latest_versions_of_bucket_files=latest_versions_of_bucket_files, + project_id=project_id, ) metadata_result = run_sidecar_metadata_query( @@ -258,6 +259,7 @@ def bcftools_pipe_task( _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( vcf_dimensions_data=vcf_dimensions_data, latest_versions_of_bucket_files=latest_versions_of_bucket_files, + project_id=project_id, ) metadata_path = _download_sample_metadata( @@ -724,18 +726,25 @@ def _calculate_pairwise_overlap_types_for_sample_sets(sample_sets_dict: dict[tup def _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( vcf_dimensions_data: dict, latest_versions_of_bucket_files: dict[str, str], + project_id: int, ) -> None: """ Check that all VCF files in the bucket are indexed in the dimensions file and raise and error with filenames if not. + Skipped VCF files (i.e DivBase-generated result files) are excluded from the check. 
""" indexed_vcf_files = {entry["vcf_file_s3_key"] for entry in vcf_dimensions_data.get("vcf_files", [])} + with SyncSessionLocal() as db: + skipped_vcfs = get_skipped_vcfs_by_project_worker(db=db, project_id=project_id) + skipped_vcf_files = set(skipped_vcfs.keys()) + vcf_files_in_bucket = { file for file in latest_versions_of_bucket_files if file.endswith(".vcf") or file.endswith(".vcf.gz") } - unindexed_files = vcf_files_in_bucket - indexed_vcf_files + tracked_vcf_files = indexed_vcf_files | skipped_vcf_files + unindexed_files = vcf_files_in_bucket - tracked_vcf_files if unindexed_files: logger.error(f"Found {len(unindexed_files)} unindexed VCF file(s): {sorted(unindexed_files)}") From 4775237f8e8ef6e173af6f48d13e4c08e8a675fc Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 19:29:13 +0100 Subject: [PATCH 069/100] Have helper also consider VCF file version --- .../src/divbase_api/worker/tasks.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index adc1ce17..7e3a810b 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -729,11 +729,14 @@ def _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( project_id: int, ) -> None: """ - Check that all VCF files in the bucket are indexed in the dimensions file and raise and error with filenames if not. + Check that all VCF files in the bucket are indexed in the dimensions file and that their version IDs match between bucket and VCF dimensions index. + Skipped VCF files (i.e DivBase-generated result files) are excluded from the check. 
""" - indexed_vcf_files = {entry["vcf_file_s3_key"] for entry in vcf_dimensions_data.get("vcf_files", [])} + indexed_vcf_lookup = { + entry["vcf_file_s3_key"]: entry["s3_version_id"] for entry in vcf_dimensions_data.get("vcf_files", []) + } with SyncSessionLocal() as db: skipped_vcfs = get_skipped_vcfs_by_project_worker(db=db, project_id=project_id) @@ -743,13 +746,25 @@ def _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( file for file in latest_versions_of_bucket_files if file.endswith(".vcf") or file.endswith(".vcf.gz") } + indexed_vcf_files = set(indexed_vcf_lookup.keys()) tracked_vcf_files = indexed_vcf_files | skipped_vcf_files unindexed_files = vcf_files_in_bucket - tracked_vcf_files - if unindexed_files: - logger.error(f"Found {len(unindexed_files)} unindexed VCF file(s): {sorted(unindexed_files)}") + outdated_files = [] + for file_name in indexed_vcf_files: + if file_name in vcf_files_in_bucket: + indexed_version = indexed_vcf_lookup[file_name] + bucket_version = latest_versions_of_bucket_files.get(file_name, "null") + if indexed_version != bucket_version: + outdated_files.append(file_name) + logger.error( + f"VCF file '{file_name}' version mismatch: indexed={indexed_version}, bucket={bucket_version}" + ) + unindex_or_outdated_files = sorted(set(unindexed_files) | set(outdated_files)) + + if unindex_or_outdated_files: raise DimensionsNotUpToDateWithBucketError( - f"The following VCF files or file versions in the project are not part of the project's VCF dimensions: '{', '.join(sorted(unindexed_files))}'. " + f"The following VCF files or file versions in the project are not part of the project's VCF dimensions: '{', '.join(unindex_or_outdated_files)}'. " "\nPlease run 'divbase-cli dimensions update --project ' and then submit the query again." 
) From 9304664017f388feff2f8109123791385be472a9 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 19:30:34 +0100 Subject: [PATCH 070/100] Update tests after helper refactoring --- .../e2e_integration/cli_commands/test_query_cli.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index ba3060b0..86178312 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -33,6 +33,7 @@ from divbase_cli.divbase_cli import app from divbase_lib.api_schemas.task_history import TaskHistoryResult from divbase_lib.divbase_constants import QUERY_RESULTS_FILE_PREFIX +from divbase_lib.exceptions import DimensionsNotUpToDateWithBucketError logging.basicConfig(level=logging.DEBUG) runner = CliRunner() @@ -244,12 +245,19 @@ def test_bcftools_pipe_query_errors( assert expected_error in full_error, f"Expected '{expected_error}' in error message, but got: {full_error}" -def test_get_task_status_by_task_id(CONSTANTS, logged_in_edit_user_with_existing_config, db_session_sync): +def test_get_task_status_by_task_id( + CONSTANTS, logged_in_edit_user_with_existing_config, db_session_sync, run_update_dimensions, project_map +): """ Get the status of a task by its ID, as in the task ID int that is returned to the users, not the Celery UUID task ID. Uses the PostgreSQL Celery results backend to get task info. 
""" project_name = CONSTANTS["QUERY_PROJECT"] + project_id = project_map[project_name] + bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] + user_id = 1 + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) + tsv_filter = "Area:West of Ireland,Northern Portugal;" arg_command = "view -s SAMPLES; view -r 21:15000000-25000000" @@ -347,10 +355,10 @@ def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): "user_id": 1, "job_id": 1, } - with pytest.raises(ValueError) as excinfo: + with pytest.raises(DimensionsNotUpToDateWithBucketError) as excinfo: bcftools_pipe_task(**params) assert ( - "The VCF dimensions file is not up to date with the VCF files in the project. Please run 'divbase-cli dimensions update --project ' and then submit the query again." + "The following VCF files or file versions in the project are not part of the project's VCF dimensions" in str(excinfo.value) ) From 77bfd3347587d693ff56df98c64e8f2b2d34c5c0 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Fri, 13 Feb 2026 19:51:24 +0100 Subject: [PATCH 071/100] Add test for VCF in bucket not index in dimensions DimensionsNotUpToDateWithBucketError is now raised when there are VCF files in the bucket that are not indexed in the dimensions file, or when there are indexed VCF files whose version IDs don't match the bucket versions. 
--- .../cli_commands/test_query_cli.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index 86178312..fbcc6eb0 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -293,11 +293,18 @@ def test_get_task_status_by_task_id( assert result in ["PENDING", "STARTED", "SUCCESS", "FAILURE"] +@pytest.mark.parametrize( + "test_scenario,vcf_filename,job_id,cleanup_file", + [ + ("version_outdated", "HOM_20ind_17SNPs.1.vcf.gz", 1, False), + ("unindexed", "HOM_20ind_17SNPs_first_10_samples.vcf.gz", 2, True), + ], +) @patch( "divbase_api.worker.tasks.create_s3_file_manager", side_effect=lambda url=None: create_s3_file_manager(url="http://localhost:9002"), ) -def test_query_exits_when_vcf_file_version_is_outdated( +def test_query_exits_when_dimensions_are_outdated( mock_create_s3_manager, CONSTANTS, logged_in_edit_user_with_existing_config, @@ -305,10 +312,16 @@ def test_query_exits_when_vcf_file_version_is_outdated( run_update_dimensions, project_map, db_session_sync, + test_scenario, + vcf_filename, + job_id, + cleanup_file, ): """ - Test that updates the dimensions file, uploads a new version of a VCF file, then runs a query that should fail - because the dimensions file expects an older version of the VCF file. + Test that verifies DimensionsNotUpToDateWithBucketError is raised when the dimensions index is not up-to-date. Test for these cases: + 1. version_outdated: uploads a new version of an existing VCF file after dimensions update + 2. 
unindexed: uploads a new VCF file that is not present in the dimensions index + """ project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] project_id = project_map[project_name] @@ -322,28 +335,21 @@ def ensure_fixture_path(filename, fixture_dir="tests/fixtures"): return f"{fixture_dir}/{filename}" def patched_download_sample_metadata(metadata_tsv_name, bucket_name, s3_file_manager): - """ - Patches the path for the sidecar metadata file so that it can be read from fixtures and not be downloaded. - """ return Path(ensure_fixture_path(metadata_tsv_name, fixture_dir="tests/fixtures")) def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): - """ - Needs the path in the worker container so that it is compatible with the docker exec patch below for running bcftools jobs. - """ pass with ( patch("divbase_api.worker.tasks._download_sample_metadata", new=patched_download_sample_metadata), patch("divbase_api.worker.tasks._download_vcf_files", new=patched_download_vcf_files), ): - test_file = (fixtures_dir / "HOM_20ind_17SNPs.1.vcf.gz").resolve() - - command = f"files upload {test_file} --project {project_name} --disable-safe-mode" + test_file_path = (fixtures_dir / vcf_filename).resolve() + command = f"files upload {test_file_path} --project {project_name} --disable-safe-mode" result = runner.invoke(app, command) assert result.exit_code == 0 - assert f"{str(test_file)}" in result.stdout + assert vcf_filename in result.stdout params = { "tsv_filter": "Area:West of Ireland;Sex:F", @@ -353,7 +359,7 @@ def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): "project_id": project_id, "project_name": project_name, "user_id": 1, - "job_id": 1, + "job_id": job_id, } with pytest.raises(DimensionsNotUpToDateWithBucketError) as excinfo: bcftools_pipe_task(**params) @@ -362,6 +368,11 @@ def patched_download_vcf_files(files_to_download, bucket_name, s3_file_manager): in str(excinfo.value) ) + # Clean up the uploaded file if needed (for 
unindexed test to not affect other tests) + if cleanup_file: + command = f"files rm {vcf_filename} --project {project_name}" + runner.invoke(app, command) + class TestSidecarQueryTaskErrorsPropagation: """Test that errors in sidecar query tasks are propagated correctly to the CLI.""" From 6c2118d00fae181d801ae7da3763e06d00fea608 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 16 Feb 2026 11:10:23 +0100 Subject: [PATCH 072/100] Allow commas, but send warnings to user Now that multi-type columns are treated as string and sends a warning to the user if found, it makes little sense to forbid commas in the values of these columns. Treat this the same way as for hyphens and other special characters: allow them but parse them as string and send a warning. This updates the SidecarQueryManager and the TSV validator --- .../src/divbase_api/services/queries.py | 132 ++++++++---------- .../services/sample_metadata_tsv_validator.py | 30 +--- .../cli_commands/test_query_cli.py | 13 +- .../test_sample_metadata_queries.py | 28 ++-- .../test_sample_metadata_tsv_validator.py | 11 +- 5 files changed, 89 insertions(+), 125 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index ffdc9b3b..d9a72ba1 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -736,8 +736,6 @@ def load_file(self) -> "SidecarQueryManager": "Sample_ID must contain only one value per row (semicolons are not allowed)." ) - self._validate_no_commas_in_data() - except ( SidecarSampleIDError, SidecarColumnNotFoundError, @@ -784,20 +782,6 @@ def _read_and_validate_raw_header(self) -> None: f"Duplicate column names found: {duplicate_columns}. Each column name must be unique in the metadata file." ) - def _validate_no_commas_in_data(self) -> None: - """ - Validate that no cells in the entire DataFrame contain commas. 
- Matches the client-side validator's comma check. - """ - for col in self.df.columns: - for row_index, cell_value in enumerate(self.df[col].dropna()): - cell_str = str(cell_value).strip() - if cell_str and "," in cell_str: - raise SidecarMetadataFormatError( - f"Column '{col}' contains commas in value '{cell_str}' at row {row_index + 1}. " - f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." - ) - def get_unique_values(self, column: str) -> list: """ Method to fetch unique values from a specific column in the query result. Intended to be invoked on a SidecarQueryManager @@ -894,42 +878,40 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # 1. Warn if the column has mixed types (some values look numeric) and that the column will be treat as string type. # 2. Warn if the filter uses numeric syntax on this string column. Do not raise error. if not is_numeric and not is_semicolon_numeric: - is_mixed = self._is_mixed_type_column(key) + is_mixed, example_values, total_count = self._is_mixed_type_column(key) problematic_filter_values = self._detect_numeric_filter_syntax_on_string_column( key, filter_string_values ) - comparison_operator_message = ( - f"DivBase comparison operators (>, <, >=, <=) only work on numeric columns. " - f"Use exact string matching instead (e.g., '{key}:value1,value2')." - ) - if is_mixed and problematic_filter_values: - warning_msg = ( - f"Column '{key}' has mixed types (both numeric-looking and non-numeric values) " - f"and is treated as a string column. A column is only numeric if all values " - f"(including each part in semicolon-separated cells) are valid numbers. " - f"Your filter contains comparison operators {problematic_filter_values} which are not " - f"supported on string columns. 
" - f"{comparison_operator_message}" - ) - logger.warning(warning_msg) - self.warnings.append(warning_msg) - elif problematic_filter_values: - warning_msg = ( - f"Column '{key}' is a string column but your filter contains comparison operators " - f"{problematic_filter_values} which are not supported on string columns. " - f"{comparison_operator_message}" - ) - logger.warning(warning_msg) - self.warnings.append(warning_msg) - elif is_mixed: - warning_msg = ( - f"Column '{key}' has mixed types (both numeric-looking and non-numeric values) " - f"and is treated as a string column. A column is only numeric if all values " - f"(including each part in semicolon-separated cells) are valid numbers. " - f"Comparison operators (>, <, >=, <=) are not available for this column." - ) + # Build warning message for string columns with possible issues. Multiple warnings are presented with indended hyphen + if is_mixed or problematic_filter_values: + warning_lines = [f"Column '{key}':"] + if is_mixed: + warning_lines.append( + " - Contains mixed types (e.g., numeric-looking values mixed with non-numeric values, or special characters like commas (,) or hyphens (-), or Range notation such as '1-2')." + ) + if total_count > 0: + examples_str = ", ".join(f"'{v}'" for v in example_values) + warning_lines.append( + f" Found {total_count} cell(s) with problematic values. Showing up to three of those values as an example: {examples_str}" + ) + + warning_lines.append(" This column will be treated as a string column.") + warning_lines.append( + " To store multiple numeric values, use semicolon-separated values (;) instead." + ) + if problematic_filter_values: + warning_lines.append( + f" - Your filter contains comparison operators {problematic_filter_values}, which are not supported on string columns." + ) + warning_lines.append( + " DivBase comparison operators (>, <, >=, <=) only work on numeric columns." 
+ ) + warning_lines.append( + f" Use exact string matching instead (e.g., '{key}:value1,value2')." + ) + warning_msg = "\n".join(warning_lines) logger.warning(warning_msg) self.warnings.append(warning_msg) @@ -1002,7 +984,6 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # Non-numeric column: handle as discrete string values # Supports NOT operator with ! prefix: e.g., "Area:!North" or "Area:North,!South" filter_string_values_list = filter_string_values.split(",") - self._validate_no_commas_in_column(key) positive_values, negated_values = self._separate_positive_and_negated_values( filter_values=filter_string_values_list @@ -1060,19 +1041,6 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": return self - def _validate_no_commas_in_column(self, key: str) -> None: - """ - Helper method to validate that column values in the imported TSV does not contain commas. - Raises SidecarInvalidFilterError if any comma is found in the column values. - """ - for row_index, cell_value in enumerate(self.df[key].dropna()): - cell_str = str(cell_value).strip() - if cell_str and "," in cell_str: - raise SidecarInvalidFilterError( - f"Column '{key}' contains commas in value '{cell_str}' at row {row_index}. " - f"Commas are not allowed in DivBase metadata files. Use semicolons (;) to separate multiple values." - ) - def _is_semicolon_separated_numeric_column(self, key: str) -> bool: """ Helper method to detect if a column contains semicolon-separated numeric values. 
@@ -1099,8 +1067,6 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: if len(non_null_values) == 0: return False - self._validate_no_commas_in_column(key) - for cell_value in non_null_values: cell_str = str(cell_value).strip() if not cell_str: @@ -1121,48 +1087,62 @@ def _is_semicolon_separated_numeric_column(self, key: str) -> bool: return True - def _is_mixed_type_column(self, key: str) -> bool: + def _is_mixed_type_column(self, key: str) -> tuple[bool, list[str], int]: """ - Helper method to detect if a non-numeric column contains a mix of numeric-looking - and non-numeric values (e.g., Population_code with "8", "1a", "5a"). + Helper method to detect if a non-numeric column has mixed types. + + A column is considered mixed-type if it contains: + 1. Both numeric-looking and non-numeric values (e.g., "8", "1a", "5a") + 2. Special characters that suggest non-numeric use (commas, hyphens in non-negative-number contexts) This is called only for columns where pandas infers object dtype AND - _is_semicolon_separated_numeric_column returned False. It determines whether - the column has SOME numeric-looking values (mixed) vs. being purely string. + _is_semicolon_separated_numeric_column returned False. - This is used to provide a targeted warning to the user at query time: - the validator warns about mixed types at validation time, but users may - skip validation or the column may be intentionally mixed. + Returns a tuple of (is_mixed, example_values, total_count) where: + - is_mixed: True if the column should be treated as mixed-type (and thus string) + - example_values: A list of up to 3 example cell values that demonstrate the mixed types Limited to 3 for brevity. The CLI divbase-cli dimensions validate-metadata-file can be used to show all of them. 
+ - total_count: Total number of cells with mixed types or special characters """ if key not in self.df.columns: - return False + return False, [], 0 non_null_values = self.df[key].dropna() if len(non_null_values) == 0: - return False + return False, [], 0 has_numeric = False has_non_numeric = False + example_values = [] + total_problematic_count = 0 for cell_value in non_null_values: cell_str = str(cell_value).strip() if not cell_str: continue + + cell_has_numeric = False + cell_has_non_numeric = False + parts = cell_str.split(";") for part in parts: part = part.strip() if not part: continue + try: float(part) + cell_has_numeric = True has_numeric = True except ValueError: + cell_has_non_numeric = True has_non_numeric = True - if has_numeric and has_non_numeric: - return True + if (cell_has_numeric and cell_has_non_numeric) or ("," in cell_str or "-" in cell_str): + total_problematic_count += 1 + if cell_str not in example_values and len(example_values) < 3: + example_values.append(cell_str) - return False + return (has_numeric and has_non_numeric), example_values, total_problematic_count def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string_values: str) -> list[str]: """ diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 3453db6d..9b7433d7 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -12,8 +12,6 @@ class MetadataTSVValidator: """Validates sidecar metadata TSV files against DivBase requirements.""" - FORBIDDEN_CHARS = [","] - def __init__(self, file_path: Path, project_samples: list[str] | set[str]): """ Initialize the validator. 
File path is the path to the TSV file to validate, @@ -132,10 +130,6 @@ def _validate_data_rows(self, rows: list[list[str]]) -> None: def _validate_cell(self, row_num: int, col_idx: int, col_name: str, cell: str) -> None: """Validate an individual cell.""" - for char in self.FORBIDDEN_CHARS: - if char in cell: - self.errors.append(f"Row {row_num}, Column '{col_name}': Cell contains forbidden character '{char}'") - if cell != cell.strip(): self.warnings.append( f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " @@ -174,22 +168,6 @@ def _infer_column_type( cell_has_string = True column_types[col_idx].add("string") - # Check for hyphens in non-numeric values that might indicate range notation. - # Negative numbers should already have been classified as numeric with the float() check. - # This is a warning to help users who may have used range notation (e.g., "1-2") instead of - # semicolons (e.g., "1;2") in their data values. - if ( - "-" in value - and any(c.isdigit() for c in value) - and ("numeric" in column_types[col_idx] or all(t == "numeric" for t in column_types[col_idx] if t)) - ): - self.warnings.append( - f"Row {row_num}, Column '{col_name}': Value '{value}' contains a hyphen. " - f"This appears to be range notation (e.g., '1-2'), which is not allowed in data values. " - f"If this is meant to be a numeric multi-value column, use semicolons to separate values (e.g., '1;2'). " - f"This column will be treated as a string column." - ) - # Check for mixed types within the same cell (e.g., "1;abc") and warn the user if applicable if cell_has_numeric and cell_has_string: self.warnings.append( @@ -215,10 +193,12 @@ def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str] if mixed_columns: self.warnings.append( - f"The following columns contain mixed types (both numeric-looking and string values): {mixed_columns}. 
" + "Clarification on mixed types columns: " "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " - "These columns will be treated as string columns by DivBase. Numeric query operations " - "(ranges, inequalities) will not be applicable to these columns." + "Special characters like commas or hyphens in numeric-looking values, or Range notation (e.g., '1-2'), also cause DivBase to treat the column as string. " + "Use semicolons (;) to separate multiple numeric values. " + "These columns will be treated as string columns by DivBase. " + "Numeric query operations (ranges, inequalities) will not be applicable to these columns." ) def _validate_sample_names(self, tsv_samples: set[str]) -> None: diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index fbcc6eb0..68510c5c 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -566,7 +566,8 @@ def test_error_in_terminal_for_comma_in_metadata( tmp_path, ): """ - Test that SidecarMetadataFormatError is raised and propagated to terminal when TSV has commas in data. + Test that TSV files with commas generate warnings (not errors) and can still be queried. + Columns with commas are treated as string columns. 
""" project_name = CONSTANTS["SPLIT_SCAFFOLD_PROJECT"] bucket_name = CONSTANTS["PROJECT_TO_BUCKET_MAP"][project_name] @@ -578,18 +579,16 @@ def test_error_in_terminal_for_comma_in_metadata( ) tsv_file = tmp_path / "test_comma_in_data.tsv" - tsv_file.write_text("Sample_ID\tArea\nS1\tNorth,West\nS2\tSouth\n") + tsv_file.write_text("Sample_ID\tPopulation\nS1\t1,2\nS2\t3\n") command = f"files upload {tsv_file} --project {project_name}" result = runner.invoke(app, command) assert result.exit_code == 0 - command = f'query tsv "Area:North" --metadata-tsv-name {tsv_file.name} --project {project_name}' + command = f'query tsv "Population:2" --metadata-tsv-name {tsv_file.name} --project {project_name}' cli_result = runner.invoke(app, command) - assert ( - "Column 'Area' contains commas in value 'North,West' at row 1. Commas are not allowed in DivBase metadata files." - in str(cli_result.exception) - ), "Expected error message about comma in metadata value" + assert cli_result.exit_code == 0, f"Query should succeed with comma warning. 
Output: {cli_result.output}" + assert "comma" in cli_result.output.lower(), "Expected warning message about comma in metadata value" @pytest.mark.integration diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 19292e9d..47ac9ba7 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -956,25 +956,27 @@ class TestLoadFileValidation: This ensures that even if a user skips the CLI validator, the server-side query engine catches the same formatting issues with clear error messages.""" - def test_commas_in_data_raises_at_tsv_load(self, tmp_path): - """Test that commas in any cell value raises SidecarMetadataFormatError during load_file().""" - tsv_content = "#Sample_ID\tArea\tWeight\nS1\tNorth,South\t12.5\nS2\tEast\t18.0\n" + def test_commas_in_mixed_numeric_column_detected_during_query(self, tmp_path): + """Test that commas in a column with mixed numeric and non-numeric values trigger mixed-type warning.""" + tsv_content = "#Sample_ID\tPopulation\tWeight\nS1\t1,2\t12.5\nS2\t5\t18.0\n" tsv_file = tmp_path / "commas.tsv" tsv_file.write_text(tsv_content) - with pytest.raises(SidecarMetadataFormatError) as excinfo: - SidecarQueryManager(file=tsv_file) - assert "commas" in str(excinfo.value).lower() + manager = SidecarQueryManager(file=tsv_file) + assert len(manager.warnings) == 0 + result = manager.run_query("Population:1,2") + assert any("mixed types" in w.lower() for w in result.warnings) - def test_commas_in_non_queried_column_raises_at_tsv_load(self, tmp_path): - """Test that commas are caught in all columns when the tsv is loaded, not just the column being queried.""" - tsv_content = "#Sample_ID\tArea\tBadCol\nS1\tNorth\thas,comma\nS2\tEast\tclean\n" - tsv_file = tmp_path / "commas_other_col.tsv" + def test_commas_in_pure_string_column_no_warning(self, tmp_path): + """Test that commas in a pure string column don't 
trigger mixed-type warnings.""" + tsv_content = "#Sample_ID\tCode\nS1\t1,2\nS2\t3,4\nS3\t5,6\n" + tsv_file = tmp_path / "commas.tsv" tsv_file.write_text(tsv_content) - with pytest.raises(SidecarMetadataFormatError) as excinfo: - SidecarQueryManager(file=tsv_file) - assert "commas" in str(excinfo.value).lower() + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query("Code:1,2") + + assert not any("mixed types" in w.lower() for w in result.warnings) def test_duplicate_column_names_raises(self, tmp_path): """Test that duplicate column names raise SidecarMetadataFormatError during load_file(). diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 06ebf1d9..43ee1bba 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -189,10 +189,13 @@ def test_wrong_column_count(self, format_errors_tsv, project_samples): assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) def test_comma_in_cell(self, format_errors_tsv, project_samples): - """Test that commas in cells are detected.""" + """Test that commas in cells generate warnings (not errors) and cause column to be treated as string.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) - assert any("forbidden character ','" in e for e in errors) + assert any("comma" in w.lower() for w in warnings) + assert not any("comma" in e.lower() for e in errors) + + assert "Population" in stats["mixed_type_columns"] or "Population" in stats["string_columns"] def test_whitespace_warning(self, format_errors_tsv, project_samples): """Test that leading/trailing whitespace generate warnings.""" @@ -226,8 +229,8 @@ def test_hyphen_in_numeric_looking_column_is_warning(self, type_errors_tsv, proj """Test that hyphens in values that look like range notation produce a warning (not 
error).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) - assert any("hyphen" in w.lower() and "3-5" in w for w in warnings) - assert not any("hyphen" in e.lower() and "3-5" in e for e in errors) + assert any("hyphen" in w.lower() for w in warnings) + assert not any("hyphen" in e.lower() for e in errors) def test_cell_and_column_level_mixed_types_are_warnings(self, type_errors_tsv, project_samples): """Test that when a column has both cell-level and column-level mixed types, both produce warnings (not errors).""" From 27769f049942b663c2f2e85d14768fdce9f52499 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 16 Feb 2026 14:47:05 +0100 Subject: [PATCH 073/100] Update user guide with info on special characters --- docs/user-guides/sidecar-metadata.md | 31 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 3eb3e5a2..e6a1b5c2 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -64,9 +64,9 @@ After the `Sample_ID` column has been populated, users can add any columns and v To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: 1. The user-defined columns can be either numeric or string type. A column is classified as numeric only if all values (including individual parts in semicolon-separated cells) can be parsed as numbers. If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will treated as string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. -2. 
Commas are not supported for the TSV and the DivBase system will send an error message if it detects any TSV cells with commas in them. Commas can have different meanings in different notation systems and to avoid confusion and to keep it simple, DivBase will simply not handle commas. Note that commas are used in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose. For decimals, use English decimal notation (.) and not comma (,). DivBase allows one single delimiter for enumerations in the TSV files and that is the semicolon (;) as will be described in the bullet. -3. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries on the more complex than if just a single value is use for each cell. -4. As outlined above, the only characters with special meaning or restrictions in the TSV are `#`, `,`, `;`, and `\t` (tab). Other special characters should be supported, but please be aware that Your Milage May Vary. Some common cases that have been tested and are supported include hyphens (`-`), e.g.`North-West`), diacritic unicodecharacters like `å`,`ä`,`ö`. +2. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. **Important:** Semicolons (`;`) are the only supported delimiter for multi-value cells. 
DivBase uses commas (`,`) in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose (separating filter values in queries). +3. Special characters like hyphens (`-`) and commas (`,`) are allowed, but will cause the column to be treated as a string column. String columns cannot be filtered using numeric operators (see details in [Filtering on numerical columns](#filtering-on-numerical-columns)) and will raise warnings. For example, values like "1-2" or "1,2" will be interpreted as strings, not numeric ranges or multi-value fields. If you intend to store multiple numeric values in a cell, use semicolons (e.g., "1;2"). For decimals, use English decimal notation with a period (e.g., "3.14") and not a comma. +4. The only characters with special structural meaning in the TSV format are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in string contexts (e.g., `North-West`). 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". TODO - add info on No duplicate column names, no empty column names @@ -95,23 +95,23 @@ Manually checking that a TSV fulfills the DivBase requirement can be tedious. To divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv ``` -The validation runs on the users local computer and not as a job on the DivBase server. It is intendend to be used on sidecar metadata TSV files before they are uploaded to the DivBase project. 
The validator will check the formatting requirements as described in [Mandatory contents](#mandatory-content) and [User-defined columns](#user-defined-columns). +The validation runs on the user's local computer and not as a job on the DivBase server. It is intended to be used on sidecar metadata TSV files before they are uploaded to the DivBase project. The validator will check the formatting requirements as described in [Mandatory contents](#mandatory-content) and [User-defined columns](#user-defined-columns). -The command requires that the project's dimensions index is up-to-date with the VCF files in the project, and that is why is sort under `divbase-cli dimensions` in the CLI command tree. If you are unsure if the dimensions index is up-to-date, just run `divbase-cli dimensions update` and wait until that job has completed by checking `divbase-cli task-history user`. +The command requires that the project's dimensions index is up-to-date with the VCF files in the project, and that is why it is sorted under `divbase-cli dimensions` in the CLI command tree. If you are unsure if the dimensions index is up-to-date, just run `divbase-cli dimensions update` and wait until that job has completed by checking `divbase-cli task-history user`. -The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneus or even misleading query results, and the validator will help with spotting that. Several of the checks that the validator performs are also done at the start of a sample metadata query, but this sample name check is currently only done by the validator. +The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. 
Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneous or even misleading query results, and the validator will help with spotting that. Several of the checks that the validator performs are also done at the start of a sample metadata query, but this sample name check is currently only done by the validator. The following will return **Errors**. These must be fixed if the sidecar TSV should be used in DivBase queries: -- Header formatting: Header row is missing or first column is not `#Sample_ID`, duplicate or empty column names +- File not found, unreadable, or empty: If the TSV file path is missing, misspelled, or the file cannot be opened, validation will fail. Empty TSV files are also not allowed. -- Tab separation: Row has the wrong number of columns (Note that check is only done in the validator! It is currently not part of the checks at the start of a sample metadata query) +- Header formatting: Header row is missing or first column is not `#Sample_ID`, duplicate or empty column names. -- `Sample_ID`: Empty Sample_ID, Sample_ID contains a semicolon, duplicate Sample_ID +- Tab separation: Row has the wrong number of columns. (Note: This check is only done in the validator! It is currently not part of the checks at the start of a sample metadata query.) -- Unsupported characters: no commas in cell values +- Sample_ID issues: Empty Sample_ID, Sample_ID contains a semicolon, duplicate Sample_ID. -- All samples listed in the TSV must exist in the dimensions index +- Samples in TSV not found in project dimensions index: All samples listed in the TSV must exist in the project's dimensions index. If any sample is missing, the user needs to run `divbase-cli dimensions update` to submit an update job and then try the validator again after the job has finished. !!! 
Note The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation and that samples match the dimensions file, which are validator-specific checks). This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. @@ -120,10 +120,11 @@ The validator will also raise **Warnings**. DivBase queries can still be run wit - Cell value has leading or trailing whitespace (will be stripped by server) -- Samples in the project’s dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, espcially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. -- Mixed-type columns (e.g. a column with "8", "1a", "5a") and Semicolon-separated cells with mixed types (e.g., "1;abc"). They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. +- Samples in the project's dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, especially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. -- Hyphens in values that look like range notation (e.g., "1-2") in columns that also contain numeric values. The warning message will ask the user if they intended this to be a multicolumn value which should use semicolons as delimters. +- Mixed-type columns (e.g. a column with "8", "1a", "5a") and semicolon-separated cells with mixed types (e.g., "1;abc"). 
They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. + +- Hyphens in values that look like range notation (e.g., "1-2") in columns that otherwise contain numeric values. The same goes for commas (e.g. "1,2"). The warning message will ask the user if they intended this to be a multi-value cell, which should use semicolons as delimiters. ## Query Syntax for sidecar metadata @@ -225,7 +226,7 @@ The `!` (NOT) operator can really come to good use for numerical filters: When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be return to the user. Warnings, however, will not stop not stop queries from running, but indicated that the user should carefully review the results. -Reviewing the Warnings to judge if they are relevant or not is key help avoid unintended query results. The following are treated as Warnings by DivBase queries (and by the TSV validator). +Reviewing the Warnings to judge if they are relevant or not is essential to avoid unintended query results. The following are treated as Warnings by DivBase queries (and by the TSV validator). - **Comparison operators on string/mixed-type columns**: DivBase comparison operators (`>`, `<`, `>=`, `<=`) only work on numeric columns. If you use them on a string or mixed-type column — whether with a numeric operand (e.g., `Population:>5`) or a string operand (e.g., `Area:>North`) — DivBase will warn that comparison operators are not supported on string columns. Use exact string matching instead (e.g., `Area:North` or `Population:8,1a`). 
From d76fc31daecf87c81cb56c599aa6df1061d124d4 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 16 Feb 2026 15:19:43 +0100 Subject: [PATCH 074/100] Add text to some TODOs in the metadata user guide --- docs/user-guides/sidecar-metadata.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index e6a1b5c2..601e16f1 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -38,18 +38,16 @@ If the dimensions VCF files in the project have been cached in DivBase, a templa divbase-cli dimensions create-metadata-template ``` -Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. - -TODO - give more example of how and when it can be relevant to have multiple tsv files. they can have sample subsets +Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. If not specified, it the default `sample_metadata.tsv` will be assumed. It is up to the user if the want to have multiple TSVs in the same project to organise their metadata in a specific way. It is allowed to have duplicate sample names and metadata across multiple TSV files, since only one TSV can be called per query. It is recommended to a have a master TSV that contains all samples from all the VCFs in the project: querying on TSVs that contain subsets of all sample names is possible, but will sample names not included in the TSV used for the query will be disregarded for the query. ### Sidecar TSV format requirements -TODO - write a section on how there is no fixed schema but some mandatory requirements +To be able to accomodate a variety of metadata needs, DivBase does not enforace a strict schema for the sidecar sample metadata TSV file since the file are designed to contain user-defined columns. 
Instead, there are a few mandatory requirements and some best-practices for defining columns. #### Mandatory content 1. The first row must be a header row and the first column must be named `Sample_ID`. -2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. One entry per sample name; duplicates are not allowed. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` +2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. Sample names need to occur uniqely in the TSV: only one row per sample name in the `Sample_ID` column, no duplicates allowed. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` 3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column using semicolons (;) as delimters. `Sample_ID` values can also not be empty. 4. Every column need to be tab separated for all rows. @@ -69,8 +67,6 @@ To ensure that user-defined metadata can be used in DivBase, we ask you follow t 4. The only characters with special structural meaning in the TSV format are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in string contexts (e.g., `North-West`). 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. 
Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". -TODO - add info on No duplicate column names, no empty column names - #### Example This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. From 98f9015d990b08fe4a48a0bfb642be4a365c9092 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 16 Feb 2026 15:57:47 +0100 Subject: [PATCH 075/100] Polish the metadata user guide --- docs/user-guides/query-syntax.md | 6 +++ docs/user-guides/sidecar-metadata.md | 64 +++++++++++++--------------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/docs/user-guides/query-syntax.md b/docs/user-guides/query-syntax.md index fb68ffca..41351def 100644 --- a/docs/user-guides/query-syntax.md +++ b/docs/user-guides/query-syntax.md @@ -5,3 +5,9 @@ TODO ## combined sample metadata and VCF queries TODO - there is a link to here from the sample metadata guide, so the combined queries should be described in detail here + +It is also possible to run a sidecar sample metadata query as part of a VCF query by adding the query as a string to the flag `--tsv-filter`: + +```bash +divbase-cli query bcftools-pipe --tsv-filter "Area:North" --command "view -s SAMPLES" +``` diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 601e16f1..b00994ee 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -2,11 +2,11 @@ DivBase supports that users supply a sidecar TSV (tab separated variables) file with metadata on the samples contained within the VCF files in the DivBase project. 
-There are ways for sample metadata to be stored in the VCF itself (see [The Variant Call Format Specification](https://samtools.github.io/hts-specs/VCFv4.5.pdf)). For instance in a global `##SAMPLE` header (once per sample) or in a custom per-variant genotype `FORMAT` field in each variant and sample. The downside of the former is that common tools like `bcftools view` do not filter on the headers; the downside of the latter is that writing the metadata once per variant will result in a lot of repeated data, which in turn leads to elevated file size and processing times as the VCF file scales. +While there are ways for sample metadata to be stored in the VCF itself (see [The Variant Call Format Specification](https://samtools.github.io/hts-specs/VCFv4.5.pdf)), it is not really standardized. Metadata can for instance be specified in a global `##SAMPLE` header (once per sample) or in a custom per-variant genotype `FORMAT` field in each variant and sample. The downside of the former is that common tools like `bcftools view` do not filter on the headers; the downside of the latter is that writing the metadata once per variant will result in a lot of repeated data, which in turn leads to elevated file size and processing times as the VCF file scales. DivBase takes a different approach by decoupling the sample metadata from the VCF data by storing it in a sidecar file. The sidecar TSV can be queried on its own, or together with the VCF files in the DivBase project. The TSV is lightweight and highly extendable (essentially a plain-text form of a spreadsheet). This approach avoids having to read, write, and rewrite metadata to the VCF files and therefore keeps the resource overhead low for the sample metadata. -To be able to accomodate metadata needs for any research project that deals with VCF files, the sidecar sample metadata TSV and filtering in DivBase has been designed to be very open-ended and user-defined. 
As long as a few format and filter syntax requirements, the user is free to design their metadata TSV as the like. Column names in the TSV represent metadata categories and rows represent the samples found in the VCF files in the DivBase project. However, this flexibility put the responsibility on the user that spelling and values in columns and rows are correct: if not, the sample metadata filters will return incomplete or unintended results. +To be able to accommodate metadata needs for any research project that deals with VCF files, the sidecar sample metadata TSV and filtering in DivBase has been designed to be very open-ended and user-defined. As long as a few format and filter syntax requirements are followed, the user is free to design their metadata TSV as they like using the format: column names represent metadata categories and rows represent the samples found in the VCF files in the DivBase project. However, this flexibility puts the responsibility on the user that spelling and values in columns and rows are correct: if not, the sample metadata filters will return incomplete or unintended results. !!! Notes There is a CLI command to help check that a user-defined sample metadata TSV file aligns with the requirements described on this page. This validator tool will be [described in its own section below](#validating-a-sidecar-metadata-tsv-with-divbase-cli), but, in short, it can be run with: @@ -15,7 +15,7 @@ To be able to accomodate metadata needs for any research project that deals with divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv ``` -This guide will describe how to [Create a sample metadata TSV](#creating-a-sidecar-sample-metadata-tsv-for-a-divbase-project)), and [How to run queries on sample metadata TSV files](#query-syntax-for-sidecar-metadata). Instructions on how to run combined sample metadata and VCF data queries are found in [DivBase Query Syntax for VCF data](query-syntax.md).
+This guide will describe how to [Create a sample metadata TSV](#creating-a-sidecar-sample-metadata-tsv-for-a-divbase-project)), and [How to run queries on sample metadata TSV files](#query-syntax-for-sidecar-metadata). Instructions on how to run combined sample metadata and VCF data queries are found on the separate page on [DivBase Query Syntax for VCF data](query-syntax.md). !!! Warning All instructions regarding running DivBase queries, generating sample metadata templates, and validating sample metadata TSV files required that the project's VCF dimensions index is updated against the current versions of the VCF files in the project's data store. This can be assured by running the command: @@ -38,38 +38,39 @@ If the dimensions VCF files in the project have been cached in DivBase, a templa divbase-cli dimensions create-metadata-template ``` -Note! there can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. If not specified, it the default `sample_metadata.tsv` will be assumed. It is up to the user if the want to have multiple TSVs in the same project to organise their metadata in a specific way. It is allowed to have duplicate sample names and metadata across multiple TSV files, since only one TSV can be called per query. It is recommended to a have a master TSV that contains all samples from all the VCFs in the project: querying on TSVs that contain subsets of all sample names is possible, but will sample names not included in the TSV used for the query will be disregarded for the query. +!!! Note + There can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. If not specified, the default `sample_metadata.tsv` will be assumed. It is up to the user if they want to have multiple TSVs in the same project to organise their metadata in a specific way.
It is allowed to have duplicate sample names and metadata across multiple TSV files, since only one TSV can be called per query. It is recommended to have a master TSV that contains all samples from all the VCFs in the project: querying on TSVs that contain subsets of all sample names is possible, but sample names not included in the TSV used for the query will be disregarded for the query. ### Sidecar TSV format requirements -To be able to accomodate a variety of metadata needs, DivBase does not enforace a strict schema for the sidecar sample metadata TSV file since the file are designed to contain user-defined columns. Instead, there are a few mandatory requirements and some best-practices for defining columns. +To be able to accomodate a variety of metadata needs, DivBase does not enforce a strict schema for the sidecar sample metadata TSV file since it is designed to contain user-defined columns. Instead, there are a few mandatory requirements and some best-practices for defining columns. #### Mandatory content -1. The first row must be a header row and the first column must be named `Sample_ID`. +1. The first row must be a header row, start with `#`, and the first column must be named `Sample_ID`. 2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. Sample names need to occur uniqely in the TSV: only one row per sample name in the `Sample_ID` column, no duplicates allowed. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` -3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column using semicolons (;) as delimters. `Sample_ID` values can also not be empty. +3. The `Sample_ID` column can only contain one sample name per row.
This is different from the user-defined columns that can take arrays of values for each cell in a column using semicolons (`;`) as delimiters. `Sample_ID` values can also not be empty. -4. Every column need to be tab separated for all rows. +4. Every column needs to be tab separated for all rows, including the header. #### User-defined columns After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. !!! Warning - It is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. This includes matching upper and lower case letters. + It is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling of a column name must be used for the filters. This includes matching upper and lower case letters. To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: -1. The user-defined columns can be either numeric or string type. A column is classified as numeric only if all values (including individual parts in semicolon-separated cells) can be parsed as numbers. If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will treated as string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. +1. The user-defined columns can be **either** numeric **or** string type. A column is classified as numeric only if all values can be parsed as numbers (including individual parts in semicolon-separated cells).
If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will be treated as string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. 2. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. **Important:** Semicolons (`;`) are the only supported delimiter for multi-value cells. DivBase uses commas (`,`) in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose (separating filter values in queries). 3. Special characters like hyphens (`-`) and commas (`,`) are allowed, but will cause the column to be treated as a string column. String columns cannot be filtered using numeric operators (see details in [Filtering on numerical columns](#filtering-on-numerical-columns)) and will raise warnings. For example, values like "1-2" or "1,2" will be interpreted as strings, not numeric ranges or multi-value fields. If you intend to store multiple numeric values in a cell, use semicolons (e.g., "1;2"). For decimals, use English decimal notation with a period (e.g., "3.14") and not a comma. -4. The only characters with special structural meaning in the TSV format are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. 
Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in string contexts (e.g., `North-West`). +4. The only characters with special structural meaning in DivBase sidecar metadata TSV files are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in strings (e.g., `North-West`). 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". #### Example -This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. +This example illustrates what a sidecar sample metadata TSV can look like. The mandatory requirements are fulfilled (heading with `#`, `Sample_ID` column, tab-separated file). The user-defined columns contain examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. ```text #Sample_ID Population Area Weight @@ -97,7 +98,7 @@ The command requires that the project's dimensions index is up-to-date with the The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct.
Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneous or even misleading query results, and the validator will help with spotting that. Several of the checks that the validator performs are also done at the start of a sample metadata query, but this sample name check is currently only done by the validator. -The following will return **Errors**. These must be fixed if the sidecar TSV should be used in DivBase queries: +The following will return **Errors**. These must be fixed for the sidecar TSV to be used with DivBase queries: - File not found, unreadable, or empty: If the TSV file path is missing, misspelled, or the file cannot be opened, validation will fail. Empty TSV files are also not allowed. @@ -105,20 +106,20 @@ The following will return **Errors**. These must be fixed if the sidecar TSV sho - Tab separation: Row has the wrong number of columns. (Note: This check is only done in the validator! It is currently not part of the checks at the start of a sample metadata query.) -- Sample_ID issues: Empty Sample_ID, Sample_ID contains a semicolon, duplicate Sample_ID. +- `Sample_ID` column issues: Empty value, value containing a semicolon, rows with duplicate sample names. -- Samples in TSV not found in project dimensions index: All samples listed in the TSV must exist in the project's dimensions index. If any sample is missing, the user needs to run `divbase-cli dimensions update` to submit an update job and then try the validator again after the job has finished. +- Samples in TSV not found in project dimensions index: All samples listed in the TSV must exist in the project's dimensions index. If a sample is known to be in a VCF file in the DivBase project but is missing from the VCF dimensions index, the user needs to run `divbase-cli dimensions update` to submit an update job and then try the validator again after the job has finished. !!!
Note - The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation and that samples match the dimensions file, which are validator-specific checks). This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. + The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation which is a validator-specific check). This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. The validator will also raise **Warnings**. DivBase queries can still be run with Warnings, but the user should review them, and possible address them if so desired: -- Cell value has leading or trailing whitespace (will be stripped by server) +- Cell value has leading or trailing whitespace (will be stripped by DivBase when a query is run) - Samples in the project's dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, especially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. -- Mixed-type columns (e.g. a column with "8", "1a", "5a") and semicolon-separated cells with mixed types (e.g., "1;abc"). They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. +- Mixed-type columns (a column with numeric and string values, e.g. "8", "1a", "5a") and semicolon-separated cells with mixed types (e.g., "1;abc"). 
They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. - Hyphens in values that look like range notation (e.g., "1-2") in columns that otherwise contain numeric values. The same goes for commas (e.g. "1,2"). The warning message will ask the user if they intended this to be a multicolumn value which should use semicolons as delimiters. @@ -134,14 +135,7 @@ The TSV query syntax is `"Key1:Value1,Value2;Key2:Value3,Value4"`, where `Key1:` It is possible to exclude a value by prefixing it with a `!` (NOT) operator: `"Key:!Value"`. When mixing inclusive and exclusive filters (e.g. `"Key1:Value1,Value2; Key2:!Value3"`), only the rows that match the positive filters and do not match any of the excluded values will be returned. This can be used to write complex queries. !!! note - Please note that semicolon (`;`) is used for different purposes in the TSV (multi-value cells) and in the query syntax (perform queries on multiple columns)! - - Also note that commas are allowed in the query syntax, but are not allowed in the cells in the TSV. - -Filtering is inclusive by default. This applies both for the filter values and the cell values: - -- If a filter contains multiple values, e.g. `"Area:North,West"`, the row is included if at least one of the filter values matches any value in the cell. I.e. a row with `North`, and a row with `West` will both be returned from this filter. -- If a cell in the TSV contains multiple values separated by a semicolon as explained in [User-defined columns](#user-defined-columns) (e.g., `North;West`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row with the array value `North;West`. 
+ Please note that semicolon (`;`) is used for different purposes in the TSV (for denoting multi-value cells) and in the query syntax (for performing queries on multiple columns)! For example, if the user wants to query the TSV on column `Area` for all samples that contain the value `North`,: @@ -149,13 +143,12 @@ For example, if the user wants to query the TSV on column `Area` for all samples divbase-cli query tsv "Area:North" ``` -It is also possible to run a sidecar sample metadata query as part of a VCF query by adding the query as a sting to the flag `--tsv-filter`: +Please also see the documentation on [DivBase Query Syntax for VCF data](query-syntax.md) for more details on how that command works. -```bash -divbase-cli query bcftools-pipe --tsv-filter "Area:North" --command "view -s SAMPLES" -``` +Filtering is inclusive by default. This applies both for the filter values and the cell values: -Please also see the documentation on [DivBase Query Syntax for VCF data](query-syntax.md) for more details on how that command works. +- If a filter contains multiple values, e.g. `"Area:North,West"`, the row is included if at least one of the filter values matches any value in the cell. I.e. a row with `North`, and a row with `West` will both be returned from this filter. +- If a cell in the TSV contains multiple values separated by a semicolon as explained in [User-defined columns](#user-defined-columns) (e.g., `North;West`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row with the array value `North;West`. !!! note To reiterate what was written in the [User-defined columns](#user-defined-columns) section above: it the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. 
@@ -218,12 +211,18 @@ The `!` (NOT) operator can really come to good use for numerical filters: - `"Weight:>5,!10-15"` returns rows where the value is greater than 5, but not in the range 10–15. - `"Weight:!1-2,4"` returns rows where the value is not in the range 1–2, or is 4. +!!! Tip + Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for semicolon-separated numeric columns as long as every semicolon separated part (`part;part`) in every cell in the column is a valid number. For instance: a `Population` column with values `1`, `2;4`, `1;3;5`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `2;4` and `1;3;5`. + ### Query Warnings: spotting potential issues with the TSV or the query filter When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be return to the user. Warnings, however, will not stop not stop queries from running, but indicated that the user should carefully review the results. Reviewing the Warnings to judge if they are relevant or not is essential to avoid unintended query results. The following are treated as Warnings by DivBase queries (and by the TSV validator). +!!! note + Note that the warnings will only occur when a query contains a combination of filter and column values that warrants a warning: filtering only on Column A will not check for warnings in Column B. Use the [TSV validator](#validating-a-sidecar-metadata-tsv-with-divbase-cli) if you want to check for issues across all columns. + - **Comparison operators on string/mixed-type columns**: DivBase comparison operators (`>`, `<`, `>=`, `<=`) only work on numeric columns.
If you use them on a string or mixed-type column — whether with a numeric operand (e.g., `Population:>5`) or a string operand (e.g., `Area:>North`) — DivBase will warn that comparison operators are not supported on string columns. Use exact string matching instead (e.g., `Area:North` or `Population:8,1a`). - **Mixed-type column information**: @@ -235,9 +234,6 @@ If the filter references a column that does not exist in the TSV, DivBase will w - **No matching values**: If none of the filter values match any values in the column, DivBase print a warning. This can indicate a typo in the filter value, or just that the specific filter combination filtered away all samples.. -!!! Tip - Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for semicolon-separated numeric columns as long as every semicolon separated part (`part;part`) in every cell in the column is a valid number. For instance: a `Population` column with values `1`, `2;4`, `1;3;5`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `2;4` and `1;3;5`. 
- ### Examples of complex queries Assuming that the sidecar metadata TSV file looks like in the [Example](#example) above, a query like will: From 8c14483b8d7121c41eda541edfe92ffc2ca5420a Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 09:03:29 +0100 Subject: [PATCH 076/100] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/user-guides/quick-start.md | 4 ++-- .../src/divbase_api/crud/vcf_dimensions.py | 4 ++-- .../src/divbase_api/services/queries.py | 14 +++++--------- .../src/divbase_api/worker/crud_dimensions.py | 3 +-- .../divbase-api/src/divbase_api/worker/tasks.py | 4 ++-- .../services/sample_metadata_tsv_validator.py | 4 ++-- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/docs/user-guides/quick-start.md b/docs/user-guides/quick-start.md index ba68194f..ad379ab4 100644 --- a/docs/user-guides/quick-start.md +++ b/docs/user-guides/quick-start.md @@ -132,7 +132,7 @@ Example of a sidecar metadata TSV file with the mandatory `Sample_ID` column and ``` !!! note - Please use a text editor than preserves the tabs when the file is saved. Incorrect tabs can lead to issues with running metadata queries in DivBase. + Please use a text editor that preserves the tabs when the file is saved. Incorrect tabs can lead to issues with running metadata queries in DivBase. There is a command to help check that the sidecar metadata TSV is correctly formatted for use with DivBase. 
Running it is optional: @@ -140,7 +140,7 @@ There is a command to help check that the sidecar metadata TSV is correctly form divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv ``` -When you are happy with the sample metadata file, it should be uploaded the the DivBase project with the following: +When you are happy with the sample metadata file, it should be uploaded to the DivBase project with the following: ```bash divbase-cli files upload path/to/your/sample_metadata.tsv diff --git a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py index 9bcde473..5819dac0 100644 --- a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py @@ -63,8 +63,8 @@ async def get_unique_samples_by_project_async(db: AsyncSession, project_id: int) Get unique sample names across all VCF files from a project's dimensions entries. - Samples are stored in as ARRAY(String) in the VCFMetadataDB model and need to be flattened before finding the unqiue values. - To do all operations on the PostgreSQL side (to avoid having do it here in the fastAPI side), need to first use unnest() to flatten the arrays. + Samples are stored as ARRAY(String) in the VCFMetadataDB model and need to be flattened before finding the unique values. + To do all operations on the PostgreSQL side (to avoid having to do it here on the FastAPI side), need to first use unnest() to flatten the arrays.
""" stmt = select(func.unnest(VCFMetadataDB.samples)).where(VCFMetadataDB.project_id == project_id).distinct() diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index d9a72ba1..2c811258 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -975,7 +975,7 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": logger.warning(warning_msg) self.warnings.append(warning_msg) filter_conditions.append(combined) - logger.info("filter_conditions: " + str(filter_conditions)) # debug + else: warning_msg = f"No valid numeric values, ranges, or inequalities provided for column '{key}'. Filter condition will not match any rows." logger.warning(warning_msg) @@ -1198,14 +1198,10 @@ def check_inequality(cell_value): try: val_num = self._parse_numeric_value(val_str) if ( - operator == ">" - and val_num > threshold - or operator == ">=" - and val_num >= threshold - or operator == "<" - and val_num < threshold - or operator == "<=" - and val_num <= threshold + (operator == ">" and val_num > threshold) + or (operator == ">=" and val_num >= threshold) + or (operator == "<" and val_num < threshold) + or (operator == "<=" and val_num <= threshold) ): return True except ValueError: diff --git a/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py b/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py index b75cf81e..ecb6211e 100644 --- a/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py +++ b/packages/divbase-api/src/divbase_api/worker/crud_dimensions.py @@ -1,6 +1,5 @@ """ -CRUD operations for VCF dimensions for the Celery workers. -. 
+CRUD operations for VCF dimensions for the Celery workers There are separate VCF dimensions CRUD functions for used with API endpoints in packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py """ diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index 7e3a810b..d5e944db 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -177,8 +177,8 @@ def sample_metadata_query_task( metadata_tsv_name=metadata_tsv_name, bucket_name=bucket_name, s3_file_manager=s3_file_manager ) except ObjectDoesNotExistError: - # If ObjectDoesNotExistError, propagage the more specific TSVFileNotFoundInProjectError upwards. - # Wrap exeception in TaskUserError () to avoid Celery serilization UnpicklableExceptionWrapper issue + # If ObjectDoesNotExistError, propagate the more specific TSVFileNotFoundInProjectError upwards. + # Wrap exception in TaskUserError () to avoid Celery serialization UnpicklableExceptionWrapper issue raise TaskUserError(str(TSVFileNotFoundInProjectError(metadata_tsv_name, project_name))) from None with SyncSessionLocal() as db: diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 9b7433d7..c6ddf470 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -182,8 +182,8 @@ def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str] Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column Columns with mixed types are treated as string columns by the DivBase query engine. - This happen for values such as e.g., "8", "1a", "5a" that happen to look numeric but - are semantically a strings (e.g. names, IDs).. 
+ This happens for values such as "8", "1a", "5a" that happen to look numeric but + are semantically strings (e.g. names, IDs). """ mixed_columns = [] for col_idx, types in column_types.items(): From b48d25f9616e17ab15cd595bdc58208df8e411de Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 09:11:10 +0100 Subject: [PATCH 077/100] Fix additional typos found by copilot review --- docs/user-guides/sidecar-metadata.md | 10 +++++----- .../divbase-api/src/divbase_api/crud/vcf_dimensions.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index b00994ee..2ef9cc35 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -15,7 +15,7 @@ To be able to accomodate metadata needs for any research project that deals with divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv ``` -This guide will describe how to [Create a sample metadata TSV](#creating-a-sidecar-sample-metadata-tsv-for-a-divbase-project)), and [How to run queries on sample metadata TSV files](#query-syntax-for-sidecar-metadata). Instructions on how to run combined sample metadata and VCF data queries are found on the separate page on [DivBase Query Syntax for VCF data](query-syntax.md). +This guide contains sections on how to [Create a sample metadata TSV](#creating-a-sidecar-sample-metadata-tsv-for-a-divbase-project), and [How to run queries on sample metadata TSV files](#query-syntax-for-sidecar-metadata). Instructions on how to run combined sample metadata and VCF data queries are found on the separate page on [DivBase Query Syntax for VCF data](query-syntax.md). !!! Warning All instructions regarding running DivBase queries, generating sample metadata templates, and validating sample metadata TSV files required that the project's VCF dimensions index is updated against the current versions of the VCF files in the project's data store. 
This can be assured by running the command: @@ -43,7 +43,7 @@ divbase-cli dimensions create-metadata-template ### Sidecar TSV format requirements -To be able to accomodate a variety of metadata needs, DivBase does not enforce a strict schema for the sidecar sample metadata TSV file since it is designed to contain user-defined columns. Instead, there are a few mandatory requirements and some best-practices for defining columns. +To be able to accommodate a variety of metadata needs, DivBase does not enforce a strict schema for the sidecar sample metadata TSV file since it is designed to contain user-defined columns. Instead, there are a few mandatory requirements and some best-practices for defining columns. #### Mandatory content @@ -129,7 +129,7 @@ This section describes how to query on the sample metadata file itself. The same ### Overview: querys are applied as filters on columns in the TSV -Queries on the sidecar sample metadata TSV can be done with the `divbase-cli query tsv` command. The filters that the user want to query on needs entered as a string (i.e. enclosed in quotes, `""`). +Queries on the sidecar sample metadata TSV can be done with the `divbase-cli query tsv` command. The filters that the user wants to query on need to be entered as a string (i.e. enclosed in quotes, `""`). The TSV query syntax is `"Key1:Value1,Value2;Key2:Value3,Value4"`, where `Key1:`...`Key2:` are the column header names in the TSV, and `Value1`...`Value4` are the values. Multiple filter values for a key are separated by commas, and multiple keys are separated by semicolons. There can be any number keys and values to filter on, but it is up to the user to write queries that return useful results. It is possible to exclude a value by prefixing it with a `!` (NOT) operator: `"Key:!Value"`. When mixing inclusive and exclusive filters (e.g. 
`"Key1:Value1,Value2; Key2:!Value3"`), only the rows that match the positive filters and do not match any of the excluded values will be returned. This can be used to write complex queries. @@ -151,7 +151,7 @@ Filtering is inclusive by default. This applies both for the filter values and t - If a cell in the TSV contains multiple values separated by a semicolon as explained in [User-defined columns](#user-defined-columns) (e.g., `North;West`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row with the array value `North;West`. !!! note - To reiterate what was written in the [User-defined columns](#user-defined-columns) section above: it the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. + To reiterate what was written in the [User-defined columns](#user-defined-columns) section above: it is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. ### Filtering on string columns @@ -216,7 +216,7 @@ The `!` (NOT) operator can really come to good use for numerical filters: ### Query Warnings: spotting potential issues with the TSV or the query filter -When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be return to the user. Warnings, however, will not stop not stop queries from running, but indicated that the user should carefully review the results. 
+When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be return to the user. Warnings, however, will not stop queries from running, but indicate that the user should carefully review the results. Reviewing the Warnings to judge if they are relevant or not is essential to avoid unintended query results. The following are treated as Warnings by DivBase queries (and by the TSV validator). diff --git a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py index 5819dac0..22d49a79 100644 --- a/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py +++ b/packages/divbase-api/src/divbase_api/crud/vcf_dimensions.py @@ -78,7 +78,7 @@ async def get_unique_scaffolds_by_project_async(db: AsyncSession, project_id: in Get unique scaffold names across all VCF files for a project. - Like samples, scaffolds are stored in as ARRAY(String) in the VCFMetadataDB model and need to be flattened with the unnest() PostgreSQL function. + Like samples, scaffolds are stored as ARRAY(String) in the VCFMetadataDB model and need to be flattened with the unnest() PostgreSQL function. 
""" stmt = select(func.unnest(VCFMetadataDB.scaffolds)).where(VCFMetadataDB.project_id == project_id).distinct() From 6fdd4a55891f752138ab2361672ceb9f030ce8ad Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 09:23:46 +0100 Subject: [PATCH 078/100] Fix verb conjugation in variable name --- packages/divbase-api/src/divbase_api/worker/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/worker/tasks.py b/packages/divbase-api/src/divbase_api/worker/tasks.py index d5e944db..60acc7da 100644 --- a/packages/divbase-api/src/divbase_api/worker/tasks.py +++ b/packages/divbase-api/src/divbase_api/worker/tasks.py @@ -760,11 +760,11 @@ def _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket( logger.error( f"VCF file '{file_name}' version mismatch: indexed={indexed_version}, bucket={bucket_version}" ) - unindex_or_outdated_files = sorted(set(unindexed_files) | set(outdated_files)) + unindexed_or_outdated_files = sorted(set(unindexed_files) | set(outdated_files)) - if unindex_or_outdated_files: + if unindexed_or_outdated_files: raise DimensionsNotUpToDateWithBucketError( - f"The following VCF files or file versions in the project are not part of the project's VCF dimensions: '{', '.join(unindex_or_outdated_files)}'. " + f"The following VCF files or file versions in the project are not part of the project's VCF dimensions: '{', '.join(unindexed_or_outdated_files)}'. " "\nPlease run 'divbase-cli dimensions update --project ' and then submit the query again." 
) From 562da4d4fd853e92a45834c9e436f69459825280 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 11:57:56 +0100 Subject: [PATCH 079/100] Prune validator clarification message --- .../src/divbase_cli/cli_commands/dimensions_cli.py | 4 +++- .../divbase_cli/services/sample_metadata_tsv_validator.py | 5 ++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 610998b6..85be5312 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -276,7 +276,9 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( if stats: print("[bold cyan]VALIDATION SUMMARY:[/bold cyan]") - print(f" Total columns: {stats.get('total_columns', 0)} ({stats.get('user_defined_columns', 0)} user-defined)") + print( + f" Total columns: {stats.get('total_columns', 0)} ({stats.get('user_defined_columns', 0)} user-defined + 1 Sample_ID column)" + ) samples_in_tsv = stats.get("samples_in_tsv", 0) samples_matching = stats.get("samples_matching_project", 0) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index c6ddf470..6af4cd9e 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -194,11 +194,10 @@ def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str] if mixed_columns: self.warnings.append( "Clarification on mixed types columns: " + "Columns are be treated as string by DivBase if it contain a mix of numeric and non-numeric values or numeric-looking values with extra characters (for example commas, hyphens, or range-like patterns such as
'1-2'). " "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " - "Special characters like commas or hyphens in numeric-looking values, or Range notation (e.g., '1-2'), also cause DivBase to treat the column as string. " "Use semicolons (;) to separate multiple numeric values. " - "These columns will be treated as string columns by DivBase. " - "Numeric query operations (ranges, inequalities) will not be applicable to these columns." + "Numeric query operations (ranges, inequalities) will not be applicable to string columns." ) def _validate_sample_names(self, tsv_samples: set[str]) -> None: From bd78a05612d2763a12b5a44eec2f78dba0e0ab2d Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 13:51:25 +0100 Subject: [PATCH 080/100] Update test after updating validator warning msg --- tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 43ee1bba..95ef3ce4 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -364,7 +364,7 @@ def test_negative_numbers_with_semicolons(self, numeric_multi_values_tsv, negati def test_range_notation_produces_warning(self, type_errors_tsv): """Test that range notation like '1-2' in a mixed-type column produces a warning (column treated as string).""" stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, {"S1", "S2", "S3", "S4"}) - assert any("mixed" in w.lower() and "Range" in w for w in warnings) + assert any("clarification on mixed types columns" in w.lower() for w in warnings) class TestSemicolonColumnTypeClassification: From 2be7adf3757d96585982fb16e850f0a3d8c9fb5b Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 14:47:41 +0100 Subject: 
[PATCH 081/100] Update metadata user guide with results example --- docs/user-guides/sidecar-metadata.md | 38 ++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 2ef9cc35..40763916 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -1,6 +1,6 @@ # Sidecar Metadata TSV files: creating and querying sample metadata files -DivBase supports that users supply a sidecar TSV (tab separated variables) file with metadata on the samples contained within the VCF files in the DivBase project. +DivBase supports that users supply a sidecar TSV (tab separated variables) file with metadata on the samples contained within the VCF files in the DivBase project. The user can then send metadata queries to DivBase to find the samples that fulfil the query, and which VCF files those samples are found in. Metadata queries can be run on their own (as is described in this guide), but can also be used for VCF queries to automate the checkout of data related to specific samples from DivBase (described in the guide for [DivBase Query Syntax for VCF data](query-syntax.md)). While there are ways for sample metadata to be stored in the VCF itself (see [The Variant Call Format Specification](https://samtools.github.io/hts-specs/VCFv4.5.pdf)), it is not really standardized. Metadata can for instance be specified instance in a global `##SAMPLE` header (once per sample) or in a custom per-variant genotype `FORMAT` field in each variant and sample. The downside of the former is that common tools like `bcftools view` do not filter on the headers; the downside of the latter is that writing the metadata once per variant will result in a lot of repeated data, which in turn leads to elevated file size and processing times as the VCF file scales.
@@ -68,6 +68,9 @@ To ensure that user-defined metadata can be used in DivBase, we ask you follow t 4. The only characters with special structural meaning in DivBase sidecar metadata TSV files are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in strings (e.g., `North-West`). 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". +!!! Note + Note that the TSV does not need to contain any information on which VCF files the samples are found in: this is handled by the project's VCF dimensions indexing (`divbase-cli dimensions update`). We advise against putting sample-VCF file mappings in the TSV file to reduce the risk of confusion and data mismatch. + #### Example This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading with `#`, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. @@ -84,6 +87,8 @@ S7 1;3;5 South 22.6 S8 2 West 19.5 ``` +For the sake of the demonstration later in this guide, let's assume that this TSV file has been uploaded to a DivBase project along with two VCF files where samples S1-S4 are found in 'file1.vcf.gz' and S5-S6 in 'file2.vcf.gz'. Let's also assume that the `divbase-cli dimensions update` command has been run after all files have been uploaded so that the system has up-to-date information on which sample is found in which file.
+ ### Validating a sidecar metadata TSV with `divbase-cli` Manually checking that a TSV fulfills the DivBase requirement can be tedious. To help users validate their sidecar TSV files, the following CLI command has been implemented: @@ -143,7 +148,30 @@ For example, if the user wants to query the TSV on column `Area` for all samples divbase-cli query tsv "Area:North" ``` -Please also see the documentation on [DivBase Query Syntax for VCF data](query-syntax.md) for more details on how that command works. +If this command is run on the TSV and VCF files used in the above [example](#example), the query would return the following results. + +```bash +The results for the query (Area:North): +Unique Sample IDs: ['S1', 'S5'] +Unique filenames: ['file1.vcf.gz', 'file2.vcf.gz'] +``` + +This tells the user which samples fulfil the query and which VCF files they need to use if they want to work with those two samples. The option ` --show-sample-results ` can be used to show the exact sample-to-VCF file mapping: + +```bash +divbase-cli query tsv "Area:North" --show-sample-results +``` + +which, for the same example, would return: + +```bash +Name and file for each sample in query results: +Sample ID: 'S1', Filename: 'file1.vcf.gz' +Sample ID: 'S5', Filename: 'file2.vcf.gz' +The results for the query (Area:North): +Unique Sample IDs: ['S1', 'S5'] +Unique filenames: ['file1.vcf.gz', 'file2.vcf.gz'] +``` Filtering is inclusive by default. This applies both for the filter values and the cell values: @@ -214,6 +242,12 @@ The `!` (NOT) operator can really come to good use for numerical filters: !!! Tip Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for semicolon-separated numeric columns as long as every semicolon separated part (`part;part`) in every cell in the column is a valid number.
For instance: a `Population` column with values `1`, `2;4`, `1;3;5`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `2;4` and `1;3;5`. +### Filtering on Sample names + +The sidecar metadata filtering is designed to + +The `Sample_ID` column is a string column by design + ### Query Warnings: spotting potential issues with the TSV or the query filter When running a sample metadata query in DivBase, the system will check the TSV and the query filter for the constraints and considerations described throughout this guide. If errors are encountered, the query will not run and a message with details on what went wrong will be return to the user. Warnings, however, will not stop queries from running, but indicate that the user should carefully review the results. From 0104e575591ecf815510f8ff34ea57e76ee9cb2c Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 14:54:17 +0100 Subject: [PATCH 082/100] Ensure same handling of # in validator and queries --- .../services/sample_metadata_tsv_validator.py | 6 ++++-- .../divbase_api/test_sample_metadata_queries.py | 12 ++++++++++++ .../test_sample_metadata_tsv_validator.py | 13 +++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 6af4cd9e..cf83790e 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -61,8 +61,10 @@ def _validate_header(self, header: list[str]) -> None: if header[0] != "#Sample_ID": self.errors.append(f"First column must be named '#Sample_ID', found: '{header[0]}'") - if len(header) != len(set(header)): - duplicates = [col for col in header if header.count(col) > 1] + # Check for duplicates after stripping '#' to ensures both 
"#Sample_ID" and "Sample_ID" are caught as duplicates. Matches server-side logic. + cleaned_header = [col.lstrip("#") for col in header] + if len(cleaned_header) != len(set(cleaned_header)): + duplicates = [col for col in cleaned_header if cleaned_header.count(col) > 1] self.errors.append(f"Duplicate column names found: {set(duplicates)}") for i, col in enumerate(header): diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 47ac9ba7..bb6d6210 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -989,6 +989,18 @@ def test_duplicate_column_names_raises(self, tmp_path): SidecarQueryManager(file=tsv_file) assert "duplicate" in str(excinfo.value).lower() + def test_duplicate_column_names_after_stripping_hash_raises(self, tmp_path): + """Test that duplicate column names are caught even when one has '#' and one doesn't. + For example, '#Sample_ID' and 'Sample_ID' should be detected as duplicates.""" + tsv_content = "#Sample_ID\tSample_ID\tPopulation\nS1\tS1_dup\t1\nS2\tS2_dup\t2\n" + tsv_file = tmp_path / "duplicate_sample_id_cols.tsv" + tsv_file.write_text(tsv_content) + + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "duplicate" in str(excinfo.value).lower() + assert "Sample_ID" in str(excinfo.value) + def test_empty_column_name_raises(self, tmp_path): """Test that empty column names raise SidecarMetadataFormatError during load_file().""" tsv_content = "#Sample_ID\t\tWeight\nS1\tNorth\t12.5\nS2\tEast\t18.0\n" diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 95ef3ce4..bc780028 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -150,6 +150,19 @@ def 
test_duplicate_column_names(self, header_errors_tsv, project_samples): assert any("Duplicate column names" in e and "Area" in e for e in errors) + def test_duplicate_column_names_after_stripping_hash(self, tmp_path, project_samples): + """Test that duplicate column names are detected even when one has '#' prefix and one doesn't. + This ensures consistency with server-side validation which strips '#' before checking duplicates.""" + tsv_content = """#Sample_ID\tSample_ID\tPopulation +S1\tS1_dup\t1 +S2\tS2_dup\t2 +""" + tsv_file = tmp_path / "duplicate_sample_id_columns.tsv" + tsv_file.write_text(tsv_content) + stats, errors, warnings = MetadataTSVValidator.validate(tsv_file, project_samples) + + assert any("Duplicate column names" in e and "Sample_ID" in e for e in errors) + def test_empty_column_name(self, header_errors_tsv, project_samples): """Test that empty column names are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) From e21e90fc4869b3cc057bfe1c26436b68de107bae Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 17 Feb 2026 17:24:08 +0100 Subject: [PATCH 083/100] Refactor TSV validator to a shared logic This ensures that the validation of the TSV content has a single source of truth in the form of the SharedMetadataValidator, which is used by both the CLI and the API. Note! This is only for the contents of the TSV files. The filtering logic for the queries is still separate to the query manager on the server side. 
--- .../src/divbase_api/services/queries.py | 201 ++++----- .../services/sample_metadata_tsv_validator.py | 257 +++-------- .../src/divbase_lib/metadata_validator.py | 399 ++++++++++++++++++ .../cli_commands/test_query_cli.py | 4 +- tests/e2e_integration/queries/conftest.py | 4 +- .../queries/test_SidecarQueryManager.py | 2 +- ...t_need_mixed_bcftools_concat_and_merge.tsv | 2 +- 7 files changed, 539 insertions(+), 330 deletions(-) create mode 100644 packages/divbase-lib/src/divbase_lib/metadata_validator.py diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 2c811258..34dace97 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -29,6 +29,7 @@ SidecarNoDataLoadedError, SidecarSampleIDError, ) +from divbase_lib.metadata_validator import SharedMetadataValidator logger = logging.getLogger(__name__) @@ -53,11 +54,18 @@ def run_sidecar_metadata_query( """ Run a query on a sidecar metadata TSV file and map samples to VCF files. - Takes vcf_dimensions_data fetched by an API call in the task layer + Takes vcf_dimensions_data fetched by an API call in the task layer and extracts the unique sample names. 
""" - - sidecar_manager = SidecarQueryManager(file=file).run_query(filter_string=filter_string) + project_samples = set() + if vcf_dimensions_data and vcf_dimensions_data.get("vcf_files"): + for vcf_entry in vcf_dimensions_data.get("vcf_files", []): + sample_names = vcf_entry.get("samples", []) + project_samples.update(sample_names) + + sidecar_manager = SidecarQueryManager(file=file, project_samples=project_samples).run_query( + filter_string=filter_string + ) query_message = sidecar_manager.query_message warnings = sidecar_manager.warnings unique_sample_ids = sidecar_manager.get_unique_values("Sample_ID") @@ -684,10 +692,12 @@ class SidecarQueryManager: TODO - some of the __init__ params are perhaps better as properties? """ - def __init__(self, file: Path): + def __init__(self, file: Path, project_samples: set[str] | None = None): self.file = file + self.project_samples = project_samples self.filter_string = None self.df = None + self.metadata_validator = None self.query_result = None self.query_message: str = "" self.warnings: list[str] = [] @@ -698,7 +708,7 @@ def load_file(self) -> "SidecarQueryManager": Method that loads the TSV file into a pandas DataFrame. Assumes that the first row is a header row, and that the file is tab-separated. Also removes any leading '#' characters from the column names. - Validates the same errors as the client-side MetadataTSVValidator: + Validates the same errors as the client-side MetadataTSVValidator using shared validation logic: - Header: first column must be #Sample_ID, no duplicate or empty column names - Sample_ID: no empty values, no duplicates, no semicolons - Data: no commas in any cell values @@ -706,41 +716,66 @@ def load_file(self) -> "SidecarQueryManager": try: logger.info(f"Loading sidecar metadata file: {self.file}") - # Read the raw header before loading with pandas to catch issues that pandas would silently fix, such as duplicate column names or empty column names. 
- self._read_and_validate_raw_header() - - self.df = pd.read_csv( - self.file, sep="\t", skipinitialspace=True - ) # Pandas has Type Inference and will detect numeric and string columns automatically - self.df.columns = self.df.columns.str.lstrip("#") - - # Strip leading and trailing whitespace from all string columns - for col in self.df.columns: - self.df[col] = self.df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) + # Note! The SharedMetadataValidator is for checks on the contents of the TSV file. The logic is shared between this class and the client-side MetadataTSVValidator. + # There are several helper methods for the filtering logic in this class, but they are for the query filters and are not related to the validation of the TSV file contents. + self.metadata_validator = SharedMetadataValidator( + file_path=self.file, + project_samples=self.project_samples, + skip_dimensions_check=(self.project_samples is None), + ) + result = self.metadata_validator.load_and_validate() + + if result.errors: + error_msg = result.errors[0] + # Note! The order of these errors matters. + if "Failed to read file" in error_msg: + raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") + elif "First column must be named '#Sample_ID'" in error_msg or ( + "Sample_ID" in error_msg and "column is required" in error_msg + ): + raise SidecarColumnNotFoundError( + "The 'Sample_ID' column is required in the metadata file." + if "First column must be named '#Sample_ID'" in error_msg + else error_msg + ) + elif ("Row" in error_msg and "Sample_ID is empty" in error_msg) or ( + "Sample_ID" in error_msg + and ( + "contains semicolons" in error_msg + or "Duplicate Sample_IDs" in error_msg + or "contains empty or missing values" in error_msg + ) + ): + raise SidecarSampleIDError( + "Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID." 
+ if "Row" in error_msg and "Sample_ID is empty" in error_msg + else error_msg + ) + elif ( + "not found in the DivBase project's dimensions index" in error_msg + or "Duplicate column names" in error_msg + or "Empty column name" in error_msg + or ("Row" in error_msg and ("Expected" in error_msg or "tab-separated" in error_msg)) + or "column" in error_msg.lower() + ): + raise SidecarMetadataFormatError(error_msg) + else: + raise SidecarMetadataFormatError(error_msg) - if "Sample_ID" not in self.df.columns: - raise SidecarColumnNotFoundError("The 'Sample_ID' column is required in the metadata file.") + # Capture dimension-related warnings from the validator (e.g., samples in project but not in TSV) + # Other validation warnings (mixed types, commas, etc.) are for file quality and shown in CLI validation only + if result.warnings: + dimension_warnings = [w for w in result.warnings if "dimensions index" in w or "project" in w] + self.warnings.extend(dimension_warnings) - if self.df["Sample_ID"].isna().any() or (self.df["Sample_ID"] == "").any(): - raise SidecarSampleIDError( - "Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID." - ) - if self.df["Sample_ID"].duplicated().any(): - duplicates = self.df[self.df["Sample_ID"].duplicated()]["Sample_ID"].tolist() - raise SidecarSampleIDError(f"Duplicate Sample_IDs found: {duplicates}. Each Sample_ID must be unique.") - - semicolon_samples = self.df[self.df["Sample_ID"].str.contains(";", na=False)]["Sample_ID"].tolist() - if semicolon_samples: - raise SidecarSampleIDError( - f"Sample_ID column contains semicolons in values: {semicolon_samples}. " - "Sample_ID must contain only one value per row (semicolons are not allowed)." 
- ) + self.df = result.df except ( SidecarSampleIDError, SidecarColumnNotFoundError, SidecarInvalidFilterError, SidecarMetadataFormatError, + SidecarNoDataLoadedError, ): # Let validation errors propagate directly to user with specific error messages raise @@ -749,39 +784,6 @@ def load_file(self) -> "SidecarQueryManager": raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") from e return self - def _read_and_validate_raw_header(self) -> None: - """ - Read the first line of the TSV file and validate column names. This is intended to be run - before pandas loads the file, to catch issues that pandas would silently fix. Pandas for instance - rename duplicate columns (e.g., "Area", "Area.1") and empty column names. - """ - with open(self.file, "r", encoding="utf-8") as f: - first_line = f.readline().rstrip("\n\r") - - raw_columns = first_line.split("\t") - cleaned_columns = [col.lstrip("#") for col in raw_columns] - - empty_columns = [i + 1 for i, col in enumerate(cleaned_columns) if not col.strip()] - if empty_columns: - raise SidecarMetadataFormatError( - f"Empty column name(s) found at position(s): {empty_columns}. All columns must have a non-empty name." - ) - - seen = {} - duplicate_columns = [] - for col in cleaned_columns: - col_stripped = col.strip() - if col_stripped in seen: - if col_stripped not in duplicate_columns: - duplicate_columns.append(col_stripped) - else: - seen[col_stripped] = True - - if duplicate_columns: - raise SidecarMetadataFormatError( - f"Duplicate column names found: {duplicate_columns}. Each column name must be unique in the metadata file." - ) - def get_unique_values(self, column: str) -> list: """ Method to fetch unique values from a specific column in the query result. 
Intended to be invoked on a SidecarQueryManager @@ -1043,53 +1045,18 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": def _is_semicolon_separated_numeric_column(self, key: str) -> bool: """ - Helper method to detect if a column contains semicolon-separated numeric values. - Pandas correctly infers type from single-value columns. But for columns with semicolon-separated values - (e.g.: "1;2;3"), it infers them as strings (object dtype). This is an issue since numeric operations - (inequalities, ranges) cannot be performed on string values. - - This helper method checks ALL non-null values in the column to determine if they can be parsed as numeric - after splitting by semicolon. Note that it only detects if a column value is semicolon-separated numeric, it does not - convert the column to numeric type. The actual parsing and handling of the semicolon-separated numeric values is - done in the helper functions in the run_query() method. - - Returns True if the column contains ONLY numeric values (with or without semicolons). - Returns False if the column contains ANY non-numeric values (treated as a regular string column). - Returns False if the column is empty. - - This method does not raise errors for mixed types. Columns with a mix of numeric-looking and - non-numeric values (e.g., "8", "1a", "5a") are simply treated as string columns. + Helper method for the filtering logic to detect if a column contains semicolon-separated numeric values. + + Uses the shared validation logic to ensure consistency with the TSV validator. 
""" if key not in self.df.columns: return False - non_null_values = self.df[key].dropna() - if len(non_null_values) == 0: - return False - - for cell_value in non_null_values: - cell_str = str(cell_value).strip() - if not cell_str: - continue - - parts = cell_str.split(";") - for part in parts: - part = part.strip() - if not part: - continue - - try: - float(part) - except ValueError: - # Any non-numeric value means this column is not a numeric column. - # It will be treated as a string column instead. - return False - - return True + return self.metadata_validator.is_semicolon_separated_numeric_column(self.df[key]) def _is_mixed_type_column(self, key: str) -> tuple[bool, list[str], int]: """ - Helper method to detect if a non-numeric column has mixed types. + Helper method for the filtering logic to detect if a non-numeric column has mixed types. A column is considered mixed-type if it contains: 1. Both numeric-looking and non-numeric values (e.g., "8", "1a", "5a") @@ -1146,7 +1113,7 @@ def _is_mixed_type_column(self, key: str) -> tuple[bool, list[str], int]: def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string_values: str) -> list[str]: """ - Helper method to detect when a user's filter string contains inequality operators + Helper method for the filtering logic to detect when a user's filter string contains inequality operators (e.g. ">25", ">=10", ", <, >=, <=) as a prefix, regardless of whether the @@ -1171,7 +1138,7 @@ def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string def _split_cell_values(self, cell_value: Any) -> list[str]: """ - Helper method to split cell value by semicolon and return list of non-empty values. + Helper method for the filtering logic to split cell value by semicolon and return list of non-empty values. If the cell contains a single value without semicolon, it will return a list with that single value. If the cell is empty or NaN, it will return an empty list. 
""" @@ -1180,12 +1147,12 @@ def _split_cell_values(self, cell_value: Any) -> list[str]: return [val.strip() for val in str(cell_value).split(";") if val.strip()] def _parse_numeric_value(self, value_str: str) -> float | int: - """Helper method to parse a string value to int or float. To be used when other checks have already confirmed that the value can be parsed as numeric.""" + """Helper method for the filtering logic to parse a string value to int or float. To be used when other checks have already confirmed that the value can be parsed as numeric.""" return float(value_str) if "." in value_str else int(value_str) def _create_inequality_condition(self, key: str, operator: str, threshold: float) -> pd.Series: """ - Helper method to create a condition for inequality filtering on a column. + Helper method for the filtering logic to create a condition for inequality filtering on a column. Uses a named nested function instead of a lambda to improve readability to defined the logic that will be applied to the Pandas dataframe. """ @@ -1212,7 +1179,7 @@ def check_inequality(cell_value): def _create_range_condition(self, key: str, min_val: float, max_val: float) -> pd.Series: """ - Helper method to create a condition for range filtering on a column. + Helper method for the filtering logic to create a condition for range filtering on a column. Uses a named nested function instead of a lambda to improve readability to define the logic that will be applied to the Pandas dataframe. """ @@ -1234,7 +1201,7 @@ def check_range(cell_value): def _create_discrete_numeric_condition(self, key: str, target_values: list[float | int]) -> pd.Series: """ - Helper method to create a condition for discrete numeric value filtering on a column. + Helper method for the filtering logic to create a condition for discrete numeric value filtering on a column. Uses a named nested function instead of a lambda to improve readability to define the logic that will be applied to the Pandas dataframe. 
""" @@ -1256,7 +1223,7 @@ def check_discrete(cell_value): def _create_string_condition(self, key: str, target_values: list[str]) -> pd.Series: """ - Helper method to create a condition for string value filtering on a column. + Helper method for the filtering logic to create a condition for string value filtering on a column. Uses a named nested function instead of a lambda to improve readability to define the logic that will be applied to the Pandas dataframe. """ @@ -1271,7 +1238,7 @@ def check_string(cell_value): def _combine_conditions_with_or(self, conditions: list[pd.Series]) -> pd.Series: """ - Helper method to combine multiple Pandas boolean Series with OR logic. + Helper method for the filtering logic to combine multiple Pandas boolean Series with OR logic. Returns a single boolean Series that is True if any of the input conditions is True for each row. The resulting Series is used at the end of self.run_query() to filter the DataFrame values. @@ -1292,7 +1259,7 @@ def _build_condition_list( key: str, ) -> list[pd.Series]: """ - Helper method to build a list of conditions from inequality, range, and discrete value filters. + Helper method for the filtering logic to build a list of conditions from inequality, range, and discrete value filters. """ conditions = [] @@ -1312,7 +1279,7 @@ def _parse_numeric_filter_values( self, values_to_process: list[str], context: dict[str, str | bool] ) -> tuple[list[pd.Series], list[pd.Series], list[float | int]]: """ - Helper method to identify if a numeric filter values is an inequality, range, or discrete value and process it accordingly. + Helper method for the filtering logic to identify if a numeric filter values is an inequality, range, or discrete value and process it accordingly. 
The context dict is intended to keep the kwargs manageable when passing positive and negative values back-to-back: - key: Column name being filtered @@ -1374,7 +1341,7 @@ def _parse_numeric_filter_values( def _separate_positive_and_negated_values(self, filter_values: list[str]) -> tuple[list[str], list[str]]: """ - Helper method to separate filter values into positive and negated lists. + Helper method for the filtering logic to separate filter values into positive and negated lists. Values prefixed with '!' are negated (NOT conditions). """ positive_values = [] @@ -1391,7 +1358,7 @@ def _separate_positive_and_negated_values(self, filter_values: list[str]) -> tup def _apply_not_conditions(self, base_condition: pd.Series | None, negated_conditions: list[pd.Series]) -> pd.Series: """ - Helper method to apply NOT conditions to a base condition. The base condition contains positive filters combined with OR, or None if there were only negations + Helper method for the filtering logic to apply NOT conditions to a base condition. The base condition contains positive filters combined with OR, or None if there were only negations in the input filter string from the CLI. Returns a combined condition where rows must match base_condition AND NOT match any negated condition """ if base_condition is None: diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index cf83790e..8fe7c7a9 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -3,11 +3,14 @@ Requires that the dimensions index for the project is up to date and is fetched from the server. Validates formatting requirements without sending data to the server. + +Calls the shared sidecar metadata TSV validation logic from divbase_lib.metadata_validator. 
""" -import csv from pathlib import Path +from divbase_lib.metadata_validator import SharedMetadataValidator + class MetadataTSVValidator: """Validates sidecar metadata TSV files against DivBase requirements.""" @@ -28,236 +31,76 @@ def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tup """ Validate a TSV file and return results. + Since this is run on the client-side, it should try to detect all formatting errors and warnings. + Therefore, assuming that it is able read the local file, it does not raise exceptions like the SidecarQueryManager does for server-side validation, + but instead continues working through the file to collects all encountered errors and warnings. + Returns a tuple of (stats, errors, warnings) where stats is a dictionary of collected statistics, errors is a list of error messages, and warnings is a list of warning messages. """ validator = cls(file_path, project_samples) - try: - with open(validator.file_path, "r", newline="", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t") - rows = list(reader) - except Exception as e: - validator.errors.append(f"Failed to read file: {e}") - return validator.stats, validator.errors, validator.warnings - - if not rows: - validator.errors.append("File is empty") - return validator.stats, validator.errors, validator.warnings - - validator._validate_header(rows[0]) - - if len(rows) > 1: - validator._validate_data_rows(rows) - - return validator.stats, validator.errors, validator.warnings - - def _validate_header(self, header: list[str]) -> None: - """Validate the header row.""" - if not header: - self.errors.append("Header row is missing") - return - - if header[0] != "#Sample_ID": - self.errors.append(f"First column must be named '#Sample_ID', found: '{header[0]}'") - - # Check for duplicates after stripping '#' to ensures both "#Sample_ID" and "Sample_ID" are caught as duplicates. Matches server-side logic. 
- cleaned_header = [col.lstrip("#") for col in header] - if len(cleaned_header) != len(set(cleaned_header)): - duplicates = [col for col in cleaned_header if cleaned_header.count(col) > 1] - self.errors.append(f"Duplicate column names found: {set(duplicates)}") - - for i, col in enumerate(header): - if not col.strip(): - self.errors.append(f"Empty column name at position {i + 1}") - - def _validate_data_rows(self, rows: list[list[str]]) -> None: - """Validate all data rows.""" - header = rows[0] - data_rows = rows[1:] - - num_columns = len(header) - sample_ids_seen = set() - tsv_samples = set() - - column_types: dict[int, set[str]] = {i: set() for i in range(1, num_columns)} - empty_cells_per_column: dict[str, int] = {header[i]: 0 for i in range(1, num_columns)} - - has_multi_values = False - - for row_num, row in enumerate(data_rows, start=2): # Start at row 2 (after header) - if len(row) != num_columns: - sample_hint = f" (Sample_ID: '{row[0]}')" if row else "" - self.errors.append( - f"Row {row_num}: Expected {num_columns} tab-separated columns from reading the header, found {len(row)}{sample_hint}. " - "Check that all values are separated by tabs (not spaces)." - ) - continue - - sample_id = row[0].strip() if row else "" - - if not sample_id: - self.errors.append(f"Row {row_num}: Sample_ID is empty") - continue - - if ";" in sample_id: - self.errors.append( - f"Row {row_num}: Sample_ID '{sample_id}' contains semicolon. Sample_ID must contain only one value." - ) - - if sample_id in sample_ids_seen: - self.errors.append(f"Row {row_num}: Duplicate Sample_ID '{sample_id}'") - else: - sample_ids_seen.add(sample_id) - tsv_samples.add(sample_id) - - for col_idx, cell in enumerate(row): - self._validate_cell(row_num, col_idx, header[col_idx], cell) - - # Track column types and empty-cells for user-defined columns (skip col 0, i.e. 
Sample_ID) - if col_idx > 0: - if not cell.strip(): - empty_cells_per_column[header[col_idx]] += 1 - - if ";" in cell: - has_multi_values = True - self._infer_column_type(row_num, col_idx, header[col_idx], cell, column_types) + shared_validator = SharedMetadataValidator( + file_path=file_path, + project_samples=validator.project_samples, + skip_dimensions_check=False, + ) + result = shared_validator.load_and_validate() - self._check_mixed_types(header, column_types) + validator.errors = result.errors + validator.warnings = result.warnings - self._validate_sample_names(tsv_samples) - - self._collect_statistics(header, tsv_samples, column_types, has_multi_values, empty_cells_per_column) - - def _validate_cell(self, row_num: int, col_idx: int, col_name: str, cell: str) -> None: - """Validate an individual cell.""" - - if cell != cell.strip(): - self.warnings.append( - f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " - "(will be stripped by server)" - ) - - if col_idx > 0 and not cell.strip(): - self.warnings.append( - f"Row {row_num}, Column '{col_name}': Cell is empty. " - "Empty values will be treated as missing by the server and will not match any filter conditions in queries." - ) - - def _infer_column_type( - self, row_num: int, col_idx: int, col_name: str, cell: str, column_types: dict[int, set[str]] - ) -> None: - """ - Infer the type of values in a column and track type information. - Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column - - Columns with a mix of numeric-looking and non-numeric values (e.g., "8", "1a", "5a") - are treated as string columns. Mixed types are reported as warnings, not errors. - """ - values = [v.strip() for v in cell.split(";") if v.strip()] - - cell_has_numeric = False - cell_has_string = False - - for value in values: - # Try to determine if numeric or string first - # Note! 
The queries use Pandas for this which is not used here due to different dependencies in the packages. There could potentially be a discrepancy here. + if result.df is not None and "Sample_ID" in result.df.columns: try: - float(value) - cell_has_numeric = True - column_types[col_idx].add("numeric") - except ValueError: - cell_has_string = True - column_types[col_idx].add("string") - - # Check for mixed types within the same cell (e.g., "1;abc") and warn the user if applicable - if cell_has_numeric and cell_has_string: - self.warnings.append( - f"Row {row_num}, Column '{col_name}': Cell '{cell}' contains mixed types " - f"(both numeric and non-numeric values in semicolon-separated cell). " - f"This column will be treated as a string column." - ) - - def _check_mixed_types(self, header: list[str], column_types: dict[int, set[str]]) -> None: - """ - Check for mixed types in columns and report as informational warnings. - Matches server-side logic in queries.py::_is_semicolon_separated_numeric_column - - Columns with mixed types are treated as string columns by the DivBase query engine. - This happens for values such as "8", "1a", "5a" that happen to look numeric but - are semantically strings (e.g. names, IDs). - """ - mixed_columns = [] - for col_idx, types in column_types.items(): - if len(types) > 1: - col_name = header[col_idx] - mixed_columns.append(col_name) - - if mixed_columns: - self.warnings.append( - "Clarification on mixed types columns: " - "Columns are be treated as string by DivBase if it contain a mix of numeric and non-numeric values or numeric-looking values with extra characters (for example commas, hyphens, or range-like patterns such as '1-2'). " - "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " - "Use semicolons (;) to separate multiple numeric values. " - "Numeric query operations (ranges, inequalities) will not be applicable to string columns." 
- ) - - def _validate_sample_names(self, tsv_samples: set[str]) -> None: - """Validate sample names against project dimensions.""" - - missing_from_project = tsv_samples - self.project_samples - if missing_from_project: - examples = sorted(list(missing_from_project)) - self.errors.append( - f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. " - "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries." - ) + tsv_samples = set(result.df["Sample_ID"].tolist()) + validator._collect_statistics( + df=result.df, + tsv_samples=tsv_samples, + numeric_cols=result.numeric_columns, + string_cols=result.string_columns, + mixed_cols=result.mixed_type_columns, + ) + except (AttributeError, TypeError): + # If Sample_ID access fails (e.g., in the very rare case that duplicate Sample_ID column make it a DataFrame due to dataframe nesting), skip statistics as the validation errors already captured the issue + pass - missing_from_tsv = self.project_samples - tsv_samples - if missing_from_tsv: - examples = sorted(list(missing_from_tsv)) - self.warnings.append( - f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " - "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file." 
- ) + return validator.stats, validator.errors, validator.warnings def _collect_statistics( self, - header: list[str], + df, tsv_samples: set[str], - column_types: dict[int, set[str]], - has_multi_values: bool, - empty_cells_per_column: dict[str, int], + numeric_cols: list[str], + string_cols: list[str], + mixed_cols: list[str], ) -> None: """Collect statistics about the TSV file.""" - self.stats["total_columns"] = len(header) - self.stats["user_defined_columns"] = len(header) - 1 # Exclude Sample_ID + self.stats["total_columns"] = len(df.columns) + self.stats["user_defined_columns"] = len(df.columns) - 1 # Exclude Sample_ID matching_samples = tsv_samples & self.project_samples self.stats["samples_in_tsv"] = len(tsv_samples) self.stats["samples_matching_project"] = len(matching_samples) self.stats["total_project_samples"] = len(self.project_samples) - numeric_cols = [] - string_cols = [] - mixed_cols = [] - - for col_idx, types in column_types.items(): - col_name = header[col_idx] - if len(types) > 1: - mixed_cols.append(col_name) - elif "numeric" in types: - numeric_cols.append(col_name) - elif "string" in types: - string_cols.append(col_name) - self.stats["numeric_columns"] = numeric_cols self.stats["string_columns"] = string_cols self.stats["mixed_type_columns"] = mixed_cols + + has_multi_values = False + for col in df.columns: + if df[col].astype(str).str.contains(";", na=False).any(): + has_multi_values = True + break self.stats["has_multi_values"] = has_multi_values - # Only include columns with empty cells in stats - columns_with_empty_cells = {col: count for col, count in empty_cells_per_column.items() if count > 0} - if columns_with_empty_cells: - self.stats["empty_cells_per_column"] = columns_with_empty_cells + empty_cells_per_column = {} + for col in df.columns: + if col != "Sample_ID": + empty_count = df[col].isna().sum() + (df[col] == "").sum() + if empty_count > 0: + empty_cells_per_column[col] = int(empty_count) + + if empty_cells_per_column: + 
self.stats["empty_cells_per_column"] = empty_cells_per_column diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py new file mode 100644 index 00000000..889b7160 --- /dev/null +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -0,0 +1,399 @@ +""" +Shared validation logic for DivBase sidecar metadata TSV files. + +This file contains the single source of truth for the TSV content validation logic used by both +the CLI validator (MetadataTSVValidator) on the client-side, and the SidecarQueryManager on the server-side. + +Note! Logic for the queries themselves (e.g. how filtering is handled) is not shared between the two. +This file is only for validation of the contents of the TSV file, not for query processing. +""" + +import csv +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import pandas as pd + + +@dataclass +class MetadataValidationResult: + """ + Dataclass to hold the results of the TSV file validation. + """ + + errors: list[str] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + stats: dict[str, Any] = field(default_factory=dict) + df: pd.DataFrame | None = None + mixed_type_columns: list[str] = field(default_factory=list) + numeric_columns: list[str] = field(default_factory=list) + string_columns: list[str] = field(default_factory=list) + + +class SharedMetadataValidator: + """ + Core validation logic for DivBase sidecar metadata TSV files. Shared between client-side MetadataTSVValidator + and server-side SidecarQueryManager to ensure consistent validation behavior. + + It does not validate metadata query filters. That is handled in the SidecarQueryManager. 
+ + This class handles the following validation of the TSV content: + - Header (duplicates, empty columns, first column name) + - Sample_ID (empty, duplicates, semicolons) + - Column type (numeric vs string, semicolon-separated values) + - Data format (commas, whitespace, column count) + - Mixed type detection and warnings + """ + + def __init__(self, file_path: Path, project_samples: set[str] | None = None, skip_dimensions_check: bool = False): + self.file_path = file_path + self.project_samples = project_samples + self.skip_dimensions_check = skip_dimensions_check + self.result = MetadataValidationResult() + self.df: pd.DataFrame | None = None + + def load_and_validate(self) -> MetadataValidationResult: + """ + Main entry point to the class. Load a TSV file and call helper methods to validate it. + """ + try: + with open(self.file_path, "r", newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t") + rows = list(reader) + + if not rows: + self.result.errors.append("File is empty") + return self.result + + # Pre-pandas checks + first_line = "\t".join(rows[0]) + header_errors, header_warnings = self._validate_raw_header(first_line) + self.result.errors.extend(header_errors) + self.result.warnings.extend(header_warnings) + + if len(rows) > 1: + row_errors, row_warnings = self._check_row_formatting(rows) + self.result.errors.extend(row_errors) + self.result.warnings.extend(row_warnings) + + # Initiate Pandas dataframe from TSV + df = pd.read_csv(self.file_path, sep="\t", skipinitialspace=True, on_bad_lines="skip") + df.columns = df.columns.str.lstrip("#") + + for col in df.columns: + df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) + + self.result.df = df + self.df = df + + # Dataframe checks + sample_id_errors, sample_id_warnings = self._validate_sample_ids(df) + self.result.errors.extend(sample_id_errors) + self.result.warnings.extend(sample_id_warnings) + + mixed_type_columns, cell_warnings = 
self._detect_mixed_type_columns(df) + self.result.warnings.extend(cell_warnings) + self.result.mixed_type_columns = mixed_type_columns + + mixed_type_warning = self._generate_mixed_type_warning(mixed_type_columns) + if mixed_type_warning: + self.result.warnings.append(mixed_type_warning) + + comma_warnings = self._check_for_commas(df) + self.result.warnings.extend(comma_warnings) + + numeric_cols, string_cols, mixed_cols = self._classify_columns(df, mixed_type_columns) + self.result.numeric_columns = numeric_cols + self.result.string_columns = string_cols + + if not self.skip_dimensions_check and self.project_samples is not None: + tsv_samples = set(df["Sample_ID"].tolist()) if "Sample_ID" in df.columns else set() + dim_errors, dim_warnings = self._validate_dimensions_match(tsv_samples, self.project_samples) + self.result.errors.extend(dim_errors) + self.result.warnings.extend(dim_warnings) + + except Exception as e: + self.result.errors.append(f"Failed to read file: {e}") + + return self.result + + def _validate_raw_header(self, header_line: str) -> tuple[list[str], list[str]]: + """ + Validate the raw header line before pandas processing. + + Attempts to catch issues that pandas would silently fix (like duplicate columns). + + """ + errors = [] + + raw_columns = header_line.split("\t") + cleaned_columns = [col.lstrip("#") for col in raw_columns] + + empty_columns = [i + 1 for i, col in enumerate(cleaned_columns) if not col.strip()] + if empty_columns: + errors.append( + f"Empty column name(s) found at position(s): {empty_columns}. All columns must have a non-empty name." + ) + + # Check for duplicate columns after stripping '#' + seen = {} + duplicate_columns = [] + for col in cleaned_columns: + col_stripped = col.strip() + if col_stripped in seen: + if col_stripped not in duplicate_columns: + duplicate_columns.append(col_stripped) + else: + seen[col_stripped] = True + + if duplicate_columns: + errors.append( + f"Duplicate column names found: {duplicate_columns}. 
" + "Each column name must be unique in the metadata file." + ) + + if raw_columns and raw_columns[0] != "#Sample_ID": + errors.append(f"First column must be named '#Sample_ID', found: '{raw_columns[0]}'") + + return errors, [] + + def _check_row_formatting(self, rows: list[list[str]]) -> tuple[list[str], list[str]]: + """ + Check for row-level formatting issues that pandas might handle silently. + """ + errors = [] + warnings = [] + + header = rows[0] + num_columns = len(header) + + for row_num, row in enumerate(rows[1:], start=2): # Start at row 2 (i.e. skip header) + if len(row) != num_columns: + sample_hint = f" (Sample_ID: '{row[0]}')" if row else "" + errors.append( + f"Row {row_num}: Expected {num_columns} tab-separated columns from reading the header, " + f"found {len(row)}{sample_hint}. " + "Check that all cells in the TSV are separated by tabs (not spaces)." + ) + continue + + for col_idx, cell in enumerate(row): + if cell != cell.strip(): + col_name = header[col_idx] + warnings.append( + f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " + "(this is allowed, but note that they will be stripped by DivBase server when the TSV is used for queries)" + ) + + if col_idx == 0 and not cell.strip(): + errors.append(f"Row {row_num}: Sample_ID is empty") + + return errors, warnings + + def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: + """ + Validate Sample_ID column in the DataFrame. + + """ + errors = [] + + if "Sample_ID" not in df.columns: + errors.append("The 'Sample_ID' column is required in the metadata file.") + return errors, [] + + if df["Sample_ID"].isna().any() or (df["Sample_ID"] == "").any(): + errors.append("Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID.") + + if df["Sample_ID"].duplicated().any(): + duplicates = df[df["Sample_ID"].duplicated()]["Sample_ID"].tolist() + errors.append(f"Duplicate Sample_IDs found: {duplicates}. 
Each Sample_ID must be unique.") + + semicolon_samples = df[df["Sample_ID"].str.contains(";", na=False)]["Sample_ID"].tolist() + if semicolon_samples: + errors.append( + f"Sample_ID column contains semicolons in values: {semicolon_samples}. " + "Sample_ID must contain only one value per row (semicolons are not allowed)." + ) + + return errors, [] + + def is_semicolon_separated_numeric_column(self, series: pd.Series) -> bool: + """ + Determine if a column contains semicolon-separated numeric values. + + Pandas infers "1;2;3" as string object dtype. This method checks if all + non-null values in the column can be parsed as numeric after splitting by semicolon. + """ + non_null_values = series.dropna() + if len(non_null_values) == 0: + return False + + for cell_value in non_null_values: + if not isinstance(cell_value, str): + try: + float(cell_value) + continue + except (ValueError, TypeError): + return False + + # Also check each semicolon-separated part of the cell value + parts = [p.strip() for p in str(cell_value).split(";") if p.strip()] + for part in parts: + try: + float(part) + except ValueError: + return False + + return True + + def _detect_mixed_type_columns(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: + """ + Detect columns with mixed types (numeric and non-numeric values). 
+ """ + mixed_type_columns = [] + cell_warnings = [] + + for col in df.columns: + if col == "Sample_ID": + continue + + series = df[col] + non_null_values = series.dropna() + + if len(non_null_values) == 0: + continue + + has_numeric = False + has_string = False + + for idx, cell_value in non_null_values.items(): + if isinstance(cell_value, str) and ";" in cell_value: + # Mutli-value cell + parts = [p.strip() for p in cell_value.split(";") if p.strip()] + cell_has_numeric = False + cell_has_string = False + + for part in parts: + try: + float(part) + cell_has_numeric = True + except ValueError: + cell_has_string = True + + if cell_has_numeric and cell_has_string: + cell_warnings.append( + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains mixed types " + f"(both numeric and non-numeric values in semicolon-separated cell). " + f"This column will be treated as a string column." + ) + has_string = True + elif cell_has_numeric: + has_numeric = True + else: + has_string = True + else: + # Single value cell + try: + float(cell_value) + has_numeric = True + except (ValueError, TypeError): + has_string = True + + if has_numeric and has_string: + mixed_type_columns.append(col) + + return mixed_type_columns, cell_warnings + + def _generate_mixed_type_warning(self, mixed_columns: list[str]) -> str | None: + """ + Generate warning about mixed-type columns. + """ + if not mixed_columns: + return None + + return ( + "Clarification on mixed types columns: " + "Columns are treated as string by DivBase if they contain a mix of numeric and non-numeric values " + "or numeric-looking values with extra characters (for example commas, hyphens, or range-like patterns such as '1-2'). " + "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " + "Use semicolons (;) to separate multiple numeric values. " + "Numeric query operations (ranges, inequalities) will not be applicable to string columns." 
+ ) + + def _classify_columns( + self, df: pd.DataFrame, mixed_type_columns: list[str] + ) -> tuple[list[str], list[str], list[str]]: + """ + Classify columns as numeric, string, or mixed-type. + """ + numeric_cols = [] + string_cols = [] + + for col in df.columns: + if col == "Sample_ID": + continue + + if col in mixed_type_columns: + continue + + series = df[col] + + if pd.api.types.is_numeric_dtype(series) or self.is_semicolon_separated_numeric_column(series): + numeric_cols.append(col) + else: + string_cols.append(col) + + return numeric_cols, string_cols, mixed_type_columns + + def _check_for_commas(self, df: pd.DataFrame) -> list[str]: + """ + Check for comma-separated values in any cell (warns user to use semicolons). + """ + warnings = [] + + for col in df.columns: + series = df[col] + # Only check string columns + if not pd.api.types.is_string_dtype(series) and not pd.api.types.is_object_dtype(series): + continue + + for idx, cell_value in series.items(): + if isinstance(cell_value, str) and "," in cell_value: + warnings.append( + f"Row {idx + 2}, Column '{col}': Cell contains comma. " + "Use semicolons (;) to separate multiple values, not commas." + ) + break # Only warn once per column + + return warnings + + def _validate_dimensions_match( + self, tsv_samples: set[str], project_samples: set[str] + ) -> tuple[list[str], list[str]]: + """ + Validate that TSV samples match project dimensions. + """ + + # TODO consider the fact that the query route also runs _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket in tasks.py before even reaching the SharedMetadataValidator... + + errors = [] + warnings = [] + + missing_from_project = tsv_samples - project_samples + if missing_from_project: + examples = sorted(list(missing_from_project)) + errors.append( + f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. 
" + "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries." + ) + + missing_from_tsv = project_samples - tsv_samples + if missing_from_tsv: + examples = sorted(list(missing_from_tsv)) + warnings.append( + f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " + "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file." + ) + + return errors, warnings diff --git a/tests/e2e_integration/cli_commands/test_query_cli.py b/tests/e2e_integration/cli_commands/test_query_cli.py index 68510c5c..e4b18746 100644 --- a/tests/e2e_integration/cli_commands/test_query_cli.py +++ b/tests/e2e_integration/cli_commands/test_query_cli.py @@ -544,7 +544,7 @@ def test_error_in_terminal_when_duplicate_sample_IDs_in_tsv( ) tsv_file = tmp_path / "test_duplicate_sample_ids.tsv" - tsv_file.write_text("Sample_ID\tArea\nS1\tNorth\nS1\tSouth\nS2\tEast\n") + tsv_file.write_text("#Sample_ID\tArea\nS1\tNorth\nS1\tSouth\n8_HOM-E59\tEast\n") command = f"files upload {tsv_file} --project {project_name}" result = runner.invoke(app, command) assert result.exit_code == 0 @@ -579,7 +579,7 @@ def test_error_in_terminal_for_comma_in_metadata( ) tsv_file = tmp_path / "test_comma_in_data.tsv" - tsv_file.write_text("Sample_ID\tPopulation\nS1\t1,2\nS2\t3\n") + tsv_file.write_text("#Sample_ID\tPopulation\n8_HOM-E57\t1,2\n8_HOM-E59\t3\n") command = f"files upload {tsv_file} --project {project_name}" result = runner.invoke(app, command) assert result.exit_code == 0 diff --git a/tests/e2e_integration/queries/conftest.py b/tests/e2e_integration/queries/conftest.py index 202361e3..21a1256b 100644 --- a/tests/e2e_integration/queries/conftest.py +++ b/tests/e2e_integration/queries/conftest.py @@ -46,7 +46,7 @@ def example_sidecar_metadata_inputs_outputs() -> dict[str, Any]: def sample_tsv_file(tmp_path: 
Path) -> Path: """Create a sample TSV file for testing.""" data = { - "Sample_ID": ["S1", "S2", "S3", "S4", "S5"], + "#Sample_ID": ["S1", "S2", "S3", "S4", "S5"], "Population": ["Pop1", "Pop1", "Pop2", "Pop2", "Pop3"], "Sex": ["M", "F", "M", "F", "M"], } @@ -126,7 +126,7 @@ def mock_download_files_command( @pytest.fixture def valid_tsv_path(): with tempfile.NamedTemporaryFile("w", delete=False, suffix=".tsv") as tmp: - tmp.write("Sample_ID\tFilename\tcol1\tcol2\nS1\tS1.vcf\tA\t1\nS2\tS2.vcf\tB\t2\nS3\tS3.vcf\tB\t3\n") + tmp.write("#Sample_ID\tFilename\tcol1\tcol2\nS1\tS1.vcf\tA\t1\nS2\tS2.vcf\tB\t2\nS3\tS3.vcf\tB\t3\n") tmp_path = Path(tmp.name) yield tmp_path tmp_path.unlink() diff --git a/tests/e2e_integration/queries/test_SidecarQueryManager.py b/tests/e2e_integration/queries/test_SidecarQueryManager.py index 2d7be203..db31b94b 100644 --- a/tests/e2e_integration/queries/test_SidecarQueryManager.py +++ b/tests/e2e_integration/queries/test_SidecarQueryManager.py @@ -37,7 +37,7 @@ def test_run_query_invalid_filter_raises_custom_exception(create_sidecar_manager when an invalid filter format is provided.""" with tempfile.NamedTemporaryFile("w", delete=False, suffix=".tsv") as tmp_valid_tsv: - tmp_valid_tsv.write("Sample_ID\tFilename\ncol1\tcol2\nA\t1\nB\t2\n") + tmp_valid_tsv.write("#Sample_ID\tFilename\ncol1\tcol2\nA\t1\nB\t2\n") tmp_path = Path(tmp_valid_tsv.name) manager = create_sidecar_manager(tmp_path) diff --git a/tests/fixtures/sample_metadata_HOM_files_that_need_mixed_bcftools_concat_and_merge.tsv b/tests/fixtures/sample_metadata_HOM_files_that_need_mixed_bcftools_concat_and_merge.tsv index 0b1ee487..775cf0eb 100644 --- a/tests/fixtures/sample_metadata_HOM_files_that_need_mixed_bcftools_concat_and_merge.tsv +++ b/tests/fixtures/sample_metadata_HOM_files_that_need_mixed_bcftools_concat_and_merge.tsv @@ -5,7 +5,7 @@ 8_HOM-E74 8 NES E74 Northern Spanish shelf 2016 Western M -2.78305556 43.59416667 8_HOM-E78 8 NES E78 Northern Spanish shelf 2016 Western M 
-2.78305556 43.59416667 1a_HOM-G34 1a WIE1 G34 West of Ireland 2016 Western M -10.617 54.417 -5a_HOM-I13 5a NPT1 I13 Northern Portugal 2016 Southern F -9.2 +5a_HOM-I13 5a NPT1 I13 Northern Portugal 2016 Southern F -9.2 39.82833333 5a_HOM-I14 5a NPT1 I14 Northern Portugal 2016 Southern F -9.2 39.82833333 5a_HOM-I20 5a NPT1 I20 Northern Portugal 2016 Southern F -9.2 39.82833333 5a_HOM-I21 5a NPT1 I21 Northern Portugal 2016 Southern F -9.2 39.82833333 From ce5e27ff8a0850315e0e5fc170a5c8867a81dc88 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 10:28:44 +0100 Subject: [PATCH 084/100] Update query overview with sample metadata text --- docs/user-guides/running-queries.md | 48 ++++++++++++++++++++-------- docs/user-guides/sidecar-metadata.md | 2 +- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/docs/user-guides/running-queries.md b/docs/user-guides/running-queries.md index ec95b61e..65766eff 100644 --- a/docs/user-guides/running-queries.md +++ b/docs/user-guides/running-queries.md @@ -14,40 +14,60 @@ TODO The system will use the latest version the the files for all queries. -TODO COPIED OVER FROM QUICKSTART REIMPLEMENT +## Before running any queries: run the VCF dimensions command -DivBase only allows `bcftools view` in its query syntax and no other `bcftools` commands. The `merge`, `concat`, and `annotate` commands are used when processing a query, but should not be defined by the user. +TODO - finish writing this section -## Side car sample metadata queries +For more details see [VCF Dimensions caching](vcf-dimensions.md) -For more details, see [Sidecar Metadata TSV files: creating and querying sample metadata files](sidecar-metadata.md). +For performance reasons and to ensure query feasibility, key metadata from the VCF files must first be cached in DivBase. 
+For each VCF in the DivBase project, the system extracts file name, number and name of all samples, number and name of all scaffolds, number of variants, -how to write the sample metadata TSV (+template) +DivBase will use this whenever a user submits a query to the project. For instance, the user might make a sample metadata filtering that results in only certain samples. The system knows which file names each requested sample are located in, and will ensure that only those files will be transferred to the worker. + +example +dimensions show how to use dimensions show to get all samples in the project -how to write query +## Sidecar sample metadata queries -## Before any VCF queries: run the VCF dimensions command +DivBase supports that users store extensive sample metadata in a separate TSV file. This metadata can be queried on its own or in combination with VCF data. Users are free to define their own metadata as they see fit: column names represent metadata categories and rows represent the samples found in the VCF files in the DivBase project. The TSVs need to follow a few mandatory requirements, but no strict metadata schema is enforced. This allows DivBase to accommodate a variety of research projects with different metadata needs. -For more details see [VCF Dimensions caching](vcf-dimensions.md) +The metadata can be queried on its own to learn which samples fulfil a certain metadata query and the VCF files the samples are present in. The same query syntax is used in the combined sample metadata and VCF data queries, and users can use the dedicated sample metadata query command as a dry-run before running a full combined query to ensure that the metadata query produces the results the user intended. -For performance reasons and to ensure query feasibility, key metadata from the VCF files must first be cached in DivBase. 
-For each VCF in the DivBase project, the system extracts file name, number and name of all samples, number and name of all scaffolds, number of variants, +For instructions on how to create the sidecar sample metadata TSV files and how to run sample metadata queries, see the guide on [Sidecar Metadata TSV files: creating and querying sample metadata files](sidecar-metadata.md). The guide also describes the CLI commands that specifically relate to the sample metadata TSV files. These are: -DivBase will use this whenever a user submits a query to the project. For instance, the user might make a sample metadata filtering that results in only certain samples. The system knows which file names each requested sample are located in, and will ensure that only those files will be transferred to the worker. +```bash +divbase-cli dimensions create-metadata-template -example -dimensions show +divbase-cli dimensions validate-metadata-file path/to/your/sample_metadata.tsv + +divbase-cli files upload path/to/your/sample_metadata.tsv + +divbase-cli query tsv "Area:Northern Portugal" +``` ## VCF queries +TODO - finish writing this section + +Run the query on all VCF files in the DivBase project unless specified. There are two ways to specify the files: either as direct input to the `bcftools` command, or by combining the VCF query with a sample metadata query to determine which VCF files to use. + See also [DivBase Query Syntax for VCF data](query-syntax.md), [How to create efficient DivBase queries](how-to-create-efficient-divbase-queries.md), and [Tutorial: Running a query on a public dataset](tutorial-query-on-public-data.md). -can be run with or without sample metadata filtering +can be run with or without sample metadata filtering. 
for sample metadata linked VCF queries, it can be good to do a dry run first [TO BE IMPLEMENTED, at the moment it needs to be run seperatelly] how to write a query how to optimize a query (see separate markdown) + +TODO COPIED OVER FROM QUICKSTART REIMPLEMENT + +DivBase only allows `bcftools view` in its query syntax and no other `bcftools` commands. The `merge`, `concat`, and `annotate` commands are used when processing a query, but should not be defined by the user. + +## Combined sample metadata and VCF data query + +Uses a sample metadata query to identify the VCF files in the DivBase project to run the VCF queries on. diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 40763916..941f0f60 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -87,7 +87,7 @@ S7 1;3;5 South 22.6 S8 2 West 19.5 ``` -For the sake of the demonstration later in this guide, let's assume that this TSV file have been uploaded to a DivBase project among with two VCF files where samples S1-S4 are found in 'file1.vcf.gz' and S5-S6 in 'file2.vcf.gz'. Let's also assume that the `divbase-cli dimensions update` has been run after all files have been uploaded so that the system has up-to-data information on which sample is found in which file. +For the sake of the demonstration later in this guide, let's assume that this TSV file have been uploaded to a DivBase project among with two VCF files where samples S1-S4 are found in `file1.vcf.gz` and S5-S6 in `file2.vcf.gz`. Let's also assume that the `divbase-cli dimensions update` has been run after all files have been uploaded so that the system has up-to-data information on which sample is found in which file. 
### Validating a sidecar metadata TSV with `divbase-cli` From d35419e8c11bcb92cdf64acb0e50629acdddb4dc Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 11:15:22 +0100 Subject: [PATCH 085/100] Add script to print df from TSV using shared logic This script runs SharedMetadataValidator, i.e. same logic that the CLI and API uses when loading a sample metadata TSV file. Thus, this script can be used to ensure that the TSV results in the expected DataFrame structure after being ingested by DivBase. --- scripts/tsv_to_dataframe.py | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 scripts/tsv_to_dataframe.py diff --git a/scripts/tsv_to_dataframe.py b/scripts/tsv_to_dataframe.py new file mode 100644 index 00000000..59beaf87 --- /dev/null +++ b/scripts/tsv_to_dataframe.py @@ -0,0 +1,60 @@ +""" +Script to load a sample metadata TSV file to a Pandas DataFrame using the SharedMetadataValidator and +display the df in the terminal. Can be used to ensure that the TSV results in the expected DataFrame structure, +and to inspect the DataFrame for debugging purposes. + +Usage: + +python scripts/tsv_to_dataframe.py --tsv path/to/metadata.tsv + +""" + +import argparse + +import pandas as pd + +from divbase_lib.metadata_validator import MetadataValidationResult, SharedMetadataValidator + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Load a sample metadata TSV file to Pandas dataframe using the SharedMetadataValidator." + ) + parser.add_argument( + "--tsv", + type=str, + required=True, + help="Path to the TSV file to be loaded.", + ) + return parser.parse_args() + + +def tsv_to_dataframe(file_path) -> MetadataValidationResult: + """ + Reads a TSV file and returns a pandas DataFrame. Just runs the loading and validation logic, but does not + print the results like the client-side MetadataTSVValidator does. + + Allows for inspection of of the dataframe. 
+ """ + validator = SharedMetadataValidator(file_path=file_path, project_samples=set(), skip_dimensions_check=True) + result = validator.load_and_validate() + return result.df + + +def main(): + args = parse_arguments() + df = tsv_to_dataframe(args.tsv) + if df is not None: + print(df.head()) + else: + print("Failed to load DataFrame. Check if the file exists and is a valid TSV.") + + pd.set_option("display.max_columns", None) # Show all columns + pd.set_option("display.width", 120) # Set terminal width + pd.set_option("display.max_rows", None) # Show all rows + + print(df) + + +if __name__ == "__main__": + main() From 3097371f82d4de88991c35495ccc157c0941e1e7 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 11:49:51 +0100 Subject: [PATCH 086/100] Add checks and warnings for array notation in TSV If users include Python/JSON-style array notation (e.g., '[1, 2, 3]') in their sample metadata TSV, it will be treated as a string column by DivBase. The warning tells that to the user and advises them to use semicolons (e.g., '1;2;3') instead for multi value columns. --- .../src/divbase_api/services/queries.py | 8 +- .../src/divbase_lib/metadata_validator.py | 27 +++++++ .../test_sample_metadata_queries.py | 50 ++++++++++++ .../test_sample_metadata_tsv_validator.py | 81 +++++++++++++++++++ 4 files changed, 164 insertions(+), 2 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 34dace97..f725908a 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -762,11 +762,15 @@ def load_file(self) -> "SidecarQueryManager": else: raise SidecarMetadataFormatError(error_msg) - # Capture dimension-related warnings from the validator (e.g., samples in project but not in TSV) - # Other validation warnings (mixed types, commas, etc.) 
are for file quality and shown in CLI validation only + # Capture dimension-related warnings and array notation warnings from the validator. + # Array notation warnings are forwarded because they directly affect query behaviour (the column + # will be string instead of numeric, so numeric filter syntax will not work as expected). + # Other file-quality warnings (mixed types, commas, etc.) are shown in CLI validation only. if result.warnings: dimension_warnings = [w for w in result.warnings if "dimensions index" in w or "project" in w] + array_notation_warnings = [w for w in result.warnings if "array notation" in w.lower()] self.warnings.extend(dimension_warnings) + self.warnings.extend(array_notation_warnings) self.df = result.df diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index 889b7160..866f3f40 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -103,6 +103,9 @@ def load_and_validate(self) -> MetadataValidationResult: comma_warnings = self._check_for_commas(df) self.result.warnings.extend(comma_warnings) + array_notation_warnings = self._check_for_array_notation(df) + self.result.warnings.extend(array_notation_warnings) + numeric_cols, string_cols, mixed_cols = self._classify_columns(df, mixed_type_columns) self.result.numeric_columns = numeric_cols self.result.string_columns = string_cols @@ -368,6 +371,30 @@ def _check_for_commas(self, df: pd.DataFrame) -> list[str]: return warnings + def _check_for_array_notation(self, df: pd.DataFrame) -> list[str]: + """ + Check for Python/JSON-style array notation (e.g. '[1, 2, 3]') in any cell. + Array notation is not supported by DivBase - the column will be treated as a string. 
+ """ + warnings = [] + + for col in df.columns: + series = df[col] + if not pd.api.types.is_string_dtype(series) and not pd.api.types.is_object_dtype(series): + continue + + for idx, cell_value in series.items(): + if isinstance(cell_value, str) and cell_value.startswith("[") and cell_value.endswith("]"): + warnings.append( + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' uses array notation '[...]'. " + "DivBase does not support Python/JSON array notation. " + "This column will be treated as a string. " + "Use semicolons (;) to separate multiple values instead (e.g., '1;2;3')." + ) + break + + return warnings + def _validate_dimensions_match( self, tsv_samples: set[str], project_samples: set[str] ) -> tuple[list[str], list[str]]: diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index bb6d6210..daadd01a 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -148,6 +148,30 @@ def sample_tsv_with_semicolon_mixed_type_column(tmp_path): return tsv_file +@pytest.fixture +def array_notation_tsv(tmp_path): + """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t[1, 2, 3]\tNorth\n" + content += "S2\t4\tEast\n" + content += "S3\t5\tSouth\n" + tsv_file = tmp_path / "array_notation.tsv" + tsv_file.write_text(content) + return tsv_file + + +@pytest.fixture +def array_notation_multiple_cols_tsv(tmp_path): + """TSV where multiple columns use '[...]' array notation.""" + content = "#Sample_ID\tPopulation\tScores\n" + content += "S1\t[1, 2]\t[10, 20, 30]\n" + content += "S2\t3\t[40]\n" + content += "S3\t5\t60\n" + tsv_file = tmp_path / "array_notation_multi.tsv" + tsv_file.write_text(content) + return tsv_file + + class TestNumericalFilteringInequalities: """Test inequality operators on numerical columns.""" @@ -1047,3 +1071,29 @@ def 
test_valid_file_loads_successfully(self, sample_tsv_with_edge_cases): manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) assert manager.df is not None assert "Sample_ID" in manager.df.columns + + +class TestArrayNotation: + """Test that Python/JSON-style array notation '[...]' in cells produces a warning and is treated as string.""" + + def test_array_notation_warning_content(self, array_notation_tsv): + """Test that array notation loads without error, produces a warning (not an error), + the warning mentions semicolons, and exactly one warning is emitted per offending column.""" + manager = SidecarQueryManager(file=array_notation_tsv) + + assert manager.df is not None + assert any("array notation" in w.lower() for w in manager.warnings) + assert any("semicolon" in w.lower() and "array notation" in w.lower() for w in manager.warnings) + assert len([w for w in manager.warnings if "array notation" in w.lower() and "Population" in w]) == 1 + + def test_array_notation_column_is_filterable_as_string(self, array_notation_tsv): + """Test that columns with array notation are queryable as plain string values.""" + manager = SidecarQueryManager(file=array_notation_tsv).run_query(filter_string="Area:North") + assert len(manager.query_result) == 1 + assert manager.query_result.iloc[0]["Sample_ID"] == "S1" + + def test_array_notation_multiple_columns_warns_per_column(self, array_notation_multiple_cols_tsv): + """Test that each column with array notation gets its own warning.""" + manager = SidecarQueryManager(file=array_notation_multiple_cols_tsv) + assert len([w for w in manager.warnings if "array notation" in w.lower() and "Population" in w]) == 1 + assert len([w for w in manager.warnings if "array notation" in w.lower() and "Scores" in w]) == 1 diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index bc780028..4aeb4bc5 100644 --- 
a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -119,6 +119,30 @@ def negative_numeric_columns(): return ["Temperature", "Longitude", "Latitude", "Elevation"] +@pytest.fixture +def array_notation_tsv(tmp_path): + """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t[1, 2, 3]\tNorth\n" + content += "S2\t4\tEast\n" + content += "S3\t5\tSouth\n" + tsv_file = tmp_path / "array_notation.tsv" + tsv_file.write_text(content) + return tsv_file + + +@pytest.fixture +def array_notation_multiple_cols_tsv(tmp_path): + """TSV where multiple columns use '[...]' array notation.""" + content = "#Sample_ID\tPopulation\tScores\n" + content += "S1\t[1, 2]\t[10, 20, 30]\n" + content += "S2\t3\t[40]\n" + content += "S3\t5\t60\n" + tsv_file = tmp_path / "array_notation_multi.tsv" + tsv_file.write_text(content) + return tsv_file + + def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): """Valid TSV should pass with no errors or warnings.""" stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) @@ -428,3 +452,60 @@ def test_purely_numeric_semicolon_column_stays_numeric(self, semicolon_mixed_tsv assert "PureNumSemicolon" in stats["numeric_columns"] assert "PureNumSemicolon" not in stats["mixed_type_columns"] assert "PureNumSemicolon" not in stats["string_columns"] + + +class TestArrayNotation: + """Test that Python/JSON-style array notation '[...]' in cells produces a warning and is treated as string.""" + + def test_array_notation_produces_warning(self, array_notation_tsv): + """Test that array notation in a cell sends a warning that tells the user to use semicolons instead.""" + stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) + + assert any("array notation" in w.lower() for w in warnings) + assert any("semicolon" in w.lower() 
and "array notation" in w.lower() for w in warnings) + + def test_array_notation_is_not_an_error(self, array_notation_tsv): + """Test that array notation should produce a warning, not an error.""" + stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) + + assert not any("array" in e.lower() for e in errors) + + def test_array_notation_column_not_numeric(self, array_notation_tsv): + """Test that a column containing array notation should not be classified as numeric type.""" + stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) + + assert "Population" not in stats["numeric_columns"] + assert "Population" in stats["mixed_type_columns"] or "Population" in stats["string_columns"] + + def test_array_notation_warns_once_per_column(self, array_notation_tsv): + """Test that only one warning per column should be emitted for array notation.""" + stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) + + array_warnings = [w for w in warnings if "array notation" in w.lower() and "Population" in w] + assert len(array_warnings) == 1 + + def test_array_notation_multiple_columns_warns_per_column(self, array_notation_multiple_cols_tsv): + """Test that each column with array notation should get its own warning.""" + stats, errors, warnings = MetadataTSVValidator.validate(array_notation_multiple_cols_tsv, {"S1", "S2", "S3"}) + + population_warnings = [w for w in warnings if "array notation" in w.lower() and "Population" in w] + scores_warnings = [w for w in warnings if "array notation" in w.lower() and "Scores" in w] + assert len(population_warnings) == 1 + assert len(scores_warnings) == 1 + + def test_non_array_bracket_strings_do_not_warn(self, tmp_path): + """Test that strings that are not array notation (e.g., '[ref]', 'group[1]') does not trigger the warning.""" + content = "#Sample_ID\tCode\n" + content += "S1\t[ref]\n" # Starts and ends with [ and ] + 
content += ( + "S2\tgroup[1]\n" # does not start with [, should not be treated as array notation despite ] at the end + ) + content += "S3\tnormal\n" + tsv_file = tmp_path / "bracket_strings.tsv" + tsv_file.write_text(content) + + stats, errors, warnings = MetadataTSVValidator.validate(tsv_file, {"S1", "S2", "S3"}) + + array_warnings = [w for w in warnings if "array notation" in w.lower()] + # Only S1's cell '[ref]' matches the array notation so onlu 1 warning is expected. + assert len(array_warnings) == 1 From b97dc211b194b9068b0238291e4e4e8b6abbfbe9 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 13:35:59 +0100 Subject: [PATCH 087/100] Add text on bracket array warning to user guide --- docs/user-guides/sidecar-metadata.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 941f0f60..48d87847 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -63,7 +63,7 @@ After the `Sample_ID` column has been populated, users can add any columns and v To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: 1. The user-defined columns can be **either** numeric **or** string type. A column is classified as numeric only if all values can be parsed as numbers (including individual parts in semicolon-separated cells). If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will be treated as string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. -2. Semicolon-separated values are supported in TSV cells to represent arrays of values. 
This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. **Important:** Semicolons (`;`) are the only supported delimiter for multi-value cells. DivBase uses commas (`,`) in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose (separating filter values in queries). +2. Semicolon-separated values are supported in TSV cells to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. **Important:** Semicolons (`;`) are the only supported delimiter for multi-value cells; bracket array notation (`[1,2,3]`) is not supported and will be treated as a string. DivBase uses commas (`,`) in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose (separating filter values in queries). 3. Special characters like hyphens (`-`) and commas (`,`) are allowed, but will cause the column to be treated as a string column. String columns cannot be filtered using numeric operators (see details in [Filtering on numerical columns](#filtering-on-numerical-columns)) and will raise warnings. For example, values like "1-2" or "1,2" will be interpreted as strings, not numeric ranges or multi-value fields. If you intend to store multiple numeric values in a cell, use semicolons (e.g., "1;2"). For decimals, use English decimal notation with a period (e.g., "3.14") and not a comma. 4. 
The only characters with special structural meaning in DivBase sidecar metadata TSV files are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in strings (e.g., `North-West`). 5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". @@ -103,6 +103,8 @@ The command requires that the project's dimensions index is up-to-date with the The validation command will fetch all sample names from the project dimensions index from the DivBase server and use that to validate that the sample names in the TSV are correct. Misspelled, missing, or otherwise incorrect sample names in the TSV will result in erroneous or even misleading query results, and the validator will help with spotting that. Several of the checks that the validator performs are also done at the start of a sample metadata query, but this sample name check is currently only done by the validator. +#### Errors from TSV content validation + The following will return **Errors**. These must be fixed for the sidecar TSV be used with DivBase queries: - File not found, unreadable, or empty: If the TSV file path is missing, misspelled, or the file cannot be opened, validation will fail. Empty TSV files are also not allowed. @@ -118,6 +120,8 @@ The following will return **Errors**. These must be fixed for the sidecar TSV be !!! Note The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation which is a validator-specific check). 
This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. +#### Warnings from TSV content validation + The validator will also raise **Warnings**. DivBase queries can still be run with Warnings, but the user should review them, and possible address them if so desired: - Cell value has leading or trailing whitespace (will be stripped by DivBase when a query is run) @@ -128,6 +132,8 @@ The validator will also raise **Warnings**. DivBase queries can still be run wit - Hyphens in values that look like range notation (e.g., "1-2") in columns that otherwise contain numeric values. The same goes for commas (e.g. "1,2"). The warning message will ask the user if they intended this to be a multicolumn value which should use semicolons as delimiters. +- Bracket array notation, i.e. cell values that are enclosed in brackets `[ ... ]` is not supported by DivBase. Bracketed cells will be treated as strings, which can lead to undesired filtering results. Use semicolons (`;`) to delimit multi-value cells instead. Example: use `1;2;3` instead of `[1,2,3]` in the TSV cells. + ## Query Syntax for sidecar metadata This section describes how to query on the sample metadata file itself. The same syntax used here will also be used when running combined sample metadata and VCF data queries; how to do that is covered in [DivBase Query Syntax for VCF data](query-syntax.md). @@ -244,9 +250,21 @@ The `!` (NOT) operator can really come to good use for numerical filters: ### Filtering on Sample names -The sidecar metadata filtering is designed to +The sidecar metadata filtering is designed to filter on metadata columns and return all samples that fulfil the queries. It is however possible to filter on the `Sample_ID` column directly. 
This column is a string column by design and thus follows the syntax described in the [Filtering on string columns](#filtering-on-string-columns) section. + +For example, with the above [example](#example) metadata and data, this query: + +```bash +divbase-cli query tsv "Sample_ID:S1" +``` + +will return: -The `Sample_ID` column is a string column by design +```bash +The results for the query (Sample_ID:S1): +Unique Sample IDs: ['S1'] +Unique filenames: ['file1.vcf.gz'] +``` ### Query Warnings: spotting potential issues with the TSV or the query filter From 8452994466637564d2d6b95072d65b20a4b47b96 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 14:05:50 +0100 Subject: [PATCH 088/100] Move shared unit test TSV fixtures to conftest.py --- tests/unit/conftest.py | 247 ++++++++++++++++++ .../test_sample_metadata_queries.py | 159 ----------- .../test_sample_metadata_tsv_validator.py | 128 --------- 3 files changed, 247 insertions(+), 287 deletions(-) create mode 100644 tests/unit/conftest.py diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 00000000..6585f64e --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,247 @@ +""" +Shared pytest fixtures for all unit tests. + +Fixtures are defined here so they can be used across divbase_lib, divbase_cli, +and divbase_api unit tests without cross-file imports (which are unreliable under +pytest's --import-mode=importlib). 
+""" + +import pytest + + +@pytest.fixture +def valid_tsv(tmp_path): + """Simple valid TSV that passes all validation checks.""" + tsv_content = """#Sample_ID\tPopulation\tArea\tWeight +S1\t1\tNorth\t12.5 +S2\t2;4\tEast\t18.8 +S3\t3\tWest;South\t15.0 +S4\t3;5\tSouth\t20.0 +S5\t4\tNorth\t22.1 +""" + tsv_file = tmp_path / "valid.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def no_multi_values_tsv(tmp_path): + """TSV with no semicolon-separated values in any cell.""" + tsv_content = """#Sample_ID\tPopulation\nS1\t1\nS2\t2\n""" + tsv_file = tmp_path / "no_multi_values.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def numeric_multi_values_tsv(tmp_path): + """TSV with multi-value numeric cells and negative numbers.""" + tsv_content = """#Sample_ID\tScores\tValues\tTemperature\tLongitude\tLatitude\tElevation +S1\t1;2;3\t10;20\t-5.5\t-2.78305556\t51.5\t100 +S2\t4;5\t30;40;50\t-10.2\t-0.12765\t52.2\t-50 +S3\t6\t60\t0\t1.25\t50.8\t-100.5 +S4\t7;8;9;10\t70\t15.5\t-3.5;-2.1\t49.5\t200 +S5\t11\t80;90\t-20\t0\t48.2\t-25 +""" + tsv_file = tmp_path / "numeric_multi_values.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_numeric_data(tmp_path): + """Comprehensive TSV with numeric, string, semicolon, float, and negative columns.""" + tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString\tTemperature\tLongitude\tLatitude\tElevation +S1\t1\t20.2\t5.0\tNorth\t100\tString\t-5.5\t-2.78305556\t51.5\t100 +S2\t2;4\t25.0\t10\tEast\t200\tStrings\t-10.2\t-0.12765\t52.2\t-50 +S3\t3\t30.8\t15\tWest;South;East\t300\tSting\t0\t1.25\t50.8\t-100.5 +S4\t4\t35.1\t20\tWest\t400\tStings\t15.5\t-3.5;-2.1\t49.5\t200 +S5\t5\t40.0\t25\tNorth\t500\tThing\t-20\t0\t48.2\t-25 +S6\t6\t45.4\t30\tEast\t600\tThings\t10\t2.5\t53.1\t150 +S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong\t5\t-1.5\t52.8\t50 +S8\t2\t55.2\t40\tWest\t800\tStrung\t20\t3.0\t51.0\t75 
+S9\t7\t62.6\t45\tNorth\t900\tStang\t-15\t-2.0\t54.5\t-10 +S10\t8\t70.7\t52\tEast\t1000\tSong\t25\t1.5\t50.5\t200 +""" + tsv_file = tmp_path / "test_metadata.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_mixed_type_column(tmp_path): + """TSV with a mixed-type column (numeric-looking + non-numeric values).""" + tsv_content = """#Sample_ID\tPopulation_code\tArea\tWeight +S1\t8\tNorth\t12.5 +S2\t1a\tEast\t18.8 +S3\t5a\tWest\t15.0 +S4\t1b\tSouth\t20.0 +S5\t4\tNorth\t22.1 +""" + tsv_file = tmp_path / "test_mixed_type.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_semicolon_mixed_type_column(tmp_path): + """TSV with a semicolon-separated column where one part is non-numeric ('1;1-2').""" + tsv_content = """#Sample_ID\tCode\tPureNumericSemicolon\tWeight +S1\t1;1-2\t10;20;30\t12.5 +S2\t3\t40\t18.8 +S3\t5\t50;60\t15.0 +S4\t7\t70;80;90\t20.0 +""" + tsv_file = tmp_path / "test_semicolon_mixed.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def type_errors_tsv(tmp_path): + """TSV with mixed-type columns and range/hyphen notation. 
+ + Population: mixed int + string + range + Test: mixed int + 'all' + Code: pure string with number prefix + Range: mixed int + hyphen-range notation + """ + tsv_content = """#Sample_ID\tPopulation\tTest\tCode\tRange +S1\t1\t2\tA100\t1-2 +S2\tabc\t3\tB200\t3 +S3\t1;three;5\tall\tC300\t4 +S4\t3-5\t4\tD400\t5 +""" + tsv_file = tmp_path / "type_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def array_notation_tsv(tmp_path): + """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t[1, 2, 3]\tNorth\n" + content += "S2\t4\tEast\n" + content += "S3\t5\tSouth\n" + tsv_file = tmp_path / "array_notation.tsv" + tsv_file.write_text(content) + return tsv_file + + +@pytest.fixture +def array_notation_multiple_cols_tsv(tmp_path): + """TSV where multiple columns use '[...]' array notation.""" + content = "#Sample_ID\tPopulation\tScores\n" + content += "S1\t[1, 2]\t[10, 20, 30]\n" + content += "S2\t3\t[40]\n" + content += "S3\t5\t60\n" + tsv_file = tmp_path / "array_notation_multi.tsv" + tsv_file.write_text(content) + return tsv_file + + +@pytest.fixture +def header_errors_tsv(tmp_path): + """TSV with header errors: wrong first column, duplicate columns, empty column.""" + tsv_content = """SampleID\tPopulation\tArea\tArea\t +S1\t1\tNorth\tEast\tValue +""" + tsv_file = tmp_path / "header_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_errors_tsv(tmp_path): + """TSV with Sample_ID errors: empty, semicolons, duplicates.""" + tsv_content = """#Sample_ID\tPopulation +S1\t1 +\t2 +S3;S4\t3 +S1\t4 +""" + tsv_file = tmp_path / "sample_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def format_errors_tsv(tmp_path): + """TSV with formatting errors: wrong column count, commas, whitespace.""" + tsv_content = """#Sample_ID\tPopulation\tArea +S1\t1\tNorth +S2\t2,3\tEast +S3 \t 4 
\t West +S4\t5 +""" + tsv_file = tmp_path / "format_errors.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_invalid_sample_ids(tmp_path): + """TSV with empty and duplicate Sample_IDs (both raise errors during load).""" + tsv_content = """#Sample_ID\tPopulation\tWeight +S1\t1\t20.2 +\t2\t25.0 +S3\t3\t30.8 +S3\t4\t35.1 +""" + tsv_file = tmp_path / "test_metadata_invalid_sample_ids.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_missing_sample_id_column(tmp_path): + """TSV that omits the Sample_ID column entirely.""" + tsv_content = """Population\tWeight\tAge\tArea +1\t20.2\t5.0\tNorth +2\t25.0\t10\tEast +3\t30.8\t15\tWest +""" + tsv_file = tmp_path / "test_metadata_missing_sample_id.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_duplicate_sample_ids(tmp_path): + """TSV with duplicate Sample_IDs (raises error during load).""" + tsv_content = """#Sample_ID\tPopulation\tWeight +S1\t1\t20.2 +S2\t2\t25.0 +S3\t3\t30.8 +S3\t4\t35.1 +S4\t5\t40.0 +""" + tsv_file = tmp_path / "test_duplicate_sample_ids.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def sample_tsv_with_edge_cases(tmp_path): + """TSV with edge cases: unicode, hyphens in strings, and whitespace in Sample_IDs. + + NOTE: S2 and S3 have leading/trailing whitespace in Sample_ID — the validator + strips these, so the exported file will DIFFER from the original. Do NOT use + this fixture for identity roundtrip tests. 
+ """ + tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tStringWithHyphen\tNumericalWithHyphen +S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth-East\t1-2 +S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tSouth-West\t2-3 + S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth-North-West\t3-4 +S4\t1string\tstring4\tString5\t400\tHumlebæk\tEast-South-East\t4-5 +""" + tsv_file = tmp_path / "test_metadata_edge_cases.tsv" + tsv_file.write_text(tsv_content) + return tsv_file + + +@pytest.fixture +def negative_numeric_columns(): + """Column names in numeric_multi_values_tsv that contain negative numbers.""" + return ["Temperature", "Longitude", "Latitude", "Elevation"] diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index daadd01a..482f36f3 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -13,165 +13,6 @@ ) -@pytest.fixture -def sample_tsv_with_numeric_data(tmp_path): - """ - Create a temporary TSV file with numeric and string columns for testing. - Includes semicolon-separated values in some cells. Includes both int and float - numeric values to test that both are detected as numeric. Also includes negative - numbers to verify they are properly handled as numeric values. - """ - # Keep indentation like this to ensure that leading spaces in column 1 does not cause issues. 
- tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString\tTemperature\tLongitude\tLatitude\tElevation -S1\t1\t20.2\t5.0\tNorth\t100\tString\t-5.5\t-2.78305556\t51.5\t100 -S2\t2;4\t25.0\t10\tEast\t200\tStrings\t-10.2\t-0.12765\t52.2\t-50 -S3\t3\t30.8\t15\tWest;South;East\t300\tSting\t0\t1.25\t50.8\t-100.5 -S4\t4\t35.1\t20\tWest\t400\tStings\t15.5\t-3.5;-2.1\t49.5\t200 -S5\t5\t40.0\t25\tNorth\t500\tThing\t-20\t0\t48.2\t-25 -S6\t6\t45.4\t30\tEast\t600\tThings\t10\t2.5\t53.1\t150 -S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong\t5\t-1.5\t52.8\t50 -S8\t2\t55.2\t40\tWest\t800\tStrung\t20\t3.0\t51.0\t75 -S9\t7\t62.6\t45\tNorth\t900\tStang\t-15\t-2.0\t54.5\t-10 -S10\t8\t70.7\t52\tEast\t1000\tSong\t25\t1.5\t50.5\t200 -""" - tsv_file = tmp_path / "test_metadata.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_with_edge_cases(tmp_path): - """ - Create a temporary TSV file to test edge cases: - 1. "string;string;string" - OK (pure strings) - 2. "1;two;5" - Mixed numeric and non-numeric: treated as string column (not an error) - 3. String values containing numbers like "1string" - OK (inferred as string) - 4. Unicode strings with diacritics - OK - - Note that S2 and S3 have leading/trailing whitespace in the Sample_ID and the code should handle that by stripping whitespace. 
- """ - tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tStringWithHyphen\tNumericalWithHyphen -S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth-East\t1-2 -S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tSouth-West\t2-3 - S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth-North-West\t3-4 -S4\t1string\tstring4\tString5\t400\tHumlebæk\tEast-South-East\t4-5 -""" - tsv_file = tmp_path / "test_metadata_edge_cases.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_with_invalid_sample_ids(tmp_path): - """ - Create a temporary TSV file to test Sample_ID validation: - Has empty and duplicate Sample_IDs that both should raise error during load - """ - tsv_content = """#Sample_ID\tPopulation\tWeight -S1\t1\t20.2 -\t2\t25.0 -S3\t3\t30.8 -S3\t4\t35.1 -""" - tsv_file = tmp_path / "test_metadata_invalid_sample_ids.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_missing_sample_id_column(tmp_path): - """ - Create a temporary TSV file that omits the Sample_ID column. - Should trigger SidecarColumnNotFoundError during file load. - """ - tsv_content = """Population\tWeight\tAge\tArea -1\t20.2\t5.0\tNorth -2\t25.0\t10\tEast -3\t30.8\t15\tWest -""" - tsv_file = tmp_path / "test_metadata_missing_sample_id.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_with_duplicate_sample_ids(tmp_path): - """ - Create a temporary TSV file to test duplicate Sample_IDs (should raise error during load). 
- """ - tsv_content = """#Sample_ID\tPopulation\tWeight -S1\t1\t20.2 -S2\t2\t25.0 -S3\t3\t30.8 -S3\t4\t35.1 -S4\t5\t40.0 -""" - tsv_file = tmp_path / "test_duplicate_sample_ids.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_with_mixed_type_column(tmp_path): - """ - Create a TSV with a column that has mixed numeric-looking and non-numeric values, - similar to Population_code with values like "8", "1a", "5a". - For testing query warnings for mixed-type columns. - """ - tsv_content = """#Sample_ID\tPopulation_code\tArea\tWeight -S1\t8\tNorth\t12.5 -S2\t1a\tEast\t18.8 -S3\t5a\tWest\t15.0 -S4\t1b\tSouth\t20.0 -S5\t4\tNorth\t22.1 -""" - tsv_file = tmp_path / "test_mixed_type.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_tsv_with_semicolon_mixed_type_column(tmp_path): - """ - Create a TSV where a column has semicolon-separated values with a non-numeric part, - e.g. '1;1-2'. This makes the column a string column because '1-2' is not a number. - Tests that semicolon-split values are individually checked for numeric parsing. 
- """ - tsv_content = """#Sample_ID\tCode\tPureNumericSemicolon\tWeight -S1\t1;1-2\t10;20;30\t12.5 -S2\t3\t40\t18.8 -S3\t5\t50;60\t15.0 -S4\t7\t70;80;90\t20.0 -""" - tsv_file = tmp_path / "test_semicolon_mixed.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def array_notation_tsv(tmp_path): - """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t[1, 2, 3]\tNorth\n" - content += "S2\t4\tEast\n" - content += "S3\t5\tSouth\n" - tsv_file = tmp_path / "array_notation.tsv" - tsv_file.write_text(content) - return tsv_file - - -@pytest.fixture -def array_notation_multiple_cols_tsv(tmp_path): - """TSV where multiple columns use '[...]' array notation.""" - content = "#Sample_ID\tPopulation\tScores\n" - content += "S1\t[1, 2]\t[10, 20, 30]\n" - content += "S2\t3\t[40]\n" - content += "S3\t5\t60\n" - tsv_file = tmp_path / "array_notation_multi.tsv" - tsv_file.write_text(content) - return tsv_file - - class TestNumericalFilteringInequalities: """Test inequality operators on numerical columns.""" diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 4aeb4bc5..bd06f2ba 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -15,134 +15,6 @@ def project_samples(): return {"S1", "S2", "S3", "S4", "S5"} -@pytest.fixture -def valid_tsv(tmp_path): - """Create a valid TSV file that passes all validation checks and includes all project samples.""" - tsv_content = """#Sample_ID\tPopulation\tArea\tWeight -S1\t1\tNorth\t12.5 -S2\t2;4\tEast\t18.8 -S3\t3\tWest;South\t15.0 -S4\t3;5\tSouth\t20.0 -S5\t4\tNorth\t22.1 -""" - tsv_file = tmp_path / "valid.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def header_errors_tsv(tmp_path): - """Create TSV with header errors: 
wrong first column, duplicate columns, empty column.""" - tsv_content = """SampleID\tPopulation\tArea\tArea\t -S1\t1\tNorth\tEast\tValue -""" - tsv_file = tmp_path / "header_errors.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def sample_errors_tsv(tmp_path): - """Create TSV with Sample_ID errors: empty, semicolons, duplicates.""" - tsv_content = """#Sample_ID\tPopulation -S1\t1 -\t2 -S3;S4\t3 -S1\t4 -""" - tsv_file = tmp_path / "sample_errors.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def format_errors_tsv(tmp_path): - """Create TSV with formatting errors: wrong column count, commas, whitespace.""" - tsv_content = """#Sample_ID\tPopulation\tArea -S1\t1\tNorth -S2\t2,3\tEast -S3 \t 4 \t West -S4\t5 -""" - tsv_file = tmp_path / "format_errors.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def type_errors_tsv(tmp_path): - """Create TSV with type errors: mixed types in column and cell, hyphen in numeric, and range notation. 
- - Population: Has both cell-level error (1;three;5) and column-level mixed types (numeric + string) - Test: Has column-level mixed types (all numeric values + string 'all') - Code: String column with hyphen in one value - Range: Contains range notation (e.g., '1-2') which should be rejected in numeric columns - """ - tsv_content = """#Sample_ID\tPopulation\tTest\tCode\tRange -S1\t1\t2\tA100\t1-2 -S2\tabc\t3\tB200\t3 -S3\t1;three;5\tall\tC300\t4 -S4\t3-5\t4\tD400\t5 -""" - tsv_file = tmp_path / "type_errors.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def no_multi_values_tsv(tmp_path): - """Create a TSV file with no semicolon-separated values in any cell.""" - tsv_content = """#Sample_ID\tPopulation\nS1\t1\nS2\t2\n""" - tsv_file = tmp_path / "no_multi_values.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def numeric_multi_values_tsv(tmp_path): - """Create a TSV file with multi-value numeric cells and negative numbers to verify they're classified as numeric.""" - tsv_content = """#Sample_ID\tScores\tValues\tTemperature\tLongitude\tLatitude\tElevation -S1\t1;2;3\t10;20\t-5.5\t-2.78305556\t51.5\t100 -S2\t4;5\t30;40;50\t-10.2\t-0.12765\t52.2\t-50 -S3\t6\t60\t0\t1.25\t50.8\t-100.5 -S4\t7;8;9;10\t70\t15.5\t-3.5;-2.1\t49.5\t200 -S5\t11\t80;90\t-20\t0\t48.2\t-25 -""" - tsv_file = tmp_path / "numeric_multi_values.tsv" - tsv_file.write_text(tsv_content) - return tsv_file - - -@pytest.fixture -def negative_numeric_columns(): - """Columns in the numeric_multi_values_tsv fixture that should be classified as numeric (including negative values).""" - return ["Temperature", "Longitude", "Latitude", "Elevation"] - - -@pytest.fixture -def array_notation_tsv(tmp_path): - """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t[1, 2, 3]\tNorth\n" - content += "S2\t4\tEast\n" - content += "S3\t5\tSouth\n" - tsv_file = tmp_path / 
"array_notation.tsv" - tsv_file.write_text(content) - return tsv_file - - -@pytest.fixture -def array_notation_multiple_cols_tsv(tmp_path): - """TSV where multiple columns use '[...]' array notation.""" - content = "#Sample_ID\tPopulation\tScores\n" - content += "S1\t[1, 2]\t[10, 20, 30]\n" - content += "S2\t3\t[40]\n" - content += "S3\t5\t60\n" - tsv_file = tmp_path / "array_notation_multi.tsv" - tsv_file.write_text(content) - return tsv_file - - def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): """Valid TSV should pass with no errors or warnings.""" stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) From 7b95999f1e74578d9ffcca7ca64b7e76e35e9f78 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 18 Feb 2026 14:06:33 +0100 Subject: [PATCH 089/100] Add unit test for TSV->df->TSV validation Turns out that some TSVs exported from df will not match the original TSV, and that mainly has to do with ints and floats, e.g. a 0 being exported as 0.0. I think that the discrepencies are acceptable but it could be something to further work on. --- .../test_tsv_to_dataframe_to_tsv.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py diff --git a/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py b/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py new file mode 100644 index 00000000..d093b41d --- /dev/null +++ b/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py @@ -0,0 +1,90 @@ +""" +Test for SharedMetadataValidator: load TSV -> pandas DataFrame +-> export back to TSV and ensure that the exported file is identical to the original. + +Fixtures are sourced from tests/unit/conftest.py so they are shared across all unit tests. 
+ +Fixtures intentionally EXCLUDED from identity roundtrip tests: +- sample_tsv_with_edge_cases: S2/S3 have leading/trailing whitespace in Sample_ID; + the validator strips it, so the exported file will differ from the original. +- header_errors_tsv, sample_errors_tsv, format_errors_tsv, + sample_tsv_with_invalid_sample_ids, sample_tsv_missing_sample_id_column, + sample_tsv_with_duplicate_sample_ids: produce validation errors during load. +""" + +from pathlib import Path + +import pandas as pd +import pytest + +from divbase_lib.metadata_validator import SharedMetadataValidator + +FIXTURE_TSV = Path("tests/fixtures/sample_metadata.tsv") + +# TODO consider if the SharedMetadataValidator and SidecarQueryManger should be updated to handle the below examples, or if they are acceptable. + +# The tests in this file show that not all fixture cases used in the unit tests (tests/unit/conftest.py) will be identical after a roundtrip (TSV, load to DataFrame, export back to TSV). +# 1. numeric_multi_values_tsv: Temperature/Longitude columns mix 0 (int) with floats. Pandas writes 0 as 0.0 on export. +# 2. sample_tsv_with_numeric_data: Age column mixes 5.0 (float) with ints. Pandas writes 10 as 10.0 on export. +# 3. sample_tsv_with_edge_cases: Sample_IDs have leading/trailing whitespace that is stripped during load, so the exported file differs from the original. 
+ROUNDTRIP_FIXTURES = [ + "valid_tsv", + "no_multi_values_tsv", + "type_errors_tsv", + "array_notation_tsv", + "array_notation_multiple_cols_tsv", + "sample_tsv_with_mixed_type_column", + "sample_tsv_with_semicolon_mixed_type_column", +] + + +def load_tsv(path: Path) -> pd.DataFrame: + """Load a TSV file using SharedMetadataValidator and return the DataFrame.""" + validator = SharedMetadataValidator(file_path=path, project_samples=set(), skip_dimensions_check=True) + result = validator.load_and_validate() + assert result.df is not None, f"Failed to load {path}: {result.errors}" + return result.df + + +def export_tsv(df: pd.DataFrame, path: Path) -> None: + """Export DataFrame to TSV file, restoring the #Sample_ID header prefix.""" + df.copy().rename(columns={"Sample_ID": "#Sample_ID"}).to_csv(path, sep="\t", index=False) + + +def tsv_lines(path: Path) -> list[str]: + """Return the non-empty lines of a TSV file for comparison.""" + return path.read_text().strip().splitlines() + + +class TestRoundtrip: + """Tests that load a TSV, export it back, and verify the exported file is identical to the original.""" + + @pytest.mark.parametrize("fixture_name", ROUNDTRIP_FIXTURES) + def test_exported_tsv_identical_to_original(self, fixture_name, request, tmp_path): + """Exported TSV content must be line-for-line identical to the original. + + Covers: basic valid data, no multi-values, negative floats, multi-value numeric cells, + mixed-type columns (string+numeric), hyphen/range notation, array notation, + semicolon-mixed columns, and comprehensive numeric/string data. 
+ """ + original: Path = request.getfixturevalue(fixture_name) + export_path = tmp_path / "exported.tsv" + export_tsv(load_tsv(original), export_path) + assert tsv_lines(export_path) == tsv_lines(original) + + def test_roundtrip_no_errors(self, valid_tsv, tmp_path): + """Exported TSV must pass validation without errors.""" + export_path = tmp_path / "exported.tsv" + export_tsv(load_tsv(valid_tsv), export_path) + result = SharedMetadataValidator( + file_path=export_path, project_samples=set(), skip_dimensions_check=True + ).load_and_validate() + assert result.errors == [], f"Unexpected errors after roundtrip: {result.errors}" + + def test_roundtrip_fixture_tsv(self, tmp_path): + """Real fixture TSV (tests/fixtures/sample_metadata.tsv) must be line-for-line identical after a roundtrip.""" + if not FIXTURE_TSV.exists(): + pytest.skip(f"Fixture TSV not found at {FIXTURE_TSV}") + export_path = tmp_path / "exported.tsv" + export_tsv(load_tsv(FIXTURE_TSV), export_path) + assert tsv_lines(export_path) == tsv_lines(FIXTURE_TSV) From 17268d8f4559c8f9eac207355a85e3ebea07eec6 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 19 Feb 2026 11:48:31 +0100 Subject: [PATCH 090/100] Update mock VCF and metadata scripts To be able to test the sample metadata validation and queries with larger files. The large files should not be under source control due to their potential size. The scripts use random seeds for reproducibility. Example to generate a mock VCF file with 5000 samples and 100 variants, and then generate the corresponding sample metadata TSV file with four fixed column, one warning column that will trigger validator warnings, followed by 25 random columns. 
bash scripts/benchmarking/generate_mock_vcf.sh -s 5000 -r 100 will generate a file named mock_vcf_5000s_100r.vcf.gz python scripts/generate_mock_sample_metadata.py --vcf mock_vcf_5000s_100r.vcf.gz --output mock_metadata_mock_vcf_5000s_100r.tsv --columns 25 --add-warning-column --- docker/benchmarking.dockerfile | 2 +- scripts/benchmarking/generate_mock_vcf.sh | 55 +++++++++++++ scripts/generate_mock_sample_metadata.py | 95 +++++++++++++++++++---- 3 files changed, 136 insertions(+), 16 deletions(-) create mode 100644 scripts/benchmarking/generate_mock_vcf.sh diff --git a/docker/benchmarking.dockerfile b/docker/benchmarking.dockerfile index 63936757..4442d2bc 100644 --- a/docker/benchmarking.dockerfile +++ b/docker/benchmarking.dockerfile @@ -1,6 +1,6 @@ FROM python:3.12-slim -RUN apt-get update && apt-get install -y git make && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y git make tabix && rm -rf /var/lib/apt/lists/* RUN git clone https://github.com/endast/fake-vcf.git /opt/fake-vcf WORKDIR /opt/fake-vcf diff --git a/scripts/benchmarking/generate_mock_vcf.sh b/scripts/benchmarking/generate_mock_vcf.sh new file mode 100644 index 00000000..387e410a --- /dev/null +++ b/scripts/benchmarking/generate_mock_vcf.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# +# Wrapper script that generates a mock VCF file using the fake-vcf package in a Docker container, and copies the resulting file to the host machine. +# The number of samples and variants to be generated in the mock file can be specified as arguments. +# The random seed is fixed to ensure that the same mock file is generated each time for the same input parameters. 
+# +# Usage: +# bash scripts/benchmarking/generate_mock_vcf.sh -s -r +# +# Example: +# bash scripts/benchmarking/generate_mock_vcf.sh -s 1000 -r 50 + +set -e + + +SAMPLES=5000 +VARIANTS=100 + +while [[ $# -gt 0 ]]; do + case $1 in + -s) + SAMPLES="$2" + shift 2 + ;; + -r) + VARIANTS="$2" + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 -s -r " + exit 1 + ;; + esac +done + +IMAGE="benchmarking-fake-vcf" +CONTAINER_NAME="fake-vcf-gen" + +# Build the Docker image +docker build -f docker/benchmarking.dockerfile -t $IMAGE . + +docker rm -f $CONTAINER_NAME 2>/dev/null || true + +docker run --name $CONTAINER_NAME -d $IMAGE tail -f /dev/null + +OUTFILE="/tmp/mock_vcf_${SAMPLES}s_${VARIANTS}r.vcf.gz" +HOST_OUTFILE="mock_vcf_${SAMPLES}s_${VARIANTS}r.vcf.gz" + +docker exec $CONTAINER_NAME bash -c "cd /opt/fake-vcf && poetry run fake-vcf generate -s $SAMPLES -r $VARIANTS --seed 12345 | bgzip > $OUTFILE" + +# Copy the file from the container to the host +docker cp $CONTAINER_NAME:$OUTFILE ./$HOST_OUTFILE + +docker rm -f $CONTAINER_NAME \ No newline at end of file diff --git a/scripts/generate_mock_sample_metadata.py b/scripts/generate_mock_sample_metadata.py index 42b7c346..11041f66 100644 --- a/scripts/generate_mock_sample_metadata.py +++ b/scripts/generate_mock_sample_metadata.py @@ -2,10 +2,15 @@ Helper script to generate mock sample metadata from a VCF file to allow for testing the codebase with VCFs that don't have a metadata sidecar file. This script reads a comma-separated list VCF files (either gzipped or uncompressed) and generates a single sample metadata file based on the sample IDs found in each VCF. The output file has these columns: sample ID, mock sampling population number, mock sampling area, mock sample sex, and the filename that the sample is found in. 
+ +Usage: +python scripts/generate_mock_sample_metadata.py --vcf {vcf_filename} --output {metadata_filename} --columns {number_of_random_columns_to_add} --add-warning-column """ import argparse import gzip +import random +import string from pathlib import Path from typing import TextIO @@ -24,6 +29,23 @@ def parse_arguments(): default=Path("mock_sample_metadata.tsv"), help="Output metadata file (default: mock_sample_metadata.tsv)", ) + parser.add_argument( + "--columns", + type=int, + default=3, + help="Number of additional random columns to generate after the first 3 legacy columns (Population, Area, Sex). Default: 3", + ) + parser.add_argument( + "--add-warning-column", + action="store_true", + help="Add a column that will trigger a warning in the TSV validator (e.g., array notation)", + ) + parser.add_argument( + "--seed", + type=int, + default=12345, + help="Random seed for reproducibility (default: 12345)", + ) return parser.parse_args() @@ -55,47 +77,90 @@ def extract_samples_from_opened_vcf(file: TextIO) -> list[str]: return [] -def generate_mock_sample_metadata(all_samples: dict[tuple], output_file: Path) -> None: - """ - Function that generates mock sample metadata from a VCF file. It also counts and displays the number of samples and variants in the VCF file. +def generate_random_values(n): + vals = [] + for _ in range(n): + t = random.choice(["int", "float", "str"]) + if t == "int": + vals.append(str(random.randint(1, 100))) + elif t == "float": + vals.append(f"{random.uniform(1, 100):.2f}") + else: + vals.append(random_string(5)) + return vals - The output file contains the mandatory columns Sample_ID and Filename, as well as three mock columns: Population, Area, and Sex. - To create some variation across the three mock columns, the three mock columns are generated from lists of different lengths. - To ensure that the periodicity of the mock area and sex columns are different, the mock sex column dependent on the length of the mock area column. 
- Thus, for each sample: - - The mock area will be "North", "East", "South", "West", and will repeat every 4 samples. - - The mock population will be a number from 1 to 6, and will repeat every 6 samples. - - The mock sex will be "F" or "M", and will repeat "F" for 4 samples, then "M" for 4 samples, and so on. +def generate_mock_sample_metadata( + all_samples: list[str], output_file: Path, num_columns: int, seed: int, add_warning: bool +) -> None: + """ + Generate mock sample metadata with legacy columns (Population, Area, Sex), user-specified number of random columns, and optional warning column. + + The "legacy" columns were used in the first iteration of this script, and are kept for backwards compatibility. """ + random.seed(seed) + legacy_cols = ["Population", "Area", "Sex"] + random_col_names = [f"Col{i + 1}" for i in range(num_columns)] + if add_warning: + col_names = legacy_cols + ["WarningCol"] + random_col_names + else: + col_names = legacy_cols + random_col_names mock_area = ["North", "East", "South", "West"] mock_population = [1, 2, 3, 4, 5, 6] mock_sex = ["F", "M"] with open(output_file, "w") as writer: - writer.write("#Sample_ID\tPopulation\tArea\tSex\tFilename\n") - for i, (sample, vcf_filename) in enumerate(all_samples): + writer.write("#Sample_ID\t" + "\t".join(col_names) + "\n") + for i, sample in enumerate(all_samples): + # Legacy columns area = mock_area[i % len(mock_area)] population = mock_population[i % len(mock_population)] sex = mock_sex[(i // len(mock_area)) % len(mock_sex)] - writer.write(f"{sample}\t{population}\t{area}\t{sex}\t{vcf_filename}\n") + row = [str(population), area, sex] + if add_warning: + # Always generate a mixed-type value (random order, using the seed) + parts = [] + for _ in range(3): + if random.choice([True, False]): + parts.append(str(random.randint(1, 100))) + else: + parts.append(random_string(random.randint(4, 6))) + warning_col_val = ";".join(parts) + random_vals = generate_random_values(num_columns) + row = row 
+ [warning_col_val] + random_vals + else: + row += generate_random_values(num_columns) + writer.write(f"{sample}\t" + "\t".join(row) + "\n") print(f"Wrote mock sidecar metadata file to: {output_file}") +def random_string(length=6): + return "".join(random.choices(string.ascii_letters, k=length)) + + def main(): args = parse_arguments() vcf_paths = [Path(vcf_file.strip()) for vcf_file in args.vcf.split(",")] output_file = Path(args.output) + num_columns = args.columns + seed = args.seed + add_warning = args.add_warning_column all_samples = [] for vcf_path in vcf_paths: if vcf_path.name.endswith(".vcf") or vcf_path.name.endswith(".vcf.gz"): sample_IDs = wrapper_get_sample_IDs_from_vcf_file(vcf_path=vcf_path) - all_samples.extend((sample, vcf_path.name) for sample in sample_IDs) + all_samples.extend(sample for sample in sample_IDs) else: print("Invalid file extension. Please provide a .vcf or .vcf.gz file.") - generate_mock_sample_metadata(all_samples=all_samples, output_file=output_file) + generate_mock_sample_metadata( + all_samples=all_samples, + output_file=output_file, + num_columns=num_columns, + seed=seed, + add_warning=add_warning, + ) if __name__ == "__main__": From 5816ca56d86525f10150e6c33d1344564627f7a5 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Mon, 23 Feb 2026 16:52:09 +0100 Subject: [PATCH 091/100] Update test to not rely on \t in stdout Can cause issues with terminal settings, rich, typer etc.
Add assertion for the expected sample names --- .../cli_commands/test_dimensions_cli.py | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/e2e_integration/cli_commands/test_dimensions_cli.py b/tests/e2e_integration/cli_commands/test_dimensions_cli.py index e786891a..8b7dd5ba 100644 --- a/tests/e2e_integration/cli_commands/test_dimensions_cli.py +++ b/tests/e2e_integration/cli_commands/test_dimensions_cli.py @@ -477,16 +477,43 @@ def test_show_dimensions_sample_names_stdout_streams_rows( project_id = project_map[project_name] user_id = 1 + expected_sample_names = [ + "8_HOM-E57", + "8_HOM-E59", + "8_HOM-E64", + "8_HOM-E74", + "8_HOM-E78", + "1a_HOM-G34", + "5a_HOM-I13", + "5a_HOM-I14", + "5a_HOM-I20", + "5a_HOM-I21", + "5a_HOM-I7", + "1b_HOM-G55", + "1b_HOM-G58", + "1b_HOM-G83", + "5b_HOM-H17", + "5b_HOM-H23", + "5b_HOM-H25", + "5b_HOM-H7", + "7_HOM-J21", + "4_HOM-P25", + ] + run_update_dimensions(bucket_name=bucket_name, project_id=project_id, project_name=project_name, user_id=user_id) command = f"dimensions show --project {project_name} --sample-names-stdout" cli_result = runner.invoke(app, command) assert cli_result.exit_code == 0, f"Command failed with: {cli_result.stdout}" - lines = [line for line in cli_result.stdout.splitlines() if line.strip()] - sample_rows = [line for line in lines if "\t" in line] - assert len(sample_rows) > 0, "Expected streamed sample rows in stdout" - assert all("\t" in line for line in sample_rows), "Expected tab-delimited rows in format: filenamesample_name" + output_sample_names = [ + line.split()[1] + for line in cli_result.stdout.splitlines() + if len(line.split()) == 2 and line.split()[0].endswith((".vcf", ".vcf.gz")) + ] + assert output_sample_names, "Expected streamed sample rows in stdout" + missing = set(expected_sample_names) - set(output_sample_names) + assert not missing, f"Missing sample names in output: {missing}" def test_show_dimensions_truncates_sample_names_in_terminal( From 
59cfe7beecfd9215320abe7f96c4ef62db0973f0 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Tue, 24 Feb 2026 16:15:53 +0100 Subject: [PATCH 092/100] WIP: refactor shared metadata validator for lists Use python list syntax for multi-value cells instead of previous semicolon-separated string format. To test the new functionality, CLI commands and their unit tests are also updated here. Tagged WIP since there are probably many breaking changes outside of the CLI command and its unit tests. --- .../cli_commands/dimensions_cli.py | 2 +- .../services/sample_metadata_tsv_validator.py | 8 +- .../src/divbase_lib/metadata_validator.py | 320 ++++++---- .../test_sample_metadata_tsv_validator.py | 579 ++++++++++-------- 4 files changed, 539 insertions(+), 370 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 9f154de5..77aac23c 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -360,7 +360,7 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( print(f" Mixed-type columns treated as string ({len(mixed_cols)}): {', '.join(mixed_cols)}") if stats.get("has_multi_values", False): - print(" Multi-value cells: Yes (semicolon-separated values detected)") + print(" Multi-value cells: Yes (Python list notation detected)") else: print(" Multi-value cells: No") diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 8fe7c7a9..a19f86db 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -90,9 +90,13 @@ def _collect_statistics( has_multi_values = False for col in df.columns: - if
df[col].astype(str).str.contains(";", na=False).any(): - has_multi_values = True + for val in df[col].dropna(): + if isinstance(val, list): + has_multi_values = True + break + if has_multi_values: break + # If has_multi_values is True: at least one cell in the DataFrame contains a Python list (multi-value cell). self.stats["has_multi_values"] = has_multi_values empty_cells_per_column = {} diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index 866f3f40..bb1d347b 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -8,6 +8,7 @@ This file is only for validation of the contents of the TSV file, not for query processing. """ +import ast import csv from dataclasses import dataclass, field from pathlib import Path @@ -40,10 +41,17 @@ class SharedMetadataValidator: This class handles the following validation of the TSV content: - Header (duplicates, empty columns, first column name) - - Sample_ID (empty, duplicates, semicolons) - - Column type (numeric vs string, semicolon-separated values) + - Sample_ID (empty, duplicates, no list values) + - Column type (numeric vs string, list-type multi-value cells) - Data format (commas, whitespace, column count) - - Mixed type detection and warnings + - List syntax validation and mixed type detection + + + IMPORTANT! This class never raises errors, it collects them. This is to allow the output of the class to be compatible + with the CLI validator on the client-side and the query engine (SidecarQueryManager) on the server side. The CLI validator + is designed to present all errors and warnings to the user in a single pass in terminal display so that they can address all of them. + The server side uses the errors and warnings collected by this class to raise exceptions on the first error it encounters in order to + protect the query engine from malformed TSV content.
""" def __init__(self, file_path: Path, project_samples: set[str] | None = None, skip_dimensions_check: bool = False): @@ -51,11 +59,15 @@ def __init__(self, file_path: Path, project_samples: set[str] | None = None, ski self.project_samples = project_samples self.skip_dimensions_check = skip_dimensions_check self.result = MetadataValidationResult() - self.df: pd.DataFrame | None = None def load_and_validate(self) -> MetadataValidationResult: """ Main entry point to the class. Load a TSV file and call helper methods to validate it. + + After loading the TSV into a pandas DataFrame, cells that look like Python + list literals (starting with '[') are parsed with ast.literal_eval so that + downstream validation and querying can work with native Python lists and + their correctly-inferred element types. """ try: with open(self.file_path, "r", newline="", encoding="utf-8") as f: @@ -66,7 +78,7 @@ def load_and_validate(self) -> MetadataValidationResult: self.result.errors.append("File is empty") return self.result - # Pre-pandas checks + # Pre-pandas checks: first_line = "\t".join(rows[0]) header_errors, header_warnings = self._validate_raw_header(first_line) self.result.errors.extend(header_errors) @@ -77,39 +89,37 @@ def load_and_validate(self) -> MetadataValidationResult: self.result.errors.extend(row_errors) self.result.warnings.extend(row_warnings) - # Initiate Pandas dataframe from TSV + # Initiate Pandas dataframe from TSV and check for parsing issues: df = pd.read_csv(self.file_path, sep="\t", skipinitialspace=True, on_bad_lines="skip") df.columns = df.columns.str.lstrip("#") + self._strip_whitespace_from_cells(df) - for col in df.columns: - df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) - + list_syntax_errors = self._parse_list_cells_in_dataframe(df) + self.result.errors.extend(list_syntax_errors) self.result.df = df - self.df = df - # Dataframe checks + # Dataframe checks: + semicolon_warnings = 
self._check_for_semicolons_in_plain_string_cells(df) + self.result.warnings.extend(semicolon_warnings) + + comma_warnings = self._check_for_commas_in_plain_string_cells(df) + self.result.warnings.extend(comma_warnings) + sample_id_errors, sample_id_warnings = self._validate_sample_ids(df) self.result.errors.extend(sample_id_errors) self.result.warnings.extend(sample_id_warnings) - mixed_type_columns, cell_warnings = self._detect_mixed_type_columns(df) + numeric_cols, string_cols, mixed_type_columns, cell_errors, cell_warnings = self._classify_column_type(df) + self.result.errors.extend(cell_errors) self.result.warnings.extend(cell_warnings) self.result.mixed_type_columns = mixed_type_columns + self.result.numeric_columns = numeric_cols + self.result.string_columns = string_cols - mixed_type_warning = self._generate_mixed_type_warning(mixed_type_columns) + mixed_type_warning = self._generate_mixed_type_warning_clarification(mixed_type_columns) if mixed_type_warning: self.result.warnings.append(mixed_type_warning) - comma_warnings = self._check_for_commas(df) - self.result.warnings.extend(comma_warnings) - - array_notation_warnings = self._check_for_array_notation(df) - self.result.warnings.extend(array_notation_warnings) - - numeric_cols, string_cols, mixed_cols = self._classify_columns(df, mixed_type_columns) - self.result.numeric_columns = numeric_cols - self.result.string_columns = string_cols - if not self.skip_dimensions_check and self.project_samples is not None: tsv_samples = set(df["Sample_ID"].tolist()) if "Sample_ID" in df.columns else set() dim_errors, dim_warnings = self._validate_dimensions_match(tsv_samples, self.project_samples) @@ -121,6 +131,51 @@ def load_and_validate(self) -> MetadataValidationResult: return self.result + def _strip_whitespace_from_cells(self, df: pd.DataFrame) -> None: + """Strip leading/trailing whitespace from all string cells in the DataFrame.""" + for col in df.columns: + df[col] = df[col].apply(lambda x: x.strip() if 
isinstance(x, str) else x) + + def _parse_list_cells_in_dataframe(self, df: pd.DataFrame) -> list[str]: + """ + Parse all string cells in object columns that look like Python list literals, and collect errors for cells that fail to parse. + + Only columns with dtype "object" are considered since numeric columns inferred by pandas cannot contain list strings. + Cells that start with '[' are parsed via ast.literal_eval. On success, the cell is replaced with the parsed Python list; on failure the cell is left as-is + and an error message is collected. + + ast.literal_eval is whitespace-insensitive within list notation: + [3,2], [3, 2], and [ 3 , 2 ] all parse identically to [3, 2]. + + Returns a list of error messages for all cells with invalid list syntax. + """ + errors = [] + + for col in df.select_dtypes(include=["object"]).columns: + for idx, cell_value in df[col].items(): + if not isinstance(cell_value, str): + continue + stripped = cell_value.strip() + if not stripped.startswith("["): + continue + try: + parsed = ast.literal_eval(stripped) + if isinstance(parsed, list): + df.at[idx, col] = parsed + else: + errors.append( + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' starts with '[' " + f"but parsed as {type(parsed).__name__}, not a list." + ) + except (ValueError, SyntaxError): + errors.append( + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' has invalid " + "Python list syntax. Multi-value cells must use valid Python list " + 'notation, e.g. [1, 2, 3] or ["a", "b"].' + ) + + return errors + def _validate_raw_header(self, header_line: str) -> tuple[list[str], list[str]]: """ Validate the raw header line before pandas processing. @@ -198,6 +253,8 @@ def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: """ Validate Sample_ID column in the DataFrame. + The Sample_ID column must be present in the TSV, it has to be the first column, and it must contain non-empty, unique values. 
+ Furthermore, it must have a single value per row. List values (Python list notation like ["S1", "S2"]) are not allowed in Sample_ID. """ errors = [] @@ -212,50 +269,75 @@ def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: duplicates = df[df["Sample_ID"].duplicated()]["Sample_ID"].tolist() errors.append(f"Duplicate Sample_IDs found: {duplicates}. Each Sample_ID must be unique.") - semicolon_samples = df[df["Sample_ID"].str.contains(";", na=False)]["Sample_ID"].tolist() - if semicolon_samples: + list_sample_ids = [ + sid + for sid in df["Sample_ID"].dropna() + if isinstance(sid, list) + or (isinstance(sid, str) and sid.strip().startswith("[") and sid.strip().endswith("]")) + ] + if list_sample_ids: errors.append( - f"Sample_ID column contains semicolons in values: {semicolon_samples}. " - "Sample_ID must contain only one value per row (semicolons are not allowed)." + f"Sample_ID column contains list values: {list_sample_ids}. " + "Sample_ID must contain only one value per row (list notation is not allowed)." ) return errors, [] - def is_semicolon_separated_numeric_column(self, series: pd.Series) -> bool: + @staticmethod + def parse_cell_value(cell_value) -> Any: """ - Determine if a column contains semicolon-separated numeric values. + Parse a single cell value. If the string representation starts with '[', + it must be a valid Python list literal (parsed via ast.literal_eval). + Non-list cells are returned as-is (scalar). - Pandas infers "1;2;3" as string object dtype. This method checks if all - non-null values in the column can be parsed as numeric after splitting by semicolon. + Raises ValueError if a cell looks like a list (starts with '[') but + cannot be parsed by ast.literal_eval.
""" - non_null_values = series.dropna() - if len(non_null_values) == 0: - return False + if pd.isna(cell_value): + return cell_value + + cell_str = str(cell_value).strip() + if cell_str.startswith("["): + try: + parsed = ast.literal_eval(cell_str) + except (ValueError, SyntaxError) as exc: + raise ValueError( + f"Invalid Python list syntax: '{cell_str}'. " + "Multi-value cells must use valid Python list notation, " + 'e.g. [1, 2, 3] or ["a", "b"].' + ) from exc + if not isinstance(parsed, list): + raise ValueError( + f"Cell '{cell_str}' parsed successfully but is not a list (got {type(parsed).__name__}). " + "Multi-value cells must be Python lists." + ) + return parsed - for cell_value in non_null_values: - if not isinstance(cell_value, str): - try: - float(cell_value) - continue - except (ValueError, TypeError): - return False + return cell_value - # Also check each semicolon-separated part of the cell value - parts = [p.strip() for p in str(cell_value).split(";") if p.strip()] - for part in parts: - try: - float(part) - except ValueError: - return False + def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], list[str], list[str], list[str]]: + """ + Classify every user-defined column as numeric, string, or mixed-type in a single pass over the data. - return True + For each column, every non-null cell is examined once: + - Single-value cells: numeric if ``float()`` succeeds, otherwise string. + - Multi-value cells (Python lists): numeric if all elements are int/float, string if all are strings. + A list with mixed element types (e.g. [1, "two"]) is a hard error because ast.literal_eval preserves the exact types + the user wrote, so mixed types indicate an explicit mistake. - def _detect_mixed_type_columns(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: - """ - Detect columns with mixed types (numeric and non-numeric values). 
+ After scanning all cells in a column, classify the column type: + - All cells are numeric -> numeric column + - All cells are string -> string column + - Contains both numeric and string cells -> mixed-type column: treat as string and send per-cell warnings to communicate the ambiguities to the user. + - All cells are Null -> numeric (to match Pandas default for NaN-only columns) + + Returns (numeric_cols, string_cols, mixed_type_columns, cell_errors, cell_warnings). """ - mixed_type_columns = [] - cell_warnings = [] + numeric_cols: list[str] = [] + string_cols: list[str] = [] + mixed_type_columns: list[str] = [] + cell_errors: list[str] = [] + cell_warnings: list[str] = [] for col in df.columns: if col == "Sample_ID": @@ -265,52 +347,70 @@ def _detect_mixed_type_columns(self, df: pd.DataFrame) -> tuple[list[str], list[ non_null_values = series.dropna() if len(non_null_values) == 0: + numeric_cols.append(col) continue has_numeric = False has_string = False + numeric_cells: list[tuple[int, Any]] = [] + string_cells: list[tuple[int, Any]] = [] for idx, cell_value in non_null_values.items(): - if isinstance(cell_value, str) and ";" in cell_value: - # Mutli-value cell - parts = [p.strip() for p in cell_value.split(";") if p.strip()] + if isinstance(cell_value, list): cell_has_numeric = False cell_has_string = False - for part in parts: - try: - float(part) + for element in cell_value: + if isinstance(element, (int, float)): cell_has_numeric = True - except ValueError: + else: cell_has_string = True if cell_has_numeric and cell_has_string: - cell_warnings.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains mixed types " - f"(both numeric and non-numeric values in semicolon-separated cell). " - f"This column will be treated as a string column." + cell_errors.append( + f"Row {idx + 2}, Column '{col}': List cell {cell_value} contains " + f"mixed element types (both numeric and string values). " + f"All elements in a list must be the same type.
Use " + f"either all numbers (e.g. [1, 2, 3]) or all strings " + f'(e.g. ["a", "b", "c"]).' ) has_string = True + string_cells.append((idx, cell_value)) elif cell_has_numeric: has_numeric = True + numeric_cells.append((idx, cell_value)) else: has_string = True + string_cells.append((idx, cell_value)) else: - # Single value cell try: float(cell_value) has_numeric = True + numeric_cells.append((idx, cell_value)) except (ValueError, TypeError): has_string = True + string_cells.append((idx, cell_value)) if has_numeric and has_string: mixed_type_columns.append(col) + minority = string_cells if len(string_cells) <= len(numeric_cells) else numeric_cells + minority_type = "non-numeric" if minority is string_cells else "numeric" + for idx, val in minority: + cell_warnings.append( + f"Row {idx + 2}, Column '{col}': Cell '{val}' is {minority_type} " + f"in a column that contains both numeric and non-numeric values. " + f"This column will be treated as string type." + ) + elif has_numeric: + numeric_cols.append(col) + else: + string_cols.append(col) - return mixed_type_columns, cell_warnings + return numeric_cols, string_cols, mixed_type_columns, cell_errors, cell_warnings - def _generate_mixed_type_warning(self, mixed_columns: list[str]) -> str | None: + def _generate_mixed_type_warning_clarification(self, mixed_columns: list[str]) -> str | None: """ - Generate warning about mixed-type columns. + Generate clarification warning about mixed-type columns. This tells users that mixed-type columns are treated as string in Divbase. """ if not mixed_columns: return None @@ -318,63 +418,56 @@ def _generate_mixed_type_warning(self, mixed_columns: list[str]) -> str | None: return ( "Clarification on mixed types columns: " "Columns are treated as string by DivBase if they contain a mix of numeric and non-numeric values " - "or numeric-looking values with extra characters (for example commas, hyphens, or range-like patterns such as '1-2'). 
" - "A column is only numeric if all values (including each part in semicolon-separated cells) are valid numbers. " - "Use semicolons (;) to separate multiple numeric values. " + 'or list cells with mixed element types (for example [1, "two"]). ' + "A column is only numeric if all values (including each element in list cells) are valid numbers. " + "Use Python list notation (e.g. [1, 2, 3]) for multi-value cells. " "Numeric query operations (ranges, inequalities) will not be applicable to string columns." ) - def _classify_columns( - self, df: pd.DataFrame, mixed_type_columns: list[str] - ) -> tuple[list[str], list[str], list[str]]: + def _check_for_semicolons_in_plain_string_cells(self, df: pd.DataFrame) -> list[str]: """ - Classify columns as numeric, string, or mixed-type. - """ - numeric_cols = [] - string_cols = [] - - for col in df.columns: - if col == "Sample_ID": - continue + Check for semicolons in plain string (single-value) cells. This is to reduce user confusion with the DivBase query filter syntax, + which uses semicolons to separate key:value pairs. For example: divbase query tsv "Area:North;Population:1" + filters for rows where Area is "North" AND Population is 1. - if col in mixed_type_columns: - continue - - series = df[col] - - if pd.api.types.is_numeric_dtype(series) or self.is_semicolon_separated_numeric_column(series): - numeric_cols.append(col) - else: - string_cols.append(col) - - return numeric_cols, string_cols, mixed_type_columns - - def _check_for_commas(self, df: pd.DataFrame) -> list[str]: - """ - Check for comma-separated values in any cell (warns user to use semicolons). + A TSV cell containing a semicolon (e.g. "2;4") will be treated as a plain string value and cannot be matched via the query syntax since + the query parser will split on the semicolon. If the user intended multiple values, they should use Python list notation instead. 
""" warnings = [] for col in df.columns: + if col == "Sample_ID": + continue series = df[col] - # Only check string columns if not pd.api.types.is_string_dtype(series) and not pd.api.types.is_object_dtype(series): continue for idx, cell_value in series.items(): - if isinstance(cell_value, str) and "," in cell_value: + if not isinstance(cell_value, str): + continue + if ";" in cell_value: warnings.append( - f"Row {idx + 2}, Column '{col}': Cell contains comma. " - "Use semicolons (;) to separate multiple values, not commas." + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a semicolon. " + "If you intended multiple values, use Python list notation instead " + '(e.g. [1, 2] or ["a", "b"]). ' + "If the semicolon is intentional, note that DivBase query syntax uses " + "semicolons to separate filter key:value pairs, so this exact cell value " + "cannot be matched via queries." ) - break # Only warn once per column + break return warnings - def _check_for_array_notation(self, df: pd.DataFrame) -> list[str]: + def _check_for_commas_in_plain_string_cells(self, df: pd.DataFrame) -> list[str]: """ - Check for Python/JSON-style array notation (e.g. '[1, 2, 3]') in any cell. - Array notation is not supported by DivBase - the column will be treated as a string. + Check for commas in plain string cells (in single-value cells, not in multi-values list cells) and warn users about the + ambiguity that might cause for DivBase filtering since the metadata query filter syntax uses commas to separate filter values. + For example: divbase query tsv "Area:North,South" filters for rows where Area is "North" OR "South". + + A TSV cell containing the literal string "North,South" would not match that query, because the query parser splits on commas. + This helper method warns uses about + + Commas inside list notation (e.g. ["North", "South"]) are fine since they are parsed as lists. 
""" warnings = [] @@ -384,12 +477,17 @@ def _check_for_array_notation(self, df: pd.DataFrame) -> list[str]: continue for idx, cell_value in series.items(): - if isinstance(cell_value, str) and cell_value.startswith("[") and cell_value.endswith("]"): + if not isinstance(cell_value, str): + continue + if "," in cell_value: warnings.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' uses array notation '[...]'. " - "DivBase does not support Python/JSON array notation. " - "This column will be treated as a string. " - "Use semicolons (;) to separate multiple values instead (e.g., '1;2;3')." + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a comma. " + "If you intended multiple values, use Python list notation instead " + '(e.g. [1, 2] or ["a", "b"]). ' + "If the comma is intentional, note that DivBase query syntax uses " + "commas to separate filter values. To query for this exact string, " + "enclose the value in double quotes in your filter (e.g. " + 'Area:"North,South").' ) break diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index bd06f2ba..82155746 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -1,5 +1,12 @@ """ Unit tests for the MetadataTSVValidator class. + +Tests the shared validation logic from SharedMetadataValidator as exercised +through the CLI's MetadataTSVValidator wrapper. + +Fixtures in this file use Python list notation for multi-value cells (the +current format). The shared conftest.py still contains old semicolon-format +fixtures used by query engine tests that haven't been migrated yet. 
""" from pathlib import Path @@ -15,76 +22,225 @@ def project_samples(): return {"S1", "S2", "S3", "S4", "S5"} -def test_valid_tsv_passes_all_checks(valid_tsv, project_samples): - """Valid TSV should pass with no errors or warnings.""" - stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) +@pytest.fixture +def valid_list_tsv(tmp_path): + """Valid TSV using Python list notation for multi-value cells.""" + content = "#Sample_ID\tPopulation\tArea\tWeight\n" + content += "S1\t1\tNorth\t12.5\n" + content += "S2\t[2, 4]\tEast\t18.8\n" + content += 'S3\t3\t["West", "South"]\t15.0\n' + content += "S4\t[3, 5]\tSouth\t20.0\n" + content += "S5\t4\tNorth\t22.1\n" + p = tmp_path / "valid_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def no_multi_values_tsv(tmp_path): + """TSV with only scalar values, no list notation.""" + content = "#Sample_ID\tPopulation\n" + content += "S1\t1\n" + content += "S2\t2\n" + p = tmp_path / "no_multi.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def numeric_list_tsv(tmp_path): + """TSV with numeric list cells and negative numbers.""" + content = "#Sample_ID\tScores\tValues\tTemperature\tLongitude\n" + content += "S1\t[1, 2, 3]\t[10, 20]\t-5.5\t-2.78\n" + content += "S2\t[4, 5]\t[30, 40, 50]\t-10.2\t-0.13\n" + content += "S3\t6\t60\t0\t1.25\n" + content += "S4\t[7, 8, 9, 10]\t70\t15.5\t[-3.5, -2.1]\n" + content += "S5\t11\t[80, 90]\t-20\t0\n" + p = tmp_path / "numeric_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def header_errors_tsv(tmp_path): + """TSV with header errors: wrong first column, duplicate columns, empty column.""" + content = "SampleID\tPopulation\tArea\tArea\t\n" + content += "S1\t1\tNorth\tEast\tValue\n" + p = tmp_path / "header_errors.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def sample_id_errors_tsv(tmp_path): + """TSV with Sample_ID errors: empty, list values, duplicates.""" + content = "#Sample_ID\tPopulation\n" + 
content += "S1\t1\n" + content += "\t2\n" + content += '["S3", "S4"]\t3\n' + content += "S1\t4\n" + p = tmp_path / "sample_id_errors.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def format_errors_tsv(tmp_path): + """TSV with formatting errors: wrong column count, commas, whitespace.""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += "S2\t2,3\tEast\n" + content += "S3 \t 4 \t West \n" + content += "S4\t5\n" + p = tmp_path / "format_errors.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def invalid_list_syntax_tsv(tmp_path): + """TSV with cells that look like lists but have invalid syntax.""" + content = "#Sample_ID\tScores\tValues\n" + content += "S1\t[1, 2, 3]\tgood\n" + content += "S2\t[4\tbad_unclosed\n" + content += "S3\t[1 2 3]\tbad_no_commas\n" + content += "S4\t5\tnormal\n" + content += "S5\t[6, 7]\tok\n" + p = tmp_path / "invalid_list_syntax.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def mixed_type_list_cell_tsv(tmp_path): + """TSV with a list cell containing mixed element types (error).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += 'S2\t[1, "two", 3]\tEast\n' + content += "S3\t5\tSouth\n" + p = tmp_path / "mixed_type_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def mixed_type_across_cells_tsv(tmp_path): + """TSV with mixed types across cells in a column (some numeric, some string).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += "S2\tabc\tEast\n" + content += "S3\t3\tSouth\n" + content += "S4\t4\tWest\n" + content += "S5\tdef\tNorth\n" + p = tmp_path / "mixed_across_cells.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def semicolons_in_cells_tsv(tmp_path): + """TSV with semicolons in cell values (should produce warnings).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += "S2\t2;4\tEast\n" + content += 
"S3\t3\tWest;South\n" + p = tmp_path / "semicolons.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def whitespace_variant_lists_tsv(tmp_path): + """TSV with different whitespace styles in list notation.""" + content = "#Sample_ID\tScores\n" + content += "S1\t[1, 2, 3]\n" + content += "S2\t[4,5,6]\n" + content += "S3\t[ 7 , 8 , 9 ]\n" + p = tmp_path / "whitespace_lists.tsv" + p.write_text(content) + return p + - assert len(errors) == 0 - assert len(warnings) == 0 - assert stats["total_columns"] == 4 - assert stats["user_defined_columns"] == 3 - assert stats["samples_in_tsv"] == 5 - assert stats["samples_matching_project"] == 5 - assert stats["has_multi_values"] is True - assert "Population" in stats["numeric_columns"] - assert "Area" in stats["string_columns"] - assert "Weight" in stats["numeric_columns"] +@pytest.fixture +def string_list_tsv(tmp_path): + """TSV with string list cells.""" + content = "#Sample_ID\tAreas\tScore\n" + content += 'S1\t["North", "South"]\t10\n' + content += 'S2\t["East"]\t20\n' + content += "S3\tWest\t30\n" + p = tmp_path / "string_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def empty_list_tsv(tmp_path): + """TSV with an empty list cell.""" + content = "#Sample_ID\tScores\n" + content += "S1\t[1, 2]\n" + content += "S2\t[]\n" + content += "S3\t3\n" + p = tmp_path / "empty_list.tsv" + p.write_text(content) + return p + + +class TestValidTSV: + """Test that a valid TSV with list notation passes all checks.""" + + def test_valid_list_tsv_passes_all_checks(self, valid_list_tsv, project_samples): + """Test that a valid TSV with list notation passes with no errors or warnings.""" + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + + assert len(errors) == 0 + assert len(warnings) == 0 + assert stats["total_columns"] == 4 + assert stats["user_defined_columns"] == 3 + assert stats["samples_in_tsv"] == 5 + assert stats["samples_matching_project"] == 5 + assert 
stats["has_multi_values"] is True + assert "Population" in stats["numeric_columns"] + assert "Area" in stats["string_columns"] + assert "Weight" in stats["numeric_columns"] class TestHeaderValidation: """Test validation of header row.""" def test_wrong_first_column_name(self, header_errors_tsv, project_samples): - """Test that first column is '#Sample_ID'.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) - assert any("First column must be named '#Sample_ID'" in e for e in errors) def test_duplicate_column_names(self, header_errors_tsv, project_samples): - """Test that duplicate column names are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) - assert any("Duplicate column names" in e and "Area" in e for e in errors) def test_duplicate_column_names_after_stripping_hash(self, tmp_path, project_samples): - """Test that duplicate column names are detected even when one has '#' prefix and one doesn't. 
- This ensures consistency with server-side validation which strips '#' before checking duplicates.""" - tsv_content = """#Sample_ID\tSample_ID\tPopulation -S1\tS1_dup\t1 -S2\tS2_dup\t2 -""" - tsv_file = tmp_path / "duplicate_sample_id_columns.tsv" - tsv_file.write_text(tsv_content) + content = "#Sample_ID\tSample_ID\tPopulation\nS1\tS1_dup\t1\nS2\tS2_dup\t2\n" + tsv_file = tmp_path / "dup_sample_id_cols.tsv" + tsv_file.write_text(content) stats, errors, warnings = MetadataTSVValidator.validate(tsv_file, project_samples) - assert any("Duplicate column names" in e and "Sample_ID" in e for e in errors) def test_empty_column_name(self, header_errors_tsv, project_samples): - """Test that empty column names are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) - assert any("Empty column name" in e for e in errors) class TestSampleIDValidation: """Test validation of Sample_ID column.""" - def test_empty_sample_id(self, sample_errors_tsv, project_samples): - """Test that empty Sample_ID are detected.""" - stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) - + def test_empty_sample_id(self, sample_id_errors_tsv, project_samples): + stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) assert any("Sample_ID is empty" in e for e in errors) - def test_semicolon_in_sample_id(self, sample_errors_tsv, project_samples): - """Test that Sample_ID containing semicolon are detected.""" - stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) - - assert any("contains semicolon" in e and "S3;S4" in e for e in errors) - - def test_duplicate_sample_id(self, sample_errors_tsv, project_samples): - """Test that duplicate Sample_IDs are detected.""" - stats, errors, warnings = MetadataTSVValidator.validate(sample_errors_tsv, project_samples) + def test_list_value_in_sample_id(self, sample_id_errors_tsv, project_samples): 
+ """List notation in Sample_ID should produce an error.""" + stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) + assert any("list values" in e.lower() for e in errors) + def test_duplicate_sample_id(self, sample_id_errors_tsv, project_samples): + stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) @@ -92,123 +248,141 @@ class TestFormattingValidation: """Test validation of TSV formatting.""" def test_wrong_column_count(self, format_errors_tsv, project_samples): - """Test that rows with wrong number of columns are detected.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) - assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) - def test_comma_in_cell(self, format_errors_tsv, project_samples): - """Test that commas in cells generate warnings (not errors) and cause column to be treated as string.""" + def test_comma_in_cell_produces_warning(self, format_errors_tsv, project_samples): stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) - assert any("comma" in w.lower() for w in warnings) assert not any("comma" in e.lower() for e in errors) - assert "Population" in stats["mixed_type_columns"] or "Population" in stats["string_columns"] - def test_whitespace_warning(self, format_errors_tsv, project_samples): - """Test that leading/trailing whitespace generate warnings.""" stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) - assert any("leading or trailing whitespace" in w for w in warnings) + def test_semicolons_in_cells_produce_warning(self, semicolons_in_cells_tsv): + stats, errors, warnings = MetadataTSVValidator.validate(semicolons_in_cells_tsv, {"S1", "S2", "S3"}) + assert any("semicolon" in w.lower() for w in warnings) + assert not any("semicolon" in 
e.lower() for e in errors) -class TestTypeValidation: - """Test validation of column types. - Mixed types (columns with both numeric-looking and non-numeric values) are treated - as string columns and reported as warnings, not errors. - """ +class TestListSyntaxValidation: + """Test validation of Python list notation in cells.""" - def test_mixed_types_in_column_is_warning(self, type_errors_tsv, project_samples): - """Test that columns with mixed numeric and string types produce a warning (not error) and be classified as mixed_type.""" - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) + def test_valid_list_syntax_no_errors(self, valid_list_tsv, project_samples): + """Test that valid list notation like [1, 2] should parse without errors.""" + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + assert len(errors) == 0 - assert any("mixed" in w.lower() and "Population" in w for w in warnings) - assert not any("mixed types" in e.lower() and "Population" in e for e in errors) + def test_invalid_list_syntax_produces_errors(self, invalid_list_syntax_tsv, project_samples): + """Test that cells starting with '[' but can't be parsed produce hard errors.""" + stats, errors, warnings = MetadataTSVValidator.validate(invalid_list_syntax_tsv, project_samples) + assert any("[4" in e and "invalid" in e.lower() for e in errors) + assert any("[1 2 3]" in e and "invalid" in e.lower() for e in errors) - def test_mixed_types_in_cell_is_warning(self, type_errors_tsv, project_samples): - """Test that cells with mixed types (e.g., '1;three;5') produce a warning (not error).""" - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) + def test_invalid_list_syntax_collects_all_errors(self, invalid_list_syntax_tsv, project_samples): + """Test that all invalid list cells should be reported, not just the first one.""" + stats, errors, warnings = 
MetadataTSVValidator.validate(invalid_list_syntax_tsv, project_samples) + list_errors = [e for e in errors if "invalid" in e.lower() and "list" in e.lower()] + assert len(list_errors) == 2 - assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) - assert not any("1;three;5" in e and "mixed types" in e.lower() for e in errors) + def test_whitespace_insensitive_list_parsing(self, whitespace_variant_lists_tsv): + """Test that [1,2,3], [1, 2, 3], and [ 1 , 2 , 3 ] all parse identically.""" + stats, errors, warnings = MetadataTSVValidator.validate(whitespace_variant_lists_tsv, {"S1", "S2", "S3"}) + assert len(errors) == 0 + assert "Scores" in stats["numeric_columns"] - def test_hyphen_in_numeric_looking_column_is_warning(self, type_errors_tsv, project_samples): - """Test that hyphens in values that look like range notation produce a warning (not error).""" - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) + def test_empty_list_parses_successfully(self, empty_list_tsv): + """Test that an empty list [] should parse without errors.""" + stats, errors, warnings = MetadataTSVValidator.validate(empty_list_tsv, {"S1", "S2", "S3"}) + list_errors = [e for e in errors if "list" in e.lower()] + assert len(list_errors) == 0 - assert any("hyphen" in w.lower() for w in warnings) - assert not any("hyphen" in e.lower() for e in errors) - def test_cell_and_column_level_mixed_types_are_warnings(self, type_errors_tsv, project_samples): - """Test that when a column has both cell-level and column-level mixed types, both produce warnings (not errors).""" - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) +class TestTypeValidation: + """Test column type classification and mixed-type detection.""" - assert any("1;three;5" in w and "mixed types" in w.lower() for w in warnings) - assert any("mixed" in w.lower() and "Population" in w for w in warnings) - assert "Population" in 
stats["mixed_type_columns"] - assert "Test" in stats["mixed_type_columns"] + def test_numeric_list_column_is_numeric(self, numeric_list_tsv, project_samples): + """Test that columns with only numeric list cells (multi-value) and numeric scalars (single-values) are numeric.""" + stats, errors, warnings = MetadataTSVValidator.validate(numeric_list_tsv, project_samples) + assert "Scores" in stats["numeric_columns"] + assert "Values" in stats["numeric_columns"] + assert "Scores" not in stats["mixed_type_columns"] - def test_stats_show_mixed_type_columns(self, type_errors_tsv, project_samples): - """ - Test that stats show columns as mixed-type information to user. - The type_errors_tsv fixture used here has columns with mixed types. - """ - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, project_samples) + def test_string_list_column_is_string(self, string_list_tsv): + """Test that columns with string list cells are classified as string.""" + stats, errors, warnings = MetadataTSVValidator.validate(string_list_tsv, {"S1", "S2", "S3"}) + assert "Areas" in stats["string_columns"] + assert "Score" in stats["numeric_columns"] - assert "Population" in stats["mixed_type_columns"] - assert "Test" in stats["mixed_type_columns"] - assert len(stats["mixed_type_columns"]) == 3 + def test_mixed_types_within_list_cell_is_error(self, mixed_type_list_cell_tsv): + """Test that a list cell like [1, "two", 3] with mixed element types raise an error.""" + stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_list_cell_tsv, {"S1", "S2", "S3"}) + assert any("mixed element types" in e.lower() for e in errors) - def test_multi_value_numeric_cells_are_numeric(self, numeric_multi_values_tsv, project_samples): - """Test that multi-value numeric cells (e.g., '2;4') are correctly classified as numeric, not string or mixed-type.""" - stats, errors, warnings = MetadataTSVValidator.validate(numeric_multi_values_tsv, project_samples) + def 
test_mixed_types_across_cells_is_warning(self, mixed_type_across_cells_tsv, project_samples): + """Test that a column with both numeric and string sends a warning (not error).""" + stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + assert "Population" in stats["mixed_type_columns"] + assert not any("Population" in e and "mixed" in e.lower() for e in errors) + + def test_mixed_types_across_cells_produces_per_cell_warnings(self, mixed_type_across_cells_tsv, project_samples): + """Test that mixed-type columns identifies which cells are outliers.""" + stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + cell_warnings = [w for w in warnings if "non-numeric" in w.lower() and "Population" in w] + assert len(cell_warnings) >= 1 + assert any("abc" in w for w in cell_warnings) + assert any("def" in w for w in cell_warnings) + + def test_mixed_types_clarification_warning(self, mixed_type_across_cells_tsv, project_samples): + """Test that a general clarification warning is sent for mixed-type columns.""" + stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + assert any("clarification on mixed types" in w.lower() for w in warnings) + + def test_negative_numbers_are_numeric(self, numeric_list_tsv, project_samples): + """Test that negative numbers are classified as numeric, not string.""" + stats, errors, warnings = MetadataTSVValidator.validate(numeric_list_tsv, project_samples) + assert "Temperature" in stats["numeric_columns"] + assert "Longitude" in stats["numeric_columns"] + assert len(stats["mixed_type_columns"]) == 0 + def test_mixed_list_and_scalar_in_same_column(self, tmp_path): + """Test that a column with both list cells and compatible scalar (single-value) cells pass.""" + content = "#Sample_ID\tScores\n" + content += "S1\t[1, 2]\n" + content += "S2\t5\n" + content += "S3\t[3, 4]\n" + p = tmp_path / 
"mixed_list_scalar.tsv" + p.write_text(content) + stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2", "S3"}) + assert len(errors) == 0 assert "Scores" in stats["numeric_columns"] - assert "Values" in stats["numeric_columns"] - assert "Scores" not in stats["string_columns"] - assert "Values" not in stats["string_columns"] - assert "Scores" not in stats["mixed_type_columns"] - assert "Values" not in stats["mixed_type_columns"] - assert not any("mixed" in w.lower() and ("Scores" in w or "Values" in w) for w in warnings) class TestDimensionMatching: """Test validation against project dimensions.""" - def test_samples_not_in_project(self, valid_tsv): - """Test that samples in TSV but not in project raise error.""" + def test_samples_not_in_project(self, valid_list_tsv): project_samples = {"S1", "S2"} - stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) - + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) assert any( - "following samples in the TSV were not found in the DivBase project's dimensions index" in e and "S3" in e - for e in errors + "following samples in the TSV were not found in the DivBase project's dimensions index" in e for e in errors ) - def test_samples_not_in_tsv(self, valid_tsv): - """Test that samples in project but not in TSV produce warnings.""" + def test_samples_not_in_tsv(self, valid_list_tsv): project_samples = {"S1", "S2", "S3", "S10", "S20"} - stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) - + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) assert any( "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S10" in w for w in warnings ) - assert any( - "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S20" in w - for w in warnings - ) class TestStatistics: """Test statistics 
collection.""" - def test_statistics_collection(self, valid_tsv, project_samples): - """Test that statistics are correctly collected.""" - stats, errors, warnings = MetadataTSVValidator.validate(valid_tsv, project_samples) - + def test_statistics_collection(self, valid_list_tsv, project_samples): + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) assert stats["total_columns"] == 4 assert stats["user_defined_columns"] == 3 assert stats["samples_in_tsv"] == 5 @@ -220,164 +394,57 @@ def test_statistics_collection(self, valid_tsv, project_samples): assert stats["has_multi_values"] is True def test_no_multi_values_detected(self, no_multi_values_tsv): - """Test multi-value detection when no semicolon-separated values are present.""" + """Test that has_multi_values is False when no list cells exist.""" stats, errors, warnings = MetadataTSVValidator.validate(no_multi_values_tsv, {"S1", "S2"}) assert stats["has_multi_values"] is False + def test_multi_values_detected_via_list_cells(self, valid_list_tsv, project_samples): + """Test that has_multi_values is True when list cells exist.""" + stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + assert stats["has_multi_values"] is True + class TestEdgeCases: """Test edge cases and error conditions.""" def test_empty_file(self, project_samples, tmp_path): - """Test that empty files are detected.""" empty_file = tmp_path / "empty.tsv" empty_file.write_text("") - stats, errors, warnings = MetadataTSVValidator.validate(empty_file, project_samples) - assert any("File is empty" in e for e in errors) def test_nonexistent_file(self, project_samples): - """Test that nonexistent files are handled gracefully.""" stats, errors, warnings = MetadataTSVValidator.validate(Path("/nonexistent/file.tsv"), project_samples) - assert any("Failed to read file" in e for e in errors) - -class TestNegativeNumbers: - """Test that negative numbers are properly handled as numeric 
values.""" - - def test_negative_numbers_are_numeric(self, numeric_multi_values_tsv, negative_numeric_columns): - """Test that negative numbers are correctly classified as numeric, not flagged as errors due to hyphen check for ranges in numeric cells.""" - stats, errors, warnings = MetadataTSVValidator.validate( - numeric_multi_values_tsv, {"S1", "S2", "S3", "S4", "S5"} - ) - - for col in negative_numeric_columns: - assert not any("hyphen" in e.lower() and col in e for e in errors) - assert col in stats["numeric_columns"] - - assert len(stats["mixed_type_columns"]) == 0 - - def test_negative_numbers_with_semicolons(self, numeric_multi_values_tsv, negative_numeric_columns): - """Test that negative numbers in semicolon-separated cells are handled correctly.""" - stats, errors, warnings = MetadataTSVValidator.validate( - numeric_multi_values_tsv, {"S1", "S2", "S3", "S4", "S5"} - ) - - assert "Longitude" in negative_numeric_columns - assert "Longitude" in stats["numeric_columns"] - assert "Longitude" not in stats["mixed_type_columns"] - assert not any("Longitude" in e and "mixed" in e.lower() for e in errors) - - def test_range_notation_produces_warning(self, type_errors_tsv): - """Test that range notation like '1-2' in a mixed-type column produces a warning (column treated as string).""" - stats, errors, warnings = MetadataTSVValidator.validate(type_errors_tsv, {"S1", "S2", "S3", "S4"}) - assert any("clarification on mixed types columns" in w.lower() for w in warnings) - - -class TestSemicolonColumnTypeClassification: - """Test that the validator correctly classifies columns when semicolon-separated - cells contain a mix of numeric and non-numeric parts.""" - - @pytest.fixture - def semicolon_mixed_tsv(self, tmp_path): - """TSV where a column has '1;1-2' - a cell with one numeric and one non-numeric part.""" - content = "#Sample_ID\tCode\tPureNumSemicolon\n" - content += "S1\t1;1-2\t10;20;30\n" - content += "S2\t3\t40\n" - content += "S3\t5\t50;60\n" - tsv_file = 
tmp_path / "semicolon_mixed.tsv" - tsv_file.write_text(content) - return tsv_file - - def test_semicolon_cell_with_non_numeric_part_is_mixed(self, semicolon_mixed_tsv): - """Test that a column with cell '1;1-2' is classified as mixed-type because '1-2' is not a number.""" - stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) - - assert "Code" in stats["mixed_type_columns"] - assert "Code" not in stats["numeric_columns"] - assert "Code" not in stats["string_columns"] - - def test_semicolon_cell_mixed_produces_cell_level_warning(self, semicolon_mixed_tsv): - """Test that a cell '1;1-2' produces a cell-level mixed-type warning.""" - stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) - - assert any("1;1-2" in w and "mixed types" in w.lower() for w in warnings) - - def test_semicolon_cell_mixed_produces_column_level_warning(self, semicolon_mixed_tsv): - """Test that the column-level mixed-type warning mentions the semicolon classification rule.""" - stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) - - assert any("semicolon-separated" in w and "Code" in w for w in warnings) - - def test_semicolon_cell_mixed_is_not_error(self, semicolon_mixed_tsv): - """Test that mixed types from semicolon cells doesn't produce errors.""" - stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) - - assert not any("mixed" in e.lower() for e in errors) - - def test_purely_numeric_semicolon_column_stays_numeric(self, semicolon_mixed_tsv): - """Test that a column with only numeric values in semicolons (e.g., '10;20;30') should be numeric.""" - stats, errors, warnings = MetadataTSVValidator.validate(semicolon_mixed_tsv, {"S1", "S2", "S3"}) - - assert "PureNumSemicolon" in stats["numeric_columns"] - assert "PureNumSemicolon" not in stats["mixed_type_columns"] - assert "PureNumSemicolon" not in stats["string_columns"] 
- - -class TestArrayNotation: - """Test that Python/JSON-style array notation '[...]' in cells produces a warning and is treated as string.""" - - def test_array_notation_produces_warning(self, array_notation_tsv): - """Test that array notation in a cell sends a warning that tells the user to use semicolons instead.""" - stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) - - assert any("array notation" in w.lower() for w in warnings) - assert any("semicolon" in w.lower() and "array notation" in w.lower() for w in warnings) - - def test_array_notation_is_not_an_error(self, array_notation_tsv): - """Test that array notation should produce a warning, not an error.""" - stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) - - assert not any("array" in e.lower() for e in errors) - - def test_array_notation_column_not_numeric(self, array_notation_tsv): - """Test that a column containing array notation should not be classified as numeric type.""" - stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) - - assert "Population" not in stats["numeric_columns"] - assert "Population" in stats["mixed_type_columns"] or "Population" in stats["string_columns"] - - def test_array_notation_warns_once_per_column(self, array_notation_tsv): - """Test that only one warning per column should be emitted for array notation.""" - stats, errors, warnings = MetadataTSVValidator.validate(array_notation_tsv, {"S1", "S2", "S3"}) - - array_warnings = [w for w in warnings if "array notation" in w.lower() and "Population" in w] - assert len(array_warnings) == 1 - - def test_array_notation_multiple_columns_warns_per_column(self, array_notation_multiple_cols_tsv): - """Test that each column with array notation should get its own warning.""" - stats, errors, warnings = MetadataTSVValidator.validate(array_notation_multiple_cols_tsv, {"S1", "S2", "S3"}) - - population_warnings 
= [w for w in warnings if "array notation" in w.lower() and "Population" in w] - scores_warnings = [w for w in warnings if "array notation" in w.lower() and "Scores" in w] - assert len(population_warnings) == 1 - assert len(scores_warnings) == 1 - - def test_non_array_bracket_strings_do_not_warn(self, tmp_path): - """Test that strings that are not array notation (e.g., '[ref]', 'group[1]') does not trigger the warning.""" + def test_non_list_bracket_strings(self, tmp_path): + """Test that strings like 'group[1]' that don't start with '[' should not trigger list parsing.""" content = "#Sample_ID\tCode\n" - content += "S1\t[ref]\n" # Starts and ends with [ and ] - content += ( - "S2\tgroup[1]\n" # does not start with [, should not be treated as array notation despite ] at the end - ) - content += "S3\tnormal\n" - tsv_file = tmp_path / "bracket_strings.tsv" - tsv_file.write_text(content) - - stats, errors, warnings = MetadataTSVValidator.validate(tsv_file, {"S1", "S2", "S3"}) - - array_warnings = [w for w in warnings if "array notation" in w.lower()] - # Only S1's cell '[ref]' matches the array notation so onlu 1 warning is expected. 
- assert len(array_warnings) == 1 + content += "S1\tgroup[1]\n" + content += "S2\tnormal\n" + p = tmp_path / "bracket_strings.tsv" + p.write_text(content) + stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + list_errors = [e for e in errors if "list" in e.lower()] + assert len(list_errors) == 0 + + def test_cell_starting_with_bracket_but_not_list(self, tmp_path): + """Test that a cell like '[ref]' starts with '[' -- ast.literal_eval will fail, producing an error.""" + content = "#Sample_ID\tCode\n" + content += "S1\t[ref]\n" + content += "S2\tnormal\n" + p = tmp_path / "bracket_ref.tsv" + p.write_text(content) + stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + assert any("[ref]" in e and "invalid" in e.lower() for e in errors) + + def test_tuple_notation_is_not_a_list(self, tmp_path): + """Test that a cell like '(1, 2)' that starts with '(' not '[' does not trigger list parsing.""" + content = "#Sample_ID\tCode\n" + content += "S1\t(1, 2)\n" + content += "S2\tnormal\n" + p = tmp_path / "tuple.tsv" + p.write_text(content) + stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + list_errors = [e for e in errors if "list" in e.lower()] + assert len(list_errors) == 0 From 847c7f96d0925d534bd38e37cd6d8dc2f0fb2c78 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Wed, 25 Feb 2026 13:59:49 +0100 Subject: [PATCH 093/100] WIP:make validation results more robust for worker The shared metadata validator has two callers: the CLI validator and the server-side query engine. The CLI validator will only need to display the error and warning messages to the user. But the query engine needs to act on them (raise or warn). The previous implementation that relied on string matching the error messages was brittle: if the message phrasing changes (which it is prone to do in the pilot phase when we expect a lot of feedback), troubleshooting the query engine will be needed often. 
Inspired by HTTP status codes, this creates a custom DivBase metadata validation Enum category system that will allow the query engine to look for the validation Enums and not for the exact strings in the messages. Tagged WIP here too since things are probably still broken as the refactoring is not complete --- .../services/sample_metadata_tsv_validator.py | 4 +- .../src/divbase_lib/metadata_validator.py | 251 ++++++++++++------ 2 files changed, 173 insertions(+), 82 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index a19f86db..e22567ce 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -47,8 +47,8 @@ def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tup ) result = shared_validator.load_and_validate() - validator.errors = result.errors - validator.warnings = result.warnings + validator.errors = [error_entry.message for error_entry in result.errors] + validator.warnings = [warning_entry.message for warning_entry in result.warnings] if result.df is not None and "Sample_ID" in result.df.columns: try: diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index bb1d347b..21dcdec5 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -11,20 +11,50 @@ import ast import csv from dataclasses import dataclass, field +from enum import Enum from pathlib import Path from typing import Any import pandas as pd +class ValidationCategory(Enum): + """ + Categories for validation messages to be used by the sever-side query engine in SidecarQueryManager. 
+ These categories are used by SidecarQueryManager to act on the validation results by Enums instead of string-matching the error and warning messages. + """ + + # Error categories + FILE_READ = "file_read" + HEADER = "header" + SAMPLE_ID_COLUMN = "sample_id_column" + SAMPLE_ID_VALUE = "sample_id_value" + ROW_FORMAT = "row_format" + LIST_SYNTAX = "list_syntax" + MIXED_TYPE = "mixed_type" + DIMENSIONS = "dimensions" + # Warning categories + FORMAT = "format" + TYPE_CLASSIFICATION = "type_classification" + + +@dataclass +class ValidationMessage: + """A class to store the category and message for errors and warnings detected by SharedMetadataValidator.""" + + category: ValidationCategory + message: str + + @dataclass class MetadataValidationResult: """ - Dataclass to hold the results of the TSV file validation. + Dataclass to hold the results of the TSV file validation. Used by the callers of SharedMetadataValidator + (including the client-side CLI validator and the server-side query engine). 
""" - errors: list[str] = field(default_factory=list) - warnings: list[str] = field(default_factory=list) + errors: list[ValidationMessage] = field(default_factory=list) + warnings: list[ValidationMessage] = field(default_factory=list) stats: dict[str, Any] = field(default_factory=dict) df: pd.DataFrame | None = None mixed_type_columns: list[str] = field(default_factory=list) @@ -75,7 +105,7 @@ def load_and_validate(self) -> MetadataValidationResult: rows = list(reader) if not rows: - self.result.errors.append("File is empty") + self.result.errors.append(ValidationMessage(ValidationCategory.FILE_READ, "File is empty")) return self.result # Pre-pandas checks: @@ -127,7 +157,7 @@ def load_and_validate(self) -> MetadataValidationResult: self.result.warnings.extend(dim_warnings) except Exception as e: - self.result.errors.append(f"Failed to read file: {e}") + self.result.errors.append(ValidationMessage(ValidationCategory.FILE_READ, f"Failed to read file: {e}")) return self.result @@ -136,7 +166,7 @@ def _strip_whitespace_from_cells(self, df: pd.DataFrame) -> None: for col in df.columns: df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x) - def _parse_list_cells_in_dataframe(self, df: pd.DataFrame) -> list[str]: + def _parse_list_cells_in_dataframe(self, df: pd.DataFrame) -> list[ValidationMessage]: """ Parse all string cells in object columns that look like Python list literals, and collect errors for cells that fail to parse. @@ -146,10 +176,8 @@ def _parse_list_cells_in_dataframe(self, df: pd.DataFrame) -> list[str]: ast.literal_eval is whitespace-insensitive within list notation: [3,2], [3, 2], and [ 3 , 2 ] all parse identically to [3, 2]. - - Returns a list of error messages for all cells with invalid list syntax. 
""" - errors = [] + errors: list[ValidationMessage] = [] for col in df.select_dtypes(include=["object"]).columns: for idx, cell_value in df[col].items(): @@ -164,26 +192,31 @@ def _parse_list_cells_in_dataframe(self, df: pd.DataFrame) -> list[str]: df.at[idx, col] = parsed else: errors.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' starts with '[' " - f"but parsed as {type(parsed).__name__}, not a list." + ValidationMessage( + ValidationCategory.LIST_SYNTAX, + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' starts with '[' " + f"but parsed as {type(parsed).__name__}, not a list.", + ) ) except (ValueError, SyntaxError): errors.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' has invalid " - "Python list syntax. Multi-value cells must use valid Python list " - 'notation, e.g. [1, 2, 3] or ["a", "b"].' + ValidationMessage( + ValidationCategory.LIST_SYNTAX, + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' has invalid " + "Python list syntax. Multi-value cells must use valid Python list " + 'notation, e.g. [1, 2, 3] or ["a", "b"].', + ) ) return errors - def _validate_raw_header(self, header_line: str) -> tuple[list[str], list[str]]: + def _validate_raw_header(self, header_line: str) -> tuple[list[ValidationMessage], list[ValidationMessage]]: """ Validate the raw header line before pandas processing. Attempts to catch issues that pandas would silently fix (like duplicate columns). - """ - errors = [] + errors: list[ValidationMessage] = [] raw_columns = header_line.split("\t") cleaned_columns = [col.lstrip("#") for col in raw_columns] @@ -191,10 +224,12 @@ def _validate_raw_header(self, header_line: str) -> tuple[list[str], list[str]]: empty_columns = [i + 1 for i, col in enumerate(cleaned_columns) if not col.strip()] if empty_columns: errors.append( - f"Empty column name(s) found at position(s): {empty_columns}. All columns must have a non-empty name." 
+ ValidationMessage( + ValidationCategory.HEADER, + f"Empty column name(s) found at position(s): {empty_columns}. All columns must have a non-empty name.", + ) ) - # Check for duplicate columns after stripping '#' seen = {} duplicate_columns = [] for col in cleaned_columns: @@ -207,21 +242,29 @@ def _validate_raw_header(self, header_line: str) -> tuple[list[str], list[str]]: if duplicate_columns: errors.append( - f"Duplicate column names found: {duplicate_columns}. " - "Each column name must be unique in the metadata file." + ValidationMessage( + ValidationCategory.HEADER, + f"Duplicate column names found: {duplicate_columns}. " + "Each column name must be unique in the metadata file.", + ) ) if raw_columns and raw_columns[0] != "#Sample_ID": - errors.append(f"First column must be named '#Sample_ID', found: '{raw_columns[0]}'") + errors.append( + ValidationMessage( + ValidationCategory.SAMPLE_ID_COLUMN, + f"First column must be named '#Sample_ID', found: '{raw_columns[0]}'", + ) + ) return errors, [] - def _check_row_formatting(self, rows: list[list[str]]) -> tuple[list[str], list[str]]: + def _check_row_formatting(self, rows: list[list[str]]) -> tuple[list[ValidationMessage], list[ValidationMessage]]: """ Check for row-level formatting issues that pandas might handle silently. """ - errors = [] - warnings = [] + errors: list[ValidationMessage] = [] + warnings: list[ValidationMessage] = [] header = rows[0] num_columns = len(header) @@ -230,9 +273,12 @@ def _check_row_formatting(self, rows: list[list[str]]) -> tuple[list[str], list[ if len(row) != num_columns: sample_hint = f" (Sample_ID: '{row[0]}')" if row else "" errors.append( - f"Row {row_num}: Expected {num_columns} tab-separated columns from reading the header, " - f"found {len(row)}{sample_hint}. " - "Check that all cells in the TSV are separated by tabs (not spaces)." 
+ ValidationMessage( + ValidationCategory.ROW_FORMAT, + f"Row {row_num}: Expected {num_columns} tab-separated columns from reading the header, " + f"found {len(row)}{sample_hint}. " + "Check that all cells in the TSV are separated by tabs (not spaces).", + ) ) continue @@ -240,34 +286,57 @@ def _check_row_formatting(self, rows: list[list[str]]) -> tuple[list[str], list[ if cell != cell.strip(): col_name = header[col_idx] warnings.append( - f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " - "(this is allowed, but note that they will be stripped by DivBase server when the TSV is used for queries)" + ValidationMessage( + ValidationCategory.FORMAT, + f"Row {row_num}, Column '{col_name}': Cell has leading or trailing whitespace " + "(this is allowed, but note that they will be stripped by DivBase server when the TSV is used for queries)", + ) ) if col_idx == 0 and not cell.strip(): - errors.append(f"Row {row_num}: Sample_ID is empty") + errors.append( + ValidationMessage( + ValidationCategory.SAMPLE_ID_VALUE, + f"Row {row_num}: Sample_ID is empty", + ) + ) return errors, warnings - def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: + def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[ValidationMessage], list[ValidationMessage]]: """ Validate Sample_ID column in the DataFrame. The Sample_ID column must be present in the TSV, it has to be the first column, and it must contain non-empty, unique values. Further more, it must have a single value per row. List values (Python list notation like ["S1", "S2"]) are not allowed in Sample_ID. 
""" - errors = [] + errors: list[ValidationMessage] = [] if "Sample_ID" not in df.columns: - errors.append("The 'Sample_ID' column is required in the metadata file.") + errors.append( + ValidationMessage( + ValidationCategory.SAMPLE_ID_COLUMN, + "The 'Sample_ID' column is required in the metadata file.", + ) + ) return errors, [] if df["Sample_ID"].isna().any() or (df["Sample_ID"] == "").any(): - errors.append("Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID.") + errors.append( + ValidationMessage( + ValidationCategory.SAMPLE_ID_VALUE, + "Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID.", + ) + ) if df["Sample_ID"].duplicated().any(): duplicates = df[df["Sample_ID"].duplicated()]["Sample_ID"].tolist() - errors.append(f"Duplicate Sample_IDs found: {duplicates}. Each Sample_ID must be unique.") + errors.append( + ValidationMessage( + ValidationCategory.SAMPLE_ID_VALUE, + f"Duplicate Sample_IDs found: {duplicates}. Each Sample_ID must be unique.", + ) + ) list_sample_ids = [ sid @@ -277,8 +346,11 @@ def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[str], list[str]]: ] if list_sample_ids: errors.append( - f"Sample_ID column contains list values: {list_sample_ids}. " - "Sample_ID must contain only one value per row (list notation is not allowed)." + ValidationMessage( + ValidationCategory.SAMPLE_ID_VALUE, + f"Sample_ID column contains list values: {list_sample_ids}. 
" + "Sample_ID must contain only one value per row (list notation is not allowed).", + ) ) return errors, [] @@ -315,7 +387,9 @@ def parse_cell_value(cell_value) -> Any: return cell_value - def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], list[str], list[str], list[str]]: + def _classify_column_type( + self, df: pd.DataFrame + ) -> tuple[list[str], list[str], list[str], list[ValidationMessage], list[ValidationMessage]]: """ Classify every user-defined column as numeric, string, or mixed-type in a single pass over the data. @@ -327,17 +401,15 @@ def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], After scanning all cells in a column, classify the column type: - All cells are numeric -> numeric column - - All cells are string string -> string column + - All cells are string -> string column - Contains both numeric and string cells -> mixed-type column: treat as string and send per-cell warnings to communicate the ambiguities to the user. - All cells are Null -> numeric (to match Pandas default for NaN-only columns) - - Returns (numeric_cols, string_cols, mixed_type_columns, cell_errors, cell_warnings). """ numeric_cols: list[str] = [] string_cols: list[str] = [] mixed_type_columns: list[str] = [] - cell_errors: list[str] = [] - cell_warnings: list[str] = [] + cell_errors: list[ValidationMessage] = [] + cell_warnings: list[ValidationMessage] = [] for col in df.columns: if col == "Sample_ID": @@ -368,11 +440,14 @@ def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], if cell_has_numeric and cell_has_string: cell_errors.append( - f"Row {idx + 2}, Column '{col}': List cell {cell_value} contains " - f"mixed element types (both numeric and string values). " - f"All elements in a list must be the same type. Use " - f"either all numbers (e.g. [1, 2, 3]) or all strings " - f'(e.g. ["a", "b", "c"]).' 
+ ValidationMessage( + ValidationCategory.MIXED_TYPE, + f"Row {idx + 2}, Column '{col}': List cell {cell_value} contains " + f"mixed element types (both numeric and string values). " + f"All elements in a list must be the same type. Use " + f"either all numbers (e.g. [1, 2, 3]) or all strings " + f'(e.g. ["a", "b", "c"]).', + ) ) has_string = True string_cells.append((idx, cell_value)) @@ -397,9 +472,12 @@ def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], minority_type = "non-numeric" if minority is string_cells else "numeric" for idx, val in minority: cell_warnings.append( - f"Row {idx + 2}, Column '{col}': Cell '{val}' is {minority_type} " - f"in a column that contains both numeric and non-numeric values. " - f"This column will be treated as string type." + ValidationMessage( + ValidationCategory.TYPE_CLASSIFICATION, + f"Row {idx + 2}, Column '{col}': Cell '{val}' is {minority_type} " + f"in a column that contains both numeric and non-numeric values. " + f"This column will be treated as string type.", + ) ) elif has_numeric: numeric_cols.append(col) @@ -408,23 +486,24 @@ def _classify_column_type(self, df: pd.DataFrame) -> tuple[list[str], list[str], return numeric_cols, string_cols, mixed_type_columns, cell_errors, cell_warnings - def _generate_mixed_type_warning_clarification(self, mixed_columns: list[str]) -> str | None: + def _generate_mixed_type_warning_clarification(self, mixed_columns: list[str]) -> ValidationMessage | None: """ Generate clarification warning about mixed-type columns. This tells users that mixed-type columns are treated as string in Divbase. """ if not mixed_columns: return None - return ( + return ValidationMessage( + ValidationCategory.TYPE_CLASSIFICATION, "Clarification on mixed types columns: " "Columns are treated as string by DivBase if they contain a mix of numeric and non-numeric values " 'or list cells with mixed element types (for example [1, "two"]). 
' "A column is only numeric if all values (including each element in list cells) are valid numbers. " "Use Python list notation (e.g. [1, 2, 3]) for multi-value cells. " - "Numeric query operations (ranges, inequalities) will not be applicable to string columns." + "Numeric query operations (ranges, inequalities) will not be applicable to string columns.", ) - def _check_for_semicolons_in_plain_string_cells(self, df: pd.DataFrame) -> list[str]: + def _check_for_semicolons_in_plain_string_cells(self, df: pd.DataFrame) -> list[ValidationMessage]: """ Check for semicolons in plain string (single-value) cells. This is to reduce user confusion with the DivBase query filter syntax, which uses semicolons to separate key:value pairs. For example: divbase query tsv "Area:North;Population:1" @@ -433,7 +512,7 @@ def _check_for_semicolons_in_plain_string_cells(self, df: pd.DataFrame) -> list[ A TSV cell containing a semicolon (e.g. "2;4") will be treated as a plain string value and cannot be matched via the query syntax since the query parser will split on the semicolon. If the user intended multiple values, they should use Python list notation instead. """ - warnings = [] + warnings: list[ValidationMessage] = [] for col in df.columns: if col == "Sample_ID": @@ -447,29 +526,32 @@ def _check_for_semicolons_in_plain_string_cells(self, df: pd.DataFrame) -> list[ continue if ";" in cell_value: warnings.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a semicolon. " - "If you intended multiple values, use Python list notation instead " - '(e.g. [1, 2] or ["a", "b"]). ' - "If the semicolon is intentional, note that DivBase query syntax uses " - "semicolons to separate filter key:value pairs, so this exact cell value " - "cannot be matched via queries." + ValidationMessage( + ValidationCategory.FORMAT, + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a semicolon. " + "If you intended multiple values, use Python list notation instead " + '(e.g. 
[1, 2] or ["a", "b"]). ' + "If the semicolon is intentional, note that DivBase query syntax uses " + "semicolons to separate filter key:value pairs, so this exact cell value " + "cannot be matched via queries.", + ) ) break return warnings - def _check_for_commas_in_plain_string_cells(self, df: pd.DataFrame) -> list[str]: + def _check_for_commas_in_plain_string_cells(self, df: pd.DataFrame) -> list[ValidationMessage]: """ Check for commas in plain string cells (in single-value cells, not in multi-values list cells) and warn users about the ambiguity that might cause for DivBase filtering since the metadata query filter syntax uses commas to separate filter values. For example: divbase query tsv "Area:North,South" filters for rows where Area is "North" OR "South". A TSV cell containing the literal string "North,South" would not match that query, because the query parser splits on commas. - This helper method warns uses about + This helper method warns users about this. Commas inside list notation (e.g. ["North", "South"]) are fine since they are parsed as lists. """ - warnings = [] + warnings: list[ValidationMessage] = [] for col in df.columns: series = df[col] @@ -481,13 +563,16 @@ def _check_for_commas_in_plain_string_cells(self, df: pd.DataFrame) -> list[str] continue if "," in cell_value: warnings.append( - f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a comma. " - "If you intended multiple values, use Python list notation instead " - '(e.g. [1, 2] or ["a", "b"]). ' - "If the comma is intentional, note that DivBase query syntax uses " - "commas to separate filter values. To query for this exact string, " - "enclose the value in double quotes in your filter (e.g. " - 'Area:"North,South").' + ValidationMessage( + ValidationCategory.FORMAT, + f"Row {idx + 2}, Column '{col}': Cell '{cell_value}' contains a comma. " + "If you intended multiple values, use Python list notation instead " + '(e.g. [1, 2] or ["a", "b"]). 
' + "If the comma is intentional, note that DivBase query syntax uses " + "commas to separate filter values. To query for this exact string, " + "enclose the value in double quotes in your filter (e.g. " + 'Area:"North,South").', + ) ) break @@ -495,30 +580,36 @@ def _check_for_commas_in_plain_string_cells(self, df: pd.DataFrame) -> list[str] def _validate_dimensions_match( self, tsv_samples: set[str], project_samples: set[str] - ) -> tuple[list[str], list[str]]: + ) -> tuple[list[ValidationMessage], list[ValidationMessage]]: """ Validate that TSV samples match project dimensions. """ # TODO consider the fact that the query route also runs _check_that_dimensions_is_up_to_date_with_VCF_files_in_bucket in tasks.py before even reaching the SharedMetadataValidator... - errors = [] - warnings = [] + errors: list[ValidationMessage] = [] + warnings: list[ValidationMessage] = [] missing_from_project = tsv_samples - project_samples if missing_from_project: examples = sorted(list(missing_from_project)) errors.append( - f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. " - "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries." + ValidationMessage( + ValidationCategory.DIMENSIONS, + f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. " + "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries.", + ) ) missing_from_tsv = project_samples - tsv_samples if missing_from_tsv: examples = sorted(list(missing_from_tsv)) warnings.append( - f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " - "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file." 
+ ValidationMessage( + ValidationCategory.DIMENSIONS, + f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " + "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file.", + ) ) return errors, warnings From 4468f5e953965285a5b7c21f71cace2aefb12ba0 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 10:22:29 +0100 Subject: [PATCH 094/100] WIP: refactor query engine for updated validator Reuse calculations from the shared metadata validator: no need to recheck if a col is num, string, or mixed-type, just import the results. The shared validator does the parsing of multi-value lists and passes the dataframe to the query engine logic. --- .../src/divbase_api/services/queries.py | 253 +++++++----------- 1 file changed, 96 insertions(+), 157 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index f725908a..0e991883 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -29,7 +29,7 @@ SidecarNoDataLoadedError, SidecarSampleIDError, ) -from divbase_lib.metadata_validator import SharedMetadataValidator +from divbase_lib.metadata_validator import SharedMetadataValidator, ValidationCategory logger = logging.getLogger(__name__) @@ -700,6 +700,9 @@ def __init__(self, file: Path, project_samples: set[str] | None = None): self.metadata_validator = None self.query_result = None self.query_message: str = "" + self.numeric_columns: list[str] = [] + self.string_columns: list[str] = [] + self.mixed_type_columns: list[str] = [] self.warnings: list[str] = [] self.load_file() @@ -708,9 +711,11 @@ def load_file(self) -> "SidecarQueryManager": Method that loads the TSV file into a pandas DataFrame. 
Assumes that the first row is a header row, and that the file is tab-separated. Also removes any leading '#' characters from the column names. + Uses the warning and error category Enums from SharedMetadataValidator logic to raise errors or send warnings to the user. + Validates the same errors as the client-side MetadataTSVValidator using shared validation logic: - Header: first column must be #Sample_ID, no duplicate or empty column names - - Sample_ID: no empty values, no duplicates, no semicolons + - Sample_ID: no empty values, no duplicates, no multi-values (Python lists) - Data: no commas in any cell values """ try: @@ -726,53 +731,28 @@ def load_file(self) -> "SidecarQueryManager": result = self.metadata_validator.load_and_validate() if result.errors: - error_msg = result.errors[0] - # Note! The order of these errors matters. - if "Failed to read file" in error_msg: + # Note! The order of these errors matters. The first error in the list is the one that is raised, so more critical errors should be placed higher in the order than less critical errors. + first_encountered_error = result.errors[0] + if first_encountered_error.category == ValidationCategory.FILE_READ: raise SidecarNoDataLoadedError(file_path=self.file, submethod="load_file") - elif "First column must be named '#Sample_ID'" in error_msg or ( - "Sample_ID" in error_msg and "column is required" in error_msg - ): - raise SidecarColumnNotFoundError( - "The 'Sample_ID' column is required in the metadata file." - if "First column must be named '#Sample_ID'" in error_msg - else error_msg - ) - elif ("Row" in error_msg and "Sample_ID is empty" in error_msg) or ( - "Sample_ID" in error_msg - and ( - "contains semicolons" in error_msg - or "Duplicate Sample_IDs" in error_msg - or "contains empty or missing values" in error_msg - ) - ): - raise SidecarSampleIDError( - "Sample_ID column contains empty or missing values. All rows must have a valid Sample_ID." 
- if "Row" in error_msg and "Sample_ID is empty" in error_msg - else error_msg - ) - elif ( - "not found in the DivBase project's dimensions index" in error_msg - or "Duplicate column names" in error_msg - or "Empty column name" in error_msg - or ("Row" in error_msg and ("Expected" in error_msg or "tab-separated" in error_msg)) - or "column" in error_msg.lower() - ): - raise SidecarMetadataFormatError(error_msg) + elif first_encountered_error.category == ValidationCategory.SAMPLE_ID_COLUMN: + raise SidecarColumnNotFoundError(first_encountered_error.message) + elif first_encountered_error.category == ValidationCategory.SAMPLE_ID_VALUE: + raise SidecarSampleIDError(first_encountered_error.message) else: - raise SidecarMetadataFormatError(error_msg) + raise SidecarMetadataFormatError(first_encountered_error.message) - # Capture dimension-related warnings and array notation warnings from the validator. - # Array notation warnings are forwarded because they directly affect query behaviour (the column - # will be string instead of numeric, so numeric filter syntax will not work as expected). - # Other file-quality warnings (mixed types, commas, etc.) are shown in CLI validation only. if result.warnings: - dimension_warnings = [w for w in result.warnings if "dimensions index" in w or "project" in w] - array_notation_warnings = [w for w in result.warnings if "array notation" in w.lower()] - self.warnings.extend(dimension_warnings) - self.warnings.extend(array_notation_warnings) + self.warnings.extend( + w.message + for w in result.warnings + if w.category in (ValidationCategory.DIMENSIONS, ValidationCategory.FORMAT) + ) self.df = result.df + self.numeric_columns = result.numeric_columns + self.string_columns = result.string_columns + self.mixed_type_columns = result.mixed_type_columns except ( SidecarSampleIDError, @@ -824,8 +804,8 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": Filter string values in the query vs. 
cell values in the TSV: - Filter strings are handled per semicolon-separated key-value pair: in "key1:value1,value2;key2:value3,value4" "key1:value1,value2" is handled separately from "key2:value3,value4". - - Filter string values can be comma-separated, e.g. "value1,value2" in "key1:value1,value2" and each filter string value is handled separately. - - Cell values can be semicolon-separated, e.g. "25;30;35" in a TSV cell + - Filter string values can be comma-separated, e.g. "value1,value2" in "key1:value1,value2" and each filter string value is handled separately. + - Cells can have multi-values as long as Python list syntax is used in the TSV cell, e.g. [25, 30, 35]. - Matching of filter string to cell values uses OR logic: if ANY value in a cell matches ANY filter value, the row matches. E.g. "key2:value3,value4" means that TSV cells in the "key2" column that contain "value3" will match, but also cells that contain "value3;value4" or "value4;value3" will match. @@ -876,40 +856,28 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": self.warnings.append(warning_msg) continue - is_numeric = pd.api.types.is_numeric_dtype(self.df[key]) - is_semicolon_numeric = self._is_semicolon_separated_numeric_column(key) if not is_numeric else False + is_numeric = key in self.numeric_columns # Check if type consistency and return warnings to users if applicable. # If the column is treated as string, check for potential user mistakes (e.g. using numeric filter syntax on a string column that contains numeric-looking values): # 1. Warn if the column has mixed types (some values look numeric) and that the column will be treat as string type. # 2. Warn if the filter uses numeric syntax on this string column. Do not raise error.
- if not is_numeric and not is_semicolon_numeric: - is_mixed, example_values, total_count = self._is_mixed_type_column(key) + if not is_numeric: + is_mixed = key in self.mixed_type_columns problematic_filter_values = self._detect_numeric_filter_syntax_on_string_column( key, filter_string_values ) - # Build warning message for string columns with possible issues. Multiple warnings are presented with indended hyphen if is_mixed or problematic_filter_values: warning_lines = [f"Column '{key}':"] if is_mixed: - warning_lines.append( - " - Contains mixed types (e.g., numeric-looking values mixed with non-numeric values, or special characters like commas (,) or hyphens (-), or Range notation such as '1-2')." - ) - if total_count > 0: - examples_str = ", ".join(f"'{v}'" for v in example_values) - warning_lines.append( - f" Found {total_count} cell(s) with problematic values. Showing up to three of those values as an example: {examples_str}" - ) - - warning_lines.append(" This column will be treated as a string column.") - warning_lines.append( - " To store multiple numeric values, use semicolon-separated values (;) instead." - ) + warning_lines.append(" - Contains mixed types (both numeric and non-numeric values).") + warning_lines.append(" This column is treated as a string column.") if problematic_filter_values: warning_lines.append( - f" - Your filter contains comparison operators {problematic_filter_values}, which are not supported on string columns." + f" - Your filter contains comparison operators {problematic_filter_values}, " + "which are not supported on string columns." ) warning_lines.append( " DivBase comparison operators (>, <, >=, <=) only work on numeric columns." @@ -926,8 +894,8 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": # Supports filtering on semicolon-separated values in cells in the TSV: e.g. 
"25;30;35" # Also handles columns that pandas infers as strings but contain numeric values with semicolons (e.g., "1;2;3") # Also supports NOT operator with ! prefix: e.g., "Weight:!25" or "Weight:<4,!2" - if is_numeric or is_semicolon_numeric: - filter_string_values_list = filter_string_values.split(",") + if is_numeric: + filter_string_values_list = self._split_filter_values(filter_string_values) # Negated values are those that start with "!" in the filter string positive_values, negated_values = self._separate_positive_and_negated_values( @@ -989,7 +957,8 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": else: # Non-numeric column: handle as discrete string values # Supports NOT operator with ! prefix: e.g., "Area:!North" or "Area:North,!South" - filter_string_values_list = filter_string_values.split(",") + # Supports quoted values with commas: e.g., 'Area:"North,South"' matches the literal string + filter_string_values_list = self._split_filter_values(filter_string_values) positive_values, negated_values = self._separate_positive_and_negated_values( filter_values=filter_string_values_list @@ -1047,74 +1016,6 @@ def run_query(self, filter_string: str = None) -> "SidecarQueryManager": return self - def _is_semicolon_separated_numeric_column(self, key: str) -> bool: - """ - Helper method for the filtering logic to detect if a column contains semicolon-separated numeric values. - - Uses the shared validation logic to ensure consistency with the TSV validator. - """ - if key not in self.df.columns: - return False - - return self.metadata_validator.is_semicolon_separated_numeric_column(self.df[key]) - - def _is_mixed_type_column(self, key: str) -> tuple[bool, list[str], int]: - """ - Helper method for the filtering logic to detect if a non-numeric column has mixed types. - - A column is considered mixed-type if it contains: - 1. Both numeric-looking and non-numeric values (e.g., "8", "1a", "5a") - 2. 
Special characters that suggest non-numeric use (commas, hyphens in non-negative-number contexts) - - This is called only for columns where pandas infers object dtype AND - _is_semicolon_separated_numeric_column returned False. - - Returns a tuple of (is_mixed, example_values, total_count) where: - - is_mixed: True if the column should be treated as mixed-type (and thus string) - - example_values: A list of up to 3 example cell values that demonstrate the mixed types Limited to 3 for brevity. The CLI divbase-cli dimensions validate-metadata-file can be used to show all of them. - - total_count: Total number of cells with mixed types or special characters - """ - if key not in self.df.columns: - return False, [], 0 - - non_null_values = self.df[key].dropna() - if len(non_null_values) == 0: - return False, [], 0 - - has_numeric = False - has_non_numeric = False - example_values = [] - total_problematic_count = 0 - - for cell_value in non_null_values: - cell_str = str(cell_value).strip() - if not cell_str: - continue - - cell_has_numeric = False - cell_has_non_numeric = False - - parts = cell_str.split(";") - for part in parts: - part = part.strip() - if not part: - continue - - try: - float(part) - cell_has_numeric = True - has_numeric = True - except ValueError: - cell_has_non_numeric = True - has_non_numeric = True - - if (cell_has_numeric and cell_has_non_numeric) or ("," in cell_str or "-" in cell_str): - total_problematic_count += 1 - if cell_str not in example_values and len(example_values) < 3: - example_values.append(cell_str) - - return (has_numeric and has_non_numeric), example_values, total_problematic_count - def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string_values: str) -> list[str]: """ Helper method for the filtering logic to detect when a user's filter string contains inequality operators @@ -1130,7 +1031,7 @@ def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string Returns a list of the 
problematic filter values for use in a warning messages. """ problematic_filter_values = [] - values = filter_string_values.split(",") + values = self._split_filter_values(filter_string_values) for filter_value in values: filter_value = filter_value.strip().lstrip("!") # strip negation prefix for checking if not filter_value: @@ -1140,15 +1041,55 @@ def _detect_numeric_filter_syntax_on_string_column(self, key: str, filter_string problematic_filter_values.append(filter_value) return problematic_filter_values - def _split_cell_values(self, cell_value: Any) -> list[str]: + def _split_filter_values(self, filter_values_str: str) -> list[str]: """ - Helper method for the filtering logic to split cell value by semicolon and return list of non-empty values. - If the cell contains a single value without semicolon, it will return a list with that single value. - If the cell is empty or NaN, it will return an empty list. + Split comma-separated filter-value strings. + + Designed to handle cases with filter values that contain commas or other special characters by allowing users to wrap such values in double quotes. + For example, if a TSV cell contains the literal string: `North, South`, the CLI filter must be wrapped in double quotes so the comma is not + treated as a value separator: + + divbase-cli query tsv 'Area:"North, South"' + + Additionally, the filter string itself must be wrapped in single quotes to prevent the shell from interpreting the inner double quotes. + Only double-quote quoting is supported inside filter values; single quotes inside the filter string are treated as literal characters + (i.e. ``"Area:'North, South'"`` is not supported.). + + The ``!`` NOT operator is preserved as part of the string and is handled later by ``_separate_positive_and_negated_values``. 
+ + Examples: + "North,South" -> ["North", "South"] + '"North,South",East' -> ["North,South", "East"] + '!"North,South"' -> ["!North,South"] """ + values = [] + current = [] + in_quotes = False + + # Iterate through the filter values string character by character to handle double-quoted strings correctly. + for char in filter_values_str: + if char == '"': + in_quotes = not in_quotes + elif char == "," and not in_quotes: + values.append("".join(current).strip()) + current = [] + else: + current.append(char) + + values.append("".join(current).strip()) + return [v for v in values if v] + + def _get_cell_values(self, cell_value: Any) -> list: + """Return cell value as a list of values for filtering. + + Checks for list type before pd.isna() because pd.isna() raises + ValueError on list/array inputs. + """ + if isinstance(cell_value, list): + return cell_value if pd.isna(cell_value): return [] - return [val.strip() for val in str(cell_value).split(";") if val.strip()] + return [cell_value] def _parse_numeric_value(self, value_str: str) -> float | int: """Helper method for the filtering logic to parse a string value to int or float. 
To be used when other checks have already confirmed that the value can be parsed as numeric.""" @@ -1162,12 +1103,12 @@ def _create_inequality_condition(self, key: str, operator: str, threshold: float """ def check_inequality(cell_value): - if pd.isna(cell_value): + cell_values = self._get_cell_values(cell_value) + if not cell_values: return False - cell_values = self._split_cell_values(cell_value) - for val_str in cell_values: + for val in cell_values: try: - val_num = self._parse_numeric_value(val_str) + val_num = float(val) if ( (operator == ">" and val_num > threshold) or (operator == ">=" and val_num >= threshold) @@ -1189,12 +1130,12 @@ def _create_range_condition(self, key: str, min_val: float, max_val: float) -> p """ def check_range(cell_value): - if pd.isna(cell_value): + cell_values = self._get_cell_values(cell_value) + if not cell_values: return False - cell_values = self._split_cell_values(cell_value) - for val_str in cell_values: + for val in cell_values: try: - val_num = self._parse_numeric_value(val_str) + val_num = float(val) if min_val <= val_num <= max_val: return True except ValueError: @@ -1211,12 +1152,12 @@ def _create_discrete_numeric_condition(self, key: str, target_values: list[float """ def check_discrete(cell_value): - if pd.isna(cell_value): + cell_values = self._get_cell_values(cell_value) + if not cell_values: return False - cell_values = self._split_cell_values(cell_value) - for val_str in cell_values: + for val in cell_values: try: - val_num = self._parse_numeric_value(val_str) + val_num = float(val) if val_num in target_values: return True except ValueError: @@ -1233,10 +1174,8 @@ def _create_string_condition(self, key: str, target_values: list[str]) -> pd.Ser """ def check_string(cell_value): - if pd.isna(cell_value): - return False - cell_values = self._split_cell_values(cell_value) - return any(val in target_values for val in cell_values) + cell_values = self._get_cell_values(cell_value) + return any(str(val) in target_values 
for val in cell_values) return self.df[key].apply(check_string) From f2e19d89224a37a5a320b0b36e06bcbcbe3b1b79 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 11:51:30 +0100 Subject: [PATCH 095/100] Update tests to match refactoring Also collect all shared unit test fixtures together, now that all tests pass again. --- tests/unit/conftest.py | 207 +++++++++-- .../test_sample_metadata_queries.py | 346 ++++++++++++------ .../test_sample_metadata_tsv_validator.py | 197 ++-------- .../test_tsv_to_dataframe_to_tsv.py | 2 +- 4 files changed, 437 insertions(+), 315 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 6585f64e..49054e13 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -11,12 +11,17 @@ @pytest.fixture def valid_tsv(tmp_path): - """Simple valid TSV that passes all validation checks.""" + """Simple valid TSV that passes all validation checks. + + Uses Python list notation for multi-value cells. Only numeric lists are + used here to keep the fixture compatible with identity roundtrip tests + (string lists use single quotes in repr, which breaks byte-identical roundtrips). 
+ """ tsv_content = """#Sample_ID\tPopulation\tArea\tWeight S1\t1\tNorth\t12.5 -S2\t2;4\tEast\t18.8 -S3\t3\tWest;South\t15.0 -S4\t3;5\tSouth\t20.0 +S2\t[2, 4]\tEast\t18.8 +S3\t3\tWest\t15.0 +S4\t[3, 5]\tSouth\t20.0 S5\t4\tNorth\t22.1 """ tsv_file = tmp_path / "valid.tsv" @@ -26,7 +31,7 @@ def valid_tsv(tmp_path): @pytest.fixture def no_multi_values_tsv(tmp_path): - """TSV with no semicolon-separated values in any cell.""" + """TSV with no list cells — only scalar values.""" tsv_content = """#Sample_ID\tPopulation\nS1\t1\nS2\t2\n""" tsv_file = tmp_path / "no_multi_values.tsv" tsv_file.write_text(tsv_content) @@ -35,13 +40,13 @@ def no_multi_values_tsv(tmp_path): @pytest.fixture def numeric_multi_values_tsv(tmp_path): - """TSV with multi-value numeric cells and negative numbers.""" + """TSV with multi-value numeric cells (Python list notation) and negative numbers.""" tsv_content = """#Sample_ID\tScores\tValues\tTemperature\tLongitude\tLatitude\tElevation -S1\t1;2;3\t10;20\t-5.5\t-2.78305556\t51.5\t100 -S2\t4;5\t30;40;50\t-10.2\t-0.12765\t52.2\t-50 +S1\t[1, 2, 3]\t[10, 20]\t-5.5\t-2.78305556\t51.5\t100 +S2\t[4, 5]\t[30, 40, 50]\t-10.2\t-0.12765\t52.2\t-50 S3\t6\t60\t0\t1.25\t50.8\t-100.5 -S4\t7;8;9;10\t70\t15.5\t-3.5;-2.1\t49.5\t200 -S5\t11\t80;90\t-20\t0\t48.2\t-25 +S4\t[7, 8, 9, 10]\t70\t15.5\t[-3.5, -2.1]\t49.5\t200 +S5\t11\t[80, 90]\t-20\t0\t48.2\t-25 """ tsv_file = tmp_path / "numeric_multi_values.tsv" tsv_file.write_text(tsv_content) @@ -50,15 +55,15 @@ def numeric_multi_values_tsv(tmp_path): @pytest.fixture def sample_tsv_with_numeric_data(tmp_path): - """Comprehensive TSV with numeric, string, semicolon, float, and negative columns.""" + """Comprehensive TSV with numeric, string, list-notation, float, and negative columns.""" tsv_content = """#Sample_ID\tPopulation\tWeight\tAge\tArea\tSingleNumber\tSingleString\tTemperature\tLongitude\tLatitude\tElevation S1\t1\t20.2\t5.0\tNorth\t100\tString\t-5.5\t-2.78305556\t51.5\t100 
-S2\t2;4\t25.0\t10\tEast\t200\tStrings\t-10.2\t-0.12765\t52.2\t-50 -S3\t3\t30.8\t15\tWest;South;East\t300\tSting\t0\t1.25\t50.8\t-100.5 -S4\t4\t35.1\t20\tWest\t400\tStings\t15.5\t-3.5;-2.1\t49.5\t200 +S2\t[2, 4]\t25.0\t10\tEast\t200\tStrings\t-10.2\t-0.12765\t52.2\t-50 +S3\t3\t30.8\t15\t["West", "South", "East"]\t300\tSting\t0\t1.25\t50.8\t-100.5 +S4\t4\t35.1\t20\tWest\t400\tStings\t15.5\t[-3.5, -2.1]\t49.5\t200 S5\t5\t40.0\t25\tNorth\t500\tThing\t-20\t0\t48.2\t-25 S6\t6\t45.4\t30\tEast\t600\tThings\t10\t2.5\t53.1\t150 -S7\t1;3;5\t50.9\t35\tSouth\t700\tStrong\t5\t-1.5\t52.8\t50 +S7\t[1, 3, 5]\t50.9\t35\tSouth\t700\tStrong\t5\t-1.5\t52.8\t50 S8\t2\t55.2\t40\tWest\t800\tStrung\t20\t3.0\t51.0\t75 S9\t7\t62.6\t45\tNorth\t900\tStang\t-15\t-2.0\t54.5\t-10 S10\t8\t70.7\t52\tEast\t1000\tSong\t25\t1.5\t50.5\t200 @@ -84,15 +89,16 @@ def sample_tsv_with_mixed_type_column(tmp_path): @pytest.fixture -def sample_tsv_with_semicolon_mixed_type_column(tmp_path): - """TSV with a semicolon-separated column where one part is non-numeric ('1;1-2').""" - tsv_content = """#Sample_ID\tCode\tPureNumericSemicolon\tWeight -S1\t1;1-2\t10;20;30\t12.5 +def sample_tsv_with_list_mixed_type_column(tmp_path): + """TSV where Code column has a non-numeric scalar ('1-2') mixed with numeric scalars, + making it a mixed-type column. PureNumericList has all-numeric list cells.""" + tsv_content = """#Sample_ID\tCode\tPureNumericList\tWeight +S1\t1-2\t[10, 20, 30]\t12.5 S2\t3\t40\t18.8 -S3\t5\t50;60\t15.0 -S4\t7\t70;80;90\t20.0 +S3\t5\t[50, 60]\t15.0 +S4\t7\t[70, 80, 90]\t20.0 """ - tsv_file = tmp_path / "test_semicolon_mixed.tsv" + tsv_file = tmp_path / "test_list_mixed.tsv" tsv_file.write_text(tsv_content) return tsv_file @@ -101,7 +107,8 @@ def sample_tsv_with_semicolon_mixed_type_column(tmp_path): def type_errors_tsv(tmp_path): """TSV with mixed-type columns and range/hyphen notation. - Population: mixed int + string + range + Population: mixed int + string + range. 
S3 has a semicolon in a plain string + cell (not list notation), which produces a semicolon warning. Test: mixed int + 'all' Code: pure string with number prefix Range: mixed int + hyphen-range notation @@ -119,7 +126,7 @@ def type_errors_tsv(tmp_path): @pytest.fixture def array_notation_tsv(tmp_path): - """TSV where one column uses '[1, 2, 3]' array notation instead of semicolons.""" + """TSV with Python list notation in one numeric column.""" content = "#Sample_ID\tPopulation\tArea\n" content += "S1\t[1, 2, 3]\tNorth\n" content += "S2\t4\tEast\n" @@ -131,7 +138,7 @@ def array_notation_tsv(tmp_path): @pytest.fixture def array_notation_multiple_cols_tsv(tmp_path): - """TSV where multiple columns use '[...]' array notation.""" + """TSV with Python list notation in multiple numeric columns.""" content = "#Sample_ID\tPopulation\tScores\n" content += "S1\t[1, 2]\t[10, 20, 30]\n" content += "S2\t3\t[40]\n" @@ -224,16 +231,19 @@ def sample_tsv_with_duplicate_sample_ids(tmp_path): @pytest.fixture def sample_tsv_with_edge_cases(tmp_path): - """TSV with edge cases: unicode, hyphens in strings, and whitespace in Sample_IDs. + """TSV with edge cases: unicode, hyphens in strings, list notation, and whitespace in Sample_IDs. NOTE: S2 and S3 have leading/trailing whitespace in Sample_ID — the validator strips these, so the exported file will DIFFER from the original. Do NOT use this fixture for identity roundtrip tests. + + MixedTypes uses scalar values (not lists) to create a mixed-type column without + triggering the "mixed element types within a list" hard error. 
""" tsv_content = """#Sample_ID\tPureStrings\tMixedTypes\tSingleString\tSingleNumber\tUnicodeStrings\tStringWithHyphen\tNumericalWithHyphen -S1\tNorth;South;East\t1;two;5\tWest\t100\tStockholm;Göteborg\tNorth-East\t1-2 -S2 \tWest;East;North\t2;three;6\tNorth\t200\tMalmö;Uppsala\tSouth-West\t2-3 - S3\tSouth\t3\tEast\t300\tKöpenhamn;København\tNorth-North-West\t3-4 +S1\t["North", "South", "East"]\t1\tWest\t100\t["Stockholm", "Göteborg"]\tNorth-East\t1-2 +S2 \t["West", "East", "North"]\ttwo\tNorth\t200\t["Malmö", "Uppsala"]\tSouth-West\t2-3 + S3\tSouth\t3\tEast\t300\t["Köpenhamn", "København"]\tNorth-North-West\t3-4 S4\t1string\tstring4\tString5\t400\tHumlebæk\tEast-South-East\t4-5 """ tsv_file = tmp_path / "test_metadata_edge_cases.tsv" @@ -245,3 +255,142 @@ def sample_tsv_with_edge_cases(tmp_path): def negative_numeric_columns(): """Column names in numeric_multi_values_tsv that contain negative numbers.""" return ["Temperature", "Longitude", "Latitude", "Elevation"] + + +@pytest.fixture +def project_samples(): + """Standard set of project samples for testing (S1–S5).""" + return {"S1", "S2", "S3", "S4", "S5"} + + +@pytest.fixture +def valid_list_tsv(tmp_path): + """Valid TSV using Python list notation for multi-value cells, including string lists. + + Unlike ``valid_tsv`` (which avoids string lists for roundtrip safety), this + fixture exercises both numeric lists ([2, 4]) and string lists (["West", "South"]). 
+ """ + content = "#Sample_ID\tPopulation\tArea\tWeight\n" + content += "S1\t1\tNorth\t12.5\n" + content += "S2\t[2, 4]\tEast\t18.8\n" + content += 'S3\t3\t["West", "South"]\t15.0\n' + content += "S4\t[3, 5]\tSouth\t20.0\n" + content += "S5\t4\tNorth\t22.1\n" + p = tmp_path / "valid_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def numeric_list_tsv(tmp_path): + """TSV with numeric list cells and negative numbers (fewer columns than numeric_multi_values_tsv).""" + content = "#Sample_ID\tScores\tValues\tTemperature\tLongitude\n" + content += "S1\t[1, 2, 3]\t[10, 20]\t-5.5\t-2.78\n" + content += "S2\t[4, 5]\t[30, 40, 50]\t-10.2\t-0.13\n" + content += "S3\t6\t60\t0\t1.25\n" + content += "S4\t[7, 8, 9, 10]\t70\t15.5\t[-3.5, -2.1]\n" + content += "S5\t11\t[80, 90]\t-20\t0\n" + p = tmp_path / "numeric_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def sample_id_errors_tsv(tmp_path): + """TSV with Sample_ID errors: empty, list values, duplicates.""" + content = "#Sample_ID\tPopulation\n" + content += "S1\t1\n" + content += "\t2\n" + content += '["S3", "S4"]\t3\n' + content += "S1\t4\n" + p = tmp_path / "sample_id_errors.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def invalid_list_syntax_tsv(tmp_path): + """TSV with cells that look like lists but have invalid syntax.""" + content = "#Sample_ID\tScores\tValues\n" + content += "S1\t[1, 2, 3]\tgood\n" + content += "S2\t[4\tbad_unclosed\n" + content += "S3\t[1 2 3]\tbad_no_commas\n" + content += "S4\t5\tnormal\n" + content += "S5\t[6, 7]\tok\n" + p = tmp_path / "invalid_list_syntax.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def mixed_type_list_cell_tsv(tmp_path): + """TSV with a list cell containing mixed element types (error).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += 'S2\t[1, "two", 3]\tEast\n' + content += "S3\t5\tSouth\n" + p = tmp_path / "mixed_type_list.tsv" + p.write_text(content) + return p + + 
+@pytest.fixture +def mixed_type_across_cells_tsv(tmp_path): + """TSV with mixed types across cells in a column (some numeric, some string).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += "S2\tabc\tEast\n" + content += "S3\t3\tSouth\n" + content += "S4\t4\tWest\n" + content += "S5\tdef\tNorth\n" + p = tmp_path / "mixed_across_cells.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def semicolons_in_cells_tsv(tmp_path): + """TSV with semicolons in cell values (should produce warnings).""" + content = "#Sample_ID\tPopulation\tArea\n" + content += "S1\t1\tNorth\n" + content += "S2\t2;4\tEast\n" + content += "S3\t3\tWest;South\n" + p = tmp_path / "semicolons.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def whitespace_variant_lists_tsv(tmp_path): + """TSV with different whitespace styles in list notation.""" + content = "#Sample_ID\tScores\n" + content += "S1\t[1, 2, 3]\n" + content += "S2\t[4,5,6]\n" + content += "S3\t[ 7 , 8 , 9 ]\n" + p = tmp_path / "whitespace_lists.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def string_list_tsv(tmp_path): + """TSV with string list cells.""" + content = "#Sample_ID\tAreas\tScore\n" + content += 'S1\t["North", "South"]\t10\n' + content += 'S2\t["East"]\t20\n' + content += "S3\tWest\t30\n" + p = tmp_path / "string_list.tsv" + p.write_text(content) + return p + + +@pytest.fixture +def empty_list_tsv(tmp_path): + """TSV with an empty list cell.""" + content = "#Sample_ID\tScores\n" + content += "S1\t[1, 2]\n" + content += "S2\t[]\n" + content += "S3\t3\n" + p = tmp_path / "empty_list.tsv" + p.write_text(content) + return p diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index 482f36f3..f9a2a914 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -1,16 +1,27 @@ """ -Unit tests for 
SidecarQueryManager filtering +Unit tests for SidecarQueryManager filtering. + +Fixtures use Python list notation for multi-value cells (e.g. [2, 4] instead +of the old semicolon-separated format 2;4). Shared fixtures live in +tests/unit/conftest.py. """ import pandas as pd import pytest +from divbase_api.services import queries as queries_module from divbase_api.services.queries import SidecarQueryManager from divbase_lib.exceptions import ( SidecarColumnNotFoundError, SidecarMetadataFormatError, + SidecarNoDataLoadedError, SidecarSampleIDError, ) +from divbase_lib.metadata_validator import ( + MetadataValidationResult, + ValidationCategory, + ValidationMessage, +) class TestNumericalFilteringInequalities: @@ -63,7 +74,7 @@ def test_less_than_or_equal(self, sample_tsv_with_numeric_data): assert "S3" in sample_ids def test_inequality_on_weight_column(self, sample_tsv_with_numeric_data): - """Test inequality on Weight column (no semicolons, pure numeric).""" + """Test inequality on Weight column (pure numeric scalars).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Weight:>60") @@ -73,7 +84,7 @@ def test_inequality_on_weight_column(self, sample_tsv_with_numeric_data): assert "S10" in sample_ids def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): - """Test inequality on Age column (no semicolons, pure numeric).""" + """Test inequality on Age column (pure numeric scalars).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Age:>=40") @@ -84,7 +95,7 @@ def test_inequality_on_age_column(self, sample_tsv_with_numeric_data): assert "S10" in sample_ids def test_inequality_on_single_numeric_value_column(self, sample_tsv_with_numeric_data): - """Test inequality on single-value numeric column (that does not have semicolon separated values).""" + """Test inequality on single-value numeric column (no list cells).""" manager = 
SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="SingleNumber:>600") @@ -152,7 +163,7 @@ def test_range_boundaries_inclusive(self, sample_tsv_with_numeric_data): assert "S6" in sample_ids def test_range_on_weight_column(self, sample_tsv_with_numeric_data): - """Test range filtering on Weight column (no semicolons, pure numeric).""" + """Test range filtering on Weight column (pure numeric scalars).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Weight:40-60") @@ -228,8 +239,8 @@ def test_multiple_discrete_values(self, sample_tsv_with_numeric_data): assert "S3" in sample_ids assert "S7" in sample_ids - def test_discrete_values_with_semicolon_separated_cells(self, sample_tsv_with_numeric_data): - """Test discrete value filtering on Population column (string column with semicolons).""" + def test_discrete_values_on_list_column(self, sample_tsv_with_numeric_data): + """Test discrete value filtering on Population column (has list cells like [2, 4]).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:1,3,5") @@ -240,8 +251,8 @@ def test_discrete_values_with_semicolon_separated_cells(self, sample_tsv_with_nu assert "S5" in sample_ids assert "S7" in sample_ids - def test_discrete_values_match_any_semicolon_value(self, sample_tsv_with_numeric_data): - """Test that discrete filtering matches if ANY semicolon value matches (string matching on Population).""" + def test_discrete_values_match_any_list_element(self, sample_tsv_with_numeric_data): + """Test that discrete filtering matches if ANY list element matches.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:4") @@ -300,11 +311,11 @@ def test_not_operator_multiple_negations(self, sample_tsv_with_numeric_data): assert "S8" not in sample_ids -class 
TestSemicolonSeparatedNumericFiltering: - """Test that inequalities and ranges work on columns with semicolon-separated numeric values.""" +class TestListValueNumericFiltering: + """Test that inequalities and ranges work on columns with Python list multi-value cells.""" - def test_inequality_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): - """Test that > operator works on Population column (semicolon-separated numbers).""" + def test_inequality_on_list_column(self, sample_tsv_with_numeric_data): + """Test that > operator works on Population column (has list cells like [2, 4]).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:>4") @@ -316,7 +327,7 @@ def test_inequality_on_semicolon_separated_column(self, sample_tsv_with_numeric_ assert "S9" in sample_ids assert "S10" in sample_ids - def test_inequality_less_than_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): + def test_inequality_less_than_on_list_column(self, sample_tsv_with_numeric_data): """Test that < operator works on Population column.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:<3") @@ -328,7 +339,7 @@ def test_inequality_less_than_on_semicolon_separated_column(self, sample_tsv_wit assert "S7" in sample_ids assert "S8" in sample_ids - def test_range_on_semicolon_separated_column(self, sample_tsv_with_numeric_data): + def test_range_on_list_column(self, sample_tsv_with_numeric_data): """Test that range filtering works on Population column.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:3-6") @@ -342,7 +353,7 @@ def test_range_on_semicolon_separated_column(self, sample_tsv_with_numeric_data) assert "S6" in sample_ids assert "S7" in sample_ids - def test_combined_inequality_and_discrete_on_semicolon_separated(self, sample_tsv_with_numeric_data): + def 
test_combined_inequality_and_discrete_on_list_column(self, sample_tsv_with_numeric_data): """Test combining inequality and discrete values on Population column.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:>6,2") @@ -354,8 +365,8 @@ def test_combined_inequality_and_discrete_on_semicolon_separated(self, sample_ts assert "S9" in sample_ids assert "S10" in sample_ids - def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric_data): - """Test that range boundaries work correctly with semicolon-separated values.""" + def test_range_with_list_values_at_boundaries(self, sample_tsv_with_numeric_data): + """Test that range boundaries work correctly with list cells.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:1-3") @@ -367,8 +378,8 @@ def test_range_with_semicolon_values_at_boundaries(self, sample_tsv_with_numeric assert "S7" in sample_ids assert "S8" in sample_ids - def test_not_operator_with_semicolon_separated(self, sample_tsv_with_numeric_data): - """Test NOT operator (!) with semicolon-separated values: Population:>3,!5 should exclude 5.""" + def test_not_operator_with_list_values(self, sample_tsv_with_numeric_data): + """Test NOT operator (!) 
with list-value column: Population:>3,!5 should exclude 5.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Population:>3,!5") @@ -383,10 +394,10 @@ def test_not_operator_with_semicolon_separated(self, sample_tsv_with_numeric_dat class TestStringColumnFiltering: - """Test string column filtering with single and semicolon-separated values.""" + """Test string column filtering with single and list multi-value cells.""" def test_single_string_value_column(self, sample_tsv_with_numeric_data): - """Test filtering on a string column with single values (no semicolons).""" + """Test filtering on a string column with single values.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="SingleString:String") @@ -404,8 +415,8 @@ def test_single_string_value_column_multiple_filters(self, sample_tsv_with_numer assert "S1" in sample_ids assert "S2" in sample_ids - def test_semicolon_separated_string_column(self, sample_tsv_with_numeric_data): - """Test filtering on string column with semicolon-separated values (Area column).""" + def test_list_string_column(self, sample_tsv_with_numeric_data): + """Test filtering on string column with list cells (e.g. 
["West", "South", "East"]).""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Area:West") @@ -415,8 +426,8 @@ def test_semicolon_separated_string_column(self, sample_tsv_with_numeric_data): assert "S8" in sample_ids assert "S3" in sample_ids - def test_semicolon_separated_string_column_any_match(self, sample_tsv_with_numeric_data): - """Test that filtering matches if ANY semicolon-separated value matches.""" + def test_list_string_column_any_match(self, sample_tsv_with_numeric_data): + """Test that filtering matches if ANY list element matches.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Area:South") @@ -462,9 +473,9 @@ class TestEdgeCases: """Edge case tests for SidecarQueryManager filtering.""" def test_mixed_types_treated_as_string(self, sample_tsv_with_edge_cases): - """Test that a column with mixed numeric and non-numeric values is treated as a string column. - The MixedTypes column has values like '1;two;5', '2;three;6', '3', 'string4'. - When treated as string, filtering for '1' should match cells containing '1' as a semicolon-separated value.""" + """Test that a column with mixed numeric and non-numeric scalar values is treated as a string column. + The MixedTypes column has values like 1, 'two', 3, 'string4'. 
+ When treated as string, filtering for '1' should match cells containing '1'.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) result = manager.run_query(filter_string="MixedTypes:1") sample_ids = result.get_unique_values("Sample_ID") @@ -478,7 +489,7 @@ def test_string_with_numbers_in_value(self, sample_tsv_with_edge_cases): assert "S4" in sample_ids def test_unicode_string_filtering(self, sample_tsv_with_edge_cases): - """Test that filtering for Unicode values like 'Göteborg' and 'Malmö' works and returns correct samples.""" + """Test that filtering for Unicode values like 'Göteborg' works and returns correct samples.""" manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) result = manager.run_query(filter_string="UnicodeStrings:Göteborg") sample_ids = result.get_unique_values("Sample_ID") @@ -563,6 +574,14 @@ def test_not_operator_only_negations(self, sample_tsv_with_edge_cases): assert "S1" not in sample_ids assert "S2" not in sample_ids + def test_list_string_filtering_matches_element(self, sample_tsv_with_edge_cases): + """Test that filtering on a string column with list cells matches individual list elements.""" + manager = SidecarQueryManager(file=sample_tsv_with_edge_cases) + result = manager.run_query(filter_string="PureStrings:South") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S3" in sample_ids + class TestSampleIDValidation: """Test Sample_ID validation during file loading.""" @@ -571,7 +590,7 @@ def test_empty_sample_id_raises_error(self, sample_tsv_with_invalid_sample_ids): """Test that empty Sample_ID values raise SidecarSampleIDError directly during file load.""" with pytest.raises(SidecarSampleIDError) as excinfo: SidecarQueryManager(file=sample_tsv_with_invalid_sample_ids) - assert "Sample_ID column contains empty or missing values" in str(excinfo.value) + assert "sample_id" in str(excinfo.value).lower() and "empty" in str(excinfo.value).lower() def 
test_duplicate_sample_id_raises_error(self, sample_tsv_with_duplicate_sample_ids): """Test that duplicate Sample_ID values raise SidecarSampleIDError directly during file load.""" @@ -584,7 +603,7 @@ def test_missing_sample_id_column_raises_error(self, sample_tsv_missing_sample_i """Test that missing Sample_ID column raises SidecarColumnNotFoundError during file load.""" with pytest.raises(SidecarColumnNotFoundError) as excinfo: SidecarQueryManager(file=sample_tsv_missing_sample_id_column) - assert "The 'Sample_ID' column is required in the metadata file." in str(excinfo.value) + assert "Sample_ID" in str(excinfo.value) class TestNegativeNumbers: @@ -624,8 +643,8 @@ def test_negative_numbers_greater_than_inequality(self, sample_tsv_with_numeric_ assert "S5" in sample_ids assert "S9" in sample_ids - def test_negative_numbers_in_semicolon_cells(self, sample_tsv_with_numeric_data): - """Test that negative numbers in semicolon-separated cells work correctly.""" + def test_negative_numbers_in_list_cells(self, sample_tsv_with_numeric_data): + """Test that negative numbers in list cells (e.g. 
[-3.5, -2.1]) work correctly.""" manager = SidecarQueryManager(file=sample_tsv_with_numeric_data) result = manager.run_query(filter_string="Longitude:-3.5") @@ -704,13 +723,6 @@ def test_numeric_column_no_false_warning(self, sample_tsv_with_mixed_type_column assert not any("string column" in w.lower() for w in result.warnings) assert not any("comparison operators" in w.lower() for w in result.warnings) - def test_warning_mentions_semicolon_rule(self, sample_tsv_with_mixed_type_column): - """Test that query warnings explain the semicolon classification rule.""" - manager = SidecarQueryManager(file=sample_tsv_with_mixed_type_column) - result = manager.run_query(filter_string="Population_code:>5") - - assert any("semicolon-separated" in w for w in result.warnings) - @pytest.mark.parametrize( "column,filter_string,expected_warning,expected_sample_ids", [ @@ -740,60 +752,50 @@ def test_comparison_operator_parametrized( assert len(sample_ids) == 0 -class TestSemicolonColumnTypeClassification: - """Test that column type classification correctly handles semicolon-separated values. - A column is numeric only if all parts of all semicolon-separated cells are valid numbers. - If any part is non-numeric (e.g., '1-2' in '1;1-2'), the entire column is string.""" +class TestListColumnTypeClassification: + """Test that column type classification correctly handles list and mixed-type cells. + A column is numeric only if all cells (scalars and list elements) are valid numbers. 
+ If any cell is non-numeric, the column is treated as string/mixed-type.""" - def test_semicolon_cell_with_non_numeric_part_makes_column_string( - self, sample_tsv_with_semicolon_mixed_type_column - ): - """Test that a column with a cell '1;1-2' should be treated as string because '1-2' is not a number.""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + def test_non_numeric_scalar_makes_column_mixed(self, sample_tsv_with_list_mixed_type_column): + """Test that a column with a non-numeric scalar ('1-2') among numeric values is mixed-type.""" + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) - assert not pd.api.types.is_numeric_dtype(manager.df["Code"]) - assert not manager._is_semicolon_separated_numeric_column("Code") - assert manager._is_mixed_type_column("Code") + assert "Code" in manager.mixed_type_columns - def test_semicolon_cell_with_non_numeric_part_warns_on_inequality( - self, sample_tsv_with_semicolon_mixed_type_column - ): - """Test that inequality filter on a column broken by '1;1-2' should produce a warning.""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + def test_mixed_column_warns_on_inequality(self, sample_tsv_with_list_mixed_type_column): + """Test that inequality filter on a mixed-type column should produce a warning.""" + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) result = manager.run_query(filter_string="Code:>2") assert any("mixed types" in w.lower() and "Code" in w for w in result.warnings) assert any("comparison operators" in w.lower() for w in result.warnings) - def test_semicolon_cell_with_non_numeric_part_string_matching_works( - self, sample_tsv_with_semicolon_mixed_type_column - ): - """Test that string matching should still work on the mixed column. 
Filtering for '1-2' should matches cell value '1;1-2'.""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + def test_mixed_column_string_matching_works(self, sample_tsv_with_list_mixed_type_column): + """Test that string matching works on a mixed-type column. Filtering for '1-2' should match S1.""" + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) result = manager.run_query(filter_string="Code:1-2") sample_ids = result.get_unique_values("Sample_ID") assert "S1" in sample_ids assert len(sample_ids) == 1 - def test_semicolon_cell_with_non_numeric_part_single_numeric_match( - self, sample_tsv_with_semicolon_mixed_type_column - ): + def test_mixed_column_single_numeric_match(self, sample_tsv_with_list_mixed_type_column): """Test that string matching for '3' on the mixed column should return S2 (exact string match).""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) result = manager.run_query(filter_string="Code:3") sample_ids = result.get_unique_values("Sample_ID") assert "S2" in sample_ids assert len(sample_ids) == 1 - def test_purely_numeric_semicolon_column_supports_numeric_ops(self, sample_tsv_with_semicolon_mixed_type_column): - """Test that a column with only numeric semicolon values (e.g., '10;20;30') should support numeric operations.""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) + def test_purely_numeric_list_column_supports_numeric_ops(self, sample_tsv_with_list_mixed_type_column): + """Test that a column with numeric list cells (e.g. 
[10, 20, 30]) supports numeric operations.""" + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) - assert manager._is_semicolon_separated_numeric_column("PureNumericSemicolon") + assert "PureNumericList" in manager.numeric_columns - result = manager.run_query(filter_string="PureNumericSemicolon:>55") + result = manager.run_query(filter_string="PureNumericList:>55") sample_ids = result.get_unique_values("Sample_ID") assert "S3" in sample_ids assert "S4" in sample_ids @@ -801,10 +803,10 @@ def test_purely_numeric_semicolon_column_supports_numeric_ops(self, sample_tsv_w assert "S2" not in sample_ids assert not any("string column" in w.lower() for w in result.warnings) - def test_purely_numeric_semicolon_column_range_filter(self, sample_tsv_with_semicolon_mixed_type_column): - """Test that a purely numeric semicolon column should support range operations.""" - manager = SidecarQueryManager(file=sample_tsv_with_semicolon_mixed_type_column) - result = manager.run_query(filter_string="PureNumericSemicolon:25-45") + def test_purely_numeric_list_column_range_filter(self, sample_tsv_with_list_mixed_type_column): + """Test that a purely numeric list column supports range operations.""" + manager = SidecarQueryManager(file=sample_tsv_with_list_mixed_type_column) + result = manager.run_query(filter_string="PureNumericList:25-45") sample_ids = result.get_unique_values("Sample_ID") assert "S1" in sample_ids @@ -828,11 +830,10 @@ def test_commas_in_mixed_numeric_column_detected_during_query(self, tmp_path): tsv_file.write_text(tsv_content) manager = SidecarQueryManager(file=tsv_file) - assert len(manager.warnings) == 0 result = manager.run_query("Population:1,2") assert any("mixed types" in w.lower() for w in result.warnings) - def test_commas_in_pure_string_column_no_warning(self, tmp_path): + def test_commas_in_pure_string_column_no_mixed_type_warning(self, tmp_path): """Test that commas in a pure string column don't trigger mixed-type warnings.""" 
tsv_content = "#Sample_ID\tCode\nS1\t1,2\nS2\t3,4\nS3\t5,6\n" tsv_file = tmp_path / "commas.tsv" @@ -843,6 +844,15 @@ def test_commas_in_pure_string_column_no_warning(self, tmp_path): assert not any("mixed types" in w.lower() for w in result.warnings) + def test_commas_in_cells_produce_format_warning(self, tmp_path): + """Test that commas in plain string cells produce FORMAT warnings during load.""" + tsv_content = "#Sample_ID\tCode\nS1\t1,2\nS2\t3,4\nS3\t5,6\n" + tsv_file = tmp_path / "commas.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + assert any("comma" in w.lower() for w in manager.warnings) + def test_duplicate_column_names_raises(self, tmp_path): """Test that duplicate column names raise SidecarMetadataFormatError during load_file(). Without this check, pandas might silently rename them (e.g., 'Area', 'Area.1').""" @@ -876,17 +886,6 @@ def test_empty_column_name_raises(self, tmp_path): SidecarQueryManager(file=tsv_file) assert "empty" in str(excinfo.value).lower() - def test_semicolon_in_sample_id_raises(self, tmp_path): - """Test that semicolons in Sample_ID values raise SidecarSampleIDError during load_file(). 
- Sample_ID must contain exactly one value per row.""" - tsv_content = "#Sample_ID\tArea\nS1;S2\tNorth\nS3\tEast\n" - tsv_file = tmp_path / "semicolon_sample_id.tsv" - tsv_file.write_text(tsv_content) - - with pytest.raises(SidecarSampleIDError) as excinfo: - SidecarQueryManager(file=tsv_file) - assert "semicolon" in str(excinfo.value).lower() - def test_missing_sample_id_column_raises(self, sample_tsv_missing_sample_id_column): """Test that missing Sample_ID column raise SidecarColumnNotFoundError.""" with pytest.raises(SidecarColumnNotFoundError): @@ -913,28 +912,153 @@ def test_valid_file_loads_successfully(self, sample_tsv_with_edge_cases): assert manager.df is not None assert "Sample_ID" in manager.df.columns + def test_invalid_list_syntax_raises(self, tmp_path): + """Test that invalid Python list syntax in cells raises an error during load.""" + tsv_content = "#Sample_ID\tScores\nS1\t[1, 2\nS2\t5\n" + tsv_file = tmp_path / "bad_list.tsv" + tsv_file.write_text(tsv_content) -class TestArrayNotation: - """Test that Python/JSON-style array notation '[...]' in cells produces a warning and is treated as string.""" + with pytest.raises(SidecarMetadataFormatError) as excinfo: + SidecarQueryManager(file=tsv_file) + assert "invalid" in str(excinfo.value).lower() - def test_array_notation_warning_content(self, array_notation_tsv): - """Test that array notation loads without error, produces a warning (not an error), - the warning mentions semicolons, and exactly one warning is emitted per offending column.""" - manager = SidecarQueryManager(file=array_notation_tsv) - assert manager.df is not None - assert any("array notation" in w.lower() for w in manager.warnings) - assert any("semicolon" in w.lower() and "array notation" in w.lower() for w in manager.warnings) - assert len([w for w in manager.warnings if "array notation" in w.lower() and "Population" in w]) == 1 - - def test_array_notation_column_is_filterable_as_string(self, array_notation_tsv): - """Test that columns 
with array notation are queryable as plain string values.""" - manager = SidecarQueryManager(file=array_notation_tsv).run_query(filter_string="Area:North") - assert len(manager.query_result) == 1 - assert manager.query_result.iloc[0]["Sample_ID"] == "S1" - - def test_array_notation_multiple_columns_warns_per_column(self, array_notation_multiple_cols_tsv): - """Test that each column with array notation gets its own warning.""" - manager = SidecarQueryManager(file=array_notation_multiple_cols_tsv) - assert len([w for w in manager.warnings if "array notation" in w.lower() and "Population" in w]) == 1 - assert len([w for w in manager.warnings if "array notation" in w.lower() and "Scores" in w]) == 1 +class TestQuotedFilterValues: + """Test that quoted filter values allow querying for strings containing commas.""" + + def test_quoted_filter_value_with_comma(self, tmp_path): + """Test that a quoted filter value like '"North,South"' matches the literal string.""" + tsv_content = '#Sample_ID\tArea\nS1\t"North,South"\nS2\tEast\nS3\tNorth\n' + tsv_file = tmp_path / "quoted.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query(filter_string='Area:"North,South"') + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert len(sample_ids) == 1 + + def test_negation_with_quoted_value(self, tmp_path): + """Test that negation works with quoted values: Area:!"North,South" excludes the literal.""" + tsv_content = '#Sample_ID\tArea\nS1\t"North,South"\nS2\tEast\nS3\tNorth\n' + tsv_file = tmp_path / "quoted_neg.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query(filter_string='Area:!"North,South"') + sample_ids = result.get_unique_values("Sample_ID") + assert "S2" in sample_ids + assert "S3" in sample_ids + assert "S1" not in sample_ids + + def test_unquoted_comma_splits_filter_values(self, tmp_path): + """Test that unquoted 
commas still split filter values as OR conditions.""" + tsv_content = "#Sample_ID\tArea\nS1\tNorth\nS2\tEast\nS3\tSouth\n" + tsv_file = tmp_path / "unquoted.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query(filter_string="Area:North,East") + sample_ids = result.get_unique_values("Sample_ID") + assert "S1" in sample_ids + assert "S2" in sample_ids + assert "S3" not in sample_ids + + def test_quoted_filter_value_with_comma_and_space(self, tmp_path): + """Test that a quoted filter value can match a literal value like 'North, South'.""" + tsv_content = '#Sample_ID\tArea\nS1\t"North, South"\nS2\tEast\nS3\tNorth\n' + tsv_file = tmp_path / "quoted_comma_space.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query(filter_string='Area:"North, South"') + sample_ids = result.get_unique_values("Sample_ID") + assert sample_ids == ["S1"] + + def test_unquoted_filter_with_comma_and_space_does_not_match_literal(self, tmp_path): + """Test that unquoted comma-separated filter values do not match a literal 'North, South' cell.""" + tsv_content = '#Sample_ID\tArea\nS1\t"North, South"\nS2\tNorth\nS3\tSouth\n' + tsv_file = tmp_path / "unquoted_comma_space.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = manager.run_query(filter_string="Area:North, South") + sample_ids = sorted(result.get_unique_values("Sample_ID")) + assert sample_ids == ["S2", "S3"] + assert "S1" not in sample_ids + + def test_embedded_double_quote_value_is_not_queryable_with_current_parser(self, tmp_path): + """Document current behavior: values containing literal double quotes are not queryable.""" + tsv_content = '#Sample_ID\tArea\nS1\t"He said ""Hi"""\nS2\tEast\n' + tsv_file = tmp_path / "embedded_quotes.tsv" + tsv_file.write_text(tsv_content) + + manager = SidecarQueryManager(file=tsv_file) + result = 
manager.run_query(filter_string='Area:"He said ""Hi"""') + sample_ids = result.get_unique_values("Sample_ID") + assert len(sample_ids) == 0 + assert any("No results for the filter" in w and "column 'Area'" in w for w in result.warnings) + + +class TestValidationCategoryDispatch: + """Test how SidecarQueryManager maps SharedMetadataValidator categories to behavior.""" + + @staticmethod + def _patch_shared_validator(monkeypatch, result): + class FakeSharedMetadataValidator: + def __init__(self, file_path, project_samples=None, skip_dimensions_check=False): + self.file_path = file_path + self.project_samples = project_samples + self.skip_dimensions_check = skip_dimensions_check + + def load_and_validate(self): + return result + + monkeypatch.setattr(queries_module, "SharedMetadataValidator", FakeSharedMetadataValidator) + + def test_file_read_category_raises_no_data_loaded(self, monkeypatch, tmp_path): + result = MetadataValidationResult( + errors=[ValidationMessage(ValidationCategory.FILE_READ, "failed to read")], + ) + self._patch_shared_validator(monkeypatch, result) + with pytest.raises(SidecarNoDataLoadedError): + SidecarQueryManager(file=tmp_path / "dummy.tsv") + + def test_sample_id_column_category_raises_column_not_found(self, monkeypatch, tmp_path): + result = MetadataValidationResult( + errors=[ValidationMessage(ValidationCategory.SAMPLE_ID_COLUMN, "Sample_ID column is required")], + ) + self._patch_shared_validator(monkeypatch, result) + with pytest.raises(SidecarColumnNotFoundError): + SidecarQueryManager(file=tmp_path / "dummy.tsv") + + def test_sample_id_value_category_raises_sample_id_error(self, monkeypatch, tmp_path): + result = MetadataValidationResult( + errors=[ValidationMessage(ValidationCategory.SAMPLE_ID_VALUE, "Sample_ID is empty")], + ) + self._patch_shared_validator(monkeypatch, result) + with pytest.raises(SidecarSampleIDError): + SidecarQueryManager(file=tmp_path / "dummy.tsv") + + def 
test_other_error_category_raises_metadata_format_error(self, monkeypatch, tmp_path): + result = MetadataValidationResult( + errors=[ValidationMessage(ValidationCategory.HEADER, "Duplicate column names found")], + ) + self._patch_shared_validator(monkeypatch, result) + with pytest.raises(SidecarMetadataFormatError): + SidecarQueryManager(file=tmp_path / "dummy.tsv") + + def test_only_dimensions_and_format_warnings_are_forwarded(self, monkeypatch, tmp_path): + result = MetadataValidationResult( + warnings=[ + ValidationMessage(ValidationCategory.DIMENSIONS, "dimension warning"), + ValidationMessage(ValidationCategory.FORMAT, "format warning"), + ValidationMessage(ValidationCategory.TYPE_CLASSIFICATION, "type warning"), + ], + df=pd.DataFrame({"Sample_ID": ["S1"]}), + ) + self._patch_shared_validator(monkeypatch, result) + manager = SidecarQueryManager(file=tmp_path / "dummy.tsv") + assert "dimension warning" in manager.warnings + assert "format warning" in manager.warnings + assert "type warning" not in manager.warnings diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index 82155746..a643f83f 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -4,187 +4,14 @@ Tests the shared validation logic from SharedMetadataValidator as exercised through the CLI's MetadataTSVValidator wrapper. -Fixtures in this file use Python list notation for multi-value cells (the -current format). The shared conftest.py still contains old semicolon-format -fixtures used by query engine tests that haven't been migrated yet. +Shared fixtures are defined in tests/unit/conftest.py. 
""" from pathlib import Path -import pytest - from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator -@pytest.fixture -def project_samples(): - """Standard set of project samples for testing.""" - return {"S1", "S2", "S3", "S4", "S5"} - - -@pytest.fixture -def valid_list_tsv(tmp_path): - """Valid TSV using Python list notation for multi-value cells.""" - content = "#Sample_ID\tPopulation\tArea\tWeight\n" - content += "S1\t1\tNorth\t12.5\n" - content += "S2\t[2, 4]\tEast\t18.8\n" - content += 'S3\t3\t["West", "South"]\t15.0\n' - content += "S4\t[3, 5]\tSouth\t20.0\n" - content += "S5\t4\tNorth\t22.1\n" - p = tmp_path / "valid_list.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def no_multi_values_tsv(tmp_path): - """TSV with only scalar values, no list notation.""" - content = "#Sample_ID\tPopulation\n" - content += "S1\t1\n" - content += "S2\t2\n" - p = tmp_path / "no_multi.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def numeric_list_tsv(tmp_path): - """TSV with numeric list cells and negative numbers.""" - content = "#Sample_ID\tScores\tValues\tTemperature\tLongitude\n" - content += "S1\t[1, 2, 3]\t[10, 20]\t-5.5\t-2.78\n" - content += "S2\t[4, 5]\t[30, 40, 50]\t-10.2\t-0.13\n" - content += "S3\t6\t60\t0\t1.25\n" - content += "S4\t[7, 8, 9, 10]\t70\t15.5\t[-3.5, -2.1]\n" - content += "S5\t11\t[80, 90]\t-20\t0\n" - p = tmp_path / "numeric_list.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def header_errors_tsv(tmp_path): - """TSV with header errors: wrong first column, duplicate columns, empty column.""" - content = "SampleID\tPopulation\tArea\tArea\t\n" - content += "S1\t1\tNorth\tEast\tValue\n" - p = tmp_path / "header_errors.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def sample_id_errors_tsv(tmp_path): - """TSV with Sample_ID errors: empty, list values, duplicates.""" - content = "#Sample_ID\tPopulation\n" - content += "S1\t1\n" - content += "\t2\n" - content += 
'["S3", "S4"]\t3\n' - content += "S1\t4\n" - p = tmp_path / "sample_id_errors.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def format_errors_tsv(tmp_path): - """TSV with formatting errors: wrong column count, commas, whitespace.""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t1\tNorth\n" - content += "S2\t2,3\tEast\n" - content += "S3 \t 4 \t West \n" - content += "S4\t5\n" - p = tmp_path / "format_errors.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def invalid_list_syntax_tsv(tmp_path): - """TSV with cells that look like lists but have invalid syntax.""" - content = "#Sample_ID\tScores\tValues\n" - content += "S1\t[1, 2, 3]\tgood\n" - content += "S2\t[4\tbad_unclosed\n" - content += "S3\t[1 2 3]\tbad_no_commas\n" - content += "S4\t5\tnormal\n" - content += "S5\t[6, 7]\tok\n" - p = tmp_path / "invalid_list_syntax.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def mixed_type_list_cell_tsv(tmp_path): - """TSV with a list cell containing mixed element types (error).""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t1\tNorth\n" - content += 'S2\t[1, "two", 3]\tEast\n' - content += "S3\t5\tSouth\n" - p = tmp_path / "mixed_type_list.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def mixed_type_across_cells_tsv(tmp_path): - """TSV with mixed types across cells in a column (some numeric, some string).""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t1\tNorth\n" - content += "S2\tabc\tEast\n" - content += "S3\t3\tSouth\n" - content += "S4\t4\tWest\n" - content += "S5\tdef\tNorth\n" - p = tmp_path / "mixed_across_cells.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def semicolons_in_cells_tsv(tmp_path): - """TSV with semicolons in cell values (should produce warnings).""" - content = "#Sample_ID\tPopulation\tArea\n" - content += "S1\t1\tNorth\n" - content += "S2\t2;4\tEast\n" - content += "S3\t3\tWest;South\n" - p = tmp_path / "semicolons.tsv" - 
p.write_text(content) - return p - - -@pytest.fixture -def whitespace_variant_lists_tsv(tmp_path): - """TSV with different whitespace styles in list notation.""" - content = "#Sample_ID\tScores\n" - content += "S1\t[1, 2, 3]\n" - content += "S2\t[4,5,6]\n" - content += "S3\t[ 7 , 8 , 9 ]\n" - p = tmp_path / "whitespace_lists.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def string_list_tsv(tmp_path): - """TSV with string list cells.""" - content = "#Sample_ID\tAreas\tScore\n" - content += 'S1\t["North", "South"]\t10\n' - content += 'S2\t["East"]\t20\n' - content += "S3\tWest\t30\n" - p = tmp_path / "string_list.tsv" - p.write_text(content) - return p - - -@pytest.fixture -def empty_list_tsv(tmp_path): - """TSV with an empty list cell.""" - content = "#Sample_ID\tScores\n" - content += "S1\t[1, 2]\n" - content += "S2\t[]\n" - content += "S3\t3\n" - p = tmp_path / "empty_list.tsv" - p.write_text(content) - return p - - class TestValidTSV: """Test that a valid TSV with list notation passes all checks.""" @@ -448,3 +275,25 @@ def test_tuple_notation_is_not_a_list(self, tmp_path): stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 + + +class TestQuotedCellValues: + """Test quoted cell values that include commas/spaces and embedded quote characters.""" + + def test_quoted_cell_with_comma_space_produces_comma_warning(self, tmp_path): + content = '#Sample_ID\tArea\nS1\t"North, South"\nS2\tEast\n' + p = tmp_path / "quoted_comma_space.tsv" + p.write_text(content) + + stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + assert len(errors) == 0 + assert any("comma" in w.lower() and "North, South" in w for w in warnings) + + def test_quoted_cell_with_embedded_double_quotes_is_allowed(self, tmp_path): + content = '#Sample_ID\tComment\nS1\t"He said ""Hi"""\nS2\tPlain\n' + p = tmp_path / "embedded_quotes.tsv" + p.write_text(content) + + 
stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + assert len(errors) == 0 + assert not any("list syntax" in e.lower() for e in errors) diff --git a/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py b/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py index d093b41d..028fb465 100644 --- a/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py +++ b/tests/unit/divbase_lib/test_tsv_to_dataframe_to_tsv.py @@ -34,7 +34,7 @@ "array_notation_tsv", "array_notation_multiple_cols_tsv", "sample_tsv_with_mixed_type_column", - "sample_tsv_with_semicolon_mixed_type_column", + "sample_tsv_with_list_mixed_type_column", ] From cc4050411108435371cd36f525d0aa91ee47ca56 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 13:06:45 +0100 Subject: [PATCH 096/100] Update user guide for sidecar metadata To reflect new syntax using bracket arrays. --- docs/user-guides/sidecar-metadata.md | 71 ++++++++++++++++------------ 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/docs/user-guides/sidecar-metadata.md b/docs/user-guides/sidecar-metadata.md index 48d87847..e564930a 100644 --- a/docs/user-guides/sidecar-metadata.md +++ b/docs/user-guides/sidecar-metadata.md @@ -39,7 +39,7 @@ divbase-cli dimensions create-metadata-template ``` !!! Note - There can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. If not specified, it the default `sample_metadata.tsv` will be assumed. It is up to the user if the want to have multiple TSVs in the same project to organise their metadata in a specific way. It is allowed to have duplicate sample names and metadata across multiple TSV files, since only one TSV can be called per query. 
It is recommended to a have a master TSV that contains all samples from all the VCFs in the project: querying on TSVs that contain subsets of all sample names is possible, but will sample names not included in the TSV used for the query will be disregarded for the query. + There can be multiple TSVs in the same project and it is possible to call them for the queries with the `--metadata-tsv-name` flag. If not specified, the default `sample_metadata.tsv` will be assumed. It is up to the user if they want to have multiple TSVs in the same project to organise their metadata in a specific way. It is allowed to have duplicate sample names and metadata across multiple TSV files, since only one TSV can be called per query. It is recommended to have a master TSV that contains all samples from all the VCFs in the project: querying on TSVs that contain subsets of all sample names is possible, but sample names not included in the TSV used for the query will be disregarded for the query. ### Sidecar TSV format requirements @@ -49,45 +49,45 @@ To be able to accommodate a variety of metadata needs, DivBase does not enforce 1. The first row must be a header row, start with `#`, and the first column must be named `Sample_ID`. 2. The `Sample_ID` column must contain the exact names of the samples as they are spelled in the VCF files. Sample names need to occur uniqely in the TSV: only one row per sample name in the `Sample_ID` column, no duplicates allowed. This will already be handled if user has run a `divbase-cli dimensions update` job and, after its completion, has generated a pre-filled template with: `divbase-cli dimensions create-metadata-template` -3. The `Sample_ID` column can only contain one sample name per row. This is different from the user-defined columns that can take arrays of values for each cell in a column using semicolons (`;`) as delimters. `Sample_ID` values can also not be empty. +3. The `Sample_ID` column can only contain one sample name per row.
This is different from the user-defined columns that can take arrays of values for each cell in a column using bracket array notation (explained in the next section below). `Sample_ID` values can also not be empty. 4. Every column need to be tab separated for all rows, including the header. #### User-defined columns -After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. +After the `Sample_ID` column has been populated, users can add any columns and values to the TSV. The cells can be either single-value (e.g. `1`, `55.02`, and `North`) or multi-value using bracket array notation (the syntax for Python lists), e.g. `[1,2,3]`, `["North", "Northwest"]`. Bracket array was chosen for DivBase since it is a common notation used in several programming languages, including Python, JSON, and JavaScript to name a few. !!! Warning It is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling of a column name must be used for the filters. This includes matching upper and lower case letters. To ensure that user-defined metadata can be used in DivBase, we ask you follow the following constraints and considerations: -1. The user-defined columns can be **either** numeric **or** string type. A column is classified as numeric only if all values can be parsed as numbers (including individual parts in semicolon-separated cells). If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like "8", "1a", "5a" will be treated as string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. -2. Semicolon-separated values are supported in TSV cells to represent arrays of values. 
This allows users to have samples that can belong to multiple values in the same column. For instance belong to two different groups or categories. This works with both numerical and string data (e.g. "2;4;21" or "North; North-West"). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. **Important:** Semicolons (`;`) are the only supported delimiter for multi-value cells; bracket array notation (`[1,2,3]`) is not supported and will treated as a string. DivBase uses commas (`,`) in the [Query syntax](#query-syntax-for-sidecar-metadata) for a different purpose (separating filter values in queries). -3. Special characters like hyphens (`-`) and commas (`,`) are allowed, but will cause the column to be treated as a string column. String columns cannot be filtered using numeric operators (see details in [Filtering on numerical columns](#filtering-on-numerical-columns)) and will raise warnings. For example, values like "1-2" or "1,2" will be interpreted as strings, not numeric ranges or multi-value fields. If you intend to store multiple numeric values in a cell, use semicolons (e.g., "1;2"). For decimals, use English decimal notation with a period (e.g., "3.14") and not a comma. -4. The only characters with special structural meaning in DivBase sidecar metadata TSV files are `#` (for header comments), `;` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in strings (e.g., `North-West`). -5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: " Sample 1 " will be processed as "Sample 1". +1. 
The user-defined columns can be **either** numeric **or** string type. A column is classified as numeric only if all values can be parsed as numbers, including each element in list cells (e.g. `[1, 2, 3]`; see bullet 2 below). If any value in a column is non-numeric, the entire column is treated as a string column. This means a column with values like `8`, `1a`, `5a` will be treated as a string column even though some values look numeric. The DivBase backend uses [`Pandas`](https://pandas.pydata.org/) to automatically infer column type based on its data, so there is no need to specify in the TSV whether the values are numerical or string. +2. Bracket array notation in TSV cells can be used to represent arrays of values. This allows users to have samples that can belong to multiple values in the same column, for instance a sample that belongs to two different groups or categories. This works with both numerical and string data (e.g. `[2,4,21]` or `["North", "Northwest"]`). Note that this might make the process of writing queries more complex than if just a single value is used for each cell. The arrays are whitespace-insensitive: `[1,2]`, `[1, 2]`, and `[ 1 , 2 ]` all parse identically in the DivBase backend. We recommend that string elements are enclosed in quotes inside the array (`["North", "Northwest"]` or `['North', 'Northwest']`); although it is not strictly necessary, it helps clarify the type to the user. It is also possible to use empty lists `[]` to represent "no values", if so desired. +3. Special characters like hyphens (`-`) and commas (`,`) are allowed, but will cause the column to be treated as a string column. String columns cannot be filtered using numeric operators (see details in [Filtering on numerical columns](#filtering-on-numerical-columns)) and will raise warnings. For example, values like `1-2` or `1,2` will be interpreted as strings, not numeric ranges or multi-value fields.
If you intend to store multiple numeric values in a cell, use bracket array notation (e.g., `[1, 2]`). For decimals, use English decimal notation with a period (e.g., `3.14`) and not a comma. +4. The only characters with special structural meaning in DivBase sidecar metadata TSV files are `#` (for header comments), `[` and `]` (for multi-value cell separation), and `\t` (tab, for column separation). Other special characters are generally supported in data values, but be aware that Your Mileage May Vary. Some common cases that have been tested and are supported include diacritic unicode characters like `å`, `ä`, `ö`, and hyphens in strings (e.g., `North-West`). +5. Leading and trailing whitespaces are removed by the DivBase backend in order to ensure robust filtering and pattern matching. Whitespaces inside strings will be preserved. For instance: `" Sample 1 "` will be processed as `"Sample 1"`. !!! Note Note that the TSV does not need contain any information of which VCF files the samples are found in: this is handled by the project's VCF dimensions indexing (`divbase-cli dimensions update`). We advice against putting sample-VCF file mappings in TSV file to reduce the risk of confusion and data mismatch. #### Example -This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading with `#`, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). In some cells, semicolons (`;`) are used to assign multiple values to the same sample and column. +This example illustrates how a sidecar sample metadata TSV can look like. The mandatory requirement are fulfilled (heading with `#`, `Sample_ID` column, tab-separated file). The user-defined column contains examples of a numerical column (`Population`) and a string column (`Area`). 
In some cells, bracket array notation (`[...]`) is used to assign multiple values to the same sample and column. ```text #Sample_ID Population Area Weight S1 1 North 12.1 -S2 2;4 East 18.8 -S3 3 West;South 15.0 +S2 [2, 4] East 18.8 +S3 3 ["West", "South"] 15.0 S4 4 West 20.2 S5 5 North 16.1 S6 6 East 25.2 -S7 1;3;5 South 22.6 +S7 [1, 3, 5] South 22.6 S8 2 West 19.5 ``` -For the sake of the demonstration later in this guide, let's assume that this TSV file have been uploaded to a DivBase project among with two VCF files where samples S1-S4 are found in `file1.vcf.gz` and S5-S6 in `file2.vcf.gz`. Let's also assume that the `divbase-cli dimensions update` has been run after all files have been uploaded so that the system has up-to-data information on which sample is found in which file. +For the sake of the demonstration later in this guide, let's assume that this TSV file have been uploaded to a DivBase project along with two VCF files where samples S1-S4 are found in `file1.vcf.gz` and S5-S8 in `file2.vcf.gz`. Let's also assume that the `divbase-cli dimensions update` has been run after all files have been uploaded so that the system has up-to-date information on which sample is found in which file. ### Validating a sidecar metadata TSV with `divbase-cli` @@ -113,10 +113,14 @@ The following will return **Errors**. These must be fixed for the sidecar TSV be - Tab separation: Row has the wrong number of columns. (Note: This check is only done in the validator! It is currently not part of the checks at the start of a sample metadata query.) -- `Sample_ID` column issues: Empty value, value containing a semicolon, rows with duplicate sample names. +- `Sample_ID` column issues: Empty value, value containing bracket array notation (e.g. `["S1", "S2"]`), rows with duplicate sample names. - Samples in TSV not found in project dimensions index: All samples listed in the TSV must exist in the project's dimensions index. 
If a sample is known to be in a VCF file in the DivBase project but is missing from the VCF dimensions index, the user needs to run `divbase-cli dimensions update` to submit an update job and then try the validator again after the job has finished. +- Mixed element types within a single multi-value cell: e.g. `[1, "two", 3]` (since it contains different types: `int`, `string`, `int`). + +- Bracket array notation that is not valid Python list syntax, e.g. unclosed brackets (`[4`) or missing commas between array elements (`[1 2 3]`). DivBase uses the [`ast.literal_eval`](https://docs.python.org/3/library/ast.html) function from the Python standard library to parse every cell in the TSV and report errors to the user. + !!! Note The formatting errors listed above are also enforced by the DivBase query engine when loading the metadata file for queries (except checking tab separation which is a validator-specific check). This means that even if the validator is not run before upload, the query engine will analyse the file content and report issues as errors. Detected Errors are different from Warnings in that errors will result in queries not even being run. @@ -128,11 +132,11 @@ The validator will also raise **Warnings**. DivBase queries can still be run wit - Samples in the project's dimensions index not found in the TSV. These samples will not be considered in queries, and that might in fact be what the user wants, especially if using multiple TSVs. Just be sure to be careful when using this since it will affect the results. -- Mixed-type columns (a column with numeric and string values, e.g. "8", "1a", "5a") and semicolon-separated cells with mixed types (e.g., "1;abc"). They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. +- Mixed-type columns (a column with numeric and string values, e.g. `8`, `1a`, `5a`).
They are allowed but the user should keep in mind that since they will be treated as string columns, numeric query operations (ranges, inequalities) will not work on these columns. -- Hyphens in values that look like range notation (e.g., "1-2") in columns that otherwise contain numeric values. The same goes for commas (e.g. "1,2"). The warning message will ask the user if they intended this to be a multicolumn value which should use semicolons as delimiters. +- Hyphens in values that look like range notation (e.g., `1-2`) in columns that otherwise contain numeric values. The same goes for commas (e.g. `"1,2"`). The warning message will ask the user if they intended this to be a multi-value cell, which should use bracket array notation. -- Bracket array notation, i.e. cell values that are enclosed in brackets `[ ... ]` is not supported by DivBase. Bracked cells will be treated as strings, which can lead to undesired filtering results. Use semicolon (`;`) to delimit multi-value cells instead. Example: use '`1;2;3` instead of `[1,2,3] in the TSV cells. +- Semicolons in plain string cells (e.g., `2;4` or `West;South`). Since DivBase uses semicolons to separate key:value filter pairs in query syntax (e.g. `"Area:North;Population:1"`), a TSV cell containing a literal semicolon cannot be matched with the queries. Use bracket array notation instead to store multi-column values (e.g. `[2, 4]` or `["West", "South"]`). ## Query Syntax for sidecar metadata @@ -146,7 +150,7 @@ The TSV query syntax is `"Key1:Value1,Value2;Key2:Value3,Value4"`, where `Key1:` It is possible to exclude a value by prefixing it with a `!` (NOT) operator: `"Key:!Value"`. When mixing inclusive and exclusive filters (e.g. `"Key1:Value1,Value2; Key2:!Value3"`), only the rows that match the positive filters and do not match any of the excluded values will be returned. This can be used to write complex queries. !!! 
note - Please note that semicolon (`;`) is used for different purposes in the TSV (for denoting multi-value cells) and in the query syntax (for performing queries on multiple columns)! + Please note that semicolons (`;`) have special meaning in the query filter syntax: to separate key:value filter pairs (e.g. `"Area:North;Population:1"`). This means that if a TSV cell contains a literal semicolon, it cannot be matched via query filters because the query parser will consider it a delimiter and split the string on it. DivBase will return warnings to the user if the TSV contains semicolons. For example, if the user wants to query the TSV on column `Area` for all samples that contain the value `North`,: @@ -182,14 +186,14 @@ Unique filenames: ['file1.vcf.gz', 'file2.vcf.gz'] Filtering is inclusive by default. This applies both for the filter values and the cell values: - If a filter contains multiple values, e.g. `"Area:North,West"`, the row is included if at least one of the filter values matches any value in the cell. I.e. a row with `North`, and a row with `West` will both be returned from this filter. -- If a cell in the TSV contains multiple values separated by a semicolon as explained in [User-defined columns](#user-defined-columns) (e.g., `North;West`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row with the array value `North;West`. +- If a cell in the TSV contains multiple values using bracket array notation as explained in [User-defined columns](#user-defined-columns) (e.g., `["North","West"]`), the row is included if any of those values match the filter. Filters with `"Area:North"`, `"Area:West"`, and `"Area:North,West"` will all return the row from the TSV with the cell value `["North","West"]`. !!! 
note To reiterate what was written in the [User-defined columns](#user-defined-columns) section above: it is the user's responsibility to ensure that the spelling of column headers and values is consistent. When filtering on the sidecar metadata, the exact spelling must be used for the filters. ### Filtering on string columns -Queries on string columns are straight-forward in the sense that each semicolon-separated value in the TSV are treated as discrete values. +Queries on string columns are straightforward in the sense that each element in a cell (single-value or multi-value in a bracket array) in the TSV is treated as a discrete value. As explained above, commas can be used to write multi-values filters. For instance, the query: @@ -197,7 +201,7 @@ As explained above, commas can be used to write multi-values filters. For instan divbase-cli query tsv "Area:North,South,East" ``` -will return all samples where **at least one** of the semicolon-separated values in the Area column matches any of the filter values (`North`, `South`, or `East`). +will return all samples where **at least one** of the values (including the individual values in a bracket array) in the Area column matches any of the filter values (`North`, `South`, or `East`). The `!`(NOT) operator can be used to exclude specific cell values from a column. When a `!` is used on its own, such as in the command: @@ -205,17 +209,26 @@ The `!`(NOT) operator can be used to exclude specific cell values from a column. divbase-cli query tsv "Area:!North" ``` -it will return all rows that do not contain `North` in the `Area`. Multi-column values that contain `North`, such as a row with e.g. `North;South` will also be excluded by this query. +it will return all rows that do not contain `North` in the `Area` column. Multi-value cells that contain `North`, such as a row with e.g. `["North", "South"]`, will also be excluded by this query. Note that when inclusive and exclusive are combined (e.g.
`"Area:East,!South"`), only rows that match both filters (include `East`, exclude `South`) will be returned in the results. +!!! Note + It is also possible to use quotes in the query filters to match TSV cell values that have e.g. a comma or a white space. When doing this, the whole outer filter string must be enclosed in single quotes in order to prevent the shell from interpreting the inner double quotes incorrectly. For instance, if there is a cell value in the TSV with `North, South` (i.e. containing a comma and a whitespace), it IS possible to filter on this value with: + + ```bash + divbase-cli query tsv 'Area:"North, South"' + ``` + + Using double quotes for the outside is NOT supported `"Area:'North, South'"`. + ### Filtering on numerical columns -A TSV column is considered as numeric in DivBase only if all cell values — including each individual part within semicolon-separated cells (e.g. `1;3;5`) — can be parsed as a number. For example: +A TSV column is considered as numeric in DivBase only if all cell values — including each individual element within a bracketed array (e.g. `[1,3,5]`) — can be parsed as a number. For example: -- A column with values `1`, `2;4`, `3`, `1;3;5` is considered numeric since all elements are numbers. All numeric operations below (inequalities, ranges, discrete) are fully supported on this column. +- A column with values `1`, `[2,4]`, `3`, `[1,3,5]` is considered numeric since all elements are numbers. All numeric operations below (inequalities, ranges, discrete) are fully supported on this column. -- A column with values `1;1-2`, `3`, `5` is considered a string column since the part `1-2` cannot be parsed as a number. Only exact string matching is supported for this column. +- A column with values `1-2`, `3`, `5` is considered a string column since `1-2` cannot be parsed as a number. Only exact string matching is supported for this column. 
- A column with values `8`, `1a`, `5a`is considered a string column since it has mixed types (`8` is numeric, the others are strings). Only exact string matching is supported for this column. @@ -246,7 +259,7 @@ The `!` (NOT) operator can really come to good use for numerical filters: - `"Weight:!1-2,4"` returns rows where the value is not in the range 1–2, or is 4. !!! Tip - Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for semicolon-separated numeric columns as long as every semicolon separated part (`part;part`) in every cell in the column is a valid number. For instance: a `Population` column with values `1`, `2;4`, `1;3;5`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `2;4` and `1;3;5`. + Numeric operations such as inequalities like `>25`, and ranges like `20-40` are fully supported for multi-value cells in numeric columns as long as every element in the bracket array (`[1, 2]`) in every cell in the column is of numerical type. For instance: a `Population` column with values `1`, `[2,4]`, `[1,3,5]`; in this case a query like `divbase-cli query tsv "Population:>3"` will correctly match cells like `[2,4]` and `[1,3,5]`. ### Filtering on Sample names @@ -294,8 +307,8 @@ Assuming that the sidecar metadata TSV file looks like in the [Example](#example divbase-cli query tsv "Area:North,West,!South;Weight:>10,<=20,!15,18-22" ``` -- include rows where the `Area` column contains either `North` or `West` (also applied to semicolon-separated multi-value cells), **but excludes** any row where `South` is present in the `Area` column—even if `North` or `West` is also present. +- include rows where the `Area` column contains either `North` or `West` (also applied to bracket array multi-value cells), **but excludes** any row where `South` is present in the `Area` column—even if `North` or `West` is also present. 
-- include rows where the `Weight` column is greater than 10, **or** less than or equal to 20, **or** in the range 18–22 (inclusive), **but excludes** any row where Weight is exactly 15 **or** any value in the range 18–22. +- include rows where the `Weight` column is greater than 10, **or** less than or equal to 20, **or** in the range 18–22 (inclusive), **but excludes** any row where Weight is exactly 15. -There are three samples (rows) that fulfill this, and this is what the query results will return: `S1`, `S4`, and `S5`. +There are four samples (rows) that fulfill this, and this is what the query results will return: `S1`, `S4`, `S5`, and `S8`. From df0caed49b47882c59f65327c69948d9eeca83a2 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 13:17:59 +0100 Subject: [PATCH 097/100] Update mkdocs CLI autogen docs on dimensions Should probably have been committed as part of #70. --- docs/cli/_auto_generated/dimensions.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/cli/_auto_generated/dimensions.md b/docs/cli/_auto_generated/dimensions.md index 874187a7..b67be980 100644 --- a/docs/cli/_auto_generated/dimensions.md +++ b/docs/cli/_auto_generated/dimensions.md @@ -52,6 +52,9 @@ $ divbase-cli dimensions show [OPTIONS] * `--filename TEXT`: If set, will show only the entry for this VCF filename. * `--unique-scaffolds`: If set, will show all unique scaffold names found across all the VCF files in the project. * `--unique-samples`: If set, will show all unique sample names found across all the VCF files in the project. +* `--sample-names-limit INTEGER RANGE`: Maximum number of sample names to display per list in terminal output. [default: 20; x>=1] +* `--sample-names-output TEXT`: Write full sample names to file instead of truncating in terminal output. Mutually exclusive with --sample-names-stdout. +* `--sample-names-stdout`: Print full sample names to stdout (useful for piping). Mutually exclusive with --sample-names-output.
* `--project TEXT`: Name of the DivBase project, if not provided uses the default in your DivBase config file * `--help`: Show this message and exit. From 801676ce5cc142837ff4b9ca19e129645dd314da Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 14:04:09 +0100 Subject: [PATCH 098/100] Limit number of samples validator prints to user When using divbase-cli dimensions validate-metadata-file with the mock 5000 sample data, it will print every single sample name to terminal if they are not in the dimensions of that project. This limits it to 20. --- .../cli_commands/dimensions_cli.py | 12 +++- .../src/divbase_lib/metadata_validator.py | 68 +++++++++++++++---- .../test_sample_metadata_tsv_validator.py | 7 ++ 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 77aac23c..3128827c 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -293,6 +293,11 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( ..., help="Name of the input TSV file to validate.", ), + full_sample_mismatch_names: bool = typer.Option( + False, + "--full-sample-mismatch-names", + help="Show full (untruncated) list of sample names for dimensions mismatch messages. 
Otherwise, the default limit is to show 20 sample names.", + ), project: str | None = PROJECT_NAME_OPTION, ) -> None: """ @@ -332,7 +337,12 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( ) unique_sample_names = DimensionsSamplesResult(**response.json()).unique_samples - stats, errors, warnings = MetadataTSVValidator.validate(file_path=input_path, project_samples=unique_sample_names) + dimensions_sample_preview_limit = None if full_sample_mismatch_names else 20 + stats, errors, warnings = MetadataTSVValidator.validate( + file_path=input_path, + project_samples=unique_sample_names, + dimensions_sample_preview_limit=dimensions_sample_preview_limit, + ) if stats: print("[bold cyan]VALIDATION SUMMARY:[/bold cyan]") diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index 21dcdec5..4329e291 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -84,10 +84,17 @@ class SharedMetadataValidator: protect the query engine from malformed TSV content. """ - def __init__(self, file_path: Path, project_samples: set[str] | None = None, skip_dimensions_check: bool = False): + def __init__( + self, + file_path: Path, + dimensions_sample_preview_limit: int | None, + project_samples: set[str] | None = None, + skip_dimensions_check: bool = False, + ): self.file_path = file_path self.project_samples = project_samples self.skip_dimensions_check = skip_dimensions_check + self.dimensions_sample_preview_limit = dimensions_sample_preview_limit self.result = MetadataValidationResult() def load_and_validate(self) -> MetadataValidationResult: @@ -355,15 +362,12 @@ def _validate_sample_ids(self, df: pd.DataFrame) -> tuple[list[ValidationMessage return errors, [] - @staticmethod - def parse_cell_value(cell_value) -> Any: + def parse_cell_value(self, cell_value) -> Any: """ - Parse a single cell value. 
If the string representation starts with '[', - it must be a valid Python list literal (parsed via ast.literal_eval). - Non-list cells are returned as-is (scalar). + Parse a single cell value. If the string representation starts with '[', it must be a valid Python list literal (parsed via ast.literal_eval). + Non-list cells are returned as-is. - Raises ValueError if a cell looks like a list (starts with '[') but - cannot be parsed by ast.literal_eval. + Raises ValueError if a cell looks like a list (starts with '[') but cannot be parsed by ast.literal_eval. """ if pd.isna(cell_value): return cell_value @@ -592,24 +596,60 @@ def _validate_dimensions_match( missing_from_project = tsv_samples - project_samples if missing_from_project: - examples = sorted(list(missing_from_project)) + examples, was_truncated = self._format_sample_name_preview( + sample_names=missing_from_project, preview_limit=self.dimensions_sample_preview_limit + ) + full_output_hint = f" {self._build_full_sample_output_hint()}" if was_truncated else "" errors.append( ValidationMessage( ValidationCategory.DIMENSIONS, - f"The following samples in the TSV were not found in the DivBase project's dimensions index: {examples}. " - "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries.", + f"The following samples in the TSV were not found in the DivBase project's dimensions index ({examples}). " + "DivBase requires that all samples in the TSV file must be present in the project's dimensions index to be used for queries." 
+ f"{full_output_hint}", ) ) missing_from_tsv = project_samples - tsv_samples if missing_from_tsv: - examples = sorted(list(missing_from_tsv)) + examples, was_truncated = self._format_sample_name_preview( + sample_names=missing_from_tsv, preview_limit=self.dimensions_sample_preview_limit + ) + full_output_hint = f" {self._build_full_sample_output_hint()}" if was_truncated else "" warnings.append( ValidationMessage( ValidationCategory.DIMENSIONS, - f"The following samples in the DivBase project's dimensions index were not found in the TSV: {examples}. " - "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file.", + f"The following samples in the DivBase project's dimensions index were not found in the TSV ({examples}). " + "This is allowed for DivBase metadata TSV files, but please be aware that these samples will not be considered when making queries with this metadata file." + f"{full_output_hint}", ) ) return errors, warnings + + def _format_sample_name_preview(self, sample_names: set[str], preview_limit: int | None = 20) -> tuple[str, bool]: + """ + Build a compact sample-name summary for terminal messages. Truncates list of sample names if it exceeds the preview limit (20 by default). + + Returns only a full list when the list is small, otherwise includes a count and + a preview of the first 20 samples (default value) to avoid overwhelming CLI output. 
+ """ + sorted_names = sorted(sample_names) + total = len(sorted_names) + if preview_limit is None: + return f"count: {total}, samples: {sorted_names}", False + + if total <= preview_limit: + return f"count: {total}, samples: {sorted_names}", False + + preview = sorted_names[:preview_limit] + return f"count: {total}, showing first {preview_limit}: {preview}", True + + def _build_full_sample_output_hint(self) -> str: + """ + When the sample mismatch list is truncated (>20 for default settings), give users a hint on how they can see the full sample-name mismatch list. + """ + return ( + "To view the full list of mismatched samples, run: " + "divbase-cli dimensions validate-metadata-file --project " + "--full-sample-mismatch-names." + ) diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index a643f83f..e3af50f4 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -204,6 +204,13 @@ def test_samples_not_in_tsv(self, valid_list_tsv): for w in warnings ) + def test_large_dimension_mismatch_is_summarized(self, valid_list_tsv): + # Create samples named S0001, S0002, ..., S0050 + project_samples = {f"S{i:04d}" for i in range(1, 51)} + _, _, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + assert any("count: 50, showing first 20" in w for w in warnings) + assert not any("S0050" in w for w in warnings) + class TestStatistics: """Test statistics collection.""" From 467b38c394cf370d403c102921761c157aa8bb39 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 16:38:21 +0100 Subject: [PATCH 099/100] Make MetadataTSVValidator instance-based instead Having the classmethod was convenient for calling the class from the CLI since the instance was never reused.
But the classmethod still captured state and used the __init__ method so it can be clearer to just follow the instance-based style used in the rest of the codebase. --- .../cli_commands/dimensions_cli.py | 3 +- .../services/sample_metadata_tsv_validator.py | 31 +++-- .../src/divbase_lib/metadata_validator.py | 2 +- .../test_sample_metadata_tsv_validator.py | 123 ++++++++++++------ 4 files changed, 107 insertions(+), 52 deletions(-) diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index 3128827c..e5dc9e5f 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -338,11 +338,12 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( unique_sample_names = DimensionsSamplesResult(**response.json()).unique_samples dimensions_sample_preview_limit = None if full_sample_mismatch_names else 20 - stats, errors, warnings = MetadataTSVValidator.validate( + validator = MetadataTSVValidator( file_path=input_path, project_samples=unique_sample_names, dimensions_sample_preview_limit=dimensions_sample_preview_limit, ) + stats, errors, warnings = validator.validate() if stats: print("[bold cyan]VALIDATION SUMMARY:[/bold cyan]") diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index e22567ce..33e5c9d9 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -13,21 +13,29 @@ class MetadataTSVValidator: - """Validates sidecar metadata TSV files against DivBase requirements.""" + """ + Client-side wrapper for validating sidecar metadata TSV files against DivBase requirements. 
+ Calls SharedMetadataValidator to perform the actual validation. + """ - def __init__(self, file_path: Path, project_samples: list[str] | set[str]): + def __init__( + self, + file_path: Path, + project_samples: list[str] | set[str], + dimensions_sample_preview_limit: int | None = 20, + ): """ Initialize the validator. File path is the path to the TSV file to validate, and project_samples is a list or set of unique sample names from the project's dimensions index. """ self.file_path = file_path self.project_samples = set(project_samples) if isinstance(project_samples, list) else project_samples + self.dimensions_sample_preview_limit = dimensions_sample_preview_limit self.errors: list[str] = [] self.warnings: list[str] = [] self.stats: dict = {} - @classmethod - def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tuple[dict, list[str], list[str]]: + def validate(self) -> tuple[dict, list[str], list[str]]: """ Validate a TSV file and return results. @@ -38,22 +46,21 @@ def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tup Returns a tuple of (stats, errors, warnings) where stats is a dictionary of collected statistics, errors is a list of error messages, and warnings is a list of warning messages. 
""" - validator = cls(file_path, project_samples) - shared_validator = SharedMetadataValidator( - file_path=file_path, - project_samples=validator.project_samples, + file_path=self.file_path, + project_samples=self.project_samples, skip_dimensions_check=False, + dimensions_sample_preview_limit=self.dimensions_sample_preview_limit, ) result = shared_validator.load_and_validate() - validator.errors = [error_entry.message for error_entry in result.errors] - validator.warnings = [warning_entry.message for warning_entry in result.warnings] + self.errors = [error_entry.message for error_entry in result.errors] + self.warnings = [warning_entry.message for warning_entry in result.warnings] if result.df is not None and "Sample_ID" in result.df.columns: try: tsv_samples = set(result.df["Sample_ID"].tolist()) - validator._collect_statistics( + self._collect_statistics( df=result.df, tsv_samples=tsv_samples, numeric_cols=result.numeric_columns, @@ -64,7 +71,7 @@ def validate(cls, file_path: Path, project_samples: list[str] | set[str]) -> tup # If Sample_ID access fails (e.g., in the very rare case that duplicate Sample_ID column make it a DataFrame due to dataframe nesting), skip statistics as the validation errors already captured the issue pass - return validator.stats, validator.errors, validator.warnings + return self.stats, self.errors, self.warnings def _collect_statistics( self, diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index 4329e291..3bf0e233 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -87,9 +87,9 @@ class SharedMetadataValidator: def __init__( self, file_path: Path, - dimensions_sample_preview_limit: int | None, project_samples: set[str] | None = None, skip_dimensions_check: bool = False, + dimensions_sample_preview_limit: int | None = 20, ): self.file_path = file_path 
self.project_samples = project_samples diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index e3af50f4..fc71d6ec 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -17,7 +17,8 @@ class TestValidTSV: def test_valid_list_tsv_passes_all_checks(self, valid_list_tsv, project_samples): """Test that a valid TSV with list notation passes with no errors or warnings.""" - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert len(errors) == 0 assert len(warnings) == 0 @@ -35,22 +36,26 @@ class TestHeaderValidation: """Test validation of header row.""" def test_wrong_first_column_name(self, header_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) + validator = MetadataTSVValidator(header_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("First column must be named '#Sample_ID'" in e for e in errors) def test_duplicate_column_names(self, header_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) + validator = MetadataTSVValidator(header_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("Duplicate column names" in e and "Area" in e for e in errors) def test_duplicate_column_names_after_stripping_hash(self, tmp_path, project_samples): content = "#Sample_ID\tSample_ID\tPopulation\nS1\tS1_dup\t1\nS2\tS2_dup\t2\n" tsv_file = tmp_path / "dup_sample_id_cols.tsv" tsv_file.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(tsv_file, project_samples) + validator = MetadataTSVValidator(tsv_file, 
project_samples) + stats, errors, warnings = validator.validate() assert any("Duplicate column names" in e and "Sample_ID" in e for e in errors) def test_empty_column_name(self, header_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(header_errors_tsv, project_samples) + validator = MetadataTSVValidator(header_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("Empty column name" in e for e in errors) @@ -58,16 +63,19 @@ class TestSampleIDValidation: """Test validation of Sample_ID column.""" def test_empty_sample_id(self, sample_id_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) + validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("Sample_ID is empty" in e for e in errors) def test_list_value_in_sample_id(self, sample_id_errors_tsv, project_samples): """List notation in Sample_ID should produce an error.""" - stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) + validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("list values" in e.lower() for e in errors) def test_duplicate_sample_id(self, sample_id_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(sample_id_errors_tsv, project_samples) + validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) @@ -75,20 +83,24 @@ class TestFormattingValidation: """Test validation of TSV formatting.""" def test_wrong_column_count(self, format_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) + validator = 
MetadataTSVValidator(format_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) def test_comma_in_cell_produces_warning(self, format_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) + validator = MetadataTSVValidator(format_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("comma" in w.lower() for w in warnings) assert not any("comma" in e.lower() for e in errors) def test_whitespace_warning(self, format_errors_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(format_errors_tsv, project_samples) + validator = MetadataTSVValidator(format_errors_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("leading or trailing whitespace" in w for w in warnings) def test_semicolons_in_cells_produce_warning(self, semicolons_in_cells_tsv): - stats, errors, warnings = MetadataTSVValidator.validate(semicolons_in_cells_tsv, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(semicolons_in_cells_tsv, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() assert any("semicolon" in w.lower() for w in warnings) assert not any("semicolon" in e.lower() for e in errors) @@ -98,30 +110,35 @@ class TestListSyntaxValidation: def test_valid_list_syntax_no_errors(self, valid_list_tsv, project_samples): """Test that valid list notation like [1, 2] should parse without errors.""" - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert len(errors) == 0 def test_invalid_list_syntax_produces_errors(self, invalid_list_syntax_tsv, project_samples): """Test that cells starting with '[' but can't be parsed produce hard errors.""" - stats, errors, 
warnings = MetadataTSVValidator.validate(invalid_list_syntax_tsv, project_samples) + validator = MetadataTSVValidator(invalid_list_syntax_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("[4" in e and "invalid" in e.lower() for e in errors) assert any("[1 2 3]" in e and "invalid" in e.lower() for e in errors) def test_invalid_list_syntax_collects_all_errors(self, invalid_list_syntax_tsv, project_samples): """Test that all invalid list cells should be reported, not just the first one.""" - stats, errors, warnings = MetadataTSVValidator.validate(invalid_list_syntax_tsv, project_samples) + validator = MetadataTSVValidator(invalid_list_syntax_tsv, project_samples) + stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "invalid" in e.lower() and "list" in e.lower()] assert len(list_errors) == 2 def test_whitespace_insensitive_list_parsing(self, whitespace_variant_lists_tsv): """Test that [1,2,3], [1, 2, 3], and [ 1 , 2 , 3 ] all parse identically.""" - stats, errors, warnings = MetadataTSVValidator.validate(whitespace_variant_lists_tsv, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(whitespace_variant_lists_tsv, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() assert len(errors) == 0 assert "Scores" in stats["numeric_columns"] def test_empty_list_parses_successfully(self, empty_list_tsv): """Test that an empty list [] should parse without errors.""" - stats, errors, warnings = MetadataTSVValidator.validate(empty_list_tsv, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(empty_list_tsv, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -131,31 +148,36 @@ class TestTypeValidation: def test_numeric_list_column_is_numeric(self, numeric_list_tsv, project_samples): """Test that columns with only numeric list cells (multi-value) and numeric scalars (single-values) are 
numeric.""" - stats, errors, warnings = MetadataTSVValidator.validate(numeric_list_tsv, project_samples) + validator = MetadataTSVValidator(numeric_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert "Scores" in stats["numeric_columns"] assert "Values" in stats["numeric_columns"] assert "Scores" not in stats["mixed_type_columns"] def test_string_list_column_is_string(self, string_list_tsv): """Test that columns with string list cells are classified as string.""" - stats, errors, warnings = MetadataTSVValidator.validate(string_list_tsv, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(string_list_tsv, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() assert "Areas" in stats["string_columns"] assert "Score" in stats["numeric_columns"] def test_mixed_types_within_list_cell_is_error(self, mixed_type_list_cell_tsv): """Test that a list cell like [1, "two", 3] with mixed element types raise an error.""" - stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_list_cell_tsv, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(mixed_type_list_cell_tsv, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() assert any("mixed element types" in e.lower() for e in errors) def test_mixed_types_across_cells_is_warning(self, mixed_type_across_cells_tsv, project_samples): """Test that a column with both numeric and string sends a warning (not error).""" - stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + validator = MetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) + stats, errors, warnings = validator.validate() assert "Population" in stats["mixed_type_columns"] assert not any("Population" in e and "mixed" in e.lower() for e in errors) def test_mixed_types_across_cells_produces_per_cell_warnings(self, mixed_type_across_cells_tsv, project_samples): """Test that mixed-type columns identifies which cells are outliers.""" - stats, 
errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + validator = MetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) + stats, errors, warnings = validator.validate() cell_warnings = [w for w in warnings if "non-numeric" in w.lower() and "Population" in w] assert len(cell_warnings) >= 1 assert any("abc" in w for w in cell_warnings) @@ -163,12 +185,14 @@ def test_mixed_types_across_cells_produces_per_cell_warnings(self, mixed_type_ac def test_mixed_types_clarification_warning(self, mixed_type_across_cells_tsv, project_samples): """Test that a general clarification warning is sent for mixed-type columns.""" - stats, errors, warnings = MetadataTSVValidator.validate(mixed_type_across_cells_tsv, project_samples) + validator = MetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any("clarification on mixed types" in w.lower() for w in warnings) def test_negative_numbers_are_numeric(self, numeric_list_tsv, project_samples): """Test that negative numbers are classified as numeric, not string.""" - stats, errors, warnings = MetadataTSVValidator.validate(numeric_list_tsv, project_samples) + validator = MetadataTSVValidator(numeric_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert "Temperature" in stats["numeric_columns"] assert "Longitude" in stats["numeric_columns"] assert len(stats["mixed_type_columns"]) == 0 @@ -181,7 +205,8 @@ def test_mixed_list_and_scalar_in_same_column(self, tmp_path): content += "S3\t[3, 4]\n" p = tmp_path / "mixed_list_scalar.tsv" p.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2", "S3"}) + validator = MetadataTSVValidator(p, {"S1", "S2", "S3"}) + stats, errors, warnings = validator.validate() assert len(errors) == 0 assert "Scores" in stats["numeric_columns"] @@ -191,14 +216,16 @@ class TestDimensionMatching: def test_samples_not_in_project(self, 
valid_list_tsv): project_samples = {"S1", "S2"} - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any( "following samples in the TSV were not found in the DivBase project's dimensions index" in e for e in errors ) def test_samples_not_in_tsv(self, valid_list_tsv): project_samples = {"S1", "S2", "S3", "S10", "S20"} - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert any( "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S10" in w for w in warnings @@ -207,16 +234,27 @@ def test_samples_not_in_tsv(self, valid_list_tsv): def test_large_dimension_mismatch_is_summarized(self, valid_list_tsv): # Create samples named S0001, S0002, ..., S0050 project_samples = {f"S{i:04d}" for i in range(1, 51)} - _, _, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + _, _, warnings = validator.validate() assert any("count: 50, showing first 20" in w for w in warnings) + assert any("--full-sample-mismatch-names" in w for w in warnings) assert not any("S0050" in w for w in warnings) + def test_large_dimension_mismatch_can_show_full_list(self, valid_list_tsv): + project_samples = {f"S{i:04d}" for i in range(1, 51)} + validator = MetadataTSVValidator(valid_list_tsv, project_samples, dimensions_sample_preview_limit=None) + _, _, warnings = validator.validate() + assert any("count: 50, samples:" in w and "S0050" in w for w in warnings) + assert not any("showing first 20" in w for w in warnings) + assert not any("--full-sample-mismatch-names" in w for w in warnings) + class TestStatistics: """Test statistics collection.""" def 
test_statistics_collection(self, valid_list_tsv, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert stats["total_columns"] == 4 assert stats["user_defined_columns"] == 3 assert stats["samples_in_tsv"] == 5 @@ -229,12 +267,14 @@ def test_statistics_collection(self, valid_list_tsv, project_samples): def test_no_multi_values_detected(self, no_multi_values_tsv): """Test that has_multi_values is False when no list cells exist.""" - stats, errors, warnings = MetadataTSVValidator.validate(no_multi_values_tsv, {"S1", "S2"}) + validator = MetadataTSVValidator(no_multi_values_tsv, {"S1", "S2"}) + stats, errors, warnings = validator.validate() assert stats["has_multi_values"] is False def test_multi_values_detected_via_list_cells(self, valid_list_tsv, project_samples): """Test that has_multi_values is True when list cells exist.""" - stats, errors, warnings = MetadataTSVValidator.validate(valid_list_tsv, project_samples) + validator = MetadataTSVValidator(valid_list_tsv, project_samples) + stats, errors, warnings = validator.validate() assert stats["has_multi_values"] is True @@ -244,11 +284,13 @@ class TestEdgeCases: def test_empty_file(self, project_samples, tmp_path): empty_file = tmp_path / "empty.tsv" empty_file.write_text("") - stats, errors, warnings = MetadataTSVValidator.validate(empty_file, project_samples) + validator = MetadataTSVValidator(empty_file, project_samples) + stats, errors, warnings = validator.validate() assert any("File is empty" in e for e in errors) def test_nonexistent_file(self, project_samples): - stats, errors, warnings = MetadataTSVValidator.validate(Path("/nonexistent/file.tsv"), project_samples) + validator = MetadataTSVValidator(Path("/nonexistent/file.tsv"), project_samples) + stats, errors, warnings = validator.validate() assert any("Failed to read file" in e 
for e in errors) def test_non_list_bracket_strings(self, tmp_path): @@ -258,7 +300,8 @@ def test_non_list_bracket_strings(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "bracket_strings.tsv" p.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + validator = MetadataTSVValidator(p, {"S1", "S2"}) + stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -269,7 +312,8 @@ def test_cell_starting_with_bracket_but_not_list(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "bracket_ref.tsv" p.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + validator = MetadataTSVValidator(p, {"S1", "S2"}) + stats, errors, warnings = validator.validate() assert any("[ref]" in e and "invalid" in e.lower() for e in errors) def test_tuple_notation_is_not_a_list(self, tmp_path): @@ -279,7 +323,8 @@ def test_tuple_notation_is_not_a_list(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "tuple.tsv" p.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + validator = MetadataTSVValidator(p, {"S1", "S2"}) + stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -292,7 +337,8 @@ def test_quoted_cell_with_comma_space_produces_comma_warning(self, tmp_path): p = tmp_path / "quoted_comma_space.tsv" p.write_text(content) - stats, errors, warnings = MetadataTSVValidator.validate(p, {"S1", "S2"}) + validator = MetadataTSVValidator(p, {"S1", "S2"}) + stats, errors, warnings = validator.validate() assert len(errors) == 0 assert any("comma" in w.lower() and "North, South" in w for w in warnings) @@ -301,6 +347,7 @@ def test_quoted_cell_with_embedded_double_quotes_is_allowed(self, tmp_path): p = tmp_path / "embedded_quotes.tsv" p.write_text(content) - stats, errors, warnings = 
MetadataTSVValidator.validate(p, {"S1", "S2"}) + validator = MetadataTSVValidator(p, {"S1", "S2"}) + stats, errors, warnings = validator.validate() assert len(errors) == 0 assert not any("list syntax" in e.lower() for e in errors) From bf9ff816027542debd56959869258a53658884f2 Mon Sep 17 00:00:00 2001 From: Daniel P Brink Date: Thu, 26 Feb 2026 16:59:58 +0100 Subject: [PATCH 100/100] Rename class to ClientSideMetadataTSVValidator The old name was MetadataTSVValidator, but this could become a source of confusion since the core validation logic is in the SharedMetadataValidator --- .../src/divbase_api/services/queries.py | 4 +- .../cli_commands/dimensions_cli.py | 4 +- .../services/sample_metadata_tsv_validator.py | 2 +- .../src/divbase_lib/metadata_validator.py | 4 +- scripts/tsv_to_dataframe.py | 2 +- .../test_sample_metadata_queries.py | 2 +- .../test_sample_metadata_tsv_validator.py | 86 ++++++++++--------- 7 files changed, 53 insertions(+), 51 deletions(-) diff --git a/packages/divbase-api/src/divbase_api/services/queries.py b/packages/divbase-api/src/divbase_api/services/queries.py index 0e991883..3238587f 100644 --- a/packages/divbase-api/src/divbase_api/services/queries.py +++ b/packages/divbase-api/src/divbase_api/services/queries.py @@ -713,7 +713,7 @@ def load_file(self) -> "SidecarQueryManager": Uses the warning and error category Enums from SharedMetadataValidator logic to raise errors or send warnings to the user. - Validates the same errors as the client-side MetadataTSVValidator using shared validation logic: + Validates the same errors as the client-side ClientSideMetadataTSVValidator using shared validation logic: - Header: first column must be #Sample_ID, no duplicate or empty column names - Sample_ID: no empty values, no duplicates, no multi-values (Python lists) - Data: no commas in any cell values @@ -721,7 +721,7 @@ def load_file(self) -> "SidecarQueryManager": try: logger.info(f"Loading sidecar metadata file: {self.file}") - # Note!
The SharedMetadataValidator is for checks on the contents of the TSV file. The logic is shared between this class and the client-side MetadataTSVValidator. + # Note! The SharedMetadataValidator is for checks on the contents of the TSV file. The logic is shared between this class and the client-side ClientSideMetadataTSVValidator. # There are several helper methods for the filtering logic in this class, but they are for the query filters and are not related to the validation of the TSV file contents. self.metadata_validator = SharedMetadataValidator( file_path=self.file, diff --git a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py index e5dc9e5f..108672c9 100644 --- a/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py +++ b/packages/divbase-cli/src/divbase_cli/cli_commands/dimensions_cli.py @@ -8,7 +8,7 @@ from divbase_cli.cli_commands.shared_args_options import PROJECT_NAME_OPTION from divbase_cli.config_resolver import resolve_project -from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator +from divbase_cli.services.sample_metadata_tsv_validator import ClientSideMetadataTSVValidator from divbase_cli.user_auth import make_authenticated_request from divbase_lib.api_schemas.vcf_dimensions import ( DimensionsSamplesResult, @@ -338,7 +338,7 @@ def validate_metadata_template_versus_dimensions_and_formatting_constraints( unique_sample_names = DimensionsSamplesResult(**response.json()).unique_samples dimensions_sample_preview_limit = None if full_sample_mismatch_names else 20 - validator = MetadataTSVValidator( + validator = ClientSideMetadataTSVValidator( file_path=input_path, project_samples=unique_sample_names, dimensions_sample_preview_limit=dimensions_sample_preview_limit, diff --git a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py
b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py index 33e5c9d9..4b6e7b80 100644 --- a/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py +++ b/packages/divbase-cli/src/divbase_cli/services/sample_metadata_tsv_validator.py @@ -12,7 +12,7 @@ from divbase_lib.metadata_validator import SharedMetadataValidator -class MetadataTSVValidator: +class ClientSideMetadataTSVValidator: """ Client-side wrapper for validating sidecar metadata TSV files against DivBase requirements. Calls SharedMetadataValidator to perform the actual validation. diff --git a/packages/divbase-lib/src/divbase_lib/metadata_validator.py b/packages/divbase-lib/src/divbase_lib/metadata_validator.py index 3bf0e233..2cc90f87 100644 --- a/packages/divbase-lib/src/divbase_lib/metadata_validator.py +++ b/packages/divbase-lib/src/divbase_lib/metadata_validator.py @@ -2,7 +2,7 @@ Shared validation logic for DivBase sidecar metadata TSV files. This file contains the single source of truth for the TSV content validation logic used by both -the CLI validator (MetadataTSVValidator) on the client-side, and the SidecarQueryManager on the server-side. +the CLI validator (ClientSideMetadataTSVValidator) on the client-side, and the SidecarQueryManager on the server-side. Note! Logic for the queries themselves (e.g. how filtering is handled) is not shared between the two. This file is only for validation of the contents of the TSV file, not for query processing. @@ -64,7 +64,7 @@ class MetadataValidationResult: class SharedMetadataValidator: """ - Core validation logic for DivBase sidecar metadata TSV files. Shared between client-side MetadataTSVValidator + Core validation logic for DivBase sidecar metadata TSV files. Shared between client-side ClientSideMetadataTSVValidator and server-side SidecarQueryManager to ensure consistent validation behavior. It does not validate metadata query filters. That is handled in the SidecarQueryManager.
diff --git a/scripts/tsv_to_dataframe.py b/scripts/tsv_to_dataframe.py index 59beaf87..9e80cc3f 100644 --- a/scripts/tsv_to_dataframe.py +++ b/scripts/tsv_to_dataframe.py @@ -32,7 +32,7 @@ def parse_arguments(): def tsv_to_dataframe(file_path) -> MetadataValidationResult: """ Reads a TSV file and returns a pandas DataFrame. Just runs the loading and validation logic, but does not - print the results like the client-side MetadataTSVValidator does. + print the results like the client-side ClientSideMetadataTSVValidator does. Allows for inspection of of the dataframe. """ diff --git a/tests/unit/divbase_api/test_sample_metadata_queries.py b/tests/unit/divbase_api/test_sample_metadata_queries.py index f9a2a914..91738eaf 100644 --- a/tests/unit/divbase_api/test_sample_metadata_queries.py +++ b/tests/unit/divbase_api/test_sample_metadata_queries.py @@ -818,7 +818,7 @@ def test_purely_numeric_list_column_range_filter(self, sample_tsv_with_list_mixe class TestLoadFileValidation: """Test that SidecarQueryManager validates the same errors as the client-side - MetadataTSVValidator in load_file()), before any queries are run. + ClientSideMetadataTSVValidator in load_file()), before any queries are run. This ensures that even if a user skips the CLI validator, the server-side query engine catches the same formatting issues with clear error messages.""" diff --git a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py index fc71d6ec..a7c71e4d 100644 --- a/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py +++ b/tests/unit/divbase_cli/test_sample_metadata_tsv_validator.py @@ -1,15 +1,15 @@ """ -Unit tests for the MetadataTSVValidator class. +Unit tests for the ClientSideMetadataTSVValidator class. Tests the shared validation logic from SharedMetadataValidator as exercised -through the CLI's MetadataTSVValidator wrapper.
+through the CLI's ClientSideMetadataTSVValidator wrapper. Shared fixtures are defined in tests/unit/conftest.py. """ from pathlib import Path -from divbase_cli.services.sample_metadata_tsv_validator import MetadataTSVValidator +from divbase_cli.services.sample_metadata_tsv_validator import ClientSideMetadataTSVValidator class TestValidTSV: @@ -17,7 +17,7 @@ class TestValidTSV: def test_valid_list_tsv_passes_all_checks(self, valid_list_tsv, project_samples): """Test that a valid TSV with list notation passes with no errors or warnings.""" - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert len(errors) == 0 @@ -36,12 +36,12 @@ class TestHeaderValidation: """Test validation of header row.""" def test_wrong_first_column_name(self, header_errors_tsv, project_samples): - validator = MetadataTSVValidator(header_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(header_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("First column must be named '#Sample_ID'" in e for e in errors) def test_duplicate_column_names(self, header_errors_tsv, project_samples): - validator = MetadataTSVValidator(header_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(header_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("Duplicate column names" in e and "Area" in e for e in errors) @@ -49,12 +49,12 @@ def test_duplicate_column_names_after_stripping_hash(self, tmp_path, project_sam content = "#Sample_ID\tSample_ID\tPopulation\nS1\tS1_dup\t1\nS2\tS2_dup\t2\n" tsv_file = tmp_path / "dup_sample_id_cols.tsv" tsv_file.write_text(content) - validator = MetadataTSVValidator(tsv_file, project_samples) + validator = ClientSideMetadataTSVValidator(tsv_file, project_samples)
stats, errors, warnings = validator.validate() assert any("Duplicate column names" in e and "Sample_ID" in e for e in errors) def test_empty_column_name(self, header_errors_tsv, project_samples): - validator = MetadataTSVValidator(header_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(header_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("Empty column name" in e for e in errors) @@ -63,18 +63,18 @@ class TestSampleIDValidation: """Test validation of Sample_ID column.""" def test_empty_sample_id(self, sample_id_errors_tsv, project_samples): - validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(sample_id_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("Sample_ID is empty" in e for e in errors) def test_list_value_in_sample_id(self, sample_id_errors_tsv, project_samples): """List notation in Sample_ID should produce an error.""" - validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(sample_id_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("list values" in e.lower() for e in errors) def test_duplicate_sample_id(self, sample_id_errors_tsv, project_samples): - validator = MetadataTSVValidator(sample_id_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(sample_id_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("Duplicate Sample_ID" in e and "S1" in e for e in errors) @@ -83,23 +83,23 @@ class TestFormattingValidation: """Test validation of TSV formatting.""" def test_wrong_column_count(self, format_errors_tsv, project_samples): - validator = MetadataTSVValidator(format_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(format_errors_tsv, project_samples) stats, errors,
warnings = validator.validate() assert any("Expected 3 tab-separated columns" in e and "found 2" in e for e in errors) def test_comma_in_cell_produces_warning(self, format_errors_tsv, project_samples): - validator = MetadataTSVValidator(format_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(format_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("comma" in w.lower() for w in warnings) assert not any("comma" in e.lower() for e in errors) def test_whitespace_warning(self, format_errors_tsv, project_samples): - validator = MetadataTSVValidator(format_errors_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(format_errors_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("leading or trailing whitespace" in w for w in warnings) def test_semicolons_in_cells_produce_warning(self, semicolons_in_cells_tsv): - validator = MetadataTSVValidator(semicolons_in_cells_tsv, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(semicolons_in_cells_tsv, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() assert any("semicolon" in w.lower() for w in warnings) assert not any("semicolon" in e.lower() for e in errors) @@ -110,34 +110,34 @@ class TestListSyntaxValidation: def test_valid_list_syntax_no_errors(self, valid_list_tsv, project_samples): """Test that valid list notation like [1, 2] should parse without errors.""" - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert len(errors) == 0 def test_invalid_list_syntax_produces_errors(self, invalid_list_syntax_tsv, project_samples): """Test that cells starting with '[' but can't be parsed produce hard errors.""" - validator = MetadataTSVValidator(invalid_list_syntax_tsv, project_samples) + validator =
ClientSideMetadataTSVValidator(invalid_list_syntax_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("[4" in e and "invalid" in e.lower() for e in errors) assert any("[1 2 3]" in e and "invalid" in e.lower() for e in errors) def test_invalid_list_syntax_collects_all_errors(self, invalid_list_syntax_tsv, project_samples): """Test that all invalid list cells should be reported, not just the first one.""" - validator = MetadataTSVValidator(invalid_list_syntax_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(invalid_list_syntax_tsv, project_samples) stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "invalid" in e.lower() and "list" in e.lower()] assert len(list_errors) == 2 def test_whitespace_insensitive_list_parsing(self, whitespace_variant_lists_tsv): """Test that [1,2,3], [1, 2, 3], and [ 1 , 2 , 3 ] all parse identically.""" - validator = MetadataTSVValidator(whitespace_variant_lists_tsv, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(whitespace_variant_lists_tsv, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() assert len(errors) == 0 assert "Scores" in stats["numeric_columns"] def test_empty_list_parses_successfully(self, empty_list_tsv): """Test that an empty list [] should parse without errors.""" - validator = MetadataTSVValidator(empty_list_tsv, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(empty_list_tsv, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -148,7 +148,7 @@ class TestTypeValidation: def test_numeric_list_column_is_numeric(self, numeric_list_tsv, project_samples): """Test that columns with only numeric list cells (multi-value) and numeric scalars (single-values) are numeric.""" - validator = MetadataTSVValidator(numeric_list_tsv, project_samples) + validator =
ClientSideMetadataTSVValidator(numeric_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert "Scores" in stats["numeric_columns"] assert "Values" in stats["numeric_columns"] @@ -156,27 +156,27 @@ def test_numeric_list_column_is_numeric(self, numeric_list_tsv, project_samples) def test_string_list_column_is_string(self, string_list_tsv): """Test that columns with string list cells are classified as string.""" - validator = MetadataTSVValidator(string_list_tsv, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(string_list_tsv, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() assert "Areas" in stats["string_columns"] assert "Score" in stats["numeric_columns"] def test_mixed_types_within_list_cell_is_error(self, mixed_type_list_cell_tsv): """Test that a list cell like [1, "two", 3] with mixed element types raise an error.""" - validator = MetadataTSVValidator(mixed_type_list_cell_tsv, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(mixed_type_list_cell_tsv, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() assert any("mixed element types" in e.lower() for e in errors) def test_mixed_types_across_cells_is_warning(self, mixed_type_across_cells_tsv, project_samples): """Test that a column with both numeric and string sends a warning (not error).""" - validator = MetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) stats, errors, warnings = validator.validate() assert "Population" in stats["mixed_type_columns"] assert not any("Population" in e and "mixed" in e.lower() for e in errors) def test_mixed_types_across_cells_produces_per_cell_warnings(self, mixed_type_across_cells_tsv, project_samples): """Test that mixed-type columns identifies which cells are outliers.""" - validator = MetadataTSVValidator(mixed_type_across_cells_tsv,
project_samples) + validator = ClientSideMetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) stats, errors, warnings = validator.validate() cell_warnings = [w for w in warnings if "non-numeric" in w.lower() and "Population" in w] assert len(cell_warnings) >= 1 @@ -185,13 +185,13 @@ def test_mixed_types_across_cells_produces_per_cell_warnings(self, mixed_type_ac def test_mixed_types_clarification_warning(self, mixed_type_across_cells_tsv, project_samples): """Test that a general clarification warning is sent for mixed-type columns.""" - validator = MetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(mixed_type_across_cells_tsv, project_samples) stats, errors, warnings = validator.validate() assert any("clarification on mixed types" in w.lower() for w in warnings) def test_negative_numbers_are_numeric(self, numeric_list_tsv, project_samples): """Test that negative numbers are classified as numeric, not string.""" - validator = MetadataTSVValidator(numeric_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(numeric_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert "Temperature" in stats["numeric_columns"] assert "Longitude" in stats["numeric_columns"] @@ -205,7 +205,7 @@ def test_mixed_list_and_scalar_in_same_column(self, tmp_path): content += "S3\t[3, 4]\n" p = tmp_path / "mixed_list_scalar.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2", "S3"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2", "S3"}) stats, errors, warnings = validator.validate() assert len(errors) == 0 assert "Scores" in stats["numeric_columns"] @@ -216,7 +216,7 @@ class TestDimensionMatching: def test_samples_not_in_project(self, valid_list_tsv): project_samples = {"S1", "S2"} - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator =
ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert any( "following samples in the TSV were not found in the DivBase project's dimensions index" in e for e in errors @@ -224,7 +224,7 @@ def test_samples_not_in_project(self, valid_list_tsv): def test_samples_not_in_tsv(self, valid_list_tsv): project_samples = {"S1", "S2", "S3", "S10", "S20"} - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert any( "following samples in the DivBase project's dimensions index were not found in the TSV" in w and "S10" in w @@ -234,7 +234,7 @@ def test_samples_not_in_tsv(self, valid_list_tsv): def test_large_dimension_mismatch_is_summarized(self, valid_list_tsv): # Create samples named S0001, S0002, ..., S0050 project_samples = {f"S{i:04d}" for i in range(1, 51)} - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) _, _, warnings = validator.validate() assert any("count: 50, showing first 20" in w for w in warnings) assert any("--full-sample-mismatch-names" in w for w in warnings) @@ -242,7 +242,9 @@ def test_large_dimension_mismatch_is_summarized(self, valid_list_tsv): def test_large_dimension_mismatch_can_show_full_list(self, valid_list_tsv): project_samples = {f"S{i:04d}" for i in range(1, 51)} - validator = MetadataTSVValidator(valid_list_tsv, project_samples, dimensions_sample_preview_limit=None) + validator = ClientSideMetadataTSVValidator( + valid_list_tsv, project_samples, dimensions_sample_preview_limit=None + ) _, _, warnings = validator.validate() assert any("count: 50, samples:" in w and "S0050" in w for w in warnings) assert not any("showing first 20" in w for w in warnings) @@ -253,7 +255,7 @@ class TestStatistics: """Test
statistics collection.""" def test_statistics_collection(self, valid_list_tsv, project_samples): - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert stats["total_columns"] == 4 assert stats["user_defined_columns"] == 3 @@ -267,13 +269,13 @@ def test_statistics_collection(self, valid_list_tsv, project_samples): def test_no_multi_values_detected(self, no_multi_values_tsv): """Test that has_multi_values is False when no list cells exist.""" - validator = MetadataTSVValidator(no_multi_values_tsv, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(no_multi_values_tsv, {"S1", "S2"}) stats, errors, warnings = validator.validate() assert stats["has_multi_values"] is False def test_multi_values_detected_via_list_cells(self, valid_list_tsv, project_samples): """Test that has_multi_values is True when list cells exist.""" - validator = MetadataTSVValidator(valid_list_tsv, project_samples) + validator = ClientSideMetadataTSVValidator(valid_list_tsv, project_samples) stats, errors, warnings = validator.validate() assert stats["has_multi_values"] is True @@ -284,12 +286,12 @@ class TestEdgeCases: def test_empty_file(self, project_samples, tmp_path): empty_file = tmp_path / "empty.tsv" empty_file.write_text("") - validator = MetadataTSVValidator(empty_file, project_samples) + validator = ClientSideMetadataTSVValidator(empty_file, project_samples) stats, errors, warnings = validator.validate() assert any("File is empty" in e for e in errors) def test_nonexistent_file(self, project_samples): - validator = MetadataTSVValidator(Path("/nonexistent/file.tsv"), project_samples) + validator = ClientSideMetadataTSVValidator(Path("/nonexistent/file.tsv"), project_samples) stats, errors, warnings = validator.validate() assert any("Failed to read file" in e for e in errors) @@ -300,7
+302,7 @@ def test_non_list_bracket_strings(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "bracket_strings.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2"}) stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -312,7 +314,7 @@ def test_cell_starting_with_bracket_but_not_list(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "bracket_ref.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2"}) stats, errors, warnings = validator.validate() assert any("[ref]" in e and "invalid" in e.lower() for e in errors) @@ -323,7 +325,7 @@ def test_tuple_notation_is_not_a_list(self, tmp_path): content += "S2\tnormal\n" p = tmp_path / "tuple.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2"}) stats, errors, warnings = validator.validate() list_errors = [e for e in errors if "list" in e.lower()] assert len(list_errors) == 0 @@ -337,7 +339,7 @@ def test_quoted_cell_with_comma_space_produces_comma_warning(self, tmp_path): p = tmp_path / "quoted_comma_space.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2"}) stats, errors, warnings = validator.validate() assert len(errors) == 0 assert any("comma" in w.lower() and "North, South" in w for w in warnings) @@ -347,7 +349,7 @@ def test_quoted_cell_with_embedded_double_quotes_is_allowed(self, tmp_path): p = tmp_path / "embedded_quotes.tsv" p.write_text(content) - validator = MetadataTSVValidator(p, {"S1", "S2"}) + validator = ClientSideMetadataTSVValidator(p, {"S1", "S2"}) stats, errors, warnings = validator.validate() assert len(errors) == 0
assert not any("list syntax" in e.lower() for e in errors)