From 984b60ac8b14850e62426a9e5ccdad9d9cab10d4 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Thu, 26 Feb 2026 12:27:31 +0300 Subject: [PATCH 1/4] Additional validation for disease ontology --- src/cap_upload_validator/errors.py | 10 +++++ src/cap_upload_validator/upload_validator.py | 43 +++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 544c574..17b5a84 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -114,6 +114,16 @@ def __init__(self, missing_columns: list[str] = None): self.message = msg +class AnnDataMultipleDiseaseOntologyIDs(CapException): + name = "AnnDataMultipleDiseaseOntologyIDs" + message = "Only one disease ontology ID is allowed per value." + + +class AnnDataInvalidDiseaseOntologyForHuman(CapException): + name = "AnnDataInvalidDiseaseOntologyForHuman" + message = "For human samples only MONDO or PATO IDs are allowed." + + class AnnDataEmptyOrNoneInGeneralMetadata(CapException): name = "AnnDataEmptyOrNoneInGeneralMetadata" diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 3f03c8a..acd8532 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -21,6 +21,8 @@ AnnDataMissingEmbeddings, AnnDataMissingObs, AnnDataMissingObsColumns, + AnnDataMultipleDiseaseOntologyIDs, + AnnDataInvalidDiseaseOntologyForHuman, AnnDataMissingVarIndex, AnnDataNumericVarIndex, AnnDataVarNotSubsetOfRawVar, @@ -39,6 +41,7 @@ ORGANISM_COLUMN = "organism" ORGANISM_ONT_ID_COLUMN = f"{ORGANISM_COLUMN}_ontology_term_id" GENERAL_METADATA = ["assay", "disease", ORGANISM_COLUMN, "tissue"] +DISEASE_ONTOLOGY_HUMAN_PREFIXES = ("MONDO", "PATO") class UploadValidator: @@ -78,8 +81,8 @@ def validate(self, report_success: bool = True) -> None: self._validate_x_and_raw_x_formats(cap_adata) self._check_X(cap_adata) self._check_obsm(cap_adata) - self._check_obs(cap_adata) self._check_var_index(cap_adata) + self._check_obs(cap_adata) # Must be called after organism detection in _check_var_index # Check any errors were during validation stage and raise them if self._multi_exception.have_errors(): @@ -213,6 +216,9 @@ def _check_column(series: pd.Series, name: str): _check_column(cap_adata.obs[ont_id_col], ont_id_col) + if col == "disease": + self._validate_disease_ontology(cap_adata.obs[ont_id_col]) + # Report missing columns if missing_columns: logger.debug("Missing required obs columns: " + ", ".join(missing_columns)) @@ -234,6 +240,41 @@ def _check_column(series: pd.Series, name: str): logger.debug("Finished checking obs!") + def _validate_disease_ontology(self, series: pd.Series) -> None: + if series is None: + return + + has_multiple_ids = False + has_invalid_prefix_for_human = False + + for value in series.dropna(): + value_str = str(value).strip() + + if not value_str: + continue + + # Multiple IDs restriction + if "," in value_str: + has_multiple_ids = True + continue + + # Human-specific restriction + if self._organism is HomoSapiens: + delimiter = ":" + if delimiter not in value_str: + continue # format validation not in scope + + prefix = value_str.split(delimiter, 1)[0] + if prefix not in DISEASE_ONTOLOGY_HUMAN_PREFIXES: + has_invalid_prefix_for_human = True + + # Append errors only once + if has_multiple_ids: + self._multi_exception.append(AnnDataMultipleDiseaseOntologyIDs()) + + if has_invalid_prefix_for_human: + self._multi_exception.append(AnnDataInvalidDiseaseOntologyForHuman()) + @staticmethod def _classify_missing(series: pd.Series) -> tuple[bool, bool]: """ From f02474cfb2a982634f7d485700f45060fda35c08 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 4 Mar 2026 11:40:29 +0300 Subject: [PATCH 2/4] fixes and unit tests --- src/cap_upload_validator/errors.py | 16 +++- src/cap_upload_validator/upload_validator.py | 69 +++++++++------ test/test_upload_validator.py | 92 ++++++++++++++++++++ 3 files changed, 145 insertions(+), 32 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 17b5a84..5f022f4 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -114,9 +114,19 @@ def __init__(self, missing_columns: list[str] = None): self.message = msg -class AnnDataMultipleDiseaseOntologyIDs(CapException): - name = "AnnDataMultipleDiseaseOntologyIDs" - message = "Only one disease ontology ID is allowed per value." +class AnnDataMultipleOntologyIDs(CapException): + name = "AnnDataMultipleOntologyIDs" + + def __init__(self, columns: list[str] | None = None): + msg = ( + "Ontology term columns must contain exactly one ontology ID per value. " + "Multiple IDs (e.g. comma-separated values) are not allowed." + ) + + if columns: + msg += "\nColumns with multiple IDs detected: " + ", ".join(sorted(columns)) + + self.message = msg class AnnDataInvalidDiseaseOntologyForHuman(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index acd8532..4bc8ed9 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -21,7 +21,7 @@ AnnDataMissingEmbeddings, AnnDataMissingObs, AnnDataMissingObsColumns, - AnnDataMultipleDiseaseOntologyIDs, + AnnDataMultipleOntologyIDs, AnnDataInvalidDiseaseOntologyForHuman, AnnDataMissingVarIndex, AnnDataNumericVarIndex, @@ -186,6 +186,7 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: missing_columns: list[str] = [] none_columns: set[str] = set() empty_columns: set[str] = set() + multiple_ontology_columns: set[str] = set() def _check_column(series: pd.Series, name: str): has_none, has_empty = self._classify_missing(series) @@ -214,10 +215,27 @@ def _check_column(series: pd.Series, name: str): if ont_id_col not in cap_adata.obs.columns: cap_adata.read_obs(columns=[ont_id_col]) - _check_column(cap_adata.obs[ont_id_col], ont_id_col) + series = cap_adata.obs[ont_id_col] + _check_column(series, ont_id_col) + # General validation (all ontology columns) + unique_values = ( + series + .dropna() + .astype(str) + .str.strip() + .unique() + ) + + for value_str in unique_values: + if not value_str: + continue + if "," in value_str: + multiple_ontology_columns.add(ont_id_col) + + # Disease-specific validation if col == "disease": - self._validate_disease_ontology(cap_adata.obs[ont_id_col]) + self._validate_disease_ontology(series) # Report missing columns if missing_columns: @@ -231,46 +249,39 @@ def _check_column(series: pd.Series, name: str): ", ".join(sorted(empty_columns)) if empty_columns else "—", ", ".join(sorted(none_columns)) if none_columns else "—", ) - self._multi_exception.append( - AnnDataEmptyOrNoneInGeneralMetadata( - none_columns=list(none_columns), - empty_columns=list(empty_columns), - ) - ) + self._multi_exception.append(AnnDataEmptyOrNoneInGeneralMetadata(none_columns=list(none_columns), empty_columns=list(empty_columns))) + + if multiple_ontology_columns: + self._multi_exception.append(AnnDataMultipleOntologyIDs(columns=list(multiple_ontology_columns))) logger.debug("Finished checking obs!") def _validate_disease_ontology(self, series: pd.Series) -> None: - if series is None: + if series is None or self._organism is not HomoSapiens: return - has_multiple_ids = False has_invalid_prefix_for_human = False - for value in series.dropna(): - value_str = str(value).strip() + unique_values = ( + series + .dropna() + .astype(str) + .str.strip() + .unique() + ) + delimiter = ":" + for value_str in unique_values: if not value_str: continue - # Multiple IDs restriction - if "," in value_str: - has_multiple_ids = True + if delimiter not in value_str: + has_invalid_prefix_for_human = True continue - # Human-specific restriction - if self._organism is HomoSapiens: - delimiter = ":" - if delimiter not in value_str: - continue # format validation not in scope - - prefix = value_str.split(delimiter, 1)[0] - if prefix not in DISEASE_ONTOLOGY_HUMAN_PREFIXES: - has_invalid_prefix_for_human = True - - # Append errors only once - if has_multiple_ids: - self._multi_exception.append(AnnDataMultipleDiseaseOntologyIDs()) + prefix = value_str.split(delimiter, 1)[0] + if prefix not in DISEASE_ONTOLOGY_HUMAN_PREFIXES: + has_invalid_prefix_for_human = True if has_invalid_prefix_for_human: self._multi_exception.append(AnnDataInvalidDiseaseOntologyForHuman()) diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py index b0ae059..86f23f2 100644 --- a/test/test_upload_validator.py +++ b/test/test_upload_validator.py @@ -31,6 +31,8 @@ CapMultiException, AnnDataEmptyOrNoneInGeneralMetadata, CSCMatrixInX, + AnnDataMultipleOntologyIDs, + AnnDataInvalidDiseaseOntologyForHuman, ) TMP_DIR = Path(tempfile.mkdtemp()) @@ -338,3 +340,93 @@ def test_dense_and_csr_pass(tmp_path): with read_h5ad(p, edit=False) as cap_adata: v._validate_x_and_raw_x_formats(cap_adata) # should not raise + + +def add_required_obs_columns(adata): + adata.obs[ORGANISM_COLUMN] = pd.Categorical( + [HomoSapiens.name] * adata.n_obs + ) + adata.obs["assay_ontology_term_id"] = pd.Categorical( + ["EFO:0000001"] * adata.n_obs + ) + adata.obs["organism_ontology_term_id"] = pd.Categorical( + [HomoSapiens.ontology_id] * adata.n_obs + ) + adata.obs["disease_ontology_term_id"] = pd.Categorical( + ["MONDO:0000001"] * adata.n_obs + ) + adata.obs["tissue_ontology_term_id"] = pd.Categorical( + ["UBERON:0000001"] * adata.n_obs + ) + + +def test_multiple_ontology_ids_raises(tmp_path, monkeypatch): + file_path = tmp_path / "test_multiple_ids.h5ad" + + adata = ad.AnnData(X=np.eye(3)) + add_required_obs_columns(adata) + + # Add invalid multi-ID category + adata.obs["tissue_ontology_term_id"] = ( + adata.obs["tissue_ontology_term_id"] + .cat.add_categories(["UBERON:0001,UBERON:0002"]) + ) + + adata.obs.iloc[0, adata.obs.columns.get_loc("tissue_ontology_term_id")] = \ + "UBERON:0001,UBERON:0002" + + adata.write_h5ad(file_path) + + validator = UploadValidator(file_path) + validator._multi_exception.raise_on_append = True + + # Mock var validation + def mock_check_var_index(self, cap_adata): + self._organism = HomoSapiens + + monkeypatch.setattr( + UploadValidator, + "_check_var_index", + mock_check_var_index, + ) + + with read_h5ad(file_path) as cap_adata: + cap_adata.read_obs() + validator._check_var_index(cap_adata) + + with pytest.raises(AnnDataMultipleOntologyIDs): + validator._check_obs(cap_adata) + + +def test_invalid_disease_prefix_for_human_raises(tmp_path, monkeypatch): + file_path = tmp_path / "test_invalid_disease_prefix.h5ad" + + adata = ad.AnnData(X=np.eye(3)) + add_required_obs_columns(adata) + + # Replace disease column with invalid prefix + adata.obs["disease_ontology_term_id"] = pd.Categorical( + ["DOID:1234"] * adata.n_obs + ) + + adata.write_h5ad(file_path) + + validator = UploadValidator(file_path) + validator._multi_exception.raise_on_append = True + + # Mock var validation + def mock_check_var_index(self, cap_adata): + self._organism = HomoSapiens + + monkeypatch.setattr( + UploadValidator, + "_check_var_index", + mock_check_var_index, + ) + + with read_h5ad(file_path) as cap_adata: + cap_adata.read_obs() + validator._check_var_index(cap_adata) + + with pytest.raises(AnnDataInvalidDiseaseOntologyForHuman): + validator._check_obs(cap_adata) From 46cc01540bb8632b02b0fb7791fa8665915a570f Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 4 Mar 2026 11:43:24 +0300 Subject: [PATCH 3/4] Update errors.py --- src/cap_upload_validator/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 5f022f4..c908697 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -117,7 +117,7 @@ def __init__(self, missing_columns: list[str] = None): class AnnDataMultipleOntologyIDs(CapException): name = "AnnDataMultipleOntologyIDs" - def __init__(self, columns: list[str] | None = None): + def __init__(self, columns: Optional[List[str]] = None): msg = ( "Ontology term columns must contain exactly one ontology ID per value. " "Multiple IDs (e.g. comma-separated values) are not allowed." From 8bf8787ac1d3948cc05cc733566ed184262a288a Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 4 Mar 2026 12:49:43 +0300 Subject: [PATCH 4/4] Update src/cap_upload_validator/errors.py Co-authored-by: Roman Mukhin <59999203+rm1113@users.noreply.github.com> --- src/cap_upload_validator/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index c908697..9a3f81b 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -131,7 +131,7 @@ def __init__(self, columns: Optional[List[str]] = None): class AnnDataInvalidDiseaseOntologyForHuman(CapException): name = "AnnDataInvalidDiseaseOntologyForHuman" - message = "For human samples only MONDO or PATO IDs are allowed." + message = "Unsupported disease ontology term. For Homo sapiens datasets, only `MONDO:` and `PATO:` ontology terms are supported." class AnnDataEmptyOrNoneInGeneralMetadata(CapException):