diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py
index 544c574..9a3f81b 100644
--- a/src/cap_upload_validator/errors.py
+++ b/src/cap_upload_validator/errors.py
@@ -114,6 +114,33 @@ def __init__(self, missing_columns: list[str] = None):
         self.message = msg
 
 
+class AnnDataMultipleOntologyIDs(CapException):
+    """Raised when an ontology term column holds several IDs in one value."""
+
+    name = "AnnDataMultipleOntologyIDs"
+
+    def __init__(self, columns: list[str] = None):
+        msg = (
+            "Ontology term columns must contain exactly one ontology ID per value. "
+            "Multiple IDs (e.g. comma-separated values) are not allowed."
+        )
+
+        if columns:
+            msg += "\nColumns with multiple IDs detected: " + ", ".join(sorted(columns))
+
+        self.message = msg
+
+
+class AnnDataInvalidDiseaseOntologyForHuman(CapException):
+    """Raised when a human dataset uses a disease term outside MONDO/PATO."""
+
+    name = "AnnDataInvalidDiseaseOntologyForHuman"
+    message = (
+        "Unsupported disease ontology term. For Homo sapiens datasets, "
+        "only `MONDO:` and `PATO:` ontology terms are supported."
+    )
+
+
 class AnnDataEmptyOrNoneInGeneralMetadata(CapException):
     name = "AnnDataEmptyOrNoneInGeneralMetadata"
 
diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py
index 3f03c8a..4bc8ed9 100644
--- a/src/cap_upload_validator/upload_validator.py
+++ b/src/cap_upload_validator/upload_validator.py
@@ -21,6 +21,8 @@
     AnnDataMissingEmbeddings,
     AnnDataMissingObs,
     AnnDataMissingObsColumns,
+    AnnDataMultipleOntologyIDs,
+    AnnDataInvalidDiseaseOntologyForHuman,
     AnnDataMissingVarIndex,
     AnnDataNumericVarIndex,
     AnnDataVarNotSubsetOfRawVar,
@@ -39,6 +41,7 @@
 ORGANISM_COLUMN = "organism"
 ORGANISM_ONT_ID_COLUMN = f"{ORGANISM_COLUMN}_ontology_term_id"
 GENERAL_METADATA = ["assay", "disease", ORGANISM_COLUMN, "tissue"]
+DISEASE_ONTOLOGY_HUMAN_PREFIXES = ("MONDO", "PATO")
 
 
 class UploadValidator:
@@ -78,8 +81,9 @@ def validate(self, report_success: bool = True) -> None:
             self._validate_x_and_raw_x_formats(cap_adata)
             self._check_X(cap_adata)
             self._check_obsm(cap_adata)
-            self._check_obs(cap_adata)
             self._check_var_index(cap_adata)
+            # Must be called after organism detection in _check_var_index
+            self._check_obs(cap_adata)
 
         # Check any errors were during validation stage and raise them
         if self._multi_exception.have_errors():
@@ -183,6 +187,7 @@ def _check_obs(self, cap_adata: CapAnnData) -> None:
         missing_columns: list[str] = []
         none_columns: set[str] = set()
         empty_columns: set[str] = set()
+        multiple_ontology_columns: set[str] = set()
 
         def _check_column(series: pd.Series, name: str):
             has_none, has_empty = self._classify_missing(series)
@@ -211,7 +216,17 @@ def _check_column(series: pd.Series, name: str):
             if ont_id_col not in cap_adata.obs.columns:
                 cap_adata.read_obs(columns=[ont_id_col])
 
-            _check_column(cap_adata.obs[ont_id_col], ont_id_col)
+            series = cap_adata.obs[ont_id_col]
+            _check_column(series, ont_id_col)
+
+            # General validation (all ontology columns): a comma in any value
+            # means several ontology IDs were packed into a single cell.
+            if any("," in str(value) for value in series.dropna().unique()):
+                multiple_ontology_columns.add(ont_id_col)
+
+            # Disease-specific validation
+            if col == "disease":
+                self._validate_disease_ontology(series)
 
         # Report missing columns
         if missing_columns:
@@ -225,15 +240,47 @@ def _check_column(series: pd.Series, name: str):
                 ", ".join(sorted(empty_columns)) if empty_columns else "—",
                 ", ".join(sorted(none_columns)) if none_columns else "—",
             )
             self._multi_exception.append(
                 AnnDataEmptyOrNoneInGeneralMetadata(
                     none_columns=list(none_columns),
                     empty_columns=list(empty_columns),
                 )
             )
+
+        if multiple_ontology_columns:
+            self._multi_exception.append(
+                AnnDataMultipleOntologyIDs(columns=list(multiple_ontology_columns))
+            )
 
         logger.debug("Finished checking obs!")
 
+    def _validate_disease_ontology(self, series: pd.Series) -> None:
+        """
+        Validate disease ontology term prefixes for Homo sapiens datasets.
+
+        Appends `AnnDataInvalidDiseaseOntologyForHuman` when any non-empty
+        value does not start with one of `DISEASE_ONTOLOGY_HUMAN_PREFIXES`
+        followed by ":". Does nothing for other organisms.
+        """
+        if series is None or self._organism is not HomoSapiens:
+            return
+
+        allowed_prefixes = tuple(
+            f"{prefix}:" for prefix in DISEASE_ONTOLOGY_HUMAN_PREFIXES
+        )
+        unique_values = series.dropna().astype(str).str.strip().unique()
+
+        # Empty strings are skipped. A value lacking ":" cannot match an
+        # allowed "<PREFIX>:" start, so it is flagged like an unknown prefix.
+        has_invalid_prefix_for_human = any(
+            not value_str.startswith(allowed_prefixes)
+            for value_str in unique_values
+            if value_str
+        )
+
+        if has_invalid_prefix_for_human:
+            self._multi_exception.append(AnnDataInvalidDiseaseOntologyForHuman())
+
     @staticmethod
     def _classify_missing(series: pd.Series) -> tuple[bool, bool]:
         """
diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py
index b0ae059..86f23f2 100644
--- a/test/test_upload_validator.py
+++ b/test/test_upload_validator.py
@@ -31,6 +31,8 @@
     CapMultiException,
     AnnDataEmptyOrNoneInGeneralMetadata,
     CSCMatrixInX,
+    AnnDataMultipleOntologyIDs,
+    AnnDataInvalidDiseaseOntologyForHuman,
 )
 
 TMP_DIR = Path(tempfile.mkdtemp())
@@ -338,3 +340,87 @@ def test_dense_and_csr_pass(tmp_path):
 
     with read_h5ad(p, edit=False) as cap_adata:
         v._validate_x_and_raw_x_formats(cap_adata)  # should not raise
+
+
+def add_required_obs_columns(adata):
+    """Populate `adata.obs` with valid human general-metadata columns."""
+    adata.obs[ORGANISM_COLUMN] = pd.Categorical(
+        [HomoSapiens.name] * adata.n_obs
+    )
+    adata.obs["assay_ontology_term_id"] = pd.Categorical(
+        ["EFO:0000001"] * adata.n_obs
+    )
+    adata.obs["organism_ontology_term_id"] = pd.Categorical(
+        [HomoSapiens.ontology_id] * adata.n_obs
+    )
+    adata.obs["disease_ontology_term_id"] = pd.Categorical(
+        ["MONDO:0000001"] * adata.n_obs
+    )
+    adata.obs["tissue_ontology_term_id"] = pd.Categorical(
+        ["UBERON:0000001"] * adata.n_obs
+    )
+
+
+def make_human_validator(file_path, monkeypatch):
+    """Return a fail-fast validator with organism detection stubbed to human."""
+
+    # _check_obs reads the organism that _check_var_index normally detects.
+    def mock_check_var_index(self, cap_adata):
+        self._organism = HomoSapiens
+
+    monkeypatch.setattr(
+        UploadValidator,
+        "_check_var_index",
+        mock_check_var_index,
+    )
+
+    validator = UploadValidator(file_path)
+    validator._multi_exception.raise_on_append = True
+    return validator
+
+
+def test_multiple_ontology_ids_raises(tmp_path, monkeypatch):
+    file_path = tmp_path / "test_multiple_ids.h5ad"
+
+    adata = ad.AnnData(X=np.eye(3))
+    add_required_obs_columns(adata)
+
+    # Add invalid multi-ID category
+    multi_id = "UBERON:0001,UBERON:0002"
+    adata.obs["tissue_ontology_term_id"] = (
+        adata.obs["tissue_ontology_term_id"].cat.add_categories([multi_id])
+    )
+    tissue_pos = adata.obs.columns.get_loc("tissue_ontology_term_id")
+    adata.obs.iloc[0, tissue_pos] = multi_id
+
+    adata.write_h5ad(file_path)
+    validator = make_human_validator(file_path, monkeypatch)
+
+    with read_h5ad(file_path) as cap_adata:
+        cap_adata.read_obs()
+        validator._check_var_index(cap_adata)
+
+        with pytest.raises(AnnDataMultipleOntologyIDs):
+            validator._check_obs(cap_adata)
+
+
+def test_invalid_disease_prefix_for_human_raises(tmp_path, monkeypatch):
+    file_path = tmp_path / "test_invalid_disease_prefix.h5ad"
+
+    adata = ad.AnnData(X=np.eye(3))
+    add_required_obs_columns(adata)
+
+    # Replace disease column with invalid prefix
+    adata.obs["disease_ontology_term_id"] = pd.Categorical(
+        ["DOID:1234"] * adata.n_obs
+    )
+
+    adata.write_h5ad(file_path)
+    validator = make_human_validator(file_path, monkeypatch)
+
+    with read_h5ad(file_path) as cap_adata:
+        cap_adata.read_obs()
+        validator._check_var_index(cap_adata)
+
+        with pytest.raises(AnnDataInvalidDiseaseOntologyForHuman):
+            validator._check_obs(cap_adata)