Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/cap_upload_validator/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,26 @@ def __init__(self, missing_columns: list[str] = None):
self.message = msg


class AnnDataMultipleOntologyIDs(CapException):
    """
    Error for ontology term columns that hold more than one ID in a value.

    The message always states the "exactly one ontology ID per value" rule;
    when offending column names are supplied they are appended, sorted, on a
    second line.
    """

    name = "AnnDataMultipleOntologyIDs"

    def __init__(self, columns: Optional[List[str]] = None):
        """
        Build the error message.

        Parameters
        ----------
        columns:
            Names of the columns in which multiple IDs were detected.
            ``None`` or an empty collection produces the generic message only.
        """
        message_lines = [
            "Ontology term columns must contain exactly one ontology ID per value. "
            "Multiple IDs (e.g. comma-separated values) are not allowed."
        ]

        if columns:
            # Sort so the reported column order is deterministic.
            offenders = ", ".join(sorted(columns))
            message_lines.append("Columns with multiple IDs detected: " + offenders)

        self.message = "\n".join(message_lines)


class AnnDataInvalidDiseaseOntologyForHuman(CapException):
    """Error for a disease ontology term with a prefix not supported for Homo sapiens."""

    name = "AnnDataInvalidDiseaseOntologyForHuman"
    message = (
        "Unsupported disease ontology term. "
        "For Homo sapiens datasets, only `MONDO:` and `PATO:` ontology terms are supported."
    )


class AnnDataEmptyOrNoneInGeneralMetadata(CapException):
name = "AnnDataEmptyOrNoneInGeneralMetadata"

Expand Down
68 changes: 60 additions & 8 deletions src/cap_upload_validator/upload_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
AnnDataMissingEmbeddings,
AnnDataMissingObs,
AnnDataMissingObsColumns,
AnnDataMultipleOntologyIDs,
AnnDataInvalidDiseaseOntologyForHuman,
AnnDataMissingVarIndex,
AnnDataNumericVarIndex,
AnnDataVarNotSubsetOfRawVar,
Expand All @@ -39,6 +41,7 @@
# Name of the organism metadata field.
ORGANISM_COLUMN = "organism"
# Name of the organism's ontology-ID column ("organism_ontology_term_id").
ORGANISM_ONT_ID_COLUMN = f"{ORGANISM_COLUMN}_ontology_term_id"
# General metadata field names.
# NOTE(review): presumably each one is validated through its
# "<name>_ontology_term_id" obs column in _check_obs — confirm against that method.
GENERAL_METADATA = ["assay", "disease", ORGANISM_COLUMN, "tissue"]
# Ontology prefixes (the part before ":") accepted for disease terms in
# Homo sapiens datasets.
DISEASE_ONTOLOGY_HUMAN_PREFIXES = ("MONDO", "PATO")

class UploadValidator:

Expand Down Expand Up @@ -78,8 +81,8 @@ def validate(self, report_success: bool = True) -> None:
self._validate_x_and_raw_x_formats(cap_adata)
self._check_X(cap_adata)
self._check_obsm(cap_adata)
self._check_obs(cap_adata)
self._check_var_index(cap_adata)
self._check_obs(cap_adata) # Must be called after organism detection in _check_var_index

# Check whether any errors were collected during the validation stage and raise them
if self._multi_exception.have_errors():
Expand Down Expand Up @@ -183,6 +186,7 @@ def _check_obs(self, cap_adata: CapAnnData) -> None:
missing_columns: list[str] = []
none_columns: set[str] = set()
empty_columns: set[str] = set()
multiple_ontology_columns: set[str] = set()

def _check_column(series: pd.Series, name: str):
has_none, has_empty = self._classify_missing(series)
Expand Down Expand Up @@ -211,7 +215,27 @@ def _check_column(series: pd.Series, name: str):
if ont_id_col not in cap_adata.obs.columns:
cap_adata.read_obs(columns=[ont_id_col])

_check_column(cap_adata.obs[ont_id_col], ont_id_col)
series = cap_adata.obs[ont_id_col]
_check_column(series, ont_id_col)

# General validation (all ontology columns)
unique_values = (
series
.dropna()
.astype(str)
.str.strip()
.unique()
)

for value_str in unique_values:
if not value_str:
continue
if "," in value_str:
multiple_ontology_columns.add(ont_id_col)

# Disease-specific validation
if col == "disease":
self._validate_disease_ontology(series)

# Report missing columns
if missing_columns:
Expand All @@ -225,15 +249,43 @@ def _check_column(series: pd.Series, name: str):
", ".join(sorted(empty_columns)) if empty_columns else "—",
", ".join(sorted(none_columns)) if none_columns else "—",
)
self._multi_exception.append(
AnnDataEmptyOrNoneInGeneralMetadata(
none_columns=list(none_columns),
empty_columns=list(empty_columns),
)
)
self._multi_exception.append(AnnDataEmptyOrNoneInGeneralMetadata(none_columns=list(none_columns), empty_columns=list(empty_columns)))

if multiple_ontology_columns:
self._multi_exception.append(AnnDataMultipleOntologyIDs(columns=list(multiple_ontology_columns)))

logger.debug("Finished checking obs!")

def _validate_disease_ontology(self, series: pd.Series) -> None:
    """
    Flag disease ontology IDs whose prefix is not accepted for human data.

    The check is skipped when ``series`` is ``None`` or when the detected
    organism (``self._organism``) is not ``HomoSapiens``. Otherwise every
    distinct, non-null, non-blank value (after whitespace stripping) must
    have the form ``PREFIX:...`` with ``PREFIX`` listed in
    ``DISEASE_ONTOLOGY_HUMAN_PREFIXES``. If at least one value fails, a
    single ``AnnDataInvalidDiseaseOntologyForHuman`` is appended to
    ``self._multi_exception``.

    Parameters
    ----------
    series:
        Values of the disease ontology term ID column.
    """
    if series is None:
        return
    if self._organism is not HomoSapiens:
        return

    def _is_unsupported(term: str) -> bool:
        # A term without ":" has no prefix at all and is rejected as well.
        prefix, separator, _ = term.partition(":")
        return not separator or prefix not in DISEASE_ONTOLOGY_HUMAN_PREFIXES

    distinct_terms = series.dropna().astype(str).str.strip().unique()

    # Blank strings are ignored here; only non-empty terms are inspected.
    if any(_is_unsupported(term) for term in distinct_terms if term):
        self._multi_exception.append(AnnDataInvalidDiseaseOntologyForHuman())

@staticmethod
def _classify_missing(series: pd.Series) -> tuple[bool, bool]:
"""
Expand Down
92 changes: 92 additions & 0 deletions test/test_upload_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
CapMultiException,
AnnDataEmptyOrNoneInGeneralMetadata,
CSCMatrixInX,
AnnDataMultipleOntologyIDs,
AnnDataInvalidDiseaseOntologyForHuman,
)

TMP_DIR = Path(tempfile.mkdtemp())
Expand Down Expand Up @@ -338,3 +340,93 @@ def test_dense_and_csr_pass(tmp_path):

with read_h5ad(p, edit=False) as cap_adata:
v._validate_x_and_raw_x_formats(cap_adata) # should not raise


def add_required_obs_columns(adata):
    """
    Fill ``adata.obs`` with constant categorical metadata columns.

    Adds the organism column plus the assay, organism, disease and tissue
    ``*_ontology_term_id`` columns, each holding one repeated value for all
    ``adata.n_obs`` observations. The object is modified in place.
    """
    constant_columns = {
        ORGANISM_COLUMN: HomoSapiens.name,
        "assay_ontology_term_id": "EFO:0000001",
        "organism_ontology_term_id": HomoSapiens.ontology_id,
        "disease_ontology_term_id": "MONDO:0000001",
        "tissue_ontology_term_id": "UBERON:0000001",
    }

    for column, value in constant_columns.items():
        adata.obs[column] = pd.Categorical([value] * adata.n_obs)


def test_multiple_ontology_ids_raises(tmp_path, monkeypatch):
    """A comma-separated value in an ontology ID column must raise AnnDataMultipleOntologyIDs."""
    tissue_col = "tissue_ontology_term_id"
    multi_id_value = "UBERON:0001,UBERON:0002"
    h5ad_path = tmp_path / "test_multiple_ids.h5ad"

    adata = ad.AnnData(X=np.eye(3))
    add_required_obs_columns(adata)

    # Register the multi-ID value as a category, then assign it to the first row.
    adata.obs[tissue_col] = adata.obs[tissue_col].cat.add_categories([multi_id_value])
    adata.obs.iloc[0, adata.obs.columns.get_loc(tissue_col)] = multi_id_value

    adata.write_h5ad(h5ad_path)

    validator = UploadValidator(h5ad_path)
    # Make the first appended error surface immediately as an exception.
    validator._multi_exception.raise_on_append = True

    # Replace var validation with a stub that only records the organism.
    def fake_check_var_index(self, cap_adata):
        self._organism = HomoSapiens

    monkeypatch.setattr(UploadValidator, "_check_var_index", fake_check_var_index)

    with read_h5ad(h5ad_path) as cap_adata:
        cap_adata.read_obs()
        validator._check_var_index(cap_adata)

        with pytest.raises(AnnDataMultipleOntologyIDs):
            validator._check_obs(cap_adata)


def test_invalid_disease_prefix_for_human_raises(tmp_path, monkeypatch):
    """A non-MONDO/PATO disease term on a human dataset must raise the disease-prefix error."""
    disease_col = "disease_ontology_term_id"
    h5ad_path = tmp_path / "test_invalid_disease_prefix.h5ad"

    adata = ad.AnnData(X=np.eye(3))
    add_required_obs_columns(adata)

    # Overwrite the disease column with a prefix that is not accepted for human data.
    unsupported_terms = ["DOID:1234"] * adata.n_obs
    adata.obs[disease_col] = pd.Categorical(unsupported_terms)

    adata.write_h5ad(h5ad_path)

    validator = UploadValidator(h5ad_path)
    # Make the first appended error surface immediately as an exception.
    validator._multi_exception.raise_on_append = True

    # Replace var validation with a stub that only records the organism.
    def fake_check_var_index(self, cap_adata):
        self._organism = HomoSapiens

    monkeypatch.setattr(UploadValidator, "_check_var_index", fake_check_var_index)

    with read_h5ad(h5ad_path) as cap_adata:
        cap_adata.read_obs()
        validator._check_var_index(cap_adata)

        with pytest.raises(AnnDataInvalidDiseaseOntologyForHuman):
            validator._check_obs(cap_adata)