Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 99 additions & 23 deletions src/ingest_validation_tests/qptiff_channel_validator.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os
from pathlib import Path
from xml.etree import ElementTree

import pandas as pd
import tifffile
from validator import Validator


class QpTiffChannelValidator(Validator):
description = """Check for presence of at least one "Yes" value in
'is_channel_used_for_nuclei_segmentation' and 'is_channel_used_for_cell_segmentation'"""
description = """Check qptiff.channels.csv for cell/nuclei segmentation markers;
check channels in QPTIFF against channels in qptiff.channels.csv"""
cost = 1.0
version = "1.0"
required = ["phenocycler"]
Expand All @@ -30,32 +32,45 @@ def __init__(self, base_paths, assay_type, *args, **kwargs):
self.errors = []

def _collect_errors(self) -> list[str | None]:
filenames_to_test = []
if not (file_pairs_to_test := self.get_file_pairs_to_test()):
self.errors.append(
f"Could not find qptiff.channels.csv and associated QPTIFF files (required for {self.assay_type})."
)
return self._return_result(self.errors, False)
for channels_csv, qptiff_file in file_pairs_to_test.items():
self.check_qptiff_channels_file(channels_csv)
self.check_channels(channels_csv, qptiff_file)
return self._return_result(self.errors, bool(file_pairs_to_test))

def get_file_pairs_to_test(self) -> dict:
    """
    For each data path, pair {qptiff.channels.csv: qptiff_file}.

    A data path contributes a pair only when both of its parent
    directories are found and each of them yields exactly one matching
    file. The helpers record the reason on self.errors whenever a
    directory or file cannot be resolved, so unresolved paths are simply
    skipped here.
    """
    file_pairs_to_test = {}

    for path in self.paths:
        channels_parent_path, qptiff_parent_path = self._get_parent_dir_paths(path)
        if not channels_parent_path or not qptiff_parent_path:
            continue
        # Both lookups always run so that problems with either file are reported.
        channel_csv = self._get_file_path(channels_parent_path, "qptiff.channels.csv")
        qptiff_file = self._get_file_path(qptiff_parent_path, ".qptiff")
        if not (channel_csv and qptiff_file):
            continue
        file_pairs_to_test[channel_csv] = qptiff_file

    return file_pairs_to_test

def check_qptiff_channels_file(self, filename: Path):
"""
Check for presence of at least one "Yes" value in
'is_channel_used_for_nuclei_segmentation' and 'is_channel_used_for_cell_segmentation',
and make sure columns are in order.
"""

df = pd.read_csv(filename)
# pipeline uses column position to determine channel & cell/nucleus segmentation
if column_order_errors := self._check_column_order(df, filename):
if column_order_errors := self.check_column_order(df, filename):
# validation can't continue if columns out of order
self.errors.extend(column_order_errors)
return
Expand All @@ -66,7 +81,7 @@ def check_qptiff_file(self, filename: Path):
f"{self.rel_filename_str(filename)} must have at least one 'Yes' value in column '{column}'"
)

def _check_column_order(self, df: pd.DataFrame, filename: Path) -> list:
def check_column_order(self, df: pd.DataFrame, filename: Path) -> list:
column_order_errors = []
for index, columns in enumerate(self.ordered_columns):
try:
Expand All @@ -84,3 +99,64 @@ def _check_column_order(self, df: pd.DataFrame, filename: Path) -> list:
else:
column_order_errors.append(f"{self.rel_filename_str(filename)}: {e}")
return column_order_errors

def check_channels(self, channels_csv: Path, qptiff_file: Path):
    """
    Check that channels in channel_id column of qptiff.channels.csv
    match channels in accompanying QPTIFF file.

    The channel IDs are read by position (first CSV column). When the two
    sets differ, one error listing the channels missing on each side is
    appended to self.errors; nothing is returned.
    """
    channels = pd.read_csv(channels_csv)
    channels_set = set(channels.iloc[:, 0].tolist())
    qptf_channels = self._get_qptiff_channels(qptiff_file)
    if channels_set == qptf_channels:
        return
    # str() keeps the join safe for non-string cells (e.g. NaN from a blank
    # channel_id); sorting makes the message deterministic.
    csv_only = ", ".join(sorted(str(c) for c in channels_set - qptf_channels))
    qptiff_only = ", ".join(sorted(str(c) for c in qptf_channels - channels_set))
    self.errors.append(
        f"Channels in {self.rel_filename_str(channels_csv)} and "
        f"{self.rel_filename_str(qptiff_file)} do not match.\n"
        f"Channels in CSV that are not present in QPTIFF: {csv_only}\n"
        f"Channels in QPTIFF that are not present in CSV: {qptiff_only}\n"
    )

def _get_qptiff_channels(self, qptiff_file: Path) -> set[str]:
    """
    Collect the channel names recorded in a QPTIFF file.

    Every page's ImageDescription tag is parsed as XML and contributes the
    text of its Biomarker element, or of its Name element when there is no
    Biomarker. Pages without an ImageDescription tag, with an empty one,
    or with neither element contribute nothing.
    """
    qptf_channels = set()
    with tifffile.TiffFile(qptiff_file) as qptf:
        for page in qptf.pages:
            description_tag = page.tags.get("ImageDescription")
            # A missing tag comes back as None; there is no channel name to read.
            if description_tag is None or not description_tag.value:
                continue
            # Bioformats conversion (used in pipeline) uses ImageDescription.Biomarker
            # as the channel name if present, defaulting to ImageDescription.Name if not.
            # https://github.com/ome/bioformats/blob/877c317e4e396381dc76e56c1539b24947f71dce/components/formats-gpl/src/loci/formats/in/VectraReader.java#L546
            root = ElementTree.fromstring(description_tag.value)
            channel = root.find("Biomarker")
            # Compare with None explicitly: an Element without children is falsy.
            if channel is None:
                channel = root.find("Name")
            if channel is not None:
                qptf_channels.add(channel.text)
    return qptf_channels

def _get_parent_dir_paths(self, path) -> tuple[Path | None, Path | None]:
channels_parent_path = Path(os.path.join(path, "lab_processed/images"))
if not channels_parent_path.exists():
channels_parent_path = None
self.errors.append(f"Can't find 'lab_processed/images' subdirectory in '{path.stem}'.")
qptiff_parent_path = Path(os.path.join(path, "raw/images"))
if not qptiff_parent_path.exists():
qptiff_parent_path = None
self.errors.append(f"Can't find 'raw/images' subdirectory in '{path.stem}'.")
return channels_parent_path, qptiff_parent_path

def _get_file_path(self, parent_dir_path: Path, search_str: str) -> Path | None:
files = []
for filename in parent_dir_path.iterdir():
if search_str in str(filename).lower():
files.append(filename)
if len(files) != 1:
self.errors.append(
f"Found {len(files)} {search_str} files in {parent_dir_path} directory."
)
return
return files[0]
51 changes: 31 additions & 20 deletions tests/test_qptiff_channel_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from qptiff_channel_validator import QpTiffChannelValidator


class TestQpTiffChannelValidator:
class TestQpTiffChannelCsv:
@pytest.mark.parametrize(
("test_data_fname", "msg_re_list", "assay_type"),
(
Expand All @@ -27,11 +27,11 @@ class TestQpTiffChannelValidator:
"phenocycler",
),
# test case: both columns have Yes/true values
("test_data/qptiff_good.zip", [None], "phenocycler"),
("test_data/qptiff_good.zip", [], "phenocycler"),
# test case: both columns have Yes/true values, column names have spaces
(
"test_data/qptiff_good_with_alt_column_format.zip",
[None],
[],
"phenocycler",
),
# test case: columns out of order
Expand All @@ -56,35 +56,44 @@ class TestQpTiffChannelValidator:
("test_data/qptiff_good.zip", [], "snRNAseq"),
),
)
def test_qptiff_channel_validator(self, test_data_fname, msg_re_list, assay_type, tmp_path):
def test_qptiff_channel_csv(self, test_data_fname, msg_re_list, assay_type, tmp_path):
test_data_path = Path(test_data_fname)
zfile = zipfile.ZipFile(test_data_path)
zfile.extractall(tmp_path)
validator = QpTiffChannelValidator(tmp_path / test_data_path.stem, assay_type)
errors = validator.collect_errors()[:]
errors.sort()
assert errors == msg_re_list
validator.check_qptiff_channels_file(
Path(
tmp_path
/ test_data_path.stem
/ f"lab_processed/images/{test_data_path.stem}.qptiff.channels.csv"
)
)
for error in msg_re_list:
assert error in validator.errors

def test_missing_required_dir(self, tmp_path):
    """An empty upload reports both missing directories and the missing file pair."""
    validator = QpTiffChannelValidator(tmp_path, "phenocycler")
    errors = validator.collect_errors()[:]
    # Use the real directory name instead of hard-coding pytest's "<test name>0".
    for err in [
        f"Can't find 'lab_processed/images' subdirectory in '{tmp_path.stem}'.",
        f"Can't find 'raw/images' subdirectory in '{tmp_path.stem}'.",
        "Could not find qptiff.channels.csv and associated QPTIFF files (required for phenocycler).",
    ]:
        assert err in errors

def test_missing_channels_csv(self, tmp_path):
    """An existing but empty lab_processed/images still yields the missing-file-pair error."""
    dir1 = tmp_path / "lab_processed"
    dir1.mkdir()
    dir2 = dir1 / "images"
    dir2.mkdir()
    validator = QpTiffChannelValidator(tmp_path, "phenocycler")
    errors = validator.collect_errors()[:]
    assert (
        "Could not find qptiff.channels.csv and associated QPTIFF files (required for phenocycler)."
        in errors
    )

@pytest.mark.parametrize(
("test_data_fnames", "msg_re_list"),
Expand Down Expand Up @@ -127,9 +136,11 @@ def test_multiple_files(self, test_data_fnames, msg_re_list, tmp_path):
test_data_path = Path(test_data_fname)
zfile = zipfile.ZipFile(test_data_path)
zfile.extractall(tmp_path)
validator = QpTiffChannelValidator(
[tmp_path / test_data_path.stem for test_data_path in test_data_fnames], "phenocycler"
)
errors = validator.collect_errors()[:]
errors.sort()
assert errors == msg_re_list
test_data_paths = [tmp_path / test_data_path.stem for test_data_path in test_data_fnames]
validator = QpTiffChannelValidator(test_data_paths, "phenocycler")
for data_path in test_data_paths:
validator.check_qptiff_channels_file(
data_path / f"lab_processed/images/{data_path.stem}.qptiff.channels.csv"
)
for error in msg_re_list:
assert error in validator.errors
Loading