diff --git a/configs/dataset/tiled/ftn/0_320.yaml b/configs/dataset/tiled/ftn/0_320.yaml new file mode 100644 index 0000000..ff91a77 --- /dev/null +++ b/configs/dataset/tiled/ftn/0_320.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ftn@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 320 +level: 0 +tiling_uris: + train: "mlflow-artifacts:/86/bbbe4603bc30495d85ac99093fc9269a/artifacts/train - ftn" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/bbbe4603bc30495d85ac99093fc9269a/artifacts/test preliminary - ftn" # TODO update URI + test_final: "mlflow-artifacts:/86/bbbe4603bc30495d85ac99093fc9269a/artifacts/test final - ftn" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ftn/0_430.yaml b/configs/dataset/tiled/ftn/0_430.yaml new file mode 100644 index 0000000..79b9c8e --- /dev/null +++ b/configs/dataset/tiled/ftn/0_430.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ftn@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 430 +level: 0 +tiling_uris: + train: "mlflow-artifacts:/86/de450f835f0d4462a91b35f4a79a500f/artifacts/train - ftn" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/de450f835f0d4462a91b35f4a79a500f/artifacts/test preliminary - ftn" # TODO update URI + test_final: "mlflow-artifacts:/86/de450f835f0d4462a91b35f4a79a500f/artifacts/test final - ftn" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ftn/1_224.yaml b/configs/dataset/tiled/ftn/1_224.yaml new file mode 100644 index 0000000..91a27da --- /dev/null +++ b/configs/dataset/tiled/ftn/1_224.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ftn@_here_ + - _self_ + +mpp: 0.52 +tile_extent: 224 +level: 1 +tiling_uris: + train: "mlflow-artifacts:/86/f85b64a7f96c41e38f86d84956e2dbe9/artifacts/train - ftn" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/f85b64a7f96c41e38f86d84956e2dbe9/artifacts/test preliminary - ftn" # TODO update URI + test_final: "mlflow-artifacts:/86/f85b64a7f96c41e38f86d84956e2dbe9/artifacts/test final - ftn" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ftn/2_224.yaml b/configs/dataset/tiled/ftn/2_224.yaml new file mode 100644 index 0000000..55da441 --- /dev/null +++ b/configs/dataset/tiled/ftn/2_224.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ftn@_here_ + - _self_ + +mpp: 1.55 +tile_extent: 224 +level: 2 +tiling_uris: + train: "mlflow-artifacts:/86/5814484b6cd7467e9d712889655479af/artifacts/train - ftn" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/5814484b6cd7467e9d712889655479af/artifacts/test preliminary - ftn" # TODO update URI + test_final: "mlflow-artifacts:/86/5814484b6cd7467e9d712889655479af/artifacts/test final - ftn" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ikem/0_320.yaml b/configs/dataset/tiled/ikem/0_320.yaml new file mode 100644 index 0000000..065bca1 --- /dev/null +++ b/configs/dataset/tiled/ikem/0_320.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ikem@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 320 +level: 0 +tiling_uris: + train: "mlflow-artifacts:/86/4486e598446d412d926ac66dadb35e51/artifacts/train - ikem" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/4486e598446d412d926ac66dadb35e51/artifacts/test preliminary - ikem" # TODO update URI + test_final: "mlflow-artifacts:/86/4486e598446d412d926ac66dadb35e51/artifacts/test final - ikem" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ikem/0_430.yaml b/configs/dataset/tiled/ikem/0_430.yaml new file mode 100644 index 0000000..98c8ff1 --- /dev/null +++ b/configs/dataset/tiled/ikem/0_430.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ikem@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 430 +level: 0 +tiling_uris: + train: "mlflow-artifacts:/86/fd112e63819c49d999502542b35bfce1/artifacts/train - ikem" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/fd112e63819c49d999502542b35bfce1/artifacts/test preliminary - ikem" # TODO update URI + test_final: "mlflow-artifacts:/86/fd112e63819c49d999502542b35bfce1/artifacts/test final - ikem" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ikem/1_224.yaml b/configs/dataset/tiled/ikem/1_224.yaml new file mode 100644 index 0000000..0b40c0e --- /dev/null +++ b/configs/dataset/tiled/ikem/1_224.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ikem@_here_ + - _self_ + +mpp: 0.52 +tile_extent: 224 +level: 1 +tiling_uris: + train: "mlflow-artifacts:/86/ece822e7c0e3416f97212267c773c8ac/artifacts/train - ikem" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/ece822e7c0e3416f97212267c773c8ac/artifacts/test preliminary - ikem" # TODO update URI + test_final: "mlflow-artifacts:/86/ece822e7c0e3416f97212267c773c8ac/artifacts/test final - ikem" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/ikem/2_224.yaml b/configs/dataset/tiled/ikem/2_224.yaml new file mode 100644 index 0000000..c68e542 --- /dev/null +++ b/configs/dataset/tiled/ikem/2_224.yaml @@ -0,0 +1,11 @@ +defaults: + - /dataset/processed_w_masks/ikem@_here_ + - _self_ + +mpp: 1.55 +tile_extent: 224 +level: 2 +tiling_uris: + train: "mlflow-artifacts:/86/0c09e1c61d294fa3877b6b21703bab2f/artifacts/train - ikem" # TODO update URI + test_preliminary: "mlflow-artifacts:/86/0c09e1c61d294fa3877b6b21703bab2f/artifacts/test preliminary - ikem" # TODO update URI + test_final: "mlflow-artifacts:/86/0c09e1c61d294fa3877b6b21703bab2f/artifacts/test final - ikem" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/knl_patos/0_320.yaml b/configs/dataset/tiled/knl_patos/0_320.yaml new file mode 100644 index 0000000..a84731d --- /dev/null +++ b/configs/dataset/tiled/knl_patos/0_320.yaml @@ -0,0 +1,10 @@ +defaults: + - /dataset/processed_w_masks/knl_patos@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 320 +level: 0 +tiling_uris: + test_preliminary: "mlflow-artifacts:/86/7b9a446145b14965981bbac88e8e2c8b/artifacts/test preliminary - knl_patos" # TODO update URI + test_final: "mlflow-artifacts:/86/7b9a446145b14965981bbac88e8e2c8b/artifacts/test final - knl_patos" # TODO update URI diff --git a/configs/dataset/tiled/knl_patos/0_430.yaml b/configs/dataset/tiled/knl_patos/0_430.yaml new file mode 100644 index 0000000..0e6bb2c --- /dev/null +++ b/configs/dataset/tiled/knl_patos/0_430.yaml @@ -0,0 +1,10 @@ +defaults: + - /dataset/processed_w_masks/knl_patos@_here_ + - _self_ + +mpp: 0.17 +tile_extent: 430 +level: 0 +tiling_uris: + test_preliminary: "mlflow-artifacts:/86/eb29255c944d4dad926160a7cb102ad9/artifacts/test preliminary - knl_patos" # TODO update URI + test_final: "mlflow-artifacts:/86/eb29255c944d4dad926160a7cb102ad9/artifacts/test final - knl_patos" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/knl_patos/1_224.yaml b/configs/dataset/tiled/knl_patos/1_224.yaml new file mode 100644 index 0000000..71e47d9 --- /dev/null +++ b/configs/dataset/tiled/knl_patos/1_224.yaml @@ -0,0 +1,10 @@ +defaults: + - /dataset/processed_w_masks/knl_patos@_here_ + - _self_ + +mpp: 0.52 +tile_extent: 224 +level: 1 +tiling_uris: + test_preliminary: "mlflow-artifacts:/86/6782155362d54ecc9f1beccb4362d359/artifacts/test preliminary - knl_patos" # TODO update URI + test_final: "mlflow-artifacts:/86/6782155362d54ecc9f1beccb4362d359/artifacts/test final - knl_patos" # TODO update URI \ No newline at end of file diff --git a/configs/dataset/tiled/knl_patos/2_224.yaml b/configs/dataset/tiled/knl_patos/2_224.yaml new file mode 100644 index 0000000..b814d37 --- /dev/null +++ b/configs/dataset/tiled/knl_patos/2_224.yaml @@ -0,0 +1,10 @@ +defaults: + - /dataset/processed_w_masks/knl_patos@_here_ + - _self_ + +mpp: 1.55 +tile_extent: 224 +level: 2 +tiling_uris: + test_preliminary: "mlflow-artifacts:/86/d7486bb6b667433989c3ce1c8ce31d60/artifacts/test preliminary - knl_patos" # TODO update URI + test_final: "mlflow-artifacts:/86/d7486bb6b667433989c3ce1c8ce31d60/artifacts/test final - knl_patos" # TODO update URI \ No newline at end of file diff --git a/configs/preprocessing/tile_masks.yaml b/configs/preprocessing/tile_masks.yaml new file mode 100644 index 0000000..7967088 --- /dev/null +++ b/configs/preprocessing/tile_masks.yaml @@ -0,0 +1,12 @@ +# @package _global_ + +max_concurrent: 64 +artifact_path: tile_masks +level: 3 + +metadata: + run_name: "🎭 Tile Masks: ${dataset.institution}" + description: Tile masks for ${dataset.institution} at tiling level ${dataset.level} + hyperparams: + mask_level: ${level} + tiling_level: ${dataset.level} \ No newline at end of file diff --git a/preprocessing/tile_masks.py b/preprocessing/tile_masks.py new file mode 100644 index 0000000..e1cf8d0 --- /dev/null +++ b/preprocessing/tile_masks.py @@ -0,0 +1,97 @@ +from collections.abc import Iterable +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import cast + +import hydra +import mlflow.artifacts +import numpy as np +import pandas as pd +import pyvips +import ray +from omegaconf import DictConfig +from openslide import OpenSlide +from rationai.masks import process_items, slide_resolution, tile_mask, write_big_tiff +from rationai.mlkit import autolog, with_cli_args +from rationai.mlkit.lightning.loggers import MLFlowLogger + + +def download_slide_tiles(uris: Iterable[str]) -> tuple[pd.DataFrame, pd.DataFrame]: + slidess, tiless = [], [] + for uri in uris: + path = mlflow.artifacts.download_artifacts(artifact_uri=uri) + slidess.append(pd.read_parquet(Path(path) / "slides.parquet")) + tiless.append(pd.read_parquet(Path(path) / "tiles.parquet")) + + slides = pd.concat(slidess).reset_index(drop=True) + tiles = pd.concat(tiless).reset_index(drop=True) + return slides, tiles + + +@ray.remote(memory=4 * 1024**3) +def process_slide( + slide: pd.Series, + tiles: pd.DataFrame, # tiles are automatically serialized by Ray + level: int, + output_folder: Path, +) -> None: + with OpenSlide(slide["path"]) as slide_wsi: + mask_extent_x, mask_extent_y = slide_wsi.level_dimensions[level] + mpp_x, mpp_y = slide_resolution(slide_wsi, level) + + slide_tiles = tiles[tiles["slide_id"] == slide.id] + + blur_slide_tiles = slide_tiles[slide_tiles["blur"] > 0.25] + artifacts_slide_tiles = slide_tiles[slide_tiles["artifacts"] > 0.25] + + drop_indices = blur_slide_tiles.index.union(artifacts_slide_tiles.index) + slide_tiles = slide_tiles.drop(drop_indices) + + for folder, tiles_subset in [ + ("blur_tiles", blur_slide_tiles), + ("artifacts_tiles", artifacts_slide_tiles), + ("clean_tiles", slide_tiles), + ]: + if tiles_subset.empty: + continue + + mask = tile_mask( + tiles_subset, + tile_extent=(slide["tile_extent_x"], slide["tile_extent_y"]), + size=(slide["extent_x"], slide["extent_y"]), + ) + mask = cast("pyvips.Image", pyvips.Image.new_from_array(np.array(mask))) + mask = cast( + "pyvips.Image", + mask.resize(mask_extent_x / mask.width, vscale=mask_extent_y / mask.height), # type: ignore[reportCallIssue] + ) + + mask_path = output_folder / folder / f"{Path(slide['path']).stem}.tiff" + mask_path.parent.mkdir(parents=True, exist_ok=True) + write_big_tiff(mask, mask_path, mpp_x=mpp_x, mpp_y=mpp_y) + + +@with_cli_args(["+preprocessing=tile_masks"]) +@hydra.main(config_path="../configs", config_name="preprocessing", version_base=None) +@autolog +def main(config: DictConfig, logger: MLFlowLogger) -> None: + slides, tiles = download_slide_tiles(config.dataset.tiling_uris.values()) + tiles_ref = ray.put(tiles) + + with TemporaryDirectory() as output_dir: + process_items( + (slide for _, slide in slides.iterrows()), + process_item=process_slide, + fn_kwargs={ + "tiles": tiles_ref, + "level": config.level, + "output_folder": Path(output_dir), + }, + max_concurrent=config.max_concurrent, + ) + logger.log_artifacts(local_dir=output_dir, artifact_path=config.artifact_path) + + +if __name__ == "__main__": + ray.init(runtime_env={"excludes": [".git", ".venv"]}) + main() diff --git a/scripts/preprocessing/tile_masks.py b/scripts/preprocessing/tile_masks.py new file mode 100644 index 0000000..8bdcc53 --- /dev/null +++ b/scripts/preprocessing/tile_masks.py @@ -0,0 +1,17 @@ +from kube_jobs import storage, submit_job + + +submit_job( + job_name="ulcerative-colitis-tile-masks-...", + username=..., + public=False, + cpu=64, + memory="32Gi", + script=[ + "git clone https://github.com/RationAI/ulcerative-colitis.git workdir", + "cd workdir", + "uv sync --frozen", + "uv run --active -m preprocessing.tile_masks +dataset=tiled/.../...", + ], + storage=[storage.secure.Data], +)