Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,4 @@ lightning_logs/

# Project
/data
playground.ipynb
3 changes: 2 additions & 1 deletion .mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
strict = True
ignore_missing_imports = True
disallow_untyped_calls = False
disable_error_code = no-any-return
disable_error_code = no-any-return
exclude = scripts
7 changes: 7 additions & 0 deletions configs/base.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
defaults:
- hydra: default
- logger: mlflow
- _self_

metadata:
  experiment_name: Ulcerative Colitis
10 changes: 10 additions & 0 deletions configs/dataset/raw/ftn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
institution: ftn
folder: /mnt/data/FTN/colon/IBD_AI
# [0-9]{1,6} - case ID (1 to 6 digits; unique only within its year)
# _ - underscore separator
# 2[0-5] - year (2020 to 2025)
# .czi - file extension
regex_pattern: ^[0-9]{1,6}_2[0-5]\.czi$
labels:
- IBD_AI_FTN.xlsx
- IBD_AI_FTN_doplnek.xlsx
14 changes: 14 additions & 0 deletions configs/dataset/raw/ikem.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
institution: ikem
folder: /mnt/data/IKEM/colon/IBD_AI/20x
# [0-9]{1,5} - case ID (1 to 5 digits; unique only within its year)
# _ - underscore separator
# 2[1-4] - year (2021 to 2024)
# _ - underscore separator
# HE - stain type
# (?:_0[1-6])? - optional underscore and slide number (01 to 06)
# .czi - file extension
regex_pattern: ^[0-9]{1,5}_2[1-4]_HE(?:_0[1-6])?\.czi$
labels:
- Fab_IBD_AI_12_2024.csv
- IBD_AI_2.xlsx
- missing.xlsx
17 changes: 17 additions & 0 deletions configs/dataset/raw/knl_patos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
institution: knl_patos
folder: /mnt/data/KNL_PATOS/colon/IBD_AI
# [0-9]{1,5} - case ID (1 to 5 digits; unique only within its year)
# _ - underscore separator
# 25 - year 2025
# _ - underscore separator
# [A-F] - block identifier (A to F)
# _ - underscore separator
# HE - stain type
# (0[1-9]|1[0-2]) - slide number (01 to 12)
# .czi - file extension
regex_pattern: ^[0-9]{1,5}_25_[A-F]_HE(0[1-9]|1[0-2])\.czi$
labels:
- IBD_AI_Liberec.xlsx
- IBD_AI_Liberec_02.xlsx
- IBD_AI_Liberec_10_2025.xlsx
- IBD_AI_Liberec_28_10_2025.xlsx
Comment on lines +1 to +17
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing newline at end of file.

Most editors and POSIX tools expect a trailing newline. Add one to avoid diff noise and potential YAML parser warnings.

🤖 Prompt for AI Agents
In `@configs/dataset/raw/knl_patos.yaml` around lines 1 - 17, the file lacks a
trailing newline, which can cause diff noise and parser warnings. Open the
YAML file containing the keys institution, folder, regex_pattern, and labels and
ensure the file terminates with a single newline character, i.e., the last line
ends with '\n' (this terminates the last line; it does not add a blank line).

20 changes: 0 additions & 20 deletions configs/default.yaml

This file was deleted.

9 changes: 9 additions & 0 deletions configs/preprocessing.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# @package _global_

defaults:
- base
- _self_

dataset: ???

project_dir: /mnt/projects/inflammatory_bowel_disease/ulcerative_colitis
5 changes: 5 additions & 0 deletions configs/preprocessing/create_dataset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# @package _global_

metadata:
  run_name: "📂 Dataset Creation: ${dataset.institution}"
  description: "Create dataset for ${dataset.institution} institution"
108 changes: 108 additions & 0 deletions preprocessing/create_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import re
import tempfile
from pathlib import Path

import hydra
import pandas as pd
from omegaconf import DictConfig
from rationai.mlkit import autolog, with_cli_args
from rationai.mlkit.lightning.loggers import MLFlowLogger


def get_labels(folder_path: Path, labels: list[str]) -> pd.DataFrame:
    """Load the label tables and normalize their index into slide/case IDs.

    Args:
        folder_path: Directory that contains the label files.
        labels: File names, relative to ``folder_path``. Files with a ``.csv``
            suffix are read with ``pd.read_csv``; every other file is read
            with ``pd.read_excel``. The first column is used as the index.

    Returns:
        The concatenation of all label tables with lower-cased column names.
        The index is normalized from ``[case_id]/YY`` (e.g. ``01234/24`` or
        ``1234/24``) to ``1234_24``: surrounding whitespace is removed, then
        leading zeros, and finally ``/`` is replaced by ``_``.
    """
    frames = []
    for labels_file in labels:
        labels_path = folder_path / labels_file

        if labels_path.suffix == ".csv":
            # One labels file is in CSV format
            frame = pd.read_csv(labels_path, index_col=0)
        else:
            frame = pd.read_excel(labels_path, index_col=0)

        frame.columns = frame.columns.str.lower()
        frames.append(frame)

    labels_df = pd.concat(frames)
    # Whitespace must be stripped BEFORE the leading zeros: otherwise an ID
    # such as " 01234/24" keeps its zeros (lstrip("0") stops at the space) and
    # never matches the ID derived from the slide file name.
    labels_df.index = (
        labels_df.index.str.strip()
        .str.lstrip("0")
        .str.replace("/", "_", regex=False)
    )

    return labels_df


def get_slides(folder_path: Path, pattern: re.Pattern[str]) -> pd.DataFrame:
    """Collect the slide files in a folder whose names match ``pattern``.

    Args:
        folder_path: Directory to scan (not recursive).
        pattern: Compiled regex that the whole file name must match.

    Returns:
        A frame indexed by ``slide_id`` (the file stem) with the columns
        ``case_id`` (the first two ``_``-separated parts of the stem) and
        ``path`` (the file path as a string). Rows are ordered by path. When
        nothing matches, an empty frame with the same index name and columns
        is returned.
    """
    slides = [
        {
            "slide_id": slide_path.stem,
            "case_id": "_".join(slide_path.stem.split("_")[:2]),
            "path": str(slide_path),
        }
        # iterdir() yields entries in arbitrary order; sort for reproducibility.
        for slide_path in sorted(folder_path.iterdir())
        if pattern.fullmatch(slide_path.name)
    ]

    # Explicit columns keep set_index from raising KeyError when no slide matched.
    slides_df = pd.DataFrame(slides, columns=["slide_id", "case_id", "path"])
    return slides_df.set_index("slide_id")


def create_dataset(
    folder: str, labels: list[str], institution: str, pattern: re.Pattern
) -> tuple[pd.DataFrame, list[str], list[str]]:
    """Join the slides found in ``folder`` with their labels.

    Args:
        folder: Directory holding both the slide files and the label files.
        labels: Label file names, passed on to ``get_labels``.
        institution: Institution key; ``"ikem"`` switches the join key to
            ``case_id`` and drops rows whose ``lokalita`` is ``"ileum"``.
        pattern: Compiled regex for slide file names, passed to ``get_slides``.

    Returns:
        A tuple ``(dataset, missing_slides, missing_labels)``. ``dataset`` has
        the columns ``case_id``, ``path`` and ``nancy`` (cast to ``int``),
        without rows that have a missing value in any of them.
        ``missing_slides`` lists the join-key values of rows without a
        ``path``; ``missing_labels`` lists the index values of rows without a
        ``nancy`` value.
    """
    root = Path(folder)
    label_table = get_labels(root, labels)
    slide_table = get_slides(root, pattern)

    is_ikem = institution == "ikem"

    # IKEM has only case-level labels (FTN has one slide per case)
    join_key = "case_id" if is_ikem else "slide_id"
    merged = slide_table.join(label_table, on=join_key, how="outer")
    merged.index.name = "slide_id"

    if is_ikem:
        # Slides inside ileum are not used
        outside_ileum = merged["lokalita"] != "ileum"
        merged = merged[outside_ileum]

    # Both reports are taken before incomplete rows are dropped below.
    without_label = merged["nancy"].isna()
    without_slide = merged["path"].isna()
    missing_labels = merged[without_label].index.to_list()
    missing_slides = merged[without_slide][join_key].to_list()

    result = merged[["case_id", "path", "nancy"]].dropna()
    result["nancy"] = result["nancy"].astype(int)

    return result, missing_slides, missing_labels


@with_cli_args(["+preprocessing=create_dataset"])
@hydra.main(config_path="../configs", config_name="preprocessing", version_base=None)
@autolog
def main(config: DictConfig, logger: MLFlowLogger) -> None:
    """Build the dataset described by ``config.dataset`` and log it as artifacts.

    Writes ``dataset.csv`` into a temporary directory and passes it to
    ``logger.log_artifact``. ``missing_slides.txt`` and ``missing_labels.txt``
    are written and logged the same way, but only when the corresponding list
    is non-empty.

    Args:
        config: Configuration providing ``dataset.folder``, ``dataset.labels``,
            ``dataset.institution`` and ``dataset.regex_pattern``.
        logger: Logger used to log the generated files.
    """
    dataset_cfg = config.dataset
    dataset, missing_slides, missing_labels = create_dataset(
        dataset_cfg.folder,
        dataset_cfg.labels,
        dataset_cfg.institution,
        re.compile(dataset_cfg.regex_pattern),
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        staging_dir = Path(tmpdir)

        csv_path = staging_dir / "dataset.csv"
        dataset.to_csv(csv_path, index=True)
        logger.log_artifact(str(csv_path))

        reports = (
            ("missing_slides.txt", missing_slides),
            ("missing_labels.txt", missing_labels),
        )
        for filename, items in reports:
            if items:
                # One item per line, terminated by a final newline.
                report_path = staging_dir / filename
                report_path.write_text("\n".join(items) + "\n")
                logger.log_artifact(str(report_path))


if __name__ == "__main__":
    main()
17 changes: 17 additions & 0 deletions scripts/preprocessing/create_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from kube_jobs import storage, submit_job


# Job template for running the dataset-creation step: fill in the `...`
# placeholders (job-name suffix, username, and the raw dataset config name)
# before submitting.
submit_job(
    job_name="ulcerative-colitis-dataset-creation-...",
    username=...,
    public=False,
    cpu=2,
    memory="4Gi",
    script=[
        "git clone https://github.com/RationAI/ulcerative-colitis.git workdir",
        "cd workdir",
        "uv sync --frozen",
        # The raw dataset configs live in the `dataset` config group
        # (configs/dataset/raw/*.yaml) and the entry point reads
        # `config.dataset.*`, so the override must target `dataset`, not `data`.
        "uv run -m preprocessing.create_dataset +dataset=raw/...",
    ],
    # NOTE(review): presumably grants access to the /mnt/data slide storage
    # referenced by the dataset configs; confirm against kube_jobs.
    storage=[storage.secure.DATA],
)
Loading
Loading