docs/conf.py: 1 change (1 addition, 0 deletions)
@@ -127,6 +127,7 @@
# If building the documentation fails because of a missing link that is outside your control,
# you can add an exception to this list.
("py:class", "Path"),
("py:class", "pathlib._local.Path"),
("py:class", "AnnData"),
("py:class", "SpatialData"),
("py:func", "imageio.imread"), # maybe this can be fixed
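The added exception is needed because Python 3.13 moved the `Path` implementation into the private `pathlib._local` module, so nitpicky Sphinx builds can end up resolving type hints to that name. A minimal sketch of how such an exception list typically looks in a Sphinx `conf.py` (the standard `nitpick_ignore` setting; whether this repo's surrounding variable matches exactly is an assumption):

```python
# Hypothetical excerpt from a Sphinx conf.py; entries suppress
# "reference target not found" warnings for names Sphinx cannot resolve.
nitpick_ignore = [
    ("py:class", "Path"),
    ("py:class", "pathlib._local.Path"),  # where Path lives in Python >= 3.13
    ("py:class", "AnnData"),
    ("py:class", "SpatialData"),
]
```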
pyproject.toml: 44 changes (22 additions, 22 deletions)
@@ -5,18 +5,18 @@ requires = ["hatchling", "hatch-vcs"]

[project]
name = "spatialdata-io"
- dynamic= [
- "version" # allow version to be set by git tags
+ dynamic = [
+ "version" # allow version to be set by git tags
]
description = "SpatialData IO for common techs"
readme = "README.md"
requires-python = ">=3.11"
- license = {file = "LICENSE"}
+ license = { file = "LICENSE" }
authors = [
{name = "scverse"},
{ name = "scverse" },
]
maintainers = [
{name = "scverse", email = "scverse@scverse.scverse"},
{ name = "scverse", email = "scverse@scverse.scverse" },
]
urls.Documentation = "https://spatialdata-io.readthedocs.io/"
urls.Source = "https://github.com/scverse/spatialdata-io"
@@ -26,7 +26,7 @@ dependencies = [
"click",
"numpy",
"scanpy",
"spatialdata>=0.2.6",
"spatialdata>=0.7.3a0",
"scikit-image",
"h5py",
"joblib",
@@ -46,7 +46,7 @@ dev = [
"pre-commit"
]
doc = [
"sphinx>=4.5",
"sphinx>=4.5,<9",
"sphinx-book-theme>=1.0.0",
"myst-nb",
"sphinxcontrib-bibtex>=1.0.0",
@@ -67,7 +67,7 @@ test = [
# update: readthedocs doesn't seem to install pre-releases even when installing the pre optional-dependency. For
# the moment, if needed, let's add the latest pre-release explicitly here.
pre = [
"spatialdata>=0.4.0rc0"
"spatialdata>=0.7.3a0"
]

[tool.coverage.run]
@@ -80,7 +80,7 @@ omit = [
testpaths = ["tests"]
xfail_strict = true
addopts = [
"--import-mode=importlib", # allow using test files with same name
"--import-mode=importlib", # allow using test files with same name
]

[tool.ruff]
@@ -95,19 +95,19 @@ exclude = [
"setup.py",
]
lint.select = [
"F", # Errors detected by Pyflakes
"E", # Error detected by Pycodestyle
"W", # Warning detected by Pycodestyle
"I", # isort
"D", # pydocstyle
"B", # flake8-bugbear
"TID", # flake8-tidy-imports
"C4", # flake8-comprehensions
"BLE", # flake8-blind-except
"UP", # pyupgrade
"RUF100", # Report unused noqa directives
"TCH", # Typing imports
"NPY", # Numpy specific rules
"F", # Errors detected by Pyflakes
"E", # Error detected by Pycodestyle
"W", # Warning detected by Pycodestyle
"I", # isort
"D", # pydocstyle
"B", # flake8-bugbear
"TID", # flake8-tidy-imports
"C4", # flake8-comprehensions
"BLE", # flake8-blind-except
"UP", # pyupgrade
"RUF100", # Report unused noqa directives
"TCH", # Typing imports
"NPY", # Numpy specific rules
# "PTH", # Use pathlib
# "S" # Security
]
src/spatialdata_io/readers/_utils/_image.py: 59 changes (1 addition, 58 deletions)
@@ -7,7 +7,7 @@
from numpy.typing import NDArray
from spatialdata.models.models import Chunks_t

__all__ = ["Chunks_t", "_compute_chunks", "_read_chunks", "normalize_chunks"]
__all__ = ["Chunks_t", "_compute_chunks", "_read_chunks"]

_Y_IDX = 0
"""Index of y coordinate in in chunk coordinate array format: (y, x, height, width)"""
@@ -143,60 +143,3 @@ def _read_chunks(
for chunk_y in range(coords.shape[0])
]
return chunks


- def normalize_chunks(
-     chunks: Chunks_t | None,
-     axes: Sequence[str],
- ) -> dict[str, int]:
-     """Normalize chunk specification to dict format.
-
-     This function converts various chunk formats to a dict mapping dimension names
-     to chunk sizes. The dict format is preferred because it's explicit about which
-     dimension gets which chunk size and is compatible with spatialdata.
-
-     Parameters
-     ----------
-     chunks
-         Chunk specification. Can be:
-         - None: Uses DEFAULT_CHUNK_SIZE for all axes
-         - int: Applied to all axes
-         - tuple[int, ...]: Chunk sizes in order corresponding to axes
-         - dict: Mapping of axis names to chunk sizes (validated against axes)
-     axes
-         Tuple of axis names that defines the expected dimensions (e.g., ('c', 'y', 'x')).
-
-     Returns
-     -------
-     dict[str, int]
-         Dict mapping axis names to chunk sizes.
-
-     Raises
-     ------
-     ValueError
-         If chunks format is not supported or incompatible with axes.
-     """
-     if chunks is None:
-         return dict.fromkeys(axes, DEFAULT_CHUNK_SIZE)
-
-     if isinstance(chunks, int):
-         return dict.fromkeys(axes, chunks)
-
-     if isinstance(chunks, Mapping):
-         chunks_dict = dict(chunks)
-         missing = set(axes) - set(chunks_dict.keys())
-         if missing:
-             raise ValueError(f"chunks dict missing keys for axes {missing}, got: {list(chunks_dict.keys())}")
-         return {ax: chunks_dict[ax] for ax in axes}
-
-     if isinstance(chunks, tuple):
-         if len(chunks) != len(axes):
-             raise ValueError(f"chunks tuple length {len(chunks)} doesn't match axes {axes} (length {len(axes)})")
-         if not all(isinstance(c, int) for c in chunks):
-             raise ValueError(f"All elements in chunks tuple must be int, got: {chunks}")
-         return dict(zip(axes, chunks, strict=True))
-
-     raise ValueError(f"Unsupported chunks type: {type(chunks)}. Expected int, tuple, dict, or None.")
-
-
- ##
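Both the helper and its behavior now live upstream: `generic.py` below imports `normalize_chunks` from `spatialdata.models.chunks_utils`. For reference, a self-contained sketch of the documented semantics of the deleted helper; this is an illustration only, and `DEFAULT_CHUNK_SIZE = 4096` is an assumed value:

```python
from collections.abc import Mapping, Sequence

DEFAULT_CHUNK_SIZE = 4096  # assumed; the real constant is defined in _image.py


def normalize_chunks_sketch(chunks, axes: Sequence[str]) -> dict[str, int]:
    """Illustrative re-implementation of the deleted helper's documented behavior."""
    if chunks is None:
        return dict.fromkeys(axes, DEFAULT_CHUNK_SIZE)
    if isinstance(chunks, int):
        return dict.fromkeys(axes, chunks)
    if isinstance(chunks, Mapping):
        # every axis must be present in the mapping
        return {ax: chunks[ax] for ax in axes}
    if isinstance(chunks, tuple):
        # one chunk size per axis, in order; strict=True enforces equal lengths
        return dict(zip(axes, chunks, strict=True))
    raise ValueError(f"Unsupported chunks type: {type(chunks)}")


assert normalize_chunks_sketch(None, ("c", "y", "x")) == {"c": 4096, "y": 4096, "x": 4096}
assert normalize_chunks_sketch((3, 1024, 1024), ("c", "y", "x")) == {"c": 3, "y": 1024, "x": 1024}
```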
src/spatialdata_io/readers/generic.py: 9 changes (6 additions, 3 deletions)
@@ -7,7 +7,6 @@
import numpy as np
import tifffile
from dask_image.imread import imread
- from geopandas import GeoDataFrame
from spatialdata._docs import docstring_parameter
from spatialdata._logging import logger
from spatialdata.models import Image2DModel, ShapesModel
@@ -23,10 +22,12 @@
from xarray import DataArray


+ from spatialdata.models.chunks_utils import normalize_chunks
+
from spatialdata_io.readers._utils._image import (
+ DEFAULT_CHUNK_SIZE,
_compute_chunks,
_read_chunks,
- normalize_chunks,
)

VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"]
@@ -179,7 +180,7 @@ def image(
chunks: Chunks_t | None = None,
scale_factors: Sequence[int] | None = None,
) -> DataArray:
"""Reads an image file and returns a parsed Image2D spatial element.
"""Read an image file and returns a parsed Image2D spatial element.

Parameters
----------
Expand Down Expand Up @@ -207,6 +208,8 @@ def image(
# Map passed data axes to position of dimension
axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)}

+ if chunks is None:
+     chunks = DEFAULT_CHUNK_SIZE
chunks_dict = normalize_chunks(chunks, axes=data_axes)

im = None
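The two added lines replace the `None` handling that previously lived in the local `normalize_chunks`: the reader now substitutes `DEFAULT_CHUNK_SIZE` itself before calling the upstream helper. A sketch of the resulting flow in isolation, assuming the upstream helper accepts the arguments exactly as shown in the diff; the axis names and default value are illustrative:

```python
# Sketch of the default-chunk handling added to generic.image; not the actual
# reader, just the two-step logic in isolation.
from spatialdata.models.chunks_utils import normalize_chunks

DEFAULT_CHUNK_SIZE = 4096  # assumed value
data_axes = ("c", "y", "x")  # example axes of the on-disk image

chunks = None  # what a caller passes when they do not care about chunking
if chunks is None:
    chunks = DEFAULT_CHUNK_SIZE  # scalar: applied to every axis
chunks_dict = normalize_chunks(chunks, axes=data_axes)
print(chunks_dict)  # expected: {'c': 4096, 'y': 4096, 'x': 4096}
```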
src/spatialdata_io/readers/xenium.py: 78 changes (34 additions, 44 deletions)
@@ -20,11 +20,11 @@
from dask.dataframe import read_parquet
from dask_image.imread import imread
from geopandas import GeoDataFrame
- from joblib import Parallel, delayed
from pyarrow import Table
from shapely import GeometryType, Polygon, from_ragged_array
from spatialdata import SpatialData
from spatialdata._core.query.relational_query import get_element_instances
+ from spatialdata._logging import logger
from spatialdata.models import (
Image2DModel,
Labels2DModel,
@@ -61,7 +61,7 @@ def xenium(
*,
cells_boundaries: bool = True,
nucleus_boundaries: bool = True,
- cells_as_circles: bool | None = None,
+ cells_as_circles: bool = False,
cells_labels: bool = True,
nucleus_labels: bool = True,
transcripts: bool = True,
@@ -136,7 +136,7 @@

Notes
-----
- Old versions. Until spatialdata-io v0.1.3post0: previously, `cells_as_circles` was `True` by default; the table was associated to the
+ Old versions. Until spatialdata-io v0.6.0: `cells_as_circles` was `True` by default; the table was associated to the
circles when `cells_as_circles` was `True`, and the table was associated to the polygons when `cells_as_circles`
was `False`; the radii of the circles were computed from the nuclei instead of the cells.

Expand All @@ -153,14 +153,6 @@ def xenium(
... )
>>> sdata.write("path/to/data.zarr")
"""
- if cells_as_circles is None:
-     cells_as_circles = True
-     warnings.warn(
-         "The default value of `cells_as_circles` will change to `False` in the next release. "
-         "Please pass `True` explicitly to maintain the current behavior.",
-         DeprecationWarning,
-         stacklevel=3,
-     )
image_models_kwargs, labels_models_kwargs = _initialize_raster_models_kwargs(
image_models_kwargs, labels_models_kwargs
)
@@ -223,18 +215,16 @@
# labels.
if nucleus_labels:
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
- path,
- XeniumKeys.CELLS_ZARR,
- specs,
+ path=path,
+ specs=specs,
mask_index=0,
labels_name="nucleus_labels",
labels_models_kwargs=labels_models_kwargs,
)
if cells_labels:
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
- path,
- XeniumKeys.CELLS_ZARR,
- specs,
+ path=path,
+ specs=specs,
mask_index=1,
labels_name="cell_labels",
labels_models_kwargs=labels_models_kwargs,
@@ -360,8 +350,8 @@ def filter(self, record: logging.LogRecord) -> bool:
return False
return True

- logger = tifffile.logger()
- logger.addFilter(IgnoreSpecificMessage())
+ tf_logger = tifffile.logger()
+ tf_logger.addFilter(IgnoreSpecificMessage())
image_models_kwargs = dict(image_models_kwargs)
assert "c_coords" not in image_models_kwargs, (
"The channel names for the morphology focus images are handled internally"
@@ -374,7 +364,7 @@ def filter(self, record: logging.LogRecord) -> bool:
image_models_kwargs,
)
del image_models_kwargs["c_coords"]
- logger.removeFilter(IgnoreSpecificMessage())
+ tf_logger.removeFilter(IgnoreSpecificMessage())

if table is not None:
tables["table"] = table
@@ -402,14 +392,16 @@ def filter(self, record: logging.LogRecord) -> bool:
def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
if isinstance(cell_id_column.iloc[0], bytes):
return cell_id_column.str.decode("utf-8")
+ if not isinstance(cell_id_column.iloc[0], str):
+     cell_id_column.index = cell_id_column.index.astype(str)
return cell_id_column


def _get_polygons(
path: Path,
file: str,
specs: dict[str, Any],
- idx: ArrayLike | None = None,
+ idx: pd.Series | None = None,
) -> GeoDataFrame:
# seems to be faster than pd.read_parquet
df = pq.read_table(path / file).to_pandas()
@@ -448,7 +440,7 @@ def _get_polygons(
if version is not None and version < packaging.version.parse("2.0.0"):
assert idx is not None
assert len(idx) == len(geo_df)
- assert index.equals(idx)
+ assert np.array_equal(index.values, idx.values)
else:
if np.unique(geo_df.index).size != len(geo_df):
warnings.warn(
@@ -464,7 +456,6 @@

def _get_labels_and_indices_mapping(
path: Path,
- file: str,
specs: dict[str, Any],
mask_index: int,
labels_name: str,
@@ -493,36 +484,35 @@
cell_id, dataset_suffix = z["cell_id"][...].T
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)

- # this information will probably be available in the `label_id` column for version > 2.0.0 (see public
- # release notes mentioned above)
- real_label_index = get_element_instances(labels).values
-
- # background removal
- if real_label_index[0] == 0:
-     real_label_index = real_label_index[1:]

if version < packaging.version.parse("2.0.0"):
- expected_label_index = z["seg_mask_value"][...]
-
- if not np.array_equal(expected_label_index, real_label_index):
-     raise ValueError(
-         "The label indices from the labels differ from the ones from the input data. Please report "
-         f"this issue. Real label indices: {real_label_index}, expected label indices: "
-         f"{expected_label_index}."
-     )
+ label_index = z["seg_mask_value"][...]
else:
- labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...]
- if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
-     raise ValueError(
-         "The positional indices of the labels do not match the expected range. Please report this issue."
+ # For v >= 2.0.0, seg_mask_value is no longer available in the zarr;
+ # read label_id from the corresponding parquet boundary file instead
+ boundaries_file = XeniumKeys.NUCLEUS_BOUNDARIES_FILE if mask_index == 0 else XeniumKeys.CELL_BOUNDARIES_FILE
+ boundary_columns = pq.read_schema(path / boundaries_file).names
+ if "label_id" in boundary_columns:
+     boundary_df = pq.read_table(path / boundaries_file, columns=[XeniumKeys.CELL_ID, "label_id"]).to_pandas()
+     unique_pairs = boundary_df.drop_duplicates(subset=[XeniumKeys.CELL_ID, "label_id"]).copy()
+     unique_pairs[XeniumKeys.CELL_ID] = _decode_cell_id_column(unique_pairs[XeniumKeys.CELL_ID])
+     cell_id_to_label_id = unique_pairs.set_index(XeniumKeys.CELL_ID)["label_id"]
+     label_index = cell_id_to_label_id.loc[cell_id_str].values
+ else:
+     # fallback for dev versions around 2.0.0 that lack both seg_mask_value and label_id
+     logger.warning(
+         f"Could not find the label ids in the metadata for version {version}. Using a fallback (slower) implementation."
+     )
+     label_index = get_element_instances(labels).values

+ if label_index[0] == 0:
+     label_index = label_index[1:]

# label_index is a uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
indices_mapping = pd.DataFrame(
{
"region": labels_name,
"cell_id": cell_id_str,
"label_index": real_label_index.astype(np.int64),
"label_index": label_index.astype(np.int64),
}
)
# because AnnData converts the indices to str
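The heart of the new `>= 2.0.0` branch is the `(cell_id, label_id)` deduplication and ordered lookup. A self-contained sketch with a toy DataFrame standing in for the parquet boundary file; the column names follow the diff, while the values are invented:

```python
import pandas as pd

# Toy stand-in for the cell/nucleus boundaries parquet: one row per boundary
# vertex, so each (cell_id, label_id) pair appears many times.
boundary_df = pd.DataFrame(
    {
        "cell_id": ["aaaa-1", "aaaa-1", "aaab-1", "aaab-1"],
        "label_id": [1, 1, 2, 2],
    }
)

# Deduplicate the pairs, then index by cell_id so label ids can be looked up
# in the same order as the cell ids decoded from the cells zarr store.
unique_pairs = boundary_df.drop_duplicates(subset=["cell_id", "label_id"])
cell_id_to_label_id = unique_pairs.set_index("cell_id")["label_id"]

cell_id_str = ["aaaa-1", "aaab-1"]  # order defined by the zarr cell_id array
label_index = cell_id_to_label_id.loc[cell_id_str].values
print(label_index)  # [1 2]
```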