diff --git a/HISTORY.rst b/HISTORY.rst index 88a0b97a..7d5ab4e9 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,15 @@ History ======= +1.4.1 (2026-06-16) +------------------ +* Improved speed of random spot generation used in permutation testing. + +API and Bug Fixes: +* Fixed up docstring and types for various CCI methods. +* Fixes for issues #350 and #300 with incorrect types. +* Upgrade libraries: pillow>=12.0.0,<13.0 + 1.4.0 (2026-05-01) ------------------ * Removed tensorflow and keras and replaced with torch and torchvision. diff --git a/docs/index.rst b/docs/index.rst index c27acfb9..86541e08 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,6 +38,8 @@ undissociated tissue sample. Latest Additions ---------------- +.. include:: release_notes/1.4.1.rst + .. include:: release_notes/1.4.0.rst .. include:: release_notes/1.3.0.rst diff --git a/docs/release_notes/1.4.1.rst b/docs/release_notes/1.4.1.rst new file mode 100644 index 00000000..e303670a --- /dev/null +++ b/docs/release_notes/1.4.1.rst @@ -0,0 +1,11 @@ +1.4.1 `2026-06-16` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. rubric:: Features +* Improved speed of random spot generation used in permutation testing. + +.. rubric:: Bugs + +* Fixed up docstring and types for various CCI methods. +* Fixes for issues #350 and #300 with incorrect types. +* Upgrade libraries: pillow>=12.0.0,<13.0 \ No newline at end of file diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index cd332bca..062be2bd 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -1,6 +1,8 @@ Release Notes =================================================== +.. include:: 1.4.1.rst + .. include:: 1.4.0.rst .. include:: 1.3.0.rst diff --git a/pyproject.toml b/pyproject.toml index 21bb1e26..4a71be73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "stlearn" -version = "1.4.0" +version = "1.4.1" authors = [ { name = "Genomics and Machine Learning Lab", email = "andrew.newman@uq.edu.au" }, ] @@ -20,7 +20,7 @@ dependencies = [ "leidenalg>=0.11.0", "numba>=0.58.1", "numpy>=2.4.0", - "pillow>=11.0.0,<12.0", + "pillow>=12.0.0,<13.0", "scanpy>=1.12.0", "scikit-image>=0.22.0", "pandas>=2.3.0", diff --git a/stlearn/__init__.py b/stlearn/__init__.py index d3d34186..a3e270dd 100644 --- a/stlearn/__init__.py +++ b/stlearn/__init__.py @@ -2,7 +2,7 @@ __author__ = """Genomics and Machine Learning Lab""" __email__ = "andrew.newman@uq.edu.au" -__version__ = "1.4.0" +__version__ = "1.4.1" from . import add, datasets, em, pl, pp, spatial, tl, types from ._settings import settings diff --git a/stlearn/adds/add_mask.py b/stlearn/adds/add_mask.py index 399807de..a483df42 100644 --- a/stlearn/adds/add_mask.py +++ b/stlearn/adds/add_mask.py @@ -38,9 +38,11 @@ def add_mask( library_id = next(iter(adata.uns["spatial"].keys())) quality = adata.uns["spatial"][library_id]["use_quality"] except Exception as e: - raise KeyError("""\ + raise KeyError( + """\ Please read ST data first and try again - """) from e + """ + ) from e if imgpath is not None and os.path.isfile(imgpath): try: @@ -59,13 +61,17 @@ def add_mask( adata.uns["mask_image"][library_id][key][quality] = img print("Added tissue mask to the object!") except Exception as e: - raise ValueError(f"""\ + raise ValueError( + f"""\ {imgpath!r} does not end on a valid extension. - """) from e + """ + ) from e else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {imgpath!r} does not end on a valid extension. - """) + """ + ) return adata if copy else None @@ -128,9 +134,11 @@ def apply_mask( library_id = next(iter(adata.uns["spatial"].keys())) quality = adata.uns["spatial"][library_id]["use_quality"] except Exception as e: - raise KeyError("""\ + raise KeyError( + """\ Please read ST data first and try again - """) from e + """ + ) from e if masks == "all": masks = list(adata.uns["mask_image"][library_id].keys()) @@ -144,9 +152,11 @@ def apply_mask( try: mask_image = adata.uns["mask_image"][library_id][mask][quality] except Exception as e: - raise KeyError(f"""\ + raise KeyError( + f"""\ Please load mask {mask} images first and try again - """) from e + """ + ) from e # Normalise to uint8 if the image is float in [0, 1] if mask_image.dtype.kind == "f" and mask_image.max() <= 1.0: @@ -157,9 +167,11 @@ def apply_mask( elif select == "white": mask_image = np.where(mask_image >= 155, 1, 0) else: - raise ValueError("""\ + raise ValueError( + """\ Only support black and white mask yet. - """) + """ + ) mask_image_2d = mask_image.mean(axis=2) def apply_spot_mask(x, _mask=mask, _i=i, _mask_image_2d=mask_image_2d): diff --git a/stlearn/adds/image.py b/stlearn/adds/image.py index 1db531a4..8b1792a5 100644 --- a/stlearn/adds/image.py +++ b/stlearn/adds/image.py @@ -71,11 +71,15 @@ def image( print("Added tissue image to the object!") except Exception as e: - raise ValueError(f"""\ + raise ValueError( + f"""\ {imgpath!r} does not end on a valid extension. - """) from e + """ + ) from e else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {imgpath!r} does not end on a valid extension. - """) + """ + ) return adata if copy else None diff --git a/stlearn/spatial/sme/_weighting_matrix.py b/stlearn/spatial/sme/_weighting_matrix.py index dab91ce9..aa15e9d7 100644 --- a/stlearn/spatial/sme/_weighting_matrix.py +++ b/stlearn/spatial/sme/_weighting_matrix.py @@ -36,9 +36,11 @@ def row_col_by_platform( array_col = adata.obs_names.map(lambda x: x.split("x")[0]) rate = 1.5 else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {platform!r} does not support. - """) + """ + ) regression = LinearRegression() reg_row: LinearRegression = regression.fit(array_row.values.reshape(-1, 1), img_row) # type: ignore reg_col: LinearRegression = regression.fit(array_col.values.reshape(-1, 1), img_col) # type: ignore diff --git a/stlearn/spatial/sme/pseudo_spot.py b/stlearn/spatial/sme/pseudo_spot.py index 762144cc..5d154fa7 100644 --- a/stlearn/spatial/sme/pseudo_spot.py +++ b/stlearn/spatial/sme/pseudo_spot.py @@ -172,9 +172,11 @@ def pseudo_spot( ).reset_index() obs_df.drop_duplicates(subset=["array_row", "array_col"], keep="last") else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {platform!r} does not support. - """) + """ + ) reg_row = LinearRegression().fit(array_row.values.reshape(-1, 1), img_row) diff --git a/stlearn/spatial/sme/sme_impute0.py b/stlearn/spatial/sme/sme_impute0.py index 49980c77..cfb96b04 100644 --- a/stlearn/spatial/sme/sme_impute0.py +++ b/stlearn/spatial/sme/sme_impute0.py @@ -61,9 +61,11 @@ def sme_impute0( elif isinstance(adata.X, pd.Dataframe): count_embed = adata.X.values else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {type(adata.X)} is not a valid type. - """) + """ + ) else: count_embed = adata.obsm[use_data] diff --git a/stlearn/spatial/sme/sme_normalize.py b/stlearn/spatial/sme/sme_normalize.py index 4c7291d7..783da2e1 100644 --- a/stlearn/spatial/sme/sme_normalize.py +++ b/stlearn/spatial/sme/sme_normalize.py @@ -61,9 +61,11 @@ def sme_normalize( elif isinstance(adata.X, pd.Dataframe): count_embed = adata.X.values else: - raise ValueError(f"""\ + raise ValueError( + f"""\ {type(adata.X)} is not a valid type. - """) + """ + ) else: count_embed = adata.obsm[use_data] diff --git a/stlearn/tl/cci/analysis.py b/stlearn/tl/cci/analysis.py index 6ae03a83..6178a19b 100644 --- a/stlearn/tl/cci/analysis.py +++ b/stlearn/tl/cci/analysis.py @@ -6,6 +6,7 @@ import numba import numpy as np +import numpy.typing as npt import pandas as pd from anndata import AnnData from statsmodels.stats.multitest import multipletests @@ -25,7 +26,9 @@ # Functions related to Ligand-Receptor interactions -def load_lrs(names: str | list | None = None, species: str = "human") -> np.ndarray: +def load_lrs( + names: str | list | None = None, species: str = "human" +) -> npt.NDArray[np.str_]: """Loads inputted LR database, & concatenates into consistent database set of pairs without duplicates. If None loads 'connectomeDB2020_lit'. @@ -66,20 +69,20 @@ def load_lrs(names: str | list | None = None, species: str = "human") -> np.ndar + genes2[i][0] + genes2[i][1:].lower() for i in range(len(lrs_full)) - ] + ], ) return lrs_full_arr def grid( - adata, + adata: AnnData, n_row: int = 10, n_col: int = 10, use_label: str | None = None, n_cpus: int | None = None, verbose: bool = True, -): +) -> AnnData: """Creates a new anndata representing a gridded version of the data; can be used upstream of CCI pipeline. NOTE: intended use is for single cell spatial data, not Visium or other lower resolution tech. @@ -170,13 +173,15 @@ def grid( if use_label is not None and cell_info is not None and cell_set is not None: grid_data.uns[use_label] = pd.DataFrame( - cell_info, index=grid_data.obs_names.values.astype(str), columns=cell_set + cell_info, + index=grid_data.obs_names.values.astype(str), + columns=cell_set, ) max_indices = np.apply_along_axis(np.argmax, 1, cell_info) grid_data.obs[use_label] = [cell_set[index] for index in max_indices] grid_data.obs[use_label] = grid_data.obs[use_label].astype("category") grid_data.obs[use_label] = grid_data.obs[use_label].cat.set_categories( - adata.obs[use_label].cat.categories + adata.obs[use_label].cat.categories, ) if f"{use_label}_colors" in adata.uns: grid_data.uns[f"{use_label}_colors"] = adata.uns[f"{use_label}_colors"] @@ -195,7 +200,7 @@ def grid( def run( adata: AnnData, - lrs: np.ndarray, + lrs: npt.NDArray[np.str_], min_spots: int = 10, distance: float | None = None, n_pairs: int = 1000, @@ -203,12 +208,12 @@ def run( use_label: str | None = None, adj_method: str = "fdr_bh", pval_adj_cutoff: float = 0.05, - min_expr: float = 0, + min_expr: float = 0.0, save_bg: bool = False, neg_binom: bool = False, random_state: int = 0, verbose: bool = True, -): +) -> None: """Performs stLearn LR analysis. Parameters @@ -220,7 +225,7 @@ def run( min_spots: int Minimum number of spots with an LR score for an LR to be considered for further testing. - distance: int + distance: float Distance to determine the neighbours (default [None] is immediately adjacent neighbours if using Visium), distance=0 means within spot (only for non-single-cell spatial data). @@ -254,7 +259,7 @@ def run( Whether print dialogue to user during run-time. Returns -------- - adata: AnnData + None Relevant information stored: adata.uns['lr_summary'] Summary of significant spots detected per LR, @@ -285,7 +290,7 @@ def run( "Detected '_' within some gene names, which breaks " + "internal string handling for the lrs in format 'L_R'.\n" + "Recommend to rename adata.var_names or remove these " - + f"genes from adata:\n {prob_genes}" + + f"genes from adata:\n {prob_genes}", ) # Calculating neighbour & storing # @@ -307,7 +312,9 @@ def run( neigh_bcs = [adata.obs_names[index] for index in neigh_indices] spot_neigh_bcs.append(",".join(neigh_bcs)) spot_neigh_bcs_df = pd.DataFrame( - spot_neigh_bcs, index=spot_neighs_df.index, columns=["neighbour_bcs"] + spot_neigh_bcs, + index=spot_neighs_df.index, + columns=["neighbour_bcs"], ) # Important to store barcodes in-case adata subsetted # adata.obsm["spot_neigh_bcs"] = spot_neigh_bcs_df @@ -315,7 +322,7 @@ def run( if verbose: print( "Spot neighbour indices stored in adata.obsm['spot_neighbours'] " - "& adata.obsm['spot_neigh_bcs']." + "& adata.obsm['spot_neigh_bcs'].", ) # Conduct with cell heterogeneity info if label_transfer provided # @@ -391,18 +398,18 @@ def adj_pvals( ------- adata: AnnData Adjusts all of the LR results; warning, does not adjust - celltype-celltype results from running ran st.tl.run_cci downstream. + celltype-celltype results from running st.tl.run_cci downstream. """ if "lr_summary" not in adata.uns: raise Exception("Need to run st.tl.cci.run first.") - scores = adata.obsm["lr_scores"] - sig_scores = scores.copy() + lr_scores = adata.obsm["lr_scores"] + sig_scores = lr_scores.copy() ps = adata.obsm["p_vals"] padjs = np.ones(ps.shape) if correct_axis == "spot": for spot_i in range(ps.shape[0]): - lr_indices = np.where(scores[spot_i, :] > 0)[0] + lr_indices = np.where(lr_scores[spot_i, :] > 0)[0] if len(lr_indices) > 0: spot_ps = ps[spot_i, lr_indices] spot_padjs = multipletests(spot_ps, method=adj_method)[1] @@ -410,7 +417,7 @@ def adj_pvals( sig_scores[spot_i, lr_indices[spot_padjs >= pval_adj_cutoff]] = 0 elif correct_axis == "LR": for lr_i in range(ps.shape[1]): - spot_indices = np.where(scores[:, lr_i] > 0)[0] + spot_indices = np.where(lr_scores[:, lr_i] > 0)[0] if len(spot_indices) > 0: lr_ps = ps[spot_indices, lr_i] spot_padjs = multipletests(lr_ps, method=adj_method)[1] @@ -421,7 +428,7 @@ def adj_pvals( sig_scores[padjs >= pval_adj_cutoff] = 0 else: raise Exception( - "Invalid correct_axis input, must be one of: 'LR', 'spot', or None" + "Invalid correct_axis input, must be one of: 'LR', 'spot', or None", ) # Counting spots significant per lr # @@ -434,7 +441,7 @@ def adj_pvals( new_order = np.argsort(-adata.uns["lr_summary"].loc[:, "n_spots_sig"].values) adata.uns["lr_summary"] = adata.uns["lr_summary"].iloc[new_order, :] print("Updated adata.uns[lr_summary]") - scores_ordered = scores[:, new_order] + scores_ordered = lr_scores[:, new_order] sig_scores_ordered = sig_scores[:, new_order] ps_ordered = ps[:, new_order] padjs_ordered = padjs[:, new_order] @@ -618,7 +625,7 @@ def run_cci( ran_sig = False if not ran_lr else "n_spots_sig" in adata.uns["lr_summary"].columns if not ran_lr or not ran_sig: raise Exception( - "No LR results testing results found, please run st.tl.cci.run first" + "No LR results testing results found, please run st.tl.cci.run first", ) # Ensuring compatibility with current way of adding label_transfer to object @@ -630,7 +637,7 @@ def run_cci( # Getting the cell/tissue types that we are actually testing # if obs_key not in adata.obs: raise Exception( - f"Missing {obs_key} from adata.obs, need this even if using mixture mode." + f"Missing {obs_key} from adata.obs, need this even if using mixture mode.", ) tissue_types = adata.obs[obs_key].values.astype(str) all_set = np.unique(tissue_types) @@ -640,11 +647,11 @@ def run_cci( if not mix_mode and spot_mixtures: print( f"Warning: specified spot_mixtures but no deconvolution data in " - f"adata.uns['{uns_key}'].\nFalling back to discrete mode." + f"adata.uns['{uns_key}'].\nFalling back to discrete mode.", ) if mix_mode: # Checking the deconvolution results stored correctly. cols_present = np.all( - [cell_type in adata.uns[uns_key] for cell_type in all_set] + [cell_type in adata.uns[uns_key] for cell_type in all_set], ) rows_present = np.all(adata.uns[uns_key].index.values == adata.obs_names.values) msg = f"Cell type scores misformatted in adata.uns[{uns_key}]:\n" @@ -684,7 +691,7 @@ def run_cci( raise Exception( "No LR pairs returned with current filtering params; \n" "may need to adjust min_spots, sig_spots parameters, " - "or re-run st.tl.cci.run with more relaxed parameters." + "or re-run st.tl.cci.run with more relaxed parameters.", ) lr_expr = adata[:, lr_genes].to_df() @@ -764,10 +771,14 @@ def run_cci( adata.uns["lr_summary"][f"n-spot_cci_{use_label}"] = lr_n_spot_cci adata.uns["lr_summary"][f"n-spot_cci_sig_{use_label}"] = lr_n_spot_cci_sig adata.uns[f"lr_cci_{use_label}"] = pd.DataFrame( - all_matrix, index=all_set, columns=all_set + all_matrix, + index=all_set, + columns=all_set, ) adata.uns[f"lr_cci_raw_{use_label}"] = pd.DataFrame( - raw_matrix, index=all_set, columns=all_set + raw_matrix, + index=all_set, + columns=all_set, ) adata.uns[f"per_lr_cci_{use_label}"] = per_lr_cci adata.uns[f"per_lr_cci_pvals_{use_label}"] = per_lr_cci_pvals @@ -775,9 +786,9 @@ def run_cci( if verbose: print( f"Significant counts of cci_rank interactions for all LR pairs in " - f"{f'data.uns[lr_cci_{use_label}]'}" + f"{f'data.uns[lr_cci_{use_label}]'}", ) print( f"Significant counts of cci_rank interactions for each LR pair " - f"stored in dictionary {f'data.uns[per_lr_cci_{use_label}]'}" + f"stored in dictionary {f'data.uns[per_lr_cci_{use_label}]'}", ) diff --git a/stlearn/tl/cci/base.py b/stlearn/tl/cci/base.py index 625c5985..736af808 100644 --- a/stlearn/tl/cci/base.py +++ b/stlearn/tl/cci/base.py @@ -1,4 +1,5 @@ import numpy as np +import numpy.typing as npt import pandas as pd import scipy as sc import scipy.spatial as spatial @@ -9,68 +10,6 @@ from .het import create_grids -def lr( - adata: AnnData, - use_lr: str = "cci_lr", - distance: float | None = None, - verbose: bool = True, - neighbours: list | None = None, - fast: bool = True, -) -> AnnData: - """Calculate the proportion of known ligand-receptor co-expression among the - neighbouring spots or within spots - Parameters - ---------- - adata: AnnData - The data object to scan - use_lr: str - object to keep the result (default: adata.uns['cci_lr']) - distance: float - Distance to determine the neighbours (default: closest), distance=0 means - within spot. If distance is None gets it from adata.uns["spatial"] - neighbours: list - List of the neighbours for each spot, if None then computed. Useful for - speeding up function. - fast: bool - Whether to use the fast implementation or not. - - Returns - ------- - adata: AnnData - The data object including the results - """ - - # automatically calculate distance if not given, won't overwrite distance=0 - # which is within-spot - distance = calc_distance(adata, distance) - - # # expand the LR pairs list by swapping ligand-receptor positions - lr_pairs = adata.uns["lr"].copy() - spot_lr1 = get_spot_lrs(adata, lr_pairs=lr_pairs, lr_order=True) - spot_lr2 = get_spot_lrs(adata, lr_pairs=lr_pairs, lr_order=False) - if verbose: - print("Altogether " + str(spot_lr1.shape[1]) + " valid L-R pairs") - - # get neighbour spots for each spot according to the specified distance - if neighbours is None: - neighbours = calc_neighbours(adata, distance, index=fast) - - # Calculating the scores, can have either the fast or the pandas version # - if fast: - adata.obsm[use_lr] = lr_core(spot_lr1.values, spot_lr2.values, neighbours, 0) - else: - adata.obsm[use_lr] = lr_pandas(spot_lr1, spot_lr2, neighbours) - - if verbose: - print( - "L-R interactions with neighbours are counted and stored into adata.obsm['" - + use_lr - + "']" - ) - - # return adata - - def calc_distance(adata: AnnData, distance: float | None) -> float: """Automatically calculate distance if not given, won't overwrite \ distance=0 which is within-spot. @@ -103,13 +42,13 @@ def calc_distance(adata: AnnData, distance: float | None) -> float: def get_lrs_scores( adata: AnnData, - lrs: np.ndarray, + lrs: npt.NDArray[np.str_], neighbours: np.ndarray, het_vals: np.ndarray, min_expr: float, filter_pairs: bool = True, - spot_indices: np.ndarray | None = None, -): + spot_indices: npt.NDArray[np.int32] | None = None, +) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.str_]]: """Gets the scores for the indicated set of LR pairs & the heterogeneity values. Parameters ---------- @@ -126,14 +65,19 @@ def get_lrs_scores( have reasonable score. filter_pairs: bool Whether to filter to valid pairs or not. - spot_indices: np.ndarray - Array of integers speci + spot_indices: npt.NDArray[np.int32] + Subset of spots to score, given as their integer row positions. Returns ------- - lrs: np.ndarray lr pairs from the database in format ['L1_R1', 'LN_RN'] + lr_scores: npt.NDArray[np.float64] + Shape (n_scored_spots, n_pairs). LR score for each scored spot (rows in + the order of spot_indices) and each ligand-receptor pair (columns). + new_lrs: npt.NDArray[np.str_] + Shape (n_pairs,). Ligand-receptor pair labels in 'L_R' format + (e.g. 'L1_R1'), column-aligned with lr_scores. """ if spot_indices is None: - spot_indices = np.array(list(range(len(adata))), dtype=np.int32) + spot_indices = np.arange(len(adata), dtype=np.int32) spot_lr1s = get_spot_lrs( adata, lr_pairs=lrs, lr_order=True, filter_pairs=filter_pairs @@ -269,94 +213,55 @@ def calc_neighbours( return typed_neighs -@njit +@njit(cache=True) def lr_core( - spot_lr1: np.ndarray, - spot_lr2: np.ndarray, + spot_lr1: npt.NDArray[np.float64], + spot_lr2: npt.NDArray[np.float64], neighbours: List, min_expr: float, - spot_indices: np.array, -) -> np.ndarray: - """Calculate the lr scores for each spot. + spot_indices: npt.NDArray[np.int32], +) -> npt.NDArray[np.float64]: + """Calculate the mean of lr scores for each spot. Parameters ---------- - spot_lr1: np.ndarray - Spots*Ligands - spot_lr2: np.ndarray - Spots*Receptors - neighbours: numba.typed.List + spot_lr1: npt.NDArray[np.float64] + Spots x 2 block for one LR pair, columns in (ligand, receptor) order. + Column j is the gene whose expression is taken at the spot itself. + spot_lr2: npt.NDArray[np.float64] + Spots x 2 block for the same pair in reversed (receptor, ligand) order. + Column j is the partner gene whose expression is averaged over neighbours. + neighbours: List List of np.array's indicating neighbours by indices for each spot. min_expr: float Minimum expression for gene to be considered expressed. + spot_indices: npt.NDArray[np.int32] + Subset of spots to score, given as their integer row positions. Returns ------- - lr_scores: numpy.ndarray - Cells*LR-scores. - """ - # Calculating mean of lr2 expressions from neighbours of each spot - nb_lr2 = np.zeros((len(spot_indices), spot_lr2.shape[1]), np.float64) - for i in range(len(spot_indices)): - spot_i = spot_indices[i] - nb_expr = spot_lr2[neighbours[spot_i], :] - if nb_expr.shape[0] != 0: # Accounting for no neighbours - nb_expr_mean = nb_expr.sum(axis=0) / nb_expr.shape[0] - else: - nb_expr_mean = nb_expr.sum(axis=0) - nb_lr2[i, :] = nb_expr_mean - - scores = ( - spot_lr1[spot_indices, :] * (nb_lr2 > min_expr) - + (spot_lr1[spot_indices, :] > min_expr) * nb_lr2 - ) - spot_lr = scores.sum(axis=1) - return spot_lr / 2 - - -def lr_pandas( - spot_lr1: np.ndarray, - spot_lr2: np.ndarray, - neighbours: list, -) -> np.ndarray: - """Calculate the lr scores for each spot. - Parameters - ---------- - spot_lr1 (pd.DataFrame): - Cells*Ligands - spot_lr2 (pd.DataFrame): - Cells*Receptors - neighbours (list): - List of neighbours by indices for each spot. - Returns - ------- - lr_scores (numpy.ndarray): - Cells*LR-scores. + lr_scores: npt.NDArray[np.float64] + 1-D, one score per spot in spot_indices (forward/reverse directions averaged). """ - - # function to calculate mean of lr2 expression between neighbours or within - # spot (distance==0) for each spot - def mean_lr2(x): - # get lr2 expressions from the neighbour(s) - n_spots = neighbours[spot_lr2.index.tolist().index(x.name)] - nbs = spot_lr2.loc[n_spots, :] - if nbs.shape[0] > 0: # if neighbour exists - return nbs.sum() / nbs.shape[0] - else: - return 0 - - # mean of lr2 expressions from neighbours of each spot - nb_lr2 = spot_lr2.apply(mean_lr2, axis=1) - - # check whether neighbours exist - try: - nb_lr2.shape[1] - except: - raise ValueError("No neighbours found within given distance.") from None - - # keep value of nb_lr2 only when lr1 is also expressed on the spots - spot_lr = pd.DataFrame( - spot_lr1.values * (nb_lr2.values > 0) + (spot_lr1.values > 0) * nb_lr2.values, - ).sum(axis=1) - return spot_lr.values / 2 + n_rows = len(spot_indices) + n_cols = spot_lr2.shape[1] + spot_lr = np.zeros(n_rows, np.float64) + # For each spot (each spot is independent) + for i in range(n_rows): + si = spot_indices[i] + spot_neighbour = neighbours[si] + n_spot_neighbours = len(spot_neighbour) + total = 0.0 + # For each LR pair. + for j in range(n_cols): + mean_lr2 = 0.0 + if n_spot_neighbours > 0: + s = 0.0 + for k in range(n_spot_neighbours): + s += spot_lr2[spot_neighbour[k], j] + mean_lr2 = s / n_spot_neighbours + l1 = spot_lr1[si, j] + total += l1 * (mean_lr2 > min_expr) + (l1 > min_expr) * mean_lr2 + spot_lr[i] = total / 2.0 + return spot_lr @njit(parallel=True) @@ -366,8 +271,8 @@ def get_scores( neighbours: List, het_vals: np.ndarray, min_expr: float, - spot_indices: np.ndarray, -) -> np.ndarray: + spot_indices: npt.NDArray[np.int32], +) -> npt.NDArray[np.float64]: """Calculates the scores. Parameters ---------- @@ -377,10 +282,12 @@ def get_scores( Spots*GeneOrder2, in format r1, l1, ... rn, ln het_vals: np.ndarray Spots*Het counts - neighbours: numba.typed.List + neighbours: List List of np.array's indicating neighbours by indices for each spot. min_expr: float Minimum expression for gene to be considered expressed. + spot_indices: npt.NDArray[np.int32] + Subset of spots to score, given as their integer row positions. Returns ------- spot_scores: np.ndarray diff --git a/stlearn/tl/cci/base_grouping.py b/stlearn/tl/cci/base_grouping.py index fdec802a..8a1e7cb7 100644 --- a/stlearn/tl/cci/base_grouping.py +++ b/stlearn/tl/cci/base_grouping.py @@ -4,6 +4,7 @@ import matplotlib.pyplot as plt import numpy as np +import numpy.typing as npt import pandas as pd import seaborn as sb from anndata import AnnData @@ -16,7 +17,7 @@ def get_hotspots( adata: AnnData, lr_scores: np.ndarray, - lrs: np.array, + lrs: npt.NDArray[np.str_], eps: float, quantile=0.05, verbose=True, @@ -32,7 +33,7 @@ def get_hotspots( The data object lr_scores: np.ndarray LR_pair*Spots containing the LR scores. - lrs: np.array + lrs: npt.NDArray[np.str_] The LR_pairs, in-line with the rows of scores. eps: float The eps parameter used in DBScan to get the number of clusters. @@ -48,7 +49,13 @@ def get_hotspots( """ coors = adata.obs[["imagerow", "imagecol"]].values lr_summary, lr_hot_scores = hotspot_core( - lr_scores, lrs, coors, eps, quantile, plot_diagnostics, adata + lr_scores, + lrs, + coors, + eps, + quantile, + plot_diagnostics, + adata, ) if plot_diagnostics and show_plot: # Showing the diagnostic plotting # @@ -59,12 +66,15 @@ def get_hotspots( # Clustering the LR pairs to obtain a set of clusters so to order within # each cluster clusterer = AgglomerativeClustering( - affinity="euclidean", linkage="ward", distance_threshold=10, n_clusters=None + metric="euclidean", + linkage="ward", + distance_threshold=10, + n_clusters=None, ) clusterer.fit(lr_hot_scores > 0) dist_cutoff = np.quantile(clusterer.distances_, 0.98) clusterer = AgglomerativeClustering( - affinity="euclidean", + metric="euclidean", linkage="ward", distance_threshold=dist_cutoff, n_clusters=None, @@ -121,7 +131,7 @@ def get_hotspots( print("\tSummary values of lrs in adata.uns['lr_summary'].") print( "\tMatrix of lr scores in same order as the summary in " - + "adata.obsm['lr_scores']." + + "adata.obsm['lr_scores'].", ) print("\tMatrix of the hotspot scores in adata.obsm['lr_hot_scores'].") print("\tMatrix of the mean LR cluster scores in adata.obsm['cluster_scores'].") @@ -153,7 +163,7 @@ def hotspot_core( lr_mean_scores = np.apply_along_axis(non_zero_mean, 1, score_copy) lr_quant_values = np.quantile(lr_mean_scores, lr_quantiles) quant_lrs = np.array( - [lrs[lr_mean_scores == quant] for quant in lr_quant_values] + [lrs[lr_mean_scores == quant] for quant in lr_quant_values], ) _, axes = plt.subplots(6, 4, figsize=(20, 15)) @@ -178,7 +188,9 @@ def hotspot_core( coor_ = coors[spot_bool, :] clusters = DBSCAN( - min_samples=2, eps=eps, metric="manhattan" + min_samples=2, + eps=eps, + metric="manhattan", ).fit_predict(coor_) score = len(np.unique(clusters)) * (np.mean(lr_score_[spot_bool])) ** 2 cutoff_scores.append(score) diff --git a/stlearn/tl/cci/het.py b/stlearn/tl/cci/het.py index 9dc0defb..21b52699 100644 --- a/stlearn/tl/cci/het.py +++ b/stlearn/tl/cci/het.py @@ -30,7 +30,7 @@ def count( The cell type results to use in counting use_het: The storage place for result - distance: int + distance: float Distance to determine the neighbours (default is the nearest neighbour), distance=0 means within spot @@ -95,8 +95,8 @@ def count( def get_edges( adata: AnnData, - L_bool: np.ndarray, - R_bool: np.ndarray, + l_bool: np.ndarray, + r_bool: np.ndarray, sig_bool: np.ndarray, ): """Gets a list edges representing significant interactions. @@ -105,9 +105,9 @@ def get_edges( ---------- adata : AnnData Annotated data object containing spatial transcriptomics data. - L_bool : np.ndarray of bool, shape (n_spots,) + l_bool : np.ndarray of bool, shape (n_spots,) Boolean array indicating spots where the ligand is expressed. - R_bool : np.ndarray of bool, shape (n_spots,) + r_bool : np.ndarray of bool, shape (n_spots,) Boolean array indicating spots where the receptor is expressed. sig_bool : np.ndarray of bool, shape (n_spots,) Boolean array indicating spots with significant ligand-receptor interactions. @@ -122,16 +122,16 @@ def get_edges( neighbourhood_bcs, neighbourhood_indices = get_neighbourhoods(adata) # Getting the edges to draw in-between # - L_spot_indices = np.where(np.logical_and(L_bool, sig_bool))[0] - R_spot_indices = np.where(np.logical_and(R_bool, sig_bool))[0] + l_spot_indices = np.where(np.logical_and(l_bool, sig_bool))[0] + r_spot_indices = np.where(np.logical_and(r_bool, sig_bool))[0] # To keep the get_between_spot_edge_array function happy # cell_data = np.ones((1, len(sig_bool)))[0, :].astype(np.float64) # Retrieving the edges # - gene_bools = [R_bool, L_bool] + gene_bools = [r_bool, l_bool] edge_list = [] - for i, spot_indices in enumerate([L_spot_indices, R_spot_indices]): + for i, spot_indices in enumerate([l_spot_indices, r_spot_indices]): # Subsetting to the relevant neighbourhoods # neigh_bcs = List() neigh_indices = List() diff --git a/stlearn/tl/cci/perm_utils.py b/stlearn/tl/cci/perm_utils.py index 37791d7e..2e397a18 100644 --- a/stlearn/tl/cci/perm_utils.py +++ b/stlearn/tl/cci/perm_utils.py @@ -1,7 +1,9 @@ import numpy as np +import numpy.typing as npt import pandas as pd from numba import njit, prange -from numba.typed import List +from numba.core import types +from numba.typed import Dict, List from scipy.spatial.distance import canberra from sklearn.preprocessing import MinMaxScaler @@ -264,23 +266,54 @@ def get_similar_genes_fast( return similar_genes -@njit -def gen_rand_pairs(genes1: np.ndarray, genes2: np.ndarray, n_pairs: int, seed: int): - """Generates random pairs of genes.""" +def gen_rand_pairs( + genes1: npt.NDArray[np.str_], + genes2: npt.NDArray[np.str_], + n_pairs: int, + seed: int, +) -> npt.NDArray[np.str_]: + """Generate unique random gene pairs for building background LR scores. + + Each pair is formed by drawing one gene from genes1 and one from genes2, + formatted as 'gene1_gene2'. Self-pairs (the same gene on both sides) and + duplicate pairs are rejected, so every returned pair is unique and its two + genes are distinct. + + Parameters + ---------- + genes1: npt.NDArray[np.str_] + Candidate genes for the first (ligand) position of each pair. + genes2: npt.NDArray[np.str_] + Candidate genes for the second (receptor) position of each pair. + n_pairs: int + Number of unique pairs to generate. + seed: int + Seed for the random generator, for reproducible pair selection. + """ + n_possible = len(genes1) * len(genes2) - np.intersect1d(genes1, genes2).size + if n_pairs > n_possible: + raise ValueError( + f"Requested {n_pairs} unique pairs but only {n_possible} are " + f"possible from {len(genes1)}x{len(genes2)} genes." + ) + return np.array(list(_gen_rand_pairs(genes1, genes2, n_pairs, seed))) + +@njit +def _gen_rand_pairs(genes1, genes2, n_pairs, seed): np.random.seed(seed) # noqa: NPY002 (numba requires legacy API) rand_pairs = List() + seen = Dict.empty(types.unicode_type, types.boolean) # O(1) membership for _j in range(0, n_pairs): l_rand = np.random.choice(genes1, 1)[0] # noqa: NPY002 r_rand = np.random.choice(genes2, 1)[0] # noqa: NPY002 rand_pair = "_".join([l_rand, r_rand]) - while rand_pair in rand_pairs or l_rand == r_rand: + while rand_pair in seen or l_rand == r_rand: # was: in rand_pairs l_rand = np.random.choice(genes1, 1)[0] # noqa: NPY002 r_rand = np.random.choice(genes2, 1)[0] # noqa: NPY002 rand_pair = "_".join([l_rand, r_rand]) - rand_pairs.append(rand_pair) - + seen[rand_pair] = True return rand_pairs diff --git a/stlearn/tl/cci/permutation.py b/stlearn/tl/cci/permutation.py index 610e2f6d..05395234 100644 --- a/stlearn/tl/cci/permutation.py +++ b/stlearn/tl/cci/permutation.py @@ -1,6 +1,5 @@ -from typing import Any - import numpy as np +import numpy.typing as npt import pandas as pd import scipy import statsmodels.api as sm @@ -14,8 +13,8 @@ def perform_spot_testing( adata: AnnData, - lr_scores: np.ndarray, - lrs: np.ndarray, + lr_scores: npt.NDArray[np.float64], + lrs: npt.NDArray[np.str_], n_pairs: int, neighbours: List, het_vals: np.ndarray, @@ -23,11 +22,24 @@ def perform_spot_testing( adj_method: str = "fdr_bh", pval_adj_cutoff: float = 0.05, verbose: bool = True, - save_bg=False, - neg_binom=False, - quantiles=(0.5, 0.75, 0.85, 0.9, 0.95, 0.97, 0.98, 0.99, 0.995, 0.9975, 0.999, 1), + save_bg: bool = False, + neg_binom: bool = False, + quantiles: tuple[float, ...] = ( + 0.5, + 0.75, + 0.85, + 0.9, + 0.95, + 0.97, + 0.98, + 0.99, + 0.995, + 0.9975, + 0.999, + 1, + ), random_state: int = 0, -): +) -> None: """Calls significant spots by creating random gene pairs with similar expression to given LR pair; only generate background for spots which have score for given LR. @@ -41,14 +53,14 @@ def perform_spot_testing( n_genes = round(np.sqrt(n_pairs) * 2) if len(genes) < n_genes: print( - f"Exiting since need atleast {n_genes} genes to generate {n_pairs} pairs." + f"Exiting since need atleast {n_genes} genes to generate {n_pairs} pairs.", ) return if n_pairs < 100: print( "Exiting since n_pairs<100, need much larger number of pairs to " - "get accurate backgrounds (e.g. 1000)." + "get accurate backgrounds (e.g. 1000).", ) return @@ -62,7 +74,11 @@ def perform_spot_testing( lrs, [col for col in lr_feats.columns if "R_" in col] ].values candidate_quants = np.apply_along_axis( - np.quantile, 0, candidate_expr, q=quantiles, method="nearest" + np.quantile, + 0, + candidate_expr, + q=quantiles, + method="nearest", ) # Ensuring consistent typing to prevent numba errors # l_quants = l_quants.astype("= lr_scores[spot_index, lr_j])[ 0 - ] + ], ) n_greater = n_greater if n_greater != 0 else 1 # pseudocount pvals[spot_index, lr_j] = n_greater / background.shape[1] @@ -131,14 +145,15 @@ def perform_spot_testing( else: # Fitting NB per LR lr_j_scores = lr_scores[spot_indices, lr_j] bg_ = background.ravel() - bg_wScore = np.array(list(lr_j_scores) + list(bg_)) + bg_w_score = np.array(list(lr_j_scores) + list(bg_)) # 1) rounding discretisation # First multiple to get minimum value to be one before rounding # - bg_1 = bg_wScore * (1 / min(bg_wScore[bg_wScore != 0])) - bg_1 = np.round(bg_1) - lr_j_scores_1 = bg_1[0 : len(lr_j_scores)] - bg_1 = bg_1[len(lr_j_scores) : len(bg_1)] + scale = 1.0 / min(bg_w_score[bg_w_score != 0]) + scaled = np.round(bg_w_score * scale) + n_obs = len(lr_j_scores) + lr_j_scores_1 = scaled[:n_obs] + bg_1 = scaled[n_obs:] # Getting the pvalue from negative binomial approach round_pvals, _, _, _ = get_stats( @@ -146,7 +161,6 @@ def perform_spot_testing( bg_1, len(bg_1), neg_binom=True, - return_negbinom_params=False, ) pvals[spot_indices, lr_j] = round_pvals for spot_index in spot_indices: @@ -159,7 +173,8 @@ def perform_spot_testing( lr_indices = spot_lr_indices[spot_i] if len(lr_indices) != 0: pvals_adj[spot_i, lr_indices] = multipletests( - pvals[spot_i, lr_indices], method=adj_method + pvals[spot_i, lr_indices], + method=adj_method, )[1] log10pvals_adj[spot_i, :] = -np.log10(pvals_adj[spot_i, :]) @@ -200,7 +215,7 @@ def perform_spot_testing( if verbose: print( "\nPer-spot results in adata.obsm have columns in same order as " - "rows in adata.uns['lr_summary']." + "rows in adata.uns['lr_summary'].", ) print("Summary of LR results in adata.uns['lr_summary'].") @@ -212,8 +227,12 @@ def get_stats( neg_binom: bool = False, adj_method: str = "fdr_bh", pval_adj_cutoff: float = 0.01, - return_negbinom_params: bool = False, -): +) -> tuple[ + npt.NDArray[np.float64], + npt.NDArray[np.float64], + npt.NDArray[np.float64], + npt.NDArray[np.float64], +]: """Retrieves valid candidate genes to be used for random gene pairs. Parameters ---------- @@ -226,6 +245,8 @@ def get_stats( number of n_pairs for this). adj_method: str Parsed to statsmodels.stats.multitest.multipletests for multiple hypothesis testing correction. + pval_adj_cutoff: float Significance cutoff. + Returns ------- stats: tuple Per spot pvalues, pvals_adj, log10_pvals_adj, lr_sign @@ -238,7 +259,9 @@ def get_stats( pmin = min(background) background2 = [item - pmin for item in background] res = sm.NegativeBinomial( - background2, np.ones(len(background2)), loglike_method="nb2" + background2, + np.ones(len(background2)), + loglike_method="nb2", ).fit(start_params=[0.1, 0.3], disp=0) mu = res.predict() # use if not constant mu = np.exp(res.params[0]) @@ -246,10 +269,6 @@ def get_stats( Q = 0 size = 1.0 / alpha * mu**Q prob = size / (size + mu) - - if return_negbinom_params: # For testing purposes # - return size, prob - # Calculate probability for all spots pvals = 1 - scipy.stats.nbinom.cdf(scores - pmin, size, prob) diff --git a/stlearn/wrapper/concatenate_spatial_adata.py b/stlearn/wrapper/concatenate_spatial_adata.py index b6452387..d2df9a85 100644 --- a/stlearn/wrapper/concatenate_spatial_adata.py +++ b/stlearn/wrapper/concatenate_spatial_adata.py @@ -111,7 +111,7 @@ def concatenate_spatial_adata(adata_list, ncols=2, fixed_size=(2000, 2000)): img_rows = [] for min_id in range(0, len(use_adata_list), ncols): - img_row = np.hstack(imgs[min_id: min_id + ncols]) + img_row = np.hstack(imgs[min_id : min_id + ncols]) img_rows.append(img_row) imgs_comb = np.vstack(img_rows) diff --git a/tests/tl/test_base.py b/tests/tl/test_base.py new file mode 100644 index 00000000..8d274aad --- /dev/null +++ b/tests/tl/test_base.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import unittest + +import numpy as np +import pandas as pd +from anndata import AnnData + +import stlearn as st +from stlearn.tl.cci.base import calc_neighbours, get_lrs_scores + + +def _make_cci_adata(dtype=np.float64, n_side=8, n_extra_genes=40, seed=0): + rng = np.random.default_rng(seed) + n_spots = n_side * n_side + + lr_genes = ["Lig0", "Rec0", "Lig1", "Rec1"] # two L-R pairs + extra_genes = [f"gene{i}" for i in range(n_extra_genes)] + var_names = lr_genes + extra_genes + + X = rng.poisson(3.0, size=(n_spots, len(var_names))).astype(dtype) + + rows, cols = np.divmod(np.arange(n_spots), n_side) + obs = pd.DataFrame( + {"imagerow": rows.astype(float), "imagecol": cols.astype(float)}, + index=[f"spot{i}" for i in range(n_spots)], + ) + var = pd.DataFrame(index=var_names) + return AnnData(X=X, obs=obs, var=var) + + +LRS = np.array(["Lig0_Rec0", "Lig1_Rec1"]) + + +class TestBase(unittest.TestCase): + + def test_lr_core_accepts_integer_expression(self): + adata = _make_cci_adata(dtype=np.int64) + neighbours = calc_neighbours(adata, distance=1.5, verbose=False) + het_vals = np.ones(len(adata)) # default het in run() is ones + + # This line raises the unification TypingError on the un-fixed code. + lr_scores, new_lrs = get_lrs_scores(adata, LRS, neighbours, het_vals, 0) + + self.assertEqual(lr_scores.dtype, np.float64) + self.assertEqual(lr_scores.shape[0], len(adata)) + self.assertEqual(lr_scores.shape[1], len(new_lrs)) + + def test_lr_summary_wide_enough_for_int64_counts(self): + adata = _make_cci_adata(dtype=np.float64) + + st.tl.cci.run( + adata, + LRS, + min_spots=0, + distance=1.5, + n_pairs=100, + verbose=False, + ) + + summary = adata.uns["lr_summary"] + for col in ("n_spots", "n_spots_sig", "n_spots_sig_pval"): + # int32 (itemsize 4) is the bug; int64 (itemsize 8) is the fix. + self.assertGreaterEqual( + summary[col].dtype.itemsize, + 8, + msg=f"lr_summary['{col}'] is {summary[col].dtype}; " + f"too narrow to hold int64 counts (bug 2).", + ) + st.tl.cci.adj_pvals(adata, correct_axis="spot", pval_adj_cutoff=0.05) + self.assertIn("p_adjs", adata.obsm) diff --git a/tests/tl/test_lr_core.py b/tests/tl/test_lr_core.py new file mode 100644 index 00000000..a319c7b6 --- /dev/null +++ b/tests/tl/test_lr_core.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import unittest + +import numpy as np +from numba.typed import List + +from stlearn.tl.cci.base import lr_core + + +def _reference_lr_core(spot_lr1, spot_lr2, neighbour_lists, min_expr, spot_indices): + """Plain-numpy oracle for lr_core's intended semantics (slow but obvious).""" + n_rows = len(spot_indices) + n_cols = spot_lr2.shape[1] + nb_lr2 = np.zeros((n_rows, n_cols), np.float64) + for i, si in enumerate(spot_indices): + neigh = np.asarray(neighbour_lists[si], dtype=np.int64) + if len(neigh) > 0: + nb_lr2[i] = spot_lr2[neigh].mean(axis=0) + sl1 = spot_lr1[spot_indices] + scores = sl1 * (nb_lr2 > min_expr) + (sl1 > min_expr) * nb_lr2 + return scores.sum(axis=1) / 2.0 + + +class TestLrCore(unittest.TestCase): + def test_matches_reference_on_random_inputs(self): + """lr_core agrees with the numpy oracle across shapes and thresholds.""" + rng = np.random.default_rng(0) + for n_spots, n_cols, min_expr in [ + (10, 2, 0.0), + (10, 2, 0.5), + (25, 4, 0.0), + (40, 2, 1.0), + ]: + spot_lr1 = rng.random((n_spots, n_cols)) + spot_lr2 = rng.random((n_spots, n_cols)) + # Random neighbour sets, including some empty ones. + neighbour_lists = [] + for _ in range(n_spots): + k = rng.integers(0, 4) + if k == 0: + neighbour_lists.append(np.array([], dtype=np.int32)) + else: + neighbour_lists.append( + rng.choice(n_spots, size=k, replace=False).astype(np.int32) + ) + spot_indices = np.arange(n_spots, dtype=np.int32) + + got = lr_core( + spot_lr1, + spot_lr2, + neighbour_lists, + min_expr, + spot_indices, + ) + expected = _reference_lr_core( + spot_lr1, spot_lr2, neighbour_lists, min_expr, spot_indices + ) + np.testing.assert_allclose( + got, + expected, + rtol=1e-12, + atol=1e-12, + err_msg=f"mismatch for {n_spots=} {n_cols=} {min_expr=}", + ) + + def test_returns_float64(self): + spot_lr1 = np.array([[1.0, 0.0], [5.0, 3.0], [0.0, 7.0]], dtype=np.float64) + spot_lr2 = np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 0.0]], dtype=np.float64) + neighbour_lists = [ + np.array([1, 2], dtype=np.int32), + np.array([0], dtype=np.int32), + np.array([], dtype=np.int32), + ] + + result = lr_core( + spot_lr1, + spot_lr2, + List(neighbour_lists), + 0.0, + np.array([0, 1, 2], dtype=np.int32), + ) + self.assertEqual(result.dtype, np.float64)