From 0cf0a36c25700249c243a48577713079a7a3b9fd Mon Sep 17 00:00:00 2001 From: Liudeng Zhang Date: Sun, 8 Mar 2026 19:21:53 -0500 Subject: [PATCH 1/2] Fix add_peak_annotation crash on empty distance values When the distance column in peak annotation TSV contains empty values (intergenic peaks), pandas may infer a numeric dtype. This caused LossySetitemError when trying to assign "" into the numeric column. Fix by converting distance to string via object dtype early, and using pd.to_numeric with errors="coerce" for the final int conversion, which gracefully handles empty/NaN values as 0. Add regression tests for empty distance values and semicolon-separated multi-gene annotations. Closes scverse/muon#181 --- muon/_atac/tools.py | 15 ++++-------- tests/test_atac_tools.py | 51 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 tests/test_atac_tools.py diff --git a/muon/_atac/tools.py b/muon/_atac/tools.py index 246dc4f..5b97414 100644 --- a/muon/_atac/tools.py +++ b/muon/_atac/tools.py @@ -115,7 +115,9 @@ def add_peak_annotation( # Convert null values to empty strings pa.loc[pa.gene.isnull(), "gene"] = "" - pa.loc[pa.distance.isnull(), "distance"] = "" + # Convert distance to string via object dtype — pandas may infer a numeric + # or nullable StringDtype, both of which break on direct "" assignment + pa["distance"] = pa["distance"].astype(object).fillna("").astype(str) pa.loc[pa.peak_type.isnull(), "peak_type"] = "" # If peak name is not in the annotation table, reconstruct it: @@ -149,15 +151,8 @@ def add_peak_annotation( # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN pa_long.peak = [peak.replace("_", ":", 1).replace("_", "-", 1) for peak in pa_long.peak] - # Make distance values integers with 0 for intergenic peaks - # DEPRECATED: Make distance values nullable integers - # See https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html - null_distance = pa_long.distance == "" - pa_long.loc[null_distance, "distance"] = 0 - pa_long.distance = pa_long.distance.astype(float).astype(int) - # DEPRECATED: Int64 is not recognized when saving HDF5 files with scanpy.write - # pa_long.distance = pa_long.distance.astype(int).astype("Int64") - # pa_long.distance[null_distance] = np.nan + # Make distance values integers with 0 for intergenic peaks (empty/NaN → 0) + pa_long["distance"] = pd.to_numeric(pa_long["distance"], errors="coerce").fillna(0).astype(int) if "atac" not in adata.uns: adata.uns["atac"] = dict() diff --git a/tests/test_atac_tools.py b/tests/test_atac_tools.py new file mode 100644 index 0000000..a20dc65 --- /dev/null +++ b/tests/test_atac_tools.py @@ -0,0 +1,51 @@ +import unittest +from io import StringIO + +import numpy as np +import pandas as pd +from anndata import AnnData +from muon._atac.tools import add_peak_annotation + + +class TestAddPeakAnnotation(unittest.TestCase): + """Regression tests for add_peak_annotation with empty distance values (#181).""" + + def test_empty_distance_values(self): + """Intergenic peaks with empty distance should not raise.""" + tsv = StringIO( + "chrom\tstart\tend\tgene\tdistance\tpeak_type\n" + "chr1\t100\t200\t\t\tintergenic\n" + "chr1\t300\t400\tGeneA\t-173268\tdistal\n" + ) + pa = pd.read_csv(tsv, sep="\t") + peaks = ["chr1:100-200", "chr1:300-400"] + adata = AnnData(np.zeros((2, 2))) + adata.var_names = peaks + + result = add_peak_annotation(adata, pa, return_annotation=True) + + self.assertEqual(result.distance.dtype, np.int64) + # Intergenic peak distance should be 0 + self.assertIn(0, result.distance.values) + # Distal peak distance should be preserved + self.assertIn(-173268, result.distance.values) + + def test_semicolon_separated_distances(self): + """Multi-gene peaks with semicolon-separated distances should work.""" + tsv = StringIO( + "chrom\tstart\tend\tgene\tdistance\tpeak_type\n" + "chr1\t100\t200\tGeneA;GeneB\t-100;200\tpromoter;distal\n" + ) + pa = pd.read_csv(tsv, sep="\t") + adata = AnnData(np.zeros((1, 1))) + adata.var_names = ["chr1:100-200"] + + result = add_peak_annotation(adata, pa, return_annotation=True) + + self.assertEqual(result.distance.dtype, np.int64) + self.assertIn(-100, result.distance.values) + self.assertIn(200, result.distance.values) + + +if __name__ == "__main__": + unittest.main() From db2cf26760fe1b7cb5253c1b9c045af61d183ff3 Mon Sep 17 00:00:00 2001 From: Ilia Kats Date: Tue, 10 Mar 2026 13:21:44 +0100 Subject: [PATCH 2/2] rework add_peak_annotation to make use of Pandas' pd.NA Pandas 1.0 has been released sufficiently long ago that we can assume muon users have it. --- muon/_atac/tools.py | 53 +++++++++++++++++++++------------------- pyproject.toml | 2 +- tests/test_atac_tools.py | 22 ++++++++--------- 3 files changed, 40 insertions(+), 37 deletions(-) diff --git a/muon/_atac/tools.py b/muon/_atac/tools.py index 5b97414..b8e38ba 100644 --- a/muon/_atac/tools.py +++ b/muon/_atac/tools.py @@ -6,6 +6,7 @@ from pathlib import Path from datetime import datetime from warnings import warn +from contextlib import suppress import numpy as np import pandas as pd @@ -81,7 +82,7 @@ def lsi(data: Union[AnnData, MuData], scale_embeddings=True, n_comps=50): def add_peak_annotation( data: Union[AnnData, MuData], - annotation: Union[str, pd.DataFrame], + annotation: Union[str, Path, pd.DataFrame], sep: str = "\t", return_annotation: bool = False, ): @@ -108,17 +109,12 @@ def add_peak_annotation( else: raise TypeError("Expected AnnData or MuData object with 'atac' modality") - if isinstance(annotation, str): - pa = pd.read_csv(annotation, sep=sep) - else: + if isinstance(annotation, pd.DataFrame): pa = annotation + else: + pa = pd.read_csv(annotation, sep=sep) - # Convert null values to empty strings - pa.loc[pa.gene.isnull(), "gene"] = "" - # Convert distance to string via object dtype — pandas may infer a numeric - # or nullable StringDtype, both of which break on direct "" assignment - pa["distance"] = pa["distance"].astype(object).fillna("").astype(str) - pa.loc[pa.peak_type.isnull(), "peak_type"] = "" + pa = pa.convert_dtypes() # If peak name is not in the annotation table, reconstruct it: # peak = chrom:start-end @@ -135,31 +131,38 @@ def add_peak_annotation( raise AttributeError( f"Peak annotation does not in contain neighter peak column nor chrom, start, and end columns." ) + else: + # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN + pa["peak"] = pa["peak"].str.replace("_", ":", 1).str.replace("_", "-", 1) # Split genes, distances, and peaks into individual records - pa_g = pd.DataFrame(pa.gene.str.split(";").tolist(), index=pa.peak).stack() - pa_d = pd.DataFrame(pa.distance.astype(str).str.split(";").tolist(), index=pa.peak).stack() - pa_p = pd.DataFrame(pa.peak_type.str.split(";").tolist(), index=pa.peak).stack() - # Make a long dataframe indexed by gene - pa_long = pd.concat( - [pa_g.reset_index()[["peak", 0]], pa_d.reset_index()[[0]], pa_p.reset_index()[[0]]], axis=1 - ) - pa_long.columns = ["peak", "gene", "distance", "peak_type"] - pa_long = pa_long.set_index("gene") + if pd.api.types.is_string_dtype(pa.distance): + pa = pa.set_index("peak") + pa_g = pa.gene.str.split(";").explode() + pa_d = pa.distance.str.split(";").explode().astype(int) + pa_p = pa.peak_type.str.split(";").explode() + + # Make a long dataframe indexed by gene + pa = pd.concat((pa_g, pa_d, pa_p), axis=1).reset_index() + else: + pa = pa[["peak", "gene", "distance", "peak_type"]] + + with suppress(ValueError): # missing values + pa["distance"] = pa["distance"].astype(int) - # chrX_NNNNN_NNNNN -> chrX:NNNNN-NNNNN - pa_long.peak = [peak.replace("_", ":", 1).replace("_", "-", 1) for peak in pa_long.peak] + # TODO: nullable strings work with anndata >= 0.13 + for col in ("peak", "gene", "peak_type"): + pa[col] = pa[col].fillna("").astype(object) - # Make distance values integers with 0 for intergenic peaks (empty/NaN → 0) - pa_long["distance"] = pd.to_numeric(pa_long["distance"], errors="coerce").fillna(0).astype(int) + pa = pa.set_index("gene") if "atac" not in adata.uns: adata.uns["atac"] = dict() - adata.uns["atac"]["peak_annotation"] = pa_long + adata.uns["atac"]["peak_annotation"] = pa if return_annotation: - return pa_long + return pa def add_peak_annotation_gene_names( diff --git a/pyproject.toml b/pyproject.toml index c59b291..fc9736c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ requires-python = ">= 3.10" requires = [ "numpy", - "pandas", + "pandas>=1", "matplotlib", "seaborn", "h5py", diff --git a/tests/test_atac_tools.py b/tests/test_atac_tools.py index a20dc65..53eb4ed 100644 --- a/tests/test_atac_tools.py +++ b/tests/test_atac_tools.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from anndata import AnnData -from muon._atac.tools import add_peak_annotation +import muon class TestAddPeakAnnotation(unittest.TestCase): @@ -22,13 +22,12 @@ def test_empty_distance_values(self): adata = AnnData(np.zeros((2, 2))) adata.var_names = peaks - result = add_peak_annotation(adata, pa, return_annotation=True) + result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True) - self.assertEqual(result.distance.dtype, np.int64) - # Intergenic peak distance should be 0 - self.assertIn(0, result.distance.values) - # Distal peak distance should be preserved - self.assertIn(-173268, result.distance.values) + assert result.distance.dtype == pd.Int64Dtype() + assert result.distance.iloc[0] is pd.NA + assert result.distance.iloc[1] == -173268 + assert (result.peak == peaks).all() def test_semicolon_separated_distances(self): """Multi-gene peaks with semicolon-separated distances should work.""" @@ -40,11 +39,12 @@ def test_semicolon_separated_distances(self): adata = AnnData(np.zeros((1, 1))) adata.var_names = ["chr1:100-200"] - result = add_peak_annotation(adata, pa, return_annotation=True) + result = muon.atac.tl.add_peak_annotation(adata, pa, return_annotation=True) - self.assertEqual(result.distance.dtype, np.int64) - self.assertIn(-100, result.distance.values) - self.assertIn(200, result.distance.values) + assert result.distance.dtype == np.int64 + assert result.distance.iloc[0] == -100 + assert result.distance.iloc[1] == 200 + assert (result.peak.iloc[0] == result.peak.iloc[1] == adata.var_names).all() if __name__ == "__main__":