diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..279966c --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,27 @@ +name: Pre-Commit + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ '*' ] + +jobs: + build-test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + python-version: [3.8] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install . + - name: PreCommit + uses: pre-commit/action@v3.0.0 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3bd5379..492393c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,137 @@ -*.fastq -*.fastq.gz -*.h5ad -*.hdf5 -*.pdf -*.zarr -.idea/ +# MacOS + +.DS_Store + +### Genomics ### + +# *.fastq +# *.fastq.gz +# *.h5ad +# *.hdf5 +# *.pdf +# *.zarr +# .idea/ +# __pycache__/ +# out/ + +### Pytest + + + + +### VS Code ### + +.vscode/ + +### Python ### + +# Byte-compiled / optimized / DLL files __pycache__/ -out/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 20dc44b..56faaf0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,12 @@ repos: - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 22.10.0 hooks: - id: black language_version: python3 - - repo: https://github.com/pycqa/isort - rev: 5.10.1 + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 hooks: - id: isort + name: isort (python) args: ["--profile", "black"] diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f2f6519..0000000 --- a/.travis.yml +++ /dev/null @@ -1,7 +0,0 @@ -dist: focal -language: python -python: 3.8 -install: - - pip install -r requirements-test.txt -script: - - ./test.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..5306e4b --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +

+
+

+ +
+ + Python Version + + Format Version + + pre commit +
+
+ Docker + + cwltool +
+ +

+ About • + Requirements • + Installation • + Usage • + License +

+ +# About + +The HuBMAP scRNA-seq pipeline is built on Salmon, Scanpy, and scVelo, and is +implemented as a CWL workflow wrapping command-line tools encapsulated in +Docker containers. + +# Requirements + +- 28 GB of RAM +- Docker + +We require [Docker](https://www.docker.com/) to run the pipeline. + +Once Docker is installed, start the Docker daemon. On Linux, this is typically +done by running ``sudo dockerd``. On Mac, this is done by launching the Docker GUI app. If you are using an M1 or newer Apple silicon Mac, add ``export DOCKER_DEFAULT_PLATFORM=linux/amd64`` to your ``.zshrc`` file to avoid Docker warnings and possible errors. + +# Installation + +Clone the repository and install the requirements. The ``master`` branch and ``latest`` published Docker images may not always +be in sync; checking out a version like ``v2.0.6`` is *highly* recommended +before running the pipeline, unless building Docker images locally. + +```bash +git clone https://github.com/tmsincomb/salmon-rnaseq.git +cd salmon-rnaseq +pip install -e . +``` + +# Usage +```bash +salmon-rnaseq --help +salmon-rnaseq --assay ASSAY --fastq_dir FASTQ_DIR --threads THREADS -o OUTPUT_DIR +``` + +Supported assays: + +* ``10x_v2`` (single-cell) +* ``10x_v2_sn`` (single-nucleus) +* ``10x_v3`` (single-cell) +* ``10x_v3_sn`` (single-nucleus) +* ``snareseq`` +* ``sciseq`` +* ``slideseq`` + +# License + +[![License](https://img.shields.io/github/license/hubmapconsortium/salmon-rnaseq)](https://github.com/hubmapconsortium/salmon-rnaseq/blob/main/LICENSE) + +Copyright © HuBMAP diff --git a/README.rst b/README.rst deleted file mode 100644 index 974e992..0000000 --- a/README.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. image:: https://travis-ci.com/hubmapconsortium/salmon-rnaseq.svg?branch=master - :target: https://travis-ci.com/hubmapconsortium/salmon-rnaseq -.. 
image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - -HuBMAP scRNA-seq pipeline: Salmon, Scanpy, scVelo -================================================= - -Overview --------- - -The HuBMAP scRNA-seq pipeline is built on Salmon, Scanpy, and scVelo, and is -implemented as a CWL workflow wrapping command-line tools encapsulated in -Docker containers. - -Requirements ------------- - -Running the pipeline requires a CWL workflow execution engine and container -runtime; we recommend Docker and the ``cwltool`` reference implementation. -``cwltool`` is written in Python and can be installed into a sufficiently -recent Python environment with ``pip install cwltool``. Afterward, clone this -repository, check out a tag, and invoke the pipeline as:: - - cwltool pipeline.cwl --assay ASSAY --fastq_dir FASTQ_DIR --threads THREADS - -At least 28GB memory is required for the Salmon quantification step; this -memory usage is due to inclusion of the entire GRCh38 reference genome as -decoy sequences in the quantification index. See -https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02151-8 -for more details. - -(The ``master`` branch and ``latest`` published Docker images may not always -be in sync; checking out a version like ``v2.0.6`` is *highly* recommended -before running the pipeline, unless building Docker images locally.) 
- -Supported assays: - -* ``10x_v2`` (single-cell) -* ``10x_v2_sn`` (single-nucleus) -* ``10x_v3`` (single-cell) -* ``10x_v3_sn`` (single-nucleus) -* ``snareseq`` -* ``sciseq`` -* ``slideseq`` diff --git a/bin/analysis/adjust_barcodes.py b/bin/analysis/adjust_barcodes.py index 9a251d2..e50d906 100755 --- a/bin/analysis/adjust_barcodes.py +++ b/bin/analysis/adjust_barcodes.py @@ -3,11 +3,11 @@ from pathlib import Path from typing import Iterable -import manhole - import correct_snareseq_barcodes import expand_sciseq_barcodes import extract_slideseq_barcodes +import manhole + from common import ADJ_OUTPUT_DIR, Assay adj_funcs = { diff --git a/bin/analysis/annotate_cells.py b/bin/analysis/annotate_cells.py index 42001f4..54c7dcb 100755 --- a/bin/analysis/annotate_cells.py +++ b/bin/analysis/annotate_cells.py @@ -3,11 +3,11 @@ from pathlib import Path from typing import Optional, Sequence +import add_slideseq_coordinates import anndata +import annotate_sciseq_barcodes import manhole -import add_slideseq_coordinates -import annotate_sciseq_barcodes from common import Assay H5AD_PATH = Path("expr.h5ad") diff --git a/bin/analysis/fastqc_wrapper.py b/bin/analysis/fastqc_wrapper.py index 166fbf0..b32ca9f 100755 --- a/bin/analysis/fastqc_wrapper.py +++ b/bin/analysis/fastqc_wrapper.py @@ -25,7 +25,7 @@ def single_file_fastqc(fastq_file_and_subdir: Tuple[Path, Path]): """ command = [piece.format(out_dir=fastq_file_and_subdir[1]) for piece in FASTQC_COMMAND_TEMPLATE] command.append(fspath(fastq_file_and_subdir[0])) - print("Running", " ".join(command)) + print("Running:", " ".join(command)) check_call(command) diff --git a/bin/analysis/scanpy_entry_point.py b/bin/analysis/scanpy_entry_point.py index b7b0c2e..25d5c09 100755 --- a/bin/analysis/scanpy_entry_point.py +++ b/bin/analysis/scanpy_entry_point.py @@ -6,9 +6,9 @@ import manhole import matplotlib.pyplot as plt import scanpy as sc +from plot_utils import new_plot from common import Assay -from plot_utils import new_plot 
def main(assay: Assay, h5ad_file: Path): diff --git a/bin/analysis/scvelo_analysis.py b/bin/analysis/scvelo_analysis.py index 1e83f22..f247774 100755 --- a/bin/analysis/scvelo_analysis.py +++ b/bin/analysis/scvelo_analysis.py @@ -9,7 +9,6 @@ import scanpy as sc import scipy.sparse import scvelo as scv - from plot_utils import new_plot component_neighbor_count = 50 diff --git a/bin/salmon/bulk_salmon_wrapper.py b/bin/salmon/bulk_salmon_wrapper.py index b084bd5..05a695b 100755 --- a/bin/salmon/bulk_salmon_wrapper.py +++ b/bin/salmon/bulk_salmon_wrapper.py @@ -42,9 +42,9 @@ def main(threads: int, directory: Path): fastq_extension = [ "-1", - fspath(r1_fastq_file), + str(fspath(r1_fastq_file)), "-2", - fspath(r2_fastq_file), + str(fspath(r2_fastq_file)), ] command.extend(fastq_extension) diff --git a/bin/salmon/salmon_wrapper.py b/bin/salmon/salmon_wrapper.py index 0f16810..7720273 100755 --- a/bin/salmon/salmon_wrapper.py +++ b/bin/salmon/salmon_wrapper.py @@ -197,12 +197,11 @@ def main( for r1_fastq_file, r2_fastq_file in fastq_pairs: fastq_extension = [ "-1", - r1_fastq_file, + str(r1_fastq_file), "-2", - r2_fastq_file, + str(r2_fastq_file), ] command.extend(fastq_extension) - print("Running:", " ".join(str(x) for x in command)) env = environ.copy() # Necessary for Singularity; this environment variable isn't diff --git a/bin/salmon_rnaseq/cli.py b/bin/salmon_rnaseq/cli.py new file mode 100644 index 0000000..45ac35d --- /dev/null +++ b/bin/salmon_rnaseq/cli.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path +from subprocess import CompletedProcess +from typing import Any + +import click + + +class cd: + """Context manager for changing the current working directory""" + + def __init__(self, newPath: str): + self.newPath = os.path.expanduser(newPath) + + def __enter__(self): + self.savedPath = os.getcwd() + os.chdir(self.newPath) + + def __exit__(self, etype, value, traceback): +
os.chdir(self.savedPath) + + +def pathing(path: str, new: bool = False, overwrite: bool = True) -> Path: + """Guarantees correct expansion rules for pathing. + :param Union[str, Path] path: path of folder or file you wish to expand. + :param bool new: treat path as a new destination: its parent must exist and the path itself may exist only when overwrite is True; when new is False the path must already exist. + :return: A pathlib.Path object. + >>> pathing('~/Desktop/folderofgoodstuffs/') + /home/user/Desktop/folderofgoodstuffs + """ + path = Path(path) + # Expand shortened path + if str(path)[0] == "~": + path = path.expanduser() + # Expand local path + if str(path)[0] == ".": + path = path.resolve() + else: + path = path.absolute() + # Making sure new paths don't exist while also making sure existing paths actually exist. + if new: + if not path.parent.exists(): + raise ValueError(f"ERROR ::: Parent directory of {path} does not exist.") + if path.exists() and not overwrite: + raise ValueError(f"ERROR ::: {path} already exists!") + else: + if not path.exists(): + raise ValueError(f"ERROR ::: Path {path} does not exist.") + return path + + +class Shell: + """Shell wrapper for cwltool pipeline.cwl""" + + @staticmethod + def run(args: list[str], capture_output: bool = False) -> CompletedProcess[Any]: + + cmd = [ + "cwltool", + str(Path(__file__).parent.parent.parent / "pipeline.cwl"), + *args, + ] + proc: CompletedProcess[Any] = subprocess.run( + cmd, capture_output=capture_output, encoding="utf-8" + ) + return proc + + def help(self) -> str: + args = ["--help"] + help_str = self.run(args, capture_output=True).stdout + # TODO: should be a direct change not a replace here + # help_str = "usage: salmon-rnaseq\n" + "\n".join(help_str.split("\n")[1:]) + return help_str + + +class RichHelp(click.Command): + """Override the default help command to use rich markup with CWLTOOL message.""" + + def format_help(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + click.echo(Shell().help()) + + +@click.command( + cls=RichHelp, +
context_settings=dict( + ignore_unknown_options=True, allow_extra_args=True, help_option_names=["-h", "--help"] + ), +) +@click.pass_context +@click.option( + "-o", + "--outdir", + required=False, + type=click.Path( + file_okay=False, + dir_okay=True, + readable=True, + resolve_path=True, + ), + default=".", +) +def main(ctx: click.Context, outdir: str) -> None: + """Run cwltool on pipeline.cwl from inside ``outdir`` and exit with cwltool's return code. + + Parameters + ---------- + ctx : click.Context + Click context object; the unparsed arguments in ``ctx.args`` are forwarded to cwltool. + """ + if not ctx.args: + click.echo(Shell().help()) + ctx.exit() + + Path(outdir).mkdir(parents=True, exist_ok=True) + for i, arg in enumerate(ctx.args): + if Path(arg).exists(): + ctx.args[i] = str(pathing(arg)) + outdir = str(pathing(outdir)) + + with cd(outdir): + ctx.exit(Shell.run(ctx.args).returncode)  # propagate cwltool's exit status + + +if __name__ == "__main__": + sys.exit(main()) # pragma: no cover diff --git a/bin/trim_reads/trim_reads.py b/bin/trim_reads/trim_reads.py index 63a38d1..ba60daa 100755 --- a/bin/trim_reads/trim_reads.py +++ b/bin/trim_reads/trim_reads.py @@ -53,7 +53,7 @@ def trim_reads(fastq_r1: Path, fastq_r2: Path, output_subdir: Path): command = [piece.format(input_fastq=fastq_r2) for piece in TRIM_COMMAND] fastq_r2_out = output_subdir / fastq_r2.name command_str = " ".join(quote(s) for s in command) - print("Running", command_str, "with output", quote(fspath(fastq_r2_out))) + print("Running:", command_str, "with output", quote(fspath(fastq_r2_out))) with open(fastq_r2_out, "wb") as f: check_call(command, stdout=f) diff --git a/pyproject.toml b/pyproject.toml index 568d860..6685c71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,54 @@ +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[project] +name = "salmon_rnaseq" +description = "The HuBMAP scRNA-seq pipeline is built on Salmon, Scanpy, and scVelo, and is implemented as a CWL workflow wrapping command-line tools encapsulated in Docker containers." 
+readme = "README.md" +requires-python = ">=3.7" +keywords = ["scrna-seq", "salmon-rnaseq", "salmon", "rnaseq"] +license = {text = "GNU General Public License v3.0"} +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "cwltool~=3.1", + 'click~=8.1', +] +dynamic = ["version"] + +[project.optional-dependencies] +dev = [ + "pre-commit~=2.20", + "pytest~=7.2", +] + +[project.scripts] +salmon-rnaseq = "salmon_rnaseq.cli:main" + +[tool.setuptools.packages.find] +where = ["bin"] + [tool.black] -line-length = 99 +line-length = 99 # should consider 120 +include = '\.pyi?$' +extend-exclude = ''' +/( + # The following are specific to Black, you probably don't want those. + | blib2to3 + | tests/data + | profiling +)/ +''' [tool.isort] profile = "black" multi_line_output = 3 -src_paths = ["bin/analysis", "bin/common", "bin/salmon"] +src_paths = ["bin"]