diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f3dda58..fc8a255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,26 +11,33 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', 3.11] + python-version: [3.9, '3.10', 3.11, 3.12, 3.13] + sqla-version: ["~=2.0"] + include: + # Also test with SQLAlchemy >=1.4, <2 + - python-version: 3.12 + sqla-version: "~=1.4" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: | + pyproject.toml + setup.cfg - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install setuptools pytest + python -m pip install setuptools pytest sqlalchemy${{ matrix.sqla-version }} - name: Build package run: | pip install . - name: Test with pytest - env: - PY_IGNORE_IMPORTMISMATCH: 1 run: | pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a25a3d..b8c19ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,25 @@ # Changelog +## 1.1.0 + +* Command line interface: + * Better error reporting when database file(s) not found + * Add more details to output of `gambit signatures info` command. +* Major overhaul of internal Python API and tests (see full release notes on GitHub) + * Many fixes to API documentation +* Increase minimum Python version to 3.9 +* Make compatible with SQLAlchemy 2.0 + + ## 1.0.1 +* Significant documentation updates. +* Better error reporting: + * When database files cannot be found (in CLI and API). + * On attempting to open an invalid signatures file. * Misc - * Better error reporting when database files cannot be found (in CLI and API). - * Minor documentation updates. + * Run tests on Python 3.11 and 3.12. 
## 1.0.0 diff --git a/MANIFEST.in b/MANIFEST.in index 71ce312..8e16a00 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -# Alembic -include gambit/db/migrate/alembic.ini -recursive-include gambit/db/migrate *.py +graft src +graft docs diff --git a/README.md b/README.md index 83dd07f..29cb0c5 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,17 @@ See below for basic installation and usage instructions, or check out the a basic tutorial. +## About + +Copyright © 2016-2024 Jared Lumpe + +GAMBIT has been a personal project of mine for many years. Although there have been numerous +contributors to the publication, it is not a product of any lab or institution. + +GAMBIT is provided as free software under the terms of the [AGPLv3 license](LICENSE). +It is not covered by any type of software patent. + + ### Publication Lumpe J, Gumbleton L, Gorzalski A, Libuit K, Varghese V, et al. (2023) GAMBIT (Genomic Approximation diff --git a/docs/source/api/database.rst b/docs/source/api/database.rst index bfcf307..1655432 100644 --- a/docs/source/api/database.rst +++ b/docs/source/api/database.rst @@ -32,9 +32,3 @@ gambit.db.sqla .. autoclass:: ReadOnlySession :exclude-members: __init__, __new__ :no-members: - - -gambit.db.migrate ------------------ - -.. automodule:: gambit.db.migrate diff --git a/docs/source/api/kmers.rst b/docs/source/api/kmers.rst index 859fed4..ef890a1 100644 --- a/docs/source/api/kmers.rst +++ b/docs/source/api/kmers.rst @@ -27,6 +27,7 @@ gambit.sigs.base ---------------------- .. automodule:: gambit.sigs.base + :exclude-members: AbstractSignatureArray .. autoclass:: AbstractSignatureArray :special-members: +__eq__ @@ -38,16 +39,11 @@ gambit.sigs.calc .. automodule:: gambit.sigs.calc -gambit.sigs.convert -------------------------- - -.. automodule:: gambit.sigs.convert - - gambit.sigs.hdf5 ---------------------- .. automodule:: gambit.sigs.hdf5 + :exclude-members: HDF5Signatures .. 
autoclass:: HDF5Signatures :special-members: +__bool__ diff --git a/docs/source/api/metric.rst b/docs/source/api/metric.rst index 9d418ac..c6e8a76 100644 --- a/docs/source/api/metric.rst +++ b/docs/source/api/metric.rst @@ -5,7 +5,3 @@ gambit.metric ------------- .. automodule:: gambit.metric - - .. autofunction:: gambit.metric.jaccard - - .. autofunction:: gambit.metric.jaccarddist diff --git a/docs/source/api/misc.rst b/docs/source/api/misc.rst index 4b455c7..7e024d7 100644 --- a/docs/source/api/misc.rst +++ b/docs/source/api/misc.rst @@ -14,12 +14,6 @@ gambit.util.misc .. automodule:: gambit.util.misc -gambit.util.typing ------------------- - -.. automodule:: gambit.util.typing - - gambit.util.io -------------- @@ -36,15 +30,13 @@ gambit.util.indexing -------------------- .. automodule:: gambit.util.indexing + :exclude-members: AdvancedIndexingMixin + + .. autoclass:: AdvancedIndexingMixin + :private-members: _check_index, _getitem_int, _getitem_slice, _getitem_int_array, _getitem_bool_array gambit.util.progress -------------------- .. automodule:: gambit.util.progress - - -gambit.util.dev ----------------- - -.. automodule:: gambit.util.dev diff --git a/docs/source/api/results.rst b/docs/source/api/results.rst index fb889cb..c3ccda0 100644 --- a/docs/source/api/results.rst +++ b/docs/source/api/results.rst @@ -6,27 +6,3 @@ gambit.results ----------------- .. automodule:: gambit.results - - -gambit.results.base ----------------------- - -.. automodule:: gambit.results.base - - -gambit.results.json ----------------------- - -.. automodule:: gambit.results.json - - -gambit.results.csv ---------------------- - -.. automodule:: gambit.results.csv - - -gambit.results.archive -------------------------- - -.. 
automodule:: gambit.results.archive diff --git a/docs/source/conf.py b/docs/source/conf.py index c9f53f9..aefc29b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'GAMBIT' -copyright = '2021 - 2023, Jared Lumpe' +copyright = '2021 - 2024, Jared Lumpe' author = 'Jared Lumpe' # The full version, including alpha/beta/rc tags -release = '1.0.1' +release = '1.1.0' # -- General configuration --------------------------------------------------- @@ -50,6 +50,17 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] +# When debugging broken cross references using nitpick mode (-n option), ignore these errors. +# This mostly relates to external libraries that have not been linked to using intersphinx. +nitpick_ignore_regex = [ + ('py:.*', r'click\..*'), + ('py:.*', r'sqlalchemy\..*'), + ('py:.*', r'h5py\..*'), + ('py:.*', r'scipy\..*'), + # TypeVar + ('py:.*', r'(.*\.)?T\d?'), +] + # -- Options for HTML output ------------------------------------------------- @@ -77,4 +88,15 @@ autodoc_member_order = 'groupwise' autodoc_typehints = 'description' +autodoc_type_aliases = { + 'FilePath': 'FilePath', + 'DNASeq': 'DNASeq', +} + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable/', None), + 'Bio': ('https://biopython.org/docs/latest/', None), +} + todo_include_todos = True diff --git a/gambit/_cython/kmers.pxd b/gambit/_cython/kmers.pxd deleted file mode 100644 index 88ba81e..0000000 --- a/gambit/_cython/kmers.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# cython: language_level = 3str - -cimport numpy as np - -ctypedef unsigned char CHAR - - -cpdef np.uint64_t kmer_to_index(const CHAR[:]) nogil except? 0 -cpdef np.uint64_t kmer_to_index_rc(const CHAR[:]) nogil except? 
0 -cdef void c_index_to_kmer(np.uint64_t, CHAR[:]) nogil -cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil diff --git a/gambit/_cython/threads.pyx b/gambit/_cython/threads.pyx deleted file mode 100644 index dd3c49d..0000000 --- a/gambit/_cython/threads.pyx +++ /dev/null @@ -1,42 +0,0 @@ -"""OpenMP stuff.""" - -from cython import parallel - -import numpy as np -cimport numpy as np -cimport openmp - - -def omp_set_num_threads(n: int): - """Set maximum number of threads used by OpenMP. - - Just calls the ``omp_set_num_threads`` C function. - """ - if n <= 0: - raise ValueError('Argument must be positive.') - openmp.omp_set_num_threads(n) - - -def omp_get_max_threads(): - """Get the maximum number of threads used by OpenMP. - - Just calls the ``omp_get_max_threads`` C function. - """ - return openmp.omp_get_max_threads() - - -def get_thread_ids(int num_threads): - """Run a multithreaded loop and get the thread ID running in each iteration.""" - - cdef: - np.ndarray[np.intp_t, ndim=1] thread_ids - np.intp_t thread_id = -1 - int i - - thread_ids = np.full(num_threads, -1, dtype=np.intp) - - for i in parallel.prange(num_threads, nogil=True, schedule='static', chunksize=1): - thread_id = parallel.threadid() - thread_ids[i] = thread_id - - return thread_ids diff --git a/gambit/db/migrate/__init__.py b/gambit/db/migrate/__init__.py deleted file mode 100644 index c10d164..0000000 --- a/gambit/db/migrate/__init__.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Perform genome database migrations with Alembic. - -This package contains all Alembic configuration and data files. Revision files are located in -``./alembic/versions``. - -Note on alembic configuration - seems like normal usage of Alembic involves getting the database URL -from ``alembic.ini``. Since this application has no fixed location for the database we can't use -this method. 
Instead we are following the -`Sharing a Connection with a Series of Migration Commands and Environments `_ -recipe in Alembic's documentation, where the connectable object is generated programmatically -somehow and then attached to the Alembic configuration object's ``attributes`` dict. The -``run_migrations_offline`` and ``run_migrations_online`` functions in ``alembic/env.py`` are -modified from the version generated by ``alembic init`` to get their connectable object from this -dict instead of creating it based on the contents of ``alembic.ini``. Note that this means we -can't do (online) migration stuff from the standard alembic CLI command, which gets its -connection information only from ``alembic.ini``. - -The way to use this setup is instead to create an :class:`alembic.config.Config` instance with -:func:`.get_alembic_config` and use the functions in :mod:`alembic.command`. - -.. _alembic-recipe: https://alembic.sqlalchemy.org/en/latest/cookbook.html#sharing-a-connection-with-a-series-of-migration-commands-and-environments -""" - -from typing import Optional - -from alembic.config import Config -from alembic import command -from alembic.migration import MigrationContext -from alembic.script import ScriptDirectory -from pkg_resources import resource_filename -from sqlalchemy.engine import Connectable - - -INI_PATH = resource_filename(__name__, 'alembic.ini') - - -def get_alembic_config(connectable: Optional[Connectable] = None, **kwargs) -> Config: - """Get an alembic config object to perform migrations. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying database connection info (optional). Assigned to - ``'connectable'`` key of :attr:`alembic.config.Config.attributes`. - \\**kwargs - Keyword arguments to pass to :meth:`alembic.config.Config.__init__`. - - Returns - ------- - Alembic config object. 
- """ - config = Config(INI_PATH, **kwargs) - config.attributes['connectable'] = connectable - - return config - - -def current_head() -> str: - """Get the current head revision number.""" - conf = get_alembic_config() - scriptdir = ScriptDirectory.from_config(conf) - return scriptdir.get_current_head() - - -def current_revision(connectable: Connectable) -> str: - """Get the current revision number of a genome database.""" - with connectable.connect() as conn: - ctx = MigrationContext.configure(conn) - return ctx.get_current_revision() - - -def is_current_revision(connectable: Connectable): - """Check if the current revision of a genome database is the most recent (head) revision.""" - head = current_head() - current = current_revision(connectable) - return current == head - - -def upgrade(connectable: Connectable, revision: str = 'head', tag=None, **kwargs): - """Run the alembic upgrade command. - - See :func:`alembic.command.upgrade` for more information on how this works. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying genome database connection info. - revision - Revision to upgrade to. Passed to :func:`alembic.command.upgrade`. - tag - Passed to :func:`alembic.command.upgrade`. - \\**kwargs - Passed to :func:`.get_alembic_config`. - """ - config = get_alembic_config(connectable, **kwargs) - command.upgrade(config, revision, tag=tag) - - -def init_db(connectable: Connectable): - """ - Initialize the genome database schema by creating all tables and stamping with the latest - Alembic revision. - - Expects a fresh database that does not already contain any tables for the :mod:`gambit.db.models` - models and has not had any migrations run on it yet. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying database connection info. - - Raises - ------ - RuntimeError - If the database is already stamped with an Alembic revision. 
- sqlalchemy.exc.OperationalError - If any of the database tables to be created already exist. - """ - from gambit.db.models import Base - - conf = get_alembic_config() - script = ScriptDirectory.from_config(conf) - - with connectable.connect() as conn: - ctx = MigrationContext.configure(conn) - - # Check there is no current revision stamped - current = ctx.get_current_revision() - if current is not None: - raise RuntimeError(f'Expected uninitialized database, but current alembic revision is {current}') - - # Create tables - # Set checkfirst=false so that we get an SQL error if any tables already exist - Base.metadata.create_all(conn, checkfirst=False) - - # Stamp latest alembic version - ctx.stamp(script, 'head') diff --git a/gambit/db/migrate/alembic.ini b/gambit/db/migrate/alembic.ini deleted file mode 100644 index e1d4181..0000000 --- a/gambit/db/migrate/alembic.ini +++ /dev/null @@ -1,89 +0,0 @@ -# A generic, single database configuration. - -[alembic] -# Database connection set dynamically in migrate.get_alembic_config function. - -# path to migration scripts -script_location = gambit.db.migrate:alembic - -# template used to generate migration files -# file_template = %%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = . - -# timezone to use when rendering the date -# within the migration file as well as the filename. -# string value is passed to dateutil.tz.gettz() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the -# "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; this defaults -# to ./alembic/versions. 
When using multiple version -# directories, initial revisions must be specified with --version-path -# version_locations = %(here)s/bar %(here)s/bat ./alembic/versions - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARN -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARN -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/gambit/db/migrate/alembic/README b/gambit/db/migrate/alembic/README deleted file mode 100644 index 98e4f9c..0000000 --- a/gambit/db/migrate/alembic/README +++ /dev/null @@ -1 +0,0 @@ -Generic single-database configuration. \ No newline at end of file diff --git a/gambit/db/migrate/alembic/env.py b/gambit/db/migrate/alembic/env.py deleted file mode 100644 index bc15eb7..0000000 --- a/gambit/db/migrate/alembic/env.py +++ /dev/null @@ -1,62 +0,0 @@ -from logging.config import fileConfig - -from alembic import context - -# this is the Alembic Config object, which provides -# access to the values within the .ini file in use. -config = context.config - -# Interpret the config file for Python logging. -# This line sets up loggers basically. 
-fileConfig(config.config_file_name) - -# add your model's MetaData object here -# for 'autogenerate' support -from gambit.db.models import Base -target_metadata = Base.metadata - -# other values from the config, defined by the needs of env.py, -# can be acquired: -# my_important_option = config.get_main_option("my_important_option") -# ... etc. - - -def run_migrations_offline(): - """Run migrations in 'offline' mode. - - Since we don't have a connection URL written into alembic.ini, we need to specify the - "dialect_name" argument. - """ - context.configure( - dialect_name='sqlite', - target_metadata=target_metadata, - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) - - with context.begin_transaction(): - context.run_migrations() - - -def run_migrations_online(): - """Run migrations in 'online' mode. - - Expects a value for the "connectable" argument to migrate.get_alembic_config(). - """ - connectable = config.attributes.get('connectable') - if connectable is None: - raise RuntimeError('Connectable object must be passed to gambit.db.migrate.get_alembic_config()') - - with connectable.connect() as connection: - context.configure( - connection=connection, target_metadata=target_metadata - ) - - with context.begin_transaction(): - context.run_migrations() - - -if context.is_offline_mode(): - run_migrations_offline() -else: - run_migrations_online() diff --git a/gambit/db/migrate/alembic/script.py.mako b/gambit/db/migrate/alembic/script.py.mako deleted file mode 100644 index 2c01563..0000000 --- a/gambit/db/migrate/alembic/script.py.mako +++ /dev/null @@ -1,24 +0,0 @@ -"""${message} - -Revision ID: ${up_revision} -Revises: ${down_revision | comma,n} -Create Date: ${create_date} - -""" -from alembic import op -import sqlalchemy as sa -${imports if imports else ""} - -# revision identifiers, used by Alembic. 
-revision = ${repr(up_revision)} -down_revision = ${repr(down_revision)} -branch_labels = ${repr(branch_labels)} -depends_on = ${repr(depends_on)} - - -def upgrade(): - ${upgrades if upgrades else "pass"} - - -def downgrade(): - ${downgrades if downgrades else "pass"} diff --git a/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py b/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py deleted file mode 100644 index 2548f7b..0000000 --- a/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py +++ /dev/null @@ -1,98 +0,0 @@ -"""GAMBIT 0.1.0 - -Revision ID: c43540b80d50 -Revises: -Create Date: 2021-07-08 13:34:30.131392 - -Creates 0.1.0 database from scratch. -""" -from alembic import op -import sqlalchemy as sa - -from gambit.db.sqla import JsonString - - -# revision identifiers, used by Alembic. -revision = 'c43540b80d50' -down_revision = None -branch_labels = None -depends_on = None - - -def upgrade(): - op.create_table('genome_sets', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('version', sa.String(), nullable=True), - sa.Column('name', sa.String(), nullable=False), - sa.Column('description', sa.String(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.PrimaryKeyConstraint('id', name=op.f('pk_genome_sets')), - sa.UniqueConstraint('key', 'version', name=op.f('uq_genome_sets_key')) - ) - op.create_index(op.f('ix_genome_sets_key'), 'genome_sets', ['key'], unique=False) - - op.create_table('genomes', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('description', sa.String(), nullable=False), - sa.Column('ncbi_db', sa.String(), nullable=True), - sa.Column('ncbi_id', sa.Integer(), nullable=True), - sa.Column('genbank_acc', sa.String(), nullable=True), - sa.Column('refseq_acc', sa.String(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.PrimaryKeyConstraint('id', 
name=op.f('pk_genomes')), - sa.UniqueConstraint('genbank_acc', name=op.f('uq_genomes_genbank_acc')), - sa.UniqueConstraint('key', name=op.f('uq_genomes_key')), - sa.UniqueConstraint('ncbi_db', 'ncbi_id', name=op.f('uq_genomes_ncbi_db')), - sa.UniqueConstraint('refseq_acc', name=op.f('uq_genomes_refseq_acc')) - ) - - op.create_table('taxa', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('name', sa.String(), nullable=False), - sa.Column('rank', sa.String(), nullable=True), - sa.Column('description', sa.String(), nullable=True), - sa.Column('distance_threshold', sa.Float(), nullable=True), - sa.Column('report', sa.Boolean(), server_default=sa.text('1'), nullable=False), - sa.Column('genome_set_id', sa.Integer(), nullable=False), - sa.Column('parent_id', sa.Integer(), nullable=True), - sa.Column('ncbi_id', sa.Integer(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.ForeignKeyConstraint(['genome_set_id'], ['genome_sets.id'], name=op.f('fk_taxa_genome_set_id_genome_sets'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['parent_id'], ['taxa.id'], name=op.f('fk_taxa_parent_id_taxa'), ondelete='SET NULL'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_taxa')), - sa.UniqueConstraint('key', name=op.f('uq_taxa_key')) - ) - op.create_index(op.f('ix_taxa_genome_set_id'), 'taxa', ['genome_set_id'], unique=False) - op.create_index(op.f('ix_taxa_name'), 'taxa', ['name'], unique=False) - op.create_index(op.f('ix_taxa_ncbi_id'), 'taxa', ['ncbi_id'], unique=False) - op.create_index(op.f('ix_taxa_parent_id'), 'taxa', ['parent_id'], unique=False) - op.create_index(op.f('ix_taxa_rank'), 'taxa', ['rank'], unique=False) - - op.create_table('genome_annotations', - sa.Column('genome_id', sa.Integer(), nullable=False), - sa.Column('genome_set_id', sa.Integer(), nullable=False), - sa.Column('taxon_id', sa.Integer(), nullable=True), - sa.Column('organism', sa.String(), nullable=True), - 
sa.ForeignKeyConstraint(['genome_id'], ['genomes.id'], name=op.f('fk_genome_annotations_genome_id_genomes'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['genome_set_id'], ['genome_sets.id'], name=op.f('fk_genome_annotations_genome_set_id_genome_sets'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['taxon_id'], ['taxa.id'], name=op.f('fk_genome_annotations_taxon_id_taxa'), ondelete='SET NULL'), - sa.PrimaryKeyConstraint('genome_id', 'genome_set_id', name=op.f('pk_genome_annotations')) - ) - op.create_index(op.f('ix_genome_annotations_taxon_id'), 'genome_annotations', ['taxon_id'], unique=False) - - -def downgrade(): - op.drop_index(op.f('ix_genome_annotations_taxon_id'), table_name='genome_annotations') - op.drop_table('genome_annotations') - op.drop_index(op.f('ix_taxa_rank'), table_name='taxa') - op.drop_index(op.f('ix_taxa_parent_id'), table_name='taxa') - op.drop_index(op.f('ix_taxa_ncbi_id'), table_name='taxa') - op.drop_index(op.f('ix_taxa_name'), table_name='taxa') - op.drop_index(op.f('ix_taxa_genome_set_id'), table_name='taxa') - op.drop_table('taxa') - op.drop_table('genomes') - op.drop_index(op.f('ix_genome_sets_key'), table_name='genome_sets') - op.drop_table('genome_sets') diff --git a/gambit/results/__init__.py b/gambit/results/__init__.py deleted file mode 100644 index fefd38c..0000000 --- a/gambit/results/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Export query results in various formats.""" diff --git a/gambit/results/archive.py b/gambit/results/archive.py deleted file mode 100644 index a6c7a2c..0000000 --- a/gambit/results/archive.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Export results to JSON.""" - -import json -from typing import Union, IO, Any - -from attr import attrs, attrib, asdict, has as has_attrs -from sqlalchemy.orm import Session - -from gambit.query import QueryResultItem, QueryResults -from gambit.classify import ClassifierResult, GenomeMatch -from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome -import 
gambit.util.json as gjson -from gambit.util.io import FilePath, maybe_open -from gambit.util.misc import singledispatchmethod, type_singledispatchmethod -from gambit.util.typing import is_optional, unwrap_optional -from .base import asdict_default, BaseJSONResultsExporter - - -def _todict(obj, attrs): - return {a: getattr(obj, a) for a in attrs} - - -@attrs() -class ResultsArchiveWriter(BaseJSONResultsExporter): - """Exports query results to "archive" format which captures all stored data. - - This format is not intended to be read by users of the application. - The exported data can be read and converted back into an identical :class:`QueryResults` - object using :class:`.ResultsArchiveReader`. - - Attributes - ---------- - install_info - Add results of :func:`gambit.util.dev.install_info` to the ``QueryResults.extra`` dict. - """ - install_info: bool = attrib(default=False) - - to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) - - to_json.register(ClassifierResult, asdict_default) - to_json.register(GenomeMatch, asdict_default) - to_json.register(QueryResultItem, asdict_default) - - @to_json.register(QueryResults) - def _queryresults_to_json(self, results): - data = asdict(results) - - if self.install_info: - from gambit.util.dev import install_info - data['extra']['install_info'] = install_info() - - return data - - @to_json.register(ReferenceGenomeSet) - def _genomeset_to_json(self, gset: ReferenceGenomeSet): - return _todict(gset, ['key', 'version']) - - @to_json.register(Taxon) - def _taxon_to_json(self, taxon: Taxon): - return _todict(taxon, ['key']) - - @to_json.register(AnnotatedGenome) - def _genome_to_json(self, genome: AnnotatedGenome): - return _todict(genome, ['key']) - - -class ResultsArchiveReader: - """Loads query results from file created by :class:`ResultsArchiveWriter`. - - Attributes - ---------- - session - SQLAlchemy session used to load database objects. 
- """ - session: Session - - def __init__(self, session): - self.session = session - - @type_singledispatchmethod - def _from_json(self, cls, data, ctx): - """Default implementation.""" - if is_optional(cls): - if data is None: - return None - else: - return self._from_json(unwrap_optional(cls), data, ctx) - - if has_attrs(cls): - return self._attrs_from_json(cls, data, ctx) - else: - return gjson.from_json(data, cls) - - def _attrs_from_json(self, cls, data, ctx, values=None): - """Create an attrs class instance from JSON data. - - ``values`` is a dictionary of already-deserialized attribute values. - """ - kw = dict() - - for a in cls.__attrs_attrs__: - if values is not None and a.name in values: - kw[a.name] = values[a.name] - else: - atype = Any if a.type is None else a.type - kw[a.name] = self._from_json(atype, data[a.name], ctx) - - return cls(**kw) - - @_from_json.register(ReferenceGenomeSet) - def _genomeset_from_json(self, cls, data, ctx): - assert data is not None - return self.session.query(ReferenceGenomeSet).filter_by(key=data['key'], version=data['version']).one() - - @_from_json.register(AnnotatedGenome) - def _genome_from_json(self, cls, data, ctx): - key = data['key'] - gset_id = ctx['genomeset'].id - return self.session.query(AnnotatedGenome)\ - .join(Genome)\ - .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ - .one() - - @_from_json.register(Taxon) - def _taxon_from_json(self, cls, data, ctx): - key = data['key'] - gset_id = ctx['genomeset'].id - return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() - - @_from_json.register(QueryResultItem) - def _result_item_from_json(self, cls, data, ctx): - values = dict( - closest_genomes=[self._from_json(GenomeMatch, genome_data, ctx) for genome_data in data['closest_genomes']], - ) - return self._attrs_from_json(QueryResultItem, data, ctx, values) - - def results_from_json(self, data): - genomeset = self._from_json(ReferenceGenomeSet, data['genomeset'], 
dict()) - - # Add genome set to context so the correct AnnotatedGenomes can be loaded. - ctx = dict(genomeset=genomeset) - - items = [self._from_json(QueryResultItem, item, ctx) for item in data['items']] - return self._attrs_from_json(QueryResults, data, ctx, dict(genomeset=genomeset, items=items)) - - def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: - """Read query results from JSON file. - - Parameters - ---------- - file_or_path - Readable file object or file path. - """ - with maybe_open(file_or_path) as f: - data = json.load(f) - - return self.results_from_json(data) diff --git a/gambit/results/base.py b/gambit/results/base.py deleted file mode 100644 index c44220e..0000000 --- a/gambit/results/base.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from abc import ABC, abstractmethod -from typing import IO, Union, TextIO -from io import StringIO - -from attr import asdict, attrs, attrib - -from gambit.util.io import FilePath, maybe_open -import gambit.util.json as gjson -from gambit.query import QueryResults - - -class AbstractResultsExporter(ABC): - """Base for classes that export formatted query results. - - Subclasses must implement :meth:`export`. - """ - - @abstractmethod - def export(self, file_or_path: Union[FilePath, IO], results: QueryResults): - """Write query results to file. - - Parameters - ---------- - file_or_path - Open file-like object or file path to write to. - results - Results to export. 
- """ - - -def export_to_buffer(results: QueryResults, exporter) -> StringIO: - """Export query results to a `StringIO` buffer.""" - buf = StringIO() - exporter.export(buf, results) - buf.seek(0) - return buf - - -def _todict(obj, attrs): - return {a: getattr(obj, a) for a in attrs} - - -def asdict_method(recurse=False, **kw): - """Create a ``to_json`` method which calls :func:`attrs.asdict` with the given options.""" - def method(self, obj): - return asdict(obj, recurse=recurse, **kw) - return method - - -asdict_default = asdict_method() - - -@attrs() -class BaseJSONResultsExporter(AbstractResultsExporter): - """Base class for JSON exporters. - - Subclasses need to implement the ``to_json`` method. - - Attributes - ---------- - pretty - Write in more human-readable but less compact format. Defaults to False. - """ - pretty: bool = attrib(default=False) - - def to_json(self, obj): - """Convert object to JSON-compatible format (need not work recursively).""" - return gjson.to_json(obj) - - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): - opts = dict(indent=4, sort_keys=True) if self.pretty else dict() - with maybe_open(file_or_path, 'w') as f: - json.dump(results, f, default=self.to_json, **opts) diff --git a/gambit/results/csv.py b/gambit/results/csv.py deleted file mode 100644 index a56fd43..0000000 --- a/gambit/results/csv.py +++ /dev/null @@ -1,68 +0,0 @@ -"""Export query results to CSV.""" - -import csv -from typing import Dict, Any, List, Union, Iterable, TextIO - -from .base import AbstractResultsExporter -from gambit.query import QueryResultItem, QueryResults -from gambit.util.io import FilePath, maybe_open - - -def getattr_nested(obj, attrs: Union[str, Iterable[str]], pass_none=False): - if isinstance(attrs, str): - attrs = attrs.split('.') - - for attr in attrs: - if pass_none and obj is None: - return None - - obj = getattr(obj, attr) - - return obj - - -class CSVResultsExporter(AbstractResultsExporter): - """Exports query 
results in CSV format. - - Attributes - ---------- - format_opts - Dialect and other formatting arguments passed to :func:`csv.write`. - """ - format_opts: Dict[str, Any] - - COLUMNS = [ - ('query', 'input.label'), - ('predicted.name', 'report_taxon.name'), - ('predicted.rank', 'report_taxon.rank'), - ('predicted.ncbi_id', 'report_taxon.ncbi_id'), - ('predicted.threshold', 'report_taxon.distance_threshold'), - ('closest.distance', 'classifier_result.closest_match.distance'), - ('closest.description', 'classifier_result.closest_match.genome.description'), - ('next.name', 'classifier_result.next_taxon.name'), - ('next.rank', 'classifier_result.next_taxon.rank'), - ('next.ncbi_id', 'classifier_result.next_taxon.ncbi_id'), - ('next.threshold', 'classifier_result.next_taxon.distance_threshold'), - ] - - def __init__(self, **format_opts): - if 'dialect' not in format_opts: - format_opts.setdefault('lineterminator', '\n') - format_opts.setdefault('quoting', csv.QUOTE_MINIMAL) - self.format_opts = format_opts - - def get_header(self) -> List[str]: - """Get values for header row.""" - return [name for name, _ in self.COLUMNS] - - def get_row(self, item: QueryResultItem) -> List: - """Get row values for single result item.""" - return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] - - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): - with maybe_open(file_or_path, 'w') as f: - writer = csv.writer(f, **self.format_opts) - - writer.writerow(self.get_header()) - for item in results.items: - writer.writerow(self.get_row(item)) diff --git a/gambit/results/json.py b/gambit/results/json.py deleted file mode 100644 index aa1d662..0000000 --- a/gambit/results/json.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Export results to JSON.""" - -from attr import attrs, asdict - -from .base import _todict, BaseJSONResultsExporter -from gambit.query import QueryResultItem, QueryResults, QueryInput -from gambit.db import ReferenceGenomeSet, 
Taxon, AnnotatedGenome -from gambit.util.misc import singledispatchmethod - - -@attrs() -class JSONResultsExporter(BaseJSONResultsExporter): - """Exports query results in basic JSON format. - - Currently it assumes that the query was run with ``classify_strict=False``, so the only - relevant information from ``ClassifierResult`` is the closest genome match. - """ - - to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) - - @to_json.register(QueryResults) - def _results_to_json(self, results: QueryResults): - data = asdict(results, recurse=False) - del data['params'] # Parameters not currently exposed thru CLI, so omit for now. - return data - - @to_json.register(QueryResultItem) - def _item_to_json(self, item: QueryResultItem): - return dict( - query=item.input, - predicted_taxon=item.report_taxon, - next_taxon=item.classifier_result.next_taxon, - closest_genomes=item.closest_genomes, - ) - - @to_json.register(QueryInput) - def _input_to_json(self, input: QueryInput): - return dict( - name=input.label, - path=None if input.file is None else input.file.path, - format=None if input.file is None else input.file.format, - ) - - @to_json.register(ReferenceGenomeSet) - def _genomeset_to_json(self, gset: ReferenceGenomeSet): - return _todict(gset, ['id', 'key', 'version', 'name', 'description']) - - @to_json.register(Taxon) - def _taxon_to_json(self, taxon: Taxon): - return _todict(taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) - - @to_json.register(AnnotatedGenome) - def _genome_to_json(self, genome: AnnotatedGenome): - data = _todict(genome, ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc']) - data['id'] = genome.genome_id - data['taxonomy'] = list(genome.taxon.ancestors(incself=True)) - return data diff --git a/gambit/results/test.py b/gambit/results/test.py deleted file mode 100644 index 9bb7e80..0000000 --- a/gambit/results/test.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Funcs for testing exported 
data.""" - -import csv -import json -from typing import TextIO -from pathlib import Path - -import numpy as np - -from gambit.util.json import to_json -from gambit.query import QueryResults -from gambit.util.misc import zip_strict - - -def cmp_json_attrs(data, obj, attrnames): - for attr in attrnames: - assert data[attr] == getattr(obj, attr) - -def cmp_taxon_json(taxon_data, taxon): - if taxon is None: - assert taxon_data is None - else: - assert taxon_data is not None - cmp_json_attrs(taxon_data, taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) - -def cmp_annnotatedgenome_json(genome_data, genome): - assert genome_data['id'] == genome.genome_id - cmp_json_attrs( - genome_data, - genome, - ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc'], - ) - for taxon_data, taxon in zip_strict(genome_data['taxonomy'], genome.taxon.ancestors(True)): - cmp_taxon_json(taxon_data, taxon) - -def cmp_genomematch_json(match_data, match): - assert np.isclose(match_data['distance'], match.distance) - cmp_annnotatedgenome_json(match_data['genome'], match.genome) - - assert (match_data['matched_taxon'] is None) == (match.matched_taxon is None) - if match.matched_taxon is not None: - cmp_taxon_json(match_data['matched_taxon'], match.matched_taxon) - -def check_json_results(file: TextIO, - results: QueryResults, - strict: bool = False, - ): - """Check exported JSON data matches the given results object. - - Parameters - ---------- - file - Opened results file. - results - Query results to check against. - strict - If True, expect that ``data`` was exported from the exact same ``results`` object. Otherwise - expect results from a separate query run with the same inputs. - - Raises - ------ - AssertionError - If any of the checks fail. 
- """ - - data = json.load(file) - - assert len(data['items']) == len(results.items) - # assert data['params'] == to_json(results.params) - cmp_json_attrs(data['genomeset'], results.genomeset, ['id', 'key', 'version', 'name', 'description']) - assert data['signaturesmeta'] == to_json(results.signaturesmeta) - # assert data['gambit_version'] == results.gambit_version - assert data['extra'] == results.extra - - if strict: - assert data['timestamp'] == to_json(results.timestamp) - - for item, item_data in zip(results.items, data['items']): - query = item_data['query'] - assert query['name'] == item.input.label - - if item.input.file is None: - assert query['path'] is None - assert query['format'] is None - - else: - assert query['format'] == item.input.file.format - - if strict: - assert query['path'] == str(item.input.file.path) - else: - assert Path(query['path']).name == item.input.file.path.name - - # Predicted taxon - predicted_data = item_data['predicted_taxon'] - cmp_taxon_json(predicted_data, item.report_taxon) - if item.report_taxon is not None: - assert np.isclose(predicted_data['distance_threshold'], item.report_taxon.distance_threshold) - - # Next taxon - cmp_taxon_json(item_data['next_taxon'], item.classifier_result.next_taxon) - - # Closest genomes - for match, match_data in zip_strict(item.closest_genomes, item_data['closest_genomes']): - cmp_genomematch_json(match_data, match) - - -def cmp_csv_taxon(row, taxon, prefix): - - if taxon is None: - assert row[prefix + '.name'] == '' - assert row[prefix + '.rank'] == '' - assert row[prefix + '.ncbi_id'] == '' - assert row[prefix + '.threshold'] == '' - else: - assert row[prefix + '.name'] == taxon.name - assert row[prefix + '.rank'] == taxon.rank - assert row[prefix + '.ncbi_id'] == str(taxon.ncbi_id or '') - assert np.isclose(float(row[prefix + '.threshold']), taxon.distance_threshold) - - -def check_csv_results(file: TextIO, results: QueryResults, strict: bool = False): - """Check exported CSV data matches 
the given results object. - - Parameters - ---------- - file - Opened results file. - results - Query results to check against. - strict - If True, expect that ``data`` was exported from the exact same ``results`` object. Otherwise - expect results from a separate query run with the same inputs. - - Raises - ------ - AssertionError - If any of the checks fail. - """ - - rows = list(csv.DictReader(file)) - assert len(rows) == len(results.items) - - for item, row in zip(results.items, rows): - assert row['query'] == item.input.label - - cmp_csv_taxon(row, item.report_taxon, 'predicted') - cmp_csv_taxon(row, item.classifier_result.next_taxon, 'next') - - closest = item.closest_genomes[0] - assert np.isclose(float(row['closest.distance']), closest.distance) - assert row['closest.description'] == closest.genome.description diff --git a/gambit/seq.py b/gambit/seq.py deleted file mode 100644 index f5c5f92..0000000 --- a/gambit/seq.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Generic code for working with sequence data. - -Note that all code in this package operates on DNA sequences as sequences of -bytes containing ascii-encoded nucleotide codes. - -.. data:: NUCLEOTIDES - - ``bytes`` corresponding to the four DNA nucleotides. Ascii-encoded upper - case letters ``ACGT``. Note that the order, while arbitrary, is important - in this variable as it defines how unique indices are assigned to k-mer - sequences. 
-""" -from pathlib import Path -from typing import Union, Optional, IO, Iterable, List -from os import PathLike - -from Bio import SeqIO -from Bio.Seq import Seq -from attr import attrs, attrib - -from gambit._cython.kmers import revcomp -from gambit.util.io import FilePath -from gambit.util.io import open_compressed, ClosingIterator - - -# Byte representations of the four nucleotide codes in the order used for -# indexing k-mer sequences -NUCLEOTIDES = b'ACGT' - -SEQ_TYPES = (str, bytes, bytearray, Seq) - -#: Union of DNA sequence types accepted for k-mer search / signature calculation. -DNASeq = Union[SEQ_TYPES] - -#: Sequence types accepted directly by native (Cython) code. -DNASeqBytes = Union[bytes, bytearray] - - -def seq_to_bytes(seq: DNASeq) -> DNASeqBytes: - """Convert generic DNA sequence to byte string representation. - - This is for passing sequence data to Cython functions. - """ - if isinstance(seq, (bytes, bytearray)): - return seq - if isinstance(seq, str): - return seq.encode('ascii') - if isinstance(seq, Seq): - # This is recommended in the documentation over the deprecated encode() method, also - # probably avoids copying any data as it typically just returns the seq._data attribute. - return bytes(seq) - raise TypeError(f'Expected sequence type, got {type(seq)}') - - -def validate_dna_seq_bytes(seq : bytes): - """Check that a sequence contains only valid nucleotide codes (upper case). - - Parameters - ---------- - seq : bytes - ASCII-encoded nucleotide sequence. - - Raises - ------ - ValueError - If the sequence contains an invalid nucleotide. - """ - for i, nuc in enumerate(seq): - if nuc not in NUCLEOTIDES: - raise ValueError(f'Invalid byte at position {i}: {nuc}') - - -@attrs(frozen=True, slots=True) -class SequenceFile(PathLike): - """A reference to a DNA sequence file stored in the file system. - - Contains all the information needed to read and parse the file. 
Implements the - :class:`os.PathLike` interface, so it can be substituted for a ``str`` or :class:`pathlib.Path` - in most function arguments that take a file path to open. - - Parameters - ---------- - path : Union[os.PathLike, str] - Value of :attr:`path` attribute. May be string or path-like object. - format : str - Value of :attr:`format` attribute. - compression : Optional[str] - Value of :attr:`compression` attribute. - - Attributes - ---------- - path - Path to the file. - format - String describing the file format as interpreted by :func:`Bio.SeqIO.parse`, e.g. - ``'fasta'``. - compression - String describing compression method of the file, e.g. ``'gzip'``. None means no - compression. See :func:`gambit.util.io.open_compressed`. - """ - path: Path = attrib(converter=Path) - format: str = attrib() - compression: Optional[str] = attrib(default=None) - - def __fspath__(self): - return str(self.path) - - def __str__(self): - return str(self.path) - - def open(self, mode: str = 'r', **kwargs) -> IO: - """ - Open a stream to the file, with compression/decompression applied - transparently. - - Parameters - ---------- - - mode : str - Same as equivalent argument to the built-in :func:open`. Some modes may not be supported - by all compression types. - \\**kwargs - Additional text mode specific keyword arguments to pass to opener. Equivalent to the - following arguments of the built-in :func:`open`: ``encoding``, ``errors``, and - ``newlines``. May not be supported by all compression types. - - Returns - ------- - IO - Stream to file in given mode. - """ - return open_compressed(self.compression, self.path, mode, **kwargs) - - def parse(self, **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: - """Open the file and lazily parse its contents. - - Returns iterator over sequence data in file. File is parsed lazily, - and so must be kept open. 
The returned iterator is of type - :class:`gambit.util.io.ClosingIterator` so it will close the file stream - automatically when it finishes. It may also be used as a context manager - that closes the stream on exit. You may also close the stream explicitly - using the iterator's ``close`` method. - - Parameters - ---------- - \\**kwargs - Keyword arguments to :meth:`open`. - - Returns - ------- - gambit.util.io.ClosingIterator - Iterator yielding :class:`Bio.SeqIO.SeqRecord` instances for each sequence in the file. - """ - - fobj = self.open('rt', **kwargs) - - try: - records = SeqIO.parse(fobj, self.format) - return ClosingIterator(records, fobj) - - except: - fobj.close() - raise - - def absolute(self) -> 'SequenceFile': - """Make a copy of the instance with an absolute path.""" - if self.path.is_absolute(): - return self - else: - return SequenceFile(self.path.absolute(), self.format, self.compression) - - @classmethod - def from_paths(cls, - paths: Iterable[FilePath], - format: str, - compression: Optional[str] = None, - ) -> List['SequenceFile']: - """ - Create many instances at once from a collection of paths and a single - format and compression type. - - Parameters - ---------- - paths - Collection of paths as strings or path-like objects. - format - Sequence file format of files. - compression - Compression method of files. - """ - return [cls(path, format, compression) for path in paths] diff --git a/gambit/sigs/convert.py b/gambit/sigs/convert.py deleted file mode 100644 index f0308b5..0000000 --- a/gambit/sigs/convert.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Convert signatures between representations or from one ``KmerSpec`` to another.""" - -from typing import Sequence, Union - -import numpy as np - -from .base import KmerSignature -from gambit.kmers import KmerSpec, nkmers, kmer_to_index - - -def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: - """Convert k-mer set from dense bit vector to sparse coordinate representation. 
- - Parameters - ---------- - vec - Boolean vector indicating which k-mers are present. - - Returns - ------- - numpy.ndarray - Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. - - See Also - -------- - .sparse_to_dense - """ - return np.flatnonzero(vec) - - -def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: - """Convert k-mer set from sparse coordinate representation back to dense bit vector. - - Parameters - ---------- - k_or_kspec - Value of k or a :class:`.KmerSpec` instance. - coords - Sparse coordinate array. - - Returns - ------- - numpy.ndarray - Dense k-mer bit vector. - - See Also - -------- - .dense_to_sparse - """ - idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) - vec = np.zeros(idx_len, dtype=np.bool_) - vec[coords] = 1 - return vec - - -def can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec) -> bool: - """Check if signatures from one KmerSpec can be converted to another. - - Conversion is possible if ``to_kspec.prefix`` is equal to or starts with ``from_kspec.prefix`` - and ``to_kspec.total_len <= from_kspec.total_len``. - """ - return to_kspec.prefix.startswith(from_kspec.prefix) and to_kspec.total_len <= from_kspec.total_len - - -def check_can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec): - """ - Check that signatures can be converted from one KmerSpec to another or raise an error with an - informative message. - - Raises - ------ - ValueError - If conversion is not possible. 
- """ - if not to_kspec.prefix.startswith(from_kspec.prefix): - raise ValueError('Destination prefix must start with source prefix.') - if to_kspec.total_len > from_kspec.total_len: - raise ValueError('Cannot convert to KmerSpec with longer total length.') - - -def _convert_params(from_kspec: KmerSpec, to_kspec: KmerSpec): - extra_prefix = to_kspec.prefix[from_kspec.prefix_len:] - extra_ind = kmer_to_index(extra_prefix) - extra_len = len(extra_prefix) - - range_ = nkmers(from_kspec.k - extra_len) - start = extra_ind * range_ - stop = (extra_ind + 1) * range_ - reduce = from_kspec.k - to_kspec.k - extra_len - - return start, stop, reduce - - -def convert_dense(from_kspec: KmerSpec, to_kspec: KmerSpec, vec: np.ndarray) -> np.ndarray: - """Convert a k-mer signature in dense format from one ``KmerSpec`` to another. - - In the ideal case, if ``vec`` is the result of ``calc_signature(from_kspec, seq, sparse=False)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq, sparse=False)``. - In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. - """ - check_can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - block_size = nkmers(reduce) - - out = np.zeros(to_kspec.nkmers, dtype=bool) - - for i in range(block_size): - out |= vec[start+i:stop:block_size] - - return out - - -def convert_sparse(from_kspec: KmerSpec, to_kspec: KmerSpec, sig: KmerSignature) -> KmerSignature: - """Convert a k-mer signature in sparse format from one ``KmerSpec`` to another. - - In the ideal case, if ``sig`` is the result of ``calc_signature(from_kspec, seq)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq)``. 
- In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. - """ - assert can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - reduce_bits = 2 * reduce - - out = np.empty(len(sig), dtype=to_kspec.index_dtype) - i = 0 - next_ = start - - for from_idx in sig: - if from_idx < next_: - continue - if from_idx >= stop: - break - - to_idx = (from_idx - start) >> reduce_bits - out[i] = to_idx - i += 1 - - # Next possible input index that won't reduce to the same output - next_ = ((to_idx + 1) << reduce_bits) + start - - out.resize(i) - return out diff --git a/gambit/util/dev.py b/gambit/util/dev.py deleted file mode 100644 index d23c089..0000000 --- a/gambit/util/dev.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Development tools.""" - -from pathlib import Path -import subprocess -import shutil -from typing import Dict, Any - -import gambit -from gambit.util.io import FilePath -from gambit.util.misc import zip_strict - - -_INSTALL_INFO = None - - -def get_commit_info(repo_path: FilePath, commit: str = 'HEAD') -> Dict[str, str]: - """Get metadata on a git commit. - - This calls the ``git`` command, so it must be installed and available. - - Parameters - ---------- - repo_path - Path to git repo. - commit - Commit to get information on. 
- """ - fields = [ - ('hash', '%H'), - ('author', '%an <%ae>'), - ('author_date', '%aI'), - ('commit', '%cn <%ce>'), - ('commit_date', '%cI'), - ('subject', '%s'), - ] - - fmt_str = '%n'.join(fmt for name, fmt in fields) - cmd = ['git', 'show', '-s', '--format=' + fmt_str, commit] - - result = subprocess.run(cmd, cwd=repo_path, capture_output=True, check=True, text=True) - - lines = result.stdout.splitlines() - assert len(lines) == len(fields) - return {name: line for (name, fmt), line in zip_strict(fields, lines)} - - -def _install_info(): - info = dict(pkg_dir=None, repo_dir=None, commit=None) - - if not hasattr(gambit, '__path__'): - info['status'] = 'gambit module has no __path__ attribute.' - return info - - if len(gambit.__path__) != 1: - info['status'] = f'Expected gambit.__path__ to contain single item, got {gambit.__path__!r}' - return info - - pkg_dir = info['pkg_dir'] = Path(gambit.__path__[0]) - repo_dir = pkg_dir.parent - - if not (repo_dir / '.git').is_dir(): - info['status'] = 'Parent of package directory not a git repo (has no .git subdirectory).' - return info - - info['repo_dir'] = repo_dir - - if shutil.which('git') is None: - info['status'] = 'git command not found' - return info - - try: - commit = get_commit_info(repo_dir) - except subprocess.SubprocessError as e: - info['status'] = f'Command {e.cmd!r} returned exit code {e.returncode} with stderr output {e.stderr!r}' - except Exception as e: - info['status'] = f'Error getting commit info: {e!r}' - else: - info['status'] = 'Git info retrieved successfully.' - info['commit'] = commit - - return info - - -def install_info() -> Dict[str, Any]: - """Get information on the GAMBIT installation if it is installed in development mode. - - - If gambit is installed via the setuptools development install method (``pip install -e``), this - checks if the source directory is a valid git repo and tries to get information on the current - commit. 
This is used to mark exported results from development versions of the software which do - not correspond to an official release. - """ - global _INSTALL_INFO - if _INSTALL_INFO is None: - _INSTALL_INFO = _install_info() - return _INSTALL_INFO diff --git a/gambit/util/typing.py b/gambit/util/typing.py deleted file mode 100644 index c705fe1..0000000 --- a/gambit/util/typing.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Utilities based on the built-in ``typing`` module.""" - -import typing -from typing import Union, Any - - -def is_union(T) -> bool: - """Check if a type annotation is a *parameterized* :class:`typing.Union`. - - Parameters - ---------- - T - Result of ``Union[A, B, ...]``. - """ - return isinstance(T, typing._GenericAlias) and T.__origin__ is typing.Union - - -def union_types(T) -> tuple: - """Get the types from a parameterized :class:`typing.Union`. - - Parameters - ---------- - T - Result of ``Union[A, B, ...]``. - """ - return T.__args__ - - -def is_optional(T) -> bool: - """Check if a parametrized union type is equivalent to one returned by :data:`typing.Optional`.""" - if not is_union(T): - return False - types = union_types(T) - return len(types) == 2 and type(None) in types - - -def unwrap_optional(u): - """Get ``T`` from ``typing.Optional[T]``.""" - for T in union_types(u): - if T is not type(None): - return T - - raise ValueError(f'Not an Optional type: {u!r}') diff --git a/pyproject.toml b/pyproject.toml index e42e25d..b685eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,6 @@ requires = [ "setuptools", "wheel", - "Cython ~= 0.27", - # If the Numpy version is different at runtime than build time, the build version should be - # lower as the ABI is forward- but not backwards-compatible. 
- "oldest-supported-numpy", + "Cython >= 3.0", ] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index cd3eb1a..ad43822 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,30 +6,39 @@ author = Jared Lumpe author_email = jared@jaredlumpe.com url = http://github.com/jlumpe/gambit license = AGPL-3.0-or-later -license_file = LICENSE +license_files = LICENSE [options] packages = find: +package_dir = + =src zip_safe = false include_package_data = true -python_requires = >= 3.7 +python_requires = >= 3.9 install_requires = - numpy~=1.13 - sqlalchemy~=1.1 - biopython~=1.69 - alembic~=1.0 - attrs>=20 - cattrs~=1.0 - click>=7.0 - h5py~=3.0 - scipy~=1.7 + # Test failures on 2.x + numpy ~= 1.13 + sqlalchemy >= 1.4 + # Seq stores data as bytes + biopython ~= 1.79 + attrs >= 20 + # Minimum for 3.12, also introduces potentially breaking changes + cattrs >= 23.2 + click >= 7.0 + h5py ~= 3.0 + scipy ~= 1.7 + typing-extensions >= 4.0 tests_require = pytest +[options.packages.find] +where = src + + [options.entry_points] console_scripts = gambit = gambit.cli:cli @@ -47,8 +56,8 @@ test = pytest # Also check docstrings in package testpaths = tests gambit -# Run doctests on all modules (except __main__.py and alembic config directory) -addopts = --doctest-modules --ignore-glob "**/__main__.py" --ignore "gambit/db/migrate/alembic/" +# Run doctests on all modules (except __main__.py) +addopts = --doctest-modules --ignore-glob "**/__main__.py" doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL # Treat functions/classes prefixed with "benchmark" as tests, for files in tests/benchmarks/. @@ -56,11 +65,6 @@ python_functions = test_* benchmark_* python_classes = Test* Benchmark* -# Custom markers -markers = - testdb_nqueries: number of query files from test database to use when testing CLI. 
- - # Flake8 settings [flake8] diff --git a/setup.py b/setup.py index cbcce77..de06f94 100644 --- a/setup.py +++ b/setup.py @@ -3,18 +3,23 @@ from setuptools import setup from distutils.extension import Extension from Cython.Build import cythonize -import numpy # Cython extensions -np_include = numpy.get_include() extensions = [Extension( 'gambit._cython.*', - ['gambit/_cython/*.pyx'], - include_dirs=[np_include], + ['src/gambit/_cython/*.pyx'], extra_compile_args=['-fopenmp', '-Wno-sign-compare'], extra_link_args=['-fopenmp'], )] +ext_modules = cythonize( + extensions, + compiler_directives=dict( + language_level='3str', + boundscheck=False, + wraparound=False, + ), +) -setup(ext_modules=cythonize(extensions)) +setup(ext_modules=ext_modules) diff --git a/gambit/__init__.py b/src/gambit/__init__.py similarity index 86% rename from gambit/__init__.py rename to src/gambit/__init__.py index 7a9aa80..78d441a 100644 --- a/gambit/__init__.py +++ b/src/gambit/__init__.py @@ -5,4 +5,4 @@ Author email: jared@jaredlumpe.com """ -__version__ = '1.0.1' +__version__ = '1.1.0' diff --git a/gambit/__main__.py b/src/gambit/__main__.py similarity index 100% rename from gambit/__main__.py rename to src/gambit/__main__.py diff --git a/gambit/_cython/.gitignore b/src/gambit/_cython/.gitignore similarity index 100% rename from gambit/_cython/.gitignore rename to src/gambit/_cython/.gitignore diff --git a/gambit/_cython/__init__.py b/src/gambit/_cython/__init__.py similarity index 100% rename from gambit/_cython/__init__.py rename to src/gambit/_cython/__init__.py diff --git a/src/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd new file mode 100644 index 0000000..c3f57be --- /dev/null +++ b/src/gambit/_cython/kmers.pxd @@ -0,0 +1,9 @@ +from libc.stdint cimport uint64_t, intptr_t + +ctypedef unsigned char CHAR + + +cdef uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil +cdef uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil +cdef void c_index_to_kmer(uint64_t, CHAR[:]) 
nogil +cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil diff --git a/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx similarity index 64% rename from gambit/_cython/kmers.pyx rename to src/gambit/_cython/kmers.pyx index f0de678..022853d 100644 --- a/gambit/_cython/kmers.pyx +++ b/src/gambit/_cython/kmers.pyx @@ -1,21 +1,39 @@ -# cython: language_level = 3str, wraparound = False, boundscheck = False +"""Cython module for working with DNA sequences and k-mers. -"""Cython module for working with DNA sequences and k-mers.""" +Note: each of the 4 Python functions here have a C counterpart that does the actual work. The Python +version is just a wrapper that does any needed conversion, allocates buffers, and raises exceptions +if needed. The separation currently isn't necessary as the C functions aren't used anywhere else +outside the wrappers, but they may be in the future. Handling exceptions in the Python wrappers only +allows the C functions to be declared with nogil. +""" -cpdef np.uint64_t kmer_to_index(const CHAR[:] kmer) nogil except? 0: - """kmer_to_index(kmer) +def kmer_to_index(const CHAR[:] kmer): + """kmer_to_index(kmer: bytes) -> int Convert k-mer byte string to its integer index. """ cdef: - np.uint64_t idx = 0 - int i, k = kmer.shape[0] - CHAR nuc + uint64_t idx + bint exc = False - if k > 32: + if kmer.shape[0] > 32: raise ValueError('k must be <= 32') + idx = c_kmer_to_index(kmer, &exc) + + if exc: + raise ValueError('Invalid character in k-mer') + + return idx + + +cdef uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: + cdef: + uint64_t idx = 0 + int i, k = kmer.shape[0] + CHAR nuc + for i in range(k): nuc = kmer[i] @@ -31,24 +49,38 @@ cpdef np.uint64_t kmer_to_index(const CHAR[:] kmer) nogil except? 0: elif nuc == 'T': idx += 3 else: - raise ValueError(nuc) + exc[0] = True + return 0 return idx -cpdef np.uint64_t kmer_to_index_rc(const CHAR[:] kmer) nogil except? 
0: - """kmer_to_index_rc(kmer) +def kmer_to_index_rc(const CHAR[:] kmer): + """kmer_to_index_rc(kmer: bytes) -> int Get the integer index of the reverse complement of a k-mer byte string. """ cdef: - np.uint64_t idx = 0 - int i, k = kmer.shape[0] - CHAR nuc + uint64_t idx + bint exc = False - if k > 32: + if kmer.shape[0] > 32: raise ValueError('k must be <= 32') + idx = c_kmer_to_index_rc(kmer, &exc) + + if exc: + raise ValueError('Invalid character in k-mer') + + return idx + + +cdef uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: + cdef: + uint64_t idx = 0 + int i, k = kmer.shape[0] + CHAR nuc + for i in range(k): nuc = kmer[k - i - 1] @@ -64,7 +96,8 @@ cpdef np.uint64_t kmer_to_index_rc(const CHAR[:] kmer) nogil except? 0: elif nuc == 'T': idx += 0 else: - raise ValueError(nuc) + exc[0] = True + return 0 return idx @@ -79,7 +112,7 @@ def index_to_kmer(index, int k): return bytes(buf) -cdef void c_index_to_kmer(np.uint64_t index, CHAR[:] out) nogil: +cdef void c_index_to_kmer(uint64_t index, CHAR[:] out) nogil: """Convert k-mer index to sequence.""" cdef: int k = out.shape[0] diff --git a/gambit/_cython/metric.pxd b/src/gambit/_cython/metric.pxd similarity index 80% rename from gambit/_cython/metric.pxd rename to src/gambit/_cython/metric.pxd index 9b91b06..125ec64 100644 --- a/gambit/_cython/metric.pxd +++ b/src/gambit/_cython/metric.pxd @@ -1,5 +1,4 @@ -# cython: language_level = 3str - +from libc.stdint cimport intptr_t from .types cimport SCORE_T, BOUNDS_T, COORDS_T, COORDS_T_2 cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil diff --git a/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx similarity index 59% rename from gambit/_cython/metric.pyx rename to src/gambit/_cython/metric.pyx index e543d0c..ecb5001 100644 --- a/gambit/_cython/metric.pyx +++ b/src/gambit/_cython/metric.pyx @@ -1,79 +1,18 @@ -# cython: language_level = 3str, wraparound = False - """Cython functions for calculating k-mer distance 
metrics""" -cimport cython -cimport numpy as np - -import numpy as np from cython.parallel import prange, parallel -# Numpy dtypes equivalent to SCORE_T and BOUNDS_T -SCORE_DTYPE = np.dtype(np.float32) -BOUNDS_DTYPE = np.dtype(np.intp) - - def jaccard(COORDS_T[:] coords1, COORDS_T_2[:] coords2): - """Compute the Jaccard index between two k-mer sets in sparse coordinate format. - - Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, - or 64-bit signed or unsigned integers, but do not need to match. - - This is by far the most efficient way to calculate the metric (this is a native function) and - should be used wherever possible. - - Parameters - ---------- - coords1 : numpy.ndarray - K-mer set in sparse coordinate format. - coords2 : numpy.ndarray - K-mer set in sparse coordinate format. - - Returns - ------- - numpy.float32 - Jaccard index between the two sets, a real number between 0 and 1. - - See Also - -------- - .jaccarddist - """ + """Compute the Jaccard index between two k-mer sets in sparse coordinate format.""" return 1 - c_jaccarddist(coords1, coords2) def jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2): - """Compute the Jaccard distance between two k-mer sets in sparse coordinate format. - - The Jaccard distance is equal to one minus the Jaccard index. - - Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, - or 64-bit signed or unsigned integers, but do not need to match. - - This is by far the most efficient way to calculate the metric (this is a native function) and - should be used wherever possible. - - Parameters - ---------- - coords1 : numpy.ndarray - K-mer set in sparse coordinate format. - coords2 : numpy.ndarray - K-mer set in sparse coordinate format. - - Returns - ------- - numpy.float32 - Jaccard distance between the two sets, a real number between 0 and 1. 
- - See Also - -------- - .jaccard - """ + """Compute the Jaccard distance between two k-mer sets in sparse coordinate format.""" return c_jaccarddist(coords1, coords2) -@cython.boundscheck(False) -@cython.wraparound(False) cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: """Compute the Jaccard distance between two k-mer sets in ordered coordinate format. @@ -82,15 +21,15 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: cdef: # Lengths of the two arrays - np.intp_t N = coords1.shape[0] - np.intp_t M = coords2.shape[0] + intptr_t N = coords1.shape[0] + intptr_t M = coords2.shape[0] # Index and value of items in each array as we are iterating - np.intp_t i = 0, j = 0 + intptr_t i = 0, j = 0 COORDS_T a COORDS_T_2 b - np.intp_t u = 0 # Size of union + intptr_t u = 0 # Size of union # Iterate through both arrays simultaneously, advance index for the array # with the smaller value. Advance both if they are equal. Increment the @@ -121,8 +60,6 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: return (2 * u - N - M) / u -@cython.boundscheck(False) -@cython.wraparound(False) def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[:] ref_bounds, SCORE_T[:] out): """Calculate Jaccard distances between a query k-mer set and a collection of reference sets. @@ -144,7 +81,7 @@ def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[ out : numpy.ndarray Pre-allocated array to write distances to. 
""" - cdef np.intp_t N = ref_bounds.shape[0] - 1 + cdef intptr_t N = ref_bounds.shape[0] - 1 cdef BOUNDS_T begin, end cdef int i diff --git a/src/gambit/_cython/threads.pyx b/src/gambit/_cython/threads.pyx new file mode 100644 index 0000000..d663efa --- /dev/null +++ b/src/gambit/_cython/threads.pyx @@ -0,0 +1,60 @@ +"""OpenMP stuff.""" + +from cython import parallel +import array + +cimport cython +from cpython cimport array +cimport openmp + + +def omp_set_num_threads(n: int): + """Set maximum number of threads used by OpenMP. + + Just calls the ``omp_set_num_threads`` C function. + """ + if n <= 0: + raise ValueError('Argument must be positive.') + openmp.omp_set_num_threads(n) + + +def omp_get_max_threads(): + """Get the maximum number of threads used by OpenMP. + + Just calls the ``omp_get_max_threads`` C function. + """ + return openmp.omp_get_max_threads() + + +@cython.boundscheck(True) +def get_thread_ids(int n): + """Run a multithreaded loop and get the thread ID running in each iteration. + + Used to check that Cython code parallelization is working correctly. Result should contain + integers from 0 to ``num_threads``, repeated up to length ``n``. + + Parameters + ---------- + n: int + Size of loop. Make this at least as large as the expected number of threads. + + Returns + ------- + array.array + Array of size ``n`` containing the thread ID running in each loop iteration. 
+ """ + + cdef: + array.array thread_ids_arr = array.array('i') + int[:] thread_ids + int i + + for i in range(n): + thread_ids_arr.append(-1) + + thread_ids = thread_ids_arr + + for i in parallel.prange(n, nogil=True, schedule='static', chunksize=1): + thread_ids[i] = parallel.threadid() + + return thread_ids diff --git a/gambit/_cython/types.pxd b/src/gambit/_cython/types.pxd similarity index 57% rename from gambit/_cython/types.pxd rename to src/gambit/_cython/types.pxd index 56992bb..731a4d1 100644 --- a/gambit/_cython/types.pxd +++ b/src/gambit/_cython/types.pxd @@ -1,28 +1,23 @@ """Shared typedefs.""" -cimport numpy as np +from libc.stdint cimport uint16_t, uint32_t, uint64_t, intptr_t # Type for similarity scores -ctypedef np.float32_t SCORE_T +ctypedef float SCORE_T # Type for bounds on c_jaccard_coords_col -ctypedef np.intp_t BOUNDS_T +# This should be equal to Numpy's intp dtype +ctypedef intptr_t BOUNDS_T # Fused type for storing k-mer coordinates/indices ctypedef fused COORDS_T: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + uint16_t + uint32_t + uint64_t # Copy of COORDS_T, used when two arguments have types in this set but may be different than each other. 
ctypedef fused COORDS_T_2: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + uint16_t + uint32_t + uint64_t diff --git a/gambit/classify.py b/src/gambit/classify.py similarity index 83% rename from gambit/classify.py rename to src/gambit/classify.py index 948106c..d3d609a 100644 --- a/gambit/classify.py +++ b/src/gambit/classify.py @@ -1,6 +1,6 @@ """Classify queries based on distance to reference sequences.""" -from typing import Optional, Tuple, Iterable, Dict, List, Set, Sequence +from typing import Optional, Iterable, Sequence from attr import attrs, attrib import numpy as np @@ -30,7 +30,7 @@ def matching_taxon(taxon: Taxon, d: float) -> Optional[Taxon]: return None -def find_matches(itr: Iterable[Tuple[AnnotatedGenome, float]]) -> Dict[Taxon, List[int]]: +def find_matches(itr: Iterable[tuple[AnnotatedGenome, float]]) -> dict[Taxon, list[int]]: """Find taxonomy matches given distances from a query to a set of reference genomes. Parameters @@ -40,7 +40,7 @@ def find_matches(itr: Iterable[Tuple[AnnotatedGenome, float]]) -> Dict[Taxon, Li Returns ------- - Dict[Taxon, List[Int]] + dict[Taxon, list[int]] Mapping from taxa to indices of genomes matched to them. """ matches = dict() @@ -53,7 +53,7 @@ def find_matches(itr: Iterable[Tuple[AnnotatedGenome, float]]) -> Dict[Taxon, Li return matches -def consensus_taxon(taxa: Iterable[Taxon]) -> Tuple[Optional[Taxon], Set[Taxon]]: +def consensus_taxon(taxa: Iterable[Taxon]) -> tuple[Optional[Taxon], set[Taxon]]: """Take a set of taxa matching a query and find a single consensus taxon for classification. If a query matches a given taxon, it is expected that there may be matches to some of that @@ -75,7 +75,7 @@ def consensus_taxon(taxa: Iterable[Taxon]) -> Tuple[Optional[Taxon], Set[Taxon]] Returns ------- - Tuple[Optional[Taxon], Set[Taxon]] + tuple[Optional[Taxon], set[Taxon]] Consensus taxon along with the set of any taxa in the argument which are descended from it. 
""" taxa = list(taxa) @@ -132,7 +132,7 @@ class GenomeMatch: Reference genome matched to. distance Distance between query and reference genome. - matching_taxon + matched_taxon Taxon prediction based off of this match alone. Will always be ``genome.taxon`` or one of its ancestors. """ @@ -163,23 +163,6 @@ def next_taxon(self) -> Optional[Taxon]: return lo -def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]) -> bool: - """Compare two ``GenomeMatch`` instances for equality. - - The values for the ``distance`` attribute are only checked for approximate equality, to support - instances where one was loaded from a results archive (saving and loading a float in JSON is - lossy). - - Also allows one or both values to be None. - """ - if match1 is None or match2 is None: - return match1 is None and match2 is None - - return match1.genome == match2.genome and \ - match1.matched_taxon == match2.matched_taxon and \ - np.isclose(match1.distance, match2.distance) - - @attrs() class ClassifierResult: """Result of applying the classifier to a single query genome. 
@@ -210,7 +193,7 @@ class ClassifierResult: primary_match: Optional[GenomeMatch] = attrib() closest_match: GenomeMatch = attrib() next_taxon: Optional[Taxon] = attrib() - warnings: List[str] = attrib(factory=list, repr=False) + warnings: list[str] = attrib(factory=list, repr=False) error: Optional[str] = attrib(default=None, repr=False) @next_taxon.default @@ -218,17 +201,6 @@ def _next_taxon_default(self): return self.closest_match.next_taxon() -def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool: - """Compare two ``ClassifierResult`` instances for equality.""" - return result1.success == result2.success and \ - result1.predicted_taxon == result2.predicted_taxon and \ - compare_genome_matches(result1.primary_match, result2.primary_match) and \ - compare_genome_matches(result1.closest_match, result2.closest_match) and \ - result1.next_taxon == result2.next_taxon and \ - set(result1.warnings) == set(result2.warnings) and \ - result1.error == result2.error - - def classify(ref_genomes: Sequence[AnnotatedGenome], dists: np.ndarray, *, @@ -313,7 +285,7 @@ def classify(ref_genomes: Sequence[AnnotatedGenome], # Warn of inconsistent matches if others: msg = f'Query matched {len(others)} inconsistent taxa: ' - msg += ', '.join(other.short_repr() for other in others) + msg += ', '.join(sorted(other.short_repr() for other in others)) msg += '. Reporting lowest common ancestor of this set.' 
result.warnings.append(msg) diff --git a/gambit/cli/__init__.py b/src/gambit/cli/__init__.py similarity index 100% rename from gambit/cli/__init__.py rename to src/gambit/cli/__init__.py diff --git a/gambit/cli/common.py b/src/gambit/cli/common.py similarity index 86% rename from gambit/cli/common.py rename to src/gambit/cli/common.py index 998a2b1..37612f2 100644 --- a/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -1,19 +1,17 @@ import os -from typing import Optional, Sequence, TextIO, Union, Iterable, Tuple, List +from typing import Optional, Sequence, TextIO, Union, Iterable, Any from pathlib import Path from collections import Counter import click -from sqlalchemy import create_engine -from sqlalchemy.engine import Engine from sqlalchemy.orm import sessionmaker from gambit.kmers import KmerSpec, DEFAULT_KMERSPEC -from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset, DatabaseLoadError +from gambit.db import ReferenceDatabase, only_genomeset, DatabaseLoadError, file_sessionmaker from gambit.sigs.base import ReferenceSignatures, load_signatures from gambit.util.io import FilePath, read_lines from gambit.util.misc import join_list_human -from gambit.seq import validate_dna_seq_bytes, SequenceFile +from gambit.seq import validate_dna_seq_bytes class CLIContext: @@ -37,8 +35,6 @@ class CLIContext: Whether reference signatures are available. has_database Whether reference genome metadata and reference signatures are both available. - engine - SQLAlchemy engine connecting to genomes database. Session SQLAlchemy session maker for genomes database. 
signatures @@ -49,7 +45,6 @@ class CLIContext: has_genomes: bool has_signatures: bool has_database: bool - engine: Optional[Engine] Session: Optional[sessionmaker] signatures: Optional[ReferenceSignatures] @@ -121,16 +116,10 @@ def require_signatures(self): self.require_database() def _init_genomes(self): - if self._engine is not None or not self.has_genomes: + if self._Session is not None or not self.has_genomes: return - self._engine = create_engine(f'sqlite:///{self._genomes_path}') - self._Session = sessionmaker(self.engine, class_=ReadOnlySession) - - @property - def engine(self): - self._init_genomes() - return self._engine + self._Session = file_sessionmaker(self._genomes_path) @property def Session(self): @@ -282,7 +271,7 @@ def strip_seq_file_ext(filename: str) -> str: return filename -def get_file_id(path: FilePath, strip_dir: bool = True, strip_ext: bool = True) -> str: +def get_file_id(path: 'FilePath', strip_dir: bool = True, strip_ext: bool = True) -> str: """Get sequence file ID derived from file path. Parameters @@ -305,7 +294,7 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, listfile_dir: Optional[str]=None, strip_dir: bool = True, strip_ext: bool = True, - ) -> Union[Tuple[List[str], List[SequenceFile]], Tuple[None, None]]: + ) -> Union[tuple[list[str], list[Path]], tuple[None, None]]: """Get list of sequence file paths and IDs from several types of CLI arguments. Does not check for conflict between ``explicit`` and ``listfile``. @@ -325,30 +314,29 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, Returns ------- - Tuple[Optional[List[str]], Optional[List[SequenceFile]]] + tuple[Optional[list[str]], Optional[list[Path]]] ``(ids, files)`` tuple. ``ids`` is a list of string IDs that can be used to label output. If the ``explicit`` and ``listfile`` arguments are None/empty both components of the tuple will be None as well. 
""" if explicit: - paths = list(map(Path, explicit)) - paths_str = list(map(str, paths)) + files = list(map(Path, explicit)) + files_str = list(map(str, files)) elif listfile is not None: lines = list(read_lines(listfile, skip_empty=True)) - paths = [Path(listfile_dir) / line for line in lines] - paths_str = lines + files = [Path(listfile_dir) / line for line in lines] + files_str = lines else: return None, None - files = SequenceFile.from_paths(paths, 'fasta', 'auto') - ids = [get_file_id(f, strip_dir, strip_ext) for f in paths_str] + ids = [get_file_id(f, strip_dir, strip_ext) for f in files_str] return ids, files -def warn_duplicate_file_ids(ids: List[str], template: str): +def warn_duplicate_file_ids(ids: list[str], template: str): """Print a warning message if duplicate file IDs are present. Parameters @@ -370,7 +358,7 @@ def warn_duplicate_file_ids(ids: List[str], template: str): # Click introspection ################################################################################ -def params_by_name(cmd: click.Command, names: Optional[Iterable[str]]=None): +def params_by_name(cmd: click.Command, names: Optional[Iterable[str]] = None): """Get parameters of click command by name. Parameters @@ -462,3 +450,39 @@ def print_table(rows: Sequence[Sequence], colsep: str=' ', left: str='', right: echo(right) echo('\n') + + +def get_revision_info(revision) -> Optional[dict[str, Any]]: + """Extract revision information from metadata JSON. + + :class:`gambit.sigs.base.SignaturesMeta` and :class:`gambit.db.models.ReferenceGenomeSet` + (stored in ``.gs`` and ``.gdb`` files) have an ``extra`` field to store additional metadata + in JSON format. There is no prescribed format for this, but the official GAMBIT database files + have a "revision" key that is an object with a common set of fields. This function attempts to + extract that data, without causing an error if the format is not as expected. 
+ + Parameters + ---------- + revision + Value under extra metadata's "revision" key, or None. + + Returns + ------- + None if ``revision`` is not a dict, otherwise a dict with ``'num'`` , ``'date'```, + ``'author'``, and ``'description'`` keys. + """ + if not isinstance(revision, dict): + return None + + info = dict() + fields = [('num', int), ('date', str), ('author', str), ('description', str)] + + for name, type_ in fields: + if name not in revision: + info[name] = '' + elif isinstance(revision[name], type_): + info[name] = revision[name] + else: + info[name] = '' + + return info diff --git a/gambit/cli/debug.py b/src/gambit/cli/debug.py similarity index 87% rename from gambit/cli/debug.py rename to src/gambit/cli/debug.py index b2f827d..3494bf2 100644 --- a/gambit/cli/debug.py +++ b/src/gambit/cli/debug.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Any, Optional import click @@ -21,7 +21,7 @@ def debug_group(): pass -def make_shell_ns(ctx) -> Dict[str, Any]: +def make_shell_ns(ctx) -> dict[str, Any]: """Make the user namespace for the shell command.""" from importlib import import_module @@ -43,18 +43,17 @@ def make_shell_ns(ctx) -> Dict[str, Any]: help='Use IPython instead of built-in Python REPL.', ) @click.pass_context -def shell(ctx, use_ipython): +def shell(ctx, use_ipython: Optional[bool]): """Start an interactive shell with application data and modules imported. Attempts to launch an IPython interactive interpreter if it is installed, otherwise falls back on standard Python REPL. 
""" - from gambit.util.misc import is_importable - if use_ipython is None: - if is_importable('IPython'): + try: + import IPython use_ipython = True - else: + except ImportError: click.echo('IPython not available, defaulting to built-in Python REPL.', err=True) use_ipython = False diff --git a/gambit/cli/dist.py b/src/gambit/cli/dist.py similarity index 91% rename from gambit/cli/dist.py rename to src/gambit/cli/dist.py index 98cfcdf..7928139 100644 --- a/gambit/cli/dist.py +++ b/src/gambit/cli/dist.py @@ -1,11 +1,10 @@ import sys -from typing import Optional, TextIO, List +from typing import Optional, TextIO import click from . import common from .root import cli -from gambit.seq import SequenceFile from gambit.sigs import load_signatures from gambit.sigs.calc import calc_file_signatures from gambit.metric import jaccarddist_matrix, jaccarddist_pairwise @@ -41,11 +40,11 @@ def dist_cmd(ctx: click.Context, k: Optional[int], prefix: Optional[str], output: str, - q: List[str], + q: list[str], ql: Optional[TextIO], qdir: Optional[str], qs: Optional[str], - r: List[str], + r: list[str], rl: Optional[TextIO], rdir: Optional[str], rs: Optional[str], @@ -111,11 +110,11 @@ def dist_cmd(ctx: click.Context, else: if query_sigs is not None and query_sigs.kmerspec != kspec: raise click.ClickException( - f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not' + f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not ' f'match those of query signatures ({fmt_kspec(query_sigs.kmerspec)}).') if ref_sigs is not None and ref_sigs.kmerspec != kspec: raise click.ClickException( - f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not' + f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not ' f'match those of reference signatures ({fmt_kspec(ref_sigs.kmerspec)}).') prog = 'click' if progress else None @@ -137,9 +136,8 @@ def dist_cmd(ctx: click.Context, # Calculate signatures if needed 
if query_sigs is None: - query_sigfiles = SequenceFile.from_paths(query_files, 'fasta', 'auto') query_pconf = progress_config(prog, desc='Calculating query genome signatures') if len(query_files) > 1 else None - query_sigs = calc_file_signatures(kspec, query_sigfiles, progress=query_pconf, max_workers=cores) + query_sigs = calc_file_signatures(kspec, query_files, progress=query_pconf, max_workers=cores) # Calculate distances dist_pconf = progress_config(prog, desc='Calculating distances') @@ -151,9 +149,8 @@ def dist_cmd(ctx: click.Context, else: if ref_sigs is None: - ref_sigfiles = SequenceFile.from_paths(ref_files, 'fasta', 'auto') ref_pconf = progress_config('click', desc='Calculating reference genome signatures') if len(ref_files) > 1 else None - ref_sigs = calc_file_signatures(kspec, ref_sigfiles, progress=ref_pconf) + ref_sigs = calc_file_signatures(kspec, ref_files, progress=ref_pconf) dmat = jaccarddist_matrix(query_sigs, ref_sigs, progress=dist_pconf) diff --git a/gambit/cli/query.py b/src/gambit/cli/query.py similarity index 77% rename from gambit/cli/query.py rename to src/gambit/cli/query.py index e7e945d..5d4f495 100644 --- a/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -1,30 +1,28 @@ import sys -from typing import TextIO, Optional, List +from typing import TextIO, Optional import click from . 
import common from .root import cli -from gambit.query import QueryParams, QueryInput, query, query_parse +from gambit.query import QueryParams, query, query_parse from gambit.util.progress import progress_config from gambit.sigs import load_signatures +from gambit.results import CSVResultsExporter, JSONResultsExporter, ResultsArchiveWriter from gambit._cython.threads import omp_set_num_threads -def get_exporter(outfmt: str): +def get_exporter(outfmt: str, pretty: bool): if outfmt == 'csv': - from gambit.results.csv import CSVResultsExporter return CSVResultsExporter() if outfmt == 'json': - from gambit.results.json import JSONResultsExporter - return JSONResultsExporter() + return JSONResultsExporter(pretty=pretty) if outfmt == 'archive': - from gambit.results.archive import ResultsArchiveWriter - return ResultsArchiveWriter(install_info=True) + return ResultsArchiveWriter(pretty=pretty) - assert 0 + raise ValueError(f'Invalid output format: {outfmt!r}') @cli.command(name='query', no_args_is_help=True) @@ -53,17 +51,23 @@ def get_exporter(outfmt: str): type=common.filepath(exists=True), help='File containing query signatures, to use in place of GENOMES.', ) +@click.option( + '--pretty/--no-pretty', + default=False, + hidden=True, +) @common.progress_param() @common.cores_param() @click.pass_context def query_cmd(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], sigfile: Optional[str], output: TextIO, outfmt: str, strict: bool, + pretty: bool, progress: bool, cores: Optional[int], ): @@ -73,7 +77,7 @@ def query_cmd(ctx: click.Context, db = ctx.obj.get_database() params = QueryParams(classify_strict=strict) - exporter = get_exporter(outfmt) + exporter = get_exporter(outfmt, pretty) pconf = progress_config('click' if progress else None) if cores is not None: @@ -81,15 +85,14 @@ def query_cmd(ctx: click.Context, if sigfile: sigs = load_signatures(sigfile) - inputs = [QueryInput(id) for id in sigs.ids] 
- results = query(db, sigs, params, inputs=inputs, progress=pconf) + results = query(db, sigs, params, progress=pconf) else: ids, files = common.get_sequence_files(files_arg, listfile, ldir) common.warn_duplicate_file_ids(ids, 'Warning: the following query file IDs are present more than once: {ids}') results = query_parse( db, files, params, - file_labels=ids, + labels=ids, progress=pconf, parse_kw=dict(max_workers=cores), ) diff --git a/gambit/cli/root.py b/src/gambit/cli/root.py similarity index 84% rename from gambit/cli/root.py rename to src/gambit/cli/root.py index 2bb273e..5d959ac 100644 --- a/gambit/cli/root.py +++ b/src/gambit/cli/root.py @@ -17,5 +17,10 @@ @click.version_option(GAMBIT_VERSION, prog_name='gambit') @click.pass_context def cli(ctx: click.Context, **kw): - """Tool for rapid taxonomic identification of microbial pathogens from genomic data.""" + """Tool for rapid taxonomic identification of microbial pathogens from genomic data. + + http://github.com/jlumpe/gambit + + Copyright (C) 2016-2024 Jared Lumpe + """ ctx.obj = CLIContext(ctx) diff --git a/gambit/cli/signatures.py b/src/gambit/cli/signatures.py similarity index 83% rename from gambit/cli/signatures.py rename to src/gambit/cli/signatures.py index 39d29c7..d60628b 100644 --- a/gambit/cli/signatures.py +++ b/src/gambit/cli/signatures.py @@ -1,5 +1,6 @@ -from typing import Optional, TextIO, List +from typing import Optional, TextIO import sys +from pathlib import Path import click @@ -49,8 +50,8 @@ def signatures_group(): required=False, ) @click.pass_context -def info(ctx: click.Context, file: str, json: bool, pretty: bool, ids: bool, use_db: bool): - """Inspect GAMBIT signature files.""" +def info(ctx: click.Context, file: Path, json: bool, pretty: bool, ids: bool, use_db: bool): + """Inspect GAMBIT signature (.gs) files.""" common.check_params_group(ctx, ['file', 'use_db'], True, True) common.check_params_group(ctx, ['ids', 'json'], True, False) @@ -79,26 +80,50 @@ def info(ctx: 
click.Context, file: str, json: bool, pretty: bool, ids: bool, use gjson.dump(data, sys.stdout, **kw) else: - rows1 = [ + rows = [ ('Genome Count:', len(sigs)), ('k:', sigs.kmerspec.k), ('Prefix:', sigs.kmerspec.prefix.decode('ascii')), ('File format:', f'HDF5, version {sigs.format_version}'), # HDF5-specific ('Data type:', sigs.dtype), ] - common.print_table(rows1, colsep=' ') + common.print_table(rows, colsep=' ') print('Metadata:') - rows2 = [ + rows = [ ('ID:', format_none(sigs.meta.id)), - ('Name:', format_none(sigs.meta.name)), ('Version:', format_none(sigs.meta.version)), + ('Name:', format_none(sigs.meta.name)), ('Description:', format_none(sigs.meta.description)), ('Genome ID attribute:', format_none(sigs.meta.id_attr)), - ('Has extra:', 'yes' if sigs.meta.extra else 'no'), ] - common.print_table(rows2, colsep=' ', left=' ') + common.print_table(rows, colsep=' ', left=' ') + + extra = sigs.meta.extra + + if extra: + revision = common.get_revision_info(extra.get('revision')) + + print('Additional metadata:') + + rows = [ + ('Author:', format_none(extra.get('author'))), + ('Revision:', '' if revision is None else ''), + ] + common.print_table(rows, colsep=' ', left=' ') + + if revision is not None: + rows = [ + ('Number:', revision['num']), + ('Date:', revision['date']), + ('Author:', revision['author']), + ('Description:', revision['description']), + ] + common.print_table(rows, colsep=' ', left=' ') + + else: + print('No additional metadata') @signatures_group.command(no_args_is_help=True) @@ -134,7 +159,7 @@ def info(ctx: click.Context, file: str, json: bool, pretty: bool, ids: bool, use def create(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], output: str, prefix: Optional[str], k: Optional[int], @@ -183,7 +208,7 @@ def create(ctx: click.Context, if dump_params: params = dict( kmerspec=kspec, - files=[f.path for f in files], + files=files, meta=meta, ids=ids, ) diff --git 
a/gambit/cli/tree.py b/src/gambit/cli/tree.py similarity index 86% rename from gambit/cli/tree.py rename to src/gambit/cli/tree.py index d42213e..a252c30 100644 --- a/gambit/cli/tree.py +++ b/src/gambit/cli/tree.py @@ -1,12 +1,11 @@ import sys -from typing import Optional, TextIO, List +from typing import Optional, TextIO import click from Bio import Phylo from . import common from .root import cli -from gambit.seq import SequenceFile from gambit.sigs import load_signatures from gambit.sigs.calc import calc_file_signatures from gambit.metric import jaccarddist_pairwise @@ -31,7 +30,7 @@ def tree_cmd(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], sigfile: Optional[str], k: Optional[int], prefix: Optional[str], @@ -58,8 +57,7 @@ def tree_cmd(ctx: click.Context, common.warn_duplicate_file_ids(labels, 'Warning: the following file IDs are present more than once: {ids}') kspec = common.kspec_from_params(k, prefix, default=True) - sigfiles = SequenceFile.from_paths(genome_files, 'fasta', 'auto') - sigs = calc_file_signatures(kspec, sigfiles, progress=pconf.update(desc='Calculating signatures'), max_workers=cores) + sigs = calc_file_signatures(kspec, genome_files, progress=pconf.update(desc='Calculating signatures'), max_workers=cores) # Calculate distances dmat = jaccarddist_pairwise(sigs, progress=pconf.update(desc='Calculating distances')) diff --git a/gambit/cluster.py b/src/gambit/cluster.py similarity index 95% rename from gambit/cluster.py rename to src/gambit/cluster.py index 5f27397..869e886 100644 --- a/gambit/cluster.py +++ b/src/gambit/cluster.py @@ -1,6 +1,6 @@ """Distance matrices and basic clustering/trees.""" -from typing import Union, Optional, Sequence, TextIO, Tuple, List +from typing import Union, Optional, Sequence, TextIO import csv import numpy as np @@ -119,7 +119,7 @@ def check_clade(clade): assert root_i == nleaves * 2 - 2 -def dump_dmat_csv(file: Union[FilePath, TextIO], +def 
dump_dmat_csv(file: Union['FilePath', TextIO], dmat: np.ndarray, row_ids: Sequence, col_ids: Sequence, @@ -136,7 +136,7 @@ def dump_dmat_csv(file: Union[FilePath, TextIO], writer.writerow([str(row_id), *values_str]) -def load_dmat_csv(file: Union[FilePath, TextIO]) -> Tuple[np.ndarray, List[str], List[str]]: +def load_dmat_csv(file: Union['FilePath', TextIO]) -> tuple[np.ndarray, list[str], list[str]]: """Load distance matrix from CSV file. Returns diff --git a/gambit/db/__init__.py b/src/gambit/db/__init__.py similarity index 70% rename from gambit/db/__init__.py rename to src/gambit/db/__init__.py index a32c494..7e9f270 100644 --- a/gambit/db/__init__.py +++ b/src/gambit/db/__init__.py @@ -1,3 +1,3 @@ from .models import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, reportable_taxon, only_genomeset from .refdb import ReferenceDatabase, load_genomeset, DatabaseLoadError -from .sqla import file_sessionmaker, ReadOnlySession +from .sqla import default_sessionmaker, file_sessionmaker, ReadOnlySession diff --git a/gambit/db/models.py b/src/gambit/db/models.py similarity index 93% rename from gambit/db/models.py rename to src/gambit/db/models.py index bd6c16c..043cf83 100644 --- a/gambit/db/models.py +++ b/src/gambit/db/models.py @@ -1,13 +1,12 @@ """SQLAlchemy models for storing reference genomes and taxonomy information.""" -from typing import List, Any, Optional, Iterable, Collection, Callable +from typing import Any, Optional, Iterable, Collection, Callable import sqlalchemy as sa from sqlalchemy import Column, Integer, String, Boolean, Float from sqlalchemy import ForeignKey, UniqueConstraint -from sqlalchemy.orm import Session, relationship, backref, deferred +from sqlalchemy.orm import Session, relationship, backref, deferred, declarative_base from sqlalchemy.ext.hybrid import hybrid_property -from sqlalchemy.ext.declarative import declarative_base, declared_attr from sqlalchemy.exc import MultipleResultsFound, NoResultFound from .sqla import JsonString @@ 
-63,21 +62,16 @@ class Genome(Base): String column (optional, unique). RefSeq accession number for this genome, if any. extra : Optional[dict] JSON column (optional). Additional arbitrary metadata. - annotations : Collection[.AnnotatedGenome] + annotations : Collection[AnnotatedGenome] One-to-many relationship to :class:`.AnnotatedGenome`. """ __tablename__ = 'genomes' + __table_args__= (UniqueConstraint('ncbi_db', 'ncbi_id'),) #: Attributes which serve as unique IDs. ID_ATTRS = ('key', 'genbank_acc', 'refseq_acc', 'ncbi_id') - @declared_attr - def __table_args__(cls): - return ( - UniqueConstraint('ncbi_db', 'ncbi_id'), - ) - id = Column(Integer(), primary_key=True) key = Column(String(), unique=True, nullable=False) description = Column(String(), nullable=False) @@ -99,8 +93,8 @@ class ReferenceGenomeSet(Base): database which can be used for queries consists of a genome set plus a set of k-mer signatures for those genomes (stored separately). - Membership of :class:`.Genome`s in the set is determined by the presence of an associated - :class:`.AnnotatedGenomes` object, which also holds additional annotation data for the genome. + Membership of :class:`.Genome`\\ s in the set is determined by the presence of an associated + :class:`.AnnotatedGenome` object, which also holds additional annotation data for the genome. The genome set also includes a set of associated :class:`.Taxon` entries, which form a taxonomy tree under which all its genomes are categorized. @@ -125,23 +119,18 @@ class ReferenceGenomeSet(Base): Text column. Optional description. extra : Optional[dict] JSON column. Additional arbitrary data. - genomes : Collection[.AnnotatedGenome] + genomes : Collection[AnnotatedGenome] Many-to-many relationship with :class:`.AnnotatedGenome`, annotated versions of genomes in this set. - base_genomes : Collection[.Genome] + base_genomes : Collection[Genome] Unannotated :class:`Genome`\\ s in this set. 
Association proxy to the ``genome`` - relationship of members of :attr:`genome`. - taxa : Collection[.Taxon] + relationship of members of :attr:`genomes`. + taxa : Collection[Taxon] One-to-many relationship to :class:`.Taxon`. The taxa that form the classification system for this genome set. """ __tablename__ = 'genome_sets' - - @declared_attr - def __table_args__(cls): - return ( - UniqueConstraint('key', 'version'), - ) + __table_args__ = (UniqueConstraint('key', 'version'),) id = Column(Integer(), primary_key=True) key = Column(String(), index=True, nullable=False) @@ -184,14 +173,14 @@ class AnnotatedGenome(Base): organism : str String column. Single string describing the organism. May be "Genus species [strain]" but could contain more specific information. Intended to be human-readable and shouldn't have - any semantic meaning for the application (in contrast to the :attr:`taxa` relationship). + any semantic meaning for the application (in contrast to the :attr:`taxon` relationship). taxon_id : int Integer column. ID of the :class:`Taxon` this genome is classified as. - genome : .Genome + genome : Genome Many-to-one relationship to :class:`.Genome`. - genome_set : .ReferenceGenomeSet + genome_set : ReferenceGenomeSet Many-to-one relationship to :class:`.ReferenceGenomeSet`. - taxon : .Taxon + taxon : Taxon Many-to-one relationship to :class:`.Taxon`. The primary taxon this genome is classified as under the associated ``ReferenceGenomeSet``. Should be the most specific and "regular" (ideally defined on NCBI) taxon this genome belongs to. @@ -259,7 +248,7 @@ class Taxon(Base): Float column (optional). Query genomes within this distance of one of the taxon's reference genomes will be classified as that taxon. If NULL the taxon is just used establish the tree structure and is not used directly in classification. - report : Bool + report : bool Boolean column. Whether to report this taxon directly as a match when producing a human-readable query result. 
Some custom taxa might need to be "hidden" from the user, in which case the value should be false. The application should then ascend the taxon's @@ -273,13 +262,13 @@ class Taxon(Base): ncbi_id : Optional[int] Integer column (optional). ID of the entry in the NCBI taxonomy database this taxon corresponds to, if any. - parent : Optional[.Taxon] + parent : Optional[Taxon] Many-to-one relationship with :class:`.Taxon`, the parent of this taxon (if any). - children : Collection[.Taxon] + children : Collection[Taxon] One-to-many relationship with :class:`.Taxon`, the children of this taxon. - genome_set : .ReferenceGenomeSet + genome_set : ReferenceGenomeSet Many-to-one relationship to :class:`.ReferenceGenomeSet`. - genomes : Collection[.AnnotatedGenome] + genomes : Collection[AnnotatedGenome] One-to-many relationship with :class:`.AnnotatedGenome`, genomes which are assigned to this taxon. """ @@ -325,7 +314,7 @@ def ancestor_of_rank(self, rank: str) -> Optional['Taxon']: return ancestor return None - def lineage(self, ranks: Optional[Iterable[str]] = None) -> List[Optional['Taxon']]: + def lineage(self, ranks: Optional[Iterable[str]] = None) -> list[Optional['Taxon']]: """Get a last of this taxon's ancestors. With an argument, gets ancestors with the given ranks. Without, gets a sorted list of the @@ -416,12 +405,12 @@ def has_genome(self, genome: AnnotatedGenome) -> bool: return self in genome.taxon.ancestors(True) @classmethod - def common_ancestors(cls, taxa: Iterable['Taxon']) -> List['Taxon']: + def common_ancestors(cls, taxa: Iterable['Taxon']) -> list['Taxon']: """Get list of common ancestors of a set of taxa. Returns ------- - List[.Taxon] + list[Taxon] Common ancestors from top to bottom (same order as :meth:`lineage`. 
Will be empty if """ ancestors = None @@ -454,7 +443,7 @@ def common_ancestors(cls, taxa: Iterable['Taxon']) -> List['Taxon']: return [] if ancestors is None else ancestors @classmethod - def lca(cls, taxa: Iterable['Taxon']) -> List['Taxon']: + def lca(cls, taxa: Iterable['Taxon']) -> Optional['Taxon']: """Find the Least Common Ancestor of a set of taxa. Returns None if `taxa` is empty or its members do not all lie in the same tree. @@ -463,10 +452,10 @@ def lca(cls, taxa: Iterable['Taxon']) -> List['Taxon']: return ancestors[-1] if ancestors else None def print_tree(self, - f: Callable[['Taxon'], str] = None, + f: Optional[Callable[['Taxon'], str]] = None, *, indent: str = ' ', - sort_key: Callable[['Taxon'], Any] = None, + sort_key: Optional[Callable[['Taxon'], Any]] = None, ): """Print the taxon's subtree for debugging. diff --git a/gambit/db/refdb.py b/src/gambit/db/refdb.py similarity index 87% rename from gambit/db/refdb.py rename to src/gambit/db/refdb.py index 9fdcf29..a5affc7 100644 --- a/gambit/db/refdb.py +++ b/src/gambit/db/refdb.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Tuple, Sequence, Union, List, Dict, Optional, Any +from typing import Sequence, Union, Optional, Any from sqlalchemy.orm import object_session, Session from sqlalchemy.orm.attributes import InstrumentedAttribute @@ -38,7 +38,7 @@ def __init__(self, msg, directory=None, genomes_file=None, signatures_file=None) self.signatures_file = signatures_file -def load_genomeset(db_file: FilePath) -> Tuple[Session, ReferenceGenomeSet]: +def load_genomeset(db_file: 'FilePath') -> tuple[Session, ReferenceGenomeSet]: """Get the only :class:`gambit.db.models.ReferenceGenomeSet` from a genomes database file.""" session = file_sessionmaker(db_file)() gset = only_genomeset(session) @@ -78,14 +78,14 @@ def _check_genomes_have_ids(genomeset: ReferenceGenomeSet, id_attr: Instrumented raise RuntimeError(f'{c} genomes missing value for ID attribute {id_attr.key}') -def 
_map_ids_to_genomes(genomeset: ReferenceGenomeSet, id_attr: Union[str, InstrumentedAttribute]) -> Dict[AnnotatedGenome, Any]: +def _map_ids_to_genomes(genomeset: ReferenceGenomeSet, id_attr: Union[str, InstrumentedAttribute]) -> dict[AnnotatedGenome, Any]: """Get dict mapping ID values to AnnotatedGenome.""" q = genomeset.genomes.join(AnnotatedGenome.genome).add_columns(id_attr) return {id_: g for g, id_ in q} -def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, strict: bool = True) -> List[Optional[AnnotatedGenome]]: - """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values. +def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, strict: bool = True) -> list[Optional[AnnotatedGenome]]: + """Match a ``ReferenceGenomeSet``'s genomes to a set of ID values. This is primarily used to match genomes to signatures based on the ID values stored in a signature file. It is expected that the signature file may contain signatures for more genomes @@ -97,7 +97,7 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque id_attr ID attribute of :class:`gambit.db.models.Genome` to use for lookup. Can be used as the attribute itself (e.g. ``Genome.refseq_acc``) or just the name (``'refsec_acc'``). - See :data:`.GENOME_IDS` for the set of allowed values. + See :attr:`~gambit.db.models.Genome.ID_ATTRS` for the set of allowed values. ids Sequence of ID values (strings or integers, matching type of attribute). strict @@ -105,7 +105,7 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque Returns ------- - List[Optional[AnnotatedGenome]] + list[Optional[AnnotatedGenome]] List of genomes of same length as ``ids``. If ``strict=False`` and a genome cannot be found for a given ID the list will contain ``None`` at the corresponding position. 
@@ -126,8 +126,8 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque def genomes_by_id_subset(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, - ) -> Tuple[List[AnnotatedGenome], List[int]]: - """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values, allowing missing genomes. + ) -> tuple[list[AnnotatedGenome], list[int]]: + """Match a ``ReferenceGenomeSet``'s genomes to a set of ID values, allowing missing genomes. This calls :func:`.genomes_by_id` with ``strict=False`` and filters any ``None`` values from the output. The filtered list is returned along with the indices of all values in ``ids`` which were @@ -143,13 +143,9 @@ def genomes_by_id_subset(genomeset: ReferenceGenomeSet, id_attr ID attribute of :class:`gambit.db.models.Genome` to use for lookup. Can be used as the attribute itself (e.g. ``Genome.refseq_acc``) or just the name (``'refsec_acc'``). - See :data:`.GENOME_IDS` for the set of allowed values. + See :attr:`~gambit.db.models.Genome.ID_ATTRS` for the set of allowed values. ids Sequence of ID values (strings or integers, matching type of attribute). - - Returns - ------- - Tuple[List[AnnotatedGenome], List[int]] """ genomes = genomes_by_id(genomeset, id_attr, ids, strict=False) genomes_out = [] @@ -177,8 +173,8 @@ class ReferenceDatabase: signatures K-mer signatures for each genome. A subtype of ``ReferenceSignatures``, so contains metadata on signatures as well as the signatures themselves. Type may represent signatures stored on - disk (e.g. :class:`HDF5Signatures`) instead of in memory. OK to contain additional - signatures not corresponding to any genome in ``genomes``. + disk (e.g. :class:`~gambit.sigs.hdf5.HDF5Signatures`) instead of in memory. OK to contain + additional signatures not corresponding to any genome in ``genomes``. sig_indices Index of signature in ``signatures`` corresponding to each genome in ``genomes``. 
In sorted order to improve performance when iterating over them (improve locality if in @@ -215,7 +211,7 @@ def __init__(self, genomeset: ReferenceGenomeSet, signatures: ReferenceSignature raise ValueError(f'{missing} of {n} genomes not matched to signature IDs. Is the id_attr attribute of the signatures metadata correct?') @classmethod - def locate_files(cls, path: FilePath) -> Tuple[Path, Path]: + def locate_files(cls, path: 'FilePath') -> tuple[Path, Path]: """Locate an SQLite genome database file and HDF5 signatures file in a directory. Files are located by extension, ``.gdb`` or ``.db`` for SQLite file and ``.gs`` or ``.h5`` @@ -228,6 +224,7 @@ def locate_files(cls, path: FilePath) -> Tuple[Path, Path]: Returns ------- + tuple[pathlib.Path, pathlib.Path] Paths to genomes database file and signatures file. Raises @@ -261,19 +258,19 @@ def check_single_match(matches, desc: str): return genomes_file, signatures_file @classmethod - def load(cls, genomes_file: FilePath, signatures_file: FilePath) -> 'ReferenceDatabase': + def load(cls, genomes_file: 'FilePath', signatures_file: 'FilePath') -> 'ReferenceDatabase': """Load complete database given paths to SQLite genomes database file and HDF5 signatures file.""" session, gset = load_genomeset(genomes_file) sigs = load_signatures(signatures_file) return cls(gset, sigs) @classmethod - def load_from_dir(cls, path: FilePath) -> 'ReferenceDatabase': + def load_from_dir(cls, path: 'FilePath') -> 'ReferenceDatabase': """ Load complete database given directory containing SQLite genomes database file and HDF5 signatures file. - See :func:`.locate_db_files` for how these files are located within the directory. + See :meth:`.locate_files` for how these files are located within the directory. 
Raises ------ diff --git a/gambit/db/sqla.py b/src/gambit/db/sqla.py similarity index 61% rename from gambit/db/sqla.py rename to src/gambit/db/sqla.py index 2892903..0ad2957 100644 --- a/gambit/db/sqla.py +++ b/src/gambit/db/sqla.py @@ -1,5 +1,6 @@ """Custom types and other utilities for SQLAlchemy.""" import os +from typing import Optional from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, Session @@ -37,22 +38,35 @@ def process_result_value(self, value, dialect): return None if value is None else gjson.loads(value) -def file_sessionmaker(path: FilePath, readonly: bool = True, cls: type = None, **kw) -> sessionmaker: +def default_sessionmaker(bind, *, readonly: bool = True, class_: Optional[type] = None, **kw) -> sessionmaker: + """Create an SQLAlchemy ``sessionmaker`` using some common default settings. + + Parameters + ---------- + bind + First argument to :class:`sqlalchemy.orm.sessionmaker`. + readonly + Sets the default value for the ``class_`` keyword argument (:class:`.ReadOnlySession` if True, + otherwise uses the standard SQLAlchemy session type). + \\**kw + Additional keyword arguments to :class:`sqlalchemy.orm.sessionmaker`. + """ + if class_ is None: + class_ = ReadOnlySession if readonly else Session + # future=True - forwards compatibility with SQLAlchemy 2.0 + return sessionmaker(bind, class_=class_, future=True, **kw) + + +def file_sessionmaker(path: 'FilePath', **kw) -> sessionmaker: """Get an SQLAlchemy ``sessionmaker`` for an sqlite database file. Parameters ---------- path Path to database file. - readonly - Sets the default value for ``class_``. - cls - SQLAlchemy ``Session`` subclass to use. Defaults to :class:`gambit.db.sqla.ReadOnlySession` - if ``readonly=True``, otherwise uses the standard SQLAlchemy session type. \\**kw - Additional keyword arguments to :class:`sqlalchemy.orm.sessionmaker`. + Additional keyword arguments to :func:`.default_sessionmaker` / + :class:`sqlalchemy.orm.sessionmaker`. 
""" - if cls is None: - cls = ReadOnlySession if readonly else Session engine = create_engine(f'sqlite:///{os.fspath(path)}') - return sessionmaker(engine, class_=cls, **kw) + return default_sessionmaker(engine, **kw) diff --git a/gambit/kmers.py b/src/gambit/kmers.py similarity index 91% rename from gambit/kmers.py rename to src/gambit/kmers.py index e756f97..9c5890c 100644 --- a/gambit/kmers.py +++ b/src/gambit/kmers.py @@ -1,6 +1,6 @@ """Core functions for searching for and working with k-mers.""" -from typing import Dict, Any, Iterator +from typing import Optional, Any, Iterator import numpy as np from attr import attrs, attrib @@ -16,7 +16,7 @@ def nkmers(k: int) -> int: return 4 ** k -def index_dtype(k: int) -> np.dtype: +def index_dtype(k: int) -> Optional[np.dtype]: """Get the smallest unsigned integer dtype that can store k-mer indices for the given ``k``.""" if k <= 4: return np.dtype('u1') @@ -30,7 +30,7 @@ def index_dtype(k: int) -> np.dtype: return None -def kmer_to_index(kmer: DNASeq) -> int: +def kmer_to_index(kmer: 'DNASeq') -> int: """Convert a k-mer to its integer index. Raises @@ -41,7 +41,7 @@ def kmer_to_index(kmer: DNASeq) -> int: return ckmers.kmer_to_index(seq_to_bytes(kmer)) -def kmer_to_index_rc(kmer: DNASeq) -> int: +def kmer_to_index_rc(kmer: 'DNASeq') -> int: """Get the integer index of a k-mer's reverse complement. 
Raises @@ -84,7 +84,7 @@ class KmerSpec(Jsonable): nkmers: int = attrib(eq=False) index_dtype: np.dtype = attrib(eq=False) - def __init__(self, k: int, prefix: DNASeq): + def __init__(self, k: int, prefix: 'DNASeq'): """ Parameters ---------- @@ -119,7 +119,7 @@ def __to_json__(self): return dict(k=int(self.k), prefix=self.prefix_str) @classmethod - def __from_json__(cls, data: Dict[str, Any]) -> 'KmerSpec': + def __from_json__(cls, data: dict[str, Any]) -> 'KmerSpec': return cls(data['k'], data['prefix']) @@ -129,7 +129,7 @@ def __from_json__(cls, data: Dict[str, Any]) -> 'KmerSpec': @attrs(slots=True) class KmerMatch: - """Represents a + """Represents the location of a k-mer prefix found within a DNA sequence. Attributes ---------- @@ -143,7 +143,7 @@ class KmerMatch: If the match is on the reverse strand. """ kmerspec: KmerSpec = attrib() - seq: DNASeq = attrib() + seq: 'DNASeq' = attrib() pos: int = attrib() reverse: bool = attrib() @@ -178,7 +178,7 @@ def kmer_index(self) -> int: return kmer_to_index_rc(kmer) if self.reverse else kmer_to_index(kmer) -def find_kmers(kmerspec: KmerSpec, seq: DNASeq) -> Iterator[KmerMatch]: +def find_kmers(kmerspec: KmerSpec, seq: 'DNASeq') -> Iterator[KmerMatch]: """Locate k-mers with the given prefix in a DNA sequence. Searches sequence both backwards and forwards (reverse complement). 
The sequence may contain diff --git a/gambit/metric.py b/src/gambit/metric.py similarity index 71% rename from gambit/metric.py rename to src/gambit/metric.py index cc23e34..e01eb0f 100644 --- a/gambit/metric.py +++ b/src/gambit/metric.py @@ -5,13 +5,103 @@ import numpy as np -from gambit._cython.metric import BOUNDS_DTYPE, SCORE_DTYPE, jaccard, jaccarddist, \ - _jaccarddist_parallel -from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList +import gambit._cython.metric as _cmetric +from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList, \ + BOUNDS_DTYPE from gambit.util.misc import chunk_slices from gambit.util.progress import get_progress +#: Numpy dtype for output of Cython Jaccard distance calculation code +# Equivalent to SCORE_T in types.pxd +SCORE_DTYPE = np.dtype(np.float32) + + +_COORDS_UNSIGNED_DTYPES = [np.dtype(f'u{s}') for s in [2, 4, 8]] +_COORDS_SIGNED_DTYPES = [np.dtype(f'i{s}') for s in [2, 4, 8]] + + +def _cast_sigs_array(arr: np.ndarray) -> np.ndarray: + """Convert signature array to proper data type for Cython metric code. + + Cython code accepts k-mer coordinate arrays in 16, 32, or 64-bit unsigned data types, these are + returned as-is. Equivalent signed data types can safely be casted (as the values should all be + non-negative), for these a view into the array with unsigned data type is returned (no coyping). + All other data types result in a ValueError. + """ + + dt = arr.dtype + if dt in _COORDS_UNSIGNED_DTYPES: + return arr + if dt in _COORDS_SIGNED_DTYPES: + new_dt = np.dtype(f'u{dt.itemsize}') + return arr.view(new_dt) + raise ValueError(f'Invalid dtype for k-mer coordinate array: {dt.str}') + + +def jaccard(coords1: np.ndarray, coords2: np.ndarray) -> np.float32: + """Compute the Jaccard index between two k-mer sets in sparse coordinate format. + + Arguments are Numpy arrays containing k-mer indices in sorted order. 
Data types must be 16, 32, + or 64-bit signed or unsigned integers, but do not need to match. + + This is by far the most efficient way to calculate the metric (this is a native function) and + should be used wherever possible. + + Parameters + ---------- + coords1 + K-mer set in sparse coordinate format. + coords2 + K-mer set in sparse coordinate format. + + Returns + ------- + numpy.float32 + Jaccard index between the two sets, a real number between 0 and 1. + + See Also + -------- + .jaccarddist + """ + coords1 = _cast_sigs_array(coords1) + coords2 = _cast_sigs_array(coords2) + return _cmetric.jaccard(coords1, coords2) + + +def jaccarddist(coords1: np.ndarray, coords2: np.ndarray): + """Compute the Jaccard distance between two k-mer sets in sparse coordinate format. + + The Jaccard distance is equal to one minus the Jaccard index. + + Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, + or 64-bit signed or unsigned integers, but do not need to match. + + This is by far the most efficient way to calculate the metric (this is a native function) and + should be used wherever possible. + + Parameters + ---------- + coords1 + K-mer set in sparse coordinate format. + coords2 + K-mer set in sparse coordinate format. + + Returns + ------- + numpy.float32 + Jaccard distance between the two sets, a real number between 0 and 1. + + See Also + -------- + .jaccard + """ + coords1 = _cast_sigs_array(coords1) + coords2 = _cast_sigs_array(coords2) + return _cmetric.jaccarddist(coords1, coords2) + + + def jaccard_generic(set1: Iterable, set2: Iterable) -> float: """Get the Jaccard index of of two arbitrary sets. @@ -49,7 +139,7 @@ def jaccard_bits(bits1: np.ndarray, bits2: np.ndarray) -> float: return 1. 
if union == 0 else intersection / union -def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: np.ndarray = None) -> np.ndarray: +def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: Optional[np.ndarray] = None) -> np.ndarray: """ Calculate Jaccard distances between a query k-mer signature and a list of reference signatures. @@ -79,6 +169,8 @@ def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: .jaccarddist .jaccarddist_matrix """ + query = _cast_sigs_array(query) + if out is None: out = np.empty(len(refs), SCORE_DTYPE) elif out.shape != (len(refs),): @@ -87,14 +179,15 @@ def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: raise ValueError(f'Output array dtype must be {SCORE_DTYPE}, got {out.dtype}') if isinstance(refs, SignatureArray): - values = refs.values + values = _cast_sigs_array(refs.values) bounds = refs.bounds.astype(BOUNDS_DTYPE, copy=False) - _jaccarddist_parallel(query, values, bounds, out) + _cmetric._jaccarddist_parallel(query, values, bounds, out) else: for i, ref in enumerate(refs): - out[i] = jaccarddist(query, ref) + ref = _cast_sigs_array(ref) + out[i] = _cmetric.jaccarddist(query, ref) return out @@ -116,7 +209,7 @@ def jaccarddist_matrix(queries: Sequence[KmerSignature], Performance is greatly improved if ``refs`` is a type that yields instances of ``SignatureArray`` when indexed with a slice object (``SignatureArray`` or - ``HDF5Signatures``), see :meth:`.jaccarddist_array`. There is no such dependence on the type of + ``HDF5Signatures``), see :func:`.jaccarddist_array`. There is no such dependence on the type of ``queries``, which can be a simple list. Parameters @@ -137,7 +230,7 @@ def jaccarddist_matrix(queries: Sequence[KmerSignature], Returns ------- - np.ndarray + numpy.ndarray Matrix of distances between query signatures in rows and reference signatures in columns. 
See Also @@ -190,7 +283,7 @@ def jaccarddist_pairwise(sigs: Sequence[KmerSignature], """ Calculate all pairwise Jaccard distances for a list of signatures. - This should be roughly twice as fast as calling :func:`.jaccarddist_flat` with the same array + This should be roughly twice as fast as calling :func:`.jaccarddist_matrix` with the same array for the first and second arguments, because each pairwise distance is computed once instead of twice. @@ -215,7 +308,7 @@ def jaccarddist_pairwise(sigs: Sequence[KmerSignature], Returns ------- - np.ndarray + numpy.ndarray Pairwise distances in matrix (if ``flat=False``) or condensed (``flat=True``) format. See Also diff --git a/gambit/query.py b/src/gambit/query.py similarity index 63% rename from gambit/query.py rename to src/gambit/query.py index b2e611a..daf0958 100644 --- a/gambit/query.py +++ b/src/gambit/query.py @@ -2,20 +2,21 @@ from warnings import warn from datetime import datetime -from typing import Sequence, Optional, Union, List, Dict, Any +from typing import Sequence, Optional, Any +from pathlib import Path from attr import attrs, attrib +from attr.converters import optional as optional_converter import numpy as np from gambit import __version__ as GAMBIT_VERSION -from gambit.classify import classify, ClassifierResult, GenomeMatch, compare_classifier_results, \ - compare_genome_matches +from gambit.classify import classify, ClassifierResult, GenomeMatch from gambit.db import ReferenceDatabase, Taxon, ReferenceGenomeSet, reportable_taxon -from gambit.seq import SequenceFile -from gambit.sigs import KmerSignature, SignaturesMeta +from gambit.sigs.base import KmerSignature, SignaturesMeta, ReferenceSignatures from gambit.metric import jaccarddist_matrix -from gambit.util.misc import zip_strict +from gambit.util.io import FilePath from gambit.util.progress import progress_config, iter_progress +from gambit.util.misc import zip_strict @attrs() @@ -37,44 +38,14 @@ class QueryParams: report_closest: int = 
attrib(default=10) -@attrs() -class QueryInput: - """Information on a query genome. - - Attributes - ---------- - label - Some unique label for the input, probably the file name. - file - Source file (optional). - """ - label: str = attrib() - file: Optional[SequenceFile] = attrib(default=None, repr=False) - - @classmethod - def convert(cls, x: Union['QueryInput', SequenceFile, str]) -> 'QueryInput': - """Convenience function to convert flexible argument types into QueryInput. - - Accepts single string label, ``SequenceFile`` (uses file path for label), or existing - ``QueryInput`` instance (returned unchanged). - """ - if isinstance(x, QueryInput): - return x - if isinstance(x, str): - return QueryInput(x) - if isinstance(x, SequenceFile): - return QueryInput(str(x.path), x) - raise TypeError(f'Cannot convert {type(x)} instance to QueryInput') - - @attrs() class QueryResultItem: """Result for a single query sequence. Attributes ---------- - input - Information on input genome. + label + Unique label describing query. classifier_result Result of running classifier. report_taxon @@ -82,30 +53,14 @@ class QueryResultItem: closest_genomes List of closest reference genomes to query. Length determined by :attr:`.QueryParams.report_closest`. + file + Path to file containing query genome (optional). """ - input: QueryInput = attrib() + label: str = attrib() classifier_result: ClassifierResult = attrib() report_taxon: Optional[Taxon] = attrib(default=None) - closest_genomes: List[GenomeMatch] = attrib(factory=list) - - -def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: - """Compare two ``QueryResultItem`` instances for equality. - - Does not compare the value of the ``input`` attributes. 
- """ - if item1.report_taxon != item2.report_taxon: - return False - if not compare_classifier_results(item1.classifier_result, item2.classifier_result): - return False - if len(item1.closest_genomes) != len(item2.closest_genomes): - return False - - for m1, m2 in zip(item1.closest_genomes, item2.closest_genomes): - if not compare_genome_matches(m1, m2): - return False - - return True + closest_genomes: list[GenomeMatch] = attrib(factory=list) + file: Optional[Path] = attrib(default=None, converter=optional_converter(Path)) @attrs(repr=False) @@ -129,20 +84,20 @@ class QueryResults: extra JSON-able dict containing additional arbitrary metadata. """ - items: List[QueryResultItem] = attrib() + items: list[QueryResultItem] = attrib() params: Optional[QueryParams] = attrib(default=None) genomeset: Optional[ReferenceGenomeSet] = attrib(default=None) signaturesmeta: Optional[SignaturesMeta] = attrib(default=None) gambit_version: str = attrib(default=GAMBIT_VERSION) timestamp: datetime = attrib(factory=datetime.now) - extra: Dict[str, Any] = attrib(factory=dict) + extra: dict[str, Any] = attrib(factory=dict) def query(db: ReferenceDatabase, queries: Sequence[KmerSignature], params: Optional[QueryParams] = None, *, - inputs: Optional[Sequence[Union[QueryInput, SequenceFile, str]]] = None, + labels: Optional[Sequence[str]] = None, progress = None, **kw, ) -> QueryResults: @@ -157,10 +112,10 @@ def query(db: ReferenceDatabase, params ``QueryParams`` instance defining parameter values. If None take values from additional keyword arguments or use defaults. - inputs - Description for each input, converted to :class:`gambit.query.result.QueryInput` in results - object. Only used for reporting, does not any other aspect of results. Items can be - ``QueryInput``, ``SequenceFile`` or ``str``. + labels + Optional list of string labels for each query. 
Only used for reporting (sets ``label`` + attribute of :class:`QueryResultItem` in results object), does not any other aspect of + results. progress Report progress for distance matrix calculation and classification. See :func:`gambit.util.progress.get_progress` for description of allowed values. @@ -172,18 +127,22 @@ def query(db: ReferenceDatabase, elif kw: warn('Additional keyword arguments ignored if "params" argument is not None.') - queries = list(queries) pconf = progress_config(progress) if len(queries) == 0: raise ValueError('Must supply at least one query.') - if inputs is not None: - inputs = list(map(QueryInput.convert, inputs)) - if len(inputs) != len(queries): - raise ValueError('Number of inputs does not match number of queries.') + # Labels + if labels is not None: + if len(labels) != len(queries): + raise ValueError('Number of labels does not match number of queries.') + + elif isinstance(queries, ReferenceSignatures): + # Get default labels from queries of ReferenceSignatures object + labels = list(map(str, queries.ids)) + else: - inputs = [QueryInput(str(i + 1)) for i in range(len(queries))] + labels = [str(i + 1) for i in range(len(queries))] # Calculate distances # (This will only be about 200kB per row/query [50k float32's] so having the whole thing in @@ -197,8 +156,11 @@ def query(db: ReferenceDatabase, ) # Classify inputs and create result items - with iter_progress(inputs, pconf, desc='Classifying') as inputs_iter: - items = [get_result_item(db, params, dmat[i, :], input) for i, input in enumerate(inputs_iter)] + with iter_progress(labels, pconf, desc='Classifying') as labels_iter: + items = [ + get_result_item(db, params, dmat[i, :], label) + for i, label in enumerate(labels_iter) + ] return QueryResults( items=items, @@ -208,7 +170,7 @@ def query(db: ReferenceDatabase, ) -def get_result_item(db:ReferenceDatabase, params: QueryParams, dists: np.ndarray, input: QueryInput) -> QueryResultItem: +def get_result_item(db: ReferenceDatabase, 
params: QueryParams, dists: np.ndarray, label: str) -> QueryResultItem: """Perform classification and create result item object for single query input. Parameters @@ -216,14 +178,14 @@ def get_result_item(db:ReferenceDatabase, params: QueryParams, dists: np.ndarray db params dists - Distances from query to reference genomes. - input + 1D array of distances from query to all reference genomes. + label """ clsresult = classify(db.genomes, dists, strict=params.classify_strict) closest = [GenomeMatch(db.genomes[i], dists[i]) for i in np.argsort(dists)[:params.report_closest]] return QueryResultItem( - input=input, + label=label, classifier_result=clsresult, report_taxon=reportable_taxon(clsresult.predicted_taxon), closest_genomes=closest, @@ -231,11 +193,11 @@ def get_result_item(db:ReferenceDatabase, params: QueryParams, dists: np.ndarray def query_parse(db: ReferenceDatabase, - files: Sequence[SequenceFile], + files: Sequence[FilePath], params: Optional[QueryParams] = None, *, - file_labels: Optional[Sequence[str]] = None, - parse_kw: Optional[Dict[str, Any]] = None, + labels: Optional[Sequence[str]] = None, + parse_kw: Optional[dict[str, Any]] = None, **kw, ) -> QueryResults: """Query a database with signatures derived by parsing a set of genome sequence files. @@ -249,7 +211,7 @@ def query_parse(db: ReferenceDatabase, params ``QueryParams`` instance defining parameter values. If None take values from additional keyword arguments or use defaults. - file_labels + labels Custom labels to use for each file in returned results object. If None use file names. parse_kw Keyword parameters to pass to :func:`gambit.sigs.calc.calc_file_signatures`. 
@@ -263,11 +225,18 @@ def query_parse(db: ReferenceDatabase, parse_kw = dict() parse_kw.setdefault('progress', pconf.update(desc='Parsing input')) - if file_labels is None: - inputs = files + if labels is None: + labels = [str(file) for file in files] else: - inputs = [QueryInput(label, file) for label, file in zip_strict(file_labels, files)] + if len(labels) != len(files): + raise ValueError('Number of labels does not match number of files') query_sigs = calc_file_signatures(db.signatures.kmerspec, files, **parse_kw) - return query(db, query_sigs, params, inputs=inputs, progress=pconf, **kw) + results = query(db, query_sigs, params, labels=labels, progress=pconf, **kw) + + # Assign file attribute of QueryResultItem's + for item, file in zip_strict(results.items, files): + item.file = file + + return results diff --git a/src/gambit/results.py b/src/gambit/results.py new file mode 100644 index 0000000..c0af9db --- /dev/null +++ b/src/gambit/results.py @@ -0,0 +1,270 @@ +"""Export query results in various formats.""" + +import json +from abc import ABC, abstractmethod +from typing import IO, Union, TextIO, Any, Iterable +import csv +from functools import singledispatchmethod + +from attr import attrs, attrib, asdict +from sqlalchemy.orm import Session + +from gambit.util.io import FilePath, maybe_open +import gambit.util.json as gjson +from gambit.query import QueryResults, QueryResultItem +from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome + + +class AbstractResultsExporter(ABC): + """Base for classes that export formatted query results. + + Subclasses must implement :meth:`export`. + """ + + @abstractmethod + def export(self, file_or_path: Union['FilePath', IO], results: QueryResults): + """Write query results to file. + + Parameters + ---------- + file_or_path + Open file-like object or file path to write to. + results + Results to export. 
+ """ + + +def _todict(obj, attrs): + return {a: getattr(obj, a) for a in attrs} + + +@attrs() +class BaseJSONResultsExporter(AbstractResultsExporter): + """Base class for JSON exporters. + + Subclasses need to implement the ``to_json`` method. + + Attributes + ---------- + pretty + Write in more human-readable but less compact format. Defaults to False. + """ + pretty: bool = attrib(default=False) + + def to_json(self, obj): + """Convert object to JSON-compatible format (need not work recursively).""" + return gjson.to_json(obj) + + def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults): + opts = dict(indent=4, sort_keys=True) if self.pretty else dict() + with maybe_open(file_or_path, 'w') as f: + json.dump(results, f, default=self.to_json, **opts) + + +def getattr_nested(obj, attrs: Union[str, Iterable[str]], pass_none=False): + if isinstance(attrs, str): + attrs = attrs.split('.') + + for attr in attrs: + if pass_none and obj is None: + return None + + obj = getattr(obj, attr) + + return obj + + +class CSVResultsExporter(AbstractResultsExporter): + """Exports query results in CSV format. + + Attributes + ---------- + format_opts + Dialect and other formatting arguments passed to :func:`csv.writer`. 
+ """ + format_opts: dict[str, Any] + + # Pairs of column name and QueryResultItem attribute + COLUMNS = [ + ('query', 'label'), + ('predicted.name', 'report_taxon.name'), + ('predicted.rank', 'report_taxon.rank'), + ('predicted.ncbi_id', 'report_taxon.ncbi_id'), + ('predicted.threshold', 'report_taxon.distance_threshold'), + ('closest.distance', 'classifier_result.closest_match.distance'), + ('closest.description', 'classifier_result.closest_match.genome.description'), + ('next.name', 'classifier_result.next_taxon.name'), + ('next.rank', 'classifier_result.next_taxon.rank'), + ('next.ncbi_id', 'classifier_result.next_taxon.ncbi_id'), + ('next.threshold', 'classifier_result.next_taxon.distance_threshold'), + ] + + def __init__(self, **format_opts): + if 'dialect' not in format_opts: + format_opts.setdefault('lineterminator', '\n') + format_opts.setdefault('quoting', csv.QUOTE_MINIMAL) + self.format_opts = format_opts + + def get_header(self) -> list[str]: + """Get values for header row.""" + return [name for name, _ in self.COLUMNS] + + def get_row(self, item: QueryResultItem) -> list: + """Get row values for single result item.""" + return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] + + def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults): + with maybe_open(file_or_path, 'w') as f: + writer = csv.writer(f, **self.format_opts) + + writer.writerow(self.get_header()) + for item in results.items: + writer.writerow(self.get_row(item)) + + +@attrs() +class JSONResultsExporter(BaseJSONResultsExporter): + """Exports query results in basic JSON format. + + Currently it assumes that the query was run with ``classify_strict=False``, so the only + relevant information from ``ClassifierResult`` is the closest genome match. 
+ """ + + to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) + + @to_json.register(QueryResults) + def _results_to_json(self, results: QueryResults): + data = asdict(results, recurse=False) + del data['params'] # Parameters not currently exposed thru CLI, so omit for now. + return data + + @to_json.register(QueryResultItem) + def _item_to_json(self, item: QueryResultItem): + return dict( + query=dict( + name=item.label, + path=item.file, + ), + predicted_taxon=item.report_taxon, + next_taxon=item.classifier_result.next_taxon, + closest_genomes=item.closest_genomes, + ) + + @to_json.register(ReferenceGenomeSet) + def _genomeset_to_json(self, gset: ReferenceGenomeSet): + return _todict(gset, ['id', 'key', 'version', 'name', 'description']) + + @to_json.register(Taxon) + def _taxon_to_json(self, taxon: Taxon): + return _todict(taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) + + @to_json.register(AnnotatedGenome) + def _genome_to_json(self, genome: AnnotatedGenome): + data = _todict(genome, ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc']) + data['id'] = genome.genome_id + data['taxonomy'] = list(genome.taxon.ancestors(incself=True)) + return data + + +class ResultsArchiveWriter(BaseJSONResultsExporter): + """Exports query results to "archive" format which captures all stored data. + + This format is not intended to be read by users of the application. + The exported data can be read and converted back into an identical + :class:`~gambit.query.QueryResults` object using :class:`.ResultsArchiveReader`. + + Only the ID attributes of database models are saved, when loading the saved results the models + are recreated by database queries. 
+ """ + + to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) + + @to_json.register(ReferenceGenomeSet) + def _genomeset_to_json(self, gset: ReferenceGenomeSet): + return _todict(gset, ['key', 'version']) + + @to_json.register(Taxon) + def _taxon_to_json(self, taxon: Taxon): + return _todict(taxon, ['key']) + + @to_json.register(AnnotatedGenome) + def _genome_to_json(self, genome: AnnotatedGenome): + return _todict(genome, ['key']) + + +class ResultsArchiveReader: + """Loads query results from file created by :class:`ResultsArchiveWriter`. + + Attributes + ---------- + session + SQLAlchemy session used to load database objects. + """ + session: Session + + def __init__(self, session): + self.session = session + + self._init_converter() + + # Loading the Taxon and AnnotatedGenome instances from the database requires not just their + # ID (key attribute) values but also the ReferenceGenomeSet they belong to. Setting this + # attribute to the genome set instance of the results currently being loaded is a somewhat + # hacky method of passing this information to the unstructuring hook functions. There isn't + # a much better way of doing this without reimplementing a lot of the cattrs machinery. + self._current_genomeset = None + + def _init_converter(self): + """Initialize the cattrs converter instance. + + This is a clone of the converter instance in gambit.util.json, with additional structuring + hooks registered to methods on this instance. + """ + self._converter = gjson.converter.copy() + self._converter.register_structure_hook(ReferenceGenomeSet, self._structure_genomeset) + self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome) + self._converter.register_structure_hook(Taxon, self._structure_taxon) + + def read(self, file_or_path: Union['FilePath', IO]) -> QueryResults: + """Read query results from JSON file. + + Parameters + ---------- + file_or_path + Readable file object or file path. 
+ """ + with maybe_open(file_or_path) as f: + data = json.load(f) + + return self.results_from_json(data) + + def results_from_json(self, data: dict[str, Any]) -> QueryResults: + """Recreate results object from loaded JSON data.""" + + gset_key = data['genomeset']['key'] + gset_version = data['genomeset']['version'] + self._current_genomeset = self.session.query(ReferenceGenomeSet) \ + .filter_by(key=gset_key, version=gset_version) \ + .one() + + try: + return self._converter.structure(data, QueryResults) + + finally: + self._current_genomeset = None + + def _structure_genomeset(self, data: dict[str, Any], cls=None): + return self._current_genomeset + + def _structure_genome(self, data: dict[str, Any], cls=None) -> AnnotatedGenome: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(AnnotatedGenome)\ + .join(Genome)\ + .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ + .one() + + def _structure_taxon(self, data: dict[str, Any], cls=None) -> Taxon: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() diff --git a/src/gambit/seq.py b/src/gambit/seq.py new file mode 100644 index 0000000..3786f7d --- /dev/null +++ b/src/gambit/seq.py @@ -0,0 +1,121 @@ +"""Generic code for working with sequence data. + +Note that all code in this package operates on DNA sequences as sequences of +bytes containing ascii-encoded nucleotide codes. + + +.. data:: NUCLEOTIDES + + ``bytes`` corresponding to the four DNA nucleotides. Ascii-encoded upper + case letters ``ACGT``. Note that the order, while arbitrary, is important + in this variable as it defines how unique indices are assigned to k-mer + sequences. + +.. class:: DNASeq + + Type alias for DNA sequence types accepted for k-mer search / signature calculation + (``str``, ``bytes``, ``bytearray``, or :class:`Bio.Seq.Seq`). 
+""" + +from pathlib import Path +from typing import Union, Optional, IO, Iterable +from os import PathLike + +from Bio import SeqIO +from Bio.Seq import Seq +from attr import attrs, attrib +from typing_extensions import TypeAlias + +from gambit._cython.kmers import revcomp +from gambit.util.io import FilePath +from gambit.util.io import open_compressed, ClosingIterator + + +# Byte representations of the four nucleotide codes in the order used for +# indexing k-mer sequences +NUCLEOTIDES = b'ACGT' + +SEQ_TYPES = (str, bytes, bytearray, Seq) + +DNASeq: TypeAlias = Union[SEQ_TYPES] +# Type alias for sequence types accepted directly by native (Cython) code. +DNASeqBytes: TypeAlias = Union[bytes, bytearray] + + +def seq_to_bytes(seq: 'DNASeq') -> 'DNASeqBytes': + """Convert generic DNA sequence to byte string representation. + + This is for passing sequence data to Cython functions. + """ + if isinstance(seq, (bytes, bytearray)): + return seq + if isinstance(seq, str): + return seq.encode('ascii') + if isinstance(seq, Seq): + # This is recommended in the documentation over the deprecated encode() method, also + # probably avoids copying any data as it typically just returns the seq._data attribute. + return bytes(seq) + raise TypeError(f'Expected sequence type, got {type(seq)}') + + +def validate_dna_seq_bytes(seq: DNASeqBytes): + """Check that a sequence contains only valid nucleotide codes (upper case). + + Parameters + ---------- + seq : bytes + ASCII-encoded nucleotide sequence. + + Raises + ------ + ValueError + If the sequence contains an invalid nucleotide. + """ + for i, nuc in enumerate(seq): + if nuc not in NUCLEOTIDES: + raise ValueError(f'Invalid byte at position {i}: {nuc}') + + +def parse_seqs(path: FilePath, + format: str = 'fasta', + compression: str = 'auto', + **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: + """Open a sequence file and lazily parse its contents. 
+ + This is essentially a wrapper over BioPython's :func:`Bio.SeqIO.parse` function that + transparently handles compressed files. + + Returns iterator over sequence data in file. File is parsed lazily, and so must be kept open. + The returned iterator is of type :class:`gambit.util.io.ClosingIterator` so it will close the + file stream automatically when it finishes. It may also be used as a context manager that closes + the stream on exit. You may also close the stream explicitly using the iterator's ``close`` + method. + + Parameters + ---------- + path + Path to the file. + format + String describing the file format as interpreted by :func:`Bio.SeqIO.parse`. + compression + String describing compression method of the file, e.g. ``'gzip'``. ``none`` means no + compression. Default is to determine compression automatically (can only detect gzip or + none). See :func:`gambit.util.io.open_compressed`. + kwargs + Keyword arguments to :func:`gambit.util.io.open_compressed`. + + Returns + ------- + gambit.util.io.ClosingIterator + Iterator yielding :class:`Bio.SeqIO.SeqRecord` instances for each sequence in the file. 
+ """ + + fobj = open_compressed(path, 'rt', compression, **kwargs) + + try: + records = SeqIO.parse(fobj, format) + return ClosingIterator(records, fobj) + + except: + fobj.close() + raise diff --git a/gambit/sigs/__init__.py b/src/gambit/sigs/__init__.py similarity index 69% rename from gambit/sigs/__init__.py rename to src/gambit/sigs/__init__.py index 017470b..fd43484 100644 --- a/gambit/sigs/__init__.py +++ b/src/gambit/sigs/__init__.py @@ -1,4 +1,4 @@ """Calculate and store collections of k-mer signatures.""" from .base import KmerSignature, SignatureArray, SignatureList, sigarray_eq, SignaturesMeta,\ - AnnotatedSignatures, dump_signatures, load_signatures + AnnotatedSignatures, dump_signatures, load_signatures, BOUNDS_DTYPE diff --git a/gambit/sigs/base.py b/src/gambit/sigs/base.py similarity index 94% rename from gambit/sigs/base.py rename to src/gambit/sigs/base.py index bf73203..c0cda54 100644 --- a/gambit/sigs/base.py +++ b/src/gambit/sigs/base.py @@ -5,7 +5,6 @@ from attr import attrs, attrib from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.indexing import AdvancedIndexingMixin from gambit.util.io import FilePath @@ -15,6 +14,12 @@ # TODO - use nptyping package to specify dimensions and data type? +#: Preferred Numpy dtype for :attr:`.ConcatenatedSignatureArray.bounds`. Can be used in parallelized +#: Cython metric calculation code without conversion. +# Equivalent to BOUNDS_T in types.pxd +BOUNDS_DTYPE = np.dtype(np.intp) + + def sigarray_eq(a1: Sequence[KmerSignature], a2: Sequence[KmerSignature]) -> bool: """Check two sequences of sparse k-mer signatures for equality. @@ -315,7 +320,7 @@ class SignaturesMeta: name Short human-readable name. id_attr - Name of ``Genome`` attribute the IDs correspond to (see :data:`gambit.db.models.GENOME_ID_ATTRS`). + Name of ``Genome`` attribute the IDs correspond to (see :attr:`~gambit.db.models.Genome.ID_ATTRS`). 
Optional, but signature set cannot be used as a reference for queries without it. description Human-readable description. @@ -397,7 +402,23 @@ def __getitem__(self, index): return self.signatures[index] -def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray: +class SignaturesFileError(Exception): + """Indicates an error attempting to open a signatures file.""" + + message: str + filename: str + format: str + + def __init__(self, message: str, filename: Optional['FilePath'], format: Optional[str]): + self.message = message + self.filename = str(filename) + self.format = format + + def __str__(self): + return self.message + + +def load_signatures(path: 'FilePath', **kw) -> AbstractSignatureArray: """Load signatures from file. Currently the only format used to store signatures is the one in :mod:`gambit.sigs.hdf5`, but @@ -414,7 +435,7 @@ def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray: return load_signatures_hdf5(path, **kw) -def dump_signatures(path: FilePath, +def dump_signatures(path: 'FilePath', signatures: AbstractSignatureArray, format: str = 'hdf5', **kw, diff --git a/gambit/sigs/calc.py b/src/gambit/sigs/calc.py similarity index 82% rename from gambit/sigs/calc.py rename to src/gambit/sigs/calc.py index c4c4614..07a1864 100644 --- a/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -9,14 +9,15 @@ from .base import KmerSignature, SignatureList from gambit.kmers import KmerSpec, find_kmers, kmer_to_index, nkmers, index_dtype -from gambit.seq import SEQ_TYPES, DNASeq, SequenceFile +from gambit.seq import SEQ_TYPES, DNASeq, parse_seqs +from gambit.util.io import FilePath from gambit.util.progress import iter_progress, get_progress class KmerAccumulator(MutableSet[int]): """Base class for data structures which track k-mers as they are found in sequences. - Implements the ``MutableSet`` interface for k-mer indices. Indices are added via :meth:`add` or + Implements the ``MutableSet`` interface for k-mer indices. 
Indices are added via the ``add`` or :meth:`add_kmer` methods, when finished a sparse k-mer signature can be obtained from :meth:`signature`. """ @@ -127,7 +128,7 @@ def default_accumulator(k: int) -> KmerAccumulator: return SetAccumulator(k) if k > 11 else ArrayAccumulator(k) -def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: DNASeq): +def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: 'DNASeq'): """Find k-mer matches in sequence and add their indices to an accumulator.""" for match in find_kmers(kmerspec, seq): try: @@ -138,7 +139,7 @@ def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: DNAS def calc_signature(kmerspec: KmerSpec, - seqs: Union[DNASeq, Iterable[DNASeq]], + seqs: Union['DNASeq', Iterable['DNASeq']], *, accumulator: Optional[KmerAccumulator] = None, ) -> KmerSignature: @@ -178,15 +179,12 @@ def calc_signature(kmerspec: KmerSpec, def calc_file_signature(kspec: KmerSpec, - seqfile: SequenceFile, + seqfile: FilePath, *, accumulator: Optional[KmerAccumulator] = None, ) -> KmerSignature: """Open a sequence file on disk and calculate its k-mer signature. - This works identically to :func:`.calc_signature_parse` but takes a :class:`.SequenceFile` as - input instead of a data stream. - Parameters ---------- kspec @@ -199,20 +197,19 @@ def calc_file_signature(kspec: KmerSpec, Returns ------- numpy.ndarray - K-mer signature in sparse coordinate format (dtype will match - :func:`gambit.kmers.dense_to_sparse`). + K-mer signature in sparse coordinate format (dtype will match :func:`.dense_to_sparse`). 
See Also -------- .calc_signature .calc_file_signatures """ - with seqfile.parse() as records: - return calc_signature(kspec, (record.seq for record in records)) + with parse_seqs(seqfile) as records: + return calc_signature(kspec, (record.seq for record in records), accumulator=accumulator) def calc_file_signatures(kspec: KmerSpec, - files: Sequence[SequenceFile], + files: Sequence[FilePath], progress=None, concurrency: Optional[str] = 'processes', max_workers: Optional[int] = None, @@ -278,3 +275,48 @@ def calc_file_signatures(kspec: KmerSpec, assert all(sig is not None for sig in sigs) return SignatureList(sigs, kspec) + + +def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: + """Convert k-mer set from dense bit vector to sparse coordinate representation. + + Parameters + ---------- + vec + Boolean vector indicating which k-mers are present. + + Returns + ------- + numpy.ndarray + Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. + + See Also + -------- + .sparse_to_dense + """ + return np.flatnonzero(vec) + + +def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: + """Convert k-mer set from sparse coordinate representation back to dense bit vector. + + Parameters + ---------- + k_or_kspec + Value of k or a :class:`.KmerSpec` instance. + coords + Sparse coordinate array. + + Returns + ------- + numpy.ndarray + Dense k-mer bit vector. 
+ + See Also + -------- + .dense_to_sparse + """ + idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) + vec = np.zeros(idx_len, dtype=np.bool_) + vec[coords] = 1 + return vec diff --git a/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py similarity index 86% rename from gambit/sigs/hdf5.py rename to src/gambit/sigs/hdf5.py index 7e2b930..b35a39a 100644 --- a/gambit/sigs/hdf5.py +++ b/src/gambit/sigs/hdf5.py @@ -7,9 +7,8 @@ import h5py as h5 from .base import SignatureArray, ConcatenatedSignatureArray, AbstractSignatureArray, SignaturesMeta,\ - ReferenceSignatures + ReferenceSignatures, SignaturesFileError, BOUNDS_DTYPE from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.io import FilePath @@ -93,11 +92,11 @@ def __init__(self, group: h5.Group): self.group = group if FMT_VERSION_ATTR not in group.attrs: - raise RuntimeError('HDF5 group does not contain a signature set') + raise SignaturesFileError('HDF5 group does not contain a signature set', None, 'hdf5') self.format_version = group.attrs[FMT_VERSION_ATTR] if self.format_version != CURRENT_FMT_VERSION: - raise ValueError(f'Unrecognized format version: {self.format_version}') + raise ValueError(f'Unrecognized format version: {self.format_version}', None, 'hdf5') self.kmerspec = KmerSpec(group.attrs['kmerspec_k'], group.attrs['kmerspec_prefix']) self.meta = read_metadata(group) @@ -219,7 +218,7 @@ def create(cls, return cls(group) -def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures: +def load_signatures_hdf5(path: 'FilePath', **kw) -> HDF5Signatures: """Open HDF5 signature file. Parameters @@ -229,13 +228,33 @@ def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures: \\**kw Additional keyword arguments to :func:`h5py.File`. 
""" - return HDF5Signatures(h5.File(path, **kw)) + exc = SignaturesFileError(f'{path} does not appear to be a GAMBIT signtures file.', path, 'hdf5') + # Check for HDF5 magic number + # The errors raised by the h5py library are a bit cryptic, so make one with a better message if + # not a valid HDF5 file. + # This also raises the standard errors if file cannot be read. + with open(path, 'rb') as f: + header = f.read(8) + if header != b'\x89HDF\r\n\x1a\n': + raise exc -def dump_signatures_hdf5(path: FilePath, - signatures: AbstractSignatureArray, - **kw, - ): + h5file = h5.File(path, **kw) + + if FMT_VERSION_ATTR not in h5file.attrs: + raise exc + + try: + return HDF5Signatures(h5file) + + except SignaturesFileError as exc: + # Make sure errors in opening are annotated with the correct file name + exc.message = f'Error opening signatures file {path}: {exc.message}' + exc.filename = str(path) + raise + + +def dump_signatures_hdf5(path: 'FilePath', signatures: AbstractSignatureArray, **kw): """Write k-mer signatures and associated metadata to an HDF5 file. Parameters diff --git a/gambit/util/__init__.py b/src/gambit/util/__init__.py similarity index 100% rename from gambit/util/__init__.py rename to src/gambit/util/__init__.py diff --git a/gambit/util/indexing.py b/src/gambit/util/indexing.py similarity index 99% rename from gambit/util/indexing.py rename to src/gambit/util/indexing.py index 4be2a6a..36c9386 100644 --- a/gambit/util/indexing.py +++ b/src/gambit/util/indexing.py @@ -12,11 +12,13 @@ class AdvancedIndexingMixin: bounds checking, and converting negative indices. 
The following methods must be implemented by subtypes: + * :meth:`_getitem_int` * :meth:`_getitem_int_array` The following methods may optionally be overridden, but default to calling :meth:`_getitem_int_array`: - * :meth:`_getitem_range` + + * :meth:`_getitem_slice` * :meth:`_getitem_bool_array` """ diff --git a/gambit/util/io.py b/src/gambit/util/io.py similarity index 77% rename from gambit/util/io.py rename to src/gambit/util/io.py index 3cc7668..d1e52a8 100644 --- a/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -1,35 +1,25 @@ -"""Utility code for reading/writing data files.""" +"""Utility code for reading/writing data files. -import os -from io import TextIOWrapper -from typing import Union, Optional, IO, BinaryIO, ContextManager, Iterable, TypeVar -from contextlib import nullcontext -#: Alias for types which can represent a file system path -FilePath = Union[str, os.PathLike] +.. class:: FilePath -T = TypeVar('T') + Alias for types which can represent a file system path (``str`` or :class:`os.PathLike`). 
+""" -COMPRESSED_OPENERS = {None: open} +import os +from io import TextIOWrapper +from typing import Union, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar +from contextlib import nullcontext +from typing_extensions import TypeAlias -def _compressed_opener(compression): - """Decorator to register opener functions for compression types.""" - def decorator(func): - COMPRESSED_OPENERS[compression] = func - return func - return decorator +FilePath: TypeAlias = Union[str, os.PathLike] -@_compressed_opener('gzip') -def _open_gzip(path, mode, **kwargs): - """Opener for gzip-compressed files.""" - import gzip - return gzip.open(path, mode=mode, **kwargs) +T = TypeVar('T') -@_compressed_opener('auto') -def _open_auto(path, mode, **kwargs): +def _open_auto(path: FilePath, mode: str, **kwargs): """Open file for reading with compression determined automatically.""" if mode[0] != 'r': @@ -41,13 +31,13 @@ def _open_auto(path, mode, **kwargs): compression = guess_compression(file) file.seek(0) - if compression is None: + if compression == 'none': binary = file elif compression == 'gzip': import gzip binary = gzip.GzipFile(fileobj=file, mode='rb') else: - assert 0 + assert False, f'Unexpected compression type: {compression!r}' return TextIOWrapper(binary, **kwargs) if mode[1] == 't' else binary @@ -55,7 +45,7 @@ def _open_auto(path, mode, **kwargs): file.close() -def guess_compression(fobj: BinaryIO) -> Optional[str]: +def guess_compression(fobj: BinaryIO) -> str: """Guess the compression mode of an readable file-like object in binary mode. Assumes the current position is at the beginning of the file. 
@@ -65,26 +55,25 @@ def guess_compression(fobj: BinaryIO) -> Optional[str]: if magic == b'\x1f\x8b': return 'gzip' else: - return None + return 'none' -def open_compressed(compression: Optional[str], - path: FilePath, +def open_compressed(path: 'FilePath', mode: str = 'rt', + compression: str = 'auto', **kwargs, ) -> IO: """Open a file with compression method specified by a string. Parameters ---------- - compression : str - Compression method. None is no compression. Keys of :data:`COMPRESSED_OPENERS` are the - allowed values. path Path of file to open. May be string or path-like object. mode : str Mode to open file in - similar to :func:`open`. Must be exactly two characters, the first in ``rwax`` and the second in``tb``. + compression : str + Compression method. Allowed values are ``'none'``, ``'gzip'``, or ``'auto'``. \\**kwargs Additional text-specific keyword arguments identical to the following :func:`open` arguments: ``encoding``, ``errors``, and ``newlines``. @@ -94,19 +83,28 @@ def open_compressed(compression: Optional[str], IO Open file object. """ + + # Check mode if not(len(mode) == 2 and mode[0] in 'rwax' and mode[1] in 'tb'): msg = f'Invalid mode {mode!r}' if mode in 'rwax': msg += ' (must specify either binary or text mode)' raise ValueError(msg) - try: - opener = COMPRESSED_OPENERS[compression] + path = os.fsdecode(path) - except KeyError: - raise ValueError(f'Unknown compression type {compression!r}') from None + if compression == 'none': + return open(path, mode, **kwargs) - return opener(os.fsdecode(path), mode=mode, **kwargs) + elif compression == 'gzip': + import gzip + return gzip.open(path, mode, **kwargs) + + elif compression == 'auto': + return _open_auto(path, mode, **kwargs) + + else: + raise ValueError(f'Unknown compression type {compression!r}') from None class ClosingIterator(Iterable[T]): @@ -139,14 +137,14 @@ class ClosingIterator(Iterable[T]): method is called. 
""" - def __init__(self, iterable, fobj): + def __init__(self, iterable: Iterable[T], fobj): self.iterator = iter(iterable) self.fobj = fobj def __iter__(self): return self - def __next__(self): + def __next__(self) -> T: try: return next(self.iterator) @@ -173,7 +171,7 @@ def __exit__(self, *args): self.close() -def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> ContextManager[IO]: +def maybe_open(file_or_path: Union['FilePath', IO], mode: str = 'r', **open_kw) -> ContextManager[IO]: """Open a file given a file path as an argument, but pass existing file objects though. Intended to be used by API functions that take either type as an argument. If a file path is @@ -209,7 +207,7 @@ def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> return open(path, mode, **open_kw) -def read_lines(file_or_path: Union[FilePath, IO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]: +def read_lines(file_or_path: Union['FilePath', TextIO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]: """Iterate over lines in text file. Parameters @@ -233,7 +231,7 @@ def read_lines(file_or_path: Union[FilePath, IO], strip: bool=True, skip_empty: yield line -def write_lines(lines: Iterable, file_or_path: Union[FilePath, IO]): +def write_lines(lines: Iterable, file_or_path: Union['FilePath', TextIO]): """Write strings to text file, one per line. Parameters diff --git a/gambit/util/json.py b/src/gambit/util/json.py similarity index 96% rename from gambit/util/json.py rename to src/gambit/util/json.py index 409e20f..998a034 100644 --- a/gambit/util/json.py +++ b/src/gambit/util/json.py @@ -57,8 +57,7 @@ def dump(obj, f: TextIO, **kw): \\**kw Keyword arguments to :func:`json.dump`. 
""" - data = to_json(obj) - json.dump(data, f, **kw) + json.dump(obj, f, default=converter.unstructure, **kw) def load(f: TextIO, cls=Any): @@ -93,7 +92,7 @@ def dumps(obj, **kw) -> str: ------- str """ - return json.dumps(to_json(obj), **kw) + return json.dumps(obj, default=converter.unstructure, **kw) def loads(s: str, cls=Any): diff --git a/gambit/util/misc.py b/src/gambit/util/misc.py similarity index 71% rename from gambit/util/misc.py rename to src/gambit/util/misc.py index 44b468e..e797a43 100644 --- a/gambit/util/misc.py +++ b/src/gambit/util/misc.py @@ -1,11 +1,33 @@ """Utility code that doesn't fit anywhere else.""" import sys -from typing import Iterator, Tuple, Callable, Iterable +from typing import Iterator, Callable, Iterable, TypeVar, overload from functools import singledispatch, wraps -def zip_strict(*iterables: Iterator) -> Iterator[Tuple]: +T = TypeVar('T') +T2 = TypeVar('T2') +T3 = TypeVar('T3') +T4 = TypeVar('T4') + + +# Type-hinting zip() properly isn't really possible short of adding overloads for all possible #'s +# of arguments. Just do it for 2-4 here. +# Source code for https://github.com/python/typeshed/ does basically this. + +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], /) -> Iterator[tuple[T, T2]]: + pass # 2-iterable case + +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], it3: Iterable[T3], /) -> Iterator[tuple[T, T2, T3]]: + pass # 3-argument case + +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], it3: Iterable[T3], it4: Iterable[T4], /) -> Iterator[tuple[T, T2, T3, T4]]: + pass # 4-argument case + +def zip_strict(*iterables: Iterable) -> Iterator[tuple]: """Like the builtin ``zip`` function but raises an error if any argument is exhausted before the others. 
Parameters @@ -17,6 +39,16 @@ def zip_strict(*iterables: Iterator) -> Iterator[Tuple]: ------ ValueError """ + if sys.version_info >= (3, 10): + # Version 3.10+ has strict parameter for builtin zip() + return zip(*iterables, strict=True) + else: + return _zip_strict(*iterables) + + +def _zip_strict(*iterables: Iterable) -> Iterator[tuple]: + """Implementation for Python 3.9.""" + # Builtin zip gives empty output on empty input if not iterables: return @@ -73,24 +105,6 @@ def chunk_slices(n: int, size: int) -> Iterator[slice]: start = stop -# singledispatchmethod ot available in 3.7 -if sys.version_info[1] >= 8: - from functools import singledispatchmethod - -else: - # Make simple implementation - def singledispatchmethod(func): - dispatcher = singledispatch(func) - - @wraps(func) - def wrapper(self, arg, *rest, **kw): - impl = dispatcher.dispatch(type(arg)) - return impl(self, arg, *rest, **kw) - - wrapper.register = dispatcher.register - return wrapper - - def type_singledispatchmethod(func: Callable): """ Similar to ``singledispatchmethod``, but the first (non-self) argument is expected to be a @@ -124,12 +138,6 @@ def wrapper(self, cls, *rest, **kw): return wrapper -def is_importable(module: str) -> bool: - """Check if the specified module is importable, without actually importing it.""" - from importlib.util import find_spec - return find_spec(module) is not None - - def join_list_human(strings: Iterable[str], conj: str='and') -> str: """Join items into a single human-readable string with commas and the given conjunction.""" strings = list(strings) diff --git a/gambit/util/progress.py b/src/gambit/util/progress.py similarity index 94% rename from gambit/util/progress.py rename to src/gambit/util/progress.py index d744061..59a75ff 100644 --- a/gambit/util/progress.py +++ b/src/gambit/util/progress.py @@ -3,12 +3,14 @@ import sys from abc import ABC, abstractmethod -from typing import Optional, Union, Callable, Iterable, TextIO, Dict, Mapping, Any, cast, List, \ 
- Tuple, Iterator, ContextManager +from typing import Optional, Union, Callable, Iterable, TextIO, Mapping, Any, cast, Iterator, \ + TypeVar from warnings import warn from contextlib import contextmanager +T = TypeVar('T') + #: Type alias for a callable which takes ``total`` and keyword arguments and returns an AbstractProgressMeter ProgressFactoryFunc = Callable[[int], 'AbstractProgressMeter'] @@ -84,7 +86,7 @@ def create(cls, total Total number of iterations to completion. initial - Initial value of :attr:`n`. + Initial value of ``n``. desc Description to display to the user. file @@ -119,9 +121,9 @@ class ProgressConfig: Keyword arguments to pass to callable. """ callable: ProgressFactoryFunc - kw: Dict[str, Any] + kw: dict[str, Any] - def __init__(self, callable: ProgressFactoryFunc, kw: Dict[str, Any]): + def __init__(self, callable: ProgressFactoryFunc, kw: dict[str, Any]): self.callable = callable self.kw = kw @@ -192,7 +194,7 @@ def get_progress(arg: ProgressArg, total: int, initial: int = 0, **kw) -> Abstra Accepts the following types/values for the argument: - :class:`.ProgressConfig` - - ``None`` - uses :class:`.NullProgressBar`. + - ``None`` - uses :class:`.NullProgressMeter`. - ``True`` - uses class returned by :func:`.default_progress_cls`. - ``False`` - same as ``None``. - ``str`` key - Looks up progress bar class/factory function in :data:`.REGISTRY`. 
@@ -215,11 +217,40 @@ def get_progress(arg: ProgressArg, total: int, initial: int = 0, **kw) -> Abstra return config.create(total, initial=initial, **kw) -def iter_progress(iterable: Iterable, +class ProgressIterator(Iterator[T]): + itr: Iterator[T] + meter: AbstractProgressMeter + + def __init__(self, iterable: Iterable[T], meter: AbstractProgressMeter): + self.itr = iter(iterable) + self.meter = meter + self._first = True + + def __next__(self): + if not self._first: + self.meter.increment() + self._first = False + + try: + value = next(self.itr) + except StopIteration: + self.meter.close() # Close on reaching end + raise + + return value + + def __enter__(self): + return self + + def __exit__(self, *args): + self.meter.close() + + +def iter_progress(iterable: Iterable[T], progress: ProgressArg = True, total: Optional[int] = None, **kw, - ) -> Iterable: + ) -> ProgressIterator[T]: """Display a progress meter while iterating over an object. The returned iterator object can also be used as a context manager to ensure that the progress @@ -238,7 +269,7 @@ def iter_progress(iterable: Iterable, Returns ------- - .ProgressIterator + ProgressIterator Iterator over values in ``iterable`` which advances a progress meter. 
""" if total is None: @@ -248,36 +279,7 @@ def iter_progress(iterable: Iterable, return ProgressIterator(iterable, meter) -class ProgressIterator(Iterator): - itr: Iterator - meter: AbstractProgressMeter - - def __init__(self, iterable: Iterable, meter: AbstractProgressMeter): - self.itr = iter(iterable) - self.meter = meter - self._first = True - - def __next__(self): - if not self._first: - self.meter.increment() - self._first = False - - try: - value = next(self.itr) - except StopIteration: - self.meter.close() # Close on reaching end - raise - - return value - - def __enter__(self): - return self - - def __exit__(self, *args): - self.meter.close() - - -def capture_progress(config: ProgressConfig) -> Tuple[ProgressConfig, List[AbstractProgressMeter]]: +def capture_progress(config: ProgressConfig) -> tuple[ProgressConfig, list[AbstractProgressMeter]]: """ Creates a ``ProgressConfig`` which captures references to the progress meter instances created with it. @@ -309,7 +311,7 @@ def check_progress(*, total: Optional[int] = None, allow_decrement: bool = False, check_closed: bool = True, - ) -> ContextManager[ProgressConfig]: + ) -> Iterator[ProgressConfig]: """Context manager which checks a progress meter is advanced to completion. Returned context manager yields a ``ProgressConfig`` instance on enter, tests are run when @@ -413,7 +415,13 @@ def create(cls, total: int, initial: int = 0, **kw): class TqdmProgressMeter(AbstractProgressMeter): """Wrapper around a progress meter from the ``tqdm`` library.""" - def __init__(self, pbar: 'tqdm.std.tqdm'): + def __init__(self, pbar): + """ + Parameters + ---------- + pbar + ``tqdm.std.tqdm`` instance to wrap. + """ self.pbar = pbar @property diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..5cc3df0 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +""" +Adding the __init__.py to the tests/ directory (and its subdirectories) makes them all part of the +same package structure. 
+ +- Allows test modules/files to import from each other (including from modules in different + directories, such as files in tests/cli/ importing from tests/testdb.py). +- Does not require test modules to have unique names. + +This necessitates using the "prepend" (or possibly "append"?) import mode (which is the default). +This setup comes with its own set of caveats. See +https://docs.pytest.org/en/7.1.x/explanation/pythonpath.html for a discussion of how test modules +are imported. +""" diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmarks/benchmark_signatures.py b/tests/benchmarks/benchmark_signatures.py index d0b2450..f8bb9b8 100644 --- a/tests/benchmarks/benchmark_signatures.py +++ b/tests/benchmarks/benchmark_signatures.py @@ -5,7 +5,7 @@ from gambit.kmers import KmerSpec from gambit.sigs.calc import calc_signature, ArrayAccumulator, SetAccumulator -from gambit.test import random_seq +from ..common import random_seq @pytest.fixture(scope='module', params=[10**4, 10**6]) @@ -25,7 +25,7 @@ def prefix_len(request): @pytest.fixture() -def kspec(k, prefix_len): +def kspec(k: int, prefix_len: int): prefix ='ATGACCT'[:prefix_len] return KmerSpec(k, prefix) @@ -38,6 +38,6 @@ def accumulator(request): return request.param -def benchmark_calc_signature(seq, kspec, benchmark, accumulator): +def benchmark_calc_signature(seq: bytes, kspec: KmerSpec, benchmark, accumulator): acc = accumulator(kspec.k) benchmark(calc_signature, kspec, seq, accumulator=acc) diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gambit/cli/test.py b/tests/cli/common.py similarity index 86% rename from gambit/cli/test.py rename to tests/cli/common.py index 5c22800..2b4cd1c 100644 --- a/gambit/cli/test.py +++ b/tests/cli/common.py @@ -1,12 +1,12 @@ """Tools for testing CLI.""" -from typing import Optional, ContextManager, Sequence +from 
typing import Optional, Sequence, Any, Iterable, Iterator from contextlib import contextmanager import click from click.testing import CliRunner, Result -from .root import cli +from gambit.cli.root import cli DEFAULT_ENV = dict( @@ -14,7 +14,7 @@ ) -def pop_kwargs(d, keys): +def pop_kwargs(d: dict[str, Any], keys: Iterable[str]) -> dict[str, Any]: out = dict() for k in keys: try: @@ -33,6 +33,7 @@ def default_runner(**kw) -> CliRunner: kw.setdefault('env', DEFAULT_ENV) return CliRunner(**kw) + def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Optional[bool]=True, **kw) -> Result: """Invoke CLI in test context, using different defaults than base Click method. @@ -53,7 +54,7 @@ def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Option result = runner.invoke(cli, args, **kw) if success is True: - assert result.exit_code == 0 + assert result.exit_code == 0, result.stderr if success is False: assert result.exit_code != 0 @@ -61,7 +62,7 @@ def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Option @contextmanager -def allow_no_args(command: click.Command) -> ContextManager[click.Command]: +def allow_no_args(command: click.Command) -> Iterator[click.Command]: """Context manager which patches a command to allow calling with no arguments. 
Group commands will print help and exit if called without a subcommand, this will also happen @@ -77,4 +78,3 @@ def allow_no_args(command: click.Command) -> ContextManager[click.Command]: finally: command.no_args_is_help = old_naih - diff --git a/tests/cli/test_cli_dist.py b/tests/cli/test_cli_dist.py deleted file mode 100644 index b900bb0..0000000 --- a/tests/cli/test_cli_dist.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Tests for the "dist" command.""" - -import json - -import pytest -import numpy as np - -from gambit.kmers import KmerSpec -from gambit.metric import jaccarddist_matrix -from gambit.sigs import SignatureList, dump_signatures -from gambit.cli.test import invoke_cli -from gambit.util.io import write_lines -from gambit.cluster import load_dmat_csv -import gambit.util.json as gjson -from gambit.kmers import DEFAULT_KMERSPEC - - -@pytest.fixture() -def outfile(tmp_path): - return tmp_path / 'out.csv' - -@pytest.fixture(params=[None]) -def nqueries(request): - return request.param - -@pytest.fixture(params=[False]) -def queries_gz(request): - return request.param - -@pytest.fixture() -def query_files(testdb, nqueries, queries_gz): - return [f for f in testdb.get_query_files(queries_gz)[:nqueries]] - -@pytest.fixture(params=[None]) -def nrefs(request): - return request.param - -@pytest.fixture(params=[False]) -def refs_gz(request): - return request.param - -@pytest.fixture() -def ref_files(testdb, nrefs, refs_gz): - return [f for f in testdb.get_ref_files(refs_gz)[:nrefs]] - -@pytest.fixture(name='make_args') -def make_args_factory(testdb, query_files, ref_files, outfile, tmp_path): - - def make_args(q_opt=False, # Pass queries with -q option - q_list=False, # Pass queries with list file - q_sigs=False, # Use query signature file - r_opt=False, # Pass refs with -r option - r_list=False, # Pass refs with list file - r_sigs=False, # Use refs signature file - r_db=False, # Use db for refs - with_db=False, # Pass db at root level - with_kspec=False, # Pass -k and 
-p options - extra=(), # Additional args - ): - - args = ['dist', '-o', outfile, *extra] - - if with_db: - args.insert(0, f'--db={testdb.paths.root}') - - if q_opt: - for file in query_files: - args.extend(['-q', file]) - if q_list: - qlfile = tmp_path / 'queries.txt' - write_lines(query_files, qlfile) - args.extend(['--ql', qlfile]) - args.extend(['--qdir', testdb.paths.query_genomes_dir]) - if q_sigs: - args.extend(['--qs', testdb.paths.query_signatures]) - - if r_opt: - for file in ref_files: - args.extend(['-r', file]) - if r_list: - rlfile = tmp_path / 'refs.txt' - write_lines(ref_files, rlfile) - args.extend(['--rl', rlfile]) - args.extend(['--rdir', testdb.paths.ref_genomes_dir]) - if r_sigs: - args.extend(['--rs', testdb.paths.ref_signatures]) - if r_db: - args.append('--use-db') - - if with_kspec: - args += [ - '-k', str(testdb.kmerspec.k), - f'--prefix={testdb.kmerspec.prefix_str}', - ] - - return args - - return make_args - -@pytest.fixture(scope='session') -def expected_matrix(testdb): - return jaccarddist_matrix(testdb.query_signatures, testdb.ref_signatures) - -@pytest.fixture(scope='session') -def expected_matrix_square(testdb): - return jaccarddist_matrix(testdb.query_signatures, testdb.query_signatures) - -@pytest.fixture(name='check_output') -def check_output_factory(outfile, expected_matrix, nqueries, nrefs): - def check_output(): - dmat, row_ids, col_ids = load_dmat_csv(outfile) - assert np.allclose(dmat, expected_matrix[:nqueries, :nrefs], atol=1e-4) - # TODO check row/column IDs - - return check_output - - -@pytest.mark.parametrize( - 'q_type,r_type,nqueries,nrefs,queries_gz,refs_gz', - [ - ('sigs', 'sigs', None, None, False, False), - ('list', 'sigs', 10, None, False, False), - ('sigs', 'list', None, 10 , False, False), - ('list', 'list', 10, 10 , False, False), - ('opt', 'sigs', 10, None, False, False), - ('sigs', 'opt', None, 10 , False, False), - ('sigs', 'db', None, None, False, False), - ('list', 'sigs', 10, None, True, False), - 
('sigs', 'list', None, 10 , False, True), - ], - indirect=['nqueries', 'nrefs', 'queries_gz', 'refs_gz'], -) -def test_basic(make_args, check_output, q_type, r_type): - """Test test basic usage, with query/ref sequences/signatures from different sources.""" - - args = make_args( - q_opt=q_type == 'opt', - q_list=q_type == 'list', - q_sigs=q_type == 'sigs', - r_opt=r_type == 'opt', - r_list=r_type == 'list', - r_sigs=r_type == 'sigs', - r_db=r_type == 'db', - with_kspec=True, - with_db=r_type == 'db', - ) - invoke_cli(args) - check_output() - -def test_kspec(make_args, testdb, tmp_path): - """Test selection of k-mer params and errors on inconsistencies.""" - - alt_kspec = KmerSpec(6, 'AC') - assert alt_kspec != testdb.kmerspec - alt_kspec_args = ['-k', alt_kspec.k, '-p', alt_kspec.prefix_str] - - alt_sigfile = tmp_path / 'alt_sigs.gs' - alt_sigs = SignatureList([], alt_kspec) - dump_signatures(alt_sigfile, alt_sigs) - - # Default kspec - args = make_args(q_list=True, r_list=True, extra=('--dump-params',)) - result = invoke_cli(args) - params = json.loads(result.stdout) - assert params['kmerspec'] == gjson.to_json(DEFAULT_KMERSPEC) - - # Kspec from args inconsistent with query or reference signatures - args = make_args(q_sigs=True, r_sigs=True) + alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_list=True, r_sigs=True) + alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_sigs=True, r_list=True) + alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_sigs=True, r_db=True) + alt_kspec_args - invoke_cli(args, success=False) - - # Ref and query signatures inconsistent - args = make_args(r_sigs=True, extra=('--qs', alt_sigfile)) - invoke_cli(args, success=False) - args = make_args(r_db=True, extra=('--qs', alt_sigfile)) - invoke_cli(args, success=False) - -@pytest.mark.parametrize( - 'q_type,nqueries,queries_gz', - [ - ('sigs', None, False), - ('list', 10, False), - ('opt', 10, False), - ('list', 10, True), - ], - 
indirect=['nqueries', 'queries_gz'], -) -def test_square(make_args, q_type, outfile, expected_matrix_square, nqueries): - """Test --square option.""" - - args = make_args( - q_opt=q_type == 'opt', - q_list=q_type == 'list', - q_sigs=q_type == 'sigs', - with_kspec=True, - extra=['--square'], - ) - invoke_cli(args) - - out_dmat, row_ids, col_ids = load_dmat_csv(outfile) - assert np.allclose(out_dmat, expected_matrix_square[:nqueries, :nqueries], atol=1e-4) - assert row_ids == col_ids diff --git a/tests/cli/test_cli_query.py b/tests/cli/test_cli_query.py deleted file mode 100644 index eea9c5d..0000000 --- a/tests/cli/test_cli_query.py +++ /dev/null @@ -1,178 +0,0 @@ -""" -Test the 'gambit query' CLI command using the testdb_210818 database. -""" - -import os -from copy import copy - -import pytest - -from gambit.cli.test import invoke_cli -from gambit.results.test import check_json_results, check_csv_results -from gambit.seq import SequenceFile -from gambit.query import QueryInput -from gambit.util.misc import zip_strict -from gambit.util.io import write_lines -from gambit.cli.common import strip_seq_file_ext - - -@pytest.fixture(params=[None]) -def nqueries(request): - """Number of testdb query files to use, None means use all of them. - - Can be changed via indirect parameterization in specific tests. - Note than with slice notation, `[:None]` is the same as `[:]`. 
- """ - return request.param - - -@pytest.fixture() -def query_files(testdb, nqueries): - """Paths to query files.""" - return [SequenceFile(f.path, f.format, f.compression) for f in testdb.get_query_files()[:nqueries]] - - -@pytest.fixture() -def cd_query_genomes(testdb): - """Change working directory to query genomes directory.""" - old_wd = os.getcwd() - try: - os.chdir(testdb.paths.query_genomes_dir) - yield - finally: - os.chdir(old_wd) - - -@pytest.fixture(name='make_args') -def make_args_factory(testdb, query_files, tmp_path): - - def make_args(positional=False, list_file=False, sig_file=False, output=None, outfmt=None, strict=False): - """Make command line arguments for query file.""" - - args = [f'--db={testdb.paths.root}', 'query'] - args.append('--strict' if strict else '--no-strict') - - if output is not None: - args.append(f'--output={output}') - - if outfmt is not None: - args.append(f'--outfmt={outfmt}') - - if positional: - args.extend(query_files) - - if list_file: - list_file = tmp_path / 'genomes.txt' - write_lines(query_files, list_file) - args += ['-l', str(list_file), f'--ldir={testdb.paths.query_genomes_dir}'] - - if sig_file: - args.append(f'--sigfile={testdb.paths.query_signatures}') - - return list(map(str, args)) - - return make_args - -@pytest.fixture(name='make_ref_results') -def make_ref_results_factory(testdb, nqueries, query_files): - """ - Make a copy of the reference query results to compare to, modifying to account for possibly - different query inputs and # of queries. 
- """ - def make_ref_results(strict, inputs): - ref_results = copy(testdb.get_query_results(strict)) - ref_results.items = ref_results.items[:nqueries] - - for item, input in zip_strict(ref_results.items, inputs): - item.input = input - - return ref_results - - return make_ref_results - - -def check_results(results_file, out_fmt, ref_results): - """Check results output matches reference QueryResults object.""" - if out_fmt == 'json': - with open(results_file) as fh: - check_json_results(fh, ref_results, strict=False) - elif out_fmt == 'csv': - with open(results_file) as fh: - check_csv_results(fh, ref_results, strict=False) - else: - raise ValueError(f'Invalid out_fmt {out_fmt!r}') - - -@pytest.mark.parametrize( - ['nqueries', 'use_list_file', 'out_fmt', 'strict', 'gzipped'], - [ - (None, False, 'json', False, False), - (20, False, 'csv', False, False), - (None, False, 'json', True, False), - (20, False, 'csv', True, False), - (None, False, 'json', False, True), - (20, True, 'json', False, False), - ], - indirect=['nqueries'], -) -def test_full_query(make_args, make_ref_results, use_list_file, out_fmt, strict, gzipped, query_files, tmp_path): - """Run a full query using the command line interface.""" - - inputs = [ - QueryInput(strip_seq_file_ext(file.path.name), file) - for file in query_files - ] - ref_results = make_ref_results(strict, inputs) - - results_file = tmp_path / ('results.' + out_fmt) - - args = make_args( - positional=not use_list_file, - list_file=use_list_file, - output=results_file, - outfmt=out_fmt, - strict=strict, - ) - - invoke_cli(args) - check_results(results_file, out_fmt, ref_results) - - -# Not really necessary to check all combinations of parameters. 
-@pytest.mark.parametrize('out_fmt', ['json']) -@pytest.mark.parametrize('strict', [False]) -def test_sigfile(make_args, make_ref_results, testdb, out_fmt, strict, tmp_path): - """Test using signature file instead of parsing genome files.""" - - inputs = list(map(QueryInput, testdb.query_signatures.ids)) - ref_results = make_ref_results(strict, inputs) - - results_file = tmp_path / ('results.' + out_fmt) - - args = make_args( - sig_file=True, - output=results_file, - outfmt=out_fmt, - strict=False, - ) - - invoke_cli(args) - check_results(results_file, out_fmt, ref_results) - - -def test_invalid(make_args, tmp_path): - """Test invalid parameter values exit with error code.""" - - results_file = tmp_path / ('results.json') - - # No genomes or signatures - args = make_args(output=results_file) - invoke_cli(args, success=False) - - # Multiple inputs - args = make_args(output=results_file, positional=True, list_file=True) - assert invoke_cli(args, success=False) - args = make_args(output=results_file, positional=True, sig_file=True) - assert invoke_cli(args, success=False) - args = make_args(output=results_file, list_file=True, sig_file=True) - assert invoke_cli(args, success=False) diff --git a/tests/cli/test_cli_common.py b/tests/cli/test_common.py similarity index 86% rename from tests/cli/test_cli_common.py rename to tests/cli/test_common.py index d74f433..0145108 100644 --- a/tests/cli/test_cli_common.py +++ b/tests/cli/test_common.py @@ -1,17 +1,19 @@ """Test code in gambit.cli.common.""" from pathlib import Path +from typing import Iterable import pytest import click import numpy as np from gambit.cli import cli, common -from gambit.cli.test import default_runner, allow_no_args from gambit.db import ReferenceDatabase -from gambit.seq import SequenceFile from gambit.util.misc import zip_strict -from gambit.util.io import write_lines +from gambit.util.io import write_lines, FilePath + +from ..testdb import TestDB +from .common import default_runner, allow_no_args 
class TestCLIContext: @@ -32,7 +34,6 @@ def test_no_db(self): assert not ctx.has_database assert not ctx.has_genomes assert not ctx.has_signatures - assert ctx.engine is None assert ctx.Session is None assert ctx.signatures is None @@ -44,7 +45,7 @@ def test_no_db(self): ctx.require_signatures() @pytest.mark.parametrize('method', ['option', 'envvar']) - def test_with_db(self, method, testdb): + def test_with_db(self, method: str, testdb: TestDB): """Test with database given through the --db argument or environment variable.""" dbpath = testdb.paths.root @@ -63,7 +64,6 @@ def test_with_db(self, method, testdb): assert ctx.has_database assert ctx.has_genomes assert ctx.has_signatures - assert ctx.engine is not None assert ctx.Session is not None assert ctx.signatures is not None @@ -96,7 +96,7 @@ def test_strip_seq_file_ext(): class TestGetSequenceFiles: """Test the get_sequence_files() function.""" - def check_ids(self, ids, paths, strip_dir, strip_ext): + def check_ids(self, ids: Iterable[str], paths: Iterable['FilePath'], strip_dir: bool, strip_ext: bool): for id_, path in zip_strict(ids, paths): if strip_dir: expected = Path(path).name @@ -107,14 +107,12 @@ def check_ids(self, ids, paths, strip_dir, strip_ext): assert id_ == expected - def check_files(self, files, paths): - for file, path in zip_strict(files, paths): - assert isinstance(file, SequenceFile) - assert file.path == Path(path) - assert file.format == 'fasta' - assert file.compression == 'auto' + def check_files(self, files, expected): + for file, ex in zip_strict(files, expected): + assert isinstance(file, Path) + assert file == Path(ex) - def test_explicit(self, strip_dir, strip_ext): + def test_explicit(self, strip_dir: bool, strip_ext: bool): """Test given explicit paths from CLI argument.""" paths = [f'path/to/{i + 1}.fasta' for i in range(10)] ids, files = common.get_sequence_files(paths, None, None, strip_dir=strip_dir, strip_ext=strip_ext) @@ -126,7 +124,7 @@ def test_explicit(self, strip_dir, 
strip_ext): ('path/to/genomes', False), # Relative to other directory ('foo/baz', True), # Absolute paths in file, ignore wd ]) - def test_listfile(self, wd, absolute, tmpdir, strip_dir, strip_ext): + def test_listfile(self, wd: str, absolute: bool, tmpdir: Path, strip_dir: bool, strip_ext: bool): """Test reading file paths from list file.""" wd = Path(wd) list_paths = [f'{i + 1}.fasta' for i in range(10)] diff --git a/tests/cli/test_dist.py b/tests/cli/test_dist.py new file mode 100644 index 0000000..1be78b7 --- /dev/null +++ b/tests/cli/test_dist.py @@ -0,0 +1,278 @@ +"""Tests for the "dist" command.""" + +import json +from typing import Optional, Iterable +from pathlib import Path + +import pytest +import numpy as np + +from gambit.kmers import KmerSpec +from gambit.metric import jaccarddist_matrix +from gambit.sigs import SignatureList, dump_signatures +from gambit.util.io import write_lines, FilePath +from gambit.cluster import load_dmat_csv +import gambit.util.json as gjson +from gambit.kmers import DEFAULT_KMERSPEC + +from ..testdb import TestDB +from .common import invoke_cli + + +def get_query_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[Path]: + return testdb.get_query_files(gz)[:n] + + +def get_ref_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[Path]: + return testdb.get_ref_files(gz)[:n] + + +def make_args(testdb: TestDB, + outfile: Path, + *, + q_opt: Optional[list[FilePath]] = None, # Query files with -q option + q_list: Optional[Path] = None, # Query list file + q_sigs: bool = False, # Use query signature file + r_opt: Optional[list[FilePath]] = None, # Ref files with -r option + r_list: Optional[Path] = None, # Ref list file + r_sigs: bool = False, # Use refs signature file + r_db: bool = False, # Use db for refs + with_db: bool = False, # Pass db at root level + kmerspec: Optional[KmerSpec] = None, # Pass -k and -p options + extra: Iterable[str] = (), # Additional args + ) -> list[str]: + + 
args: list[str] = ['dist', '-o', str(outfile), *extra] + + if with_db: + args.insert(0, f'--db={testdb.paths.root}') + + # Queries + if q_opt is not None: + for file in q_opt: + args.extend(['-q', str(file)]) + if q_list is not None: + args.extend(['--ql', str(q_list)]) + args.extend(['--qdir', str(testdb.paths.query_genomes_dir)]) + if q_sigs: + args.extend(['--qs', str(testdb.paths.query_signatures)]) + + # References + if r_opt is not None: + for file in r_opt: + args.extend(['-r', str(file)]) + if r_list is not None: + args.extend(['--rl', str(r_list)]) + args.extend(['--rdir', str(testdb.paths.ref_genomes_dir)]) + if r_sigs: + args.extend(['--rs', str(testdb.paths.ref_signatures)]) + if r_db: + args.append('--use-db') + + if kmerspec is not None: + args += [ + '-k', str(kmerspec.k), + '--prefix', kmerspec.prefix_str, + ] + + return args + + +def check_output(outfile: Path, expected_matrix: np.ndarray, nqueries: Optional[int], nrefs: Optional[int]): + dmat, row_ids, col_ids = load_dmat_csv(outfile) + assert np.allclose(dmat, expected_matrix[:nqueries, :nrefs], atol=1e-4) + # TODO: check row/col IDs + + +@pytest.fixture(scope='session') +def expected_matrix(testdb: TestDB): + return jaccarddist_matrix(testdb.query_signatures, testdb.ref_signatures) + + +@pytest.fixture(scope='session') +def expected_matrix_square(testdb: TestDB): + return jaccarddist_matrix(testdb.query_signatures, testdb.query_signatures) + + +@pytest.mark.parametrize( + 'q_type,r_type,queries_gz,refs_gz', + [ + ('sigs', 'sigs', False, False), + ('list', 'sigs', False, False), + ('sigs', 'list', False, False), + ('list', 'list', False, False), + ('opt', 'sigs', False, False), + ('sigs', 'opt', False, False), + ('sigs', 'db', False, False), + ('list', 'sigs', True, False), + ('sigs', 'list', False, True), + ], +) +def test_basic(testdb: TestDB, + q_type: str, # Query input format + r_type: str, # Reference input format + queries_gz: bool, # Use gzipped query files + refs_gz: bool, # Use gzipped 
reference files + expected_matrix: np.ndarray, + tmp_path: Path, + ): + """Test basic usage, with query/ref sequences/signatures from different sources.""" + + # Use only 10 query/reference files if passing by CLI option or by list file + nqueries = 10 if q_type in ('opt', 'list') else None + nrefs = 10 if r_type in ('opt', 'list') else None + + outfile = tmp_path / 'out.csv' + query_files = get_query_files(testdb, nqueries, queries_gz) + ref_files = get_ref_files(testdb, nrefs, refs_gz) + + # Query sequence specification + if q_type == 'opt': + query_kw = dict(q_opt=query_files) + elif q_type == 'list': + q_list = tmp_path / 'queries.txt' + write_lines(query_files, q_list) + query_kw = dict(q_list=q_list) + elif q_type == 'sigs': + query_kw = dict(q_sigs=True) + else: + assert False + + # Reference sequence specification + if r_type == 'opt': + ref_kw = dict(r_opt=ref_files) + elif r_type == 'list': + r_list = tmp_path / 'refs.txt' + write_lines(ref_files, r_list) + ref_kw = dict(r_list=r_list) + elif r_type == 'sigs': + ref_kw = dict(r_sigs=True) + elif r_type == 'db': + ref_kw = dict(r_db=True) + else: + assert False + + using_sigfile = q_type == 'sigs' or r_type == 'sigs' + + args = make_args( + testdb, + outfile, + **query_kw, + **ref_kw, + kmerspec=None if using_sigfile else testdb.kmerspec, + with_db=r_type == 'db', + ) + invoke_cli(args) + check_output(outfile, expected_matrix, nqueries, nrefs) + + +def test_default_kspec(testdb: TestDB, tmp_path: Path): + """Test that the default KmerSpec is used when not otherwise specified.""" + + outfile = tmp_path / 'out.csv' + q_list = tmp_path / 'queries.txt' + q_list.touch() + r_list = tmp_path / 'refs.txt' + r_list.touch() + + args = make_args(testdb, outfile, q_list=q_list, r_list=r_list, extra=('--dump-params',)) + + result = invoke_cli(args) + params = json.loads(result.stdout) + assert params['kmerspec'] == gjson.to_json(DEFAULT_KMERSPEC) + + +def test_kspec_err(testdb: TestDB, tmp_path: Path): + """Test 
selection of k-mer params and errors on inconsistencies.""" + + outfile = tmp_path / 'out.csv' + + query_files = get_query_files(testdb, 10) + query_lf = tmp_path / 'queries.txt' + write_lines(query_files, query_lf) + + ref_files = get_ref_files(testdb, 10) + ref_lf = tmp_path / 'refs.txt' + write_lines(ref_files, ref_lf) + + # Alternate kmerspec + kspec1 = testdb.kmerspec + kspec2 = KmerSpec(5, 'AC') + assert kspec2 != kspec1 + + # Create signatures file for alt kspec + alt_sigfile = tmp_path / 'alt_sigs.gs' + alt_sigs = SignatureList([], kspec2) + dump_signatures(alt_sigfile, alt_sigs) + + # Kspec from args inconsistent with query or reference signatures + msg = ( + f'Error: K-mer search parameters {{}} ({kspec2.k}/{kspec2.prefix_str}) ' + f'do not match those of {{}} ({kspec1.k}/{kspec1.prefix_str}).' + ) + + args = make_args(testdb, outfile, q_sigs=True, r_list=ref_lf, kmerspec=kspec2) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'query signatures') + + args = make_args(testdb, outfile, q_list=query_lf, r_sigs=True, kmerspec=kspec2) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'reference signatures') + + args = make_args(testdb, outfile, q_list=query_lf, r_db=True, kmerspec=kspec2, with_db=True) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'reference signatures') + + # Ref and query signatures have differing kspec + args = make_args(testdb, outfile, r_sigs=True, extra=('--qs', str(alt_sigfile))) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('of query signatures', 'reference signatures') + + args = make_args(testdb, outfile, r_db=True, with_db=True, extra=('--qs', str(alt_sigfile))) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('of query signatures', 'reference 
signatures') + + +@pytest.mark.parametrize( + 'q_type,queries_gz', + [ + ('sigs', False), + ('list', False), + ('opt', False), + ('list', True), + ], +) +def test_square(testdb: TestDB, + q_type: str, + queries_gz: bool, + expected_matrix_square: np.ndarray, + tmp_path: Path, + ): + """Test --square option.""" + + outfile = tmp_path / 'out.csv' + nqueries = 10 if q_type in ('opts', 'list') else None + query_files = get_query_files(testdb, nqueries, queries_gz) + + # Query sequence specification + if q_type == 'opt': + query_kw = dict(q_opt=query_files) + elif q_type == 'list': + q_list = tmp_path / 'queries.txt' + write_lines(query_files, q_list) + query_kw = dict(q_list=q_list) + elif q_type == 'sigs': + query_kw = dict(q_sigs=True) + else: + assert False + + args = make_args( + testdb, + outfile, + **query_kw, + kmerspec=None if q_type == 'sigs' else testdb.kmerspec, + extra=['--square'], + ) + invoke_cli(args) + + check_output(outfile, expected_matrix_square, nqueries, nqueries) diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py new file mode 100644 index 0000000..2e80033 --- /dev/null +++ b/tests/cli/test_query.py @@ -0,0 +1,188 @@ +""" +Test the 'gambit query' CLI command using the testdb_210818 database. 
+""" + +from copy import copy +from typing import Optional, Iterable +from pathlib import Path + +import pytest + +from gambit.query import QueryResults +from gambit.util.misc import zip_strict +from gambit.util.io import write_lines, FilePath +from gambit.cli.common import strip_seq_file_ext + +from ..testdb import TestDB +from ..results import check_json_results, check_csv_results +from .common import invoke_cli + + +def make_args(testdb: TestDB, *, + positional_files: Optional[Iterable[FilePath]] = None, + list_file: Optional['FilePath'] = None, + sig_file: bool = False, + output: Optional['FilePath'] = None, + outfmt: Optional[str] = None, + strict: bool=False, + ) -> list[str]: + """Make command line arguments for querying.""" + + args: list[str] = [f'--db={testdb.paths.root}', 'query'] + args.append('--strict' if strict else '--no-strict') + + if output is not None: + args.append(f'--output={output}') + + if outfmt is not None: + args.append(f'--outfmt={outfmt}') + + if positional_files is not None: + args.extend(map(str, positional_files)) + + if list_file is not None: + args += ['-l', str(list_file), f'--ldir={testdb.paths.query_genomes_dir}'] + + if sig_file: + args.append(f'--sigfile={testdb.paths.query_signatures}') + + return args + + +def make_ref_results(testdb: TestDB, + labels: Iterable[str], + strict: bool, + files: Optional[Iterable[FilePath]], + nqueries: Optional[int] = None, + ): + """ + Make a copy of the reference query results to compare to, modifying to account for possibly + different query labels/files and # of queries. 
+ """ + ref_results = copy(testdb.get_query_results(strict)) + ref_results.items = ref_results.items[:nqueries] + + for item, label in zip_strict(ref_results.items, labels): + item.label = label + + if files is None: + for item in ref_results.items: + item.file = None + + if files is not None: + for item, file in zip_strict(ref_results.items, files): + item.file = Path(file) + + return ref_results + + +def check_results(results_file: Path, out_fmt: str, ref_results: QueryResults): + """Check results output matches reference QueryResults object.""" + if out_fmt == 'json': + with open(results_file) as fh: + check_json_results(fh, ref_results, strict=False) + + elif out_fmt == 'csv': + with open(results_file) as fh: + check_csv_results(fh, ref_results, strict=False) + + elif out_fmt == 'archive': + assert results_file.is_file() # TODO + + else: + raise ValueError(f'Invalid out_fmt {out_fmt!r}') + + +@pytest.mark.parametrize( + ['nqueries', 'use_list_file', 'out_fmt', 'strict', 'gzipped'], + [ + (None, False, 'json', False, False), + (20, False, 'csv', False, False), + (None, False, 'json', True, False), + (20, False, 'csv', True, False), + (None, False, 'json', False, True), + (20, True, 'json', False, False), + (20, False, 'archive', False, False), + ], +) +def test_full_query(testdb: TestDB, + nqueries: Optional[int], + use_list_file: bool, + out_fmt: str, + strict: bool, + gzipped: bool, + tmp_path: Path, + ): + """Run a full query using the command line interface.""" + + query_files = testdb.get_query_files(gzipped)[:nqueries] + labels = [strip_seq_file_ext(file.name) for file in query_files] + ref_results: QueryResults = make_ref_results(testdb, labels, strict, query_files, nqueries=nqueries) + + results_file = tmp_path / ('results.' 
+ out_fmt) + + if use_list_file: + list_file = tmp_path / 'genomes.txt' + write_lines(query_files, list_file) + input_kw = dict(list_file=list_file) + else: + input_kw = dict(positional_files=query_files) + + args = make_args( + testdb, + output=results_file, + outfmt=out_fmt, + strict=strict, + **input_kw, + ) + + invoke_cli(args) + check_results(results_file, out_fmt, ref_results) + + +# Not really necessary to check all combinations of parameters. +@pytest.mark.parametrize('out_fmt', ['json']) +@pytest.mark.parametrize('strict', [False]) +def test_sigfile(testdb: TestDB, out_fmt: str, strict: bool, tmp_path: Path): + """Test using signature file instead of parsing genome files.""" + + ref_results = make_ref_results(testdb, testdb.query_signatures.ids, strict, None) + + results_file = tmp_path / ('results.' + out_fmt) + + args = make_args( + testdb, + sig_file=True, + output=results_file, + outfmt=out_fmt, + strict=False, + ) + + invoke_cli(args) + check_results(results_file, out_fmt, ref_results) + + +def test_invalid(testdb: TestDB, tmp_path: Path): + """Test invalid parameter values exit with error code.""" + + query_files = testdb.get_query_files() + list_file = tmp_path / 'list.json' + write_lines(query_files, list_file) + results_file = tmp_path / ('results.json') + + # No genomes or signatures + args = make_args(testdb, output=results_file) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == 'Error: One of GENOMES, -l, or -s/--sigfile is required' + + # Multiple inputs + multi_msg = 'Error: GENOMES, -l, and -s/--sigfile are mutually exclusive' + + args = make_args(testdb, output=results_file, positional_files=query_files, list_file=list_file) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg + + args = make_args(testdb, output=results_file, positional_files=query_files, sig_file=True) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg + + args = make_args(testdb, output=results_file, 
list_file=list_file, sig_file=True) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg diff --git a/tests/cli/test_cli_signatures.py b/tests/cli/test_signatures.py similarity index 81% rename from tests/cli/test_cli_signatures.py rename to tests/cli/test_signatures.py index 175cf24..7c5242c 100644 --- a/tests/cli/test_cli_signatures.py +++ b/tests/cli/test_signatures.py @@ -1,17 +1,20 @@ """Tests for the "signatures" command group.""" import json +from pathlib import Path import pytest import numpy as np -from gambit.cli.test import invoke_cli import gambit.util.json as gjson from gambit.sigs import SignaturesMeta, load_signatures from gambit.util.io import write_lines from gambit.cli.common import strip_seq_file_ext from gambit.kmers import DEFAULT_KMERSPEC +from ..testdb import TestDB +from .common import invoke_cli + class TestInfoCommand: @@ -20,18 +23,18 @@ def use_db(self, request): return request.param @pytest.fixture() - def base_args(self, testdb, use_db): + def base_args(self, testdb: TestDB, use_db: bool): if use_db: return [f'--db={testdb.paths.root}', 'signatures', 'info', '-d'] else: return ['signatures', 'info', str(testdb.paths.ref_signatures)] - def test_standard(self, base_args): + def test_standard(self, base_args: list[str]): result = invoke_cli(base_args) # TODO: check - def test_json(self, base_args, testdb): + def test_json(self, base_args: list[str], testdb: TestDB): args = [*base_args, '--json'] result = invoke_cli(args) @@ -40,13 +43,13 @@ def test_json(self, base_args, testdb): assert data['kmerspec'] == gjson.to_json(testdb.ref_signatures.kmerspec) assert data['metadata'] == gjson.to_json(testdb.ref_signatures.meta) - def test_ids(self, base_args, testdb): + def test_ids(self, base_args: list[str], testdb: TestDB): args = [*base_args, '-i'] result = invoke_cli(args) assert np.array_equal(result.stdout.splitlines(), testdb.ref_signatures.ids) - def test_invalid(self, testdb): + def test_invalid(self, testdb: TestDB): 
args = [ f'--db={testdb.paths.root}', 'signatures', @@ -60,16 +63,16 @@ def test_invalid(self, testdb): class TestCreateCommand: @pytest.fixture(params=[False]) - def infiles(self, request, testdb): + def infiles(self, request, testdb: TestDB): """Input files. Parameter is whether or not they are gzipped.""" - return [f.path for f in testdb.get_query_files(request.param)] + return testdb.get_query_files(request.param) @pytest.fixture() - def outfile(self, tmp_path): + def outfile(self, tmp_path: Path): return tmp_path / 'signatures.gs' @pytest.fixture(name='make_args') - def make_args_factory(self, outfile, testdb, infiles, tmp_path): + def make_args_factory(self, outfile: Path, testdb: TestDB, infiles: list[Path], tmp_path: Path): def make_args(opts=(), root_args=(), with_kspec=True, positional_files=True, list_file=False): args = list(root_args) @@ -96,11 +99,11 @@ def make_args(opts=(), root_args=(), with_kspec=True, positional_files=True, lis return make_args @pytest.fixture() - def default_ids(self, infiles): + def default_ids(self, infiles: list[Path]): return [strip_seq_file_ext(file.name) for file in infiles] @pytest.fixture(name='check_output') - def check_output_factory(self, outfile, testdb, infiles, default_ids): + def check_output_factory(self, outfile: Path, testdb: TestDB, infiles: list[Path], default_ids: list[str]): def check_output(expected_ids=default_ids): out = load_signatures(outfile) @@ -118,7 +121,7 @@ def test_basic(self, make_args, check_output, infiles): invoke_cli(args) check_output() - def test_list_file(self, make_args, infiles, default_ids): + def test_list_file(self, make_args, infiles: list[Path], default_ids: list[str]): """Test getting genome list from file.""" args = make_args(['--dump-params'], positional_files=False, list_file=True) @@ -127,7 +130,7 @@ def test_list_file(self, make_args, infiles, default_ids): assert params['files'] == list(map(str, infiles)) assert params['ids'] == default_ids - def test_with_metadata(self, 
testdb, make_args, check_output, tmp_path): + def test_with_metadata(self, testdb: TestDB, make_args, check_output, tmp_path: Path): """Test with ids and metadata JSON added.""" # Metadata file metadata = SignaturesMeta( @@ -157,7 +160,7 @@ def test_with_metadata(self, testdb, make_args, check_output, tmp_path): out = check_output(ids) assert out.meta == metadata - def test_kspec_from_refdb(self, make_args, testdb): + def test_kspec_from_refdb(self, make_args, testdb: TestDB): """Test with KmerSpec taken from reference database.""" args = make_args( ['-d', '--dump-params'], @@ -168,7 +171,7 @@ def test_kspec_from_refdb(self, make_args, testdb): params = json.loads(result.stdout) assert params['kmerspec'] == gjson.to_json(testdb.kmerspec) - def test_default_kspec(self, make_args, testdb): + def test_default_kspec(self, make_args, testdb: TestDB): """Test with default KmerSpec.""" args = make_args( ['--dump-params'], @@ -178,7 +181,7 @@ def test_default_kspec(self, make_args, testdb): params = json.loads(result.stdout) assert params['kmerspec'] == gjson.to_json(DEFAULT_KMERSPEC) - def test_invalid(self, testdb, make_args): + def test_invalid(self, testdb: TestDB, make_args): """Test with invalid parameter combinations.""" # No genomes @@ -203,7 +206,7 @@ def test_invalid(self, testdb, make_args): args = make_args(['-d'], with_kspec=False) invoke_cli(args, success=False) - def test_ids_wrong_len(self, testdb, make_args, tmp_path): + def test_ids_wrong_len(self, testdb: TestDB, make_args, tmp_path: Path): """Test where number of IDs does not match query files.""" ids = [f'seq-{i}' for i in range(len(testdb.query_genomes) - 1)] diff --git a/tests/cli/test_cli_tree.py b/tests/cli/test_tree.py similarity index 91% rename from tests/cli/test_cli_tree.py rename to tests/cli/test_tree.py index f25041b..f4a1e5e 100644 --- a/tests/cli/test_cli_tree.py +++ b/tests/cli/test_tree.py @@ -6,23 +6,27 @@ from Bio import Phylo from gambit.metric import jaccarddist_pairwise -from 
gambit.cli.test import invoke_cli from gambit.cluster import hclust, check_tree_matches_linkage from gambit.cli import common +from .common import invoke_cli + + @pytest.fixture() def expected_dmat(testdb): sigs = testdb.query_signatures return jaccarddist_pairwise(sigs) + @pytest.fixture() def expected_linkage(expected_dmat): return hclust(expected_dmat) + @pytest.mark.parametrize('from_sigs', [False, True]) def test_tree_command(from_sigs, expected_linkage, testdb): """Test running the command and checking the output.""" - seqfiles = [str(f.path) for f in testdb.get_query_files()] + seqfiles = [str(f) for f in testdb.get_query_files()] args = ['tree'] if from_sigs: diff --git a/gambit/test.py b/tests/common.py similarity index 94% rename from gambit/test.py rename to tests/common.py index 998963a..9bb5623 100644 --- a/gambit/test.py +++ b/tests/common.py @@ -1,13 +1,13 @@ """Helper functions for tests.""" -from typing import Optional, Tuple, Union, List, Sequence +from typing import Optional, Union, Sequence import numpy as np from gambit.kmers import KmerSpec, kmer_to_index from gambit.seq import seq_to_bytes, revcomp from gambit.sigs import KmerSignature, SignatureArray -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense +from gambit.sigs.calc import dense_to_sparse, sparse_to_dense from gambit.db import Taxon @@ -123,7 +123,7 @@ def make_kmer_seq(kspec: KmerSpec, seqlen: int, kmer_interval: int, n_interval: Optional[int] = None, - ) -> Tuple[bytes, KmerSignature]: + ) -> tuple[bytes, KmerSignature]: """Create a DNA sequence with a known k-mer signature. 
The sequence consists of a background of N's with a k-mer match every ``kmer_interval`` @@ -187,7 +187,7 @@ def make_kmer_seqs(kspec: KmerSpec, seqlen: int, kmer_interval: int, n_interval: Optional[int] = None, - ) -> Tuple[List[bytes], KmerSignature]: + ) -> tuple[list[bytes], KmerSignature]: """Create a set of DNA sequences with known combined signature.""" seqs = [] @@ -208,7 +208,7 @@ def make_kmer_seqs(kspec: KmerSpec, return seqs, dense_to_sparse(vec) -def make_lineage(thresholds: Sequence[float]) -> List[Taxon]: +def make_lineage(thresholds: Sequence[Optional[float]]) -> list[Taxon]: """Create a linage of taxa that have the given distance thresholds. Parameters diff --git a/tests/conftest.py b/tests/conftest.py index e3431b6..69f55da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,8 @@ import numpy as np import pytest from sqlalchemy import create_engine -from testdb import TestDB + +from .testdb import TestDB @pytest.fixture(scope='session') @@ -46,5 +47,4 @@ def testdb(test_data): This cleans things up a bit from the way it was before, which was a bunch of separate fixtures with session scope named "testdb_*". """ - root = test_data / 'testdb_210818' return TestDB(test_data / 'testdb_210818') diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/testdb_210818/Readme.md b/tests/data/testdb_210818/Readme.md index f7fb1eb..4ebf707 100644 --- a/tests/data/testdb_210818/Readme.md +++ b/tests/data/testdb_210818/Readme.md @@ -9,12 +9,33 @@ repository. To use this database from the CLI, just pass this directory with the ## Files -* `ref-genomes.gdb` - reference genomes metadata. -* `ref-signatures.gs` - reference genome signatures. +* `ref-genomes.gdb`: reference genomes metadata. +* `ref-signatures.gs`: reference genome signatures. +* `ref-genomes.csv`: CSV file of basic reference genome properties (sort of redundant with `ref-genomes.gdb`). 
+* `ref-genomes/`: contains reference genome files in FASTA format. * `queries/` - * `queries.csv` - table listing all query files and expected results. - * `genomes/` - contains query genome files in FASTA format. - * `query-signatures.gs` - precalculated signatures for query genomes. -* `results/` - pre-calculated results using query files in `queries`. -* `generate-results.py` - script which generates result files in `results/`. - Verifies against expected result attributes in `queries.csv`. + * `queries.csv`: table listing all query files and expected results. + * `genomes/`: contains query genome files in FASTA format. + * `query-signatures.gs`: precalculated signatures for query genomes. +* `results/`: pre-calculated results using query files in `queries`, exported in the "archive" JSON + format. Two sets of results, one with strict mode enabled and one without. These are used to + reconstitute the `gambit.query.QueryResults` instances using `gambit.results.ResultsArchiveReader`. +* `generate-results.py`: script which generates result files in `results/`. This will need to be + re-run if the query results object changes structure or if the "archive" JSON format changes. + Results are verified against contents of `queries.csv` before exporting. + + +### Query genome properties + +`queries.csv` contains information on expected results for each query genome. This should stay +constant even if the exported files change format in future releases. + +Contains the following columns: + +- `name`: File name. +- `predicted`: Name of predicted taxon in strict mode, or empty if no prediction. +- `primary`: Description of primary genome match in strict mode, or empty if no prediction. +- `closest`: Description of closest genome match. +- `warnings`: Whether warnings should be generated in strict mode. + +In non-strict mode, the primary match will be set to the closest match. 
diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index 2e0cafe..ef2620a 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -9,13 +9,19 @@ import sys from pathlib import Path -from csv import DictReader -from gambit.seq import SequenceFile -from gambit.db import ReferenceDatabase, reportable_taxon -from gambit.query import QueryParams, query_parse -from gambit.results.archive import ResultsArchiveWriter +from gambit.query import QueryParams, QueryResults, query_parse +from gambit.results import ResultsArchiveWriter from gambit.util.misc import zip_strict +from gambit.util.io import FilePath + + +THISDIR = Path(__file__).parent +ROOTDIR = THISDIR.parent.parent.parent + +sys.path.insert(0, str(ROOTDIR)) +from tests.testdb import TestDB, TestQueryGenome +from tests.results import check_results as check_results_base PARAMS = { @@ -24,36 +30,17 @@ } -def load_query_data(): - with open('queries/queries.csv', newline='') as f: - rows = list(DictReader(f)) - - genomes_dir = Path('queries/genomes') - - for row in rows: - row['warnings'] = row['warnings'].lower() == 'true' - row['file'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta'), - format='fasta', - ) +def check_results(queries: list[TestQueryGenome], query_files: list[FilePath], results: QueryResults): + """Check query results object against queries.csv table before exporting.""" - return rows - - -def check_results(queries, results): strict = results.params.classify_strict - for query, item in zip_strict(queries, results.items): - warnings = [] + for query, query_file, item in zip_strict(queries, query_files, results.items): clsresult = item.classifier_result predicted = clsresult.predicted_taxon - assert item.input.file == query['file'] - - # No errors - assert clsresult.success - assert clsresult.error is None + assert item.file == Path(query_file) # Check if warnings expected (only if in 
strict mode) assert bool(clsresult.warnings) == (strict and query['warnings']) @@ -70,60 +57,30 @@ def check_results(queries, results): assert predicted.name == query['predicted'] assert clsresult.primary_match.genome.description == query['primary'] - else: - assert clsresult.primary_match == clsresult.closest_match - assert predicted is clsresult.primary_match.matched_taxon - - assert item.report_taxon is reportable_taxon(predicted) - else: assert predicted is None assert clsresult.primary_match is None assert item.report_taxon is None - # Closest matches - assert len(item.closest_genomes) == results.params.report_closest - assert item.closest_genomes[0] == clsresult.closest_match - assert item.closest_genomes[0].genome.description == query['closest'] - - for i in range(1, results.params.report_closest): - assert item.closest_genomes[i].distance >= item.closest_genomes[i-1].distance - - # Next taxon - nt = clsresult.next_taxon - if nt is None: - # Predicted should be most specific possible - assert clsresult.closest_match.matched_taxon == clsresult.closest_match.genome.taxon - - else: - assert nt.distance_threshold is not None - assert nt.distance_threshold < clsresult.closest_match.distance - - # This should hold true as long as the primary match is the closest match, just warn if - # it fails. 
- if predicted is not None: - if predicted not in nt.ancestors(): - warnings.append(f'Next taxon {nt.name} not a descendant of predicted taxon {predicted.name}') - - # Display warnings - for w in warnings: - print(f'[Query "{query["name"]}"]:', w, file=sys.stderr) - def main(): - queries = load_query_data() - query_files = [query['file'] for query in queries] - db = ReferenceDatabase.load_from_dir('.') + testdb = TestDB(THISDIR) + db = testdb.refdb + query_files = testdb.get_query_files(relative=True) writer = ResultsArchiveWriter(pretty=True) for label, params in PARAMS.items(): + print('Running query:', label) results = query_parse(db, query_files, params) - check_results(queries, results) + check_results_base(results) + check_results(testdb.query_genomes, query_files, results) with open(f'results/{label}.json', 'wt') as f: writer.export(f, results) + print('done!\n\n') + if __name__ == '__main__': main() diff --git a/tests/data/testdb_210818/results/non_strict.json b/tests/data/testdb_210818/results/non_strict.json index 57763ea..ebea251 100644 --- a/tests/data/testdb_210818/results/non_strict.json +++ b/tests/data/testdb_210818/results/non_strict.json @@ -1,6 +1,6 @@ { "extra": {}, - "gambit_version": "0.4.0", + "gambit_version": "1.0.1", "genomeset": { "key": "gambit/testdb_210818", "version": "1.0" @@ -96,14 +96,8 @@ "matched_taxon": null } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/unclassifiable.fasta" - }, - "label": "unclassifiable.fasta" - }, + "file": "queries/genomes/unclassifiable.fasta", + "label": "queries/genomes/unclassifiable.fasta", "report_taxon": null }, { @@ -226,14 +220,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1.fasta" - }, - "label": "A1.fasta" - }, + "file": "queries/genomes/A1.fasta", + "label": "queries/genomes/A1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1" } @@ -358,14 +346,8 @@ } } ], - "input": { - 
"file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C1.fasta" - }, - "label": "A1_B1_C1.fasta" - }, + "file": "queries/genomes/A1_B1_C1.fasta", + "label": "queries/genomes/A1_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C1" } @@ -490,14 +472,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C2.fasta" - }, - "label": "A1_B1_C2.fasta" - }, + "file": "queries/genomes/A1_B1_C2.fasta", + "label": "queries/genomes/A1_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C2" } @@ -622,14 +598,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C3.fasta" - }, - "label": "A1_B1_C3.fasta" - }, + "file": "queries/genomes/A1_B1_C3.fasta", + "label": "queries/genomes/A1_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C3" } @@ -754,14 +724,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C4.fasta" - }, - "label": "A1_B1_C4.fasta" - }, + "file": "queries/genomes/A1_B1_C4.fasta", + "label": "queries/genomes/A1_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C4" } @@ -886,14 +850,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2.fasta" - }, - "label": "A1_B2.fasta" - }, + "file": "queries/genomes/A1_B2.fasta", + "label": "queries/genomes/A1_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2" } @@ -1018,14 +976,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C1.fasta" - }, - "label": "A1_B2_C1.fasta" - }, + "file": "queries/genomes/A1_B2_C1.fasta", + "label": "queries/genomes/A1_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C1" } @@ -1150,14 +1102,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - 
"path": "queries/genomes/A1_B2_C2.fasta" - }, - "label": "A1_B2_C2.fasta" - }, + "file": "queries/genomes/A1_B2_C2.fasta", + "label": "queries/genomes/A1_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C2" } @@ -1282,14 +1228,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C3.fasta" - }, - "label": "A1_B2_C3.fasta" - }, + "file": "queries/genomes/A1_B2_C3.fasta", + "label": "queries/genomes/A1_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C3" } @@ -1414,14 +1354,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C4.fasta" - }, - "label": "A1_B2_C4.fasta" - }, + "file": "queries/genomes/A1_B2_C4.fasta", + "label": "queries/genomes/A1_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C4" } @@ -1546,14 +1480,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3.fasta" - }, - "label": "A1_B3.fasta" - }, + "file": "queries/genomes/A1_B3.fasta", + "label": "queries/genomes/A1_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3" } @@ -1678,14 +1606,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C1.fasta" - }, - "label": "A1_B3_C1.fasta" - }, + "file": "queries/genomes/A1_B3_C1.fasta", + "label": "queries/genomes/A1_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C1" } @@ -1810,14 +1732,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C2.fasta" - }, - "label": "A1_B3_C2.fasta" - }, + "file": "queries/genomes/A1_B3_C2.fasta", + "label": "queries/genomes/A1_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C2" } @@ -1942,14 +1858,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C3.fasta" - }, 
- "label": "A1_B3_C3.fasta" - }, + "file": "queries/genomes/A1_B3_C3.fasta", + "label": "queries/genomes/A1_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C3" } @@ -2074,14 +1984,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C4.fasta" - }, - "label": "A1_B3_C4.fasta" - }, + "file": "queries/genomes/A1_B3_C4.fasta", + "label": "queries/genomes/A1_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C4" } @@ -2206,14 +2110,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2.fasta" - }, - "label": "A2.fasta" - }, + "file": "queries/genomes/A2.fasta", + "label": "queries/genomes/A2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -2338,14 +2236,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1.fasta" - }, - "label": "A2_B1.fasta" - }, + "file": "queries/genomes/A2_B1.fasta", + "label": "queries/genomes/A2_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1" } @@ -2470,14 +2362,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C1.fasta" - }, - "label": "A2_B1_C1.fasta" - }, + "file": "queries/genomes/A2_B1_C1.fasta", + "label": "queries/genomes/A2_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C1" } @@ -2602,14 +2488,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C2.fasta" - }, - "label": "A2_B1_C2.fasta" - }, + "file": "queries/genomes/A2_B1_C2.fasta", + "label": "queries/genomes/A2_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C2" } @@ -2734,14 +2614,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C3.fasta" - }, - "label": "A2_B1_C3.fasta" - }, + "file": "queries/genomes/A2_B1_C3.fasta", 
+ "label": "queries/genomes/A2_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C3" } @@ -2866,14 +2740,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C4.fasta" - }, - "label": "A2_B1_C4.fasta" - }, + "file": "queries/genomes/A2_B1_C4.fasta", + "label": "queries/genomes/A2_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C4" } @@ -2998,14 +2866,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2.fasta" - }, - "label": "A2_B2.fasta" - }, + "file": "queries/genomes/A2_B2.fasta", + "label": "queries/genomes/A2_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -3130,14 +2992,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C1.fasta" - }, - "label": "A2_B2_C1.fasta" - }, + "file": "queries/genomes/A2_B2_C1.fasta", + "label": "queries/genomes/A2_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C1" } @@ -3262,14 +3118,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C2.fasta" - }, - "label": "A2_B2_C2.fasta" - }, + "file": "queries/genomes/A2_B2_C2.fasta", + "label": "queries/genomes/A2_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C2" } @@ -3394,14 +3244,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C3.fasta" - }, - "label": "A2_B2_C3.fasta" - }, + "file": "queries/genomes/A2_B2_C3.fasta", + "label": "queries/genomes/A2_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C3" } @@ -3526,14 +3370,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C4.fasta" - }, - "label": "A2_B2_C4.fasta" - }, + "file": "queries/genomes/A2_B2_C4.fasta", + "label": "queries/genomes/A2_B2_C4.fasta", 
"report_taxon": { "key": "gambit/testdb_210818/A2_B2_C4" } @@ -3660,14 +3498,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3.fasta" - }, - "label": "A2_B3.fasta" - }, + "file": "queries/genomes/A2_B3.fasta", + "label": "queries/genomes/A2_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3" } @@ -3792,14 +3624,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C1.fasta" - }, - "label": "A2_B3_C1.fasta" - }, + "file": "queries/genomes/A2_B3_C1.fasta", + "label": "queries/genomes/A2_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C1" } @@ -3924,14 +3750,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C2.fasta" - }, - "label": "A2_B3_C2.fasta" - }, + "file": "queries/genomes/A2_B3_C2.fasta", + "label": "queries/genomes/A2_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C2" } @@ -4056,14 +3876,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C3.fasta" - }, - "label": "A2_B3_C3.fasta" - }, + "file": "queries/genomes/A2_B3_C3.fasta", + "label": "queries/genomes/A2_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C3" } @@ -4188,14 +4002,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C4.fasta" - }, - "label": "A2_B3_C4.fasta" - }, + "file": "queries/genomes/A2_B3_C4.fasta", + "label": "queries/genomes/A2_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C4" } @@ -4320,14 +4128,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3.fasta" - }, - "label": "A3.fasta" - }, + "file": "queries/genomes/A3.fasta", + "label": "queries/genomes/A3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3" } @@ -4452,14 
+4254,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1.fasta" - }, - "label": "A3_B1.fasta" - }, + "file": "queries/genomes/A3_B1.fasta", + "label": "queries/genomes/A3_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1" } @@ -4584,14 +4380,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C1.fasta" - }, - "label": "A3_B1_C1.fasta" - }, + "file": "queries/genomes/A3_B1_C1.fasta", + "label": "queries/genomes/A3_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C1" } @@ -4716,14 +4506,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C2.fasta" - }, - "label": "A3_B1_C2.fasta" - }, + "file": "queries/genomes/A3_B1_C2.fasta", + "label": "queries/genomes/A3_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C2" } @@ -4848,14 +4632,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C3.fasta" - }, - "label": "A3_B1_C3.fasta" - }, + "file": "queries/genomes/A3_B1_C3.fasta", + "label": "queries/genomes/A3_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C3" } @@ -4980,14 +4758,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C4.fasta" - }, - "label": "A3_B1_C4.fasta" - }, + "file": "queries/genomes/A3_B1_C4.fasta", + "label": "queries/genomes/A3_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C4" } @@ -5112,14 +4884,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2.fasta" - }, - "label": "A3_B2.fasta" - }, + "file": "queries/genomes/A3_B2.fasta", + "label": "queries/genomes/A3_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2" } @@ -5244,14 +5010,8 @@ } } ], - "input": { - "file": { - "compression": 
null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C1.fasta" - }, - "label": "A3_B2_C1.fasta" - }, + "file": "queries/genomes/A3_B2_C1.fasta", + "label": "queries/genomes/A3_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C1" } @@ -5376,14 +5136,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C2.fasta" - }, - "label": "A3_B2_C2.fasta" - }, + "file": "queries/genomes/A3_B2_C2.fasta", + "label": "queries/genomes/A3_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C2" } @@ -5508,14 +5262,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C3.fasta" - }, - "label": "A3_B2_C3.fasta" - }, + "file": "queries/genomes/A3_B2_C3.fasta", + "label": "queries/genomes/A3_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C3" } @@ -5640,14 +5388,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C4.fasta" - }, - "label": "A3_B2_C4.fasta" - }, + "file": "queries/genomes/A3_B2_C4.fasta", + "label": "queries/genomes/A3_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C4" } @@ -5772,14 +5514,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3.fasta" - }, - "label": "A3_B3.fasta" - }, + "file": "queries/genomes/A3_B3.fasta", + "label": "queries/genomes/A3_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3" } @@ -5904,14 +5640,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C1.fasta" - }, - "label": "A3_B3_C1.fasta" - }, + "file": "queries/genomes/A3_B3_C1.fasta", + "label": "queries/genomes/A3_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C1" } @@ -6036,14 +5766,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": 
"queries/genomes/A3_B3_C2.fasta" - }, - "label": "A3_B3_C2.fasta" - }, + "file": "queries/genomes/A3_B3_C2.fasta", + "label": "queries/genomes/A3_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C2" } @@ -6168,14 +5892,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C3.fasta" - }, - "label": "A3_B3_C3.fasta" - }, + "file": "queries/genomes/A3_B3_C3.fasta", + "label": "queries/genomes/A3_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C3" } @@ -6300,14 +6018,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C4.fasta" - }, - "label": "A3_B3_C4.fasta" - }, + "file": "queries/genomes/A3_B3_C4.fasta", + "label": "queries/genomes/A3_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C4" } @@ -6432,14 +6144,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/inconsistent.fasta" - }, - "label": "inconsistent.fasta" - }, + "file": "queries/genomes/inconsistent.fasta", + "label": "queries/genomes/inconsistent.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -6566,14 +6272,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/primary_not_closest.fasta" - }, - "label": "primary_not_closest.fasta" - }, + "file": "queries/genomes/primary_not_closest.fasta", + "label": "queries/genomes/primary_not_closest.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -6595,5 +6295,5 @@ "name": "testdb_210818", "version": "1.0" }, - "timestamp": "2022-01-09T17:29:21.540969" + "timestamp": "2024-11-30T22:19:38.878341" } \ No newline at end of file diff --git a/tests/data/testdb_210818/results/strict.json b/tests/data/testdb_210818/results/strict.json index af37d71..259a6c5 100644 --- a/tests/data/testdb_210818/results/strict.json +++ 
b/tests/data/testdb_210818/results/strict.json @@ -1,6 +1,6 @@ { "extra": {}, - "gambit_version": "0.4.0", + "gambit_version": "1.0.1", "genomeset": { "key": "gambit/testdb_210818", "version": "1.0" @@ -96,14 +96,8 @@ "matched_taxon": null } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/unclassifiable.fasta" - }, - "label": "unclassifiable.fasta" - }, + "file": "queries/genomes/unclassifiable.fasta", + "label": "queries/genomes/unclassifiable.fasta", "report_taxon": null }, { @@ -226,14 +220,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1.fasta" - }, - "label": "A1.fasta" - }, + "file": "queries/genomes/A1.fasta", + "label": "queries/genomes/A1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1" } @@ -358,14 +346,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C1.fasta" - }, - "label": "A1_B1_C1.fasta" - }, + "file": "queries/genomes/A1_B1_C1.fasta", + "label": "queries/genomes/A1_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C1" } @@ -490,14 +472,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C2.fasta" - }, - "label": "A1_B1_C2.fasta" - }, + "file": "queries/genomes/A1_B1_C2.fasta", + "label": "queries/genomes/A1_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C2" } @@ -622,14 +598,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C3.fasta" - }, - "label": "A1_B1_C3.fasta" - }, + "file": "queries/genomes/A1_B1_C3.fasta", + "label": "queries/genomes/A1_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C3" } @@ -754,14 +724,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C4.fasta" - }, - "label": "A1_B1_C4.fasta" - }, + "file": 
"queries/genomes/A1_B1_C4.fasta", + "label": "queries/genomes/A1_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C4" } @@ -886,14 +850,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2.fasta" - }, - "label": "A1_B2.fasta" - }, + "file": "queries/genomes/A1_B2.fasta", + "label": "queries/genomes/A1_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2" } @@ -1018,14 +976,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C1.fasta" - }, - "label": "A1_B2_C1.fasta" - }, + "file": "queries/genomes/A1_B2_C1.fasta", + "label": "queries/genomes/A1_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C1" } @@ -1150,14 +1102,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C2.fasta" - }, - "label": "A1_B2_C2.fasta" - }, + "file": "queries/genomes/A1_B2_C2.fasta", + "label": "queries/genomes/A1_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C2" } @@ -1282,14 +1228,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C3.fasta" - }, - "label": "A1_B2_C3.fasta" - }, + "file": "queries/genomes/A1_B2_C3.fasta", + "label": "queries/genomes/A1_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C3" } @@ -1414,14 +1354,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C4.fasta" - }, - "label": "A1_B2_C4.fasta" - }, + "file": "queries/genomes/A1_B2_C4.fasta", + "label": "queries/genomes/A1_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C4" } @@ -1546,14 +1480,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3.fasta" - }, - "label": "A1_B3.fasta" - }, + "file": "queries/genomes/A1_B3.fasta", + "label": 
"queries/genomes/A1_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3" } @@ -1678,14 +1606,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C1.fasta" - }, - "label": "A1_B3_C1.fasta" - }, + "file": "queries/genomes/A1_B3_C1.fasta", + "label": "queries/genomes/A1_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C1" } @@ -1810,14 +1732,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C2.fasta" - }, - "label": "A1_B3_C2.fasta" - }, + "file": "queries/genomes/A1_B3_C2.fasta", + "label": "queries/genomes/A1_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C2" } @@ -1942,14 +1858,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C3.fasta" - }, - "label": "A1_B3_C3.fasta" - }, + "file": "queries/genomes/A1_B3_C3.fasta", + "label": "queries/genomes/A1_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C3" } @@ -2074,14 +1984,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C4.fasta" - }, - "label": "A1_B3_C4.fasta" - }, + "file": "queries/genomes/A1_B3_C4.fasta", + "label": "queries/genomes/A1_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C4" } @@ -2206,14 +2110,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2.fasta" - }, - "label": "A2.fasta" - }, + "file": "queries/genomes/A2.fasta", + "label": "queries/genomes/A2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -2338,14 +2236,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1.fasta" - }, - "label": "A2_B1.fasta" - }, + "file": "queries/genomes/A2_B1.fasta", + "label": "queries/genomes/A2_B1.fasta", "report_taxon": { "key": 
"gambit/testdb_210818/A2_B1" } @@ -2470,14 +2362,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C1.fasta" - }, - "label": "A2_B1_C1.fasta" - }, + "file": "queries/genomes/A2_B1_C1.fasta", + "label": "queries/genomes/A2_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C1" } @@ -2602,14 +2488,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C2.fasta" - }, - "label": "A2_B1_C2.fasta" - }, + "file": "queries/genomes/A2_B1_C2.fasta", + "label": "queries/genomes/A2_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C2" } @@ -2734,14 +2614,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C3.fasta" - }, - "label": "A2_B1_C3.fasta" - }, + "file": "queries/genomes/A2_B1_C3.fasta", + "label": "queries/genomes/A2_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C3" } @@ -2866,14 +2740,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C4.fasta" - }, - "label": "A2_B1_C4.fasta" - }, + "file": "queries/genomes/A2_B1_C4.fasta", + "label": "queries/genomes/A2_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C4" } @@ -2998,14 +2866,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2.fasta" - }, - "label": "A2_B2.fasta" - }, + "file": "queries/genomes/A2_B2.fasta", + "label": "queries/genomes/A2_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -3130,14 +2992,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C1.fasta" - }, - "label": "A2_B2_C1.fasta" - }, + "file": "queries/genomes/A2_B2_C1.fasta", + "label": "queries/genomes/A2_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C1" } @@ -3262,14 
+3118,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C2.fasta" - }, - "label": "A2_B2_C2.fasta" - }, + "file": "queries/genomes/A2_B2_C2.fasta", + "label": "queries/genomes/A2_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C2" } @@ -3394,14 +3244,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C3.fasta" - }, - "label": "A2_B2_C3.fasta" - }, + "file": "queries/genomes/A2_B2_C3.fasta", + "label": "queries/genomes/A2_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C3" } @@ -3526,14 +3370,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C4.fasta" - }, - "label": "A2_B2_C4.fasta" - }, + "file": "queries/genomes/A2_B2_C4.fasta", + "label": "queries/genomes/A2_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C4" } @@ -3660,14 +3498,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3.fasta" - }, - "label": "A2_B3.fasta" - }, + "file": "queries/genomes/A2_B3.fasta", + "label": "queries/genomes/A2_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3" } @@ -3792,14 +3624,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C1.fasta" - }, - "label": "A2_B3_C1.fasta" - }, + "file": "queries/genomes/A2_B3_C1.fasta", + "label": "queries/genomes/A2_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C1" } @@ -3924,14 +3750,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C2.fasta" - }, - "label": "A2_B3_C2.fasta" - }, + "file": "queries/genomes/A2_B3_C2.fasta", + "label": "queries/genomes/A2_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C2" } @@ -4056,14 +3876,8 @@ } } ], - "input": { - "file": { - 
"compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C3.fasta" - }, - "label": "A2_B3_C3.fasta" - }, + "file": "queries/genomes/A2_B3_C3.fasta", + "label": "queries/genomes/A2_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C3" } @@ -4188,14 +4002,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C4.fasta" - }, - "label": "A2_B3_C4.fasta" - }, + "file": "queries/genomes/A2_B3_C4.fasta", + "label": "queries/genomes/A2_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C4" } @@ -4320,14 +4128,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3.fasta" - }, - "label": "A3.fasta" - }, + "file": "queries/genomes/A3.fasta", + "label": "queries/genomes/A3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3" } @@ -4452,14 +4254,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1.fasta" - }, - "label": "A3_B1.fasta" - }, + "file": "queries/genomes/A3_B1.fasta", + "label": "queries/genomes/A3_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1" } @@ -4584,14 +4380,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C1.fasta" - }, - "label": "A3_B1_C1.fasta" - }, + "file": "queries/genomes/A3_B1_C1.fasta", + "label": "queries/genomes/A3_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C1" } @@ -4716,14 +4506,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C2.fasta" - }, - "label": "A3_B1_C2.fasta" - }, + "file": "queries/genomes/A3_B1_C2.fasta", + "label": "queries/genomes/A3_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C2" } @@ -4848,14 +4632,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": 
"queries/genomes/A3_B1_C3.fasta" - }, - "label": "A3_B1_C3.fasta" - }, + "file": "queries/genomes/A3_B1_C3.fasta", + "label": "queries/genomes/A3_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C3" } @@ -4980,14 +4758,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C4.fasta" - }, - "label": "A3_B1_C4.fasta" - }, + "file": "queries/genomes/A3_B1_C4.fasta", + "label": "queries/genomes/A3_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C4" } @@ -5112,14 +4884,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2.fasta" - }, - "label": "A3_B2.fasta" - }, + "file": "queries/genomes/A3_B2.fasta", + "label": "queries/genomes/A3_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2" } @@ -5244,14 +5010,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C1.fasta" - }, - "label": "A3_B2_C1.fasta" - }, + "file": "queries/genomes/A3_B2_C1.fasta", + "label": "queries/genomes/A3_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C1" } @@ -5376,14 +5136,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C2.fasta" - }, - "label": "A3_B2_C2.fasta" - }, + "file": "queries/genomes/A3_B2_C2.fasta", + "label": "queries/genomes/A3_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C2" } @@ -5508,14 +5262,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C3.fasta" - }, - "label": "A3_B2_C3.fasta" - }, + "file": "queries/genomes/A3_B2_C3.fasta", + "label": "queries/genomes/A3_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C3" } @@ -5640,14 +5388,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C4.fasta" - }, - 
"label": "A3_B2_C4.fasta" - }, + "file": "queries/genomes/A3_B2_C4.fasta", + "label": "queries/genomes/A3_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C4" } @@ -5772,14 +5514,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3.fasta" - }, - "label": "A3_B3.fasta" - }, + "file": "queries/genomes/A3_B3.fasta", + "label": "queries/genomes/A3_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3" } @@ -5904,14 +5640,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C1.fasta" - }, - "label": "A3_B3_C1.fasta" - }, + "file": "queries/genomes/A3_B3_C1.fasta", + "label": "queries/genomes/A3_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C1" } @@ -6036,14 +5766,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C2.fasta" - }, - "label": "A3_B3_C2.fasta" - }, + "file": "queries/genomes/A3_B3_C2.fasta", + "label": "queries/genomes/A3_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C2" } @@ -6168,14 +5892,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C3.fasta" - }, - "label": "A3_B3_C3.fasta" - }, + "file": "queries/genomes/A3_B3_C3.fasta", + "label": "queries/genomes/A3_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C3" } @@ -6300,14 +6018,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C4.fasta" - }, - "label": "A3_B3_C4.fasta" - }, + "file": "queries/genomes/A3_B3_C4.fasta", + "label": "queries/genomes/A3_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C4" } @@ -6339,7 +6051,7 @@ }, "success": true, "warnings": [ - "Query matched 2 inconsistent taxa: 8:A2_B2, 7:A2_B1. Reporting lowest common ancestor of this set." 
+ "Query matched 2 inconsistent taxa: 7:A2_B1, 8:A2_B2. Reporting lowest common ancestor of this set." ] }, "closest_genomes": [ @@ -6434,14 +6146,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/inconsistent.fasta" - }, - "label": "inconsistent.fasta" - }, + "file": "queries/genomes/inconsistent.fasta", + "label": "queries/genomes/inconsistent.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -6570,14 +6276,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/primary_not_closest.fasta" - }, - "label": "primary_not_closest.fasta" - }, + "file": "queries/genomes/primary_not_closest.fasta", + "label": "queries/genomes/primary_not_closest.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1" } @@ -6599,5 +6299,5 @@ "name": "testdb_210818", "version": "1.0" }, - "timestamp": "2022-01-09T17:29:22.044407" + "timestamp": "2024-11-30T22:19:39.124711" } \ No newline at end of file diff --git a/tests/db/__init__.py b/tests/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/db/test_db_migrate.py b/tests/db/test_db_migrate.py deleted file mode 100644 index de8845b..0000000 --- a/tests/db/test_db_migrate.py +++ /dev/null @@ -1,68 +0,0 @@ -"""Test the gambit.db.migrate module.""" - -import pytest -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from alembic.migration import MigrationContext -from alembic.script import ScriptDirectory - -from gambit.db.migrate import (current_head, current_revision, is_current_revision, init_db, - get_alembic_config) -from gambit.db import models - - -# Expected current head revision -# Need to update this value every time a new revision is introduced -CURRENT_HEAD = 'c43540b80d50' - -# Old revision number to test. Must actually exist in the scripts directory. 
-# TODO - set this once we have more than one revision file -OLD_REVISION = None - - -def test_current_head(): - assert current_head() == CURRENT_HEAD - - -class TestCurrentRevision: - """Test the current_revision() and is_current_revision() functions.""" - - def test_uninitialized(self): - """Test on uninitialized database (not stamped).""" - engine = create_engine('sqlite:///:memory:') - assert current_revision(engine) is None - assert not is_current_revision(engine) - - def test_empty(self): - """Test on freshly initialized database.""" - engine = create_engine('sqlite:///:memory:') - init_db(engine) - assert current_revision(engine) == CURRENT_HEAD - assert is_current_revision(engine) - - @pytest.mark.skipif(OLD_REVISION is None, reason='No older revisions to test.') - def test_old(self): - """Test on uninitialized database stamped with an old revision no.""" - engine = create_engine('sqlite:///:memory:') - conf = get_alembic_config(engine) - scriptdir = ScriptDirectory.from_config(conf) - - with engine.connect() as conn: - ctx = MigrationContext.configure(conn) - ctx.stamp(scriptdir, OLD_REVISION) - - assert current_revision(engine) == OLD_REVISION - - -def test_init_db(): - """Test the init_db() function.""" - engine = create_engine('sqlite:///:memory:') - init_db(engine) - - # Check current revision matches - assert current_revision(engine) == current_head() - - # Check we can query all models (won't return any results, but would fail if tables didn't exist). - session = sessionmaker(engine)() - for model in [models.Genome, models.ReferenceGenomeSet, models.AnnotatedGenome, models.Taxon]: - session.query(model).all() diff --git a/tests/db/test_db_models.py b/tests/db/test_models.py similarity index 92% rename from tests/db/test_db_models.py rename to tests/db/test_models.py index 2e95515..8654ce6 100644 --- a/tests/db/test_db_models.py +++ b/tests/db/test_models.py @@ -3,12 +3,16 @@ Uses the included testdb_210818 database. 
""" +from typing import Iterable, Optional + import pytest from sqlalchemy.orm import sessionmaker from gambit.db import models from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon +from ..testdb import TestDB + # Some arbitrary JSON data JSON_DATA = { @@ -85,7 +89,7 @@ def test_extra_json(self, empty_db_session): class TestReferenceGenomeSet: """Test ReferenceGenomeSet model.""" - def test_root_taxa(self, testdb): + def test_root_taxa(self, testdb: TestDB): session = testdb.Session() gset = session.query(ReferenceGenomeSet).one() assert {taxon.name for taxon in gset.root_taxa()} == {'A1', 'A2', 'A3'} @@ -104,7 +108,7 @@ def test_extra_json(self, empty_db_session): class TestAnnotatedGenome: """Test AnnotatedGEnome model.""" - def test_hybrid_props(self, testdb): + def test_hybrid_props(self, testdb: TestDB): session = testdb.Session() hybrid_attrs = [ @@ -124,7 +128,7 @@ def test_hybrid_props(self, testdb): class TestTaxon: """Test Taxon model.""" - def test_tree(self, testdb): + def test_tree(self, testdb: TestDB): """Test tree structure.""" session = testdb.Session() gset = session.query(ReferenceGenomeSet).one() @@ -163,7 +167,7 @@ def test_tree(self, testdb): # Check leaves assert set(taxon.leaves()) == {d for d in subtree_set if d.isleaf()} - def check_traversal(self, iterator, postorder, expected): + def check_traversal(self, iterator: Iterable[Taxon], postorder: bool, expected: set[Taxon]): seen = set() for taxon in iterator: @@ -176,7 +180,7 @@ def check_traversal(self, iterator, postorder, expected): assert seen == expected - def test_genome_membership(self, testdb): + def test_genome_membership(self, testdb: TestDB): """Test the subtree_genomes() and has_genome() methods.""" session = testdb.Session() @@ -212,7 +216,7 @@ def test_extra_json(self, empty_db_session): ) check_json_col(empty_db_session, taxon, 'extra') - def taxon_by_name(self, session, name): + def taxon_by_name(self, session, name: str): return 
session.query(Taxon).filter_by(name=name).one() def test_common_ancestry(self, testdb): @@ -220,7 +224,7 @@ def test_common_ancestry(self, testdb): session = testdb.Session() - def check(names, expected_names): + def check(names: list[str], expected_names: list[str]): taxa = [self.taxon_by_name(session, name) for name in names] ca = Taxon.common_ancestors(taxa) lca = Taxon.lca(taxa) @@ -246,12 +250,12 @@ def check(names, expected_names): check(['A1', 'A2'], []) check(['A1_B1', 'A1_B2', 'A2_B1'], []) - def test_ancestor_of_rank(self, testdb): + def test_ancestor_of_rank(self, testdb: TestDB): """Test ancestor_of_rank() method.""" session = testdb.Session() - def check(name, rank, expected): + def check(name: str, rank: str, expected: Optional[str]): taxon = self.taxon_by_name(session, name) ancestor = taxon.ancestor_of_rank(rank) assert (ancestor is None) == (expected is None) @@ -271,7 +275,7 @@ def check(name, rank, expected): check('A1_B1_C1', 'strain', 'A1_B1_C1') check('A1_B1_C1', 'foo', None) - def test_lineage_ranks(self, testdb): + def test_lineage_ranks(self, testdb: TestDB): """Test lineage() method with argument.""" session = testdb.Session() diff --git a/tests/db/test_db_refdb.py b/tests/db/test_refdb.py similarity index 93% rename from tests/db/test_db_refdb.py rename to tests/db/test_refdb.py index 7e12c60..7229bb2 100644 --- a/tests/db/test_db_refdb.py +++ b/tests/db/test_refdb.py @@ -1,12 +1,15 @@ """Test gambit.db.refdb.""" import random +from pathlib import Path import pytest -from sqlalchemy.orm import sessionmaker from gambit.db import refdb -from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, ReferenceDatabase, DatabaseLoadError +from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, ReferenceDatabase, \ + DatabaseLoadError, default_sessionmaker + +from ..testdb import TestDB GENOME_ID_ATTRS = {attr: getattr(Genome, attr) for attr in Genome.ID_ATTRS} @@ -24,7 +27,7 @@ class TestGenomeIDMapping: def 
session(self, make_empty_db): """In-memory database containing genomes which have values for all ID attributes.""" engine = make_empty_db() - Session = sessionmaker(engine) + Session = default_sessionmaker(engine, readonly=False) session = Session() gset = ReferenceGenomeSet( @@ -126,7 +129,7 @@ def test_genomes_by_id(self, session): class TestReferenceDatabase: """Test the ReferenceDatabase class.""" - def test_locate_files(self, tmp_path): + def test_locate_files(self, tmp_path: Path): genomes = tmp_path / 'test.gdb' genomes2 = tmp_path / 'test2.gdb' signatures = tmp_path / 'test.gs' @@ -158,10 +161,10 @@ def test_locate_files(self, tmp_path): signatures.touch() assert ReferenceDatabase.locate_files(tmp_path) == (genomes, signatures) - def test_load(self, testdb): + def test_load(self, testdb: TestDB): db = ReferenceDatabase.load(testdb.paths.ref_genomes, testdb.paths.ref_signatures) check_loaded_db(db) - def test_load_db_from_dir(self, testdb): + def test_load_db_from_dir(self, testdb: TestDB): db = ReferenceDatabase.load_from_dir(testdb.paths.root) check_loaded_db(db) diff --git a/tests/db/test_db_sqla.py b/tests/db/test_sqla.py similarity index 77% rename from tests/db/test_db_sqla.py rename to tests/db/test_sqla.py index 4361930..5603f35 100644 --- a/tests/db/test_db_sqla.py +++ b/tests/db/test_sqla.py @@ -3,9 +3,10 @@ from sqlalchemy.orm import Session from gambit.db import ReadOnlySession, file_sessionmaker +from ..testdb import TestDB -def test_file_sessionmaker(testdb): +def test_file_sessionmaker(testdb: TestDB): db_file = testdb.paths.ref_genomes maker = file_sessionmaker(db_file, readonly=True) @@ -15,5 +16,5 @@ def test_file_sessionmaker(testdb): assert isinstance(maker(), Session) for cls in [Session, ReadOnlySession]: - maker = file_sessionmaker(db_file, cls=cls) + maker = file_sessionmaker(db_file, class_=cls) assert isinstance(maker(), cls) diff --git a/tests/results.py b/tests/results.py new file mode 100644 index 0000000..33f0d0b --- /dev/null 
+++ b/tests/results.py @@ -0,0 +1,276 @@ +"""Helper code for tests related to the QueryResults class or exported result data.""" + +import csv +import json +from typing import TextIO, Any, Iterable, Optional +from pathlib import Path +from warnings import warn + +import numpy as np + +from gambit.util.json import to_json +from gambit.query import QueryResults, QueryResultItem, QueryParams +from gambit.classify import GenomeMatch, ClassifierResult +from gambit.util.misc import zip_strict +from gambit.db.models import AnnotatedGenome, Taxon, reportable_taxon + + +def check_results(results: QueryResults, warnings: bool = True): + """Check invariants on query results object.""" + + assert results.params is not None + + for item in results.items: + check_result_item(item, results.params, warnings=warnings) + + +def check_result_item(item: QueryResultItem, params: QueryParams, warnings: bool = True): + """Check invariants on successful query result item.""" + + clsresult = item.classifier_result + predicted = clsresult.predicted_taxon + + # No errors + assert clsresult.success + assert clsresult.error is None + + # Predicted taxon + if predicted is not None: + assert clsresult.primary_match is not None + + if not params.classify_strict: + assert clsresult.primary_match == clsresult.closest_match + assert predicted is clsresult.primary_match.matched_taxon + + assert item.report_taxon is reportable_taxon(predicted) + + else: + assert clsresult.primary_match is None + assert item.report_taxon is None + + # Closest matches + assert len(item.closest_genomes) == params.report_closest + assert item.closest_genomes[0] == clsresult.closest_match + + # Check closest_genomes is sorted by distance + for i in range(1, params.report_closest): + assert item.closest_genomes[i].distance >= item.closest_genomes[i-1].distance + + # Next taxon + nt = clsresult.next_taxon + if nt is None: + # Predicted should be most specific possible + assert clsresult.closest_match.matched_taxon == 
clsresult.closest_match.genome.taxon + + else: + assert nt.distance_threshold is not None + assert nt.distance_threshold < clsresult.closest_match.distance + + # This should hold true as long as the primary match is the closest match, just warn if + # it fails. + if predicted is not None: + if predicted not in nt.ancestors(): + if warnings: + warn( + f'[Query {item.label}]: ' + f'next taxon {nt.name} not a descendant of predicted taxon {predicted.name}' + ) + + +def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]): + """Assert two ``GenomeMatch`` instances are equal. + + The values for the ``distance`` attribute are only checked for approximate equality, to support + instances where one was loaded from a results archive (saving and loading a float in JSON is + lossy). + + Also allows one or both values to be None. + """ + if match1 is None or match2 is None: + assert match1 is None and match2 is None + return + + assert match1.genome == match2.genome + assert match1.matched_taxon == match2.matched_taxon + assert np.isclose(match1.distance, match2.distance) + + +def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult): + """Assert two ``ClassifierResult`` instances are equal.""" + assert result1.success == result2.success + assert result1.predicted_taxon == result2.predicted_taxon + compare_genome_matches(result1.primary_match, result2.primary_match) + compare_genome_matches(result1.closest_match, result2.closest_match) + assert result1.next_taxon == result2.next_taxon + assert set(result1.warnings) == set(result2.warnings) + assert result1.error == result2.error + + +def compare_result_items(item1: QueryResultItem, item2: QueryResultItem): + """Assert two ``QueryResultItem`` instances are equal. + + Does not compare the value of the ``input`` attributes. 
+ """ + assert item1.report_taxon == item2.report_taxon + compare_classifier_results(item1.classifier_result, item2.classifier_result) + + assert len(item1.closest_genomes) == len(item2.closest_genomes) + for m1, m2 in zip(item1.closest_genomes, item2.closest_genomes): + compare_genome_matches(m1, m2) + + +def cmp_json_attrs(data: dict[str, Any], obj, attrnames: Iterable[str]): + """Assert JSON data values equals object attribute values for the given keys/names.""" + + for attr in attrnames: + assert data[attr] == getattr(obj, attr) + + +def cmp_taxon_json(data: dict[str, Any], taxon: Optional[Taxon]): + """Assert Taxon instance matches data in JSON export.""" + + if taxon is None: + assert data is None + + else: + assert data is not None + cmp_json_attrs(data, taxon, ['id', 'key', 'name', 'ncbi_id', 'rank']) + if taxon.distance_threshold is None: + assert data['distance_threshold'] is None + else: + assert data['distance_threshold'] is not None + assert np.isclose(data['distance_threshold'], taxon.distance_threshold) + + +def cmp_annnotatedgenome_json(data: dict[str, Any], genome: AnnotatedGenome): + """Assert AnnotatedGenome instance matches data in JSON export.""" + + assert data['id'] == genome.genome_id + cmp_json_attrs( + data, + genome, + ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc'], + ) + for taxon_data, taxon in zip_strict(data['taxonomy'], genome.taxon.ancestors(True)): + cmp_taxon_json(taxon_data, taxon) + + +def cmp_genomematch_json(data, match: GenomeMatch): + """Assert GenomeMatch instance matches data in JSON export.""" + + assert np.isclose(data['distance'], match.distance) + cmp_annnotatedgenome_json(data['genome'], match.genome) + + cmp_taxon_json(data['matched_taxon'], match.matched_taxon) + + +def check_json_results(file: TextIO, results: QueryResults, strict: bool = False): + """Assert exported JSON data matches the given results object. 
+ + "Strict" mode also compares the ``timestamp``, ``gambit_version``, and ``extra`` attributes + at the top level and expects that full input file paths must match instead of just file names. + + Parameters + ---------- + file + Opened results file. + results + Query results to check against. + strict + If True, expect that ``data`` was exported from the exact same ``results`` object. Otherwise + expect results from a separate query run with the same inputs. + + Raises + ------ + AssertionError + If any of the checks fail. + """ + + data = json.load(file) + + assert len(data['items']) == len(results.items) + cmp_json_attrs(data['genomeset'], results.genomeset, ['id', 'key', 'version', 'name', 'description']) + assert data['signaturesmeta'] == to_json(results.signaturesmeta) + + if strict: + assert data['timestamp'] == to_json(results.timestamp) + assert data['gambit_version'] == results.gambit_version + assert data['extra'] == results.extra + + for item, item_data in zip(results.items, data['items']): + + # Compare data['query'] <-> item.label / item.file + query = item_data['query'] + assert query['name'] == item.label + + if item.file is None: + assert query['path'] is None + + else: + # Check path matches exactly if strict mode, otherwise just file name + if strict: + assert query['path'] == str(item.file) + else: + assert Path(query['path']).name == item.file.name + + # Predicted/next taxon + cmp_taxon_json(item_data['predicted_taxon'], item.report_taxon) + cmp_taxon_json(item_data['next_taxon'], item.classifier_result.next_taxon) + + # Closest genomes + assert len(item_data['closest_genomes']) == len(item.closest_genomes) + for match, match_data in zip_strict(item.closest_genomes, item_data['closest_genomes']): + cmp_genomematch_json(match_data, match) + + +def cmp_csv_taxon(row: dict[str, str], taxon: Optional[Taxon], prefix: str): + + if taxon is None: + assert row[prefix + '.name'] == '' + assert row[prefix + '.rank'] == '' + assert row[prefix + 
'.ncbi_id'] == '' + assert row[prefix + '.threshold'] == '' + + else: + assert row[prefix + '.name'] == taxon.name + assert row[prefix + '.rank'] == taxon.rank + assert row[prefix + '.ncbi_id'] == str(taxon.ncbi_id or '') + + dt = row[prefix + '.threshold'] + if taxon.distance_threshold is None: + assert dt == '' + else: + assert np.isclose(float(dt), taxon.distance_threshold) + + +def check_csv_results(file: TextIO, results: QueryResults, strict: bool = False): + """Assert exported CSV data matches the given results object. + + Parameters + ---------- + file + Opened results file. + results + Query results to check against. + strict + If True, expect that ``data`` was exported from the exact same ``results`` object. Otherwise + expect results from a separate query run with the same inputs. + + Raises + ------ + AssertionError + If any of the checks fail. + """ + + rows = list(csv.DictReader(file)) + assert len(rows) == len(results.items) + + for item, row in zip(results.items, rows): + assert row['query'] == item.label + + cmp_csv_taxon(row, item.report_taxon, 'predicted') + cmp_csv_taxon(row, item.classifier_result.next_taxon, 'next') + + closest = item.closest_genomes[0] + assert np.isclose(float(row['closest.distance']), closest.distance) + assert row['closest.description'] == closest.genome.description diff --git a/tests/sigs/__init__.py b/tests/sigs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gambit/sigs/test.py b/tests/sigs/common.py similarity index 97% rename from gambit/sigs/test.py rename to tests/sigs/common.py index 063f237..78c5b79 100644 --- a/gambit/sigs/test.py +++ b/tests/sigs/common.py @@ -1,9 +1,9 @@ -"""Utilities for testing signature set types.""" +"""Common code for testing signature set types.""" import pytest import numpy as np -from .base import AbstractSignatureArray, sigarray_eq +from gambit.sigs.base import AbstractSignatureArray, sigarray_eq class AbstractSignatureArrayTests: diff --git 
a/tests/sigs/test_sigs_base.py b/tests/sigs/test_base.py similarity index 98% rename from tests/sigs/test_sigs_base.py rename to tests/sigs/test_base.py index 8024a75..0761b0d 100644 --- a/tests/sigs/test_sigs_base.py +++ b/tests/sigs/test_base.py @@ -6,8 +6,8 @@ from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, \ AnnotatedSignatures, sigarray_eq, SignaturesMeta from gambit.kmers import KmerSpec -from gambit.test import make_signatures -from gambit.sigs.test import AbstractSignatureArrayTests +from ..common import make_signatures +from .common import AbstractSignatureArrayTests @pytest.fixture(params=['u8', 'i8', 'u4']) diff --git a/tests/sigs/test_sigs_calc.py b/tests/sigs/test_calc.py similarity index 74% rename from tests/sigs/test_sigs_calc.py rename to tests/sigs/test_calc.py index 3bf7f9a..1110d26 100644 --- a/tests/sigs/test_sigs_calc.py +++ b/tests/sigs/test_calc.py @@ -1,19 +1,21 @@ """Tests for gambit.search module.""" -from io import StringIO +from typing import Optional +from pathlib import Path import pytest import numpy as np from Bio import SeqIO from Bio.Seq import Seq -from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures +from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures, \ + dense_to_sparse, sparse_to_dense from gambit.kmers import KmerSpec, index_to_kmer -from gambit.seq import SEQ_TYPES, revcomp, SequenceFile -from gambit.test import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq -import gambit.util.io as ioutil -from gambit.sigs import sigarray_eq +from gambit.seq import SEQ_TYPES, revcomp +from gambit.sigs import sigarray_eq, KmerSignature +from gambit.util.io import open_compressed from gambit.util.progress import check_progress +from ..common import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq KSPEC = KmerSpec(11, 'AGTAC') @@ -110,6 +112,9 @@ def test_overlapping(self): assert all(kmer in expected for kmer 
in found) +RecordSets = list[tuple[list[SeqIO.SeqRecord], KmerSignature]] + + class TestCalcFileSignatures: @pytest.fixture(scope='class') @@ -132,30 +137,26 @@ def record_sets(self): return items - @pytest.fixture(scope='class', params=['fasta']) - def format(self, request): - return request.param - - @pytest.fixture(scope='class', params=[None, 'gzip']) + @pytest.fixture(scope='class', params=['none', 'gzip']) def compression(self, request): return request.param @pytest.fixture() - def files(self, record_sets, tmp_path, format, compression): + def files(self, record_sets: RecordSets, tmp_path: Path, compression: str): files = [] for i, (records, sig) in enumerate(record_sets): - file = SequenceFile(tmp_path / f'{i + 1}.fasta', format, compression) + file = tmp_path / f'{i + 1}.fasta' - with file.open('wt') as f: - SeqIO.write(records, f, format) + with open_compressed(file, 'wt', compression) as f: + SeqIO.write(records, f, 'fasta') files.append(file) return files - def test_calc_file_signature(self, record_sets, files): + def test_calc_file_signature(self, record_sets: RecordSets, files: list[Path]): """Test the calc_file_signature function.""" for file, (records, sig) in zip(files, record_sets): @@ -163,7 +164,7 @@ def test_calc_file_signature(self, record_sets, files): assert np.array_equal(result, sig) @pytest.mark.parametrize('concurrency', [None, 'threads', 'processes']) - def test_calc_file_signatures(self, record_sets, files, concurrency): + def test_calc_file_signatures(self, record_sets: RecordSets, files: list[Path], concurrency: Optional[str]): """Test the calc_file_signatures function.""" sigs = [sig for records, sig in record_sets] @@ -171,3 +172,28 @@ def test_calc_file_signatures(self, record_sets, files, concurrency): sigs2 = calc_file_signatures(KSPEC, files, progress=pconf, concurrency=concurrency) assert sigarray_eq(sigs, sigs2) + + +def test_dense_sparse_conversion(): + """Test conversion between dense and sparse representations of k-mer 
coordinates.""" + + for k in range(1, 10): + + kspec = KmerSpec(k, 'ATGAC') + + # Create dense signature with every 3rd k-mer + vec = np.zeros(kspec.nkmers, dtype=bool) + vec[np.arange(vec.size) % 3 == 0] = True + + # Convert to sparse + sig = dense_to_sparse(vec) + + assert len(sig) == vec.sum() + for index in sig: + assert vec[index] + + # Check sorted + assert np.all(np.diff(sig) > 0) + + # Check converting back + assert np.array_equal(vec, sparse_to_dense(kspec, sig)) diff --git a/tests/sigs/test_sigs_hdf5.py b/tests/sigs/test_hdf5.py similarity index 67% rename from tests/sigs/test_sigs_hdf5.py rename to tests/sigs/test_hdf5.py index 819d6ce..a142c7f 100644 --- a/tests/sigs/test_sigs_hdf5.py +++ b/tests/sigs/test_hdf5.py @@ -1,14 +1,18 @@ """Test gambit.sigs.hdf5.""" +from pathlib import Path + import pytest import h5py as h5 import numpy as np -from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, dump_signatures_hdf5 -from gambit.sigs import SignaturesMeta, SignatureList, AnnotatedSignatures -from gambit.sigs.test import AbstractSignatureArrayTests +from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, \ + dump_signatures_hdf5, HDF5Signatures +from gambit.sigs.base import SignaturesMeta, SignatureList, AnnotatedSignatures, \ + AbstractSignatureArray, SignaturesFileError, SignatureArray from gambit.kmers import KmerSpec -from gambit.test import make_signatures +from ..common import make_signatures +from .common import AbstractSignatureArrayTests # JSON data to use for metadata extra field @@ -21,7 +25,7 @@ @pytest.mark.parametrize('optional_attrs', [False, True]) -def test_metadata(tmp_path, optional_attrs): +def test_metadata(tmp_path: Path, optional_attrs: bool): """Test reading/writing metadata""" fname = tmp_path / 'test.gs' @@ -44,13 +48,42 @@ def test_metadata(tmp_path, optional_attrs): assert meta2 == meta -def dump_load(sigs, path, **kw): +def dump_load(sigs: AbstractSignatureArray, path: Path, 
**kw) -> HDF5Signatures: """Dump signatures to HDF5 file and load them again.""" f = path / 'test.gs' dump_signatures_hdf5(f, sigs, **kw) return load_signatures_hdf5(f) +def test_open_not_hdf5(tmp_path: Path): + """Test opening an invalid file.""" + + # Not an HDF5 file + file = tmp_path / 'not-hdf5.gs' + with open(file, 'w') as f: + f.write('foo') + + with pytest.raises(SignaturesFileError) as einfo: + load_signatures_hdf5(file) + + assert einfo.value.filename == str(file) + assert einfo.value.format == 'hdf5' + + +def test_open_invalid(tmp_path: Path): + """Test opening an invalid HDF5 file.""" + + file = tmp_path / 'invalid.gs' + with h5.File(file, 'w') as f: + pass # Empty + + with pytest.raises(SignaturesFileError) as einfo: + load_signatures_hdf5(file) + + assert einfo.value.filename == str(file) + assert einfo.value.format == 'hdf5' + + class TestHDF5Signatures: @pytest.fixture(scope='class') @@ -58,24 +91,24 @@ def kspec(self): return KmerSpec(8, 'ATG') @pytest.fixture(scope='class', params=[(1000, 'u8'), (1000, 'i4'), (0, 'u8')]) - def sigs(self, request, kspec): + def sigs(self, request, kspec: KmerSpec): n, dtype = request.param return make_signatures(kspec, n, dtype) @pytest.fixture(scope='class') - def h5file(self, tmp_path_factory, sigs): + def h5file(self, tmp_path_factory, sigs: SignatureArray): """Write signatures to file and return file name.""" fname = tmp_path_factory.mktemp('HDF5FileSignatures') / 'test.gs' dump_signatures_hdf5(fname, sigs) return fname @pytest.fixture() - def h5sigs(self, h5file): + def h5sigs(self, h5file: Path): """Open HDF5Signatures object.""" with load_signatures_hdf5(h5file) as sigs: yield sigs - def test_attrs(self, h5sigs, sigs): + def test_attrs(self, h5sigs: AnnotatedSignatures, sigs: SignatureArray): """Test basic attributes for signatures saved without metadata.""" assert h5sigs.kmerspec == sigs.kmerspec assert h5sigs.dtype == sigs.values.dtype @@ -83,7 +116,7 @@ def test_attrs(self, h5sigs, sigs): assert 
h5sigs.meta == SignaturesMeta() @pytest.mark.parametrize('id_type', [int, str]) - def test_attrs_meta(self, sigs, id_type, tmp_path): + def test_attrs_meta(self, sigs: SignatureArray, id_type: type, tmp_path: Path): """Test basic attributes for signatures saved with metadata.""" if id_type is int: @@ -106,7 +139,7 @@ def test_attrs_meta(self, sigs, id_type, tmp_path): assert np.array_equal(h5sigs.ids, ids) assert h5sigs.meta == meta - def test_close(self, h5sigs): + def test_close(self, h5sigs: HDF5Signatures): assert h5sigs.group assert h5sigs @@ -116,7 +149,7 @@ def test_close(self, h5sigs): h5sigs.close() - def test_context(self, h5sigs): + def test_context(self, h5sigs: HDF5Signatures): with h5sigs as value: assert value is h5sigs assert h5sigs.group @@ -125,7 +158,7 @@ def test_context(self, h5sigs): assert not h5sigs.group assert not h5sigs - def test_create_from_list(self, sigs, tmp_path): + def test_create_from_list(self, sigs, tmp_path: Path): """Test creating from other AbstractSignatureArray type.""" siglist = SignatureList(sigs) @@ -134,7 +167,7 @@ def test_create_from_list(self, sigs, tmp_path): @pytest.mark.parametrize('from_list', [False, True]) @pytest.mark.parametrize('compression_level', [None, 7]) - def test_compression(self, from_list, compression_level, sigs, tmp_path): + def test_compression(self, from_list: bool, compression_level, sigs: SignatureArray, tmp_path: Path): """Test creating with gzip compression.""" create_from = SignatureList(sigs) if from_list else sigs diff --git a/tests/sigs/test_sigs_convert.py b/tests/sigs/test_sigs_convert.py deleted file mode 100644 index 59ccbb6..0000000 --- a/tests/sigs/test_sigs_convert.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the gambit.sigs.convert module.""" - -import pytest -import numpy as np - -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense, can_convert, \ - check_can_convert, convert_dense, convert_sparse -from gambit.kmers import KmerSpec -from gambit.test import random_seq 
- - -def test_dense_sparse_conversion(): - """Test conversion between dense and sparse representations of k-mer coordinates.""" - - for k in range(1, 10): - - kspec = KmerSpec(k, 'ATGAC') - - # Create dense signature with every 3rd k-mer - vec = np.zeros(kspec.nkmers, dtype=bool) - vec[np.arange(vec.size) % 3 == 0] = True - - # Convert to sparse - sig = dense_to_sparse(vec) - - assert len(sig) == vec.sum() - for index in sig: - assert vec[index] - - # Check sorted - assert np.all(np.diff(sig) > 0) - - # Check converting back - assert np.array_equal(vec, sparse_to_dense(kspec, sig)) - - -class TestKmerSpecConversion: - """Test converting signatures from one KmerSpec to another.""" - - def test_can_convert(self): - from_kspec = KmerSpec(11, 'ATGAC') - - compatible = [ - KmerSpec(11, 'ATGAC'), - KmerSpec(8, 'ATGAC'), - KmerSpec(10, 'ATGACA'), - KmerSpec(8, 'ATGACA'), - ] - - for to_kspec in compatible: - assert can_convert(from_kspec, to_kspec) - check_can_convert(from_kspec, to_kspec) - - incompatible = [ - KmerSpec(11, 'CAGTA'), - KmerSpec(12, 'ATGAC'), - KmerSpec(11, 'ATGA'), - KmerSpec(11, 'ATGACT'), - ] - - for to_kspec in incompatible: - assert not can_convert(from_kspec, to_kspec) - with pytest.raises(ValueError): - check_can_convert(from_kspec, to_kspec) - - @pytest.fixture(scope='class') - def seqs(self): - np.random.seed(0) - return [random_seq(100_000) for _ in range(100)] - - @pytest.mark.parametrize('to_kspec', [ - KmerSpec(10, 'ATGAC'), # Reduce k - KmerSpec(8, 'ATGAC'), # Reduce k - KmerSpec(9, 'ATGACGT'), # Extend prefix - KmerSpec(7, 'ATGACGT'), # Extend prefix and reduce k further - ]) - def test_convert(self, seqs, to_kspec): - from gambit.sigs.calc import calc_signature - - from_kspec = KmerSpec(11, 'ATGAC') - - for seq in seqs: - from_sig = calc_signature(from_kspec, seq) - from_vec = sparse_to_dense(from_kspec.k, from_sig) - - to_vec = convert_dense(from_kspec, to_kspec, from_vec) - to_sig = convert_sparse(from_kspec, to_kspec, from_sig) - - 
found_sig = calc_signature(to_kspec, seq) - - assert np.array_equal(to_sig, found_sig) - assert np.array_equal(to_vec, sparse_to_dense(to_kspec.k, found_sig)) diff --git a/tests/test_classify.py b/tests/test_classify.py index 5c3a522..70cffe9 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -4,7 +4,7 @@ from gambit.classify import matching_taxon, find_matches, consensus_taxon, GenomeMatch from gambit.db import Taxon, AnnotatedGenome -from gambit.test import make_lineage +from .common import make_lineage def test_matching_taxon(): diff --git a/tests/test_kmers.py b/tests/test_kmers.py index fb0ab5a..14b0f21 100644 --- a/tests/test_kmers.py +++ b/tests/test_kmers.py @@ -7,7 +7,7 @@ from gambit import kmers from gambit.kmers import KmerSpec import gambit.util.json as gjson -from gambit.test import convert_seq, make_kmer_seq +from .common import convert_seq, make_kmer_seq class TestIndices: diff --git a/tests/test_metric.py b/tests/test_metric.py index 6ac340e..1b16e1b 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -6,12 +6,12 @@ import numpy as np from gambit.metric import jaccard, jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \ - jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE -from gambit.sigs.convert import sparse_to_dense -from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures + jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE +from gambit.sigs.calc import sparse_to_dense +from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, BOUNDS_DTYPE from gambit.kmers import KmerSpec -from gambit.test import make_signatures from gambit.util.progress import check_progress +from .common import make_signatures @pytest.fixture( diff --git a/tests/test_query.py b/tests/test_query.py index 55542cc..7562b83 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -2,41 +2,56 @@ import pytest -from gambit.query 
import QueryInput, query_parse, compare_result_items -from gambit.seq import SequenceFile +from gambit.query import QueryResults, query, query_parse from gambit.util.misc import zip_strict from gambit import __version__ as GAMBIT_VERSION +from .testdb import TestDB +from .results import compare_result_items, check_results -class TestQueryInput: - """Test QueryInput class.""" - def test_convert(self): - file = SequenceFile('path/to/file.fa', 'fasta') - qi = QueryInput('foo', file) +@pytest.mark.parametrize('strict', [False, True]) +class TestQuery: + """Run a full query using the Python API.""" - assert QueryInput.convert(qi) is qi - assert QueryInput.convert('foo') == QueryInput('foo', None) - assert QueryInput.convert(file) == QueryInput(str(file.path), file) + def check_results(self, results: QueryResults, ref_results: QueryResults): - with pytest.raises(TypeError): - QueryInput.convert(3.4) + # Check general invariants of QueryResults object + check_results(results, warnings=False) # One of the queries is designed to generate a warning + assert results.gambit_version == GAMBIT_VERSION + # Check matches reference results + assert results.params == ref_results.params + assert results.genomeset == ref_results.genomeset + assert results.signaturesmeta == ref_results.signaturesmeta -@pytest.mark.parametrize('strict', [False, True]) -def test_query_python(testdb, strict): - """Run a full query using the Python API.""" - ref_results = testdb.get_query_results(strict) - params = ref_results.params - query_files = [item['file'] for item in testdb.query_genomes] + for item, ref_item in zip_strict(results.items, ref_results.items): + compare_result_items(item, ref_item) + + def test_query(self, testdb: TestDB, strict: bool): + """Test the query() function.""" + + ref_results = testdb.get_query_results(strict) + params = ref_results.params + query_sigs = testdb.query_signatures + + results = query(testdb.refdb, query_sigs, params) + self.check_results(results, ref_results) 
+ + for sigid, item in zip_strict(query_sigs.ids, results.items): + assert item.file is None + assert item.label == sigid + + def test_query_parse(self, testdb: TestDB, strict: bool): + """Test the query_parse() function.""" - results = query_parse(testdb.refdb, query_files, params) + ref_results = testdb.get_query_results(strict) + params = ref_results.params + query_files = testdb.get_query_files() - assert results.params == params - assert results.genomeset == ref_results.genomeset - assert results.signaturesmeta == testdb.ref_signatures.meta - assert results.gambit_version == GAMBIT_VERSION + results = query_parse(testdb.refdb, query_files, params) + self.check_results(results, ref_results) - for file, item, ref_item in zip_strict(query_files, results.items, ref_results.items): - assert item.input.file == file - compare_result_items(item, ref_item) + for file, item in zip_strict(query_files, results.items): + assert item.file == file + assert item.label == str(file) diff --git a/tests/test_results.py b/tests/test_results.py index 1c95371..70a7908 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -1,15 +1,28 @@ +"""Test the gambit.results module. + +Each ResultsExporter subclass is tested by exporting a fake QueryResults instance to a string buffer, +parsing the exported results and checking the against the original using the functions in the +.results tests helper module. 
+""" + +from io import StringIO + import pytest -from gambit.query import QueryResults, QueryResultItem, QueryInput, QueryParams +from gambit.query import QueryResults, QueryResultItem, QueryParams from gambit.classify import ClassifierResult, GenomeMatch from gambit.db import ReferenceGenomeSet, Genome from gambit.sigs import SignaturesMeta -from gambit.seq import SequenceFile -from gambit.results.base import export_to_buffer -from gambit.results.json import JSONResultsExporter -from gambit.results.csv import CSVResultsExporter -from gambit.results.archive import ResultsArchiveReader, ResultsArchiveWriter -from gambit.results.test import check_json_results, check_csv_results +from gambit.results import JSONResultsExporter, CSVResultsExporter, ResultsArchiveReader, ResultsArchiveWriter +from .results import check_json_results, check_csv_results + + +def export_to_buffer(results: QueryResults, exporter) -> StringIO: + """Export query results to a `StringIO` buffer.""" + buf = StringIO() + exporter.export(buf, results) + buf.seek(0) + return buf @pytest.fixture() @@ -70,14 +83,15 @@ def results(session): for i, cr in enumerate(classifier_results): predicted = cr.predicted_taxon items.append(QueryResultItem( - input=QueryInput(f'query-{i}', SequenceFile(f'query-{i}.fasta', 'fasta')), + f'query-{i}', classifier_result=cr, + file=f'query-{i}.fasta', report_taxon=None if predicted is None else predicted.parent if i % 4 == 0 else predicted, closest_genomes=[cr.closest_match], )) - # Set one input file to None - items[-1].input.file = None + # Set one file to None + items[-1].file = None return QueryResults( items=items, @@ -93,21 +107,21 @@ def results(session): ) -def test_json(results): +def test_json(results: QueryResults): """Test JSONResultsExporter.""" exporter = JSONResultsExporter() buf = export_to_buffer(results, exporter) check_json_results(buf, results, strict=True) -def test_csv(results): +def test_csv(results: QueryResults): """Test CSVResultsExporter.""" 
exporter = CSVResultsExporter() buf = export_to_buffer(results, exporter) check_csv_results(buf, results, strict=True) -def test_results_archive(session, results): +def test_results_archive(session, results: QueryResults): """Test ResultArchiveWriter/Reader.""" writer = ResultsArchiveWriter() buf = export_to_buffer(results, writer) diff --git a/tests/test_seq.py b/tests/test_seq.py index 379e560..ee20b4b 100644 --- a/tests/test_seq.py +++ b/tests/test_seq.py @@ -8,10 +8,12 @@ import numpy as np from Bio import Seq, SeqIO -from gambit.seq import SequenceFile, revcomp +from gambit.seq import revcomp, parse_seqs from gambit.kmers import nkmers, index_to_kmer from gambit.util.misc import zip_strict -from gambit.test import random_seq +from gambit.util.io import open_compressed + +from .common import random_seq # Complements to nucleotide ASCII codes @@ -88,162 +90,42 @@ def test_validate_dna_seq_bytes(): # TODO -class TestSequenceFile: - """Test the SequenceFile class.""" - - @pytest.fixture(params=['fasta'], scope='class') - def format(self, request): - """SequenceFile.format attribute.""" - return request.param - - @pytest.fixture(params=[None, 'gzip'], scope='class') - def compression(self, request): - """SequenceFile.compression attribute.""" - return request.param - - @pytest.fixture() - def seqfile(self, tmpdir, format, compression): - """A SequenceFile instance pointing to a file in a test temporary directory. - - File does not yet exist. - """ - path = tmpdir.join('test.' 
+ format).strpath - return SequenceFile(path, format, compression) - - @pytest.fixture(scope='class') - def seqrecords(self): - """A collection of random Bio.SeqIO.SeqRecord's.""" - np.random.seed(0) - records = [] - - for i in range(20): - seq = Seq.Seq(random_seq(1000).decode('ascii')) - id_ = 'seq{}'.format(i + 1) - descr = 'Test sequence {}'.format(i + 1) - records.append(SeqIO.SeqRecord(seq, id=id_, description=descr)) - - return tuple(records) - - @pytest.fixture - def file_contents(self, format, seqrecords): - """String contents of a file containing the sequence records.""" - buf = StringIO() - SeqIO.write(seqrecords, buf, format) - return buf.getvalue() - - def test_constructor(self): - """Test constructor.""" - - seqfile = SequenceFile('foo.fasta', 'fasta') - assert seqfile == SequenceFile('foo.fasta', 'fasta', None) - assert seqfile.path == Path('foo.fasta') - - def test_eq(self): - """Test equality checking of instances.""" - seqfiles = [ - SequenceFile(p, format, comp) - for p in ['foo', 'bar'] - for format in ['fasta', 'genbank'] - for comp in [None, 'gzip'] - ] - - for i, seqfile1 in enumerate(seqfiles): - for j, seqfile2 in enumerate(seqfiles): - if i == j: - # Try with different instance - assert seqfile1 == SequenceFile(seqfile1.path, seqfile1.format, seqfile1.compression) - else: - assert seqfile1 != seqfile2 - - def test_special_methods(self, seqfile): - assert str(seqfile) == str(seqfile.path) - assert os.fspath(seqfile) == str(seqfile) - - # Check os.PathLike interface - text = 'foo' - with open(seqfile, 'w') as f: - f.write(text) - with open(seqfile, 'r') as f: - read = f.read() - assert read == text - - @pytest.mark.parametrize('binary', [False, True]) - def test_open(self, seqfile, file_contents, binary): - """Test sequence file is readable and writable.""" - - to_write = file_contents.encode() if binary else file_contents - - # Write data to file - with seqfile.open('wb' if binary else 'wt') as fobj: - fobj.write(to_write) - - # Read it 
back and make sure it's the same - with seqfile.open('rb' if binary else 'rt') as fobj: - read = fobj.read() - - assert read == to_write - - def test_parse(self, seqfile, seqrecords, file_contents): - """Test the parse() method, ensure we get the right records back.""" - - # Write pre-formatted contents to file - with seqfile.open('wt') as fobj: - fobj.write(file_contents) - - # Parse the sequences from it - parsed = list(seqfile.parse()) - - # Check they match - assert len(parsed) == len(seqrecords) - - for parsed_req, orig_req in zip_strict(parsed, seqrecords): - assert isinstance(parsed_req, SeqIO.SeqRecord) - assert parsed_req.seq == orig_req.seq - assert parsed_req.id == orig_req.id - - # This is something stupid BioPython does - when writing a SeqRecord - # as FASTA it writes the .id attributed followed by a space and then - # the .description attribute on the description line. When reading, - # the entire line is used as the description attribute and so - # includes the ID - assert parsed_req.description == orig_req.id + ' ' + orig_req.description - - def test_path_arg(self): - """Test the "path" argument to the constructor.""" - - path = Path('foo/bar.fasta') - - seqfile1 = SequenceFile(path, 'fasta') - assert isinstance(seqfile1, SequenceFile) and seqfile1.path == path +@pytest.fixture(scope='module') +def seqrecords(): + """Random SeqRecord instances.""" - seqfile2 = SequenceFile(str(path), 'fasta') - assert isinstance(seqfile2, SequenceFile) and seqfile2.path == path + records = [] - def test_absolute(self): - """Test the absolute() method.""" + np.random.seed(0) - relseqfile = SequenceFile('foo/bar.fasta', 'fasta') - assert not relseqfile.path.is_absolute() + for i in range(20): + seq = Seq.Seq(random_seq(1000).decode('ascii')) + id_ = f'seq{i + 1}' + descr = f'{id_} Test sequence {i + 1}' + records.append(SeqIO.SeqRecord(seq, id=id_, description=descr)) - absseqfile = relseqfile.absolute() - assert absseqfile.path.is_absolute() - assert absseqfile.path 
== relseqfile.path.absolute() + return records - absseqfile2 = absseqfile.absolute() - assert absseqfile2 == absseqfile - def test_from_paths(self, format, compression): - """Test the from_paths() class method.""" +@pytest.mark.parametrize('compression', ['none', 'gzip']) +@pytest.mark.parametrize('auto', [False, True]) +def test_parse_seqs(tmp_path: Path, seqrecords: list[SeqIO.SeqRecord], compression: str, auto: bool): + """Test the parse_seqs() function.""" - # List of unique path strings - paths = ['foo/bar{}.{}'.format(i, format) for i in range(20)] + # Write FASTA file + file = tmp_path / ('test.fa' + ('.gz' if compression == 'gzip' else '')) + with open_compressed(file, 'wt', compression) as fh: + SeqIO.write(seqrecords, fh, 'fasta') - seqfiles = SequenceFile.from_paths(paths, format, compression) + # Parse + with parse_seqs(file, 'fasta', compression='auto' if auto else compression) as parsed: + records2 = list(parsed) - assert len(paths) == len(seqfiles) + # Check ClosingIterator closes the underlying file object when last record is read + assert parsed.fobj.closed - for path, seqfile in zip_strict(paths, seqfiles): - assert isinstance(seqfile, SequenceFile) - assert str(seqfile.path) == path - assert seqfile.format == format - assert seqfile.compression == compression + # Check parsed records are correct + for record, record2 in zip_strict(seqrecords, records2): + assert record2.seq == record.seq + assert record2.id == record.id + assert record2.description == record.description diff --git a/tests/test_test.py b/tests/test_tests_common.py similarity index 85% rename from tests/test_test.py rename to tests/test_tests_common.py index a3f1da5..5ddac65 100644 --- a/tests/test_test.py +++ b/tests/test_tests_common.py @@ -1,13 +1,12 @@ -"""Test gambit.test module.""" +"""Test the common.py test module.""" import pytest import numpy as np -from gambit import test from gambit.kmers import KmerSpec, kmer_to_index, nkmers from gambit.seq import revcomp -from 
gambit.sigs.convert import dense_to_sparse -from gambit.util.progress import get_progress +from gambit.sigs.calc import dense_to_sparse +from . import common @pytest.mark.parametrize('k', [4, 6, 8]) @@ -15,7 +14,7 @@ @pytest.mark.parametrize('dtype', [np.dtype('u8'), np.dtype('u4')]) def test_make_signatures(k, n, dtype): np.random.seed(0) - sigs = test.make_signatures(k, n, dtype) + sigs = common.make_signatures(k, n, dtype) assert len(sigs) == n for i, sig in enumerate(sigs): @@ -32,7 +31,7 @@ def test_make_signatures(k, n, dtype): @pytest.mark.parametrize('chars', ['ACGT', 'XYZ']) def test_random_seq(n, chars): np.random.seed(0) - seq = test.random_seq(n, chars) + seq = common.random_seq(n, chars) assert isinstance(seq, bytes) assert len(seq) == n assert all(chr(c) in chars for c in seq) @@ -41,7 +40,7 @@ def test_random_seq(n, chars): @pytest.mark.parametrize('pattern', [b'N', b'ABC']) @pytest.mark.parametrize('n', [100, 1000]) def test_fill_bytearray(pattern, n): - arr = test.fill_bytearray(pattern, n) + arr = common.fill_bytearray(pattern, n) assert isinstance(arr, bytearray) assert len(arr) == n @@ -55,7 +54,7 @@ def test_fill_bytearray(pattern, n): @pytest.mark.parametrize('n_interval', [None, 5]) def test_make_kmer_seq(kspec, seqlen, kmer_interval, n_interval): np.random.seed(0) - seq, sig = test.make_kmer_seq(kspec, seqlen, kmer_interval, n_interval) + seq, sig = common.make_kmer_seq(kspec, seqlen, kmer_interval, n_interval) assert len(seq) == seqlen vec = np.zeros(kspec.nkmers, dtype=bool) @@ -80,7 +79,7 @@ def test_make_kmer_seq(kspec, seqlen, kmer_interval, n_interval): def test_make_lineage(): thresholds = [.1, .2, None, .3] n = len(thresholds) - taxa = test.make_lineage(thresholds) + taxa = common.make_lineage(thresholds) assert len(taxa) == n for i in range(n): diff --git a/tests/testdb.py b/tests/testdb.py index 63606a3..d0d5787 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -1,30 +1,42 @@ """Access test database data. 
""" -from typing import Callable +from typing import Callable, TypeVar, Generic, Any, overload, TypedDict from pathlib import Path -from types import SimpleNamespace +from dataclasses import dataclass from csv import DictReader import sqlite3 import gzip from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from gambit.seq import SequenceFile -from gambit.sigs import load_signatures -from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset -from gambit.results.archive import ResultsArchiveReader +from gambit.kmers import KmerSpec +from gambit.sigs import load_signatures, AnnotatedSignatures +from gambit.db import ReferenceDatabase, only_genomeset, file_sessionmaker, default_sessionmaker +from gambit.results import ResultsArchiveReader +from gambit.query import QueryResults +from gambit.util.io import FilePath -class LazyAttribute: +T = TypeVar('T') + + +class LazyAttribute(Generic[T]): """Descriptor which initializes a property value the first time it is used.""" - def __init__(self, initializer: Callable, value_attr: str): + def __init__(self, initializer: Callable[[Any], T], value_attr: str): self.initializer = initializer self.value_attr = value_attr self.__doc__ = initializer.__doc__ + @overload + def __get__(self, instance: None, owner=None) -> 'LazyAttribute[T]': + pass + + @overload + def __get__(self, instance, owner=None) -> T: + pass + def __get__(self, instance, owner=None): if instance is None: return self @@ -39,11 +51,46 @@ def __get__(self, instance, owner=None): return value -def lazy(f: Callable) -> LazyAttribute: +def lazy(f: Callable[[Any], T]) -> LazyAttribute[T]: attr = '_' + f.__name__ return LazyAttribute(f, attr) +@dataclass +class TestDBPaths: + root: Path + # Reference genomes .gdb file + ref_genomes: Path + # Reference genomes .gs file + ref_signatures: Path + # Reference genomes .csv + refs_table: Path + # Directory containing reference genome FASTA files + ref_genomes_dir: Path + # queries.csv + 
queries_table: Path + # Directory containing query genome FASTA files + query_genomes_dir: Path + # Query genomes .gs file + query_signatures: Path + # Directory containing QueryResults exports in archive format. + results: Path + + +class TestQueryGenome(TypedDict): + name: str + predicted: str + primary: str + closest: str + warnings: bool + + +class TestRefGenome(TypedDict): + name: str + key: str + taxon: str + + class TestDB: """Object which provides access to test database resources. @@ -51,9 +98,14 @@ class TestDB: to how it would work if the attributes were separate Pytest fixtures. """ - def __init__(self, root): + paths: TestDBPaths + + # Prevent pytest interpreting as containing test methods + __test__ = False + + def __init__(self, root: FilePath): root = Path(root) - self.paths = SimpleNamespace( + self.paths = TestDBPaths( root=root, ref_genomes=root / 'ref-genomes.gdb', ref_signatures=root / 'ref-signatures.gs', @@ -65,15 +117,10 @@ def __init__(self, root): results=root / 'results/', ) - @lazy - def engine(self): - """SQLAlchemy engine connected to genome database.""" - return create_engine(f'sqlite:///{self.paths.ref_genomes}') - @lazy def Session(self): """Sessionmaker for the reference genome database.""" - return sessionmaker(self.engine, class_=ReadOnlySession) + return file_sessionmaker(self.paths.ref_genomes) def copy_session(self): """Create an in-memory copy of the test database.""" @@ -81,99 +128,102 @@ def copy_session(self): memory = sqlite3.connect(':memory:') src.backup(memory) engine = create_engine('sqlite://', creator=lambda: memory) - return sessionmaker(engine)() + return default_sessionmaker(engine)() @lazy - def ref_signatures(self): + def ref_signatures(self) -> AnnotatedSignatures: """K-mer signatures for reference genomes.""" - return load_signatures(self.paths.ref_signatures) + return load_signatures(self.paths.ref_signatures) # type: ignore @lazy - def query_signatures(self): + def query_signatures(self) -> 
AnnotatedSignatures: """K-mer signatures for query genomes.""" - return load_signatures(self.paths.query_signatures) + return load_signatures(self.paths.query_signatures) # type: ignore @lazy - def kmerspec(self): - return self.ref_signatures.kmerspec + def kmerspec(self) -> KmerSpec: + return self.ref_signatures.kmerspec # type: ignore @lazy - def refdb(self): + def refdb(self) -> ReferenceDatabase: """Full ReferenceDatabase object.""" session = self.Session() gset = only_genomeset(session) return ReferenceDatabase(gset, self.ref_signatures) - @classmethod - def _add_file_cols(cls, genomes_dir, row): - row['file'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta'), - format='fasta', - ) - row['file_gz'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta.gz'), - format='fasta', - compression='gzip', - ) - @lazy - def query_genomes(self): + def query_genomes(self) -> list[TestQueryGenome]: """Query genomes and their expected results.""" with open(self.paths.queries_table, newline='') as f: rows = list(DictReader(f)) for row in rows: + # Convert "warnings" column to bool row['warnings'] = row['warnings'].lower() == 'true' - self._add_file_cols(self.paths.query_genomes_dir, row) - return rows + return rows # type: ignore @lazy - def ref_genomes(self): + def ref_genomes(self) -> list[TestRefGenome]: """Reference genomes and their attributes.""" with open(self.paths.refs_table, newline='') as f: rows = list(DictReader(f)) - for row in rows: - self._add_file_cols(self.paths.ref_genomes_dir, row) - - return rows + return rows # type: ignore @classmethod - def _ensure_gz(cls, items): - """Ensure gzipped versions of the query/ref files are available. + def _ensure_gz(cls, file: Path, file_gz: Path): + """Ensure gzipped version of the query/ref file is available. These aren't added to version control, so they are created the first time they are needed. 
""" - for item in items: - dst = item['file_gz'].path - if dst.is_file(): - continue + if file_gz.is_file(): + return - with open(item['file'].path) as f: - content = f.read() + with open(file) as f: + content = f.read() - with gzip.open(dst, 'wt') as f: - f.write(content) + with gzip.open(file_gz, 'wt') as f: + f.write(content) - @classmethod - def _get_genome_files(cls, items, gzipped): - if gzipped: - col = 'file_gz' - cls._ensure_gz(items) - else: - col = 'file' - return [q[col] for q in items] + def _get_genome_files(self, base: Path, names: list[str], gzipped: bool, relative: bool) -> list[Path]: + base2 = base.relative_to(self.paths.root) if relative else base + + files = [] - def get_query_files(self, gzipped: bool=False): - return self._get_genome_files(self.query_genomes, gzipped) + for name in names: + fname = name + '.fasta' - def get_ref_files(self, gzipped: bool=False): - return self._get_genome_files(self.ref_genomes, gzipped) + if gzipped: + fname_gz = fname + '.gz' + self._ensure_gz(base / fname, base / fname_gz) + path = base2 / fname_gz + else: + path = base2 / fname + + files.append(path) + + return files + + def get_query_files(self, gzipped: bool = False, relative: bool = False) -> list[Path]: + return self._get_genome_files( + self.paths.query_genomes_dir, + [genome['name'] for genome in self.query_genomes], + gzipped=gzipped, + relative=relative, + ) + + def get_ref_files(self, gzipped: bool = False, relative: bool = False) -> list[Path]: + return self._get_genome_files( + self.paths.ref_genomes_dir, + [genome['name'] for genome in self.ref_genomes], + gzipped=gzipped, + relative=relative, + ) - def get_query_results(self, strict: bool, session=None): + def get_query_results(self, strict: bool, session=None) -> QueryResults: """Pre-calculated query results.""" if session is None: session = self.refdb.session diff --git a/tests/util/__init__.py b/tests/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/util/test_util_indexing.py b/tests/util/test_indexing.py similarity index 100% rename from tests/util/test_util_indexing.py rename to tests/util/test_indexing.py diff --git a/tests/util/test_util_io.py b/tests/util/test_io.py similarity index 75% rename from tests/util/test_util_io.py rename to tests/util/test_io.py index ba0819f..234dc64 100644 --- a/tests/util/test_util_io.py +++ b/tests/util/test_io.py @@ -17,29 +17,30 @@ def text_data(self): random = np.random.RandomState() return random.randint(32, 128, size=1000, dtype='b').tobytes() - @pytest.fixture(scope='class', params=[None, 'gzip']) + @pytest.fixture(scope='class', params=['none', 'gzip']) def compression(self, request): """Compression method string.""" return request.param @pytest.fixture() - def text_file(self, text_data, compression, tmpdir): + def text_file(self, text_data: bytes, compression: str, tmp_path: Path): """Path to file with text_data written to it using open_compressed.""" - file = tmpdir.join('chars.txt').strpath + file = tmp_path / 'chars.txt' - with ioutil.open_compressed(compression, file, 'wb') as fobj: + with ioutil.open_compressed(file, 'wb', compression) as fobj: fobj.write(text_data) return file @pytest.mark.parametrize('binary', [True, False]) - def test_read(self, binary, text_data, text_file, compression, tmpdir): + @pytest.mark.parametrize('auto', [True, False]) + def test_read(self, binary: bool, auto: bool, text_data: bytes, text_file: Path, compression: str): """Test we can read the file in both binary and text mode.""" mode = 'rb' if binary else 'rt' - with ioutil.open_compressed(compression, text_file, mode) as fobj: + with ioutil.open_compressed(text_file, mode, 'auto' if auto else compression) as fobj: contents = fobj.read() if binary: @@ -52,47 +53,25 @@ def test_read(self, binary, text_data, text_file, compression, tmpdir): @pytest.mark.parametrize('write_mode', ['w', 'a', 'x']) @pytest.mark.parametrize('binary', [True, False]) - def test_write(self, 
write_mode, binary, text_data, compression, tmpdir): + def test_write(self, write_mode: str, binary: bool, text_data: bytes, compression: str, tmp_path: Path): """ Test writing data using the w, a, and x modes. - TODO - these are all identical when the file doesn't exist, test behavior when it does + TODO - these are all identical when the file doesn't exist, test behavior when it does. """ - file = tmpdir.join('chars.txt') + file = tmp_path / 'chars.txt' mode = write_mode + ('b' if binary else 't') to_write = text_data if binary else text_data.decode('ascii') - with ioutil.open_compressed(compression, file.strpath, mode) as fobj: + with ioutil.open_compressed(file, mode, compression) as fobj: fobj.write(to_write) - with ioutil.open_compressed(compression, file.strpath, 'rb') as f: + with ioutil.open_compressed(file, 'rb', compression) as f: contents = f.read() assert contents == text_data - def test_invalid_mode(self, compression): - for mode in ['r', 'w', 'a', 't', 'b', 'abc', '']: - with pytest.raises(ValueError): - ioutil.open_compressed(compression, 'foo.txt', mode=mode) - - @pytest.mark.parametrize('binary', [True, False]) - def test_read_auto(self, binary, text_data, text_file): - """Test automatic determination of compression method.""" - - mode = 'rb' if binary else 'rt' - - with ioutil.open_compressed('auto', text_file, mode) as fobj: - contents = fobj.read() - - if binary: - assert isinstance(contents, bytes) - assert contents == text_data - - else: - assert isinstance(contents, str) - assert contents == text_data.decode('ascii') - class TestClosingIterator: """Test the ClosingIterator class.""" diff --git a/tests/util/test_util_json.py b/tests/util/test_json.py similarity index 100% rename from tests/util/test_util_json.py rename to tests/util/test_json.py diff --git a/tests/util/test_util_misc.py b/tests/util/test_misc.py similarity index 86% rename from tests/util/test_util_misc.py rename to tests/util/test_misc.py index e9963b7..61a12fa 100644 --- 
a/tests/util/test_util_misc.py +++ b/tests/util/test_misc.py @@ -54,14 +54,6 @@ def test_chunk_slices(): assert list(misc.chunk_slices(0, 10)) == [] -def test_is_importable(): - """Test the is_importable() function.""" - assert misc.is_importable('urllib') - assert misc.is_importable('urllib.request') - assert not misc.is_importable('aklhaskhdkslkdjahkdf') - assert not misc.is_importable('urllib.aklhaskhdkslkdjahkdf') - - def test_join_list_human(): l = ['foo', 'bar', 'baz'] assert misc.join_list_human(l[:1]) == 'foo' diff --git a/tests/util/test_util_progress.py b/tests/util/test_progress.py similarity index 100% rename from tests/util/test_util_progress.py rename to tests/util/test_progress.py diff --git a/tests/util/test_util_typing.py b/tests/util/test_util_typing.py deleted file mode 100644 index 2f19ba9..0000000 --- a/tests/util/test_util_typing.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Test the gambit.util.typing submodule.""" - -import typing -from typing import Union, Optional - -from gambit.util.typing import is_union, union_types, is_optional, unwrap_optional - - -def test_is_union(): - """Test the is_union() function.""" - assert is_union(Union[int, str]) - assert is_union(Union[int, str, bool]) - assert is_union(Optional[int]) - assert not is_union(Union) - assert not is_union(Optional) - assert not is_union(int) - assert not is_union(None) - assert not is_union(typing.List) - assert not is_union(typing.Any) - - -def test_union_types(): - """Test the union_types() function.""" - assert union_types(Union[int, str]) == (int, str) - assert union_types(Union[int, str, bool]) == (int, str, bool) - assert union_types(Optional[int]) == (int, type(None)) - - -def test_is_optional(): - """Test the is_optional() function.""" - assert is_optional(Optional[int]) - assert is_optional(Union[int, None]) - assert is_optional(Union[None, int]) - assert not is_optional(Union[int, str]) - assert not is_optional(Union) - assert not is_optional(Optional) - assert not 
is_optional(int) - assert not is_optional(None) - assert not is_optional(type(None)) - assert not is_optional(typing.Any) - - -def test_unwrap_optional(): - """Test the unwrap_optional() function.""" - assert unwrap_optional(Optional[int]) is int - assert unwrap_optional(Union[int, None]) is int - assert unwrap_optional(Union[None, int]) is int