From 25615932bc16ab95461b6a9836129398e677551f Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 6 Dec 2023 00:19:30 -0800 Subject: [PATCH 01/86] Test on Python 3.12 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f3dda58..ad8e426 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', 3.11] + python-version: [3.7, 3.8, 3.9, '3.10', 3.11, 3.12] steps: - uses: actions/checkout@v2 From 4b2eb31afcff455fc6d2dee4036daf91389395b2 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 27 Jan 2024 23:08:30 -0800 Subject: [PATCH 02/86] Better error messages when opening invalid signatures file --- gambit/sigs/base.py | 16 ++++++++++++++++ gambit/sigs/hdf5.py | 36 ++++++++++++++++++++++++++++-------- tests/sigs/test_sigs_hdf5.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/gambit/sigs/base.py b/gambit/sigs/base.py index bf73203..5ca383c 100644 --- a/gambit/sigs/base.py +++ b/gambit/sigs/base.py @@ -397,6 +397,22 @@ def __getitem__(self, index): return self.signatures[index] +class SignaturesFileError(Exception): + """Indicates an error attempting to open a signatures file.""" + + message: str + filename: str + format: str + + def __init__(self, message: str, filename: Optional[FilePath], format: Optional[str]): + self.message = message + self.filename = str(filename) + self.format = format + + def __str__(self): + return self.message + + def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray: """Load signatures from file. 
diff --git a/gambit/sigs/hdf5.py b/gambit/sigs/hdf5.py index 7e2b930..c095f40 100644 --- a/gambit/sigs/hdf5.py +++ b/gambit/sigs/hdf5.py @@ -7,7 +7,7 @@ import h5py as h5 from .base import SignatureArray, ConcatenatedSignatureArray, AbstractSignatureArray, SignaturesMeta,\ - ReferenceSignatures + ReferenceSignatures, SignaturesFileError from gambit.kmers import KmerSpec from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.io import FilePath @@ -93,11 +93,11 @@ def __init__(self, group: h5.Group): self.group = group if FMT_VERSION_ATTR not in group.attrs: - raise RuntimeError('HDF5 group does not contain a signature set') + raise SignaturesFileError('HDF5 group does not contain a signature set', None, 'hdf5') self.format_version = group.attrs[FMT_VERSION_ATTR] if self.format_version != CURRENT_FMT_VERSION: - raise ValueError(f'Unrecognized format version: {self.format_version}') + raise ValueError(f'Unrecognized format version: {self.format_version}', None, 'hdf5') self.kmerspec = KmerSpec(group.attrs['kmerspec_k'], group.attrs['kmerspec_prefix']) self.meta = read_metadata(group) @@ -229,13 +229,33 @@ def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures: \\**kw Additional keyword arguments to :func:`h5py.File`. """ - return HDF5Signatures(h5.File(path, **kw)) + exc = SignaturesFileError(f'{path} does not appear to be a GAMBIT signtures file.', path, 'hdf5') + # Check for HDF5 magic number + # The errors raised by the h5py library are a bit cryptic, so make one with a better message if + # not a valid HDF5 file. + # This also raises the standard errors if file cannot be read. 
+ with open(path, 'rb') as f: + header = f.read(8) + if header != b'\x89HDF\r\n\x1a\n': + raise exc -def dump_signatures_hdf5(path: FilePath, - signatures: AbstractSignatureArray, - **kw, - ): + h5file = h5.File(path, **kw) + + if FMT_VERSION_ATTR not in h5file.attrs: + raise exc + + try: + return HDF5Signatures(h5file) + + except SignaturesFileError as exc: + # Make sure errors in opening are annotated with the correct file name + exc.message = f'Error opening signatures file {path}: {exc.message}' + exc.filename = str(path) + raise + + +def dump_signatures_hdf5(path: FilePath, signatures: AbstractSignatureArray, **kw): """Write k-mer signatures and associated metadata to an HDF5 file. Parameters diff --git a/tests/sigs/test_sigs_hdf5.py b/tests/sigs/test_sigs_hdf5.py index 819d6ce..941039a 100644 --- a/tests/sigs/test_sigs_hdf5.py +++ b/tests/sigs/test_sigs_hdf5.py @@ -6,6 +6,7 @@ from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, dump_signatures_hdf5 from gambit.sigs import SignaturesMeta, SignatureList, AnnotatedSignatures +from gambit.sigs.base import SignaturesFileError from gambit.sigs.test import AbstractSignatureArrayTests from gambit.kmers import KmerSpec from gambit.test import make_signatures @@ -51,6 +52,35 @@ def dump_load(sigs, path, **kw): return load_signatures_hdf5(f) +def test_open_not_hdf5(tmp_path): + """Test opening an invalid file.""" + + # Not an HDF5 file + file = tmp_path / 'not-hdf5.gs' + with open(file, 'w') as f: + f.write('foo') + + with pytest.raises(SignaturesFileError) as einfo: + load_signatures_hdf5(file) + + assert einfo.value.filename == str(file) + assert einfo.value.format == 'hdf5' + + +def test_open_invalid(tmp_path): + """Test opening an invalid HDF5 file.""" + + file = tmp_path / 'invalid.gs' + with h5.File(file, 'w') as f: + pass # Empty + + with pytest.raises(SignaturesFileError) as einfo: + load_signatures_hdf5(file) + + assert einfo.value.filename == str(file) + assert einfo.value.format 
== 'hdf5' + + class TestHDF5Signatures: @pytest.fixture(scope='class') From d65a55849ee57bfeb42268eb09d078c3fb8479e5 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 27 Jan 2024 20:48:12 -0800 Subject: [PATCH 03/86] Update changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a25a3d..3e33ac1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,12 @@ ## 1.0.1 +* Significant documentation updates. +* Better error reporting: + * When database files cannot be found (in CLI and API). + * On attempting to open an invalid signatures file. * Misc - * Better error reporting when database files cannot be found (in CLI and API). - * Minor documentation updates. + * Run tests on Python 3.11 and 3.12. ## 1.0.0 From 9febe5b128a97d1d04c1a9c1fbb1007e11d51158 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 8 May 2024 11:13:46 -0700 Subject: [PATCH 04/86] Update biopython requirement to 1.79 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index cd3eb1a..665ab30 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,7 @@ python_requires = >= 3.7 install_requires = numpy~=1.13 sqlalchemy~=1.1 - biopython~=1.69 + biopython~=1.79 alembic~=1.0 attrs>=20 cattrs~=1.0 From e2ca9029b801973f8735908d4a3c4b2379121d9f Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 27 Jan 2024 23:51:21 -0800 Subject: [PATCH 05/86] Require Python 3.9 --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 5 +++++ setup.cfg | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ad8e426..617ee54 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', 3.11, 3.12] + python-version: [3.9, '3.10', 3.11, 3.12] steps: - uses: actions/checkout@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 
3e33ac1..fca10a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog +## DEV + +- Require Python 3.9 + + ## 1.0.1 * Significant documentation updates. diff --git a/setup.cfg b/setup.cfg index 665ab30..a84ac3d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,7 @@ packages = find: zip_safe = false include_package_data = true -python_requires = >= 3.7 +python_requires = >= 3.9 install_requires = numpy~=1.13 From 5fc2c160801bfa9d151f7c7e6b49d3619ba444de Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 27 Jan 2024 23:52:43 -0800 Subject: [PATCH 06/86] Remove singledispatchmethod py3.7 backport --- gambit/results/archive.py | 3 ++- gambit/results/json.py | 3 ++- gambit/util/misc.py | 18 ------------------ 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/gambit/results/archive.py b/gambit/results/archive.py index a6c7a2c..4d29836 100644 --- a/gambit/results/archive.py +++ b/gambit/results/archive.py @@ -2,6 +2,7 @@ import json from typing import Union, IO, Any +from functools import singledispatchmethod from attr import attrs, attrib, asdict, has as has_attrs from sqlalchemy.orm import Session @@ -11,7 +12,7 @@ from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome import gambit.util.json as gjson from gambit.util.io import FilePath, maybe_open -from gambit.util.misc import singledispatchmethod, type_singledispatchmethod +from gambit.util.misc import type_singledispatchmethod from gambit.util.typing import is_optional, unwrap_optional from .base import asdict_default, BaseJSONResultsExporter diff --git a/gambit/results/json.py b/gambit/results/json.py index aa1d662..c285bcf 100644 --- a/gambit/results/json.py +++ b/gambit/results/json.py @@ -1,11 +1,12 @@ """Export results to JSON.""" +from functools import singledispatchmethod + from attr import attrs, asdict from .base import _todict, BaseJSONResultsExporter from gambit.query import QueryResultItem, QueryResults, QueryInput from gambit.db import 
ReferenceGenomeSet, Taxon, AnnotatedGenome -from gambit.util.misc import singledispatchmethod @attrs() diff --git a/gambit/util/misc.py b/gambit/util/misc.py index 44b468e..d40128d 100644 --- a/gambit/util/misc.py +++ b/gambit/util/misc.py @@ -73,24 +73,6 @@ def chunk_slices(n: int, size: int) -> Iterator[slice]: start = stop -# singledispatchmethod ot available in 3.7 -if sys.version_info[1] >= 8: - from functools import singledispatchmethod - -else: - # Make simple implementation - def singledispatchmethod(func): - dispatcher = singledispatch(func) - - @wraps(func) - def wrapper(self, arg, *rest, **kw): - impl = dispatcher.dispatch(type(arg)) - return impl(self, arg, *rest, **kw) - - wrapper.register = dispatcher.register - return wrapper - - def type_singledispatchmethod(func: Callable): """ Similar to ``singledispatchmethod``, but the first (non-self) argument is expected to be a From a23543d22506bc24aa065a9fdd4f42dc98aaaa76 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 3 Jul 2024 00:11:35 -0700 Subject: [PATCH 07/86] Add additional info to signatures info command --- CHANGELOG.md | 5 ++++- gambit/cli/common.py | 38 +++++++++++++++++++++++++++++++++++++- gambit/cli/signatures.py | 36 ++++++++++++++++++++++++++++++------ 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fca10a0..db7c1a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,10 @@ ## DEV -- Require Python 3.9 +* CLI + * Add additional details to `signatures info` command. 
+* Other + * Require Python 3.9 ## 1.0.1 diff --git a/gambit/cli/common.py b/gambit/cli/common.py index 998a2b1..2bd201b 100644 --- a/gambit/cli/common.py +++ b/gambit/cli/common.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Sequence, TextIO, Union, Iterable, Tuple, List +from typing import Optional, Sequence, TextIO, Union, Iterable, Tuple, List, Any from pathlib import Path from collections import Counter @@ -462,3 +462,39 @@ def print_table(rows: Sequence[Sequence], colsep: str=' ', left: str='', right: echo(right) echo('\n') + + +def get_revision_info(revision) -> Optional[dict[str, Any]]: + """Extract revision information from metadata JSON. + + :class:`gambit.sigs.base.SignaturesMeta` and :class:`gambit.db.models.ReferenceGenomeSet` + (stored in ``.gs`` and ``.gdb`` files) have an ``extra`` field to store additional metadata + in JSON format. There is no prescribed format for this, but the official GAMBIT database files + have a "revision" key that is an object with a common set of fields. This function attempts to + extract that data, without causing an error if the format is not as expected. + + Parameters + ---------- + revision + Value under extra metadata's "revision" key, or None. + + Returns + ------- + None if ``revision`` is not a dict, otherwise a dict with ``'num'`` , ``'date'```, + ``'author'``, and ``'description'`` keys. 
+ """ + if not isinstance(revision, dict): + return None + + info = dict() + fields = [('num', int), ('date', str), ('author', str), ('description', str)] + + for name, type_ in fields: + if name not in revision: + info[name] = '' + elif isinstance(revision[name], type_): + info[name] = revision[name] + else: + info[name] = '' + + return info diff --git a/gambit/cli/signatures.py b/gambit/cli/signatures.py index 39d29c7..6c17dcf 100644 --- a/gambit/cli/signatures.py +++ b/gambit/cli/signatures.py @@ -79,26 +79,50 @@ def info(ctx: click.Context, file: str, json: bool, pretty: bool, ids: bool, use gjson.dump(data, sys.stdout, **kw) else: - rows1 = [ + rows = [ ('Genome Count:', len(sigs)), ('k:', sigs.kmerspec.k), ('Prefix:', sigs.kmerspec.prefix.decode('ascii')), ('File format:', f'HDF5, version {sigs.format_version}'), # HDF5-specific ('Data type:', sigs.dtype), ] - common.print_table(rows1, colsep=' ') + common.print_table(rows, colsep=' ') print('Metadata:') - rows2 = [ + rows = [ ('ID:', format_none(sigs.meta.id)), - ('Name:', format_none(sigs.meta.name)), ('Version:', format_none(sigs.meta.version)), + ('Name:', format_none(sigs.meta.name)), ('Description:', format_none(sigs.meta.description)), ('Genome ID attribute:', format_none(sigs.meta.id_attr)), - ('Has extra:', 'yes' if sigs.meta.extra else 'no'), ] - common.print_table(rows2, colsep=' ', left=' ') + common.print_table(rows, colsep=' ', left=' ') + + extra = sigs.meta.extra + + if extra: + revision = common.get_revision_info(extra.get('revision')) + + print('Additional metadata:') + + rows = [ + ('Author:', format_none(extra.get('author'))), + ('Revision:', '' if revision is None else ''), + ] + common.print_table(rows, colsep=' ', left=' ') + + if revision is not None: + rows = [ + ('Number:', revision['num']), + ('Date:', revision['date']), + ('Author:', revision['author']), + ('Description:', revision['description']), + ] + common.print_table(rows, colsep=' ', left=' ') + + else: + print('No 
additional metadata') @signatures_group.command(no_args_is_help=True) From 0a53a8f8b26ed766d972cfc5b454a7730008fa26 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 3 Jul 2024 00:21:23 -0700 Subject: [PATCH 08/86] Minor edits to CLI code --- gambit/cli/signatures.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gambit/cli/signatures.py b/gambit/cli/signatures.py index 6c17dcf..2af326d 100644 --- a/gambit/cli/signatures.py +++ b/gambit/cli/signatures.py @@ -1,5 +1,6 @@ -from typing import Optional, TextIO, List +from typing import Optional, TextIO import sys +from pathlib import Path import click @@ -49,8 +50,8 @@ def signatures_group(): required=False, ) @click.pass_context -def info(ctx: click.Context, file: str, json: bool, pretty: bool, ids: bool, use_db: bool): - """Inspect GAMBIT signature files.""" +def info(ctx: click.Context, file: Path, json: bool, pretty: bool, ids: bool, use_db: bool): + """Inspect GAMBIT signature (.gs) files.""" common.check_params_group(ctx, ['file', 'use_db'], True, True) common.check_params_group(ctx, ['ids', 'json'], True, False) @@ -158,7 +159,7 @@ def info(ctx: click.Context, file: str, json: bool, pretty: bool, ids: bool, use def create(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], output: str, prefix: Optional[str], k: Optional[int], From 7267f0a3b8ce51ff67d11d8fb802c8a871d00f44 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 9 Jul 2024 20:00:53 -0700 Subject: [PATCH 09/86] Structure tests/ dir as package --- tests/__init__.py | 13 +++++++++++++ tests/benchmarks/__init__.py | 0 tests/cli/__init__.py | 0 tests/cli/{test_cli_common.py => test_common.py} | 0 tests/cli/{test_cli_dist.py => test_dist.py} | 0 tests/cli/{test_cli_query.py => test_query.py} | 0 .../{test_cli_signatures.py => test_signatures.py} | 0 tests/cli/{test_cli_tree.py => test_tree.py} | 0 tests/conftest.py | 3 ++- tests/data/__init__.py | 0 
tests/db/__init__.py | 0 tests/db/{test_db_migrate.py => test_migrate.py} | 0 tests/db/{test_db_models.py => test_models.py} | 0 tests/db/{test_db_refdb.py => test_refdb.py} | 0 tests/db/{test_db_sqla.py => test_sqla.py} | 0 tests/sigs/__init__.py | 0 tests/sigs/{test_sigs_base.py => test_base.py} | 0 tests/sigs/{test_sigs_calc.py => test_calc.py} | 0 .../sigs/{test_sigs_convert.py => test_convert.py} | 0 tests/sigs/{test_sigs_hdf5.py => test_hdf5.py} | 0 tests/util/__init__.py | 0 .../{test_util_indexing.py => test_indexing.py} | 0 tests/util/{test_util_io.py => test_io.py} | 0 tests/util/{test_util_json.py => test_json.py} | 0 tests/util/{test_util_misc.py => test_misc.py} | 0 .../{test_util_progress.py => test_progress.py} | 0 tests/util/{test_util_typing.py => test_typing.py} | 0 27 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 tests/__init__.py create mode 100644 tests/benchmarks/__init__.py create mode 100644 tests/cli/__init__.py rename tests/cli/{test_cli_common.py => test_common.py} (100%) rename tests/cli/{test_cli_dist.py => test_dist.py} (100%) rename tests/cli/{test_cli_query.py => test_query.py} (100%) rename tests/cli/{test_cli_signatures.py => test_signatures.py} (100%) rename tests/cli/{test_cli_tree.py => test_tree.py} (100%) create mode 100644 tests/data/__init__.py create mode 100644 tests/db/__init__.py rename tests/db/{test_db_migrate.py => test_migrate.py} (100%) rename tests/db/{test_db_models.py => test_models.py} (100%) rename tests/db/{test_db_refdb.py => test_refdb.py} (100%) rename tests/db/{test_db_sqla.py => test_sqla.py} (100%) create mode 100644 tests/sigs/__init__.py rename tests/sigs/{test_sigs_base.py => test_base.py} (100%) rename tests/sigs/{test_sigs_calc.py => test_calc.py} (100%) rename tests/sigs/{test_sigs_convert.py => test_convert.py} (100%) rename tests/sigs/{test_sigs_hdf5.py => test_hdf5.py} (100%) create mode 100644 tests/util/__init__.py rename tests/util/{test_util_indexing.py => 
test_indexing.py} (100%) rename tests/util/{test_util_io.py => test_io.py} (100%) rename tests/util/{test_util_json.py => test_json.py} (100%) rename tests/util/{test_util_misc.py => test_misc.py} (100%) rename tests/util/{test_util_progress.py => test_progress.py} (100%) rename tests/util/{test_util_typing.py => test_typing.py} (100%) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..5cc3df0 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,13 @@ +""" +Adding the __init__.py to the tests/ directory (and its subdirectories) makes them all part of the +same package structure. + +- Allows test modules/files to import from each other (including from modules in different + directories, such as files in tests/cli/ importing from tests/testdb.py). +- Does not require test modules to have unique names. + +This necessitates using the "prepend" (or possibly "append"?) import mode (which is the default). +This setup comes with its own set of caveats. See +https://docs.pytest.org/en/7.1.x/explanation/pythonpath.html for a discussion of how test modules +are imported. 
+""" diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cli/test_cli_common.py b/tests/cli/test_common.py similarity index 100% rename from tests/cli/test_cli_common.py rename to tests/cli/test_common.py diff --git a/tests/cli/test_cli_dist.py b/tests/cli/test_dist.py similarity index 100% rename from tests/cli/test_cli_dist.py rename to tests/cli/test_dist.py diff --git a/tests/cli/test_cli_query.py b/tests/cli/test_query.py similarity index 100% rename from tests/cli/test_cli_query.py rename to tests/cli/test_query.py diff --git a/tests/cli/test_cli_signatures.py b/tests/cli/test_signatures.py similarity index 100% rename from tests/cli/test_cli_signatures.py rename to tests/cli/test_signatures.py diff --git a/tests/cli/test_cli_tree.py b/tests/cli/test_tree.py similarity index 100% rename from tests/cli/test_cli_tree.py rename to tests/cli/test_tree.py diff --git a/tests/conftest.py b/tests/conftest.py index e3431b6..4467af3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,8 @@ import numpy as np import pytest from sqlalchemy import create_engine -from testdb import TestDB + +from .testdb import TestDB @pytest.fixture(scope='session') diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/db/__init__.py b/tests/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/db/test_db_migrate.py b/tests/db/test_migrate.py similarity index 100% rename from tests/db/test_db_migrate.py rename to tests/db/test_migrate.py diff --git a/tests/db/test_db_models.py b/tests/db/test_models.py similarity index 100% rename from tests/db/test_db_models.py rename to tests/db/test_models.py diff --git a/tests/db/test_db_refdb.py b/tests/db/test_refdb.py similarity index 100% rename from 
tests/db/test_db_refdb.py rename to tests/db/test_refdb.py diff --git a/tests/db/test_db_sqla.py b/tests/db/test_sqla.py similarity index 100% rename from tests/db/test_db_sqla.py rename to tests/db/test_sqla.py diff --git a/tests/sigs/__init__.py b/tests/sigs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/sigs/test_sigs_base.py b/tests/sigs/test_base.py similarity index 100% rename from tests/sigs/test_sigs_base.py rename to tests/sigs/test_base.py diff --git a/tests/sigs/test_sigs_calc.py b/tests/sigs/test_calc.py similarity index 100% rename from tests/sigs/test_sigs_calc.py rename to tests/sigs/test_calc.py diff --git a/tests/sigs/test_sigs_convert.py b/tests/sigs/test_convert.py similarity index 100% rename from tests/sigs/test_sigs_convert.py rename to tests/sigs/test_convert.py diff --git a/tests/sigs/test_sigs_hdf5.py b/tests/sigs/test_hdf5.py similarity index 100% rename from tests/sigs/test_sigs_hdf5.py rename to tests/sigs/test_hdf5.py diff --git a/tests/util/__init__.py b/tests/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/util/test_util_indexing.py b/tests/util/test_indexing.py similarity index 100% rename from tests/util/test_util_indexing.py rename to tests/util/test_indexing.py diff --git a/tests/util/test_util_io.py b/tests/util/test_io.py similarity index 100% rename from tests/util/test_util_io.py rename to tests/util/test_io.py diff --git a/tests/util/test_util_json.py b/tests/util/test_json.py similarity index 100% rename from tests/util/test_util_json.py rename to tests/util/test_json.py diff --git a/tests/util/test_util_misc.py b/tests/util/test_misc.py similarity index 100% rename from tests/util/test_util_misc.py rename to tests/util/test_misc.py diff --git a/tests/util/test_util_progress.py b/tests/util/test_progress.py similarity index 100% rename from tests/util/test_util_progress.py rename to tests/util/test_progress.py diff --git a/tests/util/test_util_typing.py 
b/tests/util/test_typing.py similarity index 100% rename from tests/util/test_util_typing.py rename to tests/util/test_typing.py From aa821a25ccffb5b0b7745b0eee2d80baf6969634 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 9 Jul 2024 20:35:10 -0700 Subject: [PATCH 10/86] Transition to src/ project layout --- MANIFEST.in | 5 ++--- setup.cfg | 6 ++++++ setup.py | 2 +- {gambit => src/gambit}/__init__.py | 0 {gambit => src/gambit}/__main__.py | 0 {gambit => src/gambit}/_cython/.gitignore | 0 {gambit => src/gambit}/_cython/__init__.py | 0 {gambit => src/gambit}/_cython/kmers.pxd | 0 {gambit => src/gambit}/_cython/kmers.pyx | 0 {gambit => src/gambit}/_cython/metric.pxd | 0 {gambit => src/gambit}/_cython/metric.pyx | 0 {gambit => src/gambit}/_cython/threads.pyx | 0 {gambit => src/gambit}/_cython/types.pxd | 0 {gambit => src/gambit}/classify.py | 0 {gambit => src/gambit}/cli/__init__.py | 0 {gambit => src/gambit}/cli/common.py | 0 {gambit => src/gambit}/cli/debug.py | 0 {gambit => src/gambit}/cli/dist.py | 0 {gambit => src/gambit}/cli/query.py | 0 {gambit => src/gambit}/cli/root.py | 0 {gambit => src/gambit}/cli/signatures.py | 0 {gambit => src/gambit}/cli/test.py | 0 {gambit => src/gambit}/cli/tree.py | 0 {gambit => src/gambit}/cluster.py | 0 {gambit => src/gambit}/db/__init__.py | 0 {gambit => src/gambit}/db/migrate/__init__.py | 0 {gambit => src/gambit}/db/migrate/alembic.ini | 0 {gambit => src/gambit}/db/migrate/alembic/README | 0 {gambit => src/gambit}/db/migrate/alembic/env.py | 0 {gambit => src/gambit}/db/migrate/alembic/script.py.mako | 0 .../migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py | 0 {gambit => src/gambit}/db/models.py | 0 {gambit => src/gambit}/db/refdb.py | 0 {gambit => src/gambit}/db/sqla.py | 0 {gambit => src/gambit}/kmers.py | 0 {gambit => src/gambit}/metric.py | 0 {gambit => src/gambit}/query.py | 0 {gambit => src/gambit}/results/__init__.py | 0 {gambit => src/gambit}/results/archive.py | 0 {gambit => src/gambit}/results/base.py | 0 
{gambit => src/gambit}/results/csv.py | 0 {gambit => src/gambit}/results/json.py | 0 {gambit => src/gambit}/results/test.py | 0 {gambit => src/gambit}/seq.py | 0 {gambit => src/gambit}/sigs/__init__.py | 0 {gambit => src/gambit}/sigs/base.py | 0 {gambit => src/gambit}/sigs/calc.py | 0 {gambit => src/gambit}/sigs/convert.py | 0 {gambit => src/gambit}/sigs/hdf5.py | 0 {gambit => src/gambit}/sigs/test.py | 0 {gambit => src/gambit}/test.py | 0 {gambit => src/gambit}/util/__init__.py | 0 {gambit => src/gambit}/util/dev.py | 0 {gambit => src/gambit}/util/indexing.py | 0 {gambit => src/gambit}/util/io.py | 0 {gambit => src/gambit}/util/json.py | 0 {gambit => src/gambit}/util/misc.py | 0 {gambit => src/gambit}/util/progress.py | 0 {gambit => src/gambit}/util/typing.py | 0 59 files changed, 9 insertions(+), 4 deletions(-) rename {gambit => src/gambit}/__init__.py (100%) rename {gambit => src/gambit}/__main__.py (100%) rename {gambit => src/gambit}/_cython/.gitignore (100%) rename {gambit => src/gambit}/_cython/__init__.py (100%) rename {gambit => src/gambit}/_cython/kmers.pxd (100%) rename {gambit => src/gambit}/_cython/kmers.pyx (100%) rename {gambit => src/gambit}/_cython/metric.pxd (100%) rename {gambit => src/gambit}/_cython/metric.pyx (100%) rename {gambit => src/gambit}/_cython/threads.pyx (100%) rename {gambit => src/gambit}/_cython/types.pxd (100%) rename {gambit => src/gambit}/classify.py (100%) rename {gambit => src/gambit}/cli/__init__.py (100%) rename {gambit => src/gambit}/cli/common.py (100%) rename {gambit => src/gambit}/cli/debug.py (100%) rename {gambit => src/gambit}/cli/dist.py (100%) rename {gambit => src/gambit}/cli/query.py (100%) rename {gambit => src/gambit}/cli/root.py (100%) rename {gambit => src/gambit}/cli/signatures.py (100%) rename {gambit => src/gambit}/cli/test.py (100%) rename {gambit => src/gambit}/cli/tree.py (100%) rename {gambit => src/gambit}/cluster.py (100%) rename {gambit => src/gambit}/db/__init__.py (100%) rename {gambit => 
src/gambit}/db/migrate/__init__.py (100%) rename {gambit => src/gambit}/db/migrate/alembic.ini (100%) rename {gambit => src/gambit}/db/migrate/alembic/README (100%) rename {gambit => src/gambit}/db/migrate/alembic/env.py (100%) rename {gambit => src/gambit}/db/migrate/alembic/script.py.mako (100%) rename {gambit => src/gambit}/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py (100%) rename {gambit => src/gambit}/db/models.py (100%) rename {gambit => src/gambit}/db/refdb.py (100%) rename {gambit => src/gambit}/db/sqla.py (100%) rename {gambit => src/gambit}/kmers.py (100%) rename {gambit => src/gambit}/metric.py (100%) rename {gambit => src/gambit}/query.py (100%) rename {gambit => src/gambit}/results/__init__.py (100%) rename {gambit => src/gambit}/results/archive.py (100%) rename {gambit => src/gambit}/results/base.py (100%) rename {gambit => src/gambit}/results/csv.py (100%) rename {gambit => src/gambit}/results/json.py (100%) rename {gambit => src/gambit}/results/test.py (100%) rename {gambit => src/gambit}/seq.py (100%) rename {gambit => src/gambit}/sigs/__init__.py (100%) rename {gambit => src/gambit}/sigs/base.py (100%) rename {gambit => src/gambit}/sigs/calc.py (100%) rename {gambit => src/gambit}/sigs/convert.py (100%) rename {gambit => src/gambit}/sigs/hdf5.py (100%) rename {gambit => src/gambit}/sigs/test.py (100%) rename {gambit => src/gambit}/test.py (100%) rename {gambit => src/gambit}/util/__init__.py (100%) rename {gambit => src/gambit}/util/dev.py (100%) rename {gambit => src/gambit}/util/indexing.py (100%) rename {gambit => src/gambit}/util/io.py (100%) rename {gambit => src/gambit}/util/json.py (100%) rename {gambit => src/gambit}/util/misc.py (100%) rename {gambit => src/gambit}/util/progress.py (100%) rename {gambit => src/gambit}/util/typing.py (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 71ce312..8e16a00 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -# Alembic -include gambit/db/migrate/alembic.ini -recursive-include 
gambit/db/migrate *.py +graft src +graft docs diff --git a/setup.cfg b/setup.cfg index a84ac3d..d84710f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,8 @@ license_file = LICENSE [options] packages = find: +package_dir = + =src zip_safe = false include_package_data = true @@ -30,6 +32,10 @@ tests_require = pytest +[options.packages.find] +where=src + + [options.entry_points] console_scripts = gambit = gambit.cli:cli diff --git a/setup.py b/setup.py index cbcce77..2385bb8 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ np_include = numpy.get_include() extensions = [Extension( 'gambit._cython.*', - ['gambit/_cython/*.pyx'], + ['src/gambit/_cython/*.pyx'], include_dirs=[np_include], extra_compile_args=['-fopenmp', '-Wno-sign-compare'], extra_link_args=['-fopenmp'], diff --git a/gambit/__init__.py b/src/gambit/__init__.py similarity index 100% rename from gambit/__init__.py rename to src/gambit/__init__.py diff --git a/gambit/__main__.py b/src/gambit/__main__.py similarity index 100% rename from gambit/__main__.py rename to src/gambit/__main__.py diff --git a/gambit/_cython/.gitignore b/src/gambit/_cython/.gitignore similarity index 100% rename from gambit/_cython/.gitignore rename to src/gambit/_cython/.gitignore diff --git a/gambit/_cython/__init__.py b/src/gambit/_cython/__init__.py similarity index 100% rename from gambit/_cython/__init__.py rename to src/gambit/_cython/__init__.py diff --git a/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd similarity index 100% rename from gambit/_cython/kmers.pxd rename to src/gambit/_cython/kmers.pxd diff --git a/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx similarity index 100% rename from gambit/_cython/kmers.pyx rename to src/gambit/_cython/kmers.pyx diff --git a/gambit/_cython/metric.pxd b/src/gambit/_cython/metric.pxd similarity index 100% rename from gambit/_cython/metric.pxd rename to src/gambit/_cython/metric.pxd diff --git a/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx 
similarity index 100% rename from gambit/_cython/metric.pyx rename to src/gambit/_cython/metric.pyx diff --git a/gambit/_cython/threads.pyx b/src/gambit/_cython/threads.pyx similarity index 100% rename from gambit/_cython/threads.pyx rename to src/gambit/_cython/threads.pyx diff --git a/gambit/_cython/types.pxd b/src/gambit/_cython/types.pxd similarity index 100% rename from gambit/_cython/types.pxd rename to src/gambit/_cython/types.pxd diff --git a/gambit/classify.py b/src/gambit/classify.py similarity index 100% rename from gambit/classify.py rename to src/gambit/classify.py diff --git a/gambit/cli/__init__.py b/src/gambit/cli/__init__.py similarity index 100% rename from gambit/cli/__init__.py rename to src/gambit/cli/__init__.py diff --git a/gambit/cli/common.py b/src/gambit/cli/common.py similarity index 100% rename from gambit/cli/common.py rename to src/gambit/cli/common.py diff --git a/gambit/cli/debug.py b/src/gambit/cli/debug.py similarity index 100% rename from gambit/cli/debug.py rename to src/gambit/cli/debug.py diff --git a/gambit/cli/dist.py b/src/gambit/cli/dist.py similarity index 100% rename from gambit/cli/dist.py rename to src/gambit/cli/dist.py diff --git a/gambit/cli/query.py b/src/gambit/cli/query.py similarity index 100% rename from gambit/cli/query.py rename to src/gambit/cli/query.py diff --git a/gambit/cli/root.py b/src/gambit/cli/root.py similarity index 100% rename from gambit/cli/root.py rename to src/gambit/cli/root.py diff --git a/gambit/cli/signatures.py b/src/gambit/cli/signatures.py similarity index 100% rename from gambit/cli/signatures.py rename to src/gambit/cli/signatures.py diff --git a/gambit/cli/test.py b/src/gambit/cli/test.py similarity index 100% rename from gambit/cli/test.py rename to src/gambit/cli/test.py diff --git a/gambit/cli/tree.py b/src/gambit/cli/tree.py similarity index 100% rename from gambit/cli/tree.py rename to src/gambit/cli/tree.py diff --git a/gambit/cluster.py b/src/gambit/cluster.py similarity index 
100% rename from gambit/cluster.py rename to src/gambit/cluster.py diff --git a/gambit/db/__init__.py b/src/gambit/db/__init__.py similarity index 100% rename from gambit/db/__init__.py rename to src/gambit/db/__init__.py diff --git a/gambit/db/migrate/__init__.py b/src/gambit/db/migrate/__init__.py similarity index 100% rename from gambit/db/migrate/__init__.py rename to src/gambit/db/migrate/__init__.py diff --git a/gambit/db/migrate/alembic.ini b/src/gambit/db/migrate/alembic.ini similarity index 100% rename from gambit/db/migrate/alembic.ini rename to src/gambit/db/migrate/alembic.ini diff --git a/gambit/db/migrate/alembic/README b/src/gambit/db/migrate/alembic/README similarity index 100% rename from gambit/db/migrate/alembic/README rename to src/gambit/db/migrate/alembic/README diff --git a/gambit/db/migrate/alembic/env.py b/src/gambit/db/migrate/alembic/env.py similarity index 100% rename from gambit/db/migrate/alembic/env.py rename to src/gambit/db/migrate/alembic/env.py diff --git a/gambit/db/migrate/alembic/script.py.mako b/src/gambit/db/migrate/alembic/script.py.mako similarity index 100% rename from gambit/db/migrate/alembic/script.py.mako rename to src/gambit/db/migrate/alembic/script.py.mako diff --git a/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py b/src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py similarity index 100% rename from gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py rename to src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py diff --git a/gambit/db/models.py b/src/gambit/db/models.py similarity index 100% rename from gambit/db/models.py rename to src/gambit/db/models.py diff --git a/gambit/db/refdb.py b/src/gambit/db/refdb.py similarity index 100% rename from gambit/db/refdb.py rename to src/gambit/db/refdb.py diff --git a/gambit/db/sqla.py b/src/gambit/db/sqla.py similarity index 100% rename from gambit/db/sqla.py rename to src/gambit/db/sqla.py diff --git 
a/gambit/kmers.py b/src/gambit/kmers.py similarity index 100% rename from gambit/kmers.py rename to src/gambit/kmers.py diff --git a/gambit/metric.py b/src/gambit/metric.py similarity index 100% rename from gambit/metric.py rename to src/gambit/metric.py diff --git a/gambit/query.py b/src/gambit/query.py similarity index 100% rename from gambit/query.py rename to src/gambit/query.py diff --git a/gambit/results/__init__.py b/src/gambit/results/__init__.py similarity index 100% rename from gambit/results/__init__.py rename to src/gambit/results/__init__.py diff --git a/gambit/results/archive.py b/src/gambit/results/archive.py similarity index 100% rename from gambit/results/archive.py rename to src/gambit/results/archive.py diff --git a/gambit/results/base.py b/src/gambit/results/base.py similarity index 100% rename from gambit/results/base.py rename to src/gambit/results/base.py diff --git a/gambit/results/csv.py b/src/gambit/results/csv.py similarity index 100% rename from gambit/results/csv.py rename to src/gambit/results/csv.py diff --git a/gambit/results/json.py b/src/gambit/results/json.py similarity index 100% rename from gambit/results/json.py rename to src/gambit/results/json.py diff --git a/gambit/results/test.py b/src/gambit/results/test.py similarity index 100% rename from gambit/results/test.py rename to src/gambit/results/test.py diff --git a/gambit/seq.py b/src/gambit/seq.py similarity index 100% rename from gambit/seq.py rename to src/gambit/seq.py diff --git a/gambit/sigs/__init__.py b/src/gambit/sigs/__init__.py similarity index 100% rename from gambit/sigs/__init__.py rename to src/gambit/sigs/__init__.py diff --git a/gambit/sigs/base.py b/src/gambit/sigs/base.py similarity index 100% rename from gambit/sigs/base.py rename to src/gambit/sigs/base.py diff --git a/gambit/sigs/calc.py b/src/gambit/sigs/calc.py similarity index 100% rename from gambit/sigs/calc.py rename to src/gambit/sigs/calc.py diff --git a/gambit/sigs/convert.py 
b/src/gambit/sigs/convert.py similarity index 100% rename from gambit/sigs/convert.py rename to src/gambit/sigs/convert.py diff --git a/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py similarity index 100% rename from gambit/sigs/hdf5.py rename to src/gambit/sigs/hdf5.py diff --git a/gambit/sigs/test.py b/src/gambit/sigs/test.py similarity index 100% rename from gambit/sigs/test.py rename to src/gambit/sigs/test.py diff --git a/gambit/test.py b/src/gambit/test.py similarity index 100% rename from gambit/test.py rename to src/gambit/test.py diff --git a/gambit/util/__init__.py b/src/gambit/util/__init__.py similarity index 100% rename from gambit/util/__init__.py rename to src/gambit/util/__init__.py diff --git a/gambit/util/dev.py b/src/gambit/util/dev.py similarity index 100% rename from gambit/util/dev.py rename to src/gambit/util/dev.py diff --git a/gambit/util/indexing.py b/src/gambit/util/indexing.py similarity index 100% rename from gambit/util/indexing.py rename to src/gambit/util/indexing.py diff --git a/gambit/util/io.py b/src/gambit/util/io.py similarity index 100% rename from gambit/util/io.py rename to src/gambit/util/io.py diff --git a/gambit/util/json.py b/src/gambit/util/json.py similarity index 100% rename from gambit/util/json.py rename to src/gambit/util/json.py diff --git a/gambit/util/misc.py b/src/gambit/util/misc.py similarity index 100% rename from gambit/util/misc.py rename to src/gambit/util/misc.py diff --git a/gambit/util/progress.py b/src/gambit/util/progress.py similarity index 100% rename from gambit/util/progress.py rename to src/gambit/util/progress.py diff --git a/gambit/util/typing.py b/src/gambit/util/typing.py similarity index 100% rename from gambit/util/typing.py rename to src/gambit/util/typing.py From 152d0c06f0e07511cb69838322359c3ffdee4381 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 9 Jul 2024 21:05:58 -0700 Subject: [PATCH 11/86] Specify build-backend in pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 
insertion(+) diff --git a/pyproject.toml b/pyproject.toml index e42e25d..a3fd32e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,3 +7,4 @@ requires = [ # lower as the ABI is forward- but not backwards-compatible. "oldest-supported-numpy", ] +build-backend = "setuptools.build_meta" From cbff5d7dbf0f43f5f5956868dd2638318bc77103 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 9 Jul 2024 21:25:12 -0700 Subject: [PATCH 12/86] Update github actions --- .github/workflows/ci.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 617ee54..d7f7468 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,11 +14,15 @@ jobs: python-version: [3.9, '3.10', 3.11, 3.12] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: | + pyproject.toml + setup.cfg - name: Install dependencies run: | @@ -30,7 +34,5 @@ jobs: pip install . 
- name: Test with pytest - env: - PY_IGNORE_IMPORTMISMATCH: 1 run: | pytest From f98b09832a5135061f6919d400ca3ce2fc5b2ef3 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 11 Jul 2024 21:55:15 -0700 Subject: [PATCH 13/86] Update dependencies --- setup.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d84710f..8c97121 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,10 +20,12 @@ python_requires = >= 3.9 install_requires = numpy~=1.13 sqlalchemy~=1.1 + # Seq stores data as bytes biopython~=1.79 alembic~=1.0 attrs>=20 - cattrs~=1.0 + # Minimum for 3.12, also introduces potentially breaking changes + cattrs>=23.2 click>=7.0 h5py~=3.0 scipy~=1.7 From f07cfad013f96dd51af6b4495acc8b0f9219f31b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 11 Jul 2024 21:56:04 -0700 Subject: [PATCH 14/86] Move gambit.sigs.test module to test code dir --- src/gambit/sigs/test.py => tests/sigs/common.py | 4 ++-- tests/sigs/test_base.py | 2 +- tests/sigs/test_hdf5.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename src/gambit/sigs/test.py => tests/sigs/common.py (97%) diff --git a/src/gambit/sigs/test.py b/tests/sigs/common.py similarity index 97% rename from src/gambit/sigs/test.py rename to tests/sigs/common.py index 063f237..78c5b79 100644 --- a/src/gambit/sigs/test.py +++ b/tests/sigs/common.py @@ -1,9 +1,9 @@ -"""Utilities for testing signature set types.""" +"""Common code for testing signature set types.""" import pytest import numpy as np -from .base import AbstractSignatureArray, sigarray_eq +from gambit.sigs.base import AbstractSignatureArray, sigarray_eq class AbstractSignatureArrayTests: diff --git a/tests/sigs/test_base.py b/tests/sigs/test_base.py index 8024a75..9f65fe2 100644 --- a/tests/sigs/test_base.py +++ b/tests/sigs/test_base.py @@ -7,7 +7,7 @@ AnnotatedSignatures, sigarray_eq, SignaturesMeta from gambit.kmers import KmerSpec from gambit.test import make_signatures -from gambit.sigs.test import 
AbstractSignatureArrayTests +from .common import AbstractSignatureArrayTests @pytest.fixture(params=['u8', 'i8', 'u4']) diff --git a/tests/sigs/test_hdf5.py b/tests/sigs/test_hdf5.py index 941039a..5e75c44 100644 --- a/tests/sigs/test_hdf5.py +++ b/tests/sigs/test_hdf5.py @@ -7,9 +7,9 @@ from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, dump_signatures_hdf5 from gambit.sigs import SignaturesMeta, SignatureList, AnnotatedSignatures from gambit.sigs.base import SignaturesFileError -from gambit.sigs.test import AbstractSignatureArrayTests from gambit.kmers import KmerSpec from gambit.test import make_signatures +from .common import AbstractSignatureArrayTests # JSON data to use for metadata extra field From d3517f3c8aefdf47fd420624bdeb34196e9051e2 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 11 Jul 2024 22:06:35 -0700 Subject: [PATCH 15/86] Move gambit.test module to tests/ --- tests/benchmarks/benchmark_signatures.py | 2 +- src/gambit/test.py => tests/common.py | 0 tests/sigs/test_base.py | 2 +- tests/sigs/test_calc.py | 2 +- tests/sigs/test_convert.py | 2 +- tests/sigs/test_hdf5.py | 2 +- tests/test_classify.py | 2 +- tests/test_kmers.py | 2 +- tests/test_metric.py | 2 +- tests/test_seq.py | 2 +- tests/{test_test.py => test_tests_common.py} | 14 +++++++------- 11 files changed, 16 insertions(+), 16 deletions(-) rename src/gambit/test.py => tests/common.py (100%) rename tests/{test_test.py => test_tests_common.py} (88%) diff --git a/tests/benchmarks/benchmark_signatures.py b/tests/benchmarks/benchmark_signatures.py index d0b2450..1682e9b 100644 --- a/tests/benchmarks/benchmark_signatures.py +++ b/tests/benchmarks/benchmark_signatures.py @@ -5,7 +5,7 @@ from gambit.kmers import KmerSpec from gambit.sigs.calc import calc_signature, ArrayAccumulator, SetAccumulator -from gambit.test import random_seq +from ..common import random_seq @pytest.fixture(scope='module', params=[10**4, 10**6]) diff --git a/src/gambit/test.py 
b/tests/common.py similarity index 100% rename from src/gambit/test.py rename to tests/common.py diff --git a/tests/sigs/test_base.py b/tests/sigs/test_base.py index 9f65fe2..0761b0d 100644 --- a/tests/sigs/test_base.py +++ b/tests/sigs/test_base.py @@ -6,7 +6,7 @@ from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, \ AnnotatedSignatures, sigarray_eq, SignaturesMeta from gambit.kmers import KmerSpec -from gambit.test import make_signatures +from ..common import make_signatures from .common import AbstractSignatureArrayTests diff --git a/tests/sigs/test_calc.py b/tests/sigs/test_calc.py index 3bf7f9a..efed455 100644 --- a/tests/sigs/test_calc.py +++ b/tests/sigs/test_calc.py @@ -10,10 +10,10 @@ from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures from gambit.kmers import KmerSpec, index_to_kmer from gambit.seq import SEQ_TYPES, revcomp, SequenceFile -from gambit.test import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq import gambit.util.io as ioutil from gambit.sigs import sigarray_eq from gambit.util.progress import check_progress +from ..common import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq KSPEC = KmerSpec(11, 'AGTAC') diff --git a/tests/sigs/test_convert.py b/tests/sigs/test_convert.py index 59ccbb6..964cc15 100644 --- a/tests/sigs/test_convert.py +++ b/tests/sigs/test_convert.py @@ -6,7 +6,7 @@ from gambit.sigs.convert import dense_to_sparse, sparse_to_dense, can_convert, \ check_can_convert, convert_dense, convert_sparse from gambit.kmers import KmerSpec -from gambit.test import random_seq +from ..common import random_seq def test_dense_sparse_conversion(): diff --git a/tests/sigs/test_hdf5.py b/tests/sigs/test_hdf5.py index 5e75c44..05b0863 100644 --- a/tests/sigs/test_hdf5.py +++ b/tests/sigs/test_hdf5.py @@ -8,7 +8,7 @@ from gambit.sigs import SignaturesMeta, SignatureList, AnnotatedSignatures from gambit.sigs.base import SignaturesFileError from 
gambit.kmers import KmerSpec -from gambit.test import make_signatures +from ..common import make_signatures from .common import AbstractSignatureArrayTests diff --git a/tests/test_classify.py b/tests/test_classify.py index 5c3a522..70cffe9 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -4,7 +4,7 @@ from gambit.classify import matching_taxon, find_matches, consensus_taxon, GenomeMatch from gambit.db import Taxon, AnnotatedGenome -from gambit.test import make_lineage +from .common import make_lineage def test_matching_taxon(): diff --git a/tests/test_kmers.py b/tests/test_kmers.py index fb0ab5a..14b0f21 100644 --- a/tests/test_kmers.py +++ b/tests/test_kmers.py @@ -7,7 +7,7 @@ from gambit import kmers from gambit.kmers import KmerSpec import gambit.util.json as gjson -from gambit.test import convert_seq, make_kmer_seq +from .common import convert_seq, make_kmer_seq class TestIndices: diff --git a/tests/test_metric.py b/tests/test_metric.py index 6ac340e..2784804 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -10,8 +10,8 @@ from gambit.sigs.convert import sparse_to_dense from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures from gambit.kmers import KmerSpec -from gambit.test import make_signatures from gambit.util.progress import check_progress +from .common import make_signatures @pytest.fixture( diff --git a/tests/test_seq.py b/tests/test_seq.py index 379e560..4896eb5 100644 --- a/tests/test_seq.py +++ b/tests/test_seq.py @@ -11,7 +11,7 @@ from gambit.seq import SequenceFile, revcomp from gambit.kmers import nkmers, index_to_kmer from gambit.util.misc import zip_strict -from gambit.test import random_seq +from .common import random_seq # Complements to nucleotide ASCII codes diff --git a/tests/test_test.py b/tests/test_tests_common.py similarity index 88% rename from tests/test_test.py rename to tests/test_tests_common.py index a3f1da5..899f45e 100644 --- a/tests/test_test.py +++ 
b/tests/test_tests_common.py @@ -1,13 +1,13 @@ -"""Test gambit.test module.""" +"""Test the common.py test module.""" import pytest import numpy as np -from gambit import test from gambit.kmers import KmerSpec, kmer_to_index, nkmers from gambit.seq import revcomp from gambit.sigs.convert import dense_to_sparse from gambit.util.progress import get_progress +from . import common @pytest.mark.parametrize('k', [4, 6, 8]) @@ -15,7 +15,7 @@ @pytest.mark.parametrize('dtype', [np.dtype('u8'), np.dtype('u4')]) def test_make_signatures(k, n, dtype): np.random.seed(0) - sigs = test.make_signatures(k, n, dtype) + sigs = common.make_signatures(k, n, dtype) assert len(sigs) == n for i, sig in enumerate(sigs): @@ -32,7 +32,7 @@ def test_make_signatures(k, n, dtype): @pytest.mark.parametrize('chars', ['ACGT', 'XYZ']) def test_random_seq(n, chars): np.random.seed(0) - seq = test.random_seq(n, chars) + seq = common.random_seq(n, chars) assert isinstance(seq, bytes) assert len(seq) == n assert all(chr(c) in chars for c in seq) @@ -41,7 +41,7 @@ def test_random_seq(n, chars): @pytest.mark.parametrize('pattern', [b'N', b'ABC']) @pytest.mark.parametrize('n', [100, 1000]) def test_fill_bytearray(pattern, n): - arr = test.fill_bytearray(pattern, n) + arr = common.fill_bytearray(pattern, n) assert isinstance(arr, bytearray) assert len(arr) == n @@ -55,7 +55,7 @@ def test_fill_bytearray(pattern, n): @pytest.mark.parametrize('n_interval', [None, 5]) def test_make_kmer_seq(kspec, seqlen, kmer_interval, n_interval): np.random.seed(0) - seq, sig = test.make_kmer_seq(kspec, seqlen, kmer_interval, n_interval) + seq, sig = common.make_kmer_seq(kspec, seqlen, kmer_interval, n_interval) assert len(seq) == seqlen vec = np.zeros(kspec.nkmers, dtype=bool) @@ -80,7 +80,7 @@ def test_make_kmer_seq(kspec, seqlen, kmer_interval, n_interval): def test_make_lineage(): thresholds = [.1, .2, None, .3] n = len(thresholds) - taxa = test.make_lineage(thresholds) + taxa = common.make_lineage(thresholds) assert 
len(taxa) == n for i in range(n): From 6f3cdb8e0f838b82a2dcaa74ee4100d4549fa1a4 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 14:06:13 -0700 Subject: [PATCH 16/86] Relocate test function --- src/gambit/results/base.py | 9 --------- tests/test_results.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/gambit/results/base.py b/src/gambit/results/base.py index c44220e..c9440d0 100644 --- a/src/gambit/results/base.py +++ b/src/gambit/results/base.py @@ -1,7 +1,6 @@ import json from abc import ABC, abstractmethod from typing import IO, Union, TextIO -from io import StringIO from attr import asdict, attrs, attrib @@ -29,14 +28,6 @@ def export(self, file_or_path: Union[FilePath, IO], results: QueryResults): """ -def export_to_buffer(results: QueryResults, exporter) -> StringIO: - """Export query results to a `StringIO` buffer.""" - buf = StringIO() - exporter.export(buf, results) - buf.seek(0) - return buf - - def _todict(obj, attrs): return {a: getattr(obj, a) for a in attrs} diff --git a/tests/test_results.py b/tests/test_results.py index 1c95371..5f78a6a 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -1,3 +1,5 @@ +from io import StringIO + import pytest from gambit.query import QueryResults, QueryResultItem, QueryInput, QueryParams @@ -5,13 +7,20 @@ from gambit.db import ReferenceGenomeSet, Genome from gambit.sigs import SignaturesMeta from gambit.seq import SequenceFile -from gambit.results.base import export_to_buffer from gambit.results.json import JSONResultsExporter from gambit.results.csv import CSVResultsExporter from gambit.results.archive import ResultsArchiveReader, ResultsArchiveWriter from gambit.results.test import check_json_results, check_csv_results +def export_to_buffer(results: QueryResults, exporter) -> StringIO: + """Export query results to a `StringIO` buffer.""" + buf = StringIO() + exporter.export(buf, results) + buf.seek(0) + return buf + + @pytest.fixture() def 
session(testdb): return testdb.Session() From 36fac50add6244301a247dd9d425036ef33d7c10 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 14:11:50 -0700 Subject: [PATCH 17/86] Type hints --- src/gambit/results/test.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gambit/results/test.py b/src/gambit/results/test.py index 9bb7e80..bbc1248 100644 --- a/src/gambit/results/test.py +++ b/src/gambit/results/test.py @@ -2,28 +2,30 @@ import csv import json -from typing import TextIO +from typing import TextIO, Any, Iterable, Optional from pathlib import Path import numpy as np from gambit.util.json import to_json from gambit.query import QueryResults +from gambit.classify import GenomeMatch from gambit.util.misc import zip_strict +from gambit.db.models import AnnotatedGenome, Taxon -def cmp_json_attrs(data, obj, attrnames): +def cmp_json_attrs(data: dict[str, Any], obj, attrnames: Iterable[str]): for attr in attrnames: assert data[attr] == getattr(obj, attr) -def cmp_taxon_json(taxon_data, taxon): +def cmp_taxon_json(taxon_data: dict[str, Any], taxon: Optional[Taxon]): if taxon is None: assert taxon_data is None else: assert taxon_data is not None cmp_json_attrs(taxon_data, taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) -def cmp_annnotatedgenome_json(genome_data, genome): +def cmp_annnotatedgenome_json(genome_data: dict[str, Any], genome: AnnotatedGenome): assert genome_data['id'] == genome.genome_id cmp_json_attrs( genome_data, @@ -33,7 +35,7 @@ def cmp_annnotatedgenome_json(genome_data, genome): for taxon_data, taxon in zip_strict(genome_data['taxonomy'], genome.taxon.ancestors(True)): cmp_taxon_json(taxon_data, taxon) -def cmp_genomematch_json(match_data, match): +def cmp_genomematch_json(match_data, match: GenomeMatch): assert np.isclose(match_data['distance'], match.distance) cmp_annnotatedgenome_json(match_data['genome'], match.genome) @@ -105,7 +107,7 @@ def check_json_results(file: 
TextIO, cmp_genomematch_json(match_data, match) -def cmp_csv_taxon(row, taxon, prefix): +def cmp_csv_taxon(row, taxon: Optional[Taxon], prefix: str): if taxon is None: assert row[prefix + '.name'] == '' From 6a93bab9f82b779399da5683354535c6697c9b8d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 28 Jan 2024 00:03:59 -0800 Subject: [PATCH 18/86] Remove is_importable func --- src/gambit/cli/debug.py | 7 +++---- src/gambit/util/misc.py | 6 ------ tests/util/test_misc.py | 8 -------- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/gambit/cli/debug.py b/src/gambit/cli/debug.py index b2f827d..33d1068 100644 --- a/src/gambit/cli/debug.py +++ b/src/gambit/cli/debug.py @@ -49,12 +49,11 @@ def shell(ctx, use_ipython): Attempts to launch an IPython interactive interpreter if it is installed, otherwise falls back on standard Python REPL. """ - from gambit.util.misc import is_importable - if use_ipython is None: - if is_importable('IPython'): + try: + import IPython use_ipython = True - else: + except ImportError: click.echo('IPython not available, defaulting to built-in Python REPL.', err=True) use_ipython = False diff --git a/src/gambit/util/misc.py b/src/gambit/util/misc.py index d40128d..18c905b 100644 --- a/src/gambit/util/misc.py +++ b/src/gambit/util/misc.py @@ -106,12 +106,6 @@ def wrapper(self, cls, *rest, **kw): return wrapper -def is_importable(module: str) -> bool: - """Check if the specified module is importable, without actually importing it.""" - from importlib.util import find_spec - return find_spec(module) is not None - - def join_list_human(strings: Iterable[str], conj: str='and') -> str: """Join items into a single human-readable string with commas and the given conjunction.""" strings = list(strings) diff --git a/tests/util/test_misc.py b/tests/util/test_misc.py index e9963b7..61a12fa 100644 --- a/tests/util/test_misc.py +++ b/tests/util/test_misc.py @@ -54,14 +54,6 @@ def test_chunk_slices(): assert list(misc.chunk_slices(0, 10)) 
== [] -def test_is_importable(): - """Test the is_importable() function.""" - assert misc.is_importable('urllib') - assert misc.is_importable('urllib.request') - assert not misc.is_importable('aklhaskhdkslkdjahkdf') - assert not misc.is_importable('urllib.aklhaskhdkslkdjahkdf') - - def test_join_list_human(): l = ['foo', 'bar', 'baz'] assert misc.join_list_human(l[:1]) == 'foo' From 81b3c61078007c5a9b41745357a0651371d7d277 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 14:41:44 -0700 Subject: [PATCH 19/86] Update to zip_strict() func --- src/gambit/util/misc.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/gambit/util/misc.py b/src/gambit/util/misc.py index 18c905b..e797a43 100644 --- a/src/gambit/util/misc.py +++ b/src/gambit/util/misc.py @@ -1,11 +1,33 @@ """Utility code that doesn't fit anywhere else.""" import sys -from typing import Iterator, Tuple, Callable, Iterable +from typing import Iterator, Callable, Iterable, TypeVar, overload from functools import singledispatch, wraps -def zip_strict(*iterables: Iterator) -> Iterator[Tuple]: +T = TypeVar('T') +T2 = TypeVar('T2') +T3 = TypeVar('T3') +T4 = TypeVar('T4') + + +# Type-hinting zip() properly isn't really possible short of adding overloads for all possible #'s +# of arguments. Just do it for 2-4 here. +# Source code for https://github.com/python/typeshed/ does basically this. 
+ +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], /) -> Iterator[tuple[T, T2]]: + pass # 2-iterable case + +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], it3: Iterable[T3], /) -> Iterator[tuple[T, T2, T3]]: + pass # 3-argument case + +@overload +def zip_strict(it1: Iterable[T], it2: Iterable[T2], it3: Iterable[T3], it4: Iterable[T4], /) -> Iterator[tuple[T, T2, T3, T4]]: + pass # 4-argument case + +def zip_strict(*iterables: Iterable) -> Iterator[tuple]: """Like the builtin ``zip`` function but raises an error if any argument is exhausted before the others. Parameters @@ -17,6 +39,16 @@ def zip_strict(*iterables: Iterator) -> Iterator[Tuple]: ------ ValueError """ + if sys.version_info >= (3, 10): + # Version 3.10+ has strict parameter for builtin zip() + return zip(*iterables, strict=True) + else: + return _zip_strict(*iterables) + + +def _zip_strict(*iterables: Iterable) -> Iterator[tuple]: + """Implementation for Python 3.9.""" + # Builtin zip gives empty output on empty input if not iterables: return From 48ca06510fce1e01b1483945b42370236ba55de0 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 14:47:43 -0700 Subject: [PATCH 20/86] Possibly make JSON dumping faster --- src/gambit/util/json.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gambit/util/json.py b/src/gambit/util/json.py index 409e20f..998a034 100644 --- a/src/gambit/util/json.py +++ b/src/gambit/util/json.py @@ -57,8 +57,7 @@ def dump(obj, f: TextIO, **kw): \\**kw Keyword arguments to :func:`json.dump`. 
""" - data = to_json(obj) - json.dump(data, f, **kw) + json.dump(obj, f, default=converter.unstructure, **kw) def load(f: TextIO, cls=Any): @@ -93,7 +92,7 @@ def dumps(obj, **kw) -> str: ------- str """ - return json.dumps(to_json(obj), **kw) + return json.dumps(obj, default=converter.unstructure, **kw) def loads(s: str, cls=Any): From f2eed723f38ccb21a53703f3934470acb5a13625 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 15:23:34 -0700 Subject: [PATCH 21/86] Simplifications to "archive" results format code --- src/gambit/results/archive.py | 141 ++++++++++++---------------------- 1 file changed, 50 insertions(+), 91 deletions(-) diff --git a/src/gambit/results/archive.py b/src/gambit/results/archive.py index 4d29836..1f288cc 100644 --- a/src/gambit/results/archive.py +++ b/src/gambit/results/archive.py @@ -4,24 +4,15 @@ from typing import Union, IO, Any from functools import singledispatchmethod -from attr import attrs, attrib, asdict, has as has_attrs from sqlalchemy.orm import Session -from gambit.query import QueryResultItem, QueryResults -from gambit.classify import ClassifierResult, GenomeMatch +from gambit.query import QueryResults from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome import gambit.util.json as gjson from gambit.util.io import FilePath, maybe_open -from gambit.util.misc import type_singledispatchmethod -from gambit.util.typing import is_optional, unwrap_optional -from .base import asdict_default, BaseJSONResultsExporter +from .base import BaseJSONResultsExporter, _todict -def _todict(obj, attrs): - return {a: getattr(obj, a) for a in attrs} - - -@attrs() class ResultsArchiveWriter(BaseJSONResultsExporter): """Exports query results to "archive" format which captures all stored data. @@ -29,29 +20,12 @@ class ResultsArchiveWriter(BaseJSONResultsExporter): The exported data can be read and converted back into an identical :class:`QueryResults` object using :class:`.ResultsArchiveReader`. 
- Attributes - ---------- - install_info - Add results of :func:`gambit.util.dev.install_info` to the ``QueryResults.extra`` dict. + Only the ID attributes of database models are saved, when loading the saved results the models + are recreated by database queries. """ - install_info: bool = attrib(default=False) to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) - to_json.register(ClassifierResult, asdict_default) - to_json.register(GenomeMatch, asdict_default) - to_json.register(QueryResultItem, asdict_default) - - @to_json.register(QueryResults) - def _queryresults_to_json(self, results): - data = asdict(results) - - if self.install_info: - from gambit.util.dev import install_info - data['extra']['install_info'] = install_info() - - return data - @to_json.register(ReferenceGenomeSet) def _genomeset_to_json(self, gset: ReferenceGenomeSet): return _todict(gset, ['key', 'version']) @@ -78,71 +52,25 @@ class ResultsArchiveReader: def __init__(self, session): self.session = session - @type_singledispatchmethod - def _from_json(self, cls, data, ctx): - """Default implementation.""" - if is_optional(cls): - if data is None: - return None - else: - return self._from_json(unwrap_optional(cls), data, ctx) + self._init_converter() - if has_attrs(cls): - return self._attrs_from_json(cls, data, ctx) - else: - return gjson.from_json(data, cls) + # Loading the Taxon and AnnotatedGenome instances from the database requires not just their + # ID (key attribute) values but also the ReferenceGenomeSet they belong to. Setting this + # attribute to the genome set instance of the results currently being loaded is a somewhat + # hacky method of passing this information to the unstructuring hook functions. There isn't + # a much better way of doing this without reimplementing a lot of the cattrs machinery. + self._current_genomeset = None - def _attrs_from_json(self, cls, data, ctx, values=None): - """Create an attrs class instance from JSON data. 
+ def _init_converter(self): + """Initialize the cattrs converter instance. - ``values`` is a dictionary of already-deserialized attribute values. + This is a clone of the converter instance in gambit.util.json, with additional structuring + hooks registered to methods on this instance. """ - kw = dict() - - for a in cls.__attrs_attrs__: - if values is not None and a.name in values: - kw[a.name] = values[a.name] - else: - atype = Any if a.type is None else a.type - kw[a.name] = self._from_json(atype, data[a.name], ctx) - - return cls(**kw) - - @_from_json.register(ReferenceGenomeSet) - def _genomeset_from_json(self, cls, data, ctx): - assert data is not None - return self.session.query(ReferenceGenomeSet).filter_by(key=data['key'], version=data['version']).one() - - @_from_json.register(AnnotatedGenome) - def _genome_from_json(self, cls, data, ctx): - key = data['key'] - gset_id = ctx['genomeset'].id - return self.session.query(AnnotatedGenome)\ - .join(Genome)\ - .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ - .one() - - @_from_json.register(Taxon) - def _taxon_from_json(self, cls, data, ctx): - key = data['key'] - gset_id = ctx['genomeset'].id - return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() - - @_from_json.register(QueryResultItem) - def _result_item_from_json(self, cls, data, ctx): - values = dict( - closest_genomes=[self._from_json(GenomeMatch, genome_data, ctx) for genome_data in data['closest_genomes']], - ) - return self._attrs_from_json(QueryResultItem, data, ctx, values) - - def results_from_json(self, data): - genomeset = self._from_json(ReferenceGenomeSet, data['genomeset'], dict()) - - # Add genome set to context so the correct AnnotatedGenomes can be loaded. 
- ctx = dict(genomeset=genomeset) - - items = [self._from_json(QueryResultItem, item, ctx) for item in data['items']] - return self._attrs_from_json(QueryResults, data, ctx, dict(genomeset=genomeset, items=items)) + self._converter = gjson.converter.copy() + self._converter.register_structure_hook(ReferenceGenomeSet, self._structure_genomeset) + self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome) + self._converter.register_structure_hook(Taxon, self._structure_taxon) def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: """Read query results from JSON file. @@ -156,3 +84,34 @@ def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: data = json.load(f) return self.results_from_json(data) + + def results_from_json(self, data: dict[str, Any]) -> QueryResults: + """Recreate results object from loaded JSON data.""" + + gset_key = data['genomeset']['key'] + gset_version = data['genomeset']['version'] + self._current_genomeset = self.session.query(ReferenceGenomeSet) \ + .filter_by(key=gset_key, version=gset_version) \ + .one() + + try: + return self._converter.structure(data, QueryResults) + + finally: + self._current_genomeset = None + + def _structure_genomeset(self, data: dict[str, Any], cls=None): + return self._current_genomeset + + def _structure_genome(self, data: dict[str, Any], cls=None) -> AnnotatedGenome: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(AnnotatedGenome)\ + .join(Genome)\ + .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ + .one() + + def _structure_taxon(self, data: dict[str, Any], cls=None) -> Taxon: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() From f3e43532bf84eef1e3e216db6cff451e4dcf7c83 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 16:19:54 -0700 Subject: [PATCH 22/86] Remove gambit.util.typing module --- 
src/gambit/util/typing.py | 43 ----------------------------------- tests/util/test_typing.py | 47 --------------------------------------- 2 files changed, 90 deletions(-) delete mode 100644 src/gambit/util/typing.py delete mode 100644 tests/util/test_typing.py diff --git a/src/gambit/util/typing.py b/src/gambit/util/typing.py deleted file mode 100644 index c705fe1..0000000 --- a/src/gambit/util/typing.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Utilities based on the built-in ``typing`` module.""" - -import typing -from typing import Union, Any - - -def is_union(T) -> bool: - """Check if a type annotation is a *parameterized* :class:`typing.Union`. - - Parameters - ---------- - T - Result of ``Union[A, B, ...]``. - """ - return isinstance(T, typing._GenericAlias) and T.__origin__ is typing.Union - - -def union_types(T) -> tuple: - """Get the types from a parameterized :class:`typing.Union`. - - Parameters - ---------- - T - Result of ``Union[A, B, ...]``. - """ - return T.__args__ - - -def is_optional(T) -> bool: - """Check if a parametrized union type is equivalent to one returned by :data:`typing.Optional`.""" - if not is_union(T): - return False - types = union_types(T) - return len(types) == 2 and type(None) in types - - -def unwrap_optional(u): - """Get ``T`` from ``typing.Optional[T]``.""" - for T in union_types(u): - if T is not type(None): - return T - - raise ValueError(f'Not an Optional type: {u!r}') diff --git a/tests/util/test_typing.py b/tests/util/test_typing.py deleted file mode 100644 index 2f19ba9..0000000 --- a/tests/util/test_typing.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Test the gambit.util.typing submodule.""" - -import typing -from typing import Union, Optional - -from gambit.util.typing import is_union, union_types, is_optional, unwrap_optional - - -def test_is_union(): - """Test the is_union() function.""" - assert is_union(Union[int, str]) - assert is_union(Union[int, str, bool]) - assert is_union(Optional[int]) - assert not is_union(Union) - 
assert not is_union(Optional) - assert not is_union(int) - assert not is_union(None) - assert not is_union(typing.List) - assert not is_union(typing.Any) - - -def test_union_types(): - """Test the union_types() function.""" - assert union_types(Union[int, str]) == (int, str) - assert union_types(Union[int, str, bool]) == (int, str, bool) - assert union_types(Optional[int]) == (int, type(None)) - - -def test_is_optional(): - """Test the is_optional() function.""" - assert is_optional(Optional[int]) - assert is_optional(Union[int, None]) - assert is_optional(Union[None, int]) - assert not is_optional(Union[int, str]) - assert not is_optional(Union) - assert not is_optional(Optional) - assert not is_optional(int) - assert not is_optional(None) - assert not is_optional(type(None)) - assert not is_optional(typing.Any) - - -def test_unwrap_optional(): - """Test the unwrap_optional() function.""" - assert unwrap_optional(Optional[int]) is int - assert unwrap_optional(Union[int, None]) is int - assert unwrap_optional(Union[None, int]) is int From 6b4052eaf25eeaa4f8e38aa5a213cd55184fc4ab Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 16:21:36 -0700 Subject: [PATCH 23/86] Remove gambit.util.dev module --- src/gambit/util/dev.py | 96 ------------------------------------------ 1 file changed, 96 deletions(-) delete mode 100644 src/gambit/util/dev.py diff --git a/src/gambit/util/dev.py b/src/gambit/util/dev.py deleted file mode 100644 index d23c089..0000000 --- a/src/gambit/util/dev.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Development tools.""" - -from pathlib import Path -import subprocess -import shutil -from typing import Dict, Any - -import gambit -from gambit.util.io import FilePath -from gambit.util.misc import zip_strict - - -_INSTALL_INFO = None - - -def get_commit_info(repo_path: FilePath, commit: str = 'HEAD') -> Dict[str, str]: - """Get metadata on a git commit. - - This calls the ``git`` command, so it must be installed and available. 
- - Parameters - ---------- - repo_path - Path to git repo. - commit - Commit to get information on. - """ - fields = [ - ('hash', '%H'), - ('author', '%an <%ae>'), - ('author_date', '%aI'), - ('commit', '%cn <%ce>'), - ('commit_date', '%cI'), - ('subject', '%s'), - ] - - fmt_str = '%n'.join(fmt for name, fmt in fields) - cmd = ['git', 'show', '-s', '--format=' + fmt_str, commit] - - result = subprocess.run(cmd, cwd=repo_path, capture_output=True, check=True, text=True) - - lines = result.stdout.splitlines() - assert len(lines) == len(fields) - return {name: line for (name, fmt), line in zip_strict(fields, lines)} - - -def _install_info(): - info = dict(pkg_dir=None, repo_dir=None, commit=None) - - if not hasattr(gambit, '__path__'): - info['status'] = 'gambit module has no __path__ attribute.' - return info - - if len(gambit.__path__) != 1: - info['status'] = f'Expected gambit.__path__ to contain single item, got {gambit.__path__!r}' - return info - - pkg_dir = info['pkg_dir'] = Path(gambit.__path__[0]) - repo_dir = pkg_dir.parent - - if not (repo_dir / '.git').is_dir(): - info['status'] = 'Parent of package directory not a git repo (has no .git subdirectory).' - return info - - info['repo_dir'] = repo_dir - - if shutil.which('git') is None: - info['status'] = 'git command not found' - return info - - try: - commit = get_commit_info(repo_dir) - except subprocess.SubprocessError as e: - info['status'] = f'Command {e.cmd!r} returned exit code {e.returncode} with stderr output {e.stderr!r}' - except Exception as e: - info['status'] = f'Error getting commit info: {e!r}' - else: - info['status'] = 'Git info retrieved successfully.' - info['commit'] = commit - - return info - - -def install_info() -> Dict[str, Any]: - """Get information on the GAMBIT installation if it is installed in development mode. 
- - - If gambit is installed via the setuptools development install method (``pip install -e``), this - checks if the source directory is a valid git repo and tries to get information on the current - commit. This is used to mark exported results from development versions of the software which do - not correspond to an official release. - """ - global _INSTALL_INFO - if _INSTALL_INFO is None: - _INSTALL_INFO = _install_info() - return _INSTALL_INFO From f1ac019a34398d621a6ac4b78c85a33ca5f61970 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 16:43:16 -0700 Subject: [PATCH 24/86] Fixes to API docs --- docs/source/api/kmers.rst | 2 ++ docs/source/api/misc.rst | 12 ------------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/docs/source/api/kmers.rst b/docs/source/api/kmers.rst index 859fed4..36fdc14 100644 --- a/docs/source/api/kmers.rst +++ b/docs/source/api/kmers.rst @@ -27,6 +27,7 @@ gambit.sigs.base ---------------------- .. automodule:: gambit.sigs.base + :exclude-members: AbstractSignatureArray .. autoclass:: AbstractSignatureArray :special-members: +__eq__ @@ -48,6 +49,7 @@ gambit.sigs.hdf5 ---------------------- .. automodule:: gambit.sigs.hdf5 + :exclude-members: HDF5Signatures .. autoclass:: HDF5Signatures :special-members: +__bool__ diff --git a/docs/source/api/misc.rst b/docs/source/api/misc.rst index 4b455c7..d89cabd 100644 --- a/docs/source/api/misc.rst +++ b/docs/source/api/misc.rst @@ -14,12 +14,6 @@ gambit.util.misc .. automodule:: gambit.util.misc -gambit.util.typing ------------------- - -.. automodule:: gambit.util.typing - - gambit.util.io -------------- @@ -42,9 +36,3 @@ gambit.util.progress -------------------- .. automodule:: gambit.util.progress - - -gambit.util.dev ----------------- - -.. 
automodule:: gambit.util.dev From 973bac77b3efdcb23c53702bb840ff045e275579 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 14 Jul 2024 16:43:38 -0700 Subject: [PATCH 25/86] More unused code removal --- src/gambit/results/base.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/gambit/results/base.py b/src/gambit/results/base.py index c9440d0..11b4c42 100644 --- a/src/gambit/results/base.py +++ b/src/gambit/results/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from typing import IO, Union, TextIO -from attr import asdict, attrs, attrib +from attr import attrs, attrib from gambit.util.io import FilePath, maybe_open import gambit.util.json as gjson @@ -32,16 +32,6 @@ def _todict(obj, attrs): return {a: getattr(obj, a) for a in attrs} -def asdict_method(recurse=False, **kw): - """Create a ``to_json`` method which calls :func:`attrs.asdict` with the given options.""" - def method(self, obj): - return asdict(obj, recurse=recurse, **kw) - return method - - -asdict_default = asdict_method() - - @attrs() class BaseJSONResultsExporter(AbstractResultsExporter): """Base class for JSON exporters. 
From 3a8a7532328ea3a926b7727fe3af43227426d690 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 18 Jul 2024 22:58:34 -0700 Subject: [PATCH 26/86] Type annotations --- src/gambit/kmers.py | 6 +++--- src/gambit/util/io.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gambit/kmers.py b/src/gambit/kmers.py index e756f97..26e150f 100644 --- a/src/gambit/kmers.py +++ b/src/gambit/kmers.py @@ -1,6 +1,6 @@ """Core functions for searching for and working with k-mers.""" -from typing import Dict, Any, Iterator +from typing import Optional, Any, Iterator import numpy as np from attr import attrs, attrib @@ -16,7 +16,7 @@ def nkmers(k: int) -> int: return 4 ** k -def index_dtype(k: int) -> np.dtype: +def index_dtype(k: int) -> Optional[np.dtype]: """Get the smallest unsigned integer dtype that can store k-mer indices for the given ``k``.""" if k <= 4: return np.dtype('u1') @@ -119,7 +119,7 @@ def __to_json__(self): return dict(k=int(self.k), prefix=self.prefix_str) @classmethod - def __from_json__(cls, data: Dict[str, Any]) -> 'KmerSpec': + def __from_json__(cls, data: dict[str, Any]) -> 'KmerSpec': return cls(data['k'], data['prefix']) diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py index 3cc7668..e591063 100644 --- a/src/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -2,7 +2,7 @@ import os from io import TextIOWrapper -from typing import Union, Optional, IO, BinaryIO, ContextManager, Iterable, TypeVar +from typing import Union, Optional, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar from contextlib import nullcontext #: Alias for types which can represent a file system path @@ -209,7 +209,7 @@ def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> return open(path, mode, **open_kw) -def read_lines(file_or_path: Union[FilePath, IO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]: +def read_lines(file_or_path: Union[FilePath, TextIO], strip: bool=True, skip_empty: bool=False) -> 
Iterable[str]: """Iterate over lines in text file. Parameters @@ -233,7 +233,7 @@ def read_lines(file_or_path: Union[FilePath, IO], strip: bool=True, skip_empty: yield line -def write_lines(lines: Iterable, file_or_path: Union[FilePath, IO]): +def write_lines(lines: Iterable, file_or_path: Union[FilePath, TextIO]): """Write strings to text file, one per line. Parameters From fd7f3123e7dbc201fa0236ed6ee8bd34db58a6a1 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 25 Jul 2024 01:26:36 -0700 Subject: [PATCH 27/86] Remove gambit.sigs.convert module --- src/gambit/sigs/calc.py | 45 ++++++++++++ src/gambit/sigs/convert.py | 144 ------------------------------------- tests/common.py | 2 +- tests/sigs/test_calc.py | 28 +++++++- tests/sigs/test_convert.py | 92 ------------------------ tests/test_metric.py | 2 +- tests/test_tests_common.py | 2 +- 7 files changed, 75 insertions(+), 240 deletions(-) delete mode 100644 src/gambit/sigs/convert.py delete mode 100644 tests/sigs/test_convert.py diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index c4c4614..74b0eef 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -278,3 +278,48 @@ def calc_file_signatures(kspec: KmerSpec, assert all(sig is not None for sig in sigs) return SignatureList(sigs, kspec) + + +def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: + """Convert k-mer set from dense bit vector to sparse coordinate representation. + + Parameters + ---------- + vec + Boolean vector indicating which k-mers are present. + + Returns + ------- + numpy.ndarray + Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. + + See Also + -------- + .sparse_to_dense + """ + return np.flatnonzero(vec) + + +def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: + """Convert k-mer set from sparse coordinate representation back to dense bit vector. 
+ + Parameters + ---------- + k_or_kspec + Value of k or a :class:`.KmerSpec` instance. + coords + Sparse coordinate array. + + Returns + ------- + numpy.ndarray + Dense k-mer bit vector. + + See Also + -------- + .dense_to_sparse + """ + idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) + vec = np.zeros(idx_len, dtype=np.bool_) + vec[coords] = 1 + return vec diff --git a/src/gambit/sigs/convert.py b/src/gambit/sigs/convert.py deleted file mode 100644 index f0308b5..0000000 --- a/src/gambit/sigs/convert.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Convert signatures between representations or from one ``KmerSpec`` to another.""" - -from typing import Sequence, Union - -import numpy as np - -from .base import KmerSignature -from gambit.kmers import KmerSpec, nkmers, kmer_to_index - - -def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: - """Convert k-mer set from dense bit vector to sparse coordinate representation. - - Parameters - ---------- - vec - Boolean vector indicating which k-mers are present. - - Returns - ------- - numpy.ndarray - Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. - - See Also - -------- - .sparse_to_dense - """ - return np.flatnonzero(vec) - - -def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: - """Convert k-mer set from sparse coordinate representation back to dense bit vector. - - Parameters - ---------- - k_or_kspec - Value of k or a :class:`.KmerSpec` instance. - coords - Sparse coordinate array. - - Returns - ------- - numpy.ndarray - Dense k-mer bit vector. - - See Also - -------- - .dense_to_sparse - """ - idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) - vec = np.zeros(idx_len, dtype=np.bool_) - vec[coords] = 1 - return vec - - -def can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec) -> bool: - """Check if signatures from one KmerSpec can be converted to another. 
- - Conversion is possible if ``to_kspec.prefix`` is equal to or starts with ``from_kspec.prefix`` - and ``to_kspec.total_len <= from_kspec.total_len``. - """ - return to_kspec.prefix.startswith(from_kspec.prefix) and to_kspec.total_len <= from_kspec.total_len - - -def check_can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec): - """ - Check that signatures can be converted from one KmerSpec to another or raise an error with an - informative message. - - Raises - ------ - ValueError - If conversion is not possible. - """ - if not to_kspec.prefix.startswith(from_kspec.prefix): - raise ValueError('Destination prefix must start with source prefix.') - if to_kspec.total_len > from_kspec.total_len: - raise ValueError('Cannot convert to KmerSpec with longer total length.') - - -def _convert_params(from_kspec: KmerSpec, to_kspec: KmerSpec): - extra_prefix = to_kspec.prefix[from_kspec.prefix_len:] - extra_ind = kmer_to_index(extra_prefix) - extra_len = len(extra_prefix) - - range_ = nkmers(from_kspec.k - extra_len) - start = extra_ind * range_ - stop = (extra_ind + 1) * range_ - reduce = from_kspec.k - to_kspec.k - extra_len - - return start, stop, reduce - - -def convert_dense(from_kspec: KmerSpec, to_kspec: KmerSpec, vec: np.ndarray) -> np.ndarray: - """Convert a k-mer signature in dense format from one ``KmerSpec`` to another. - - In the ideal case, if ``vec`` is the result of ``calc_signature(from_kspec, seq, sparse=False)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq, sparse=False)``. - In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. 
- """ - check_can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - block_size = nkmers(reduce) - - out = np.zeros(to_kspec.nkmers, dtype=bool) - - for i in range(block_size): - out |= vec[start+i:stop:block_size] - - return out - - -def convert_sparse(from_kspec: KmerSpec, to_kspec: KmerSpec, sig: KmerSignature) -> KmerSignature: - """Convert a k-mer signature in sparse format from one ``KmerSpec`` to another. - - In the ideal case, if ``sig`` is the result of ``calc_signature(from_kspec, seq)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq)``. - In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. - """ - assert can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - reduce_bits = 2 * reduce - - out = np.empty(len(sig), dtype=to_kspec.index_dtype) - i = 0 - next_ = start - - for from_idx in sig: - if from_idx < next_: - continue - if from_idx >= stop: - break - - to_idx = (from_idx - start) >> reduce_bits - out[i] = to_idx - i += 1 - - # Next possible input index that won't reduce to the same output - next_ = ((to_idx + 1) << reduce_bits) + start - - out.resize(i) - return out diff --git a/tests/common.py b/tests/common.py index 998963a..4eb294e 100644 --- a/tests/common.py +++ b/tests/common.py @@ -7,7 +7,7 @@ from gambit.kmers import KmerSpec, kmer_to_index from gambit.seq import seq_to_bytes, revcomp from gambit.sigs import KmerSignature, SignatureArray -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense +from gambit.sigs.calc import dense_to_sparse, sparse_to_dense from gambit.db import Taxon diff --git a/tests/sigs/test_calc.py b/tests/sigs/test_calc.py index efed455..52924d2 100644 --- a/tests/sigs/test_calc.py +++ b/tests/sigs/test_calc.py @@ -7,7 +7,8 @@ from Bio import SeqIO from 
Bio.Seq import Seq -from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures +from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures, \ + dense_to_sparse, sparse_to_dense from gambit.kmers import KmerSpec, index_to_kmer from gambit.seq import SEQ_TYPES, revcomp, SequenceFile import gambit.util.io as ioutil @@ -171,3 +172,28 @@ def test_calc_file_signatures(self, record_sets, files, concurrency): sigs2 = calc_file_signatures(KSPEC, files, progress=pconf, concurrency=concurrency) assert sigarray_eq(sigs, sigs2) + + +def test_dense_sparse_conversion(): + """Test conversion between dense and sparse representations of k-mer coordinates.""" + + for k in range(1, 10): + + kspec = KmerSpec(k, 'ATGAC') + + # Create dense signature with every 3rd k-mer + vec = np.zeros(kspec.nkmers, dtype=bool) + vec[np.arange(vec.size) % 3 == 0] = True + + # Convert to sparse + sig = dense_to_sparse(vec) + + assert len(sig) == vec.sum() + for index in sig: + assert vec[index] + + # Check sorted + assert np.all(np.diff(sig) > 0) + + # Check converting back + assert np.array_equal(vec, sparse_to_dense(kspec, sig)) diff --git a/tests/sigs/test_convert.py b/tests/sigs/test_convert.py deleted file mode 100644 index 964cc15..0000000 --- a/tests/sigs/test_convert.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the gambit.sigs.convert module.""" - -import pytest -import numpy as np - -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense, can_convert, \ - check_can_convert, convert_dense, convert_sparse -from gambit.kmers import KmerSpec -from ..common import random_seq - - -def test_dense_sparse_conversion(): - """Test conversion between dense and sparse representations of k-mer coordinates.""" - - for k in range(1, 10): - - kspec = KmerSpec(k, 'ATGAC') - - # Create dense signature with every 3rd k-mer - vec = np.zeros(kspec.nkmers, dtype=bool) - vec[np.arange(vec.size) % 3 == 0] = True - - # Convert to sparse - sig = 
dense_to_sparse(vec) - - assert len(sig) == vec.sum() - for index in sig: - assert vec[index] - - # Check sorted - assert np.all(np.diff(sig) > 0) - - # Check converting back - assert np.array_equal(vec, sparse_to_dense(kspec, sig)) - - -class TestKmerSpecConversion: - """Test converting signatures from one KmerSpec to another.""" - - def test_can_convert(self): - from_kspec = KmerSpec(11, 'ATGAC') - - compatible = [ - KmerSpec(11, 'ATGAC'), - KmerSpec(8, 'ATGAC'), - KmerSpec(10, 'ATGACA'), - KmerSpec(8, 'ATGACA'), - ] - - for to_kspec in compatible: - assert can_convert(from_kspec, to_kspec) - check_can_convert(from_kspec, to_kspec) - - incompatible = [ - KmerSpec(11, 'CAGTA'), - KmerSpec(12, 'ATGAC'), - KmerSpec(11, 'ATGA'), - KmerSpec(11, 'ATGACT'), - ] - - for to_kspec in incompatible: - assert not can_convert(from_kspec, to_kspec) - with pytest.raises(ValueError): - check_can_convert(from_kspec, to_kspec) - - @pytest.fixture(scope='class') - def seqs(self): - np.random.seed(0) - return [random_seq(100_000) for _ in range(100)] - - @pytest.mark.parametrize('to_kspec', [ - KmerSpec(10, 'ATGAC'), # Reduce k - KmerSpec(8, 'ATGAC'), # Reduce k - KmerSpec(9, 'ATGACGT'), # Extend prefix - KmerSpec(7, 'ATGACGT'), # Extend prefix and reduce k further - ]) - def test_convert(self, seqs, to_kspec): - from gambit.sigs.calc import calc_signature - - from_kspec = KmerSpec(11, 'ATGAC') - - for seq in seqs: - from_sig = calc_signature(from_kspec, seq) - from_vec = sparse_to_dense(from_kspec.k, from_sig) - - to_vec = convert_dense(from_kspec, to_kspec, from_vec) - to_sig = convert_sparse(from_kspec, to_kspec, from_sig) - - found_sig = calc_signature(to_kspec, seq) - - assert np.array_equal(to_sig, found_sig) - assert np.array_equal(to_vec, sparse_to_dense(to_kspec.k, found_sig)) diff --git a/tests/test_metric.py b/tests/test_metric.py index 2784804..25b775d 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -7,7 +7,7 @@ from gambit.metric import jaccard, 
jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \ jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE -from gambit.sigs.convert import sparse_to_dense +from gambit.sigs.calc import sparse_to_dense from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures from gambit.kmers import KmerSpec from gambit.util.progress import check_progress diff --git a/tests/test_tests_common.py b/tests/test_tests_common.py index 899f45e..9ec7f7a 100644 --- a/tests/test_tests_common.py +++ b/tests/test_tests_common.py @@ -5,7 +5,7 @@ from gambit.kmers import KmerSpec, kmer_to_index, nkmers from gambit.seq import revcomp -from gambit.sigs.convert import dense_to_sparse +from gambit.sigs.calc import dense_to_sparse from gambit.util.progress import get_progress from . import common From ed2c50224ffde1e3947c6f1fe4df903fc71f4df2 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 25 Jul 2024 01:13:01 -0700 Subject: [PATCH 28/86] Update readme --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 83dd07f..29cb0c5 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,17 @@ See below for basic installation and usage instructions, or check out the a basic tutorial. +## About + +Copyright © 2016-2024 Jared Lumpe + +GAMBIT has been a personal project of mine for many years. Although there have been numerous +contributors to the publication, it is not a product of any lab or institution. + +GAMBIT is provided as free software under the terms of the [AGPLv3 license](LICENSE). +It is not covered by any type of software patent. + + ### Publication Lumpe J, Gumbleton L, Gorzalski A, Libuit K, Varghese V, et al. 
(2023) GAMBIT (Genomic Approximation From 933ddf808e5627995cd715add4d57dac3e586d54 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 3 Jul 2024 18:47:31 -0700 Subject: [PATCH 29/86] Update CLI description --- src/gambit/cli/root.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gambit/cli/root.py b/src/gambit/cli/root.py index 2bb273e..5d959ac 100644 --- a/src/gambit/cli/root.py +++ b/src/gambit/cli/root.py @@ -17,5 +17,10 @@ @click.version_option(GAMBIT_VERSION, prog_name='gambit') @click.pass_context def cli(ctx: click.Context, **kw): - """Tool for rapid taxonomic identification of microbial pathogens from genomic data.""" + """Tool for rapid taxonomic identification of microbial pathogens from genomic data. + + http://github.com/jlumpe/gambit + + Copyright (C) 2016-2024 Jared Lumpe + """ ctx.obj = CLIContext(ctx) From 19d98a0924cd6e710a37afd120de750de87233c2 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 11 Jul 2024 21:45:42 -0700 Subject: [PATCH 30/86] TestDB type hints and minor updates --- tests/conftest.py | 1 - tests/testdb.py | 92 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4467af3..69f55da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,5 +47,4 @@ def testdb(test_data): This cleans things up a bit from the way it was before, which was a bunch of separate fixtures with session scope named "testdb_*". """ - root = test_data / 'testdb_210818' return TestDB(test_data / 'testdb_210818') diff --git a/tests/testdb.py b/tests/testdb.py index 63606a3..71751d8 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -1,9 +1,9 @@ """Access test database data. 
""" -from typing import Callable +from typing import Callable, TypeVar, Generic, Any, overload, TypedDict from pathlib import Path -from types import SimpleNamespace +from dataclasses import dataclass from csv import DictReader import sqlite3 import gzip @@ -12,19 +12,32 @@ from sqlalchemy.orm import sessionmaker from gambit.seq import SequenceFile -from gambit.sigs import load_signatures +from gambit.kmers import KmerSpec +from gambit.sigs import load_signatures, AnnotatedSignatures from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset from gambit.results.archive import ResultsArchiveReader +from gambit.query import QueryResults -class LazyAttribute: +T = TypeVar('T') + + +class LazyAttribute(Generic[T]): """Descriptor which initializes a property value the first time it is used.""" - def __init__(self, initializer: Callable, value_attr: str): + def __init__(self, initializer: Callable[[Any], T], value_attr: str): self.initializer = initializer self.value_attr = value_attr self.__doc__ = initializer.__doc__ + @overload + def __get__(self, instance: None, owner=None) -> 'LazyAttribute[T]': + pass + + @overload + def __get__(self, instance, owner=None) -> T: + pass + def __get__(self, instance, owner=None): if instance is None: return self @@ -39,11 +52,42 @@ def __get__(self, instance, owner=None): return value -def lazy(f: Callable) -> LazyAttribute: +def lazy(f: Callable[[Any], T]) -> LazyAttribute[T]: attr = '_' + f.__name__ return LazyAttribute(f, attr) +@dataclass +class TestDBPaths: + root: Path + ref_genomes: Path + ref_signatures: Path + refs_table: Path + ref_genomes_dir: Path + queries_table: Path + query_genomes_dir: Path + query_signatures: Path + results: Path + + +class TestQueryGenome(TypedDict): + name: str + predicted: str + primary: str + closest: str + warnings: bool + file: SequenceFile + file_gz: SequenceFile + + +class TestRefGenome(TypedDict): + name: str + key: str + taxon: str + file: SequenceFile + file_gz: SequenceFile 
+ + class TestDB: """Object which provides access to test database resources. @@ -51,9 +95,14 @@ class TestDB: to how it would work if the attributes were separate Pytest fixtures. """ + paths: TestDBPaths + + # Prevent pytest interpreting as containing test methods + __test__ = False + def __init__(self, root): root = Path(root) - self.paths = SimpleNamespace( + self.paths = TestDBPaths( root=root, ref_genomes=root / 'ref-genomes.gdb', ref_signatures=root / 'ref-signatures.gs', @@ -84,21 +133,21 @@ def copy_session(self): return sessionmaker(engine)() @lazy - def ref_signatures(self): + def ref_signatures(self) -> AnnotatedSignatures: """K-mer signatures for reference genomes.""" - return load_signatures(self.paths.ref_signatures) + return load_signatures(self.paths.ref_signatures) # type: ignore @lazy - def query_signatures(self): + def query_signatures(self) -> AnnotatedSignatures: """K-mer signatures for query genomes.""" - return load_signatures(self.paths.query_signatures) + return load_signatures(self.paths.query_signatures) # type: ignore @lazy - def kmerspec(self): - return self.ref_signatures.kmerspec + def kmerspec(self) -> KmerSpec: + return self.ref_signatures.kmerspec # type: ignore @lazy - def refdb(self): + def refdb(self) -> ReferenceDatabase: """Full ReferenceDatabase object.""" session = self.Session() gset = only_genomeset(session) @@ -117,20 +166,21 @@ def _add_file_cols(cls, genomes_dir, row): ) @lazy - def query_genomes(self): + def query_genomes(self) -> list[TestQueryGenome]: """Query genomes and their expected results.""" with open(self.paths.queries_table, newline='') as f: rows = list(DictReader(f)) for row in rows: + # Convert "warnings" column to bool row['warnings'] = row['warnings'].lower() == 'true' self._add_file_cols(self.paths.query_genomes_dir, row) - return rows + return rows # type: ignore @lazy - def ref_genomes(self): + def ref_genomes(self) -> list[TestRefGenome]: """Reference genomes and their attributes.""" with 
open(self.paths.refs_table, newline='') as f: @@ -139,7 +189,7 @@ def ref_genomes(self): for row in rows: self._add_file_cols(self.paths.ref_genomes_dir, row) - return rows + return rows # type: ignore @classmethod def _ensure_gz(cls, items): @@ -167,13 +217,13 @@ def _get_genome_files(cls, items, gzipped): col = 'file' return [q[col] for q in items] - def get_query_files(self, gzipped: bool=False): + def get_query_files(self, gzipped: bool=False) -> list[SequenceFile]: return self._get_genome_files(self.query_genomes, gzipped) - def get_ref_files(self, gzipped: bool=False): + def get_ref_files(self, gzipped: bool=False) -> list[SequenceFile]: return self._get_genome_files(self.ref_genomes, gzipped) - def get_query_results(self, strict: bool, session=None): + def get_query_results(self, strict: bool, session=None) -> QueryResults: """Pre-calculated query results.""" if session is None: session = self.refdb.session From 181e4897b139ef340a3b04d59c5f27d13d6dfa04 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 17:31:01 -0700 Subject: [PATCH 31/86] Clean up test code for CLI query --- tests/cli/test_query.py | 150 +++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index eea9c5d..d4de880 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -2,96 +2,69 @@ Test the 'gambit query' CLI command using the testdb_210818 database. 
""" -import os from copy import copy +from typing import Optional, Iterable +from pathlib import Path import pytest from gambit.cli.test import invoke_cli from gambit.results.test import check_json_results, check_csv_results from gambit.seq import SequenceFile -from gambit.query import QueryInput +from gambit.query import QueryInput, QueryResults from gambit.util.misc import zip_strict -from gambit.util.io import write_lines +from gambit.util.io import write_lines, FilePath from gambit.cli.common import strip_seq_file_ext +from ..testdb import TestDB -@pytest.fixture(params=[None]) -def nqueries(request): - """Number of testdb query files to use, None means use all of them. - Can be changed via indirect parameterization in specific tests. - Note than with slice notation, `[:None]` is the same as `[:]`. - """ - return request.param - - -@pytest.fixture() -def query_files(testdb, nqueries): - """Paths to query files.""" - return [SequenceFile(f.path, f.format, f.compression) for f in testdb.get_query_files()[:nqueries]] - - -@pytest.fixture() -def cd_query_genomes(testdb): - """Change working directory to query genomes directory.""" - old_wd = os.getcwd() - try: - os.chdir(testdb.paths.query_genomes_dir) - yield - finally: - os.chdir(old_wd) - - -@pytest.fixture(name='make_args') -def make_args_factory(testdb, query_files, tmp_path): +def make_args(testdb: TestDB, *, + positional_files: Optional[Iterable[SequenceFile]] = None, + list_file: Optional[FilePath] = None, + sig_file: bool = False, + output: Optional[FilePath] = None, + outfmt: Optional[str] = None, + strict: bool=False, + ) -> list[str]: + """Make command line arguments for querying.""" - def make_args(positional=False, list_file=False, sig_file=False, output=None, outfmt=None, strict=False): - """Make command line arguments for query file.""" + args: list[str] = [f'--db={testdb.paths.root}', 'query'] + args.append('--strict' if strict else '--no-strict') - args = [f'--db={testdb.paths.root}', 'query'] - 
args.append('--strict' if strict else '--no-strict') + if output is not None: + args.append(f'--output={output}') - if output is not None: - args.append(f'--output={output}') + if outfmt is not None: + args.append(f'--outfmt={outfmt}') - if outfmt is not None: - args.append(f'--outfmt={outfmt}') + if positional_files is not None: + args.extend(map(str, positional_files)) - if positional: - args.extend(query_files) + if list_file is not None: + args += ['-l', str(list_file), f'--ldir={testdb.paths.query_genomes_dir}'] - if list_file: - list_file = tmp_path / 'genomes.txt' - write_lines(query_files, list_file) - args += ['-l', str(list_file), f'--ldir={testdb.paths.query_genomes_dir}'] + if sig_file: + args.append(f'--sigfile={testdb.paths.query_signatures}') - if sig_file: - args.append(f'--sigfile={testdb.paths.query_signatures}') + return args - return list(map(str, args)) - return make_args - -@pytest.fixture(name='make_ref_results') -def make_ref_results_factory(testdb, nqueries, query_files): +def make_ref_results(testdb: TestDB, inputs: Iterable[QueryInput], strict: bool, nqueries: Optional[int]): """ Make a copy of the reference query results to compare to, modifying to account for possibly different query inputs and # of queries. 
""" - def make_ref_results(strict, inputs): - ref_results = copy(testdb.get_query_results(strict)) - ref_results.items = ref_results.items[:nqueries] - - for item, input in zip_strict(ref_results.items, inputs): - item.input = input + ref_results = copy(testdb.get_query_results(strict)) + ref_results.items = ref_results.items[:nqueries] - return ref_results + for item, input in zip_strict(ref_results.items, inputs): + item.input = input - return make_ref_results + return ref_results -def check_results(results_file, out_fmt, ref_results): +def check_results(results_file: Path, out_fmt: str, ref_results: QueryResults): """Check results output matches reference QueryResults object.""" if out_fmt == 'json': with open(results_file) as fh: @@ -113,25 +86,39 @@ def check_results(results_file, out_fmt, ref_results): (None, False, 'json', False, True), (20, True, 'json', False, False), ], - indirect=['nqueries'], ) -def test_full_query(make_args, make_ref_results, use_list_file, out_fmt, strict, gzipped, query_files, tmp_path): +def test_full_query(testdb: TestDB, + nqueries: Optional[int], + use_list_file: bool, + out_fmt: str, + strict: bool, + gzipped: bool, + tmp_path: Path, + ): """Run a full query using the command line interface.""" + query_files = testdb.get_query_files(gzipped)[:nqueries] inputs = [ QueryInput(strip_seq_file_ext(file.path.name), file) for file in query_files ] - ref_results = make_ref_results(strict, inputs) + ref_results: QueryResults = make_ref_results(testdb, inputs, strict, nqueries) results_file = tmp_path / ('results.' 
+ out_fmt) + if use_list_file: + list_file = tmp_path / 'genomes.txt' + write_lines(query_files, list_file) + input_kw = dict(list_file=list_file) + else: + input_kw = dict(positional_files=query_files) + args = make_args( - positional=not use_list_file, - list_file=use_list_file, + testdb, output=results_file, outfmt=out_fmt, strict=strict, + **input_kw, ) invoke_cli(args) @@ -141,15 +128,16 @@ def test_full_query(make_args, make_ref_results, use_list_file, out_fmt, strict, # Not really necessary to check all combinations of parameters. @pytest.mark.parametrize('out_fmt', ['json']) @pytest.mark.parametrize('strict', [False]) -def test_sigfile(make_args, make_ref_results, testdb, out_fmt, strict, tmp_path): +def test_sigfile(testdb: TestDB, out_fmt: str, strict: bool, tmp_path: Path): """Test using signature file instead of parsing genome files.""" inputs = list(map(QueryInput, testdb.query_signatures.ids)) - ref_results = make_ref_results(strict, inputs) + ref_results = make_ref_results(testdb, inputs, strict, None) results_file = tmp_path / ('results.' 
+ out_fmt) args = make_args( + testdb, sig_file=True, output=results_file, outfmt=out_fmt, @@ -160,19 +148,27 @@ def test_sigfile(make_args, make_ref_results, testdb, out_fmt, strict, tmp_path) check_results(results_file, out_fmt, ref_results) -def test_invalid(make_args, tmp_path): +def test_invalid(testdb: TestDB, tmp_path: Path): """Test invalid parameter values exit with error code.""" + query_files = testdb.get_query_files() + list_file = tmp_path / 'list.json' + write_lines(query_files, list_file) results_file = tmp_path / ('results.json') # No genomes or signatures - args = make_args(output=results_file) - invoke_cli(args, success=False) + args = make_args(testdb, output=results_file) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == 'Error: One of GENOMES, -l, or -s/--sigfile is required' # Multiple inputs - args = make_args(output=results_file, positional=True, list_file=True) - assert invoke_cli(args, success=False) - args = make_args(output=results_file, positional=True, sig_file=True) - assert invoke_cli(args, success=False) - args = make_args(output=results_file, list_file=True, sig_file=True) - assert invoke_cli(args, success=False) + multi_msg = 'Error: GENOMES, -l, and -s/--sigfile are mutually exclusive' + + args = make_args(testdb, output=results_file, positional_files=query_files, list_file=list_file) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg + + args = make_args(testdb, output=results_file, positional_files=query_files, sig_file=True) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg + + args = make_args(testdb, output=results_file, list_file=list_file, sig_file=True) + assert invoke_cli(args, success=False).stderr.strip() == multi_msg From 39936de07c7c0810e644b49c620894fa6eeb330a Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 19:03:52 -0700 Subject: [PATCH 32/86] Clean up CLI dist command tests --- tests/cli/test_dist.py | 375 
++++++++++++++++++++++++----------------- 1 file changed, 224 insertions(+), 151 deletions(-) diff --git a/tests/cli/test_dist.py b/tests/cli/test_dist.py index b900bb0..8dc262f 100644 --- a/tests/cli/test_dist.py +++ b/tests/cli/test_dist.py @@ -1,6 +1,8 @@ """Tests for the "dist" command.""" import json +from typing import Optional, Iterable +from pathlib import Path import pytest import numpy as np @@ -13,195 +15,266 @@ from gambit.cluster import load_dmat_csv import gambit.util.json as gjson from gambit.kmers import DEFAULT_KMERSPEC +from gambit.seq import SequenceFile +from gambit.cli.common import strip_seq_file_ext +from ..testdb import TestDB + + +def get_query_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[SequenceFile]: + return testdb.get_query_files(gz)[:n] + + +def get_ref_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[SequenceFile]: + return testdb.get_ref_files(gz)[:n] + + +def make_args(testdb: TestDB, + outfile: Path, + *, + q_opt: Optional[list[SequenceFile]] = None, # Query files with -q option + q_list: Optional[Path] = None, # Query list file + q_sigs: bool = False, # Use query signature file + r_opt: Optional[list[SequenceFile]] = None, # Ref files with -r option + r_list: Optional[Path] = None, # Ref list file + r_sigs: bool = False, # Use refs signature file + r_db: bool = False, # Use db for refs + with_db: bool = False, # Pass db at root level + kmerspec: Optional[KmerSpec] = None, # Pass -k and -p options + extra: Iterable[str] = (), # Additional args + ) -> list[str]: + + args: list[str] = ['dist', '-o', str(outfile), *extra] + + if with_db: + args.insert(0, f'--db={testdb.paths.root}') + + # Queries + if q_opt is not None: + for file in q_opt: + args.extend(['-q', str(file)]) + if q_list is not None: + args.extend(['--ql', str(q_list)]) + args.extend(['--qdir', str(testdb.paths.query_genomes_dir)]) + if q_sigs: + args.extend(['--qs', str(testdb.paths.query_signatures)]) + + # 
References + if r_opt is not None: + for file in r_opt: + args.extend(['-r', str(file)]) + if r_list is not None: + args.extend(['--rl', str(r_list)]) + args.extend(['--rdir', str(testdb.paths.ref_genomes_dir)]) + if r_sigs: + args.extend(['--rs', str(testdb.paths.ref_signatures)]) + if r_db: + args.append('--use-db') + + if kmerspec is not None: + args += [ + '-k', str(kmerspec.k), + '--prefix', kmerspec.prefix_str, + ] + + return args + + +def check_output(outfile: Path, expected_matrix: np.ndarray, nqueries: Optional[int], nrefs: Optional[int]): + dmat, row_ids, col_ids = load_dmat_csv(outfile) + assert np.allclose(dmat, expected_matrix[:nqueries, :nrefs], atol=1e-4) + # TODO: check row/col IDs -@pytest.fixture() -def outfile(tmp_path): - return tmp_path / 'out.csv' - -@pytest.fixture(params=[None]) -def nqueries(request): - return request.param - -@pytest.fixture(params=[False]) -def queries_gz(request): - return request.param - -@pytest.fixture() -def query_files(testdb, nqueries, queries_gz): - return [f for f in testdb.get_query_files(queries_gz)[:nqueries]] - -@pytest.fixture(params=[None]) -def nrefs(request): - return request.param - -@pytest.fixture(params=[False]) -def refs_gz(request): - return request.param - -@pytest.fixture() -def ref_files(testdb, nrefs, refs_gz): - return [f for f in testdb.get_ref_files(refs_gz)[:nrefs]] - -@pytest.fixture(name='make_args') -def make_args_factory(testdb, query_files, ref_files, outfile, tmp_path): - - def make_args(q_opt=False, # Pass queries with -q option - q_list=False, # Pass queries with list file - q_sigs=False, # Use query signature file - r_opt=False, # Pass refs with -r option - r_list=False, # Pass refs with list file - r_sigs=False, # Use refs signature file - r_db=False, # Use db for refs - with_db=False, # Pass db at root level - with_kspec=False, # Pass -k and -p options - extra=(), # Additional args - ): - - args = ['dist', '-o', outfile, *extra] - - if with_db: - args.insert(0, 
f'--db={testdb.paths.root}') - - if q_opt: - for file in query_files: - args.extend(['-q', file]) - if q_list: - qlfile = tmp_path / 'queries.txt' - write_lines(query_files, qlfile) - args.extend(['--ql', qlfile]) - args.extend(['--qdir', testdb.paths.query_genomes_dir]) - if q_sigs: - args.extend(['--qs', testdb.paths.query_signatures]) - - if r_opt: - for file in ref_files: - args.extend(['-r', file]) - if r_list: - rlfile = tmp_path / 'refs.txt' - write_lines(ref_files, rlfile) - args.extend(['--rl', rlfile]) - args.extend(['--rdir', testdb.paths.ref_genomes_dir]) - if r_sigs: - args.extend(['--rs', testdb.paths.ref_signatures]) - if r_db: - args.append('--use-db') - - if with_kspec: - args += [ - '-k', str(testdb.kmerspec.k), - f'--prefix={testdb.kmerspec.prefix_str}', - ] - - return args - - return make_args @pytest.fixture(scope='session') -def expected_matrix(testdb): +def expected_matrix(testdb: TestDB): return jaccarddist_matrix(testdb.query_signatures, testdb.ref_signatures) + @pytest.fixture(scope='session') -def expected_matrix_square(testdb): +def expected_matrix_square(testdb: TestDB): return jaccarddist_matrix(testdb.query_signatures, testdb.query_signatures) -@pytest.fixture(name='check_output') -def check_output_factory(outfile, expected_matrix, nqueries, nrefs): - def check_output(): - dmat, row_ids, col_ids = load_dmat_csv(outfile) - assert np.allclose(dmat, expected_matrix[:nqueries, :nrefs], atol=1e-4) - # TODO check row/column IDs - - return check_output - @pytest.mark.parametrize( - 'q_type,r_type,nqueries,nrefs,queries_gz,refs_gz', + 'q_type,r_type,queries_gz,refs_gz', [ - ('sigs', 'sigs', None, None, False, False), - ('list', 'sigs', 10, None, False, False), - ('sigs', 'list', None, 10 , False, False), - ('list', 'list', 10, 10 , False, False), - ('opt', 'sigs', 10, None, False, False), - ('sigs', 'opt', None, 10 , False, False), - ('sigs', 'db', None, None, False, False), - ('list', 'sigs', 10, None, True, False), - ('sigs', 'list', None, 
10 , False, True), + ('sigs', 'sigs', False, False), + ('list', 'sigs', False, False), + ('sigs', 'list', False, False), + ('list', 'list', False, False), + ('opt', 'sigs', False, False), + ('sigs', 'opt', False, False), + ('sigs', 'db', False, False), + ('list', 'sigs', True, False), + ('sigs', 'list', False, True), ], - indirect=['nqueries', 'nrefs', 'queries_gz', 'refs_gz'], ) -def test_basic(make_args, check_output, q_type, r_type): +def test_basic(testdb: TestDB, + q_type: str, # Query input format + r_type: str, # Referencer input format + queries_gz: bool, # Use gzipped query files + refs_gz: bool, # Use gzipped reference files + expected_matrix: np.ndarray, + tmp_path: Path, + ): """Test test basic usage, with query/ref sequences/signatures from different sources.""" + # Use only 10 query/reference files if passing by CLI option or by list file + nqueries = 10 if q_type in ('opt', 'list') else None + nrefs = 10 if r_type in ('opt', 'list') else None + + outfile = tmp_path / 'out.csv' + query_files = get_query_files(testdb, nqueries, queries_gz) + ref_files = get_ref_files(testdb, nrefs, refs_gz) + + # Query sequence specification + if q_type == 'opt': + query_kw = dict(q_opt=query_files) + elif q_type == 'list': + q_list = tmp_path / 'queries.txt' + write_lines(query_files, q_list) + query_kw = dict(q_list=q_list) + elif q_type == 'sigs': + query_kw = dict(q_sigs=True) + else: + assert False + + # Reference sequence specification + if r_type == 'opt': + ref_kw = dict(r_opt=ref_files) + elif r_type == 'list': + r_list = tmp_path / 'refs.txt' + write_lines(ref_files, r_list) + ref_kw = dict(r_list=r_list) + elif r_type == 'sigs': + ref_kw = dict(r_sigs=True) + elif r_type == 'db': + ref_kw = dict(r_db=True) + else: + assert False + + using_sigfile = q_type == 'sigs' or r_type == 'sigs' + args = make_args( - q_opt=q_type == 'opt', - q_list=q_type == 'list', - q_sigs=q_type == 'sigs', - r_opt=r_type == 'opt', - r_list=r_type == 'list', - r_sigs=r_type == 
'sigs', - r_db=r_type == 'db', - with_kspec=True, + testdb, + outfile, + **query_kw, + **ref_kw, + kmerspec=None if using_sigfile else testdb.kmerspec, with_db=r_type == 'db', ) invoke_cli(args) - check_output() + check_output(outfile, expected_matrix, nqueries, nrefs) -def test_kspec(make_args, testdb, tmp_path): - """Test selection of k-mer params and errors on inconsistencies.""" - alt_kspec = KmerSpec(6, 'AC') - assert alt_kspec != testdb.kmerspec - alt_kspec_args = ['-k', alt_kspec.k, '-p', alt_kspec.prefix_str] +def test_default_kspec(testdb: TestDB, tmp_path: Path): + """Test that the default KmerSpec is used when not otherwise specified.""" - alt_sigfile = tmp_path / 'alt_sigs.gs' - alt_sigs = SignatureList([], alt_kspec) - dump_signatures(alt_sigfile, alt_sigs) + outfile = tmp_path / 'out.csv' + q_list = tmp_path / 'queries.txt' + q_list.touch() + r_list = tmp_path / 'refs.txt' + r_list.touch() + + args = make_args(testdb, outfile, q_list=q_list, r_list=r_list, extra=('--dump-params',)) - # Default kspec - args = make_args(q_list=True, r_list=True, extra=('--dump-params',)) result = invoke_cli(args) params = json.loads(result.stdout) assert params['kmerspec'] == gjson.to_json(DEFAULT_KMERSPEC) + +def test_kspec_err(testdb: TestDB, tmp_path: Path): + """Test selection of k-mer params and errors on inconsistencies.""" + + outfile = tmp_path / 'out.csv' + + query_files = get_query_files(testdb, 10) + query_lf = tmp_path / 'queries.txt' + write_lines(query_files, query_lf) + + ref_files = get_ref_files(testdb, 10) + ref_lf = tmp_path / 'refs.txt' + write_lines(ref_files, ref_lf) + + # Alternate kmerspec + kspec1 = testdb.kmerspec + kspec2 = KmerSpec(5, 'AC') + assert kspec2 != kspec1 + + # Create signatures file for alt kspec + alt_sigfile = tmp_path / 'alt_sigs.gs' + alt_sigs = SignatureList([], kspec2) + dump_signatures(alt_sigfile, alt_sigs) + # Kspec from args inconsistent with query or reference signatures - args = make_args(q_sigs=True, r_sigs=True) + 
alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_list=True, r_sigs=True) + alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_sigs=True, r_list=True) + alt_kspec_args - invoke_cli(args, success=False) - args = make_args(q_sigs=True, r_db=True) + alt_kspec_args - invoke_cli(args, success=False) - - # Ref and query signatures inconsistent - args = make_args(r_sigs=True, extra=('--qs', alt_sigfile)) - invoke_cli(args, success=False) - args = make_args(r_db=True, extra=('--qs', alt_sigfile)) - invoke_cli(args, success=False) + msg = ( + f'Error: K-mer search parameters {{}} ({kspec2.k}/{kspec2.prefix_str}) ' + f'do not match those of {{}} ({kspec1.k}/{kspec1.prefix_str}).' + ) + + args = make_args(testdb, outfile, q_sigs=True, r_list=ref_lf, kmerspec=kspec2) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'query signatures') + + args = make_args(testdb, outfile, q_list=query_lf, r_sigs=True, kmerspec=kspec2) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'reference signatures') + + args = make_args(testdb, outfile, q_list=query_lf, r_db=True, kmerspec=kspec2, with_db=True) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('from command line options', 'reference signatures') + + # Ref and query signatures have differing kspec + args = make_args(testdb, outfile, r_sigs=True, extra=('--qs', str(alt_sigfile))) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('of query signatures', 'reference signatures') + + args = make_args(testdb, outfile, r_db=True, with_db=True, extra=('--qs', str(alt_sigfile))) + result = invoke_cli(args, success=False) + assert result.stderr.strip() == msg.format('of query signatures', 'reference signatures') + @pytest.mark.parametrize( - 'q_type,nqueries,queries_gz', + 'q_type,queries_gz', [ - 
('sigs', None, False), - ('list', 10, False), - ('opt', 10, False), - ('list', 10, True), + ('sigs', False), + ('list', False), + ('opt', False), + ('list', True), ], - indirect=['nqueries', 'queries_gz'], ) -def test_square(make_args, q_type, outfile, expected_matrix_square, nqueries): +def test_square(testdb: TestDB, + q_type: str, + queries_gz: bool, + expected_matrix_square: np.ndarray, + tmp_path: Path, + ): """Test --square option.""" + outfile = tmp_path / 'out.csv' + nqueries = 10 if q_type in ('opts', 'list') else None + query_files = get_query_files(testdb, nqueries, queries_gz) + + # Query sequence specification + if q_type == 'opt': + query_kw = dict(q_opt=query_files) + elif q_type == 'list': + q_list = tmp_path / 'queries.txt' + write_lines(query_files, q_list) + query_kw = dict(q_list=q_list) + elif q_type == 'sigs': + query_kw = dict(q_sigs=True) + else: + assert False + args = make_args( - q_opt=q_type == 'opt', - q_list=q_type == 'list', - q_sigs=q_type == 'sigs', - with_kspec=True, + testdb, + outfile, + **query_kw, + kmerspec=None if q_type == 'sigs' else testdb.kmerspec, extra=['--square'], ) invoke_cli(args) - out_dmat, row_ids, col_ids = load_dmat_csv(outfile) - assert np.allclose(out_dmat, expected_matrix_square[:nqueries, :nqueries], atol=1e-4) - assert row_ids == col_ids + check_output(outfile, expected_matrix_square, nqueries, nqueries) From 25f78775789e341f3fcd8cef014172dcb43c2c1d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 19:05:03 -0700 Subject: [PATCH 33/86] Fix CLI error message --- src/gambit/cli/dist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gambit/cli/dist.py b/src/gambit/cli/dist.py index 98cfcdf..0dbedb4 100644 --- a/src/gambit/cli/dist.py +++ b/src/gambit/cli/dist.py @@ -111,11 +111,11 @@ def dist_cmd(ctx: click.Context, else: if query_sigs is not None and query_sigs.kmerspec != kspec: raise click.ClickException( - f'K-mer search parameters from command line options 
({fmt_kspec(kspec)}) do not' + f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not ' f'match those of query signatures ({fmt_kspec(query_sigs.kmerspec)}).') if ref_sigs is not None and ref_sigs.kmerspec != kspec: raise click.ClickException( - f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not' + f'K-mer search parameters from command line options ({fmt_kspec(kspec)}) do not ' f'match those of reference signatures ({fmt_kspec(ref_sigs.kmerspec)}).') prog = 'click' if progress else None From 6c511eb0d842b58446d5ef467ed2c0725af7d904 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 19:05:59 -0700 Subject: [PATCH 34/86] Proper error message on pytest CLI test failure --- src/gambit/cli/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gambit/cli/test.py b/src/gambit/cli/test.py index 5c22800..8ffd31e 100644 --- a/src/gambit/cli/test.py +++ b/src/gambit/cli/test.py @@ -53,7 +53,7 @@ def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Option result = runner.invoke(cli, args, **kw) if success is True: - assert result.exit_code == 0 + assert result.exit_code == 0, result.stderr if success is False: assert result.exit_code != 0 From 411f729869183714827b8b80795df92dc613fd31 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 19:14:55 -0700 Subject: [PATCH 35/86] Type hints --- src/gambit/classify.py | 8 ++++---- src/gambit/cli/common.py | 8 ++++---- src/gambit/cli/debug.py | 6 +++--- src/gambit/cli/dist.py | 6 +++--- src/gambit/cli/query.py | 4 ++-- src/gambit/cli/test.py | 8 ++++---- src/gambit/cli/tree.py | 4 ++-- src/gambit/cluster.py | 4 ++-- src/gambit/db/models.py | 12 ++++++------ src/gambit/db/refdb.py | 18 +++++++----------- src/gambit/query.py | 10 +++++----- src/gambit/results/csv.py | 8 ++++---- src/gambit/seq.py | 4 ++-- src/gambit/util/progress.py | 9 ++++----- tests/cli/test_signatures.py | 27 +++++++++++++++------------ 15 
files changed, 67 insertions(+), 69 deletions(-) diff --git a/src/gambit/classify.py b/src/gambit/classify.py index 948106c..bbdd8e4 100644 --- a/src/gambit/classify.py +++ b/src/gambit/classify.py @@ -1,6 +1,6 @@ """Classify queries based on distance to reference sequences.""" -from typing import Optional, Tuple, Iterable, Dict, List, Set, Sequence +from typing import Optional, Iterable, Sequence from attr import attrs, attrib import numpy as np @@ -30,7 +30,7 @@ def matching_taxon(taxon: Taxon, d: float) -> Optional[Taxon]: return None -def find_matches(itr: Iterable[Tuple[AnnotatedGenome, float]]) -> Dict[Taxon, List[int]]: +def find_matches(itr: Iterable[tuple[AnnotatedGenome, float]]) -> dict[Taxon, list[int]]: """Find taxonomy matches given distances from a query to a set of reference genomes. Parameters @@ -53,7 +53,7 @@ def find_matches(itr: Iterable[Tuple[AnnotatedGenome, float]]) -> Dict[Taxon, Li return matches -def consensus_taxon(taxa: Iterable[Taxon]) -> Tuple[Optional[Taxon], Set[Taxon]]: +def consensus_taxon(taxa: Iterable[Taxon]) -> tuple[Optional[Taxon], set[Taxon]]: """Take a set of taxa matching a query and find a single consensus taxon for classification. 
If a query matches a given taxon, it is expected that there may be matches to some of that @@ -210,7 +210,7 @@ class ClassifierResult: primary_match: Optional[GenomeMatch] = attrib() closest_match: GenomeMatch = attrib() next_taxon: Optional[Taxon] = attrib() - warnings: List[str] = attrib(factory=list, repr=False) + warnings: list[str] = attrib(factory=list, repr=False) error: Optional[str] = attrib(default=None, repr=False) @next_taxon.default diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py index 2bd201b..1359268 100644 --- a/src/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -1,5 +1,5 @@ import os -from typing import Optional, Sequence, TextIO, Union, Iterable, Tuple, List, Any +from typing import Optional, Sequence, TextIO, Union, Iterable, Any from pathlib import Path from collections import Counter @@ -305,7 +305,7 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, listfile_dir: Optional[str]=None, strip_dir: bool = True, strip_ext: bool = True, - ) -> Union[Tuple[List[str], List[SequenceFile]], Tuple[None, None]]: + ) -> Union[tuple[list[str], list[SequenceFile]], tuple[None, None]]: """Get list of sequence file paths and IDs from several types of CLI arguments. Does not check for conflict between ``explicit`` and ``listfile``. @@ -348,7 +348,7 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, return ids, files -def warn_duplicate_file_ids(ids: List[str], template: str): +def warn_duplicate_file_ids(ids: list[str], template: str): """Print a warning message if duplicate file IDs are present. Parameters @@ -370,7 +370,7 @@ def warn_duplicate_file_ids(ids: List[str], template: str): # Click introspection ################################################################################ -def params_by_name(cmd: click.Command, names: Optional[Iterable[str]]=None): +def params_by_name(cmd: click.Command, names: Optional[Iterable[str]] = None): """Get parameters of click command by name. 
Parameters diff --git a/src/gambit/cli/debug.py b/src/gambit/cli/debug.py index 33d1068..3494bf2 100644 --- a/src/gambit/cli/debug.py +++ b/src/gambit/cli/debug.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Any, Optional import click @@ -21,7 +21,7 @@ def debug_group(): pass -def make_shell_ns(ctx) -> Dict[str, Any]: +def make_shell_ns(ctx) -> dict[str, Any]: """Make the user namespace for the shell command.""" from importlib import import_module @@ -43,7 +43,7 @@ def make_shell_ns(ctx) -> Dict[str, Any]: help='Use IPython instead of built-in Python REPL.', ) @click.pass_context -def shell(ctx, use_ipython): +def shell(ctx, use_ipython: Optional[bool]): """Start an interactive shell with application data and modules imported. Attempts to launch an IPython interactive interpreter if it is installed, diff --git a/src/gambit/cli/dist.py b/src/gambit/cli/dist.py index 0dbedb4..6d702a3 100644 --- a/src/gambit/cli/dist.py +++ b/src/gambit/cli/dist.py @@ -1,5 +1,5 @@ import sys -from typing import Optional, TextIO, List +from typing import Optional, TextIO import click @@ -41,11 +41,11 @@ def dist_cmd(ctx: click.Context, k: Optional[int], prefix: Optional[str], output: str, - q: List[str], + q: list[str], ql: Optional[TextIO], qdir: Optional[str], qs: Optional[str], - r: List[str], + r: list[str], rl: Optional[TextIO], rdir: Optional[str], rs: Optional[str], diff --git a/src/gambit/cli/query.py b/src/gambit/cli/query.py index e7e945d..b7d4e82 100644 --- a/src/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -1,5 +1,5 @@ import sys -from typing import TextIO, Optional, List +from typing import TextIO, Optional import click @@ -59,7 +59,7 @@ def get_exporter(outfmt: str): def query_cmd(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], sigfile: Optional[str], output: TextIO, outfmt: str, diff --git a/src/gambit/cli/test.py b/src/gambit/cli/test.py index 8ffd31e..125cc42 100644 --- 
a/src/gambit/cli/test.py +++ b/src/gambit/cli/test.py @@ -1,6 +1,6 @@ """Tools for testing CLI.""" -from typing import Optional, ContextManager, Sequence +from typing import Optional, Sequence, Any, Iterable, Iterator from contextlib import contextmanager import click @@ -14,7 +14,7 @@ ) -def pop_kwargs(d, keys): +def pop_kwargs(d: dict[str, Any], keys: Iterable[str]) -> dict[str, Any]: out = dict() for k in keys: try: @@ -33,6 +33,7 @@ def default_runner(**kw) -> CliRunner: kw.setdefault('env', DEFAULT_ENV) return CliRunner(**kw) + def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Optional[bool]=True, **kw) -> Result: """Invoke CLI in test context, using different defaults than base Click method. @@ -61,7 +62,7 @@ def invoke_cli(args: Sequence, runner: Optional[CliRunner]=None, success: Option @contextmanager -def allow_no_args(command: click.Command) -> ContextManager[click.Command]: +def allow_no_args(command: click.Command) -> Iterator[click.Command]: """Context manager which patches a command to allow calling with no arguments. 
Group commands will print help and exit if called without a subcommand, this will also happen @@ -77,4 +78,3 @@ def allow_no_args(command: click.Command) -> ContextManager[click.Command]: finally: command.no_args_is_help = old_naih - diff --git a/src/gambit/cli/tree.py b/src/gambit/cli/tree.py index d42213e..deed426 100644 --- a/src/gambit/cli/tree.py +++ b/src/gambit/cli/tree.py @@ -1,5 +1,5 @@ import sys -from typing import Optional, TextIO, List +from typing import Optional, TextIO import click from Bio import Phylo @@ -31,7 +31,7 @@ def tree_cmd(ctx: click.Context, listfile: Optional[TextIO], ldir: Optional[str], - files_arg: List[str], + files_arg: list[str], sigfile: Optional[str], k: Optional[int], prefix: Optional[str], diff --git a/src/gambit/cluster.py b/src/gambit/cluster.py index 5f27397..71234fc 100644 --- a/src/gambit/cluster.py +++ b/src/gambit/cluster.py @@ -1,6 +1,6 @@ """Distance matrices and basic clustering/trees.""" -from typing import Union, Optional, Sequence, TextIO, Tuple, List +from typing import Union, Optional, Sequence, TextIO import csv import numpy as np @@ -136,7 +136,7 @@ def dump_dmat_csv(file: Union[FilePath, TextIO], writer.writerow([str(row_id), *values_str]) -def load_dmat_csv(file: Union[FilePath, TextIO]) -> Tuple[np.ndarray, List[str], List[str]]: +def load_dmat_csv(file: Union[FilePath, TextIO]) -> tuple[np.ndarray, list[str], list[str]]: """Load distance matrix from CSV file. 
Returns diff --git a/src/gambit/db/models.py b/src/gambit/db/models.py index bd6c16c..fd0f245 100644 --- a/src/gambit/db/models.py +++ b/src/gambit/db/models.py @@ -1,6 +1,6 @@ """SQLAlchemy models for storing reference genomes and taxonomy information.""" -from typing import List, Any, Optional, Iterable, Collection, Callable +from typing import Any, Optional, Iterable, Collection, Callable import sqlalchemy as sa from sqlalchemy import Column, Integer, String, Boolean, Float @@ -325,7 +325,7 @@ def ancestor_of_rank(self, rank: str) -> Optional['Taxon']: return ancestor return None - def lineage(self, ranks: Optional[Iterable[str]] = None) -> List[Optional['Taxon']]: + def lineage(self, ranks: Optional[Iterable[str]] = None) -> list[Optional['Taxon']]: """Get a last of this taxon's ancestors. With an argument, gets ancestors with the given ranks. Without, gets a sorted list of the @@ -416,7 +416,7 @@ def has_genome(self, genome: AnnotatedGenome) -> bool: return self in genome.taxon.ancestors(True) @classmethod - def common_ancestors(cls, taxa: Iterable['Taxon']) -> List['Taxon']: + def common_ancestors(cls, taxa: Iterable['Taxon']) -> list['Taxon']: """Get list of common ancestors of a set of taxa. Returns @@ -454,7 +454,7 @@ def common_ancestors(cls, taxa: Iterable['Taxon']) -> List['Taxon']: return [] if ancestors is None else ancestors @classmethod - def lca(cls, taxa: Iterable['Taxon']) -> List['Taxon']: + def lca(cls, taxa: Iterable['Taxon']) -> Optional['Taxon']: """Find the Least Common Ancestor of a set of taxa. Returns None if `taxa` is empty or its members do not all lie in the same tree. 
@@ -463,10 +463,10 @@ def lca(cls, taxa: Iterable['Taxon']) -> List['Taxon']: return ancestors[-1] if ancestors else None def print_tree(self, - f: Callable[['Taxon'], str] = None, + f: Optional[Callable[['Taxon'], str]] = None, *, indent: str = ' ', - sort_key: Callable[['Taxon'], Any] = None, + sort_key: Optional[Callable[['Taxon'], Any]] = None, ): """Print the taxon's subtree for debugging. diff --git a/src/gambit/db/refdb.py b/src/gambit/db/refdb.py index 9fdcf29..5a9dc9b 100644 --- a/src/gambit/db/refdb.py +++ b/src/gambit/db/refdb.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Tuple, Sequence, Union, List, Dict, Optional, Any +from typing import Sequence, Union, Optional, Any from sqlalchemy.orm import object_session, Session from sqlalchemy.orm.attributes import InstrumentedAttribute @@ -38,7 +38,7 @@ def __init__(self, msg, directory=None, genomes_file=None, signatures_file=None) self.signatures_file = signatures_file -def load_genomeset(db_file: FilePath) -> Tuple[Session, ReferenceGenomeSet]: +def load_genomeset(db_file: FilePath) -> tuple[Session, ReferenceGenomeSet]: """Get the only :class:`gambit.db.models.ReferenceGenomeSet` from a genomes database file.""" session = file_sessionmaker(db_file)() gset = only_genomeset(session) @@ -78,13 +78,13 @@ def _check_genomes_have_ids(genomeset: ReferenceGenomeSet, id_attr: Instrumented raise RuntimeError(f'{c} genomes missing value for ID attribute {id_attr.key}') -def _map_ids_to_genomes(genomeset: ReferenceGenomeSet, id_attr: Union[str, InstrumentedAttribute]) -> Dict[AnnotatedGenome, Any]: +def _map_ids_to_genomes(genomeset: ReferenceGenomeSet, id_attr: Union[str, InstrumentedAttribute]) -> dict[AnnotatedGenome, Any]: """Get dict mapping ID values to AnnotatedGenome.""" q = genomeset.genomes.join(AnnotatedGenome.genome).add_columns(id_attr) return {id_: g for g, id_ in q} -def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, strict: bool = True) -> 
List[Optional[AnnotatedGenome]]: +def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, strict: bool = True) -> list[Optional[AnnotatedGenome]]: """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values. This is primarily used to match genomes to signatures based on the ID values stored in a @@ -105,7 +105,7 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque Returns ------- - List[Optional[AnnotatedGenome]] + list[Optional[AnnotatedGenome]] List of genomes of same length as ``ids``. If ``strict=False`` and a genome cannot be found for a given ID the list will contain ``None`` at the corresponding position. @@ -126,7 +126,7 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque def genomes_by_id_subset(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, - ) -> Tuple[List[AnnotatedGenome], List[int]]: + ) -> tuple[list[AnnotatedGenome], list[int]]: """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values, allowing missing genomes. This calls :func:`.genomes_by_id` with ``strict=False`` and filters any ``None`` values from the @@ -146,10 +146,6 @@ def genomes_by_id_subset(genomeset: ReferenceGenomeSet, See :data:`.GENOME_IDS` for the set of allowed values. ids Sequence of ID values (strings or integers, matching type of attribute). - - Returns - ------- - Tuple[List[AnnotatedGenome], List[int]] """ genomes = genomes_by_id(genomeset, id_attr, ids, strict=False) genomes_out = [] @@ -215,7 +211,7 @@ def __init__(self, genomeset: ReferenceGenomeSet, signatures: ReferenceSignature raise ValueError(f'{missing} of {n} genomes not matched to signature IDs. Is the id_attr attribute of the signatures metadata correct?') @classmethod - def locate_files(cls, path: FilePath) -> Tuple[Path, Path]: + def locate_files(cls, path: FilePath) -> tuple[Path, Path]: """Locate an SQLite genome database file and HDF5 signatures file in a directory. 
Files are located by extension, ``.gdb`` or ``.db`` for SQLite file and ``.gs`` or ``.h5`` diff --git a/src/gambit/query.py b/src/gambit/query.py index b2e611a..6e7ec73 100644 --- a/src/gambit/query.py +++ b/src/gambit/query.py @@ -2,7 +2,7 @@ from warnings import warn from datetime import datetime -from typing import Sequence, Optional, Union, List, Dict, Any +from typing import Sequence, Optional, Union, Any from attr import attrs, attrib import numpy as np @@ -86,7 +86,7 @@ class QueryResultItem: input: QueryInput = attrib() classifier_result: ClassifierResult = attrib() report_taxon: Optional[Taxon] = attrib(default=None) - closest_genomes: List[GenomeMatch] = attrib(factory=list) + closest_genomes: list[GenomeMatch] = attrib(factory=list) def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: @@ -129,13 +129,13 @@ class QueryResults: extra JSON-able dict containing additional arbitrary metadata. """ - items: List[QueryResultItem] = attrib() + items: list[QueryResultItem] = attrib() params: Optional[QueryParams] = attrib(default=None) genomeset: Optional[ReferenceGenomeSet] = attrib(default=None) signaturesmeta: Optional[SignaturesMeta] = attrib(default=None) gambit_version: str = attrib(default=GAMBIT_VERSION) timestamp: datetime = attrib(factory=datetime.now) - extra: Dict[str, Any] = attrib(factory=dict) + extra: dict[str, Any] = attrib(factory=dict) def query(db: ReferenceDatabase, @@ -235,7 +235,7 @@ def query_parse(db: ReferenceDatabase, params: Optional[QueryParams] = None, *, file_labels: Optional[Sequence[str]] = None, - parse_kw: Optional[Dict[str, Any]] = None, + parse_kw: Optional[dict[str, Any]] = None, **kw, ) -> QueryResults: """Query a database with signatures derived by parsing a set of genome sequence files. 
diff --git a/src/gambit/results/csv.py b/src/gambit/results/csv.py index a56fd43..d4bb2e7 100644 --- a/src/gambit/results/csv.py +++ b/src/gambit/results/csv.py @@ -1,7 +1,7 @@ """Export query results to CSV.""" import csv -from typing import Dict, Any, List, Union, Iterable, TextIO +from typing import Any, Union, Iterable, TextIO from .base import AbstractResultsExporter from gambit.query import QueryResultItem, QueryResults @@ -29,7 +29,7 @@ class CSVResultsExporter(AbstractResultsExporter): format_opts Dialect and other formatting arguments passed to :func:`csv.write`. """ - format_opts: Dict[str, Any] + format_opts: dict[str, Any] COLUMNS = [ ('query', 'input.label'), @@ -51,11 +51,11 @@ def __init__(self, **format_opts): format_opts.setdefault('quoting', csv.QUOTE_MINIMAL) self.format_opts = format_opts - def get_header(self) -> List[str]: + def get_header(self) -> list[str]: """Get values for header row.""" return [name for name, _ in self.COLUMNS] - def get_row(self, item: QueryResultItem) -> List: + def get_row(self, item: QueryResultItem) -> list: """Get row values for single result item.""" return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] diff --git a/src/gambit/seq.py b/src/gambit/seq.py index f5c5f92..1159e72 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -11,7 +11,7 @@ sequences. """ from pathlib import Path -from typing import Union, Optional, IO, Iterable, List +from typing import Union, Optional, IO, Iterable from os import PathLike from Bio import SeqIO @@ -174,7 +174,7 @@ def from_paths(cls, paths: Iterable[FilePath], format: str, compression: Optional[str] = None, - ) -> List['SequenceFile']: + ) -> list['SequenceFile']: """ Create many instances at once from a collection of paths and a single format and compression type. 
diff --git a/src/gambit/util/progress.py b/src/gambit/util/progress.py index d744061..31d5a8b 100644 --- a/src/gambit/util/progress.py +++ b/src/gambit/util/progress.py @@ -3,8 +3,7 @@ import sys from abc import ABC, abstractmethod -from typing import Optional, Union, Callable, Iterable, TextIO, Dict, Mapping, Any, cast, List, \ - Tuple, Iterator, ContextManager +from typing import Optional, Union, Callable, Iterable, TextIO, Mapping, Any, cast, Iterator, ContextManager from warnings import warn from contextlib import contextmanager @@ -119,9 +118,9 @@ class ProgressConfig: Keyword arguments to pass to callable. """ callable: ProgressFactoryFunc - kw: Dict[str, Any] + kw: dict[str, Any] - def __init__(self, callable: ProgressFactoryFunc, kw: Dict[str, Any]): + def __init__(self, callable: ProgressFactoryFunc, kw: dict[str, Any]): self.callable = callable self.kw = kw @@ -277,7 +276,7 @@ def __exit__(self, *args): self.meter.close() -def capture_progress(config: ProgressConfig) -> Tuple[ProgressConfig, List[AbstractProgressMeter]]: +def capture_progress(config: ProgressConfig) -> tuple[ProgressConfig, list[AbstractProgressMeter]]: """ Creates a ``ProgressConfig`` which captures references to the progress meter instances created with it. 
diff --git a/tests/cli/test_signatures.py b/tests/cli/test_signatures.py index 175cf24..b77be41 100644 --- a/tests/cli/test_signatures.py +++ b/tests/cli/test_signatures.py @@ -1,6 +1,7 @@ """Tests for the "signatures" command group.""" import json +from pathlib import Path import pytest import numpy as np @@ -12,6 +13,8 @@ from gambit.cli.common import strip_seq_file_ext from gambit.kmers import DEFAULT_KMERSPEC +from ..testdb import TestDB + class TestInfoCommand: @@ -20,18 +23,18 @@ def use_db(self, request): return request.param @pytest.fixture() - def base_args(self, testdb, use_db): + def base_args(self, testdb: TestDB, use_db: bool): if use_db: return [f'--db={testdb.paths.root}', 'signatures', 'info', '-d'] else: return ['signatures', 'info', str(testdb.paths.ref_signatures)] - def test_standard(self, base_args): + def test_standard(self, base_args: list[str]): result = invoke_cli(base_args) # TODO: check - def test_json(self, base_args, testdb): + def test_json(self, base_args: list[str], testdb: TestDB): args = [*base_args, '--json'] result = invoke_cli(args) @@ -40,13 +43,13 @@ def test_json(self, base_args, testdb): assert data['kmerspec'] == gjson.to_json(testdb.ref_signatures.kmerspec) assert data['metadata'] == gjson.to_json(testdb.ref_signatures.meta) - def test_ids(self, base_args, testdb): + def test_ids(self, base_args: list[str], testdb: TestDB): args = [*base_args, '-i'] result = invoke_cli(args) assert np.array_equal(result.stdout.splitlines(), testdb.ref_signatures.ids) - def test_invalid(self, testdb): + def test_invalid(self, testdb: TestDB): args = [ f'--db={testdb.paths.root}', 'signatures', @@ -60,16 +63,16 @@ def test_invalid(self, testdb): class TestCreateCommand: @pytest.fixture(params=[False]) - def infiles(self, request, testdb): + def infiles(self, request, testdb: TestDB): """Input files. 
Parameter is whether or not they are gzipped.""" return [f.path for f in testdb.get_query_files(request.param)] @pytest.fixture() - def outfile(self, tmp_path): + def outfile(self, tmp_path: Path): return tmp_path / 'signatures.gs' @pytest.fixture(name='make_args') - def make_args_factory(self, outfile, testdb, infiles, tmp_path): + def make_args_factory(self, outfile: Path, testdb: TestDB, infiles: list[Path], tmp_path: Path): def make_args(opts=(), root_args=(), with_kspec=True, positional_files=True, list_file=False): args = list(root_args) @@ -96,11 +99,11 @@ def make_args(opts=(), root_args=(), with_kspec=True, positional_files=True, lis return make_args @pytest.fixture() - def default_ids(self, infiles): + def default_ids(self, infiles: list[Path]): return [strip_seq_file_ext(file.name) for file in infiles] @pytest.fixture(name='check_output') - def check_output_factory(self, outfile, testdb, infiles, default_ids): + def check_output_factory(self, outfile: Path, testdb: TestDB, infiles: list[Path], default_ids: list[str]): def check_output(expected_ids=default_ids): out = load_signatures(outfile) @@ -118,7 +121,7 @@ def test_basic(self, make_args, check_output, infiles): invoke_cli(args) check_output() - def test_list_file(self, make_args, infiles, default_ids): + def test_list_file(self, make_args, infiles: list[Path], default_ids: list[str]): """Test getting genome list from file.""" args = make_args(['--dump-params'], positional_files=False, list_file=True) @@ -127,7 +130,7 @@ def test_list_file(self, make_args, infiles, default_ids): assert params['files'] == list(map(str, infiles)) assert params['ids'] == default_ids - def test_with_metadata(self, testdb, make_args, check_output, tmp_path): + def test_with_metadata(self, testdb: TestDB, make_args, check_output, tmp_path: Path): """Test with ids and metadata JSON added.""" # Metadata file metadata = SignaturesMeta( From e92374ca6d48020f96a15c71c67cad8319d1e1cc Mon Sep 17 00:00:00 2001 From: Jared Lumpe 
Date: Sat, 3 Aug 2024 19:48:52 -0700 Subject: [PATCH 36/86] Minor updates to signature calculation tests --- tests/sigs/test_calc.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/sigs/test_calc.py b/tests/sigs/test_calc.py index 52924d2..eb902a3 100644 --- a/tests/sigs/test_calc.py +++ b/tests/sigs/test_calc.py @@ -1,6 +1,7 @@ """Tests for gambit.search module.""" -from io import StringIO +from typing import Optional +from pathlib import Path import pytest import numpy as np @@ -12,7 +13,7 @@ from gambit.kmers import KmerSpec, index_to_kmer from gambit.seq import SEQ_TYPES, revcomp, SequenceFile import gambit.util.io as ioutil -from gambit.sigs import sigarray_eq +from gambit.sigs import sigarray_eq, KmerSignature from gambit.util.progress import check_progress from ..common import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq @@ -111,6 +112,9 @@ def test_overlapping(self): assert all(kmer in expected for kmer in found) +RecordSets = list[tuple[list[SeqIO.SeqRecord], KmerSignature]] + + class TestCalcFileSignatures: @pytest.fixture(scope='class') @@ -133,30 +137,26 @@ def record_sets(self): return items - @pytest.fixture(scope='class', params=['fasta']) - def format(self, request): - return request.param - @pytest.fixture(scope='class', params=[None, 'gzip']) def compression(self, request): return request.param @pytest.fixture() - def files(self, record_sets, tmp_path, format, compression): + def files(self, record_sets: RecordSets, tmp_path: Path, compression: Optional[str]): files = [] for i, (records, sig) in enumerate(record_sets): - file = SequenceFile(tmp_path / f'{i + 1}.fasta', format, compression) + file = SequenceFile(tmp_path / f'{i + 1}.fasta', 'fasta', compression) with file.open('wt') as f: - SeqIO.write(records, f, format) + SeqIO.write(records, f, 'fasta') files.append(file) return files - def test_calc_file_signature(self, record_sets, files): + def test_calc_file_signature(self, 
record_sets: RecordSets, files: list[SequenceFile]): """Test the calc_file_signature function.""" for file, (records, sig) in zip(files, record_sets): @@ -164,7 +164,7 @@ def test_calc_file_signature(self, record_sets, files): assert np.array_equal(result, sig) @pytest.mark.parametrize('concurrency', [None, 'threads', 'processes']) - def test_calc_file_signatures(self, record_sets, files, concurrency): + def test_calc_file_signatures(self, record_sets: RecordSets, files: list[SequenceFile], concurrency: Optional[str]): """Test the calc_file_signatures function.""" sigs = [sig for records, sig in record_sets] From 3fc70b2b9a8c83df6969ca97c6b147ba9253b60d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:00:06 -0700 Subject: [PATCH 37/86] Type hints in tests --- src/gambit/cli/common.py | 2 +- tests/benchmarks/benchmark_signatures.py | 4 +-- tests/cli/test_common.py | 13 +++++---- tests/cli/test_signatures.py | 8 +++--- tests/common.py | 8 +++--- tests/db/test_models.py | 24 +++++++++------- tests/db/test_refdb.py | 9 ++++-- tests/db/test_sqla.py | 3 +- tests/sigs/test_hdf5.py | 35 +++++++++++++----------- tests/test_query.py | 4 ++- tests/test_results.py | 6 ++-- 11 files changed, 66 insertions(+), 50 deletions(-) diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py index 1359268..496491a 100644 --- a/src/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -325,7 +325,7 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, Returns ------- - Tuple[Optional[List[str]], Optional[List[SequenceFile]]] + tuple[Optional[list[str]], Optional[list[SequenceFile]]] ``(ids, files)`` tuple. ``ids`` is a list of string IDs that can be used to label output. If the ``explicit`` and ``listfile`` arguments are None/empty both components of the tuple will be None as well. 
diff --git a/tests/benchmarks/benchmark_signatures.py b/tests/benchmarks/benchmark_signatures.py index 1682e9b..f8bb9b8 100644 --- a/tests/benchmarks/benchmark_signatures.py +++ b/tests/benchmarks/benchmark_signatures.py @@ -25,7 +25,7 @@ def prefix_len(request): @pytest.fixture() -def kspec(k, prefix_len): +def kspec(k: int, prefix_len: int): prefix ='ATGACCT'[:prefix_len] return KmerSpec(k, prefix) @@ -38,6 +38,6 @@ def accumulator(request): return request.param -def benchmark_calc_signature(seq, kspec, benchmark, accumulator): +def benchmark_calc_signature(seq: bytes, kspec: KmerSpec, benchmark, accumulator): acc = accumulator(kspec.k) benchmark(calc_signature, kspec, seq, accumulator=acc) diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py index d74f433..df2acb7 100644 --- a/tests/cli/test_common.py +++ b/tests/cli/test_common.py @@ -1,6 +1,7 @@ """Test code in gambit.cli.common.""" from pathlib import Path +from typing import Iterable import pytest import click @@ -11,7 +12,9 @@ from gambit.db import ReferenceDatabase from gambit.seq import SequenceFile from gambit.util.misc import zip_strict -from gambit.util.io import write_lines +from gambit.util.io import write_lines, FilePath + +from ..testdb import TestDB class TestCLIContext: @@ -44,7 +47,7 @@ def test_no_db(self): ctx.require_signatures() @pytest.mark.parametrize('method', ['option', 'envvar']) - def test_with_db(self, method, testdb): + def test_with_db(self, method: str, testdb: TestDB): """Test with database given through the --db argument or environment variable.""" dbpath = testdb.paths.root @@ -96,7 +99,7 @@ def test_strip_seq_file_ext(): class TestGetSequenceFiles: """Test the get_sequence_files() function.""" - def check_ids(self, ids, paths, strip_dir, strip_ext): + def check_ids(self, ids: Iterable[str], paths: Iterable[FilePath], strip_dir: bool, strip_ext: bool): for id_, path in zip_strict(ids, paths): if strip_dir: expected = Path(path).name @@ -114,7 +117,7 @@ def 
check_files(self, files, paths): assert file.format == 'fasta' assert file.compression == 'auto' - def test_explicit(self, strip_dir, strip_ext): + def test_explicit(self, strip_dir: bool, strip_ext: bool): """Test given explicit paths from CLI argument.""" paths = [f'path/to/{i + 1}.fasta' for i in range(10)] ids, files = common.get_sequence_files(paths, None, None, strip_dir=strip_dir, strip_ext=strip_ext) @@ -126,7 +129,7 @@ def test_explicit(self, strip_dir, strip_ext): ('path/to/genomes', False), # Relative to other directory ('foo/baz', True), # Absolute paths in file, ignore wd ]) - def test_listfile(self, wd, absolute, tmpdir, strip_dir, strip_ext): + def test_listfile(self, wd: str, absolute: bool, tmpdir: Path, strip_dir: bool, strip_ext: bool): """Test reading file paths from list file.""" wd = Path(wd) list_paths = [f'{i + 1}.fasta' for i in range(10)] diff --git a/tests/cli/test_signatures.py b/tests/cli/test_signatures.py index b77be41..e53f860 100644 --- a/tests/cli/test_signatures.py +++ b/tests/cli/test_signatures.py @@ -160,7 +160,7 @@ def test_with_metadata(self, testdb: TestDB, make_args, check_output, tmp_path: out = check_output(ids) assert out.meta == metadata - def test_kspec_from_refdb(self, make_args, testdb): + def test_kspec_from_refdb(self, make_args, testdb: TestDB): """Test with KmerSpec taken from reference database.""" args = make_args( ['-d', '--dump-params'], @@ -171,7 +171,7 @@ def test_kspec_from_refdb(self, make_args, testdb): params = json.loads(result.stdout) assert params['kmerspec'] == gjson.to_json(testdb.kmerspec) - def test_default_kspec(self, make_args, testdb): + def test_default_kspec(self, make_args, testdb: TestDB): """Test with default KmerSpec.""" args = make_args( ['--dump-params'], @@ -181,7 +181,7 @@ def test_default_kspec(self, make_args, testdb): params = json.loads(result.stdout) assert params['kmerspec'] == gjson.to_json(DEFAULT_KMERSPEC) - def test_invalid(self, testdb, make_args): + def test_invalid(self, 
testdb: TestDB, make_args): """Test with invalid parameter combinations.""" # No genomes @@ -206,7 +206,7 @@ def test_invalid(self, testdb, make_args): args = make_args(['-d'], with_kspec=False) invoke_cli(args, success=False) - def test_ids_wrong_len(self, testdb, make_args, tmp_path): + def test_ids_wrong_len(self, testdb: TestDB, make_args, tmp_path: Path): """Test where number of IDs does not match query files.""" ids = [f'seq-{i}' for i in range(len(testdb.query_genomes) - 1)] diff --git a/tests/common.py b/tests/common.py index 4eb294e..f504717 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,6 +1,6 @@ """Helper functions for tests.""" -from typing import Optional, Tuple, Union, List, Sequence +from typing import Optional, Union, Sequence import numpy as np @@ -123,7 +123,7 @@ def make_kmer_seq(kspec: KmerSpec, seqlen: int, kmer_interval: int, n_interval: Optional[int] = None, - ) -> Tuple[bytes, KmerSignature]: + ) -> tuple[bytes, KmerSignature]: """Create a DNA sequence with a known k-mer signature. The sequence consists of a background of N's with a k-mer match every ``kmer_interval`` @@ -187,7 +187,7 @@ def make_kmer_seqs(kspec: KmerSpec, seqlen: int, kmer_interval: int, n_interval: Optional[int] = None, - ) -> Tuple[List[bytes], KmerSignature]: + ) -> tuple[list[bytes], KmerSignature]: """Create a set of DNA sequences with known combined signature.""" seqs = [] @@ -208,7 +208,7 @@ def make_kmer_seqs(kspec: KmerSpec, return seqs, dense_to_sparse(vec) -def make_lineage(thresholds: Sequence[float]) -> List[Taxon]: +def make_lineage(thresholds: Sequence[float]) -> list[Taxon]: """Create a linage of taxa that have the given distance thresholds. Parameters diff --git a/tests/db/test_models.py b/tests/db/test_models.py index 2e95515..8654ce6 100644 --- a/tests/db/test_models.py +++ b/tests/db/test_models.py @@ -3,12 +3,16 @@ Uses the included testdb_210818 database. 
""" +from typing import Iterable, Optional + import pytest from sqlalchemy.orm import sessionmaker from gambit.db import models from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon +from ..testdb import TestDB + # Some arbitrary JSON data JSON_DATA = { @@ -85,7 +89,7 @@ def test_extra_json(self, empty_db_session): class TestReferenceGenomeSet: """Test ReferenceGenomeSet model.""" - def test_root_taxa(self, testdb): + def test_root_taxa(self, testdb: TestDB): session = testdb.Session() gset = session.query(ReferenceGenomeSet).one() assert {taxon.name for taxon in gset.root_taxa()} == {'A1', 'A2', 'A3'} @@ -104,7 +108,7 @@ def test_extra_json(self, empty_db_session): class TestAnnotatedGenome: """Test AnnotatedGEnome model.""" - def test_hybrid_props(self, testdb): + def test_hybrid_props(self, testdb: TestDB): session = testdb.Session() hybrid_attrs = [ @@ -124,7 +128,7 @@ def test_hybrid_props(self, testdb): class TestTaxon: """Test Taxon model.""" - def test_tree(self, testdb): + def test_tree(self, testdb: TestDB): """Test tree structure.""" session = testdb.Session() gset = session.query(ReferenceGenomeSet).one() @@ -163,7 +167,7 @@ def test_tree(self, testdb): # Check leaves assert set(taxon.leaves()) == {d for d in subtree_set if d.isleaf()} - def check_traversal(self, iterator, postorder, expected): + def check_traversal(self, iterator: Iterable[Taxon], postorder: bool, expected: set[Taxon]): seen = set() for taxon in iterator: @@ -176,7 +180,7 @@ def check_traversal(self, iterator, postorder, expected): assert seen == expected - def test_genome_membership(self, testdb): + def test_genome_membership(self, testdb: TestDB): """Test the subtree_genomes() and has_genome() methods.""" session = testdb.Session() @@ -212,7 +216,7 @@ def test_extra_json(self, empty_db_session): ) check_json_col(empty_db_session, taxon, 'extra') - def taxon_by_name(self, session, name): + def taxon_by_name(self, session, name: str): return 
session.query(Taxon).filter_by(name=name).one() def test_common_ancestry(self, testdb): @@ -220,7 +224,7 @@ def test_common_ancestry(self, testdb): session = testdb.Session() - def check(names, expected_names): + def check(names: list[str], expected_names: list[str]): taxa = [self.taxon_by_name(session, name) for name in names] ca = Taxon.common_ancestors(taxa) lca = Taxon.lca(taxa) @@ -246,12 +250,12 @@ def check(names, expected_names): check(['A1', 'A2'], []) check(['A1_B1', 'A1_B2', 'A2_B1'], []) - def test_ancestor_of_rank(self, testdb): + def test_ancestor_of_rank(self, testdb: TestDB): """Test ancestor_of_rank() method.""" session = testdb.Session() - def check(name, rank, expected): + def check(name: str, rank: str, expected: Optional[str]): taxon = self.taxon_by_name(session, name) ancestor = taxon.ancestor_of_rank(rank) assert (ancestor is None) == (expected is None) @@ -271,7 +275,7 @@ def check(name, rank, expected): check('A1_B1_C1', 'strain', 'A1_B1_C1') check('A1_B1_C1', 'foo', None) - def test_lineage_ranks(self, testdb): + def test_lineage_ranks(self, testdb: TestDB): """Test lineage() method with argument.""" session = testdb.Session() diff --git a/tests/db/test_refdb.py b/tests/db/test_refdb.py index 7e12c60..6dc6c16 100644 --- a/tests/db/test_refdb.py +++ b/tests/db/test_refdb.py @@ -1,6 +1,7 @@ """Test gambit.db.refdb.""" import random +from pathlib import Path import pytest from sqlalchemy.orm import sessionmaker @@ -8,6 +9,8 @@ from gambit.db import refdb from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, ReferenceDatabase, DatabaseLoadError +from ..testdb import TestDB + GENOME_ID_ATTRS = {attr: getattr(Genome, attr) for attr in Genome.ID_ATTRS} @@ -126,7 +129,7 @@ def test_genomes_by_id(self, session): class TestReferenceDatabase: """Test the ReferenceDatabase class.""" - def test_locate_files(self, tmp_path): + def test_locate_files(self, tmp_path: Path): genomes = tmp_path / 'test.gdb' genomes2 = tmp_path / 
'test2.gdb' signatures = tmp_path / 'test.gs' @@ -158,10 +161,10 @@ def test_locate_files(self, tmp_path): signatures.touch() assert ReferenceDatabase.locate_files(tmp_path) == (genomes, signatures) - def test_load(self, testdb): + def test_load(self, testdb: TestDB): db = ReferenceDatabase.load(testdb.paths.ref_genomes, testdb.paths.ref_signatures) check_loaded_db(db) - def test_load_db_from_dir(self, testdb): + def test_load_db_from_dir(self, testdb: TestDB): db = ReferenceDatabase.load_from_dir(testdb.paths.root) check_loaded_db(db) diff --git a/tests/db/test_sqla.py b/tests/db/test_sqla.py index 4361930..318d7d5 100644 --- a/tests/db/test_sqla.py +++ b/tests/db/test_sqla.py @@ -3,9 +3,10 @@ from sqlalchemy.orm import Session from gambit.db import ReadOnlySession, file_sessionmaker +from ..testdb import TestDB -def test_file_sessionmaker(testdb): +def test_file_sessionmaker(testdb: TestDB): db_file = testdb.paths.ref_genomes maker = file_sessionmaker(db_file, readonly=True) diff --git a/tests/sigs/test_hdf5.py b/tests/sigs/test_hdf5.py index 05b0863..a142c7f 100644 --- a/tests/sigs/test_hdf5.py +++ b/tests/sigs/test_hdf5.py @@ -1,12 +1,15 @@ """Test gambit.sigs.hdf5.""" +from pathlib import Path + import pytest import h5py as h5 import numpy as np -from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, dump_signatures_hdf5 -from gambit.sigs import SignaturesMeta, SignatureList, AnnotatedSignatures -from gambit.sigs.base import SignaturesFileError +from gambit.sigs.hdf5 import read_metadata, write_metadata, load_signatures_hdf5, \ + dump_signatures_hdf5, HDF5Signatures +from gambit.sigs.base import SignaturesMeta, SignatureList, AnnotatedSignatures, \ + AbstractSignatureArray, SignaturesFileError, SignatureArray from gambit.kmers import KmerSpec from ..common import make_signatures from .common import AbstractSignatureArrayTests @@ -22,7 +25,7 @@ @pytest.mark.parametrize('optional_attrs', [False, True]) -def test_metadata(tmp_path, 
optional_attrs): +def test_metadata(tmp_path: Path, optional_attrs: bool): """Test reading/writing metadata""" fname = tmp_path / 'test.gs' @@ -45,14 +48,14 @@ def test_metadata(tmp_path, optional_attrs): assert meta2 == meta -def dump_load(sigs, path, **kw): +def dump_load(sigs: AbstractSignatureArray, path: Path, **kw) -> HDF5Signatures: """Dump signatures to HDF5 file and load them again.""" f = path / 'test.gs' dump_signatures_hdf5(f, sigs, **kw) return load_signatures_hdf5(f) -def test_open_not_hdf5(tmp_path): +def test_open_not_hdf5(tmp_path: Path): """Test opening an invalid file.""" # Not an HDF5 file @@ -67,7 +70,7 @@ def test_open_not_hdf5(tmp_path): assert einfo.value.format == 'hdf5' -def test_open_invalid(tmp_path): +def test_open_invalid(tmp_path: Path): """Test opening an invalid HDF5 file.""" file = tmp_path / 'invalid.gs' @@ -88,24 +91,24 @@ def kspec(self): return KmerSpec(8, 'ATG') @pytest.fixture(scope='class', params=[(1000, 'u8'), (1000, 'i4'), (0, 'u8')]) - def sigs(self, request, kspec): + def sigs(self, request, kspec: KmerSpec): n, dtype = request.param return make_signatures(kspec, n, dtype) @pytest.fixture(scope='class') - def h5file(self, tmp_path_factory, sigs): + def h5file(self, tmp_path_factory, sigs: SignatureArray): """Write signatures to file and return file name.""" fname = tmp_path_factory.mktemp('HDF5FileSignatures') / 'test.gs' dump_signatures_hdf5(fname, sigs) return fname @pytest.fixture() - def h5sigs(self, h5file): + def h5sigs(self, h5file: Path): """Open HDF5Signatures object.""" with load_signatures_hdf5(h5file) as sigs: yield sigs - def test_attrs(self, h5sigs, sigs): + def test_attrs(self, h5sigs: AnnotatedSignatures, sigs: SignatureArray): """Test basic attributes for signatures saved without metadata.""" assert h5sigs.kmerspec == sigs.kmerspec assert h5sigs.dtype == sigs.values.dtype @@ -113,7 +116,7 @@ def test_attrs(self, h5sigs, sigs): assert h5sigs.meta == SignaturesMeta() @pytest.mark.parametrize('id_type', 
[int, str]) - def test_attrs_meta(self, sigs, id_type, tmp_path): + def test_attrs_meta(self, sigs: SignatureArray, id_type: type, tmp_path: Path): """Test basic attributes for signatures saved with metadata.""" if id_type is int: @@ -136,7 +139,7 @@ def test_attrs_meta(self, sigs, id_type, tmp_path): assert np.array_equal(h5sigs.ids, ids) assert h5sigs.meta == meta - def test_close(self, h5sigs): + def test_close(self, h5sigs: HDF5Signatures): assert h5sigs.group assert h5sigs @@ -146,7 +149,7 @@ def test_close(self, h5sigs): h5sigs.close() - def test_context(self, h5sigs): + def test_context(self, h5sigs: HDF5Signatures): with h5sigs as value: assert value is h5sigs assert h5sigs.group @@ -155,7 +158,7 @@ def test_context(self, h5sigs): assert not h5sigs.group assert not h5sigs - def test_create_from_list(self, sigs, tmp_path): + def test_create_from_list(self, sigs, tmp_path: Path): """Test creating from other AbstractSignatureArray type.""" siglist = SignatureList(sigs) @@ -164,7 +167,7 @@ def test_create_from_list(self, sigs, tmp_path): @pytest.mark.parametrize('from_list', [False, True]) @pytest.mark.parametrize('compression_level', [None, 7]) - def test_compression(self, from_list, compression_level, sigs, tmp_path): + def test_compression(self, from_list: bool, compression_level, sigs: SignatureArray, tmp_path: Path): """Test creating with gzip compression.""" create_from = SignatureList(sigs) if from_list else sigs diff --git a/tests/test_query.py b/tests/test_query.py index 55542cc..325ac0f 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -7,6 +7,8 @@ from gambit.util.misc import zip_strict from gambit import __version__ as GAMBIT_VERSION +from .testdb import TestDB + class TestQueryInput: """Test QueryInput class.""" @@ -24,7 +26,7 @@ def test_convert(self): @pytest.mark.parametrize('strict', [False, True]) -def test_query_python(testdb, strict): +def test_query_python(testdb: TestDB, strict: bool): """Run a full query using the Python 
API.""" ref_results = testdb.get_query_results(strict) params = ref_results.params diff --git a/tests/test_results.py b/tests/test_results.py index 5f78a6a..60f4478 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -102,21 +102,21 @@ def results(session): ) -def test_json(results): +def test_json(results: QueryResults): """Test JSONResultsExporter.""" exporter = JSONResultsExporter() buf = export_to_buffer(results, exporter) check_json_results(buf, results, strict=True) -def test_csv(results): +def test_csv(results: QueryResults): """Test CSVResultsExporter.""" exporter = CSVResultsExporter() buf = export_to_buffer(results, exporter) check_csv_results(buf, results, strict=True) -def test_results_archive(session, results): +def test_results_archive(session, results: QueryResults): """Test ResultArchiveWriter/Reader.""" writer = ResultsArchiveWriter() buf = export_to_buffer(results, writer) From e5f2d70c3785ab75439c35dfc11a6287b8c6f52b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:04:00 -0700 Subject: [PATCH 38/86] Move gambit.results.test module to tests/ --- tests/cli/test_query.py | 2 +- src/gambit/results/test.py => tests/results.py | 0 tests/test_results.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/gambit/results/test.py => tests/results.py (100%) diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index d4de880..6702fb3 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -9,7 +9,6 @@ import pytest from gambit.cli.test import invoke_cli -from gambit.results.test import check_json_results, check_csv_results from gambit.seq import SequenceFile from gambit.query import QueryInput, QueryResults from gambit.util.misc import zip_strict @@ -17,6 +16,7 @@ from gambit.cli.common import strip_seq_file_ext from ..testdb import TestDB +from ..results import check_json_results, check_csv_results def make_args(testdb: TestDB, *, diff --git a/src/gambit/results/test.py 
b/tests/results.py similarity index 100% rename from src/gambit/results/test.py rename to tests/results.py diff --git a/tests/test_results.py b/tests/test_results.py index 60f4478..70ac895 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -10,7 +10,7 @@ from gambit.results.json import JSONResultsExporter from gambit.results.csv import CSVResultsExporter from gambit.results.archive import ResultsArchiveReader, ResultsArchiveWriter -from gambit.results.test import check_json_results, check_csv_results +from .results import check_json_results, check_csv_results def export_to_buffer(results: QueryResults, exporter) -> StringIO: From 427ee1cc20b71a2f278dabbb680f8d25f0d5d7ac Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:12:18 -0700 Subject: [PATCH 39/86] Convert gambit.results from package to module --- src/gambit/cli/query.py | 4 +- src/gambit/results.py | 274 +++++++++++++++++++ src/gambit/results/__init__.py | 1 - src/gambit/results/archive.py | 117 -------- src/gambit/results/base.py | 55 ---- src/gambit/results/csv.py | 68 ----- src/gambit/results/json.py | 58 ---- tests/data/testdb_210818/generate-results.py | 2 +- tests/test_results.py | 4 +- tests/testdb.py | 2 +- 10 files changed, 278 insertions(+), 307 deletions(-) create mode 100644 src/gambit/results.py delete mode 100644 src/gambit/results/__init__.py delete mode 100644 src/gambit/results/archive.py delete mode 100644 src/gambit/results/base.py delete mode 100644 src/gambit/results/csv.py delete mode 100644 src/gambit/results/json.py diff --git a/src/gambit/cli/query.py b/src/gambit/cli/query.py index b7d4e82..ce92c98 100644 --- a/src/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -8,20 +8,18 @@ from gambit.query import QueryParams, QueryInput, query, query_parse from gambit.util.progress import progress_config from gambit.sigs import load_signatures +from gambit.results import CSVResultsExporter, JSONResultsExporter, ResultsArchiveWriter from 
gambit._cython.threads import omp_set_num_threads def get_exporter(outfmt: str): if outfmt == 'csv': - from gambit.results.csv import CSVResultsExporter return CSVResultsExporter() if outfmt == 'json': - from gambit.results.json import JSONResultsExporter return JSONResultsExporter() if outfmt == 'archive': - from gambit.results.archive import ResultsArchiveWriter return ResultsArchiveWriter(install_info=True) assert 0 diff --git a/src/gambit/results.py b/src/gambit/results.py new file mode 100644 index 0000000..762264d --- /dev/null +++ b/src/gambit/results.py @@ -0,0 +1,274 @@ +"""Export query results in various formats.""" + +import json +from abc import ABC, abstractmethod +from typing import IO, Union, TextIO, Any, Iterable +import csv +from functools import singledispatchmethod + +from attr import attrs, attrib, asdict +from sqlalchemy.orm import Session + +from gambit.util.io import FilePath, maybe_open +import gambit.util.json as gjson +from gambit.query import QueryResults, QueryResultItem, QueryInput +from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome + + +class AbstractResultsExporter(ABC): + """Base for classes that export formatted query results. + + Subclasses must implement :meth:`export`. + """ + + @abstractmethod + def export(self, file_or_path: Union[FilePath, IO], results: QueryResults): + """Write query results to file. + + Parameters + ---------- + file_or_path + Open file-like object or file path to write to. + results + Results to export. + """ + + +def _todict(obj, attrs): + return {a: getattr(obj, a) for a in attrs} + + +@attrs() +class BaseJSONResultsExporter(AbstractResultsExporter): + """Base class for JSON exporters. + + Subclasses need to implement the ``to_json`` method. + + Attributes + ---------- + pretty + Write in more human-readable but less compact format. Defaults to False. 
+ """ + pretty: bool = attrib(default=False) + + def to_json(self, obj): + """Convert object to JSON-compatible format (need not work recursively).""" + return gjson.to_json(obj) + + def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): + opts = dict(indent=4, sort_keys=True) if self.pretty else dict() + with maybe_open(file_or_path, 'w') as f: + json.dump(results, f, default=self.to_json, **opts) + + +def getattr_nested(obj, attrs: Union[str, Iterable[str]], pass_none=False): + if isinstance(attrs, str): + attrs = attrs.split('.') + + for attr in attrs: + if pass_none and obj is None: + return None + + obj = getattr(obj, attr) + + return obj + + +class CSVResultsExporter(AbstractResultsExporter): + """Exports query results in CSV format. + + Attributes + ---------- + format_opts + Dialect and other formatting arguments passed to :func:`csv.write`. + """ + format_opts: dict[str, Any] + + COLUMNS = [ + ('query', 'input.label'), + ('predicted.name', 'report_taxon.name'), + ('predicted.rank', 'report_taxon.rank'), + ('predicted.ncbi_id', 'report_taxon.ncbi_id'), + ('predicted.threshold', 'report_taxon.distance_threshold'), + ('closest.distance', 'classifier_result.closest_match.distance'), + ('closest.description', 'classifier_result.closest_match.genome.description'), + ('next.name', 'classifier_result.next_taxon.name'), + ('next.rank', 'classifier_result.next_taxon.rank'), + ('next.ncbi_id', 'classifier_result.next_taxon.ncbi_id'), + ('next.threshold', 'classifier_result.next_taxon.distance_threshold'), + ] + + def __init__(self, **format_opts): + if 'dialect' not in format_opts: + format_opts.setdefault('lineterminator', '\n') + format_opts.setdefault('quoting', csv.QUOTE_MINIMAL) + self.format_opts = format_opts + + def get_header(self) -> list[str]: + """Get values for header row.""" + return [name for name, _ in self.COLUMNS] + + def get_row(self, item: QueryResultItem) -> list: + """Get row values for single result item.""" + return 
[getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] + + def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): + with maybe_open(file_or_path, 'w') as f: + writer = csv.writer(f, **self.format_opts) + + writer.writerow(self.get_header()) + for item in results.items: + writer.writerow(self.get_row(item)) + + +@attrs() +class JSONResultsExporter(BaseJSONResultsExporter): + """Exports query results in basic JSON format. + + Currently it assumes that the query was run with ``classify_strict=False``, so the only + relevant information from ``ClassifierResult`` is the closest genome match. + """ + + to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) + + @to_json.register(QueryResults) + def _results_to_json(self, results: QueryResults): + data = asdict(results, recurse=False) + del data['params'] # Parameters not currently exposed thru CLI, so omit for now. + return data + + @to_json.register(QueryResultItem) + def _item_to_json(self, item: QueryResultItem): + return dict( + query=item.input, + predicted_taxon=item.report_taxon, + next_taxon=item.classifier_result.next_taxon, + closest_genomes=item.closest_genomes, + ) + + @to_json.register(QueryInput) + def _input_to_json(self, input: QueryInput): + return dict( + name=input.label, + path=None if input.file is None else input.file.path, + format=None if input.file is None else input.file.format, + ) + + @to_json.register(ReferenceGenomeSet) + def _genomeset_to_json(self, gset: ReferenceGenomeSet): + return _todict(gset, ['id', 'key', 'version', 'name', 'description']) + + @to_json.register(Taxon) + def _taxon_to_json(self, taxon: Taxon): + return _todict(taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) + + @to_json.register(AnnotatedGenome) + def _genome_to_json(self, genome: AnnotatedGenome): + data = _todict(genome, ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc']) + data['id'] = genome.genome_id + 
data['taxonomy'] = list(genome.taxon.ancestors(incself=True)) + return data + + +class ResultsArchiveWriter(BaseJSONResultsExporter): + """Exports query results to "archive" format which captures all stored data. + + This format is not intended to be read by users of the application. + The exported data can be read and converted back into an identical :class:`QueryResults` + object using :class:`.ResultsArchiveReader`. + + Only the ID attributes of database models are saved, when loading the saved results the models + are recreated by database queries. + """ + + to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) + + @to_json.register(ReferenceGenomeSet) + def _genomeset_to_json(self, gset: ReferenceGenomeSet): + return _todict(gset, ['key', 'version']) + + @to_json.register(Taxon) + def _taxon_to_json(self, taxon: Taxon): + return _todict(taxon, ['key']) + + @to_json.register(AnnotatedGenome) + def _genome_to_json(self, genome: AnnotatedGenome): + return _todict(genome, ['key']) + + +class ResultsArchiveReader: + """Loads query results from file created by :class:`ResultsArchiveWriter`. + + Attributes + ---------- + session + SQLAlchemy session used to load database objects. + """ + session: Session + + def __init__(self, session): + self.session = session + + self._init_converter() + + # Loading the Taxon and AnnotatedGenome instances from the database requires not just their + # ID (key attribute) values but also the ReferenceGenomeSet they belong to. Setting this + # attribute to the genome set instance of the results currently being loaded is a somewhat + # hacky method of passing this information to the unstructuring hook functions. There isn't + # a much better way of doing this without reimplementing a lot of the cattrs machinery. + self._current_genomeset = None + + def _init_converter(self): + """Initialize the cattrs converter instance. 
+ + This is a clone of the converter instance in gambit.util.json, with additional structuring + hooks registered to methods on this instance. + """ + self._converter = gjson.converter.copy() + self._converter.register_structure_hook(ReferenceGenomeSet, self._structure_genomeset) + self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome) + self._converter.register_structure_hook(Taxon, self._structure_taxon) + + def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: + """Read query results from JSON file. + + Parameters + ---------- + file_or_path + Readable file object or file path. + """ + with maybe_open(file_or_path) as f: + data = json.load(f) + + return self.results_from_json(data) + + def results_from_json(self, data: dict[str, Any]) -> QueryResults: + """Recreate results object from loaded JSON data.""" + + gset_key = data['genomeset']['key'] + gset_version = data['genomeset']['version'] + self._current_genomeset = self.session.query(ReferenceGenomeSet) \ + .filter_by(key=gset_key, version=gset_version) \ + .one() + + try: + return self._converter.structure(data, QueryResults) + + finally: + self._current_genomeset = None + + def _structure_genomeset(self, data: dict[str, Any], cls=None): + return self._current_genomeset + + def _structure_genome(self, data: dict[str, Any], cls=None) -> AnnotatedGenome: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(AnnotatedGenome)\ + .join(Genome)\ + .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ + .one() + + def _structure_taxon(self, data: dict[str, Any], cls=None) -> Taxon: + key = data['key'] + gset_id = self._current_genomeset.id + return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() diff --git a/src/gambit/results/__init__.py b/src/gambit/results/__init__.py deleted file mode 100644 index fefd38c..0000000 --- a/src/gambit/results/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Export query 
results in various formats.""" diff --git a/src/gambit/results/archive.py b/src/gambit/results/archive.py deleted file mode 100644 index 1f288cc..0000000 --- a/src/gambit/results/archive.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Export results to JSON.""" - -import json -from typing import Union, IO, Any -from functools import singledispatchmethod - -from sqlalchemy.orm import Session - -from gambit.query import QueryResults -from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome -import gambit.util.json as gjson -from gambit.util.io import FilePath, maybe_open -from .base import BaseJSONResultsExporter, _todict - - -class ResultsArchiveWriter(BaseJSONResultsExporter): - """Exports query results to "archive" format which captures all stored data. - - This format is not intended to be read by users of the application. - The exported data can be read and converted back into an identical :class:`QueryResults` - object using :class:`.ResultsArchiveReader`. - - Only the ID attributes of database models are saved, when loading the saved results the models - are recreated by database queries. - """ - - to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) - - @to_json.register(ReferenceGenomeSet) - def _genomeset_to_json(self, gset: ReferenceGenomeSet): - return _todict(gset, ['key', 'version']) - - @to_json.register(Taxon) - def _taxon_to_json(self, taxon: Taxon): - return _todict(taxon, ['key']) - - @to_json.register(AnnotatedGenome) - def _genome_to_json(self, genome: AnnotatedGenome): - return _todict(genome, ['key']) - - -class ResultsArchiveReader: - """Loads query results from file created by :class:`ResultsArchiveWriter`. - - Attributes - ---------- - session - SQLAlchemy session used to load database objects. 
- """ - session: Session - - def __init__(self, session): - self.session = session - - self._init_converter() - - # Loading the Taxon and AnnotatedGenome instances from the database requires not just their - # ID (key attribute) values but also the ReferenceGenomeSet they belong to. Setting this - # attribute to the genome set instance of the results currently being loaded is a somewhat - # hacky method of passing this information to the unstructuring hook functions. There isn't - # a much better way of doing this without reimplementing a lot of the cattrs machinery. - self._current_genomeset = None - - def _init_converter(self): - """Initialize the cattrs converter instance. - - This is a clone of the converter instance in gambit.util.json, with additional structuring - hooks registered to methods on this instance. - """ - self._converter = gjson.converter.copy() - self._converter.register_structure_hook(ReferenceGenomeSet, self._structure_genomeset) - self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome) - self._converter.register_structure_hook(Taxon, self._structure_taxon) - - def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: - """Read query results from JSON file. - - Parameters - ---------- - file_or_path - Readable file object or file path. 
- """ - with maybe_open(file_or_path) as f: - data = json.load(f) - - return self.results_from_json(data) - - def results_from_json(self, data: dict[str, Any]) -> QueryResults: - """Recreate results object from loaded JSON data.""" - - gset_key = data['genomeset']['key'] - gset_version = data['genomeset']['version'] - self._current_genomeset = self.session.query(ReferenceGenomeSet) \ - .filter_by(key=gset_key, version=gset_version) \ - .one() - - try: - return self._converter.structure(data, QueryResults) - - finally: - self._current_genomeset = None - - def _structure_genomeset(self, data: dict[str, Any], cls=None): - return self._current_genomeset - - def _structure_genome(self, data: dict[str, Any], cls=None) -> AnnotatedGenome: - key = data['key'] - gset_id = self._current_genomeset.id - return self.session.query(AnnotatedGenome)\ - .join(Genome)\ - .filter(AnnotatedGenome.genome_set_id == gset_id, Genome.key == key)\ - .one() - - def _structure_taxon(self, data: dict[str, Any], cls=None) -> Taxon: - key = data['key'] - gset_id = self._current_genomeset.id - return self.session.query(Taxon).filter_by(genome_set_id=gset_id, key=key).one() diff --git a/src/gambit/results/base.py b/src/gambit/results/base.py deleted file mode 100644 index 11b4c42..0000000 --- a/src/gambit/results/base.py +++ /dev/null @@ -1,55 +0,0 @@ -import json -from abc import ABC, abstractmethod -from typing import IO, Union, TextIO - -from attr import attrs, attrib - -from gambit.util.io import FilePath, maybe_open -import gambit.util.json as gjson -from gambit.query import QueryResults - - -class AbstractResultsExporter(ABC): - """Base for classes that export formatted query results. - - Subclasses must implement :meth:`export`. - """ - - @abstractmethod - def export(self, file_or_path: Union[FilePath, IO], results: QueryResults): - """Write query results to file. - - Parameters - ---------- - file_or_path - Open file-like object or file path to write to. - results - Results to export. 
- """ - - -def _todict(obj, attrs): - return {a: getattr(obj, a) for a in attrs} - - -@attrs() -class BaseJSONResultsExporter(AbstractResultsExporter): - """Base class for JSON exporters. - - Subclasses need to implement the ``to_json`` method. - - Attributes - ---------- - pretty - Write in more human-readable but less compact format. Defaults to False. - """ - pretty: bool = attrib(default=False) - - def to_json(self, obj): - """Convert object to JSON-compatible format (need not work recursively).""" - return gjson.to_json(obj) - - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): - opts = dict(indent=4, sort_keys=True) if self.pretty else dict() - with maybe_open(file_or_path, 'w') as f: - json.dump(results, f, default=self.to_json, **opts) diff --git a/src/gambit/results/csv.py b/src/gambit/results/csv.py deleted file mode 100644 index d4bb2e7..0000000 --- a/src/gambit/results/csv.py +++ /dev/null @@ -1,68 +0,0 @@ -"""Export query results to CSV.""" - -import csv -from typing import Any, Union, Iterable, TextIO - -from .base import AbstractResultsExporter -from gambit.query import QueryResultItem, QueryResults -from gambit.util.io import FilePath, maybe_open - - -def getattr_nested(obj, attrs: Union[str, Iterable[str]], pass_none=False): - if isinstance(attrs, str): - attrs = attrs.split('.') - - for attr in attrs: - if pass_none and obj is None: - return None - - obj = getattr(obj, attr) - - return obj - - -class CSVResultsExporter(AbstractResultsExporter): - """Exports query results in CSV format. - - Attributes - ---------- - format_opts - Dialect and other formatting arguments passed to :func:`csv.write`. 
- """ - format_opts: dict[str, Any] - - COLUMNS = [ - ('query', 'input.label'), - ('predicted.name', 'report_taxon.name'), - ('predicted.rank', 'report_taxon.rank'), - ('predicted.ncbi_id', 'report_taxon.ncbi_id'), - ('predicted.threshold', 'report_taxon.distance_threshold'), - ('closest.distance', 'classifier_result.closest_match.distance'), - ('closest.description', 'classifier_result.closest_match.genome.description'), - ('next.name', 'classifier_result.next_taxon.name'), - ('next.rank', 'classifier_result.next_taxon.rank'), - ('next.ncbi_id', 'classifier_result.next_taxon.ncbi_id'), - ('next.threshold', 'classifier_result.next_taxon.distance_threshold'), - ] - - def __init__(self, **format_opts): - if 'dialect' not in format_opts: - format_opts.setdefault('lineterminator', '\n') - format_opts.setdefault('quoting', csv.QUOTE_MINIMAL) - self.format_opts = format_opts - - def get_header(self) -> list[str]: - """Get values for header row.""" - return [name for name, _ in self.COLUMNS] - - def get_row(self, item: QueryResultItem) -> list: - """Get row values for single result item.""" - return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] - - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): - with maybe_open(file_or_path, 'w') as f: - writer = csv.writer(f, **self.format_opts) - - writer.writerow(self.get_header()) - for item in results.items: - writer.writerow(self.get_row(item)) diff --git a/src/gambit/results/json.py b/src/gambit/results/json.py deleted file mode 100644 index c285bcf..0000000 --- a/src/gambit/results/json.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Export results to JSON.""" - -from functools import singledispatchmethod - -from attr import attrs, asdict - -from .base import _todict, BaseJSONResultsExporter -from gambit.query import QueryResultItem, QueryResults, QueryInput -from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome - - -@attrs() -class 
JSONResultsExporter(BaseJSONResultsExporter): - """Exports query results in basic JSON format. - - Currently it assumes that the query was run with ``classify_strict=False``, so the only - relevant information from ``ClassifierResult`` is the closest genome match. - """ - - to_json = singledispatchmethod(BaseJSONResultsExporter.to_json) - - @to_json.register(QueryResults) - def _results_to_json(self, results: QueryResults): - data = asdict(results, recurse=False) - del data['params'] # Parameters not currently exposed thru CLI, so omit for now. - return data - - @to_json.register(QueryResultItem) - def _item_to_json(self, item: QueryResultItem): - return dict( - query=item.input, - predicted_taxon=item.report_taxon, - next_taxon=item.classifier_result.next_taxon, - closest_genomes=item.closest_genomes, - ) - - @to_json.register(QueryInput) - def _input_to_json(self, input: QueryInput): - return dict( - name=input.label, - path=None if input.file is None else input.file.path, - format=None if input.file is None else input.file.format, - ) - - @to_json.register(ReferenceGenomeSet) - def _genomeset_to_json(self, gset: ReferenceGenomeSet): - return _todict(gset, ['id', 'key', 'version', 'name', 'description']) - - @to_json.register(Taxon) - def _taxon_to_json(self, taxon: Taxon): - return _todict(taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) - - @to_json.register(AnnotatedGenome) - def _genome_to_json(self, genome: AnnotatedGenome): - data = _todict(genome, ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 'genbank_acc', 'refseq_acc']) - data['id'] = genome.genome_id - data['taxonomy'] = list(genome.taxon.ancestors(incself=True)) - return data diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index 2e0cafe..920df44 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -14,7 +14,7 @@ from gambit.seq import SequenceFile from 
gambit.db import ReferenceDatabase, reportable_taxon from gambit.query import QueryParams, query_parse -from gambit.results.archive import ResultsArchiveWriter +from gambit.results import ResultsArchiveWriter from gambit.util.misc import zip_strict diff --git a/tests/test_results.py b/tests/test_results.py index 70ac895..b739042 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -7,9 +7,7 @@ from gambit.db import ReferenceGenomeSet, Genome from gambit.sigs import SignaturesMeta from gambit.seq import SequenceFile -from gambit.results.json import JSONResultsExporter -from gambit.results.csv import CSVResultsExporter -from gambit.results.archive import ResultsArchiveReader, ResultsArchiveWriter +from gambit.results import JSONResultsExporter, CSVResultsExporter, ResultsArchiveReader, ResultsArchiveWriter from .results import check_json_results, check_csv_results diff --git a/tests/testdb.py b/tests/testdb.py index 71751d8..8439f48 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -15,7 +15,7 @@ from gambit.kmers import KmerSpec from gambit.sigs import load_signatures, AnnotatedSignatures from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset -from gambit.results.archive import ResultsArchiveReader +from gambit.results import ResultsArchiveReader from gambit.query import QueryResults From f25d3aca08410bdd9ac22a73493ce3f330c65b0b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:18:06 -0700 Subject: [PATCH 40/86] Move gambit.cli.test to tests/ --- src/gambit/cli/test.py => tests/cli/common.py | 2 +- tests/cli/test_common.py | 2 +- tests/cli/test_dist.py | 2 +- tests/cli/test_query.py | 2 +- tests/cli/test_signatures.py | 2 +- tests/cli/test_tree.py | 6 +++++- 6 files changed, 10 insertions(+), 6 deletions(-) rename src/gambit/cli/test.py => tests/cli/common.py (98%) diff --git a/src/gambit/cli/test.py b/tests/cli/common.py similarity index 98% rename from src/gambit/cli/test.py rename to tests/cli/common.py index 
125cc42..2b4cd1c 100644 --- a/src/gambit/cli/test.py +++ b/tests/cli/common.py @@ -6,7 +6,7 @@ import click from click.testing import CliRunner, Result -from .root import cli +from gambit.cli.root import cli DEFAULT_ENV = dict( diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py index df2acb7..c051454 100644 --- a/tests/cli/test_common.py +++ b/tests/cli/test_common.py @@ -8,13 +8,13 @@ import numpy as np from gambit.cli import cli, common -from gambit.cli.test import default_runner, allow_no_args from gambit.db import ReferenceDatabase from gambit.seq import SequenceFile from gambit.util.misc import zip_strict from gambit.util.io import write_lines, FilePath from ..testdb import TestDB +from .common import default_runner, allow_no_args class TestCLIContext: diff --git a/tests/cli/test_dist.py b/tests/cli/test_dist.py index 8dc262f..0e624e6 100644 --- a/tests/cli/test_dist.py +++ b/tests/cli/test_dist.py @@ -10,7 +10,6 @@ from gambit.kmers import KmerSpec from gambit.metric import jaccarddist_matrix from gambit.sigs import SignatureList, dump_signatures -from gambit.cli.test import invoke_cli from gambit.util.io import write_lines from gambit.cluster import load_dmat_csv import gambit.util.json as gjson @@ -19,6 +18,7 @@ from gambit.cli.common import strip_seq_file_ext from ..testdb import TestDB +from .common import invoke_cli def get_query_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[SequenceFile]: diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index 6702fb3..8b00dcd 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -8,7 +8,6 @@ import pytest -from gambit.cli.test import invoke_cli from gambit.seq import SequenceFile from gambit.query import QueryInput, QueryResults from gambit.util.misc import zip_strict @@ -17,6 +16,7 @@ from ..testdb import TestDB from ..results import check_json_results, check_csv_results +from .common import invoke_cli def make_args(testdb: TestDB, *, diff --git 
a/tests/cli/test_signatures.py b/tests/cli/test_signatures.py index e53f860..182e053 100644 --- a/tests/cli/test_signatures.py +++ b/tests/cli/test_signatures.py @@ -6,7 +6,6 @@ import pytest import numpy as np -from gambit.cli.test import invoke_cli import gambit.util.json as gjson from gambit.sigs import SignaturesMeta, load_signatures from gambit.util.io import write_lines @@ -14,6 +13,7 @@ from gambit.kmers import DEFAULT_KMERSPEC from ..testdb import TestDB +from .common import invoke_cli class TestInfoCommand: diff --git a/tests/cli/test_tree.py b/tests/cli/test_tree.py index f25041b..891dacf 100644 --- a/tests/cli/test_tree.py +++ b/tests/cli/test_tree.py @@ -6,19 +6,23 @@ from Bio import Phylo from gambit.metric import jaccarddist_pairwise -from gambit.cli.test import invoke_cli from gambit.cluster import hclust, check_tree_matches_linkage from gambit.cli import common +from .common import invoke_cli + + @pytest.fixture() def expected_dmat(testdb): sigs = testdb.query_signatures return jaccarddist_pairwise(sigs) + @pytest.fixture() def expected_linkage(expected_dmat): return hclust(expected_dmat) + @pytest.mark.parametrize('from_sigs', [False, True]) def test_tree_command(from_sigs, expected_linkage, testdb): """Test running the command and checking the output.""" From fed9466d90a0f6b9d46932e58afd25333d1607f4 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:21:28 -0700 Subject: [PATCH 41/86] Remove reference to unused pytest marker --- setup.cfg | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8c97121..442fa38 100644 --- a/setup.cfg +++ b/setup.cfg @@ -64,11 +64,6 @@ python_functions = test_* benchmark_* python_classes = Test* Benchmark* -# Custom markers -markers = - testdb_nqueries: number of query files from test database to use when testing CLI. 
- - # Flake8 settings [flake8] From 5d844dac950a5db974a2461f1d00dfe60d38bb95 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:43:23 -0700 Subject: [PATCH 42/86] Remove unused import --- tests/test_tests_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tests_common.py b/tests/test_tests_common.py index 9ec7f7a..5ddac65 100644 --- a/tests/test_tests_common.py +++ b/tests/test_tests_common.py @@ -6,7 +6,6 @@ from gambit.kmers import KmerSpec, kmer_to_index, nkmers from gambit.seq import revcomp from gambit.sigs.calc import dense_to_sparse -from gambit.util.progress import get_progress from . import common From 084fbe0747b0fb30a85a130bce0374719784538c Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 20:46:35 -0700 Subject: [PATCH 43/86] Remove database migrations module --- docs/source/api/database.rst | 6 - setup.cfg | 5 +- src/gambit/db/migrate/__init__.py | 137 ------------------ src/gambit/db/migrate/alembic.ini | 89 ------------ src/gambit/db/migrate/alembic/README | 1 - src/gambit/db/migrate/alembic/env.py | 62 -------- src/gambit/db/migrate/alembic/script.py.mako | 24 --- .../versions/c43540b80d50_gambit_0_1_0.py | 98 ------------- tests/db/test_migrate.py | 68 --------- 9 files changed, 2 insertions(+), 488 deletions(-) delete mode 100644 src/gambit/db/migrate/__init__.py delete mode 100644 src/gambit/db/migrate/alembic.ini delete mode 100644 src/gambit/db/migrate/alembic/README delete mode 100644 src/gambit/db/migrate/alembic/env.py delete mode 100644 src/gambit/db/migrate/alembic/script.py.mako delete mode 100644 src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py delete mode 100644 tests/db/test_migrate.py diff --git a/docs/source/api/database.rst b/docs/source/api/database.rst index bfcf307..1655432 100644 --- a/docs/source/api/database.rst +++ b/docs/source/api/database.rst @@ -32,9 +32,3 @@ gambit.db.sqla .. 
autoclass:: ReadOnlySession :exclude-members: __init__, __new__ :no-members: - - -gambit.db.migrate ------------------ - -.. automodule:: gambit.db.migrate diff --git a/setup.cfg b/setup.cfg index 442fa38..2295cc8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,6 @@ install_requires = sqlalchemy~=1.1 # Seq stores data as bytes biopython~=1.79 - alembic~=1.0 attrs>=20 # Minimum for 3.12, also introduces potentially breaking changes cattrs>=23.2 @@ -55,8 +54,8 @@ test = pytest # Also check docstrings in package testpaths = tests gambit -# Run doctests on all modules (except __main__.py and alembic config directory) -addopts = --doctest-modules --ignore-glob "**/__main__.py" --ignore "gambit/db/migrate/alembic/" +# Run doctests on all modules (except __main__.py) +addopts = --doctest-modules --ignore-glob "**/__main__.py" doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL # Treat functions/classes prefixed with "benchmark" as tests, for files in tests/benchmarks/. diff --git a/src/gambit/db/migrate/__init__.py b/src/gambit/db/migrate/__init__.py deleted file mode 100644 index c10d164..0000000 --- a/src/gambit/db/migrate/__init__.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Perform genome database migrations with Alembic. - -This package contains all Alembic configuration and data files. Revision files are located in -``./alembic/versions``. - -Note on alembic configuration - seems like normal usage of Alembic involves getting the database URL -from ``alembic.ini``. Since this application has no fixed location for the database we can't use -this method. Instead we are following the -`Sharing a Connection with a Series of Migration Commands and Environments `_ -recipe in Alembic's documentation, where the connectable object is generated programmatically -somehow and then attached to the Alembic configuration object's ``attributes`` dict. 
The -``run_migrations_offline`` and ``run_migrations_online`` functions in ``alembic/env.py`` are -modified from the version generated by ``alembic init`` to get their connectable object from this -dict instead of creating it based on the contents of ``alembic.ini``. Note that this means we -can't do (online) migration stuff from the standard alembic CLI command, which gets its -connection information only from ``alembic.ini``. - -The way to use this setup is instead to create an :class:`alembic.config.Config` instance with -:func:`.get_alembic_config` and use the functions in :mod:`alembic.command`. - -.. _alembic-recipe: https://alembic.sqlalchemy.org/en/latest/cookbook.html#sharing-a-connection-with-a-series-of-migration-commands-and-environments -""" - -from typing import Optional - -from alembic.config import Config -from alembic import command -from alembic.migration import MigrationContext -from alembic.script import ScriptDirectory -from pkg_resources import resource_filename -from sqlalchemy.engine import Connectable - - -INI_PATH = resource_filename(__name__, 'alembic.ini') - - -def get_alembic_config(connectable: Optional[Connectable] = None, **kwargs) -> Config: - """Get an alembic config object to perform migrations. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying database connection info (optional). Assigned to - ``'connectable'`` key of :attr:`alembic.config.Config.attributes`. - \\**kwargs - Keyword arguments to pass to :meth:`alembic.config.Config.__init__`. - - Returns - ------- - Alembic config object. 
- """ - config = Config(INI_PATH, **kwargs) - config.attributes['connectable'] = connectable - - return config - - -def current_head() -> str: - """Get the current head revision number.""" - conf = get_alembic_config() - scriptdir = ScriptDirectory.from_config(conf) - return scriptdir.get_current_head() - - -def current_revision(connectable: Connectable) -> str: - """Get the current revision number of a genome database.""" - with connectable.connect() as conn: - ctx = MigrationContext.configure(conn) - return ctx.get_current_revision() - - -def is_current_revision(connectable: Connectable): - """Check if the current revision of a genome database is the most recent (head) revision.""" - head = current_head() - current = current_revision(connectable) - return current == head - - -def upgrade(connectable: Connectable, revision: str = 'head', tag=None, **kwargs): - """Run the alembic upgrade command. - - See :func:`alembic.command.upgrade` for more information on how this works. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying genome database connection info. - revision - Revision to upgrade to. Passed to :func:`alembic.command.upgrade`. - tag - Passed to :func:`alembic.command.upgrade`. - \\**kwargs - Passed to :func:`.get_alembic_config`. - """ - config = get_alembic_config(connectable, **kwargs) - command.upgrade(config, revision, tag=tag) - - -def init_db(connectable: Connectable): - """ - Initialize the genome database schema by creating all tables and stamping with the latest - Alembic revision. - - Expects a fresh database that does not already contain any tables for the :mod:`gambit.db.models` - models and has not had any migrations run on it yet. - - Parameters - ---------- - connectable - SQLAlchemy connectable specifying database connection info. - - Raises - ------ - RuntimeError - If the database is already stamped with an Alembic revision. 
- sqlalchemy.exc.OperationalError - If any of the database tables to be created already exist. - """ - from gambit.db.models import Base - - conf = get_alembic_config() - script = ScriptDirectory.from_config(conf) - - with connectable.connect() as conn: - ctx = MigrationContext.configure(conn) - - # Check there is no current revision stamped - current = ctx.get_current_revision() - if current is not None: - raise RuntimeError(f'Expected uninitialized database, but current alembic revision is {current}') - - # Create tables - # Set checkfirst=false so that we get an SQL error if any tables already exist - Base.metadata.create_all(conn, checkfirst=False) - - # Stamp latest alembic version - ctx.stamp(script, 'head') diff --git a/src/gambit/db/migrate/alembic.ini b/src/gambit/db/migrate/alembic.ini deleted file mode 100644 index e1d4181..0000000 --- a/src/gambit/db/migrate/alembic.ini +++ /dev/null @@ -1,89 +0,0 @@ -# A generic, single database configuration. - -[alembic] -# Database connection set dynamically in migrate.get_alembic_config function. - -# path to migration scripts -script_location = gambit.db.migrate:alembic - -# template used to generate migration files -# file_template = %%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = . - -# timezone to use when rendering the date -# within the migration file as well as the filename. 
-# string value is passed to dateutil.tz.gettz() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the -# "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; this defaults -# to ./alembic/versions. When using multiple version -# directories, initial revisions must be specified with --version-path -# version_locations = %(here)s/bar %(here)s/bat ./alembic/versions - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARN -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARN -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/src/gambit/db/migrate/alembic/README b/src/gambit/db/migrate/alembic/README deleted file mode 100644 index 98e4f9c..0000000 --- a/src/gambit/db/migrate/alembic/README +++ /dev/null @@ -1 +0,0 @@ -Generic 
single-database configuration. \ No newline at end of file diff --git a/src/gambit/db/migrate/alembic/env.py b/src/gambit/db/migrate/alembic/env.py deleted file mode 100644 index bc15eb7..0000000 --- a/src/gambit/db/migrate/alembic/env.py +++ /dev/null @@ -1,62 +0,0 @@ -from logging.config import fileConfig - -from alembic import context - -# this is the Alembic Config object, which provides -# access to the values within the .ini file in use. -config = context.config - -# Interpret the config file for Python logging. -# This line sets up loggers basically. -fileConfig(config.config_file_name) - -# add your model's MetaData object here -# for 'autogenerate' support -from gambit.db.models import Base -target_metadata = Base.metadata - -# other values from the config, defined by the needs of env.py, -# can be acquired: -# my_important_option = config.get_main_option("my_important_option") -# ... etc. - - -def run_migrations_offline(): - """Run migrations in 'offline' mode. - - Since we don't have a connection URL written into alembic.ini, we need to specify the - "dialect_name" argument. - """ - context.configure( - dialect_name='sqlite', - target_metadata=target_metadata, - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) - - with context.begin_transaction(): - context.run_migrations() - - -def run_migrations_online(): - """Run migrations in 'online' mode. - - Expects a value for the "connectable" argument to migrate.get_alembic_config(). 
- """ - connectable = config.attributes.get('connectable') - if connectable is None: - raise RuntimeError('Connectable object must be passed to gambit.db.migrate.get_alembic_config()') - - with connectable.connect() as connection: - context.configure( - connection=connection, target_metadata=target_metadata - ) - - with context.begin_transaction(): - context.run_migrations() - - -if context.is_offline_mode(): - run_migrations_offline() -else: - run_migrations_online() diff --git a/src/gambit/db/migrate/alembic/script.py.mako b/src/gambit/db/migrate/alembic/script.py.mako deleted file mode 100644 index 2c01563..0000000 --- a/src/gambit/db/migrate/alembic/script.py.mako +++ /dev/null @@ -1,24 +0,0 @@ -"""${message} - -Revision ID: ${up_revision} -Revises: ${down_revision | comma,n} -Create Date: ${create_date} - -""" -from alembic import op -import sqlalchemy as sa -${imports if imports else ""} - -# revision identifiers, used by Alembic. -revision = ${repr(up_revision)} -down_revision = ${repr(down_revision)} -branch_labels = ${repr(branch_labels)} -depends_on = ${repr(depends_on)} - - -def upgrade(): - ${upgrades if upgrades else "pass"} - - -def downgrade(): - ${downgrades if downgrades else "pass"} diff --git a/src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py b/src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py deleted file mode 100644 index 2548f7b..0000000 --- a/src/gambit/db/migrate/alembic/versions/c43540b80d50_gambit_0_1_0.py +++ /dev/null @@ -1,98 +0,0 @@ -"""GAMBIT 0.1.0 - -Revision ID: c43540b80d50 -Revises: -Create Date: 2021-07-08 13:34:30.131392 - -Creates 0.1.0 database from scratch. -""" -from alembic import op -import sqlalchemy as sa - -from gambit.db.sqla import JsonString - - -# revision identifiers, used by Alembic. 
-revision = 'c43540b80d50' -down_revision = None -branch_labels = None -depends_on = None - - -def upgrade(): - op.create_table('genome_sets', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('version', sa.String(), nullable=True), - sa.Column('name', sa.String(), nullable=False), - sa.Column('description', sa.String(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.PrimaryKeyConstraint('id', name=op.f('pk_genome_sets')), - sa.UniqueConstraint('key', 'version', name=op.f('uq_genome_sets_key')) - ) - op.create_index(op.f('ix_genome_sets_key'), 'genome_sets', ['key'], unique=False) - - op.create_table('genomes', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('description', sa.String(), nullable=False), - sa.Column('ncbi_db', sa.String(), nullable=True), - sa.Column('ncbi_id', sa.Integer(), nullable=True), - sa.Column('genbank_acc', sa.String(), nullable=True), - sa.Column('refseq_acc', sa.String(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.PrimaryKeyConstraint('id', name=op.f('pk_genomes')), - sa.UniqueConstraint('genbank_acc', name=op.f('uq_genomes_genbank_acc')), - sa.UniqueConstraint('key', name=op.f('uq_genomes_key')), - sa.UniqueConstraint('ncbi_db', 'ncbi_id', name=op.f('uq_genomes_ncbi_db')), - sa.UniqueConstraint('refseq_acc', name=op.f('uq_genomes_refseq_acc')) - ) - - op.create_table('taxa', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('key', sa.String(), nullable=False), - sa.Column('name', sa.String(), nullable=False), - sa.Column('rank', sa.String(), nullable=True), - sa.Column('description', sa.String(), nullable=True), - sa.Column('distance_threshold', sa.Float(), nullable=True), - sa.Column('report', sa.Boolean(), server_default=sa.text('1'), nullable=False), - sa.Column('genome_set_id', sa.Integer(), nullable=False), - sa.Column('parent_id', sa.Integer(), 
nullable=True), - sa.Column('ncbi_id', sa.Integer(), nullable=True), - sa.Column('extra', JsonString(), nullable=True), - sa.ForeignKeyConstraint(['genome_set_id'], ['genome_sets.id'], name=op.f('fk_taxa_genome_set_id_genome_sets'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['parent_id'], ['taxa.id'], name=op.f('fk_taxa_parent_id_taxa'), ondelete='SET NULL'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_taxa')), - sa.UniqueConstraint('key', name=op.f('uq_taxa_key')) - ) - op.create_index(op.f('ix_taxa_genome_set_id'), 'taxa', ['genome_set_id'], unique=False) - op.create_index(op.f('ix_taxa_name'), 'taxa', ['name'], unique=False) - op.create_index(op.f('ix_taxa_ncbi_id'), 'taxa', ['ncbi_id'], unique=False) - op.create_index(op.f('ix_taxa_parent_id'), 'taxa', ['parent_id'], unique=False) - op.create_index(op.f('ix_taxa_rank'), 'taxa', ['rank'], unique=False) - - op.create_table('genome_annotations', - sa.Column('genome_id', sa.Integer(), nullable=False), - sa.Column('genome_set_id', sa.Integer(), nullable=False), - sa.Column('taxon_id', sa.Integer(), nullable=True), - sa.Column('organism', sa.String(), nullable=True), - sa.ForeignKeyConstraint(['genome_id'], ['genomes.id'], name=op.f('fk_genome_annotations_genome_id_genomes'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['genome_set_id'], ['genome_sets.id'], name=op.f('fk_genome_annotations_genome_set_id_genome_sets'), ondelete='CASCADE'), - sa.ForeignKeyConstraint(['taxon_id'], ['taxa.id'], name=op.f('fk_genome_annotations_taxon_id_taxa'), ondelete='SET NULL'), - sa.PrimaryKeyConstraint('genome_id', 'genome_set_id', name=op.f('pk_genome_annotations')) - ) - op.create_index(op.f('ix_genome_annotations_taxon_id'), 'genome_annotations', ['taxon_id'], unique=False) - - -def downgrade(): - op.drop_index(op.f('ix_genome_annotations_taxon_id'), table_name='genome_annotations') - op.drop_table('genome_annotations') - op.drop_index(op.f('ix_taxa_rank'), table_name='taxa') - op.drop_index(op.f('ix_taxa_parent_id'), 
table_name='taxa') - op.drop_index(op.f('ix_taxa_ncbi_id'), table_name='taxa') - op.drop_index(op.f('ix_taxa_name'), table_name='taxa') - op.drop_index(op.f('ix_taxa_genome_set_id'), table_name='taxa') - op.drop_table('taxa') - op.drop_table('genomes') - op.drop_index(op.f('ix_genome_sets_key'), table_name='genome_sets') - op.drop_table('genome_sets') diff --git a/tests/db/test_migrate.py b/tests/db/test_migrate.py deleted file mode 100644 index de8845b..0000000 --- a/tests/db/test_migrate.py +++ /dev/null @@ -1,68 +0,0 @@ -"""Test the gambit.db.migrate module.""" - -import pytest -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from alembic.migration import MigrationContext -from alembic.script import ScriptDirectory - -from gambit.db.migrate import (current_head, current_revision, is_current_revision, init_db, - get_alembic_config) -from gambit.db import models - - -# Expected current head revision -# Need to update this value every time a new revision is introduced -CURRENT_HEAD = 'c43540b80d50' - -# Old revision number to test. Must actually exist in the scripts directory. 
-# TODO - set this once we have more than one revision file -OLD_REVISION = None - - -def test_current_head(): - assert current_head() == CURRENT_HEAD - - -class TestCurrentRevision: - """Test the current_revision() and is_current_revision() functions.""" - - def test_uninitialized(self): - """Test on uninitialized database (not stamped).""" - engine = create_engine('sqlite:///:memory:') - assert current_revision(engine) is None - assert not is_current_revision(engine) - - def test_empty(self): - """Test on freshly initialized database.""" - engine = create_engine('sqlite:///:memory:') - init_db(engine) - assert current_revision(engine) == CURRENT_HEAD - assert is_current_revision(engine) - - @pytest.mark.skipif(OLD_REVISION is None, reason='No older revisions to test.') - def test_old(self): - """Test on uninitialized database stamped with an old revision no.""" - engine = create_engine('sqlite:///:memory:') - conf = get_alembic_config(engine) - scriptdir = ScriptDirectory.from_config(conf) - - with engine.connect() as conn: - ctx = MigrationContext.configure(conn) - ctx.stamp(scriptdir, OLD_REVISION) - - assert current_revision(engine) == OLD_REVISION - - -def test_init_db(): - """Test the init_db() function.""" - engine = create_engine('sqlite:///:memory:') - init_db(engine) - - # Check current revision matches - assert current_revision(engine) == current_head() - - # Check we can query all models (won't return any results, but would fail if tables didn't exist). 
- session = sessionmaker(engine)() - for model in [models.Genome, models.ReferenceGenomeSet, models.AnnotatedGenome, models.Taxon]: - session.query(model).all() From 2ec997bb922af19f14ab1366b4540fb19f2a09f3 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 21:03:05 -0700 Subject: [PATCH 44/86] Remove removed modules from API docs --- docs/source/api/kmers.rst | 6 ------ docs/source/api/results.rst | 24 ------------------------ 2 files changed, 30 deletions(-) diff --git a/docs/source/api/kmers.rst b/docs/source/api/kmers.rst index 36fdc14..ef890a1 100644 --- a/docs/source/api/kmers.rst +++ b/docs/source/api/kmers.rst @@ -39,12 +39,6 @@ gambit.sigs.calc .. automodule:: gambit.sigs.calc -gambit.sigs.convert -------------------------- - -.. automodule:: gambit.sigs.convert - - gambit.sigs.hdf5 ---------------------- diff --git a/docs/source/api/results.rst b/docs/source/api/results.rst index fb889cb..c3ccda0 100644 --- a/docs/source/api/results.rst +++ b/docs/source/api/results.rst @@ -6,27 +6,3 @@ gambit.results ----------------- .. automodule:: gambit.results - - -gambit.results.base ----------------------- - -.. automodule:: gambit.results.base - - -gambit.results.json ----------------------- - -.. automodule:: gambit.results.json - - -gambit.results.csv ---------------------- - -.. automodule:: gambit.results.csv - - -gambit.results.archive -------------------------- - -.. 
automodule:: gambit.results.archive From 0f846ff18135fda9fae9f5e4316fb8899cbcb798 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 21:37:06 -0700 Subject: [PATCH 45/86] Fix deprecated setup.cfg option --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2295cc8..686de05 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,7 +6,7 @@ author = Jared Lumpe author_email = jared@jaredlumpe.com url = http://github.com/jlumpe/gambit license = AGPL-3.0-or-later -license_file = LICENSE +license_files = LICENSE [options] packages = find: From ebb993b234b59d9e53ca8d3611d01199c62a1ebe Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 22:35:06 -0700 Subject: [PATCH 46/86] Set Cython compiler directives globally --- setup.py | 10 +++++++++- src/gambit/_cython/kmers.pxd | 2 -- src/gambit/_cython/kmers.pyx | 1 - src/gambit/_cython/metric.pxd | 2 -- src/gambit/_cython/metric.pyx | 8 -------- 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 2385bb8..01bbbad 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,14 @@ extra_compile_args=['-fopenmp', '-Wno-sign-compare'], extra_link_args=['-fopenmp'], )] +ext_modules = cythonize( + extensions, + compiler_directives=dict( + language_level='3str', + boundscheck=False, + wraparound=False, + ), +) -setup(ext_modules=cythonize(extensions)) +setup(ext_modules=ext_modules) diff --git a/src/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd index 88ba81e..e5957c9 100644 --- a/src/gambit/_cython/kmers.pxd +++ b/src/gambit/_cython/kmers.pxd @@ -1,5 +1,3 @@ -# cython: language_level = 3str - cimport numpy as np ctypedef unsigned char CHAR diff --git a/src/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx index f0de678..7690578 100644 --- a/src/gambit/_cython/kmers.pyx +++ b/src/gambit/_cython/kmers.pyx @@ -1,4 +1,3 @@ -# cython: language_level = 3str, wraparound = False, boundscheck = False """Cython module for working with 
DNA sequences and k-mers.""" diff --git a/src/gambit/_cython/metric.pxd b/src/gambit/_cython/metric.pxd index 9b91b06..62d3f8e 100644 --- a/src/gambit/_cython/metric.pxd +++ b/src/gambit/_cython/metric.pxd @@ -1,5 +1,3 @@ -# cython: language_level = 3str - from .types cimport SCORE_T, BOUNDS_T, COORDS_T, COORDS_T_2 cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil diff --git a/src/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx index e543d0c..caebced 100644 --- a/src/gambit/_cython/metric.pyx +++ b/src/gambit/_cython/metric.pyx @@ -1,10 +1,6 @@ -# cython: language_level = 3str, wraparound = False - """Cython functions for calculating k-mer distance metrics""" -cimport cython cimport numpy as np - import numpy as np from cython.parallel import prange, parallel @@ -72,8 +68,6 @@ def jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2): return c_jaccarddist(coords1, coords2) -@cython.boundscheck(False) -@cython.wraparound(False) cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: """Compute the Jaccard distance between two k-mer sets in ordered coordinate format. @@ -121,8 +115,6 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: return (2 * u - N - M) / u -@cython.boundscheck(False) -@cython.wraparound(False) def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[:] ref_bounds, SCORE_T[:] out): """Calculate Jaccard distances between a query k-mer set and a collection of reference sets. 
From 3a9258f7e6a54de25cb6fc0822d8d9d82c589a46 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 22:43:27 -0700 Subject: [PATCH 47/86] Update Cython integer <-> kmer funcs --- src/gambit/_cython/kmers.pxd | 4 +-- src/gambit/_cython/kmers.pyx | 60 ++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd index e5957c9..87dcfab 100644 --- a/src/gambit/_cython/kmers.pxd +++ b/src/gambit/_cython/kmers.pxd @@ -3,7 +3,7 @@ cimport numpy as np ctypedef unsigned char CHAR -cpdef np.uint64_t kmer_to_index(const CHAR[:]) nogil except? 0 -cpdef np.uint64_t kmer_to_index_rc(const CHAR[:]) nogil except? 0 +cdef np.uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil +cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil cdef void c_index_to_kmer(np.uint64_t, CHAR[:]) nogil cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil diff --git a/src/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx index 7690578..3dacc70 100644 --- a/src/gambit/_cython/kmers.pyx +++ b/src/gambit/_cython/kmers.pyx @@ -1,20 +1,39 @@ +"""Cython module for working with DNA sequences and k-mers. -"""Cython module for working with DNA sequences and k-mers.""" +Note: each of the 4 Python functions here have a C counterpart that does the actual work. The Python +version is just a wrapper that does any needed conversion, allocates buffers, and raises exceptions +if needed. The separation currently isn't necessary as the C functions aren't used anywhere else +outside the wrappers, but they may be in the future. Handling exceptions in the Python wrappers only +allows the C functions to be declared with nogil. +""" -cpdef np.uint64_t kmer_to_index(const CHAR[:] kmer) nogil except? 0: - """kmer_to_index(kmer) +def kmer_to_index(const CHAR[:] kmer): + """kmer_to_index(kmer: bytes) -> int Convert k-mer byte string to its integer index. 
""" + cdef: + np.uint64_t idx + bint exc = False + + if kmer.shape[0] > 32: + raise ValueError('k must be <= 32') + + idx = c_kmer_to_index(kmer, &exc) + + if exc: + raise ValueError('Invalid character in k-mer') + + return idx + + +cdef np.uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: cdef: np.uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc - if k > 32: - raise ValueError('k must be <= 32') - for i in range(k): nuc = kmer[i] @@ -30,24 +49,38 @@ cpdef np.uint64_t kmer_to_index(const CHAR[:] kmer) nogil except? 0: elif nuc == 'T': idx += 3 else: - raise ValueError(nuc) + exc[0] = True + return 0 return idx -cpdef np.uint64_t kmer_to_index_rc(const CHAR[:] kmer) nogil except? 0: - """kmer_to_index_rc(kmer) +def kmer_to_index_rc(const CHAR[:] kmer): + """kmer_to_index_rc(kmer: bytes) -> int Get the integer index of the reverse complement of a k-mer byte string. """ + cdef: + np.uint64_t idx + bint exc = False + + if kmer.shape[0] > 32: + raise ValueError('k must be <= 32') + + idx = c_kmer_to_index_rc(kmer, &exc) + + if exc: + raise ValueError('Invalid character in k-mer') + + return idx + + +cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: cdef: np.uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc - if k > 32: - raise ValueError('k must be <= 32') - for i in range(k): nuc = kmer[k - i - 1] @@ -63,7 +96,8 @@ cpdef np.uint64_t kmer_to_index_rc(const CHAR[:] kmer) nogil except? 
0: elif nuc == 'T': idx += 0 else: - raise ValueError(nuc) + exc[0] = True + return 0 return idx From 460bf514c7f811dbf0bb8e372e483f93cecf871d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 22:44:23 -0700 Subject: [PATCH 48/86] Use Cython 3 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a3fd32e..a040ee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ requires = [ "setuptools", "wheel", - "Cython ~= 0.27", + "Cython >= 3.0", # If the Numpy version is different at runtime than build time, the build version should be # lower as the ABI is forward- but not backwards-compatible. "oldest-supported-numpy", From c9ad98d65cb8a8c6b3aa68e217599bd8462b5cc1 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 23:42:49 -0700 Subject: [PATCH 49/86] Fix type annotation and docstring --- src/gambit/kmers.py | 2 +- src/gambit/metric.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gambit/kmers.py b/src/gambit/kmers.py index 26e150f..34ccb6c 100644 --- a/src/gambit/kmers.py +++ b/src/gambit/kmers.py @@ -129,7 +129,7 @@ def __from_json__(cls, data: dict[str, Any]) -> 'KmerSpec': @attrs(slots=True) class KmerMatch: - """Represents a + """Represents the location of a k-mer prefix found within a DNA sequence. Attributes ---------- diff --git a/src/gambit/metric.py b/src/gambit/metric.py index cc23e34..2b0f292 100644 --- a/src/gambit/metric.py +++ b/src/gambit/metric.py @@ -49,7 +49,7 @@ def jaccard_bits(bits1: np.ndarray, bits2: np.ndarray) -> float: return 1. if union == 0 else intersection / union -def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: np.ndarray = None) -> np.ndarray: +def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: Optional[np.ndarray] = None) -> np.ndarray: """ Calculate Jaccard distances between a query k-mer signature and a list of reference signatures. 
From 429170b3b17214002ebcfcd4dfdeaf0f4387016a Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 23:43:10 -0700 Subject: [PATCH 50/86] Fix minor error --- src/gambit/sigs/calc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index 74b0eef..67773bb 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -208,7 +208,7 @@ def calc_file_signature(kspec: KmerSpec, .calc_file_signatures """ with seqfile.parse() as records: - return calc_signature(kspec, (record.seq for record in records)) + return calc_signature(kspec, (record.seq for record in records), accumulator=accumulator) def calc_file_signatures(kspec: KmerSpec, From 84c363fe1a04a458510cd774e8d3fcc57ced16fa Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 23:22:29 -0700 Subject: [PATCH 51/86] Remove Cython dependency on Numpy --- pyproject.toml | 3 --- setup.py | 3 --- src/gambit/_cython/kmers.pxd | 8 +++---- src/gambit/_cython/kmers.pyx | 14 ++++++------- src/gambit/_cython/metric.pxd | 1 + src/gambit/_cython/metric.pyx | 17 +++++---------- src/gambit/_cython/threads.pyx | 38 +++++++++++++++++++++++++--------- src/gambit/_cython/types.pxd | 31 +++++++++++++-------------- src/gambit/metric.py | 11 +++++++--- src/gambit/sigs/__init__.py | 2 +- src/gambit/sigs/base.py | 7 ++++++- src/gambit/sigs/hdf5.py | 3 +-- tests/test_metric.py | 4 ++-- 13 files changed, 79 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a040ee9..b685eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,5 @@ requires = [ "setuptools", "wheel", "Cython >= 3.0", - # If the Numpy version is different at runtime than build time, the build version should be - # lower as the ABI is forward- but not backwards-compatible. 
- "oldest-supported-numpy", ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 01bbbad..de06f94 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,12 @@ from setuptools import setup from distutils.extension import Extension from Cython.Build import cythonize -import numpy # Cython extensions -np_include = numpy.get_include() extensions = [Extension( 'gambit._cython.*', ['src/gambit/_cython/*.pyx'], - include_dirs=[np_include], extra_compile_args=['-fopenmp', '-Wno-sign-compare'], extra_link_args=['-fopenmp'], )] diff --git a/src/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd index 87dcfab..c3f57be 100644 --- a/src/gambit/_cython/kmers.pxd +++ b/src/gambit/_cython/kmers.pxd @@ -1,9 +1,9 @@ -cimport numpy as np +from libc.stdint cimport uint64_t, intptr_t ctypedef unsigned char CHAR -cdef np.uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil -cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil -cdef void c_index_to_kmer(np.uint64_t, CHAR[:]) nogil +cdef uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil +cdef uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil +cdef void c_index_to_kmer(uint64_t, CHAR[:]) nogil cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil diff --git a/src/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx index 3dacc70..022853d 100644 --- a/src/gambit/_cython/kmers.pyx +++ b/src/gambit/_cython/kmers.pyx @@ -14,7 +14,7 @@ def kmer_to_index(const CHAR[:] kmer): Convert k-mer byte string to its integer index. 
""" cdef: - np.uint64_t idx + uint64_t idx bint exc = False if kmer.shape[0] > 32: @@ -28,9 +28,9 @@ def kmer_to_index(const CHAR[:] kmer): return idx -cdef np.uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: +cdef uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: cdef: - np.uint64_t idx = 0 + uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc @@ -61,7 +61,7 @@ def kmer_to_index_rc(const CHAR[:] kmer): Get the integer index of the reverse complement of a k-mer byte string. """ cdef: - np.uint64_t idx + uint64_t idx bint exc = False if kmer.shape[0] > 32: @@ -75,9 +75,9 @@ def kmer_to_index_rc(const CHAR[:] kmer): return idx -cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: +cdef uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: cdef: - np.uint64_t idx = 0 + uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc @@ -112,7 +112,7 @@ def index_to_kmer(index, int k): return bytes(buf) -cdef void c_index_to_kmer(np.uint64_t index, CHAR[:] out) nogil: +cdef void c_index_to_kmer(uint64_t index, CHAR[:] out) nogil: """Convert k-mer index to sequence.""" cdef: int k = out.shape[0] diff --git a/src/gambit/_cython/metric.pxd b/src/gambit/_cython/metric.pxd index 62d3f8e..125ec64 100644 --- a/src/gambit/_cython/metric.pxd +++ b/src/gambit/_cython/metric.pxd @@ -1,3 +1,4 @@ +from libc.stdint cimport intptr_t from .types cimport SCORE_T, BOUNDS_T, COORDS_T, COORDS_T_2 cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil diff --git a/src/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx index caebced..45f9074 100644 --- a/src/gambit/_cython/metric.pyx +++ b/src/gambit/_cython/metric.pyx @@ -1,15 +1,8 @@ """Cython functions for calculating k-mer distance metrics""" -cimport numpy as np -import numpy as np from cython.parallel import prange, parallel -# Numpy dtypes equivalent to SCORE_T and BOUNDS_T -SCORE_DTYPE = np.dtype(np.float32) -BOUNDS_DTYPE = np.dtype(np.intp) - - def 
jaccard(COORDS_T[:] coords1, COORDS_T_2[:] coords2): """Compute the Jaccard index between two k-mer sets in sparse coordinate format. @@ -76,15 +69,15 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: cdef: # Lengths of the two arrays - np.intp_t N = coords1.shape[0] - np.intp_t M = coords2.shape[0] + intptr_t N = coords1.shape[0] + intptr_t M = coords2.shape[0] # Index and value of items in each array as we are iterating - np.intp_t i = 0, j = 0 + intptr_t i = 0, j = 0 COORDS_T a COORDS_T_2 b - np.intp_t u = 0 # Size of union + intptr_t u = 0 # Size of union # Iterate through both arrays simultaneously, advance index for the array # with the smaller value. Advance both if they are equal. Increment the @@ -136,7 +129,7 @@ def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[ out : numpy.ndarray Pre-allocated array to write distances to. """ - cdef np.intp_t N = ref_bounds.shape[0] - 1 + cdef intptr_t N = ref_bounds.shape[0] - 1 cdef BOUNDS_T begin, end cdef int i diff --git a/src/gambit/_cython/threads.pyx b/src/gambit/_cython/threads.pyx index dd3c49d..d663efa 100644 --- a/src/gambit/_cython/threads.pyx +++ b/src/gambit/_cython/threads.pyx @@ -1,9 +1,10 @@ """OpenMP stuff.""" from cython import parallel +import array -import numpy as np -cimport numpy as np +cimport cython +from cpython cimport array cimport openmp @@ -25,18 +26,35 @@ def omp_get_max_threads(): return openmp.omp_get_max_threads() -def get_thread_ids(int num_threads): - """Run a multithreaded loop and get the thread ID running in each iteration.""" +@cython.boundscheck(True) +def get_thread_ids(int n): + """Run a multithreaded loop and get the thread ID running in each iteration. + + Used to check that Cython code parallelization is working correctly. Result should contain + integers from 0 to ``num_threads``, repeated up to length ``n``. + + Parameters + ---------- + n: int + Size of loop. 
Make this at least as large as the expected number of threads. + + Returns + ------- + array.array + Array of size ``n`` containing the thread ID running in each loop iteration. + """ cdef: - np.ndarray[np.intp_t, ndim=1] thread_ids - np.intp_t thread_id = -1 + array.array thread_ids_arr = array.array('i') + int[:] thread_ids int i - thread_ids = np.full(num_threads, -1, dtype=np.intp) + for i in range(n): + thread_ids_arr.append(-1) + + thread_ids = thread_ids_arr - for i in parallel.prange(num_threads, nogil=True, schedule='static', chunksize=1): - thread_id = parallel.threadid() - thread_ids[i] = thread_id + for i in parallel.prange(n, nogil=True, schedule='static', chunksize=1): + thread_ids[i] = parallel.threadid() return thread_ids diff --git a/src/gambit/_cython/types.pxd b/src/gambit/_cython/types.pxd index 56992bb..43e9ad7 100644 --- a/src/gambit/_cython/types.pxd +++ b/src/gambit/_cython/types.pxd @@ -1,28 +1,29 @@ """Shared typedefs.""" -cimport numpy as np +from libc.stdint cimport int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, intptr_t # Type for similarity scores -ctypedef np.float32_t SCORE_T +ctypedef float SCORE_T # Type for bounds on c_jaccard_coords_col -ctypedef np.intp_t BOUNDS_T +# This should be equal to Numpy's intp dtype +ctypedef intptr_t BOUNDS_T # Fused type for storing k-mer coordinates/indices ctypedef fused COORDS_T: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + int16_t + uint16_t + int32_t + uint32_t + int64_t + uint64_t # Copy of COORDS_T, used when two arguments have types in this set but may be different than each other. 
ctypedef fused COORDS_T_2: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + int16_t + uint16_t + int32_t + uint32_t + int64_t + uint64_t diff --git a/src/gambit/metric.py b/src/gambit/metric.py index 2b0f292..b7027c3 100644 --- a/src/gambit/metric.py +++ b/src/gambit/metric.py @@ -5,13 +5,18 @@ import numpy as np -from gambit._cython.metric import BOUNDS_DTYPE, SCORE_DTYPE, jaccard, jaccarddist, \ - _jaccarddist_parallel -from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList +from gambit._cython.metric import jaccard, jaccarddist, _jaccarddist_parallel +from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList, \ + BOUNDS_DTYPE from gambit.util.misc import chunk_slices from gambit.util.progress import get_progress +#: Numpy dtype for output of Cython Jaccard distance calculation code +# Equivalent to SCORE_T in types.pxd +SCORE_DTYPE = np.dtype(np.float32) + + def jaccard_generic(set1: Iterable, set2: Iterable) -> float: """Get the Jaccard index of of two arbitrary sets. 
diff --git a/src/gambit/sigs/__init__.py b/src/gambit/sigs/__init__.py index 017470b..fd43484 100644 --- a/src/gambit/sigs/__init__.py +++ b/src/gambit/sigs/__init__.py @@ -1,4 +1,4 @@ """Calculate and store collections of k-mer signatures.""" from .base import KmerSignature, SignatureArray, SignatureList, sigarray_eq, SignaturesMeta,\ - AnnotatedSignatures, dump_signatures, load_signatures + AnnotatedSignatures, dump_signatures, load_signatures, BOUNDS_DTYPE diff --git a/src/gambit/sigs/base.py b/src/gambit/sigs/base.py index 5ca383c..0712edf 100644 --- a/src/gambit/sigs/base.py +++ b/src/gambit/sigs/base.py @@ -5,7 +5,6 @@ from attr import attrs, attrib from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.indexing import AdvancedIndexingMixin from gambit.util.io import FilePath @@ -15,6 +14,12 @@ # TODO - use nptyping package to specify dimensions and data type? +#: Preferred Numpy dtype for :attr:`.ConcatenatedSignatureArray.bounds`. Can be used in parallelized +#: Cython metric calculation code without conversion. +# Equivalent to BOUNDS_T in types.pxd +BOUNDS_DTYPE = np.dtype(np.intp) + + def sigarray_eq(a1: Sequence[KmerSignature], a2: Sequence[KmerSignature]) -> bool: """Check two sequences of sparse k-mer signatures for equality. 
diff --git a/src/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py index c095f40..b52f9da 100644 --- a/src/gambit/sigs/hdf5.py +++ b/src/gambit/sigs/hdf5.py @@ -7,9 +7,8 @@ import h5py as h5 from .base import SignatureArray, ConcatenatedSignatureArray, AbstractSignatureArray, SignaturesMeta,\ - ReferenceSignatures, SignaturesFileError + ReferenceSignatures, SignaturesFileError, BOUNDS_DTYPE from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.io import FilePath diff --git a/tests/test_metric.py b/tests/test_metric.py index 25b775d..1b16e1b 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -6,9 +6,9 @@ import numpy as np from gambit.metric import jaccard, jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \ - jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE + jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE from gambit.sigs.calc import sparse_to_dense -from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures +from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, BOUNDS_DTYPE from gambit.kmers import KmerSpec from gambit.util.progress import check_progress from .common import make_signatures From 954a30f42bc72bb8fb0baa9496a97fa34e855486 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 00:31:37 -0700 Subject: [PATCH 52/86] Cython metric code accept unsigned dtypes only --- src/gambit/_cython/metric.pyx | 52 +------------------ src/gambit/_cython/types.pxd | 8 +-- src/gambit/metric.py | 96 +++++++++++++++++++++++++++++++++-- 3 files changed, 95 insertions(+), 61 deletions(-) diff --git a/src/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx index 45f9074..ecb5001 100644 --- a/src/gambit/_cython/metric.pyx +++ b/src/gambit/_cython/metric.pyx @@ -4,60 +4,12 @@ from cython.parallel import prange, parallel def jaccard(COORDS_T[:] coords1, COORDS_T_2[:] coords2): - 
"""Compute the Jaccard index between two k-mer sets in sparse coordinate format. - - Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, - or 64-bit signed or unsigned integers, but do not need to match. - - This is by far the most efficient way to calculate the metric (this is a native function) and - should be used wherever possible. - - Parameters - ---------- - coords1 : numpy.ndarray - K-mer set in sparse coordinate format. - coords2 : numpy.ndarray - K-mer set in sparse coordinate format. - - Returns - ------- - numpy.float32 - Jaccard index between the two sets, a real number between 0 and 1. - - See Also - -------- - .jaccarddist - """ + """Compute the Jaccard index between two k-mer sets in sparse coordinate format.""" return 1 - c_jaccarddist(coords1, coords2) def jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2): - """Compute the Jaccard distance between two k-mer sets in sparse coordinate format. - - The Jaccard distance is equal to one minus the Jaccard index. - - Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, - or 64-bit signed or unsigned integers, but do not need to match. - - This is by far the most efficient way to calculate the metric (this is a native function) and - should be used wherever possible. - - Parameters - ---------- - coords1 : numpy.ndarray - K-mer set in sparse coordinate format. - coords2 : numpy.ndarray - K-mer set in sparse coordinate format. - - Returns - ------- - numpy.float32 - Jaccard distance between the two sets, a real number between 0 and 1. 
- - See Also - -------- - .jaccard - """ + """Compute the Jaccard distance between two k-mer sets in sparse coordinate format.""" return c_jaccarddist(coords1, coords2) diff --git a/src/gambit/_cython/types.pxd b/src/gambit/_cython/types.pxd index 43e9ad7..731a4d1 100644 --- a/src/gambit/_cython/types.pxd +++ b/src/gambit/_cython/types.pxd @@ -1,6 +1,6 @@ """Shared typedefs.""" -from libc.stdint cimport int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, intptr_t +from libc.stdint cimport uint16_t, uint32_t, uint64_t, intptr_t # Type for similarity scores @@ -12,18 +12,12 @@ ctypedef intptr_t BOUNDS_T # Fused type for storing k-mer coordinates/indices ctypedef fused COORDS_T: - int16_t uint16_t - int32_t uint32_t - int64_t uint64_t # Copy of COORDS_T, used when two arguments have types in this set but may be different than each other. ctypedef fused COORDS_T_2: - int16_t uint16_t - int32_t uint32_t - int64_t uint64_t diff --git a/src/gambit/metric.py b/src/gambit/metric.py index b7027c3..792cd81 100644 --- a/src/gambit/metric.py +++ b/src/gambit/metric.py @@ -5,7 +5,7 @@ import numpy as np -from gambit._cython.metric import jaccard, jaccarddist, _jaccarddist_parallel +import gambit._cython.metric as _cmetric from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList, \ BOUNDS_DTYPE from gambit.util.misc import chunk_slices @@ -17,6 +17,91 @@ SCORE_DTYPE = np.dtype(np.float32) +_COORDS_UNSIGNED_DTYPES = [np.dtype(f'u{s}') for s in [2, 4, 8]] +_COORDS_SIGNED_DTYPES = [np.dtype(f'i{s}') for s in [2, 4, 8]] + + +def _cast_sigs_array(arr: np.ndarray) -> np.ndarray: + """Convert signature array to proper data type for Cython metric code. + + Cython code accepts k-mer coordinate arrays in 16, 32, or 64-bit unsigned data types, these are + returned as-is. Equivalent signed data types can safely be casted (as the values should all be + non-negative), for these a view into the array with unsigned data type is returned (no copying).
+ All other data types result in a ValueError. + """ + + dt = arr.dtype + if dt in _COORDS_UNSIGNED_DTYPES: + return arr + if dt in _COORDS_SIGNED_DTYPES: + new_dt = np.dtype(f'u{dt.itemsize}') + return arr.view(new_dt) + raise ValueError(f'Invalid dtype for k-mer coordinate array: {dt.str}') + + +def jaccard(coords1: np.ndarray, coords2: np.ndarray) -> np.float32: + """Compute the Jaccard index between two k-mer sets in sparse coordinate format. + + Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, + or 64-bit signed or unsigned integers, but do not need to match. + + This is by far the most efficient way to calculate the metric (this is a native function) and + should be used wherever possible. + + Parameters + ---------- + coords1 + K-mer set in sparse coordinate format. + coords2 + K-mer set in sparse coordinate format. + + Returns + ------- + numpy.float32 + Jaccard index between the two sets, a real number between 0 and 1. + + See Also + -------- + .jaccarddist + """ + coords1 = _cast_sigs_array(coords1) + coords2 = _cast_sigs_array(coords2) + return _cmetric.jaccard(coords1, coords2) + + +def jaccarddist(coords1: np.ndarray, coords2: np.ndarray): + """Compute the Jaccard distance between two k-mer sets in sparse coordinate format. + + The Jaccard distance is equal to one minus the Jaccard index. + + Arguments are Numpy arrays containing k-mer indices in sorted order. Data types must be 16, 32, + or 64-bit signed or unsigned integers, but do not need to match. + + This is by far the most efficient way to calculate the metric (this is a native function) and + should be used wherever possible. + + Parameters + ---------- + coords1 + K-mer set in sparse coordinate format. + coords2 + K-mer set in sparse coordinate format. + + Returns + ------- + numpy.float32 + Jaccard distance between the two sets, a real number between 0 and 1. 
+ + See Also + -------- + .jaccard + """ + coords1 = _cast_sigs_array(coords1) + coords2 = _cast_sigs_array(coords2) + return _cmetric.jaccarddist(coords1, coords2) + + + def jaccard_generic(set1: Iterable, set2: Iterable) -> float: """Get the Jaccard index of of two arbitrary sets. @@ -84,6 +169,8 @@ def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: .jaccarddist .jaccarddist_matrix """ + query = _cast_sigs_array(query) + if out is None: out = np.empty(len(refs), SCORE_DTYPE) elif out.shape != (len(refs),): @@ -92,14 +179,15 @@ def jaccarddist_array(query: KmerSignature, refs: Sequence[KmerSignature], out: raise ValueError(f'Output array dtype must be {SCORE_DTYPE}, got {out.dtype}') if isinstance(refs, SignatureArray): - values = refs.values + values = _cast_sigs_array(refs.values) bounds = refs.bounds.astype(BOUNDS_DTYPE, copy=False) - _jaccarddist_parallel(query, values, bounds, out) + _cmetric._jaccarddist_parallel(query, values, bounds, out) else: for i, ref in enumerate(refs): - out[i] = jaccarddist(query, ref) + ref = _cast_sigs_array(ref) + out[i] = _cmetric.jaccarddist(query, ref) return out From 2e1e4293f91f276c09a71194f397070f27d0f4c0 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 01:01:28 -0700 Subject: [PATCH 53/86] Intersphinx mappings --- docs/source/conf.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index c9f53f9..11be91f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,6 +50,15 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] +# When debugging broken cross references using nitpick mode (-n option), ignore these errors. +# This mostly relates to external libraries that have not been linked to using intersphinx. 
+nitpick_ignore_regex = [ + ('py:.*', r'click\..*'), + ('py:.*', r'sqlalchemy\..*'), + ('py:.*', r'h5py\..*'), + ('py:.*', r'scipy\..*'), +] + # -- Options for HTML output ------------------------------------------------- @@ -77,4 +86,10 @@ autodoc_member_order = 'groupwise' autodoc_typehints = 'description' +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable/', None), + 'Bio': ('https://biopython.org/docs/latest/', None), +} + todo_include_todos = True From 23092ed02fa92f4009fd82260b10df1fb8b7b6a2 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 01:15:06 -0700 Subject: [PATCH 54/86] Fix cross-references in docs(trings) --- docs/source/api/metric.rst | 4 ---- docs/source/api/misc.rst | 4 ++++ docs/source/conf.py | 2 ++ src/gambit/classify.py | 4 ++-- src/gambit/db/models.py | 34 +++++++++++++++++----------------- src/gambit/db/refdb.py | 15 ++++++++------- src/gambit/metric.py | 8 ++++---- src/gambit/query.py | 2 +- src/gambit/results.py | 6 +++--- src/gambit/seq.py | 2 +- src/gambit/sigs/base.py | 2 +- src/gambit/sigs/calc.py | 8 ++------ src/gambit/util/indexing.py | 2 +- src/gambit/util/io.py | 3 +-- src/gambit/util/progress.py | 14 ++++++++++---- 15 files changed, 57 insertions(+), 53 deletions(-) diff --git a/docs/source/api/metric.rst b/docs/source/api/metric.rst index 9d418ac..c6e8a76 100644 --- a/docs/source/api/metric.rst +++ b/docs/source/api/metric.rst @@ -5,7 +5,3 @@ gambit.metric ------------- .. automodule:: gambit.metric - - .. autofunction:: gambit.metric.jaccard - - .. autofunction:: gambit.metric.jaccarddist diff --git a/docs/source/api/misc.rst b/docs/source/api/misc.rst index d89cabd..7e024d7 100644 --- a/docs/source/api/misc.rst +++ b/docs/source/api/misc.rst @@ -30,6 +30,10 @@ gambit.util.indexing -------------------- .. automodule:: gambit.util.indexing + :exclude-members: AdvancedIndexingMixin + + .. 
autoclass:: AdvancedIndexingMixin + :private-members: _check_index, _getitem_int, _getitem_slice, _getitem_int_array, _getitem_bool_array gambit.util.progress diff --git a/docs/source/conf.py b/docs/source/conf.py index 11be91f..52580f1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,6 +57,8 @@ ('py:.*', r'sqlalchemy\..*'), ('py:.*', r'h5py\..*'), ('py:.*', r'scipy\..*'), + # TypeVar + ('py:.*', r'(.*\.)?T\d?'), ] diff --git a/src/gambit/classify.py b/src/gambit/classify.py index bbdd8e4..95c94f6 100644 --- a/src/gambit/classify.py +++ b/src/gambit/classify.py @@ -40,7 +40,7 @@ def find_matches(itr: Iterable[tuple[AnnotatedGenome, float]]) -> dict[Taxon, li Returns ------- - Dict[Taxon, List[Int]] + dict[Taxon, list[int]] Mapping from taxa to indices of genomes matched to them. """ matches = dict() @@ -75,7 +75,7 @@ def consensus_taxon(taxa: Iterable[Taxon]) -> tuple[Optional[Taxon], set[Taxon]] Returns ------- - Tuple[Optional[Taxon], Set[Taxon]] + tuple[Optional[Taxon], set[Taxon]] Consensus taxon along with the set of any taxa in the argument which are descended from it. """ taxa = list(taxa) diff --git a/src/gambit/db/models.py b/src/gambit/db/models.py index fd0f245..9e3e720 100644 --- a/src/gambit/db/models.py +++ b/src/gambit/db/models.py @@ -63,7 +63,7 @@ class Genome(Base): String column (optional, unique). RefSeq accession number for this genome, if any. extra : Optional[dict] JSON column (optional). Additional arbitrary metadata. - annotations : Collection[.AnnotatedGenome] + annotations : Collection[AnnotatedGenome] One-to-many relationship to :class:`.AnnotatedGenome`. """ @@ -99,8 +99,8 @@ class ReferenceGenomeSet(Base): database which can be used for queries consists of a genome set plus a set of k-mer signatures for those genomes (stored separately). 
- Membership of :class:`.Genome`s in the set is determined by the presence of an associated - :class:`.AnnotatedGenomes` object, which also holds additional annotation data for the genome. + Membership of :class:`.Genome`\\ s in the set is determined by the presence of an associated + :class:`.AnnotatedGenome` object, which also holds additional annotation data for the genome. The genome set also includes a set of associated :class:`.Taxon` entries, which form a taxonomy tree under which all its genomes are categorized. @@ -125,13 +125,13 @@ class ReferenceGenomeSet(Base): Text column. Optional description. extra : Optional[dict] JSON column. Additional arbitrary data. - genomes : Collection[.AnnotatedGenome] + genomes : Collection[AnnotatedGenome] Many-to-many relationship with :class:`.AnnotatedGenome`, annotated versions of genomes in this set. - base_genomes : Collection[.Genome] + base_genomes : Collection[Genome] Unannotated :class:`Genome`\\ s in this set. Association proxy to the ``genome`` - relationship of members of :attr:`genome`. - taxa : Collection[.Taxon] + relationship of members of :attr:`genomes`. + taxa : Collection[Taxon] One-to-many relationship to :class:`.Taxon`. The taxa that form the classification system for this genome set. """ @@ -184,14 +184,14 @@ class AnnotatedGenome(Base): organism : str String column. Single string describing the organism. May be "Genus species [strain]" but could contain more specific information. Intended to be human-readable and shouldn't have - any semantic meaning for the application (in contrast to the :attr:`taxa` relationship). + any semantic meaning for the application (in contrast to the :attr:`taxon` relationship). taxon_id : int Integer column. ID of the :class:`Taxon` this genome is classified as. - genome : .Genome + genome : Genome Many-to-one relationship to :class:`.Genome`. - genome_set : .ReferenceGenomeSet + genome_set : ReferenceGenomeSet Many-to-one relationship to :class:`.ReferenceGenomeSet`. 
- taxon : .Taxon + taxon : Taxon Many-to-one relationship to :class:`.Taxon`. The primary taxon this genome is classified as under the associated ``ReferenceGenomeSet``. Should be the most specific and "regular" (ideally defined on NCBI) taxon this genome belongs to. @@ -259,7 +259,7 @@ class Taxon(Base): Float column (optional). Query genomes within this distance of one of the taxon's reference genomes will be classified as that taxon. If NULL the taxon is just used establish the tree structure and is not used directly in classification. - report : Bool + report : bool Boolean column. Whether to report this taxon directly as a match when producing a human-readable query result. Some custom taxa might need to be "hidden" from the user, in which case the value should be false. The application should then ascend the taxon's @@ -273,13 +273,13 @@ class Taxon(Base): ncbi_id : Optional[int] Integer column (optional). ID of the entry in the NCBI taxonomy database this taxon corresponds to, if any. - parent : Optional[.Taxon] + parent : Optional[Taxon] Many-to-one relationship with :class:`.Taxon`, the parent of this taxon (if any). - children : Collection[.Taxon] + children : Collection[Taxon] One-to-many relationship with :class:`.Taxon`, the children of this taxon. - genome_set : .ReferenceGenomeSet + genome_set : ReferenceGenomeSet Many-to-one relationship to :class:`.ReferenceGenomeSet`. - genomes : Collection[.AnnotatedGenome] + genomes : Collection[AnnotatedGenome] One-to-many relationship with :class:`.AnnotatedGenome`, genomes which are assigned to this taxon. """ @@ -421,7 +421,7 @@ def common_ancestors(cls, taxa: Iterable['Taxon']) -> list['Taxon']: Returns ------- - List[.Taxon] + list[Taxon] Common ancestors from top to bottom (same order as :meth:`lineage`. 
Will be empty if """ ancestors = None diff --git a/src/gambit/db/refdb.py b/src/gambit/db/refdb.py index 5a9dc9b..a7bbf45 100644 --- a/src/gambit/db/refdb.py +++ b/src/gambit/db/refdb.py @@ -85,7 +85,7 @@ def _map_ids_to_genomes(genomeset: ReferenceGenomeSet, id_attr: Union[str, Instr def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, strict: bool = True) -> list[Optional[AnnotatedGenome]]: - """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values. + """Match a ``ReferenceGenomeSet``'s genomes to a set of ID values. This is primarily used to match genomes to signatures based on the ID values stored in a signature file. It is expected that the signature file may contain signatures for more genomes @@ -97,7 +97,7 @@ def genomes_by_id(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Seque id_attr ID attribute of :class:`gambit.db.models.Genome` to use for lookup. Can be used as the attribute itself (e.g. ``Genome.refseq_acc``) or just the name (``'refsec_acc'``). - See :data:`.GENOME_IDS` for the set of allowed values. + See :attr:`~gambit.db.models.Genome.ID_ATTRS` for the set of allowed values. ids Sequence of ID values (strings or integers, matching type of attribute). strict @@ -127,7 +127,7 @@ def genomes_by_id_subset(genomeset: ReferenceGenomeSet, id_attr: GenomeAttr, ids: Sequence, ) -> tuple[list[AnnotatedGenome], list[int]]: - """Match a :class:`ReferenceGenomeSet`'s genomes to a set of ID values, allowing missing genomes. + """Match a ``ReferenceGenomeSet``'s genomes to a set of ID values, allowing missing genomes. This calls :func:`.genomes_by_id` with ``strict=False`` and filters any ``None`` values from the output. The filtered list is returned along with the indices of all values in ``ids`` which were @@ -143,7 +143,7 @@ def genomes_by_id_subset(genomeset: ReferenceGenomeSet, id_attr ID attribute of :class:`gambit.db.models.Genome` to use for lookup. Can be used as the attribute itself (e.g. 
``Genome.refseq_acc``) or just the name (``'refsec_acc'``). - See :data:`.GENOME_IDS` for the set of allowed values. + See :attr:`~gambit.db.models.Genome.ID_ATTRS` for the set of allowed values. ids Sequence of ID values (strings or integers, matching type of attribute). """ @@ -173,8 +173,8 @@ class ReferenceDatabase: signatures K-mer signatures for each genome. A subtype of ``ReferenceSignatures``, so contains metadata on signatures as well as the signatures themselves. Type may represent signatures stored on - disk (e.g. :class:`HDF5Signatures`) instead of in memory. OK to contain additional - signatures not corresponding to any genome in ``genomes``. + disk (e.g. :class:`~gambit.sigs.hdf5.HDF5Signatures`) instead of in memory. OK to contain + additional signatures not corresponding to any genome in ``genomes``. sig_indices Index of signature in ``signatures`` corresponding to each genome in ``genomes``. In sorted order to improve performance when iterating over them (improve locality if in @@ -224,6 +224,7 @@ def locate_files(cls, path: FilePath) -> tuple[Path, Path]: Returns ------- + tuple[pathlib.Path, pathlib.Path] Paths to genomes database file and signatures file. Raises @@ -269,7 +270,7 @@ def load_from_dir(cls, path: FilePath) -> 'ReferenceDatabase': Load complete database given directory containing SQLite genomes database file and HDF5 signatures file. - See :func:`.locate_db_files` for how these files are located within the directory. + See :meth:`.locate_files` for how these files are located within the directory. Raises ------ diff --git a/src/gambit/metric.py b/src/gambit/metric.py index 792cd81..e01eb0f 100644 --- a/src/gambit/metric.py +++ b/src/gambit/metric.py @@ -209,7 +209,7 @@ def jaccarddist_matrix(queries: Sequence[KmerSignature], Performance is greatly improved if ``refs`` is a type that yields instances of ``SignatureArray`` when indexed with a slice object (``SignatureArray`` or - ``HDF5Signatures``), see :meth:`.jaccarddist_array`. 
There is no such dependence on the type of + ``HDF5Signatures``), see :func:`.jaccarddist_array`. There is no such dependence on the type of ``queries``, which can be a simple list. Parameters @@ -230,7 +230,7 @@ def jaccarddist_matrix(queries: Sequence[KmerSignature], Returns ------- - np.ndarray + numpy.ndarray Matrix of distances between query signatures in rows and reference signatures in columns. See Also @@ -283,7 +283,7 @@ def jaccarddist_pairwise(sigs: Sequence[KmerSignature], """ Calculate all pairwise Jaccard distances for a list of signatures. - This should be roughly twice as fast as calling :func:`.jaccarddist_flat` with the same array + This should be roughly twice as fast as calling :func:`.jaccarddist_matrix` with the same array for the first and second arguments, because each pairwise distance is computed once instead of twice. @@ -308,7 +308,7 @@ def jaccarddist_pairwise(sigs: Sequence[KmerSignature], Returns ------- - np.ndarray + numpy.ndarray Pairwise distances in matrix (if ``flat=False``) or condensed (``flat=True``) format. See Also diff --git a/src/gambit/query.py b/src/gambit/query.py index 6e7ec73..a10dc4b 100644 --- a/src/gambit/query.py +++ b/src/gambit/query.py @@ -158,7 +158,7 @@ def query(db: ReferenceDatabase, ``QueryParams`` instance defining parameter values. If None take values from additional keyword arguments or use defaults. inputs - Description for each input, converted to :class:`gambit.query.result.QueryInput` in results + Description for each input, converted to :class:`.QueryInput` in results object. Only used for reporting, does not any other aspect of results. Items can be ``QueryInput``, ``SequenceFile`` or ``str``. 
progress diff --git a/src/gambit/results.py b/src/gambit/results.py index 762264d..3722ff8 100644 --- a/src/gambit/results.py +++ b/src/gambit/results.py @@ -80,7 +80,7 @@ class CSVResultsExporter(AbstractResultsExporter): Attributes ---------- format_opts - Dialect and other formatting arguments passed to :func:`csv.write`. + Dialect and other formatting arguments passed to :func:`csv.writer`. """ format_opts: dict[str, Any] @@ -174,8 +174,8 @@ class ResultsArchiveWriter(BaseJSONResultsExporter): """Exports query results to "archive" format which captures all stored data. This format is not intended to be read by users of the application. - The exported data can be read and converted back into an identical :class:`QueryResults` - object using :class:`.ResultsArchiveReader`. + The exported data can be read and converted back into an identical + :class:`~gambit.query.QueryResults` object using :class:`.ResultsArchiveReader`. Only the ID attributes of database models are saved, when loading the saved results the models are recreated by database queries. diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 1159e72..865929e 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -149,7 +149,7 @@ def parse(self, **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: Returns ------- gambit.util.io.ClosingIterator - Iterator yielding :class:`Bio.SeqIO.SeqRecord` instances for each sequence in the file. + Iterator yielding :class:`Bio.SeqRecord.SeqRecord` instances for each sequence in the file. """ fobj = self.open('rt', **kwargs) diff --git a/src/gambit/sigs/base.py b/src/gambit/sigs/base.py index 0712edf..70ce046 100644 --- a/src/gambit/sigs/base.py +++ b/src/gambit/sigs/base.py @@ -320,7 +320,7 @@ class SignaturesMeta: name Short human-readable name. id_attr - Name of ``Genome`` attribute the IDs correspond to (see :data:`gambit.db.models.GENOME_ID_ATTRS`). + Name of ``Genome`` attribute the IDs correspond to (see :attr:`~gambit.db.models.Genome.ID_ATTRS`). 
Optional, but signature set cannot be used as a reference for queries without it. description Human-readable description. diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index 67773bb..512052f 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -16,7 +16,7 @@ class KmerAccumulator(MutableSet[int]): """Base class for data structures which track k-mers as they are found in sequences. - Implements the ``MutableSet`` interface for k-mer indices. Indices are added via :meth:`add` or + Implements the ``MutableSet`` interface for k-mer indices. Indices are added via the ``add`` or :meth:`add_kmer` methods, when finished a sparse k-mer signature can be obtained from :meth:`signature`. """ @@ -184,9 +184,6 @@ def calc_file_signature(kspec: KmerSpec, ) -> KmerSignature: """Open a sequence file on disk and calculate its k-mer signature. - This works identically to :func:`.calc_signature_parse` but takes a :class:`.SequenceFile` as - input instead of a data stream. - Parameters ---------- kspec @@ -199,8 +196,7 @@ def calc_file_signature(kspec: KmerSpec, Returns ------- numpy.ndarray - K-mer signature in sparse coordinate format (dtype will match - :func:`gambit.kmers.dense_to_sparse`). + K-mer signature in sparse coordinate format (dtype will match :func:`.dense_to_sparse`). 
See Also -------- diff --git a/src/gambit/util/indexing.py b/src/gambit/util/indexing.py index 4be2a6a..f7cab69 100644 --- a/src/gambit/util/indexing.py +++ b/src/gambit/util/indexing.py @@ -16,7 +16,7 @@ class AdvancedIndexingMixin: * :meth:`_getitem_int_array` The following methods may optionally be overridden, but default to calling :meth:`_getitem_int_array`: - * :meth:`_getitem_range` + * :meth:`_getitem_slice` * :meth:`_getitem_bool_array` """ diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py index e591063..bd2f99c 100644 --- a/src/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -78,8 +78,7 @@ def open_compressed(compression: Optional[str], Parameters ---------- compression : str - Compression method. None is no compression. Keys of :data:`COMPRESSED_OPENERS` are the - allowed values. + Compression method. None is no compression. path Path of file to open. May be string or path-like object. mode : str diff --git a/src/gambit/util/progress.py b/src/gambit/util/progress.py index 31d5a8b..63de2e0 100644 --- a/src/gambit/util/progress.py +++ b/src/gambit/util/progress.py @@ -83,7 +83,7 @@ def create(cls, total Total number of iterations to completion. initial - Initial value of :attr:`n`. + Initial value of ``n``. desc Description to display to the user. file @@ -191,7 +191,7 @@ def get_progress(arg: ProgressArg, total: int, initial: int = 0, **kw) -> Abstra Accepts the following types/values for the argument: - :class:`.ProgressConfig` - - ``None`` - uses :class:`.NullProgressBar`. + - ``None`` - uses :class:`.NullProgressMeter`. - ``True`` - uses class returned by :func:`.default_progress_cls`. - ``False`` - same as ``None``. - ``str`` key - Looks up progress bar class/factory function in :data:`.REGISTRY`. @@ -237,7 +237,7 @@ def iter_progress(iterable: Iterable, Returns ------- - .ProgressIterator + ProgressIterator Iterator over values in ``iterable`` which advances a progress meter. 
""" if total is None: @@ -412,7 +412,13 @@ def create(cls, total: int, initial: int = 0, **kw): class TqdmProgressMeter(AbstractProgressMeter): """Wrapper around a progress meter from the ``tqdm`` library.""" - def __init__(self, pbar: 'tqdm.std.tqdm'): + def __init__(self, pbar): + """ + Parameters + ---------- + pbar + ``tqdm.std.tqdm`` instance to wrap. + """ self.pbar = pbar @property From 2d5675f045afbadd40b47a6559914ad5a6aae944 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 02:07:10 -0700 Subject: [PATCH 55/86] Type hinting fix --- src/gambit/util/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py index bd2f99c..b4b5dc5 100644 --- a/src/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -138,14 +138,14 @@ class ClosingIterator(Iterable[T]): method is called. """ - def __init__(self, iterable, fobj): + def __init__(self, iterable: Iterable[T], fobj): self.iterator = iter(iterable) self.fobj = fobj def __iter__(self): return self - def __next__(self): + def __next__(self) -> T: try: return next(self.iterator) From f1f718ac93f781c66f596a383cda6e30b5f52205 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 02:52:31 -0700 Subject: [PATCH 56/86] Type aliases --- docs/source/conf.py | 5 +++++ setup.cfg | 1 + src/gambit/cli/common.py | 2 +- src/gambit/cluster.py | 4 ++-- src/gambit/db/refdb.py | 8 ++++---- src/gambit/db/sqla.py | 2 +- src/gambit/kmers.py | 10 +++++----- src/gambit/results.py | 8 ++++---- src/gambit/seq.py | 22 ++++++++++++++-------- src/gambit/sigs/base.py | 6 +++--- src/gambit/sigs/calc.py | 4 ++-- src/gambit/sigs/hdf5.py | 4 ++-- src/gambit/util/io.py | 22 +++++++++++++++------- tests/cli/test_common.py | 2 +- tests/cli/test_query.py | 4 ++-- 15 files changed, 62 insertions(+), 42 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 52580f1..d3c5d3a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -88,6 +88,11 @@ 
autodoc_member_order = 'groupwise' autodoc_typehints = 'description' +autodoc_type_aliases = { + 'FilePath': 'FilePath', + 'DNASeq': 'DNASeq', +} + intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), 'numpy': ('https://numpy.org/doc/stable/', None), diff --git a/setup.cfg b/setup.cfg index 686de05..cccad4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,7 @@ install_requires = click>=7.0 h5py~=3.0 scipy~=1.7 + typing-extensions>=4.0 tests_require = pytest diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py index 496491a..f02dac5 100644 --- a/src/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -282,7 +282,7 @@ def strip_seq_file_ext(filename: str) -> str: return filename -def get_file_id(path: FilePath, strip_dir: bool = True, strip_ext: bool = True) -> str: +def get_file_id(path: 'FilePath', strip_dir: bool = True, strip_ext: bool = True) -> str: """Get sequence file ID derived from file path. Parameters diff --git a/src/gambit/cluster.py b/src/gambit/cluster.py index 71234fc..869e886 100644 --- a/src/gambit/cluster.py +++ b/src/gambit/cluster.py @@ -119,7 +119,7 @@ def check_clade(clade): assert root_i == nleaves * 2 - 2 -def dump_dmat_csv(file: Union[FilePath, TextIO], +def dump_dmat_csv(file: Union['FilePath', TextIO], dmat: np.ndarray, row_ids: Sequence, col_ids: Sequence, @@ -136,7 +136,7 @@ def dump_dmat_csv(file: Union[FilePath, TextIO], writer.writerow([str(row_id), *values_str]) -def load_dmat_csv(file: Union[FilePath, TextIO]) -> tuple[np.ndarray, list[str], list[str]]: +def load_dmat_csv(file: Union['FilePath', TextIO]) -> tuple[np.ndarray, list[str], list[str]]: """Load distance matrix from CSV file. 
Returns diff --git a/src/gambit/db/refdb.py b/src/gambit/db/refdb.py index a7bbf45..a5affc7 100644 --- a/src/gambit/db/refdb.py +++ b/src/gambit/db/refdb.py @@ -38,7 +38,7 @@ def __init__(self, msg, directory=None, genomes_file=None, signatures_file=None) self.signatures_file = signatures_file -def load_genomeset(db_file: FilePath) -> tuple[Session, ReferenceGenomeSet]: +def load_genomeset(db_file: 'FilePath') -> tuple[Session, ReferenceGenomeSet]: """Get the only :class:`gambit.db.models.ReferenceGenomeSet` from a genomes database file.""" session = file_sessionmaker(db_file)() gset = only_genomeset(session) @@ -211,7 +211,7 @@ def __init__(self, genomeset: ReferenceGenomeSet, signatures: ReferenceSignature raise ValueError(f'{missing} of {n} genomes not matched to signature IDs. Is the id_attr attribute of the signatures metadata correct?') @classmethod - def locate_files(cls, path: FilePath) -> tuple[Path, Path]: + def locate_files(cls, path: 'FilePath') -> tuple[Path, Path]: """Locate an SQLite genome database file and HDF5 signatures file in a directory. Files are located by extension, ``.gdb`` or ``.db`` for SQLite file and ``.gs`` or ``.h5`` @@ -258,14 +258,14 @@ def check_single_match(matches, desc: str): return genomes_file, signatures_file @classmethod - def load(cls, genomes_file: FilePath, signatures_file: FilePath) -> 'ReferenceDatabase': + def load(cls, genomes_file: 'FilePath', signatures_file: 'FilePath') -> 'ReferenceDatabase': """Load complete database given paths to SQLite genomes database file and HDF5 signatures file.""" session, gset = load_genomeset(genomes_file) sigs = load_signatures(signatures_file) return cls(gset, sigs) @classmethod - def load_from_dir(cls, path: FilePath) -> 'ReferenceDatabase': + def load_from_dir(cls, path: 'FilePath') -> 'ReferenceDatabase': """ Load complete database given directory containing SQLite genomes database file and HDF5 signatures file. 
diff --git a/src/gambit/db/sqla.py b/src/gambit/db/sqla.py index 2892903..18a9050 100644 --- a/src/gambit/db/sqla.py +++ b/src/gambit/db/sqla.py @@ -37,7 +37,7 @@ def process_result_value(self, value, dialect): return None if value is None else gjson.loads(value) -def file_sessionmaker(path: FilePath, readonly: bool = True, cls: type = None, **kw) -> sessionmaker: +def file_sessionmaker(path: 'FilePath', readonly: bool = True, cls: type = None, **kw) -> sessionmaker: """Get an SQLAlchemy ``sessionmaker`` for an sqlite database file. Parameters diff --git a/src/gambit/kmers.py b/src/gambit/kmers.py index 34ccb6c..9c5890c 100644 --- a/src/gambit/kmers.py +++ b/src/gambit/kmers.py @@ -30,7 +30,7 @@ def index_dtype(k: int) -> Optional[np.dtype]: return None -def kmer_to_index(kmer: DNASeq) -> int: +def kmer_to_index(kmer: 'DNASeq') -> int: """Convert a k-mer to its integer index. Raises @@ -41,7 +41,7 @@ def kmer_to_index(kmer: DNASeq) -> int: return ckmers.kmer_to_index(seq_to_bytes(kmer)) -def kmer_to_index_rc(kmer: DNASeq) -> int: +def kmer_to_index_rc(kmer: 'DNASeq') -> int: """Get the integer index of a k-mer's reverse complement. Raises @@ -84,7 +84,7 @@ class KmerSpec(Jsonable): nkmers: int = attrib(eq=False) index_dtype: np.dtype = attrib(eq=False) - def __init__(self, k: int, prefix: DNASeq): + def __init__(self, k: int, prefix: 'DNASeq'): """ Parameters ---------- @@ -143,7 +143,7 @@ class KmerMatch: If the match is on the reverse strand. """ kmerspec: KmerSpec = attrib() - seq: DNASeq = attrib() + seq: 'DNASeq' = attrib() pos: int = attrib() reverse: bool = attrib() @@ -178,7 +178,7 @@ def kmer_index(self) -> int: return kmer_to_index_rc(kmer) if self.reverse else kmer_to_index(kmer) -def find_kmers(kmerspec: KmerSpec, seq: DNASeq) -> Iterator[KmerMatch]: +def find_kmers(kmerspec: KmerSpec, seq: 'DNASeq') -> Iterator[KmerMatch]: """Locate k-mers with the given prefix in a DNA sequence. Searches sequence both backwards and forwards (reverse complement). 
The sequence may contain diff --git a/src/gambit/results.py b/src/gambit/results.py index 3722ff8..bd76a8e 100644 --- a/src/gambit/results.py +++ b/src/gambit/results.py @@ -22,7 +22,7 @@ class AbstractResultsExporter(ABC): """ @abstractmethod - def export(self, file_or_path: Union[FilePath, IO], results: QueryResults): + def export(self, file_or_path: Union['FilePath', IO], results: QueryResults): """Write query results to file. Parameters @@ -55,7 +55,7 @@ def to_json(self, obj): """Convert object to JSON-compatible format (need not work recursively).""" return gjson.to_json(obj) - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): + def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults): opts = dict(indent=4, sort_keys=True) if self.pretty else dict() with maybe_open(file_or_path, 'w') as f: json.dump(results, f, default=self.to_json, **opts) @@ -112,7 +112,7 @@ def get_row(self, item: QueryResultItem) -> list: """Get row values for single result item.""" return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS] - def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults): + def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults): with maybe_open(file_or_path, 'w') as f: writer = csv.writer(f, **self.format_opts) @@ -229,7 +229,7 @@ def _init_converter(self): self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome) self._converter.register_structure_hook(Taxon, self._structure_taxon) - def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults: + def read(self, file_or_path: Union['FilePath', IO]) -> QueryResults: """Read query results from JSON file. 
Parameters diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 865929e..5dbb975 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -3,13 +3,20 @@ Note that all code in this package operates on DNA sequences as sequences of bytes containing ascii-encoded nucleotide codes. + .. data:: NUCLEOTIDES ``bytes`` corresponding to the four DNA nucleotides. Ascii-encoded upper case letters ``ACGT``. Note that the order, while arbitrary, is important in this variable as it defines how unique indices are assigned to k-mer sequences. + +.. class:: DNASeq + + Type alias for DNA sequence types accepted for k-mer search / signature calculation + (``str``, ``bytes``, ``bytearray``, or :class:`Bio.Seq.Seq`). """ + from pathlib import Path from typing import Union, Optional, IO, Iterable from os import PathLike @@ -17,6 +24,7 @@ from Bio import SeqIO from Bio.Seq import Seq from attr import attrs, attrib +from typing_extensions import TypeAlias from gambit._cython.kmers import revcomp from gambit.util.io import FilePath @@ -29,14 +37,12 @@ SEQ_TYPES = (str, bytes, bytearray, Seq) -#: Union of DNA sequence types accepted for k-mer search / signature calculation. -DNASeq = Union[SEQ_TYPES] - -#: Sequence types accepted directly by native (Cython) code. -DNASeqBytes = Union[bytes, bytearray] +DNASeq: TypeAlias = Union[SEQ_TYPES] +# Type alias for sequence types accepted directly by native (Cython) code. +DNASeqBytes: TypeAlias = Union[bytes, bytearray] -def seq_to_bytes(seq: DNASeq) -> DNASeqBytes: +def seq_to_bytes(seq: 'DNASeq') -> 'DNASeqBytes': """Convert generic DNA sequence to byte string representation. This is for passing sequence data to Cython functions. @@ -52,7 +58,7 @@ def seq_to_bytes(seq: DNASeq) -> DNASeqBytes: raise TypeError(f'Expected sequence type, got {type(seq)}') -def validate_dna_seq_bytes(seq : bytes): +def validate_dna_seq_bytes(seq: DNASeqBytes): """Check that a sequence contains only valid nucleotide codes (upper case). 
Parameters @@ -171,7 +177,7 @@ def absolute(self) -> 'SequenceFile': @classmethod def from_paths(cls, - paths: Iterable[FilePath], + paths: Iterable['FilePath'], format: str, compression: Optional[str] = None, ) -> list['SequenceFile']: diff --git a/src/gambit/sigs/base.py b/src/gambit/sigs/base.py index 70ce046..c0cda54 100644 --- a/src/gambit/sigs/base.py +++ b/src/gambit/sigs/base.py @@ -409,7 +409,7 @@ class SignaturesFileError(Exception): filename: str format: str - def __init__(self, message: str, filename: Optional[FilePath], format: Optional[str]): + def __init__(self, message: str, filename: Optional['FilePath'], format: Optional[str]): self.message = message self.filename = str(filename) self.format = format @@ -418,7 +418,7 @@ def __str__(self): return self.message -def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray: +def load_signatures(path: 'FilePath', **kw) -> AbstractSignatureArray: """Load signatures from file. Currently the only format used to store signatures is the one in :mod:`gambit.sigs.hdf5`, but @@ -435,7 +435,7 @@ def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray: return load_signatures_hdf5(path, **kw) -def dump_signatures(path: FilePath, +def dump_signatures(path: 'FilePath', signatures: AbstractSignatureArray, format: str = 'hdf5', **kw, diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index 512052f..212b269 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -127,7 +127,7 @@ def default_accumulator(k: int) -> KmerAccumulator: return SetAccumulator(k) if k > 11 else ArrayAccumulator(k) -def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: DNASeq): +def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: 'DNASeq'): """Find k-mer matches in sequence and add their indices to an accumulator.""" for match in find_kmers(kmerspec, seq): try: @@ -138,7 +138,7 @@ def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: 
DNAS def calc_signature(kmerspec: KmerSpec, - seqs: Union[DNASeq, Iterable[DNASeq]], + seqs: Union['DNASeq', Iterable['DNASeq']], *, accumulator: Optional[KmerAccumulator] = None, ) -> KmerSignature: diff --git a/src/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py index b52f9da..b35a39a 100644 --- a/src/gambit/sigs/hdf5.py +++ b/src/gambit/sigs/hdf5.py @@ -218,7 +218,7 @@ def create(cls, return cls(group) -def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures: +def load_signatures_hdf5(path: 'FilePath', **kw) -> HDF5Signatures: """Open HDF5 signature file. Parameters @@ -254,7 +254,7 @@ def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures: raise -def dump_signatures_hdf5(path: FilePath, signatures: AbstractSignatureArray, **kw): +def dump_signatures_hdf5(path: 'FilePath', signatures: AbstractSignatureArray, **kw): """Write k-mer signatures and associated metadata to an HDF5 file. Parameters diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py index b4b5dc5..bdf43b0 100644 --- a/src/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -1,12 +1,20 @@ -"""Utility code for reading/writing data files.""" +"""Utility code for reading/writing data files. + + +.. class:: FilePath + + Alias for types which can represent a file system path (``str`` or :class:`os.PathLike`). 
+""" import os from io import TextIOWrapper from typing import Union, Optional, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar from contextlib import nullcontext -#: Alias for types which can represent a file system path -FilePath = Union[str, os.PathLike] +from typing_extensions import TypeAlias + + +FilePath: TypeAlias = Union[str, os.PathLike] T = TypeVar('T') @@ -69,7 +77,7 @@ def guess_compression(fobj: BinaryIO) -> Optional[str]: def open_compressed(compression: Optional[str], - path: FilePath, + path: 'FilePath', mode: str = 'rt', **kwargs, ) -> IO: @@ -172,7 +180,7 @@ def __exit__(self, *args): self.close() -def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> ContextManager[IO]: +def maybe_open(file_or_path: Union['FilePath', IO], mode: str = 'r', **open_kw) -> ContextManager[IO]: """Open a file given a file path as an argument, but pass existing file objects though. Intended to be used by API functions that take either type as an argument. If a file path is @@ -208,7 +216,7 @@ def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> return open(path, mode, **open_kw) -def read_lines(file_or_path: Union[FilePath, TextIO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]: +def read_lines(file_or_path: Union['FilePath', TextIO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]: """Iterate over lines in text file. Parameters @@ -232,7 +240,7 @@ def read_lines(file_or_path: Union[FilePath, TextIO], strip: bool=True, skip_emp yield line -def write_lines(lines: Iterable, file_or_path: Union[FilePath, TextIO]): +def write_lines(lines: Iterable, file_or_path: Union['FilePath', TextIO]): """Write strings to text file, one per line. 
Parameters diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py index c051454..cc8924b 100644 --- a/tests/cli/test_common.py +++ b/tests/cli/test_common.py @@ -99,7 +99,7 @@ def test_strip_seq_file_ext(): class TestGetSequenceFiles: """Test the get_sequence_files() function.""" - def check_ids(self, ids: Iterable[str], paths: Iterable[FilePath], strip_dir: bool, strip_ext: bool): + def check_ids(self, ids: Iterable[str], paths: Iterable['FilePath'], strip_dir: bool, strip_ext: bool): for id_, path in zip_strict(ids, paths): if strip_dir: expected = Path(path).name diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index 8b00dcd..a9a904c 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -21,9 +21,9 @@ def make_args(testdb: TestDB, *, positional_files: Optional[Iterable[SequenceFile]] = None, - list_file: Optional[FilePath] = None, + list_file: Optional['FilePath'] = None, sig_file: bool = False, - output: Optional[FilePath] = None, + output: Optional['FilePath'] = None, outfmt: Optional[str] = None, strict: bool=False, ) -> list[str]: From 24525ee1e7d36261a547c8a6472860ee13064a8c Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 03:03:51 -0700 Subject: [PATCH 57/86] Docstring formatting --- src/gambit/util/indexing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gambit/util/indexing.py b/src/gambit/util/indexing.py index f7cab69..36c9386 100644 --- a/src/gambit/util/indexing.py +++ b/src/gambit/util/indexing.py @@ -12,10 +12,12 @@ class AdvancedIndexingMixin: bounds checking, and converting negative indices. 
The following methods must be implemented by subtypes: + * :meth:`_getitem_int` * :meth:`_getitem_int_array` The following methods may optionally be overridden, but default to calling :meth:`_getitem_int_array`: + * :meth:`_getitem_slice` * :meth:`_getitem_bool_array` """ From 264b2903f3b9080c81afff27b9af4928909c96b7 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 3 Jul 2024 20:07:14 -0700 Subject: [PATCH 58/86] Typing updates/fixes in util.progress --- src/gambit/util/progress.py | 69 +++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/src/gambit/util/progress.py b/src/gambit/util/progress.py index 63de2e0..a3bf342 100644 --- a/src/gambit/util/progress.py +++ b/src/gambit/util/progress.py @@ -3,11 +3,14 @@ import sys from abc import ABC, abstractmethod -from typing import Optional, Union, Callable, Iterable, TextIO, Mapping, Any, cast, Iterator, ContextManager +from typing import Optional, Union, Callable, Iterable, TextIO, Mapping, Any, cast, Iterator, \ + ContextManager, TypeVar from warnings import warn from contextlib import contextmanager +T = TypeVar('T') + #: Type alias for a callable which takes ``total`` and keyword arguments and returns an AbstractProgressMeter ProgressFactoryFunc = Callable[[int], 'AbstractProgressMeter'] @@ -214,11 +217,40 @@ def get_progress(arg: ProgressArg, total: int, initial: int = 0, **kw) -> Abstra return config.create(total, initial=initial, **kw) -def iter_progress(iterable: Iterable, +class ProgressIterator(Iterator[T]): + itr: Iterator[T] + meter: AbstractProgressMeter + + def __init__(self, iterable: Iterable[T], meter: AbstractProgressMeter): + self.itr = iter(iterable) + self.meter = meter + self._first = True + + def __next__(self): + if not self._first: + self.meter.increment() + self._first = False + + try: + value = next(self.itr) + except StopIteration: + self.meter.close() # Close on reaching end + raise + + return value + + def __enter__(self): + return self + 
+ def __exit__(self, *args): + self.meter.close() + + +def iter_progress(iterable: Iterable[T], progress: ProgressArg = True, total: Optional[int] = None, **kw, - ) -> Iterable: + ) -> ProgressIterator[T]: """Display a progress meter while iterating over an object. The returned iterator object can also be used as a context manager to ensure that the progress @@ -247,35 +279,6 @@ def iter_progress(iterable: Iterable, return ProgressIterator(iterable, meter) -class ProgressIterator(Iterator): - itr: Iterator - meter: AbstractProgressMeter - - def __init__(self, iterable: Iterable, meter: AbstractProgressMeter): - self.itr = iter(iterable) - self.meter = meter - self._first = True - - def __next__(self): - if not self._first: - self.meter.increment() - self._first = False - - try: - value = next(self.itr) - except StopIteration: - self.meter.close() # Close on reaching end - raise - - return value - - def __enter__(self): - return self - - def __exit__(self, *args): - self.meter.close() - - def capture_progress(config: ProgressConfig) -> tuple[ProgressConfig, list[AbstractProgressMeter]]: """ Creates a ``ProgressConfig`` which captures references to the progress meter instances created @@ -308,7 +311,7 @@ def check_progress(*, total: Optional[int] = None, allow_decrement: bool = False, check_closed: bool = True, - ) -> ContextManager[ProgressConfig]: + ) -> Iterator[ProgressConfig]: """Context manager which checks a progress meter is advanced to completion. 
Returned context manager yields a ``ProgressConfig`` instance on enter, tests are run when From 38336e545d309fd9d9776b58f45b7ce5e9b06f2d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 03:28:07 -0700 Subject: [PATCH 59/86] Rework open_compressed() function --- src/gambit/seq.py | 3 ++- src/gambit/util/io.py | 57 ++++++++++++++++++------------------------- tests/util/test_io.py | 45 +++++++++------------------------- 3 files changed, 38 insertions(+), 67 deletions(-) diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 5dbb975..62bb0d3 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -135,7 +135,8 @@ def open(self, mode: str = 'r', **kwargs) -> IO: IO Stream to file in given mode. """ - return open_compressed(self.compression, self.path, mode, **kwargs) + compression = 'none' if self.compression is None else self.compression + return open_compressed(self.path, mode, compression, **kwargs) def parse(self, **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: """Open the file and lazily parse its contents. 
diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py index bdf43b0..d1e52a8 100644 --- a/src/gambit/util/io.py +++ b/src/gambit/util/io.py @@ -8,7 +8,7 @@ import os from io import TextIOWrapper -from typing import Union, Optional, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar +from typing import Union, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar from contextlib import nullcontext from typing_extensions import TypeAlias @@ -18,26 +18,8 @@ T = TypeVar('T') -COMPRESSED_OPENERS = {None: open} - -def _compressed_opener(compression): - """Decorator to register opener functions for compression types.""" - def decorator(func): - COMPRESSED_OPENERS[compression] = func - return func - return decorator - - -@_compressed_opener('gzip') -def _open_gzip(path, mode, **kwargs): - """Opener for gzip-compressed files.""" - import gzip - return gzip.open(path, mode=mode, **kwargs) - - -@_compressed_opener('auto') -def _open_auto(path, mode, **kwargs): +def _open_auto(path: FilePath, mode: str, **kwargs): """Open file for reading with compression determined automatically.""" if mode[0] != 'r': @@ -49,13 +31,13 @@ def _open_auto(path, mode, **kwargs): compression = guess_compression(file) file.seek(0) - if compression is None: + if compression == 'none': binary = file elif compression == 'gzip': import gzip binary = gzip.GzipFile(fileobj=file, mode='rb') else: - assert 0 + assert False, f'Unexpected compression type: {compression!r}' return TextIOWrapper(binary, **kwargs) if mode[1] == 't' else binary @@ -63,7 +45,7 @@ def _open_auto(path, mode, **kwargs): file.close() -def guess_compression(fobj: BinaryIO) -> Optional[str]: +def guess_compression(fobj: BinaryIO) -> str: """Guess the compression mode of an readable file-like object in binary mode. Assumes the current position is at the beginning of the file. 
@@ -73,25 +55,25 @@ def guess_compression(fobj: BinaryIO) -> Optional[str]: if magic == b'\x1f\x8b': return 'gzip' else: - return None + return 'none' -def open_compressed(compression: Optional[str], - path: 'FilePath', +def open_compressed(path: 'FilePath', mode: str = 'rt', + compression: str = 'auto', **kwargs, ) -> IO: """Open a file with compression method specified by a string. Parameters ---------- - compression : str - Compression method. None is no compression. path Path of file to open. May be string or path-like object. mode : str Mode to open file in - similar to :func:`open`. Must be exactly two characters, the first in ``rwax`` and the second in``tb``. + compression : str + Compression method. Allowed values are ``'none'``, ``'gzip'``, or ``'auto'``. \\**kwargs Additional text-specific keyword arguments identical to the following :func:`open` arguments: ``encoding``, ``errors``, and ``newlines``. @@ -101,19 +83,28 @@ def open_compressed(compression: Optional[str], IO Open file object. 
""" + + # Check mode if not(len(mode) == 2 and mode[0] in 'rwax' and mode[1] in 'tb'): msg = f'Invalid mode {mode!r}' if mode in 'rwax': msg += ' (must specify either binary or text mode)' raise ValueError(msg) - try: - opener = COMPRESSED_OPENERS[compression] + path = os.fsdecode(path) - except KeyError: - raise ValueError(f'Unknown compression type {compression!r}') from None + if compression == 'none': + return open(path, mode, **kwargs) + + elif compression == 'gzip': + import gzip + return gzip.open(path, mode, **kwargs) - return opener(os.fsdecode(path), mode=mode, **kwargs) + elif compression == 'auto': + return _open_auto(path, mode, **kwargs) + + else: + raise ValueError(f'Unknown compression type {compression!r}') from None class ClosingIterator(Iterable[T]): diff --git a/tests/util/test_io.py b/tests/util/test_io.py index ba0819f..234dc64 100644 --- a/tests/util/test_io.py +++ b/tests/util/test_io.py @@ -17,29 +17,30 @@ def text_data(self): random = np.random.RandomState() return random.randint(32, 128, size=1000, dtype='b').tobytes() - @pytest.fixture(scope='class', params=[None, 'gzip']) + @pytest.fixture(scope='class', params=['none', 'gzip']) def compression(self, request): """Compression method string.""" return request.param @pytest.fixture() - def text_file(self, text_data, compression, tmpdir): + def text_file(self, text_data: bytes, compression: str, tmp_path: Path): """Path to file with text_data written to it using open_compressed.""" - file = tmpdir.join('chars.txt').strpath + file = tmp_path / 'chars.txt' - with ioutil.open_compressed(compression, file, 'wb') as fobj: + with ioutil.open_compressed(file, 'wb', compression) as fobj: fobj.write(text_data) return file @pytest.mark.parametrize('binary', [True, False]) - def test_read(self, binary, text_data, text_file, compression, tmpdir): + @pytest.mark.parametrize('auto', [True, False]) + def test_read(self, binary: bool, auto: bool, text_data: bytes, text_file: Path, compression: str): 
"""Test we can read the file in both binary and text mode.""" mode = 'rb' if binary else 'rt' - with ioutil.open_compressed(compression, text_file, mode) as fobj: + with ioutil.open_compressed(text_file, mode, 'auto' if auto else compression) as fobj: contents = fobj.read() if binary: @@ -52,47 +53,25 @@ def test_read(self, binary, text_data, text_file, compression, tmpdir): @pytest.mark.parametrize('write_mode', ['w', 'a', 'x']) @pytest.mark.parametrize('binary', [True, False]) - def test_write(self, write_mode, binary, text_data, compression, tmpdir): + def test_write(self, write_mode: str, binary: bool, text_data: bytes, compression: str, tmp_path: Path): """ Test writing data using the w, a, and x modes. - TODO - these are all identical when the file doesn't exist, test behavior when it does + TODO - these are all identical when the file doesn't exist, test behavior when it does. """ - file = tmpdir.join('chars.txt') + file = tmp_path / 'chars.txt' mode = write_mode + ('b' if binary else 't') to_write = text_data if binary else text_data.decode('ascii') - with ioutil.open_compressed(compression, file.strpath, mode) as fobj: + with ioutil.open_compressed(file, mode, compression) as fobj: fobj.write(to_write) - with ioutil.open_compressed(compression, file.strpath, 'rb') as f: + with ioutil.open_compressed(file, 'rb', compression) as f: contents = f.read() assert contents == text_data - def test_invalid_mode(self, compression): - for mode in ['r', 'w', 'a', 't', 'b', 'abc', '']: - with pytest.raises(ValueError): - ioutil.open_compressed(compression, 'foo.txt', mode=mode) - - @pytest.mark.parametrize('binary', [True, False]) - def test_read_auto(self, binary, text_data, text_file): - """Test automatic determination of compression method.""" - - mode = 'rb' if binary else 'rt' - - with ioutil.open_compressed('auto', text_file, mode) as fobj: - contents = fobj.read() - - if binary: - assert isinstance(contents, bytes) - assert contents == text_data - - else: - 
assert isinstance(contents, str) - assert contents == text_data.decode('ascii') - class TestClosingIterator: """Test the ClosingIterator class.""" From 719e0a973bff60461c9cb322e96a76af1a73d064 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 03:44:56 -0700 Subject: [PATCH 60/86] Move query result comparison functions to tests/ code --- src/gambit/classify.py | 28 ----------------------- src/gambit/query.py | 22 +----------------- tests/results.py | 51 ++++++++++++++++++++++++++++++++++++++++-- tests/test_query.py | 5 +++-- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/src/gambit/classify.py b/src/gambit/classify.py index 95c94f6..e8e7498 100644 --- a/src/gambit/classify.py +++ b/src/gambit/classify.py @@ -163,23 +163,6 @@ def next_taxon(self) -> Optional[Taxon]: return lo -def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]) -> bool: - """Compare two ``GenomeMatch`` instances for equality. - - The values for the ``distance`` attribute are only checked for approximate equality, to support - instances where one was loaded from a results archive (saving and loading a float in JSON is - lossy). - - Also allows one or both values to be None. - """ - if match1 is None or match2 is None: - return match1 is None and match2 is None - - return match1.genome == match2.genome and \ - match1.matched_taxon == match2.matched_taxon and \ - np.isclose(match1.distance, match2.distance) - - @attrs() class ClassifierResult: """Result of applying the classifier to a single query genome. 
@@ -218,17 +201,6 @@ def _next_taxon_default(self): return self.closest_match.next_taxon() -def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool: - """Compare two ``ClassifierResult`` instances for equality.""" - return result1.success == result2.success and \ - result1.predicted_taxon == result2.predicted_taxon and \ - compare_genome_matches(result1.primary_match, result2.primary_match) and \ - compare_genome_matches(result1.closest_match, result2.closest_match) and \ - result1.next_taxon == result2.next_taxon and \ - set(result1.warnings) == set(result2.warnings) and \ - result1.error == result2.error - - def classify(ref_genomes: Sequence[AnnotatedGenome], dists: np.ndarray, *, diff --git a/src/gambit/query.py b/src/gambit/query.py index a10dc4b..9634a5a 100644 --- a/src/gambit/query.py +++ b/src/gambit/query.py @@ -8,8 +8,7 @@ import numpy as np from gambit import __version__ as GAMBIT_VERSION -from gambit.classify import classify, ClassifierResult, GenomeMatch, compare_classifier_results, \ - compare_genome_matches +from gambit.classify import classify, ClassifierResult, GenomeMatch from gambit.db import ReferenceDatabase, Taxon, ReferenceGenomeSet, reportable_taxon from gambit.seq import SequenceFile from gambit.sigs import KmerSignature, SignaturesMeta @@ -89,25 +88,6 @@ class QueryResultItem: closest_genomes: list[GenomeMatch] = attrib(factory=list) -def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: - """Compare two ``QueryResultItem`` instances for equality. - - Does not compare the value of the ``input`` attributes. 
- """ - if item1.report_taxon != item2.report_taxon: - return False - if not compare_classifier_results(item1.classifier_result, item2.classifier_result): - return False - if len(item1.closest_genomes) != len(item2.closest_genomes): - return False - - for m1, m2 in zip(item1.closest_genomes, item2.closest_genomes): - if not compare_genome_matches(m1, m2): - return False - - return True - - @attrs(repr=False) class QueryResults: """Results for a set of queries, as well as information on database and parameters used. diff --git a/tests/results.py b/tests/results.py index bbc1248..2a3326e 100644 --- a/tests/results.py +++ b/tests/results.py @@ -8,12 +8,59 @@ import numpy as np from gambit.util.json import to_json -from gambit.query import QueryResults -from gambit.classify import GenomeMatch +from gambit.query import QueryResults, QueryResultItem +from gambit.classify import GenomeMatch, ClassifierResult from gambit.util.misc import zip_strict from gambit.db.models import AnnotatedGenome, Taxon +def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]) -> bool: + """Compare two ``GenomeMatch`` instances for equality. + + The values for the ``distance`` attribute are only checked for approximate equality, to support + instances where one was loaded from a results archive (saving and loading a float in JSON is + lossy). + + Also allows one or both values to be None. 
+ """ + if match1 is None or match2 is None: + return match1 is None and match2 is None + + return match1.genome == match2.genome and \ + match1.matched_taxon == match2.matched_taxon and \ + np.isclose(match1.distance, match2.distance) + + +def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool: + """Compare two ``ClassifierResult`` instances for equality.""" + return result1.success == result2.success and \ + result1.predicted_taxon == result2.predicted_taxon and \ + compare_genome_matches(result1.primary_match, result2.primary_match) and \ + compare_genome_matches(result1.closest_match, result2.closest_match) and \ + result1.next_taxon == result2.next_taxon and \ + set(result1.warnings) == set(result2.warnings) and \ + result1.error == result2.error + + +def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: + """Compare two ``QueryResultItem`` instances for equality. + + Does not compare the value of the ``input`` attributes. 
+ """ + if item1.report_taxon != item2.report_taxon: + return False + if not compare_classifier_results(item1.classifier_result, item2.classifier_result): + return False + if len(item1.closest_genomes) != len(item2.closest_genomes): + return False + + for m1, m2 in zip(item1.closest_genomes, item2.closest_genomes): + if not compare_genome_matches(m1, m2): + return False + + return True + + def cmp_json_attrs(data: dict[str, Any], obj, attrnames: Iterable[str]): for attr in attrnames: assert data[attr] == getattr(obj, attr) diff --git a/tests/test_query.py b/tests/test_query.py index 325ac0f..38d579f 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -2,12 +2,13 @@ import pytest -from gambit.query import QueryInput, query_parse, compare_result_items +from gambit.query import QueryInput, query_parse from gambit.seq import SequenceFile from gambit.util.misc import zip_strict from gambit import __version__ as GAMBIT_VERSION from .testdb import TestDB +from .results import compare_result_items class TestQueryInput: @@ -41,4 +42,4 @@ def test_query_python(testdb: TestDB, strict: bool): for file, item, ref_item in zip_strict(query_files, results.items, ref_results.items): assert item.input.file == file - compare_result_items(item, ref_item) + assert compare_result_items(item, ref_item) From 45a93f308ee80065cffe59f0064a6219381c2956 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 04:04:00 -0700 Subject: [PATCH 61/86] Fix nondeterministic warning message in query results --- src/gambit/classify.py | 2 +- tests/data/testdb_210818/results/strict.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gambit/classify.py b/src/gambit/classify.py index e8e7498..4e91bd6 100644 --- a/src/gambit/classify.py +++ b/src/gambit/classify.py @@ -285,7 +285,7 @@ def classify(ref_genomes: Sequence[AnnotatedGenome], # Warn of inconsistent matches if others: msg = f'Query matched {len(others)} inconsistent taxa: ' - msg += ', 
'.join(other.short_repr() for other in others) + msg += ', '.join(sorted(other.short_repr() for other in others)) msg += '. Reporting lowest common ancestor of this set.' result.warnings.append(msg) diff --git a/tests/data/testdb_210818/results/strict.json b/tests/data/testdb_210818/results/strict.json index af37d71..23487dd 100644 --- a/tests/data/testdb_210818/results/strict.json +++ b/tests/data/testdb_210818/results/strict.json @@ -6339,7 +6339,7 @@ }, "success": true, "warnings": [ - "Query matched 2 inconsistent taxa: 8:A2_B2, 7:A2_B1. Reporting lowest common ancestor of this set." + "Query matched 2 inconsistent taxa: 7:A2_B1, 8:A2_B2. Reporting lowest common ancestor of this set." ] }, "closest_genomes": [ From 007866bd34d2bd961f1a04df0434c131f43f56fb Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 04:42:52 -0700 Subject: [PATCH 62/86] Fix up results export test funcs --- tests/results.py | 121 +++++++++++++++++++++++++------------------- tests/test_query.py | 2 +- 2 files changed, 69 insertions(+), 54 deletions(-) diff --git a/tests/results.py b/tests/results.py index 2a3326e..6541147 100644 --- a/tests/results.py +++ b/tests/results.py @@ -14,8 +14,8 @@ from gambit.db.models import AnnotatedGenome, Taxon -def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]) -> bool: - """Compare two ``GenomeMatch`` instances for equality. +def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]): + """Assert two ``GenomeMatch`` instances are equal. The values for the ``distance`` attribute are only checked for approximate equality, to support instances where one was loaded from a results archive (saving and loading a float in JSON is @@ -24,77 +24,88 @@ def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[Genom Also allows one or both values to be None. 
""" if match1 is None or match2 is None: - return match1 is None and match2 is None + assert match1 is None and match2 is None + return - return match1.genome == match2.genome and \ - match1.matched_taxon == match2.matched_taxon and \ - np.isclose(match1.distance, match2.distance) + assert match1.genome == match2.genome + assert match1.matched_taxon == match2.matched_taxon + assert np.isclose(match1.distance, match2.distance) def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool: - """Compare two ``ClassifierResult`` instances for equality.""" - return result1.success == result2.success and \ - result1.predicted_taxon == result2.predicted_taxon and \ - compare_genome_matches(result1.primary_match, result2.primary_match) and \ - compare_genome_matches(result1.closest_match, result2.closest_match) and \ - result1.next_taxon == result2.next_taxon and \ - set(result1.warnings) == set(result2.warnings) and \ - result1.error == result2.error + """Assert two ``ClassifierResult`` instances are equal.""" + assert result1.success == result2.success + assert result1.predicted_taxon == result2.predicted_taxon + compare_genome_matches(result1.primary_match, result2.primary_match) + compare_genome_matches(result1.closest_match, result2.closest_match) + assert result1.next_taxon == result2.next_taxon + assert set(result1.warnings) == set(result2.warnings) + assert result1.error == result2.error def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: - """Compare two ``QueryResultItem`` instances for equality. + """Assert two ``QueryResultItem`` instances are equal. Does not compare the value of the ``input`` attributes. 
""" - if item1.report_taxon != item2.report_taxon: - return False - if not compare_classifier_results(item1.classifier_result, item2.classifier_result): - return False - if len(item1.closest_genomes) != len(item2.closest_genomes): - return False + assert item1.report_taxon == item2.report_taxon + compare_classifier_results(item1.classifier_result, item2.classifier_result) + assert len(item1.closest_genomes) == len(item2.closest_genomes) for m1, m2 in zip(item1.closest_genomes, item2.closest_genomes): - if not compare_genome_matches(m1, m2): - return False - - return True + compare_genome_matches(m1, m2) def cmp_json_attrs(data: dict[str, Any], obj, attrnames: Iterable[str]): + """Assert JSON data values equals object attribute values for the given keys/names.""" + for attr in attrnames: assert data[attr] == getattr(obj, attr) -def cmp_taxon_json(taxon_data: dict[str, Any], taxon: Optional[Taxon]): + +def cmp_taxon_json(data: dict[str, Any], taxon: Optional[Taxon]): + """Assert Taxon instance matches data in JSON export.""" + if taxon is None: - assert taxon_data is None + assert data is None + else: - assert taxon_data is not None - cmp_json_attrs(taxon_data, taxon, ['id', 'key', 'name', 'ncbi_id', 'rank', 'distance_threshold']) + assert data is not None + cmp_json_attrs(data, taxon, ['id', 'key', 'name', 'ncbi_id', 'rank']) + if taxon.distance_threshold is None: + assert data['distance_threshold'] is None + else: + assert data['distance_threshold'] is not None + assert np.isclose(data['distance_threshold'], taxon.distance_threshold) + + +def cmp_annnotatedgenome_json(data: dict[str, Any], genome: AnnotatedGenome): + """Assert AnnotatedGenome instance matches data in JSON export.""" -def cmp_annnotatedgenome_json(genome_data: dict[str, Any], genome: AnnotatedGenome): - assert genome_data['id'] == genome.genome_id + assert data['id'] == genome.genome_id cmp_json_attrs( - genome_data, + data, genome, ['key', 'description', 'organism', 'ncbi_db', 'ncbi_id', 
'genbank_acc', 'refseq_acc'], ) - for taxon_data, taxon in zip_strict(genome_data['taxonomy'], genome.taxon.ancestors(True)): + for taxon_data, taxon in zip_strict(data['taxonomy'], genome.taxon.ancestors(True)): cmp_taxon_json(taxon_data, taxon) -def cmp_genomematch_json(match_data, match: GenomeMatch): - assert np.isclose(match_data['distance'], match.distance) - cmp_annnotatedgenome_json(match_data['genome'], match.genome) - assert (match_data['matched_taxon'] is None) == (match.matched_taxon is None) - if match.matched_taxon is not None: - cmp_taxon_json(match_data['matched_taxon'], match.matched_taxon) +def cmp_genomematch_json(data, match: GenomeMatch): + """Assert GenomeMatch instance matches data in JSON export.""" + + assert np.isclose(data['distance'], match.distance) + cmp_annnotatedgenome_json(data['genome'], match.genome) + + cmp_taxon_json(data['matched_taxon'], match.matched_taxon) + def check_json_results(file: TextIO, results: QueryResults, strict: bool = False, ): - """Check exported JSON data matches the given results object. + """Assert exported JSON data matches the given results object. 
Parameters ---------- @@ -115,16 +126,17 @@ def check_json_results(file: TextIO, data = json.load(file) assert len(data['items']) == len(results.items) - # assert data['params'] == to_json(results.params) cmp_json_attrs(data['genomeset'], results.genomeset, ['id', 'key', 'version', 'name', 'description']) assert data['signaturesmeta'] == to_json(results.signaturesmeta) - # assert data['gambit_version'] == results.gambit_version - assert data['extra'] == results.extra if strict: assert data['timestamp'] == to_json(results.timestamp) + assert data['gambit_version'] == results.gambit_version + assert data['extra'] == results.extra for item, item_data in zip(results.items, data['items']): + + # Compare data['query'] <-> item.input query = item_data['query'] assert query['name'] == item.input.label @@ -135,41 +147,44 @@ def check_json_results(file: TextIO, else: assert query['format'] == item.input.file.format + # Check path matches exactly if strict mode, otherwise just file name if strict: assert query['path'] == str(item.input.file.path) else: assert Path(query['path']).name == item.input.file.path.name - # Predicted taxon - predicted_data = item_data['predicted_taxon'] - cmp_taxon_json(predicted_data, item.report_taxon) - if item.report_taxon is not None: - assert np.isclose(predicted_data['distance_threshold'], item.report_taxon.distance_threshold) - - # Next taxon + # Predicted/next taxon + cmp_taxon_json(item_data['predicted_taxon'], item.report_taxon) cmp_taxon_json(item_data['next_taxon'], item.classifier_result.next_taxon) # Closest genomes + assert len(item_data['closest_genomes']) == len(item.closest_genomes) for match, match_data in zip_strict(item.closest_genomes, item_data['closest_genomes']): cmp_genomematch_json(match_data, match) -def cmp_csv_taxon(row, taxon: Optional[Taxon], prefix: str): +def cmp_csv_taxon(row: dict[str, str], taxon: Optional[Taxon], prefix: str): if taxon is None: assert row[prefix + '.name'] == '' assert row[prefix + '.rank'] == 
'' assert row[prefix + '.ncbi_id'] == '' assert row[prefix + '.threshold'] == '' + else: assert row[prefix + '.name'] == taxon.name assert row[prefix + '.rank'] == taxon.rank assert row[prefix + '.ncbi_id'] == str(taxon.ncbi_id or '') - assert np.isclose(float(row[prefix + '.threshold']), taxon.distance_threshold) + + dt = row[prefix + '.threshold'] + if taxon.distance_threshold is None: + assert dt == '' + else: + assert np.isclose(float(dt), taxon.distance_threshold) def check_csv_results(file: TextIO, results: QueryResults, strict: bool = False): - """Check exported CSV data matches the given results object. + """Assert exported CSV data matches the given results object. Parameters ---------- diff --git a/tests/test_query.py b/tests/test_query.py index 38d579f..6c84589 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -42,4 +42,4 @@ def test_query_python(testdb: TestDB, strict: bool): for file, item, ref_item in zip_strict(query_files, results.items, ref_results.items): assert item.input.file == file - assert compare_result_items(item, ref_item) + compare_result_items(item, ref_item) From 892c795d290beeef7e0c192c9ffb281f0b1ceb43 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 05:08:56 -0700 Subject: [PATCH 63/86] WIP CLI query archive output format --- src/gambit/cli/query.py | 4 ++-- tests/cli/test_query.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gambit/cli/query.py b/src/gambit/cli/query.py index ce92c98..f1ad6ff 100644 --- a/src/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -20,9 +20,9 @@ def get_exporter(outfmt: str): return JSONResultsExporter() if outfmt == 'archive': - return ResultsArchiveWriter(install_info=True) + return ResultsArchiveWriter() - assert 0 + raise ValueError(f'Invalid output format: {outfmt!r}') @cli.command(name='query', no_args_is_help=True) diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index a9a904c..d7eaad5 100644 --- a/tests/cli/test_query.py 
+++ b/tests/cli/test_query.py @@ -69,9 +69,14 @@ def check_results(results_file: Path, out_fmt: str, ref_results: QueryResults): if out_fmt == 'json': with open(results_file) as fh: check_json_results(fh, ref_results, strict=False) + elif out_fmt == 'csv': with open(results_file) as fh: check_csv_results(fh, ref_results, strict=False) + + elif out_fmt == 'archive': + assert results_file.is_file() # TODO + else: raise ValueError(f'Invalid out_fmt {out_fmt!r}') @@ -85,6 +90,7 @@ def check_results(results_file: Path, out_fmt: str, ref_results: QueryResults): (20, False, 'csv', True, False), (None, False, 'json', False, True), (20, True, 'json', False, False), + (20, False, 'archive', False, False), ], ) def test_full_query(testdb: TestDB, From e80a295dab77c451317701df29722644f1a87caa Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 21:30:23 -0700 Subject: [PATCH 64/86] Update tests for Python API query funcs --- tests/test_query.py | 50 +++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/tests/test_query.py b/tests/test_query.py index 6c84589..bc86414 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -2,7 +2,7 @@ import pytest -from gambit.query import QueryInput, query_parse +from gambit.query import QueryInput, QueryResults, query, query_parse from gambit.seq import SequenceFile from gambit.util.misc import zip_strict from gambit import __version__ as GAMBIT_VERSION @@ -27,19 +27,43 @@ def test_convert(self): @pytest.mark.parametrize('strict', [False, True]) -def test_query_python(testdb: TestDB, strict: bool): +class TestQuery: """Run a full query using the Python API.""" - ref_results = testdb.get_query_results(strict) - params = ref_results.params - query_files = [item['file'] for item in testdb.query_genomes] - results = query_parse(testdb.refdb, query_files, params) + def check_results(self, results: QueryResults, ref_results: QueryResults): - assert results.params == params - 
assert results.genomeset == ref_results.genomeset - assert results.signaturesmeta == testdb.ref_signatures.meta - assert results.gambit_version == GAMBIT_VERSION + assert results.params == ref_results.params + assert results.genomeset == ref_results.genomeset + assert results.signaturesmeta == ref_results.signaturesmeta + assert results.gambit_version == GAMBIT_VERSION - for file, item, ref_item in zip_strict(query_files, results.items, ref_results.items): - assert item.input.file == file - compare_result_items(item, ref_item) + for item, ref_item in zip_strict(results.items, ref_results.items): + compare_result_items(item, ref_item) + + def test_query(self, testdb: TestDB, strict: bool): + """Test the query() function.""" + + ref_results = testdb.get_query_results(strict) + params = ref_results.params + query_sigs = testdb.query_signatures + + results = query(testdb.refdb, query_sigs, params) + self.check_results(results, ref_results) + + for sigid, item in zip_strict(query_sigs.ids, results.items): + assert item.input.file is None + # assert item.input.label == sigid + + def test_query_parse(self, testdb: TestDB, strict: bool): + """Test the query_parse() function.""" + + ref_results = testdb.get_query_results(strict) + params = ref_results.params + query_files = testdb.get_query_files() + + results = query_parse(testdb.refdb, query_files, params) + self.check_results(results, ref_results) + + for file, item in zip_strict(query_files, results.items): + assert item.input.file == file + assert item.input.label == str(file.path) From fb77403ddfeb1cad87090fa12f0aaa263cbadbd6 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 20:57:00 -0700 Subject: [PATCH 65/86] Minor typing and import fixes --- src/gambit/util/progress.py | 2 +- tests/common.py | 2 +- tests/testdb.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gambit/util/progress.py b/src/gambit/util/progress.py index a3bf342..59a75ff 100644 --- 
a/src/gambit/util/progress.py +++ b/src/gambit/util/progress.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from typing import Optional, Union, Callable, Iterable, TextIO, Mapping, Any, cast, Iterator, \ - ContextManager, TypeVar + TypeVar from warnings import warn from contextlib import contextmanager diff --git a/tests/common.py b/tests/common.py index f504717..9bb5623 100644 --- a/tests/common.py +++ b/tests/common.py @@ -208,7 +208,7 @@ def make_kmer_seqs(kspec: KmerSpec, return seqs, dense_to_sparse(vec) -def make_lineage(thresholds: Sequence[float]) -> list[Taxon]: +def make_lineage(thresholds: Sequence[Optional[float]]) -> list[Taxon]: """Create a linage of taxa that have the given distance thresholds. Parameters diff --git a/tests/testdb.py b/tests/testdb.py index 8439f48..068452e 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -17,6 +17,7 @@ from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset from gambit.results import ResultsArchiveReader from gambit.query import QueryResults +from gambit.util.io import FilePath T = TypeVar('T') @@ -100,7 +101,7 @@ class TestDB: # Prevent pytest interpreting as containing test methods __test__ = False - def __init__(self, root): + def __init__(self, root: FilePath): root = Path(root) self.paths = TestDBPaths( root=root, From abec7bac6ded5b4f6d53ee0c59c66779a705629d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 13 Aug 2024 21:23:21 -0700 Subject: [PATCH 66/86] Documentation on testdb_210818 files --- tests/data/testdb_210818/Readme.md | 37 +++++++++++++++++++++++------- tests/testdb.py | 8 +++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/tests/data/testdb_210818/Readme.md b/tests/data/testdb_210818/Readme.md index f7fb1eb..4ebf707 100644 --- a/tests/data/testdb_210818/Readme.md +++ b/tests/data/testdb_210818/Readme.md @@ -9,12 +9,33 @@ repository. 
To use this database from the CLI, just pass this directory with the ## Files -* `ref-genomes.gdb` - reference genomes metadata. -* `ref-signatures.gs` - reference genome signatures. +* `ref-genomes.gdb`: reference genomes metadata. +* `ref-signatures.gs`: reference genome signatures. +* `ref-genomes.csv`: CSV file of basic reference genome properties (sort of redundant with `ref-genomes.gdb`). +* `ref-genomes/`: contains reference genome files in FASTA format. * `queries/` - * `queries.csv` - table listing all query files and expected results. - * `genomes/` - contains query genome files in FASTA format. - * `query-signatures.gs` - precalculated signatures for query genomes. -* `results/` - pre-calculated results using query files in `queries`. -* `generate-results.py` - script which generates result files in `results/`. - Verifies against expected result attributes in `queries.csv`. + * `queries.csv`: table listing all query files and expected results. + * `genomes/`: contains query genome files in FASTA format. + * `query-signatures.gs`: precalculated signatures for query genomes. +* `results/`: pre-calculated results using query files in `queries`, exported in the "archive" JSON + format. Two sets of results, one with strict mode enabled and one without. These are used to + reconstitute the `gambit.query.QueryResults` instances using `gambit.results.ResultsArchiveReader`. +* `generate-results.py`: script which generates result files in `results/`. This will need to be + re-run if the query results object changes structure or if the "archive" JSON format changes. + Results are verified against contents of `queries.csv` before exporting. + + +### Query genome properties + +`queries.csv` contains information on expected results for each query genome. This should stay +constant even if the exported files change format in future releases. + +Contains the following columns: + +- `name`: File name. 
+- `predicted`: Name of predicted taxon in strict mode, or empty if no prediction. +- `primary`: Description of primary genome match in strict mode, or empty if no prediction. +- `closest`: Description of closest genome match. +- `warnings`: Whether warnings should be generated in strict mode. + +In non-strict mode, the primary match will be set to the closest match. diff --git a/tests/testdb.py b/tests/testdb.py index 068452e..f7a48ed 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -61,13 +61,21 @@ def lazy(f: Callable[[Any], T]) -> LazyAttribute[T]: @dataclass class TestDBPaths: root: Path + # Reference genomes .gdb file ref_genomes: Path + # Reference genomes .gs file ref_signatures: Path + # Reference genomes .csv refs_table: Path + # Directory containing reference genome FASTA files ref_genomes_dir: Path + # queries.csv queries_table: Path + # Directory containing query genome FASTA files query_genomes_dir: Path + # Query genomes .gs file query_signatures: Path + # Directory containing QueryResults exports in archive format. 
results: Path From b097e8f5137a5167a8091ea2f2a89539f51d85fd Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 13 Aug 2024 21:58:34 -0700 Subject: [PATCH 67/86] Updates to TestDB class --- tests/testdb.py | 86 ++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/tests/testdb.py b/tests/testdb.py index f7a48ed..b0b92fe 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -85,16 +85,12 @@ class TestQueryGenome(TypedDict): primary: str closest: str warnings: bool - file: SequenceFile - file_gz: SequenceFile class TestRefGenome(TypedDict): name: str key: str taxon: str - file: SequenceFile - file_gz: SequenceFile class TestDB: @@ -162,18 +158,6 @@ def refdb(self) -> ReferenceDatabase: gset = only_genomeset(session) return ReferenceDatabase(gset, self.ref_signatures) - @classmethod - def _add_file_cols(cls, genomes_dir, row): - row['file'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta'), - format='fasta', - ) - row['file_gz'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta.gz'), - format='fasta', - compression='gzip', - ) - @lazy def query_genomes(self) -> list[TestQueryGenome]: """Query genomes and their expected results.""" @@ -184,7 +168,6 @@ def query_genomes(self) -> list[TestQueryGenome]: for row in rows: # Convert "warnings" column to bool row['warnings'] = row['warnings'].lower() == 'true' - self._add_file_cols(self.paths.query_genomes_dir, row) return rows # type: ignore @@ -195,42 +178,57 @@ def ref_genomes(self) -> list[TestRefGenome]: with open(self.paths.refs_table, newline='') as f: rows = list(DictReader(f)) - for row in rows: - self._add_file_cols(self.paths.ref_genomes_dir, row) - return rows # type: ignore @classmethod - def _ensure_gz(cls, items): - """Ensure gzipped versions of the query/ref files are available. + def _ensure_gz(cls, file: Path, file_gz: Path): + """Ensure gzipped version of the query/ref file is available. 
These aren't added to version control, so they are created the first time they are needed. """ - for item in items: - dst = item['file_gz'].path - if dst.is_file(): - continue + if file_gz.is_file(): + return - with open(item['file'].path) as f: - content = f.read() + with open(file) as f: + content = f.read() - with gzip.open(dst, 'wt') as f: - f.write(content) + with gzip.open(file_gz, 'wt') as f: + f.write(content) - @classmethod - def _get_genome_files(cls, items, gzipped): - if gzipped: - col = 'file_gz' - cls._ensure_gz(items) - else: - col = 'file' - return [q[col] for q in items] - - def get_query_files(self, gzipped: bool=False) -> list[SequenceFile]: - return self._get_genome_files(self.query_genomes, gzipped) - - def get_ref_files(self, gzipped: bool=False) -> list[SequenceFile]: - return self._get_genome_files(self.ref_genomes, gzipped) + def _get_genome_files(self, base: Path, names: list[str], gzipped: bool, relative: bool) -> list[SequenceFile]: + base2 = base.relative_to(self.paths.root) if relative else base + + files = [] + + for name in names: + fname = name + '.fasta' + + if gzipped: + fname_gz = fname + '.gz' + self._ensure_gz(base / fname, base / fname_gz) + path = base2 / fname_gz + else: + path = base2 / fname + + files.append(SequenceFile(path, 'fasta', 'gzip' if gzipped else None)) + + return files + + def get_query_files(self, gzipped: bool = False, relative: bool = False) -> list[SequenceFile]: + return self._get_genome_files( + self.paths.query_genomes_dir, + [genome['name'] for genome in self.query_genomes], + gzipped=gzipped, + relative=relative, + ) + + def get_ref_files(self, gzipped: bool = False, relative: bool = False) -> list[SequenceFile]: + return self._get_genome_files( + self.paths.ref_genomes_dir, + [genome['name'] for genome in self.ref_genomes], + gzipped=gzipped, + relative=relative, + ) def get_query_results(self, strict: bool, session=None) -> QueryResults: """Pre-calculated query results.""" From 
90fe8db587bbdaa733e1f220d51cc47ee318b21d Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 13 Aug 2024 21:39:27 -0700 Subject: [PATCH 68/86] Update testdb results generation script to use TestDB class --- tests/data/testdb_210818/generate-results.py | 46 +++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index 920df44..e9c8633 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -9,47 +9,39 @@ import sys from pathlib import Path -from csv import DictReader from gambit.seq import SequenceFile -from gambit.db import ReferenceDatabase, reportable_taxon -from gambit.query import QueryParams, query_parse +from gambit.db import reportable_taxon +from gambit.query import QueryParams, QueryResults, query_parse from gambit.results import ResultsArchiveWriter from gambit.util.misc import zip_strict +THISDIR = Path(__file__).parent +ROOTDIR = THISDIR.parent.parent.parent + +sys.path.insert(0, str(ROOTDIR)) +from tests.testdb import TestDB, TestQueryGenome + + PARAMS = { 'non_strict': QueryParams(classify_strict=False, report_closest=10), 'strict': QueryParams(classify_strict=True, report_closest=10), } -def load_query_data(): - with open('queries/queries.csv', newline='') as f: - rows = list(DictReader(f)) - - genomes_dir = Path('queries/genomes') +def check_results(queries: list[TestQueryGenome], query_files: list[SequenceFile], results: QueryResults): + """Check query results object against queries.csv table before exporting.""" - for row in rows: - row['warnings'] = row['warnings'].lower() == 'true' - row['file'] = SequenceFile( - path=genomes_dir / (row['name'] + '.fasta'), - format='fasta', - ) - - return rows - - -def check_results(queries, results): strict = results.params.classify_strict - for query, item in zip_strict(queries, results.items): + for query, query_file, item in 
zip_strict(queries, query_files, results.items): warnings = [] clsresult = item.classifier_result predicted = clsresult.predicted_taxon - assert item.input.file == query['file'] + assert item.input.file == query_file # No errors assert clsresult.success @@ -84,7 +76,6 @@ def check_results(queries, results): # Closest matches assert len(item.closest_genomes) == results.params.report_closest assert item.closest_genomes[0] == clsresult.closest_match - assert item.closest_genomes[0].genome.description == query['closest'] for i in range(1, results.params.report_closest): assert item.closest_genomes[i].distance >= item.closest_genomes[i-1].distance @@ -111,19 +102,22 @@ def check_results(queries, results): def main(): - queries = load_query_data() - query_files = [query['file'] for query in queries] - db = ReferenceDatabase.load_from_dir('.') + testdb = TestDB(THISDIR) + db = testdb.refdb + query_files = testdb.get_query_files(relative=True) writer = ResultsArchiveWriter(pretty=True) for label, params in PARAMS.items(): + print('Running query:', label) results = query_parse(db, query_files, params) - check_results(queries, results) + check_results(testdb.query_genomes, query_files, results) with open(f'results/{label}.json', 'wt') as f: writer.export(f, results) + print('done!\n\n') + if __name__ == '__main__': main() From 96b5edd777d4a5a8abfc4b4c6b8a1e6906f81d8b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 13 Aug 2024 22:05:53 -0700 Subject: [PATCH 69/86] More results testing funcs --- tests/data/testdb_210818/generate-results.py | 41 +---------- tests/results.py | 73 ++++++++++++++++++-- tests/test_query.py | 8 ++- 3 files changed, 76 insertions(+), 46 deletions(-) diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index e9c8633..1c4bd22 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -11,7 +11,6 @@ from pathlib import Path from gambit.seq 
import SequenceFile -from gambit.db import reportable_taxon from gambit.query import QueryParams, QueryResults, query_parse from gambit.results import ResultsArchiveWriter from gambit.util.misc import zip_strict @@ -22,6 +21,7 @@ sys.path.insert(0, str(ROOTDIR)) from tests.testdb import TestDB, TestQueryGenome +from tests.results import check_results as check_results_base PARAMS = { @@ -36,17 +36,12 @@ def check_results(queries: list[TestQueryGenome], query_files: list[SequenceFile strict = results.params.classify_strict for query, query_file, item in zip_strict(queries, query_files, results.items): - warnings = [] clsresult = item.classifier_result predicted = clsresult.predicted_taxon assert item.input.file == query_file - # No errors - assert clsresult.success - assert clsresult.error is None - # Check if warnings expected (only if in strict mode) assert bool(clsresult.warnings) == (strict and query['warnings']) @@ -62,44 +57,11 @@ def check_results(queries: list[TestQueryGenome], query_files: list[SequenceFile assert predicted.name == query['predicted'] assert clsresult.primary_match.genome.description == query['primary'] - else: - assert clsresult.primary_match == clsresult.closest_match - assert predicted is clsresult.primary_match.matched_taxon - - assert item.report_taxon is reportable_taxon(predicted) - else: assert predicted is None assert clsresult.primary_match is None assert item.report_taxon is None - # Closest matches - assert len(item.closest_genomes) == results.params.report_closest - assert item.closest_genomes[0] == clsresult.closest_match - - for i in range(1, results.params.report_closest): - assert item.closest_genomes[i].distance >= item.closest_genomes[i-1].distance - - # Next taxon - nt = clsresult.next_taxon - if nt is None: - # Predicted should be most specific possible - assert clsresult.closest_match.matched_taxon == clsresult.closest_match.genome.taxon - - else: - assert nt.distance_threshold is not None - assert nt.distance_threshold 
< clsresult.closest_match.distance - - # This should hold true as long as the primary match is the closest match, just warn if - # it fails. - if predicted is not None: - if predicted not in nt.ancestors(): - warnings.append(f'Next taxon {nt.name} not a descendant of predicted taxon {predicted.name}') - - # Display warnings - for w in warnings: - print(f'[Query "{query["name"]}"]:', w, file=sys.stderr) - def main(): testdb = TestDB(THISDIR) @@ -111,6 +73,7 @@ def main(): for label, params in PARAMS.items(): print('Running query:', label) results = query_parse(db, query_files, params) + check_results_base(results) check_results(testdb.query_genomes, query_files, results) with open(f'results/{label}.json', 'wt') as f: diff --git a/tests/results.py b/tests/results.py index 6541147..11491ba 100644 --- a/tests/results.py +++ b/tests/results.py @@ -1,17 +1,80 @@ -"""Funcs for testing exported data.""" +"""Helper code for tests related to the QueryResults class or exported result data.""" import csv import json from typing import TextIO, Any, Iterable, Optional from pathlib import Path +from warnings import warn import numpy as np from gambit.util.json import to_json -from gambit.query import QueryResults, QueryResultItem +from gambit.query import QueryResults, QueryResultItem, QueryParams from gambit.classify import GenomeMatch, ClassifierResult from gambit.util.misc import zip_strict -from gambit.db.models import AnnotatedGenome, Taxon +from gambit.db.models import AnnotatedGenome, Taxon, reportable_taxon + + +def check_results(results: QueryResults, warnings: bool = True): + """Check invariants on query results object.""" + + assert results.params is not None + + for item in results.items: + check_result_item(item, results.params, warnings=warnings) + + +def check_result_item(item: QueryResultItem, params: QueryParams, warnings: bool = True): + """Check invariants on successful query result item.""" + + clsresult = item.classifier_result + predicted = 
clsresult.predicted_taxon + + # No errors + assert clsresult.success + assert clsresult.error is None + + # Predicted taxon + if predicted is not None: + assert clsresult.primary_match is not None + + if not params.classify_strict: + assert clsresult.primary_match == clsresult.closest_match + assert predicted is clsresult.primary_match.matched_taxon + + assert item.report_taxon is reportable_taxon(predicted) + + else: + assert clsresult.primary_match is None + assert item.report_taxon is None + + # Closest matches + assert len(item.closest_genomes) == params.report_closest + assert item.closest_genomes[0] == clsresult.closest_match + + # Check closest_genomes is sorted by distance + for i in range(1, params.report_closest): + assert item.closest_genomes[i].distance >= item.closest_genomes[i-1].distance + + # Next taxon + nt = clsresult.next_taxon + if nt is None: + # Predicted should be most specific possible + assert clsresult.closest_match.matched_taxon == clsresult.closest_match.genome.taxon + + else: + assert nt.distance_threshold is not None + assert nt.distance_threshold < clsresult.closest_match.distance + + # This should hold true as long as the primary match is the closest match, just warn if + # it fails. 
+ if predicted is not None: + if predicted not in nt.ancestors(): + if warnings: + warn( + f'[Query {item.input.label}]: ' + f'next taxon {nt.name} not a descendant of predicted taxon {predicted.name}' + ) def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]): @@ -32,7 +95,7 @@ def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[Genom assert np.isclose(match1.distance, match2.distance) -def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool: +def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult): """Assert two ``ClassifierResult`` instances are equal.""" assert result1.success == result2.success assert result1.predicted_taxon == result2.predicted_taxon @@ -43,7 +106,7 @@ def compare_classifier_results(result1: ClassifierResult, result2: ClassifierRes assert result1.error == result2.error -def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool: +def compare_result_items(item1: QueryResultItem, item2: QueryResultItem): """Assert two ``QueryResultItem`` instances are equal. Does not compare the value of the ``input`` attributes. 
diff --git a/tests/test_query.py b/tests/test_query.py index bc86414..c2baea2 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -8,7 +8,7 @@ from gambit import __version__ as GAMBIT_VERSION from .testdb import TestDB -from .results import compare_result_items +from .results import compare_result_items, check_results class TestQueryInput: @@ -32,10 +32,14 @@ class TestQuery: def check_results(self, results: QueryResults, ref_results: QueryResults): + # Check general invariants of QueryResults object + check_results(results, warnings=False) # One of the queries is designed to generate a warning + assert results.gambit_version == GAMBIT_VERSION + + # Check matches reference results assert results.params == ref_results.params assert results.genomeset == ref_results.genomeset assert results.signaturesmeta == ref_results.signaturesmeta - assert results.gambit_version == GAMBIT_VERSION for item, ref_item in zip_strict(results.items, ref_results.items): compare_result_items(item, ref_item) From 719312425ec6563765a717266c8c984bc4dd48cd Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Tue, 13 Aug 2024 22:08:37 -0700 Subject: [PATCH 70/86] Fix docstring --- src/gambit/classify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gambit/classify.py b/src/gambit/classify.py index 4e91bd6..d3d609a 100644 --- a/src/gambit/classify.py +++ b/src/gambit/classify.py @@ -132,7 +132,7 @@ class GenomeMatch: Reference genome matched to. distance Distance between query and reference genome. - matching_taxon + matched_taxon Taxon prediction based off of this match alone. Will always be ``genome.taxon`` or one of its ancestors. 
""" From ecb9751fe81ec3ccbf330ec0302dc0e984812a2b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Wed, 5 Oct 2022 20:14:32 -0600 Subject: [PATCH 71/86] parse_seqs() function --- src/gambit/seq.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_seq.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 62bb0d3..741d8b7 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -76,6 +76,51 @@ def validate_dna_seq_bytes(seq: DNASeqBytes): raise ValueError(f'Invalid byte at position {i}: {nuc}') +def parse_seqs(path: FilePath, + format: str = 'fasta', + compression: str = 'auto', + **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: + """Open a sequence file and lazily parse its contents. + + This is essentially a wrapper over BioPython's :func:`Bio.SeqIO.parse` function that + transparently handles compressed files. + + Returns iterator over sequence data in file. File is parsed lazily, and so must be kept open. + The returned iterator is of type :class:`gambit.util.io.ClosingIterator` so it will close the + file stream automatically when it finishes. It may also be used as a context manager that closes + the stream on exit. You may also close the stream explicitly using the iterator's ``close`` + method. + + Parameters + ---------- + path + Path to the file. + format + String describing the file format as interpreted by :func:`Bio.SeqIO.parse`. + compression + String describing compression method of the file, e.g. ``'gzip'``. None means no + compression. Default is to determine compression automatically (can only detect gzip or + none). See :func:`gambit.util.io.open_compressed`. + kwargs + Keyword arguments to :func:`gambit.util.io.open_compressed`. + + Returns + ------- + gambit.util.io.ClosingIterator + Iterator yielding :class:`Bio.SeqIO.SeqRecord` instances for each sequence in the file. 
+ """ + + fobj = open_compressed(path, 'rt', compression, **kwargs) + + try: + records = SeqIO.parse(fobj, format) + return ClosingIterator(records, fobj) + + except: + fobj.close() + raise + + @attrs(frozen=True, slots=True) class SequenceFile(PathLike): """A reference to a DNA sequence file stored in the file system. diff --git a/tests/test_seq.py b/tests/test_seq.py index 4896eb5..7ed4543 100644 --- a/tests/test_seq.py +++ b/tests/test_seq.py @@ -8,9 +8,11 @@ import numpy as np from Bio import Seq, SeqIO -from gambit.seq import SequenceFile, revcomp +from gambit.seq import SequenceFile, revcomp, parse_seqs from gambit.kmers import nkmers, index_to_kmer from gambit.util.misc import zip_strict +from gambit.util.io import open_compressed + from .common import random_seq @@ -247,3 +249,44 @@ def test_from_paths(self, format, compression): assert str(seqfile.path) == path assert seqfile.format == format assert seqfile.compression == compression + + +@pytest.fixture(scope='module') +def seqrecords(): + """Random SeqRecord instances.""" + + records = [] + + np.random.seed(0) + + for i in range(20): + seq = Seq.Seq(random_seq(1000).decode('ascii')) + id_ = f'seq{i + 1}' + descr = f'{id_} Test sequence {i + 1}' + records.append(SeqIO.SeqRecord(seq, id=id_, description=descr)) + + return records + + +@pytest.mark.parametrize('compression', ['none', 'gzip']) +@pytest.mark.parametrize('auto', [False, True]) +def test_parse_seqs(tmp_path: Path, seqrecords: list[SeqIO.SeqRecord], compression: str, auto: bool): + """Test the parse_seqs() function.""" + + # Write FASTA file + file = tmp_path / ('test.fa' + ('.gz' if compression == 'gzip' else '')) + with open_compressed(file, 'wt', compression) as fh: + SeqIO.write(seqrecords, fh, 'fasta') + + # Parse + with parse_seqs(file, 'fasta', compression='auto' if auto else compression) as parsed: + records2 = list(parsed) + + # Check ClosingIterator closes the underlying file object when last record is read + assert parsed.fobj.closed 
+ + # Check parsed records are correct + for record, record2 in zip_strict(seqrecords, records2): + assert record2.seq == record.seq + assert record2.id == record.id + assert record2.description == record.description From 5050e0bdc920c7508e5270527a992b2a9e98271b Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 15:34:00 -0700 Subject: [PATCH 72/86] WIP remove QueryInput class --- src/gambit/cli/query.py | 7 ++- src/gambit/query.py | 107 ++++++++++++++++++---------------------- tests/test_query.py | 26 ++-------- 3 files changed, 56 insertions(+), 84 deletions(-) diff --git a/src/gambit/cli/query.py b/src/gambit/cli/query.py index f1ad6ff..e3f04f3 100644 --- a/src/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -5,7 +5,7 @@ from . import common from .root import cli -from gambit.query import QueryParams, QueryInput, query, query_parse +from gambit.query import QueryParams, query, query_parse from gambit.util.progress import progress_config from gambit.sigs import load_signatures from gambit.results import CSVResultsExporter, JSONResultsExporter, ResultsArchiveWriter @@ -79,15 +79,14 @@ def query_cmd(ctx: click.Context, if sigfile: sigs = load_signatures(sigfile) - inputs = [QueryInput(id) for id in sigs.ids] - results = query(db, sigs, params, inputs=inputs, progress=pconf) + results = query(db, sigs, params, progress=pconf) else: ids, files = common.get_sequence_files(files_arg, listfile, ldir) common.warn_duplicate_file_ids(ids, 'Warning: the following query file IDs are present more than once: {ids}') results = query_parse( db, files, params, - file_labels=ids, + labels=ids, progress=pconf, parse_kw=dict(max_workers=cores), ) diff --git a/src/gambit/query.py b/src/gambit/query.py index 9634a5a..784197e 100644 --- a/src/gambit/query.py +++ b/src/gambit/query.py @@ -2,19 +2,21 @@ from warnings import warn from datetime import datetime -from typing import Sequence, Optional, Union, Any +from typing import Sequence, Optional, Any +from pathlib 
import Path from attr import attrs, attrib +from attr.converters import optional as optional_converter import numpy as np from gambit import __version__ as GAMBIT_VERSION from gambit.classify import classify, ClassifierResult, GenomeMatch from gambit.db import ReferenceDatabase, Taxon, ReferenceGenomeSet, reportable_taxon from gambit.seq import SequenceFile -from gambit.sigs import KmerSignature, SignaturesMeta +from gambit.sigs.base import KmerSignature, SignaturesMeta, ReferenceSignatures from gambit.metric import jaccarddist_matrix -from gambit.util.misc import zip_strict from gambit.util.progress import progress_config, iter_progress +from gambit.util.misc import zip_strict @attrs() @@ -36,44 +38,14 @@ class QueryParams: report_closest: int = attrib(default=10) -@attrs() -class QueryInput: - """Information on a query genome. - - Attributes - ---------- - label - Some unique label for the input, probably the file name. - file - Source file (optional). - """ - label: str = attrib() - file: Optional[SequenceFile] = attrib(default=None, repr=False) - - @classmethod - def convert(cls, x: Union['QueryInput', SequenceFile, str]) -> 'QueryInput': - """Convenience function to convert flexible argument types into QueryInput. - - Accepts single string label, ``SequenceFile`` (uses file path for label), or existing - ``QueryInput`` instance (returned unchanged). - """ - if isinstance(x, QueryInput): - return x - if isinstance(x, str): - return QueryInput(x) - if isinstance(x, SequenceFile): - return QueryInput(str(x.path), x) - raise TypeError(f'Cannot convert {type(x)} instance to QueryInput') - - @attrs() class QueryResultItem: """Result for a single query sequence. Attributes ---------- - input - Information on input genome. + label + Unique label describing query. classifier_result Result of running classifier. report_taxon @@ -81,11 +53,14 @@ class QueryResultItem: closest_genomes List of closest reference genomes to query. 
Length determined by :attr:`.QueryParams.report_closest`. + file + Path to file containing query genome (optional). """ - input: QueryInput = attrib() + label: str = attrib() classifier_result: ClassifierResult = attrib() report_taxon: Optional[Taxon] = attrib(default=None) closest_genomes: list[GenomeMatch] = attrib(factory=list) + file: Optional[Path] = attrib(default=None, converter=optional_converter(Path)) @attrs(repr=False) @@ -122,7 +97,7 @@ def query(db: ReferenceDatabase, queries: Sequence[KmerSignature], params: Optional[QueryParams] = None, *, - inputs: Optional[Sequence[Union[QueryInput, SequenceFile, str]]] = None, + labels: Optional[Sequence[str]] = None, progress = None, **kw, ) -> QueryResults: @@ -137,10 +112,10 @@ def query(db: ReferenceDatabase, params ``QueryParams`` instance defining parameter values. If None take values from additional keyword arguments or use defaults. - inputs - Description for each input, converted to :class:`.QueryInput` in results - object. Only used for reporting, does not any other aspect of results. Items can be - ``QueryInput``, ``SequenceFile`` or ``str``. + labels + Optional list of string labels for each query. Only used for reporting (sets ``label`` + attribute of :class:`QueryResultItem` in results object), does not any other aspect of + results. progress Report progress for distance matrix calculation and classification. See :func:`gambit.util.progress.get_progress` for description of allowed values. 
@@ -152,18 +127,22 @@ def query(db: ReferenceDatabase, elif kw: warn('Additional keyword arguments ignored if "params" argument is not None.') - queries = list(queries) pconf = progress_config(progress) if len(queries) == 0: raise ValueError('Must supply at least one query.') - if inputs is not None: - inputs = list(map(QueryInput.convert, inputs)) - if len(inputs) != len(queries): - raise ValueError('Number of inputs does not match number of queries.') + # Labels + if labels is not None: + if len(labels) != len(queries): + raise ValueError('Number of labels does not match number of queries.') + + elif isinstance(queries, ReferenceSignatures): + # Get default labels from queries of ReferenceSignatures object + labels = list(map(str, queries.ids)) + else: - inputs = [QueryInput(str(i + 1)) for i in range(len(queries))] + labels = [str(i + 1) for i in range(len(queries))] # Calculate distances # (This will only be about 200kB per row/query [50k float32's] so having the whole thing in @@ -177,8 +156,11 @@ def query(db: ReferenceDatabase, ) # Classify inputs and create result items - with iter_progress(inputs, pconf, desc='Classifying') as inputs_iter: - items = [get_result_item(db, params, dmat[i, :], input) for i, input in enumerate(inputs_iter)] + with iter_progress(labels, pconf, desc='Classifying') as labels_iter: + items = [ + get_result_item(db, params, dmat[i, :], label) + for i, label in enumerate(labels_iter) + ] return QueryResults( items=items, @@ -188,7 +170,7 @@ def query(db: ReferenceDatabase, ) -def get_result_item(db:ReferenceDatabase, params: QueryParams, dists: np.ndarray, input: QueryInput) -> QueryResultItem: +def get_result_item(db: ReferenceDatabase, params: QueryParams, dists: np.ndarray, label: str) -> QueryResultItem: """Perform classification and create result item object for single query input. 
Parameters @@ -196,14 +178,14 @@ def get_result_item(db:ReferenceDatabase, params: QueryParams, dists: np.ndarray db params dists - Distances from query to reference genomes. - input + 1D array of distances from query to all reference genomes. + label """ clsresult = classify(db.genomes, dists, strict=params.classify_strict) closest = [GenomeMatch(db.genomes[i], dists[i]) for i in np.argsort(dists)[:params.report_closest]] return QueryResultItem( - input=input, + label=label, classifier_result=clsresult, report_taxon=reportable_taxon(clsresult.predicted_taxon), closest_genomes=closest, @@ -214,7 +196,7 @@ def query_parse(db: ReferenceDatabase, files: Sequence[SequenceFile], params: Optional[QueryParams] = None, *, - file_labels: Optional[Sequence[str]] = None, + labels: Optional[Sequence[str]] = None, parse_kw: Optional[dict[str, Any]] = None, **kw, ) -> QueryResults: @@ -229,7 +211,7 @@ def query_parse(db: ReferenceDatabase, params ``QueryParams`` instance defining parameter values. If None take values from additional keyword arguments or use defaults. - file_labels + labels Custom labels to use for each file in returned results object. If None use file names. parse_kw Keyword parameters to pass to :func:`gambit.sigs.calc.calc_file_signatures`. 
@@ -243,11 +225,18 @@ def query_parse(db: ReferenceDatabase, parse_kw = dict() parse_kw.setdefault('progress', pconf.update(desc='Parsing input')) - if file_labels is None: - inputs = files + if labels is None: + labels = [str(file.path) for file in files] else: - inputs = [QueryInput(label, file) for label, file in zip_strict(file_labels, files)] + if len(labels) != len(files): + raise ValueError('Number of labels does not match number of files') query_sigs = calc_file_signatures(db.signatures.kmerspec, files, **parse_kw) - return query(db, query_sigs, params, inputs=inputs, progress=pconf, **kw) + results = query(db, query_sigs, params, labels=labels, progress=pconf, **kw) + + # Assign file attribute of QueryResultItem's + for item, file in zip_strict(results.items, files): + item.file = file.path + + return results diff --git a/tests/test_query.py b/tests/test_query.py index c2baea2..4de1477 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -2,8 +2,7 @@ import pytest -from gambit.query import QueryInput, QueryResults, query, query_parse -from gambit.seq import SequenceFile +from gambit.query import QueryResults, query, query_parse from gambit.util.misc import zip_strict from gambit import __version__ as GAMBIT_VERSION @@ -11,21 +10,6 @@ from .results import compare_result_items, check_results -class TestQueryInput: - """Test QueryInput class.""" - - def test_convert(self): - file = SequenceFile('path/to/file.fa', 'fasta') - qi = QueryInput('foo', file) - - assert QueryInput.convert(qi) is qi - assert QueryInput.convert('foo') == QueryInput('foo', None) - assert QueryInput.convert(file) == QueryInput(str(file.path), file) - - with pytest.raises(TypeError): - QueryInput.convert(3.4) - - @pytest.mark.parametrize('strict', [False, True]) class TestQuery: """Run a full query using the Python API.""" @@ -55,8 +39,8 @@ def test_query(self, testdb: TestDB, strict: bool): self.check_results(results, ref_results) for sigid, item in zip_strict(query_sigs.ids, 
results.items): - assert item.input.file is None - # assert item.input.label == sigid + assert item.file is None + assert item.label == sigid def test_query_parse(self, testdb: TestDB, strict: bool): """Test the query_parse() function.""" @@ -69,5 +53,5 @@ def test_query_parse(self, testdb: TestDB, strict: bool): self.check_results(results, ref_results) for file, item in zip_strict(query_files, results.items): - assert item.input.file == file - assert item.input.label == str(file.path) + assert item.file == file.path + assert item.label == str(file.path) From 399057b17d0fff589eb8195514a660e1db267775 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 4 Aug 2024 22:20:42 -0700 Subject: [PATCH 73/86] Update results exporters --- src/gambit/results.py | 18 +++++++----------- tests/results.py | 17 +++++++---------- tests/test_results.py | 17 ++++++++++++----- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/gambit/results.py b/src/gambit/results.py index bd76a8e..c0af9db 100644 --- a/src/gambit/results.py +++ b/src/gambit/results.py @@ -11,7 +11,7 @@ from gambit.util.io import FilePath, maybe_open import gambit.util.json as gjson -from gambit.query import QueryResults, QueryResultItem, QueryInput +from gambit.query import QueryResults, QueryResultItem from gambit.db import ReferenceGenomeSet, Taxon, AnnotatedGenome, Genome @@ -84,8 +84,9 @@ class CSVResultsExporter(AbstractResultsExporter): """ format_opts: dict[str, Any] + # Pairs of column name and QueryResultItem attribute COLUMNS = [ - ('query', 'input.label'), + ('query', 'label'), ('predicted.name', 'report_taxon.name'), ('predicted.rank', 'report_taxon.rank'), ('predicted.ncbi_id', 'report_taxon.ncbi_id'), @@ -140,20 +141,15 @@ def _results_to_json(self, results: QueryResults): @to_json.register(QueryResultItem) def _item_to_json(self, item: QueryResultItem): return dict( - query=item.input, + query=dict( + name=item.label, + path=item.file, + ), predicted_taxon=item.report_taxon, 
next_taxon=item.classifier_result.next_taxon, closest_genomes=item.closest_genomes, ) - @to_json.register(QueryInput) - def _input_to_json(self, input: QueryInput): - return dict( - name=input.label, - path=None if input.file is None else input.file.path, - format=None if input.file is None else input.file.format, - ) - @to_json.register(ReferenceGenomeSet) def _genomeset_to_json(self, gset: ReferenceGenomeSet): return _todict(gset, ['id', 'key', 'version', 'name', 'description']) diff --git a/tests/results.py b/tests/results.py index 11491ba..88f27e0 100644 --- a/tests/results.py +++ b/tests/results.py @@ -72,7 +72,7 @@ def check_result_item(item: QueryResultItem, params: QueryParams, warnings: bool if predicted not in nt.ancestors(): if warnings: warn( - f'[Query {item.input.label}]: ' + f'[Query {item.label}]: ' f'next taxon {nt.name} not a descendant of predicted taxon {predicted.name}' ) @@ -199,22 +199,19 @@ def check_json_results(file: TextIO, for item, item_data in zip(results.items, data['items']): - # Compare data['query'] <-> item.input + # Compare data['query'] <-> item.label / item.file query = item_data['query'] - assert query['name'] == item.input.label + assert query['name'] == item.label - if item.input.file is None: + if item.file is None: assert query['path'] is None - assert query['format'] is None else: - assert query['format'] == item.input.file.format - # Check path matches exactly if strict mode, otherwise just file name if strict: - assert query['path'] == str(item.input.file.path) + assert query['path'] == str(item.file) else: - assert Path(query['path']).name == item.input.file.path.name + assert Path(query['path']).name == item.file.name # Predicted/next taxon cmp_taxon_json(item_data['predicted_taxon'], item.report_taxon) @@ -269,7 +266,7 @@ def check_csv_results(file: TextIO, results: QueryResults, strict: bool = False) assert len(rows) == len(results.items) for item, row in zip(results.items, rows): - assert row['query'] == 
item.input.label + assert row['query'] == item.label cmp_csv_taxon(row, item.report_taxon, 'predicted') cmp_csv_taxon(row, item.classifier_result.next_taxon, 'next') diff --git a/tests/test_results.py b/tests/test_results.py index b739042..70a7908 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -1,12 +1,18 @@ +"""Test the gambit.results module. + +Each ResultsExporter subclass is tested by exporting a fake QueryResults instance to a string buffer, +parsing the exported results and checking the against the original using the functions in the +.results tests helper module. +""" + from io import StringIO import pytest -from gambit.query import QueryResults, QueryResultItem, QueryInput, QueryParams +from gambit.query import QueryResults, QueryResultItem, QueryParams from gambit.classify import ClassifierResult, GenomeMatch from gambit.db import ReferenceGenomeSet, Genome from gambit.sigs import SignaturesMeta -from gambit.seq import SequenceFile from gambit.results import JSONResultsExporter, CSVResultsExporter, ResultsArchiveReader, ResultsArchiveWriter from .results import check_json_results, check_csv_results @@ -77,14 +83,15 @@ def results(session): for i, cr in enumerate(classifier_results): predicted = cr.predicted_taxon items.append(QueryResultItem( - input=QueryInput(f'query-{i}', SequenceFile(f'query-{i}.fasta', 'fasta')), + f'query-{i}', classifier_result=cr, + file=f'query-{i}.fasta', report_taxon=None if predicted is None else predicted.parent if i % 4 == 0 else predicted, closest_genomes=[cr.closest_match], )) - # Set one input file to None - items[-1].input.file = None + # Set one file to None + items[-1].file = None return QueryResults( items=items, From 881c87c36be33d96403a7012d44120c13a9a5fa5 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 30 Nov 2024 21:45:26 -0800 Subject: [PATCH 74/86] WIP update test results generation script --- tests/data/testdb_210818/generate-results.py | 2 +- .../testdb_210818/results/non_strict.json | 
504 ++++-------------- tests/data/testdb_210818/results/strict.json | 504 ++++-------------- 3 files changed, 205 insertions(+), 805 deletions(-) diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index 1c4bd22..4125e90 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -40,7 +40,7 @@ def check_results(queries: list[TestQueryGenome], query_files: list[SequenceFile clsresult = item.classifier_result predicted = clsresult.predicted_taxon - assert item.input.file == query_file + assert item.file == query_file.path # Check if warnings expected (only if in strict mode) assert bool(clsresult.warnings) == (strict and query['warnings']) diff --git a/tests/data/testdb_210818/results/non_strict.json b/tests/data/testdb_210818/results/non_strict.json index 57763ea..ebea251 100644 --- a/tests/data/testdb_210818/results/non_strict.json +++ b/tests/data/testdb_210818/results/non_strict.json @@ -1,6 +1,6 @@ { "extra": {}, - "gambit_version": "0.4.0", + "gambit_version": "1.0.1", "genomeset": { "key": "gambit/testdb_210818", "version": "1.0" @@ -96,14 +96,8 @@ "matched_taxon": null } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/unclassifiable.fasta" - }, - "label": "unclassifiable.fasta" - }, + "file": "queries/genomes/unclassifiable.fasta", + "label": "queries/genomes/unclassifiable.fasta", "report_taxon": null }, { @@ -226,14 +220,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1.fasta" - }, - "label": "A1.fasta" - }, + "file": "queries/genomes/A1.fasta", + "label": "queries/genomes/A1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1" } @@ -358,14 +346,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C1.fasta" - }, - "label": "A1_B1_C1.fasta" - }, + "file": 
"queries/genomes/A1_B1_C1.fasta", + "label": "queries/genomes/A1_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C1" } @@ -490,14 +472,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C2.fasta" - }, - "label": "A1_B1_C2.fasta" - }, + "file": "queries/genomes/A1_B1_C2.fasta", + "label": "queries/genomes/A1_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C2" } @@ -622,14 +598,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C3.fasta" - }, - "label": "A1_B1_C3.fasta" - }, + "file": "queries/genomes/A1_B1_C3.fasta", + "label": "queries/genomes/A1_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C3" } @@ -754,14 +724,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C4.fasta" - }, - "label": "A1_B1_C4.fasta" - }, + "file": "queries/genomes/A1_B1_C4.fasta", + "label": "queries/genomes/A1_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C4" } @@ -886,14 +850,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2.fasta" - }, - "label": "A1_B2.fasta" - }, + "file": "queries/genomes/A1_B2.fasta", + "label": "queries/genomes/A1_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2" } @@ -1018,14 +976,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C1.fasta" - }, - "label": "A1_B2_C1.fasta" - }, + "file": "queries/genomes/A1_B2_C1.fasta", + "label": "queries/genomes/A1_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C1" } @@ -1150,14 +1102,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C2.fasta" - }, - "label": "A1_B2_C2.fasta" - }, + "file": "queries/genomes/A1_B2_C2.fasta", + "label": 
"queries/genomes/A1_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C2" } @@ -1282,14 +1228,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C3.fasta" - }, - "label": "A1_B2_C3.fasta" - }, + "file": "queries/genomes/A1_B2_C3.fasta", + "label": "queries/genomes/A1_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C3" } @@ -1414,14 +1354,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C4.fasta" - }, - "label": "A1_B2_C4.fasta" - }, + "file": "queries/genomes/A1_B2_C4.fasta", + "label": "queries/genomes/A1_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C4" } @@ -1546,14 +1480,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3.fasta" - }, - "label": "A1_B3.fasta" - }, + "file": "queries/genomes/A1_B3.fasta", + "label": "queries/genomes/A1_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3" } @@ -1678,14 +1606,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C1.fasta" - }, - "label": "A1_B3_C1.fasta" - }, + "file": "queries/genomes/A1_B3_C1.fasta", + "label": "queries/genomes/A1_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C1" } @@ -1810,14 +1732,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C2.fasta" - }, - "label": "A1_B3_C2.fasta" - }, + "file": "queries/genomes/A1_B3_C2.fasta", + "label": "queries/genomes/A1_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C2" } @@ -1942,14 +1858,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C3.fasta" - }, - "label": "A1_B3_C3.fasta" - }, + "file": "queries/genomes/A1_B3_C3.fasta", + "label": "queries/genomes/A1_B3_C3.fasta", 
"report_taxon": { "key": "gambit/testdb_210818/A1_B3_C3" } @@ -2074,14 +1984,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C4.fasta" - }, - "label": "A1_B3_C4.fasta" - }, + "file": "queries/genomes/A1_B3_C4.fasta", + "label": "queries/genomes/A1_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C4" } @@ -2206,14 +2110,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2.fasta" - }, - "label": "A2.fasta" - }, + "file": "queries/genomes/A2.fasta", + "label": "queries/genomes/A2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -2338,14 +2236,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1.fasta" - }, - "label": "A2_B1.fasta" - }, + "file": "queries/genomes/A2_B1.fasta", + "label": "queries/genomes/A2_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1" } @@ -2470,14 +2362,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C1.fasta" - }, - "label": "A2_B1_C1.fasta" - }, + "file": "queries/genomes/A2_B1_C1.fasta", + "label": "queries/genomes/A2_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C1" } @@ -2602,14 +2488,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C2.fasta" - }, - "label": "A2_B1_C2.fasta" - }, + "file": "queries/genomes/A2_B1_C2.fasta", + "label": "queries/genomes/A2_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C2" } @@ -2734,14 +2614,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C3.fasta" - }, - "label": "A2_B1_C3.fasta" - }, + "file": "queries/genomes/A2_B1_C3.fasta", + "label": "queries/genomes/A2_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C3" } @@ -2866,14 
+2740,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C4.fasta" - }, - "label": "A2_B1_C4.fasta" - }, + "file": "queries/genomes/A2_B1_C4.fasta", + "label": "queries/genomes/A2_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C4" } @@ -2998,14 +2866,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2.fasta" - }, - "label": "A2_B2.fasta" - }, + "file": "queries/genomes/A2_B2.fasta", + "label": "queries/genomes/A2_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -3130,14 +2992,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C1.fasta" - }, - "label": "A2_B2_C1.fasta" - }, + "file": "queries/genomes/A2_B2_C1.fasta", + "label": "queries/genomes/A2_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C1" } @@ -3262,14 +3118,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C2.fasta" - }, - "label": "A2_B2_C2.fasta" - }, + "file": "queries/genomes/A2_B2_C2.fasta", + "label": "queries/genomes/A2_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C2" } @@ -3394,14 +3244,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C3.fasta" - }, - "label": "A2_B2_C3.fasta" - }, + "file": "queries/genomes/A2_B2_C3.fasta", + "label": "queries/genomes/A2_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C3" } @@ -3526,14 +3370,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C4.fasta" - }, - "label": "A2_B2_C4.fasta" - }, + "file": "queries/genomes/A2_B2_C4.fasta", + "label": "queries/genomes/A2_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C4" } @@ -3660,14 +3498,8 @@ } } ], - "input": { - "file": { - 
"compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3.fasta" - }, - "label": "A2_B3.fasta" - }, + "file": "queries/genomes/A2_B3.fasta", + "label": "queries/genomes/A2_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3" } @@ -3792,14 +3624,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C1.fasta" - }, - "label": "A2_B3_C1.fasta" - }, + "file": "queries/genomes/A2_B3_C1.fasta", + "label": "queries/genomes/A2_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C1" } @@ -3924,14 +3750,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C2.fasta" - }, - "label": "A2_B3_C2.fasta" - }, + "file": "queries/genomes/A2_B3_C2.fasta", + "label": "queries/genomes/A2_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C2" } @@ -4056,14 +3876,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C3.fasta" - }, - "label": "A2_B3_C3.fasta" - }, + "file": "queries/genomes/A2_B3_C3.fasta", + "label": "queries/genomes/A2_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C3" } @@ -4188,14 +4002,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C4.fasta" - }, - "label": "A2_B3_C4.fasta" - }, + "file": "queries/genomes/A2_B3_C4.fasta", + "label": "queries/genomes/A2_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C4" } @@ -4320,14 +4128,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3.fasta" - }, - "label": "A3.fasta" - }, + "file": "queries/genomes/A3.fasta", + "label": "queries/genomes/A3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3" } @@ -4452,14 +4254,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": 
"queries/genomes/A3_B1.fasta" - }, - "label": "A3_B1.fasta" - }, + "file": "queries/genomes/A3_B1.fasta", + "label": "queries/genomes/A3_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1" } @@ -4584,14 +4380,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C1.fasta" - }, - "label": "A3_B1_C1.fasta" - }, + "file": "queries/genomes/A3_B1_C1.fasta", + "label": "queries/genomes/A3_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C1" } @@ -4716,14 +4506,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C2.fasta" - }, - "label": "A3_B1_C2.fasta" - }, + "file": "queries/genomes/A3_B1_C2.fasta", + "label": "queries/genomes/A3_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C2" } @@ -4848,14 +4632,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C3.fasta" - }, - "label": "A3_B1_C3.fasta" - }, + "file": "queries/genomes/A3_B1_C3.fasta", + "label": "queries/genomes/A3_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C3" } @@ -4980,14 +4758,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C4.fasta" - }, - "label": "A3_B1_C4.fasta" - }, + "file": "queries/genomes/A3_B1_C4.fasta", + "label": "queries/genomes/A3_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C4" } @@ -5112,14 +4884,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2.fasta" - }, - "label": "A3_B2.fasta" - }, + "file": "queries/genomes/A3_B2.fasta", + "label": "queries/genomes/A3_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2" } @@ -5244,14 +5010,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C1.fasta" - }, - "label": 
"A3_B2_C1.fasta" - }, + "file": "queries/genomes/A3_B2_C1.fasta", + "label": "queries/genomes/A3_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C1" } @@ -5376,14 +5136,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C2.fasta" - }, - "label": "A3_B2_C2.fasta" - }, + "file": "queries/genomes/A3_B2_C2.fasta", + "label": "queries/genomes/A3_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C2" } @@ -5508,14 +5262,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C3.fasta" - }, - "label": "A3_B2_C3.fasta" - }, + "file": "queries/genomes/A3_B2_C3.fasta", + "label": "queries/genomes/A3_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C3" } @@ -5640,14 +5388,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C4.fasta" - }, - "label": "A3_B2_C4.fasta" - }, + "file": "queries/genomes/A3_B2_C4.fasta", + "label": "queries/genomes/A3_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C4" } @@ -5772,14 +5514,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3.fasta" - }, - "label": "A3_B3.fasta" - }, + "file": "queries/genomes/A3_B3.fasta", + "label": "queries/genomes/A3_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3" } @@ -5904,14 +5640,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C1.fasta" - }, - "label": "A3_B3_C1.fasta" - }, + "file": "queries/genomes/A3_B3_C1.fasta", + "label": "queries/genomes/A3_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C1" } @@ -6036,14 +5766,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C2.fasta" - }, - "label": "A3_B3_C2.fasta" - }, + "file": 
"queries/genomes/A3_B3_C2.fasta", + "label": "queries/genomes/A3_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C2" } @@ -6168,14 +5892,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C3.fasta" - }, - "label": "A3_B3_C3.fasta" - }, + "file": "queries/genomes/A3_B3_C3.fasta", + "label": "queries/genomes/A3_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C3" } @@ -6300,14 +6018,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C4.fasta" - }, - "label": "A3_B3_C4.fasta" - }, + "file": "queries/genomes/A3_B3_C4.fasta", + "label": "queries/genomes/A3_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C4" } @@ -6432,14 +6144,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/inconsistent.fasta" - }, - "label": "inconsistent.fasta" - }, + "file": "queries/genomes/inconsistent.fasta", + "label": "queries/genomes/inconsistent.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -6566,14 +6272,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/primary_not_closest.fasta" - }, - "label": "primary_not_closest.fasta" - }, + "file": "queries/genomes/primary_not_closest.fasta", + "label": "queries/genomes/primary_not_closest.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -6595,5 +6295,5 @@ "name": "testdb_210818", "version": "1.0" }, - "timestamp": "2022-01-09T17:29:21.540969" + "timestamp": "2024-11-30T22:19:38.878341" } \ No newline at end of file diff --git a/tests/data/testdb_210818/results/strict.json b/tests/data/testdb_210818/results/strict.json index 23487dd..259a6c5 100644 --- a/tests/data/testdb_210818/results/strict.json +++ b/tests/data/testdb_210818/results/strict.json @@ -1,6 +1,6 @@ { "extra": {}, - "gambit_version": "0.4.0", + 
"gambit_version": "1.0.1", "genomeset": { "key": "gambit/testdb_210818", "version": "1.0" @@ -96,14 +96,8 @@ "matched_taxon": null } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/unclassifiable.fasta" - }, - "label": "unclassifiable.fasta" - }, + "file": "queries/genomes/unclassifiable.fasta", + "label": "queries/genomes/unclassifiable.fasta", "report_taxon": null }, { @@ -226,14 +220,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1.fasta" - }, - "label": "A1.fasta" - }, + "file": "queries/genomes/A1.fasta", + "label": "queries/genomes/A1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1" } @@ -358,14 +346,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C1.fasta" - }, - "label": "A1_B1_C1.fasta" - }, + "file": "queries/genomes/A1_B1_C1.fasta", + "label": "queries/genomes/A1_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C1" } @@ -490,14 +472,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C2.fasta" - }, - "label": "A1_B1_C2.fasta" - }, + "file": "queries/genomes/A1_B1_C2.fasta", + "label": "queries/genomes/A1_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C2" } @@ -622,14 +598,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C3.fasta" - }, - "label": "A1_B1_C3.fasta" - }, + "file": "queries/genomes/A1_B1_C3.fasta", + "label": "queries/genomes/A1_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B1_C3" } @@ -754,14 +724,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B1_C4.fasta" - }, - "label": "A1_B1_C4.fasta" - }, + "file": "queries/genomes/A1_B1_C4.fasta", + "label": "queries/genomes/A1_B1_C4.fasta", "report_taxon": { "key": 
"gambit/testdb_210818/A1_B1_C4" } @@ -886,14 +850,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2.fasta" - }, - "label": "A1_B2.fasta" - }, + "file": "queries/genomes/A1_B2.fasta", + "label": "queries/genomes/A1_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2" } @@ -1018,14 +976,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C1.fasta" - }, - "label": "A1_B2_C1.fasta" - }, + "file": "queries/genomes/A1_B2_C1.fasta", + "label": "queries/genomes/A1_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C1" } @@ -1150,14 +1102,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C2.fasta" - }, - "label": "A1_B2_C2.fasta" - }, + "file": "queries/genomes/A1_B2_C2.fasta", + "label": "queries/genomes/A1_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C2" } @@ -1282,14 +1228,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C3.fasta" - }, - "label": "A1_B2_C3.fasta" - }, + "file": "queries/genomes/A1_B2_C3.fasta", + "label": "queries/genomes/A1_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C3" } @@ -1414,14 +1354,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B2_C4.fasta" - }, - "label": "A1_B2_C4.fasta" - }, + "file": "queries/genomes/A1_B2_C4.fasta", + "label": "queries/genomes/A1_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B2_C4" } @@ -1546,14 +1480,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3.fasta" - }, - "label": "A1_B3.fasta" - }, + "file": "queries/genomes/A1_B3.fasta", + "label": "queries/genomes/A1_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3" } @@ -1678,14 +1606,8 @@ } } ], 
- "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C1.fasta" - }, - "label": "A1_B3_C1.fasta" - }, + "file": "queries/genomes/A1_B3_C1.fasta", + "label": "queries/genomes/A1_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C1" } @@ -1810,14 +1732,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C2.fasta" - }, - "label": "A1_B3_C2.fasta" - }, + "file": "queries/genomes/A1_B3_C2.fasta", + "label": "queries/genomes/A1_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C2" } @@ -1942,14 +1858,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C3.fasta" - }, - "label": "A1_B3_C3.fasta" - }, + "file": "queries/genomes/A1_B3_C3.fasta", + "label": "queries/genomes/A1_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C3" } @@ -2074,14 +1984,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A1_B3_C4.fasta" - }, - "label": "A1_B3_C4.fasta" - }, + "file": "queries/genomes/A1_B3_C4.fasta", + "label": "queries/genomes/A1_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A1_B3_C4" } @@ -2206,14 +2110,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2.fasta" - }, - "label": "A2.fasta" - }, + "file": "queries/genomes/A2.fasta", + "label": "queries/genomes/A2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2" } @@ -2338,14 +2236,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1.fasta" - }, - "label": "A2_B1.fasta" - }, + "file": "queries/genomes/A2_B1.fasta", + "label": "queries/genomes/A2_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1" } @@ -2470,14 +2362,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - 
"path": "queries/genomes/A2_B1_C1.fasta" - }, - "label": "A2_B1_C1.fasta" - }, + "file": "queries/genomes/A2_B1_C1.fasta", + "label": "queries/genomes/A2_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C1" } @@ -2602,14 +2488,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C2.fasta" - }, - "label": "A2_B1_C2.fasta" - }, + "file": "queries/genomes/A2_B1_C2.fasta", + "label": "queries/genomes/A2_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C2" } @@ -2734,14 +2614,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C3.fasta" - }, - "label": "A2_B1_C3.fasta" - }, + "file": "queries/genomes/A2_B1_C3.fasta", + "label": "queries/genomes/A2_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C3" } @@ -2866,14 +2740,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B1_C4.fasta" - }, - "label": "A2_B1_C4.fasta" - }, + "file": "queries/genomes/A2_B1_C4.fasta", + "label": "queries/genomes/A2_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1_C4" } @@ -2998,14 +2866,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2.fasta" - }, - "label": "A2_B2.fasta" - }, + "file": "queries/genomes/A2_B2.fasta", + "label": "queries/genomes/A2_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2" } @@ -3130,14 +2992,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C1.fasta" - }, - "label": "A2_B2_C1.fasta" - }, + "file": "queries/genomes/A2_B2_C1.fasta", + "label": "queries/genomes/A2_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C1" } @@ -3262,14 +3118,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C2.fasta" - }, 
- "label": "A2_B2_C2.fasta" - }, + "file": "queries/genomes/A2_B2_C2.fasta", + "label": "queries/genomes/A2_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C2" } @@ -3394,14 +3244,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C3.fasta" - }, - "label": "A2_B2_C3.fasta" - }, + "file": "queries/genomes/A2_B2_C3.fasta", + "label": "queries/genomes/A2_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C3" } @@ -3526,14 +3370,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B2_C4.fasta" - }, - "label": "A2_B2_C4.fasta" - }, + "file": "queries/genomes/A2_B2_C4.fasta", + "label": "queries/genomes/A2_B2_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B2_C4" } @@ -3660,14 +3498,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3.fasta" - }, - "label": "A2_B3.fasta" - }, + "file": "queries/genomes/A2_B3.fasta", + "label": "queries/genomes/A2_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3" } @@ -3792,14 +3624,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C1.fasta" - }, - "label": "A2_B3_C1.fasta" - }, + "file": "queries/genomes/A2_B3_C1.fasta", + "label": "queries/genomes/A2_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C1" } @@ -3924,14 +3750,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C2.fasta" - }, - "label": "A2_B3_C2.fasta" - }, + "file": "queries/genomes/A2_B3_C2.fasta", + "label": "queries/genomes/A2_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C2" } @@ -4056,14 +3876,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C3.fasta" - }, - "label": "A2_B3_C3.fasta" - }, + "file": 
"queries/genomes/A2_B3_C3.fasta", + "label": "queries/genomes/A2_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C3" } @@ -4188,14 +4002,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A2_B3_C4.fasta" - }, - "label": "A2_B3_C4.fasta" - }, + "file": "queries/genomes/A2_B3_C4.fasta", + "label": "queries/genomes/A2_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B3_C4" } @@ -4320,14 +4128,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3.fasta" - }, - "label": "A3.fasta" - }, + "file": "queries/genomes/A3.fasta", + "label": "queries/genomes/A3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3" } @@ -4452,14 +4254,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1.fasta" - }, - "label": "A3_B1.fasta" - }, + "file": "queries/genomes/A3_B1.fasta", + "label": "queries/genomes/A3_B1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1" } @@ -4584,14 +4380,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C1.fasta" - }, - "label": "A3_B1_C1.fasta" - }, + "file": "queries/genomes/A3_B1_C1.fasta", + "label": "queries/genomes/A3_B1_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C1" } @@ -4716,14 +4506,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C2.fasta" - }, - "label": "A3_B1_C2.fasta" - }, + "file": "queries/genomes/A3_B1_C2.fasta", + "label": "queries/genomes/A3_B1_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C2" } @@ -4848,14 +4632,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C3.fasta" - }, - "label": "A3_B1_C3.fasta" - }, + "file": "queries/genomes/A3_B1_C3.fasta", + "label": 
"queries/genomes/A3_B1_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C3" } @@ -4980,14 +4758,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B1_C4.fasta" - }, - "label": "A3_B1_C4.fasta" - }, + "file": "queries/genomes/A3_B1_C4.fasta", + "label": "queries/genomes/A3_B1_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B1_C4" } @@ -5112,14 +4884,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2.fasta" - }, - "label": "A3_B2.fasta" - }, + "file": "queries/genomes/A3_B2.fasta", + "label": "queries/genomes/A3_B2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2" } @@ -5244,14 +5010,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C1.fasta" - }, - "label": "A3_B2_C1.fasta" - }, + "file": "queries/genomes/A3_B2_C1.fasta", + "label": "queries/genomes/A3_B2_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C1" } @@ -5376,14 +5136,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C2.fasta" - }, - "label": "A3_B2_C2.fasta" - }, + "file": "queries/genomes/A3_B2_C2.fasta", + "label": "queries/genomes/A3_B2_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C2" } @@ -5508,14 +5262,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C3.fasta" - }, - "label": "A3_B2_C3.fasta" - }, + "file": "queries/genomes/A3_B2_C3.fasta", + "label": "queries/genomes/A3_B2_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B2_C3" } @@ -5640,14 +5388,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B2_C4.fasta" - }, - "label": "A3_B2_C4.fasta" - }, + "file": "queries/genomes/A3_B2_C4.fasta", + "label": "queries/genomes/A3_B2_C4.fasta", 
"report_taxon": { "key": "gambit/testdb_210818/A3_B2_C4" } @@ -5772,14 +5514,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3.fasta" - }, - "label": "A3_B3.fasta" - }, + "file": "queries/genomes/A3_B3.fasta", + "label": "queries/genomes/A3_B3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3" } @@ -5904,14 +5640,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C1.fasta" - }, - "label": "A3_B3_C1.fasta" - }, + "file": "queries/genomes/A3_B3_C1.fasta", + "label": "queries/genomes/A3_B3_C1.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C1" } @@ -6036,14 +5766,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C2.fasta" - }, - "label": "A3_B3_C2.fasta" - }, + "file": "queries/genomes/A3_B3_C2.fasta", + "label": "queries/genomes/A3_B3_C2.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C2" } @@ -6168,14 +5892,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C3.fasta" - }, - "label": "A3_B3_C3.fasta" - }, + "file": "queries/genomes/A3_B3_C3.fasta", + "label": "queries/genomes/A3_B3_C3.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C3" } @@ -6300,14 +6018,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/A3_B3_C4.fasta" - }, - "label": "A3_B3_C4.fasta" - }, + "file": "queries/genomes/A3_B3_C4.fasta", + "label": "queries/genomes/A3_B3_C4.fasta", "report_taxon": { "key": "gambit/testdb_210818/A3_B3_C4" } @@ -6434,14 +6146,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/inconsistent.fasta" - }, - "label": "inconsistent.fasta" - }, + "file": "queries/genomes/inconsistent.fasta", + "label": "queries/genomes/inconsistent.fasta", "report_taxon": { "key": 
"gambit/testdb_210818/A2" } @@ -6570,14 +6276,8 @@ } } ], - "input": { - "file": { - "compression": null, - "format": "fasta", - "path": "queries/genomes/primary_not_closest.fasta" - }, - "label": "primary_not_closest.fasta" - }, + "file": "queries/genomes/primary_not_closest.fasta", + "label": "queries/genomes/primary_not_closest.fasta", "report_taxon": { "key": "gambit/testdb_210818/A2_B1" } @@ -6599,5 +6299,5 @@ "name": "testdb_210818", "version": "1.0" }, - "timestamp": "2022-01-09T17:29:22.044407" + "timestamp": "2024-11-30T22:19:39.124711" } \ No newline at end of file From bc6e1773685ac2f9efc2d065c2e5b6fb017663ca Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 30 Nov 2024 23:10:41 -0800 Subject: [PATCH 75/86] Additional updates to results tests --- tests/cli/test_query.py | 63 +++++++++++++++++++++++------------------ tests/results.py | 8 +++--- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index d7eaad5..9421187 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -9,7 +9,7 @@ import pytest from gambit.seq import SequenceFile -from gambit.query import QueryInput, QueryResults +from gambit.query import QueryResults from gambit.util.misc import zip_strict from gambit.util.io import write_lines, FilePath from gambit.cli.common import strip_seq_file_ext @@ -20,13 +20,13 @@ def make_args(testdb: TestDB, *, - positional_files: Optional[Iterable[SequenceFile]] = None, - list_file: Optional['FilePath'] = None, - sig_file: bool = False, - output: Optional['FilePath'] = None, - outfmt: Optional[str] = None, - strict: bool=False, - ) -> list[str]: + positional_files: Optional[Iterable[SequenceFile]] = None, + list_file: Optional['FilePath'] = None, + sig_file: bool = False, + output: Optional['FilePath'] = None, + outfmt: Optional[str] = None, + strict: bool=False, + ) -> list[str]: """Make command line arguments for querying.""" args: list[str] = [f'--db={testdb.paths.root}', 
'query'] @@ -50,16 +50,29 @@ def make_args(testdb: TestDB, *, return args -def make_ref_results(testdb: TestDB, inputs: Iterable[QueryInput], strict: bool, nqueries: Optional[int]): +def make_ref_results(testdb: TestDB, + labels: Iterable[str], + strict: bool, + files: Optional[Iterable[FilePath]], + nqueries: Optional[int] = None, + ): """ Make a copy of the reference query results to compare to, modifying to account for possibly - different query inputs and # of queries. + different query labels/files and # of queries. """ ref_results = copy(testdb.get_query_results(strict)) ref_results.items = ref_results.items[:nqueries] - for item, input in zip_strict(ref_results.items, inputs): - item.input = input + for item, label in zip_strict(ref_results.items, labels): + item.label = label + + if files is None: + for item in ref_results.items: + item.file = None + + if files is not None: + for item, file in zip_strict(ref_results.items, files): + item.file = Path(file) return ref_results @@ -94,21 +107,18 @@ def check_results(results_file: Path, out_fmt: str, ref_results: QueryResults): ], ) def test_full_query(testdb: TestDB, - nqueries: Optional[int], - use_list_file: bool, - out_fmt: str, - strict: bool, - gzipped: bool, - tmp_path: Path, - ): + nqueries: Optional[int], + use_list_file: bool, + out_fmt: str, + strict: bool, + gzipped: bool, + tmp_path: Path, + ): """Run a full query using the command line interface.""" - query_files = testdb.get_query_files(gzipped)[:nqueries] - inputs = [ - QueryInput(strip_seq_file_ext(file.path.name), file) - for file in query_files - ] - ref_results: QueryResults = make_ref_results(testdb, inputs, strict, nqueries) + query_files = [sigfile.path for sigfile in testdb.get_query_files(gzipped)[:nqueries]] + labels = [strip_seq_file_ext(file.name) for file in query_files] + ref_results: QueryResults = make_ref_results(testdb, labels, strict, query_files, nqueries=nqueries) results_file = tmp_path / ('results.' 
+ out_fmt) @@ -137,8 +147,7 @@ def test_full_query(testdb: TestDB, def test_sigfile(testdb: TestDB, out_fmt: str, strict: bool, tmp_path: Path): """Test using signature file instead of parsing genome files.""" - inputs = list(map(QueryInput, testdb.query_signatures.ids)) - ref_results = make_ref_results(testdb, inputs, strict, None) + ref_results = make_ref_results(testdb, testdb.query_signatures.ids, strict, None) results_file = tmp_path / ('results.' + out_fmt) diff --git a/tests/results.py b/tests/results.py index 88f27e0..33f0d0b 100644 --- a/tests/results.py +++ b/tests/results.py @@ -164,12 +164,12 @@ def cmp_genomematch_json(data, match: GenomeMatch): cmp_taxon_json(data['matched_taxon'], match.matched_taxon) -def check_json_results(file: TextIO, - results: QueryResults, - strict: bool = False, - ): +def check_json_results(file: TextIO, results: QueryResults, strict: bool = False): """Assert exported JSON data matches the given results object. + "Strict" mode also compares the ``timestamp``, ``gambit_version``, and ``extra`` attributes + at the top level and expects that full input file paths must match instead of just file names. 
+ Parameters ---------- file From ae9895e69a4a171eb9fe9c6eca40b577b42749ff Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 00:23:56 -0800 Subject: [PATCH 76/86] Remove SequenceFile class --- src/gambit/cli/common.py | 17 +- src/gambit/cli/dist.py | 7 +- src/gambit/cli/signatures.py | 2 +- src/gambit/cli/tree.py | 4 +- src/gambit/query.py | 8 +- src/gambit/seq.py | 122 -------------- src/gambit/sigs/calc.py | 9 +- tests/cli/test_common.py | 11 +- tests/cli/test_dist.py | 12 +- tests/cli/test_query.py | 5 +- tests/cli/test_signatures.py | 2 +- tests/cli/test_tree.py | 2 +- tests/data/testdb_210818/generate-results.py | 6 +- tests/sigs/test_calc.py | 16 +- tests/test_query.py | 4 +- tests/test_seq.py | 163 +------------------ tests/testdb.py | 9 +- 17 files changed, 52 insertions(+), 347 deletions(-) diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py index f02dac5..9316d59 100644 --- a/src/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -13,7 +13,7 @@ from gambit.sigs.base import ReferenceSignatures, load_signatures from gambit.util.io import FilePath, read_lines from gambit.util.misc import join_list_human -from gambit.seq import validate_dna_seq_bytes, SequenceFile +from gambit.seq import validate_dna_seq_bytes class CLIContext: @@ -305,7 +305,7 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, listfile_dir: Optional[str]=None, strip_dir: bool = True, strip_ext: bool = True, - ) -> Union[tuple[list[str], list[SequenceFile]], tuple[None, None]]: + ) -> Union[tuple[list[str], list[Path]], tuple[None, None]]: """Get list of sequence file paths and IDs from several types of CLI arguments. Does not check for conflict between ``explicit`` and ``listfile``. @@ -325,25 +325,24 @@ def get_sequence_files(explicit: Optional[Iterable[FilePath]]=None, Returns ------- - tuple[Optional[list[str]], Optional[list[SequenceFile]]] + tuple[Optional[list[str]], Optional[list[Path]]] ``(ids, files)`` tuple. 
``ids`` is a list of string IDs that can be used to label output. If the ``explicit`` and ``listfile`` arguments are None/empty both components of the tuple will be None as well. """ if explicit: - paths = list(map(Path, explicit)) - paths_str = list(map(str, paths)) + files = list(map(Path, explicit)) + files_str = list(map(str, files)) elif listfile is not None: lines = list(read_lines(listfile, skip_empty=True)) - paths = [Path(listfile_dir) / line for line in lines] - paths_str = lines + files = [Path(listfile_dir) / line for line in lines] + files_str = lines else: return None, None - files = SequenceFile.from_paths(paths, 'fasta', 'auto') - ids = [get_file_id(f, strip_dir, strip_ext) for f in paths_str] + ids = [get_file_id(f, strip_dir, strip_ext) for f in files_str] return ids, files diff --git a/src/gambit/cli/dist.py b/src/gambit/cli/dist.py index 6d702a3..7928139 100644 --- a/src/gambit/cli/dist.py +++ b/src/gambit/cli/dist.py @@ -5,7 +5,6 @@ from . import common from .root import cli -from gambit.seq import SequenceFile from gambit.sigs import load_signatures from gambit.sigs.calc import calc_file_signatures from gambit.metric import jaccarddist_matrix, jaccarddist_pairwise @@ -137,9 +136,8 @@ def dist_cmd(ctx: click.Context, # Calculate signatures if needed if query_sigs is None: - query_sigfiles = SequenceFile.from_paths(query_files, 'fasta', 'auto') query_pconf = progress_config(prog, desc='Calculating query genome signatures') if len(query_files) > 1 else None - query_sigs = calc_file_signatures(kspec, query_sigfiles, progress=query_pconf, max_workers=cores) + query_sigs = calc_file_signatures(kspec, query_files, progress=query_pconf, max_workers=cores) # Calculate distances dist_pconf = progress_config(prog, desc='Calculating distances') @@ -151,9 +149,8 @@ def dist_cmd(ctx: click.Context, else: if ref_sigs is None: - ref_sigfiles = SequenceFile.from_paths(ref_files, 'fasta', 'auto') ref_pconf = progress_config('click', desc='Calculating reference 
genome signatures') if len(ref_files) > 1 else None - ref_sigs = calc_file_signatures(kspec, ref_sigfiles, progress=ref_pconf) + ref_sigs = calc_file_signatures(kspec, ref_files, progress=ref_pconf) dmat = jaccarddist_matrix(query_sigs, ref_sigs, progress=dist_pconf) diff --git a/src/gambit/cli/signatures.py b/src/gambit/cli/signatures.py index 2af326d..d60628b 100644 --- a/src/gambit/cli/signatures.py +++ b/src/gambit/cli/signatures.py @@ -208,7 +208,7 @@ def create(ctx: click.Context, if dump_params: params = dict( kmerspec=kspec, - files=[f.path for f in files], + files=files, meta=meta, ids=ids, ) diff --git a/src/gambit/cli/tree.py b/src/gambit/cli/tree.py index deed426..a252c30 100644 --- a/src/gambit/cli/tree.py +++ b/src/gambit/cli/tree.py @@ -6,7 +6,6 @@ from . import common from .root import cli -from gambit.seq import SequenceFile from gambit.sigs import load_signatures from gambit.sigs.calc import calc_file_signatures from gambit.metric import jaccarddist_pairwise @@ -58,8 +57,7 @@ def tree_cmd(ctx: click.Context, common.warn_duplicate_file_ids(labels, 'Warning: the following file IDs are present more than once: {ids}') kspec = common.kspec_from_params(k, prefix, default=True) - sigfiles = SequenceFile.from_paths(genome_files, 'fasta', 'auto') - sigs = calc_file_signatures(kspec, sigfiles, progress=pconf.update(desc='Calculating signatures'), max_workers=cores) + sigs = calc_file_signatures(kspec, genome_files, progress=pconf.update(desc='Calculating signatures'), max_workers=cores) # Calculate distances dmat = jaccarddist_pairwise(sigs, progress=pconf.update(desc='Calculating distances')) diff --git a/src/gambit/query.py b/src/gambit/query.py index 784197e..daf0958 100644 --- a/src/gambit/query.py +++ b/src/gambit/query.py @@ -12,9 +12,9 @@ from gambit import __version__ as GAMBIT_VERSION from gambit.classify import classify, ClassifierResult, GenomeMatch from gambit.db import ReferenceDatabase, Taxon, ReferenceGenomeSet, reportable_taxon -from 
gambit.seq import SequenceFile from gambit.sigs.base import KmerSignature, SignaturesMeta, ReferenceSignatures from gambit.metric import jaccarddist_matrix +from gambit.util.io import FilePath from gambit.util.progress import progress_config, iter_progress from gambit.util.misc import zip_strict @@ -193,7 +193,7 @@ def get_result_item(db: ReferenceDatabase, params: QueryParams, dists: np.ndarra def query_parse(db: ReferenceDatabase, - files: Sequence[SequenceFile], + files: Sequence[FilePath], params: Optional[QueryParams] = None, *, labels: Optional[Sequence[str]] = None, @@ -226,7 +226,7 @@ def query_parse(db: ReferenceDatabase, parse_kw.setdefault('progress', pconf.update(desc='Parsing input')) if labels is None: - labels = [str(file.path) for file in files] + labels = [str(file) for file in files] else: if len(labels) != len(files): raise ValueError('Number of labels does not match number of files') @@ -237,6 +237,6 @@ def query_parse(db: ReferenceDatabase, # Assign file attribute of QueryResultItem's for item, file in zip_strict(results.items, files): - item.file = file.path + item.file = file return results diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 741d8b7..0c9b4f0 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -119,125 +119,3 @@ def parse_seqs(path: FilePath, except: fobj.close() raise - - -@attrs(frozen=True, slots=True) -class SequenceFile(PathLike): - """A reference to a DNA sequence file stored in the file system. - - Contains all the information needed to read and parse the file. Implements the - :class:`os.PathLike` interface, so it can be substituted for a ``str`` or :class:`pathlib.Path` - in most function arguments that take a file path to open. - - Parameters - ---------- - path : Union[os.PathLike, str] - Value of :attr:`path` attribute. May be string or path-like object. - format : str - Value of :attr:`format` attribute. - compression : Optional[str] - Value of :attr:`compression` attribute. 
- - Attributes - ---------- - path - Path to the file. - format - String describing the file format as interpreted by :func:`Bio.SeqIO.parse`, e.g. - ``'fasta'``. - compression - String describing compression method of the file, e.g. ``'gzip'``. None means no - compression. See :func:`gambit.util.io.open_compressed`. - """ - path: Path = attrib(converter=Path) - format: str = attrib() - compression: Optional[str] = attrib(default=None) - - def __fspath__(self): - return str(self.path) - - def __str__(self): - return str(self.path) - - def open(self, mode: str = 'r', **kwargs) -> IO: - """ - Open a stream to the file, with compression/decompression applied - transparently. - - Parameters - ---------- - - mode : str - Same as equivalent argument to the built-in :func:open`. Some modes may not be supported - by all compression types. - \\**kwargs - Additional text mode specific keyword arguments to pass to opener. Equivalent to the - following arguments of the built-in :func:`open`: ``encoding``, ``errors``, and - ``newlines``. May not be supported by all compression types. - - Returns - ------- - IO - Stream to file in given mode. - """ - compression = 'none' if self.compression is None else self.compression - return open_compressed(self.path, mode, compression, **kwargs) - - def parse(self, **kwargs) -> ClosingIterator[SeqIO.SeqRecord]: - """Open the file and lazily parse its contents. - - Returns iterator over sequence data in file. File is parsed lazily, - and so must be kept open. The returned iterator is of type - :class:`gambit.util.io.ClosingIterator` so it will close the file stream - automatically when it finishes. It may also be used as a context manager - that closes the stream on exit. You may also close the stream explicitly - using the iterator's ``close`` method. - - Parameters - ---------- - \\**kwargs - Keyword arguments to :meth:`open`. 
- - Returns - ------- - gambit.util.io.ClosingIterator - Iterator yielding :class:`Bio.SeqRecord.SeqRecord` instances for each sequence in the file. - """ - - fobj = self.open('rt', **kwargs) - - try: - records = SeqIO.parse(fobj, self.format) - return ClosingIterator(records, fobj) - - except: - fobj.close() - raise - - def absolute(self) -> 'SequenceFile': - """Make a copy of the instance with an absolute path.""" - if self.path.is_absolute(): - return self - else: - return SequenceFile(self.path.absolute(), self.format, self.compression) - - @classmethod - def from_paths(cls, - paths: Iterable['FilePath'], - format: str, - compression: Optional[str] = None, - ) -> list['SequenceFile']: - """ - Create many instances at once from a collection of paths and a single - format and compression type. - - Parameters - ---------- - paths - Collection of paths as strings or path-like objects. - format - Sequence file format of files. - compression - Compression method of files. - """ - return [cls(path, format, compression) for path in paths] diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index 212b269..07a1864 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -9,7 +9,8 @@ from .base import KmerSignature, SignatureList from gambit.kmers import KmerSpec, find_kmers, kmer_to_index, nkmers, index_dtype -from gambit.seq import SEQ_TYPES, DNASeq, SequenceFile +from gambit.seq import SEQ_TYPES, DNASeq, parse_seqs +from gambit.util.io import FilePath from gambit.util.progress import iter_progress, get_progress @@ -178,7 +179,7 @@ def calc_signature(kmerspec: KmerSpec, def calc_file_signature(kspec: KmerSpec, - seqfile: SequenceFile, + seqfile: FilePath, *, accumulator: Optional[KmerAccumulator] = None, ) -> KmerSignature: @@ -203,12 +204,12 @@ def calc_file_signature(kspec: KmerSpec, .calc_signature .calc_file_signatures """ - with seqfile.parse() as records: + with parse_seqs(seqfile) as records: return calc_signature(kspec, (record.seq for 
record in records), accumulator=accumulator) def calc_file_signatures(kspec: KmerSpec, - files: Sequence[SequenceFile], + files: Sequence[FilePath], progress=None, concurrency: Optional[str] = 'processes', max_workers: Optional[int] = None, diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py index cc8924b..fee607d 100644 --- a/tests/cli/test_common.py +++ b/tests/cli/test_common.py @@ -9,7 +9,6 @@ from gambit.cli import cli, common from gambit.db import ReferenceDatabase -from gambit.seq import SequenceFile from gambit.util.misc import zip_strict from gambit.util.io import write_lines, FilePath @@ -110,12 +109,10 @@ def check_ids(self, ids: Iterable[str], paths: Iterable['FilePath'], strip_dir: assert id_ == expected - def check_files(self, files, paths): - for file, path in zip_strict(files, paths): - assert isinstance(file, SequenceFile) - assert file.path == Path(path) - assert file.format == 'fasta' - assert file.compression == 'auto' + def check_files(self, files, expected): + for file, ex in zip_strict(files, expected): + assert isinstance(file, Path) + assert file == Path(ex) def test_explicit(self, strip_dir: bool, strip_ext: bool): """Test given explicit paths from CLI argument.""" diff --git a/tests/cli/test_dist.py b/tests/cli/test_dist.py index 0e624e6..1be78b7 100644 --- a/tests/cli/test_dist.py +++ b/tests/cli/test_dist.py @@ -10,32 +10,30 @@ from gambit.kmers import KmerSpec from gambit.metric import jaccarddist_matrix from gambit.sigs import SignatureList, dump_signatures -from gambit.util.io import write_lines +from gambit.util.io import write_lines, FilePath from gambit.cluster import load_dmat_csv import gambit.util.json as gjson from gambit.kmers import DEFAULT_KMERSPEC -from gambit.seq import SequenceFile -from gambit.cli.common import strip_seq_file_ext from ..testdb import TestDB from .common import invoke_cli -def get_query_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[SequenceFile]: +def 
get_query_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[Path]: return testdb.get_query_files(gz)[:n] -def get_ref_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[SequenceFile]: +def get_ref_files(testdb: TestDB, n: Optional[int] = None, gz: bool = False) -> list[Path]: return testdb.get_ref_files(gz)[:n] def make_args(testdb: TestDB, outfile: Path, *, - q_opt: Optional[list[SequenceFile]] = None, # Query files with -q option + q_opt: Optional[list[FilePath]] = None, # Query files with -q option q_list: Optional[Path] = None, # Query list file q_sigs: bool = False, # Use query signature file - r_opt: Optional[list[SequenceFile]] = None, # Ref files with -r option + r_opt: Optional[list[FilePath]] = None, # Ref files with -r option r_list: Optional[Path] = None, # Ref list file r_sigs: bool = False, # Use refs signature file r_db: bool = False, # Use db for refs diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py index 9421187..2e80033 100644 --- a/tests/cli/test_query.py +++ b/tests/cli/test_query.py @@ -8,7 +8,6 @@ import pytest -from gambit.seq import SequenceFile from gambit.query import QueryResults from gambit.util.misc import zip_strict from gambit.util.io import write_lines, FilePath @@ -20,7 +19,7 @@ def make_args(testdb: TestDB, *, - positional_files: Optional[Iterable[SequenceFile]] = None, + positional_files: Optional[Iterable[FilePath]] = None, list_file: Optional['FilePath'] = None, sig_file: bool = False, output: Optional['FilePath'] = None, @@ -116,7 +115,7 @@ def test_full_query(testdb: TestDB, ): """Run a full query using the command line interface.""" - query_files = [sigfile.path for sigfile in testdb.get_query_files(gzipped)[:nqueries]] + query_files = testdb.get_query_files(gzipped)[:nqueries] labels = [strip_seq_file_ext(file.name) for file in query_files] ref_results: QueryResults = make_ref_results(testdb, labels, strict, query_files, nqueries=nqueries) diff --git 
a/tests/cli/test_signatures.py b/tests/cli/test_signatures.py index 182e053..7c5242c 100644 --- a/tests/cli/test_signatures.py +++ b/tests/cli/test_signatures.py @@ -65,7 +65,7 @@ class TestCreateCommand: @pytest.fixture(params=[False]) def infiles(self, request, testdb: TestDB): """Input files. Parameter is whether or not they are gzipped.""" - return [f.path for f in testdb.get_query_files(request.param)] + return testdb.get_query_files(request.param) @pytest.fixture() def outfile(self, tmp_path: Path): diff --git a/tests/cli/test_tree.py b/tests/cli/test_tree.py index 891dacf..f4a1e5e 100644 --- a/tests/cli/test_tree.py +++ b/tests/cli/test_tree.py @@ -26,7 +26,7 @@ def expected_linkage(expected_dmat): @pytest.mark.parametrize('from_sigs', [False, True]) def test_tree_command(from_sigs, expected_linkage, testdb): """Test running the command and checking the output.""" - seqfiles = [str(f.path) for f in testdb.get_query_files()] + seqfiles = [str(f) for f in testdb.get_query_files()] args = ['tree'] if from_sigs: diff --git a/tests/data/testdb_210818/generate-results.py b/tests/data/testdb_210818/generate-results.py index 4125e90..ef2620a 100755 --- a/tests/data/testdb_210818/generate-results.py +++ b/tests/data/testdb_210818/generate-results.py @@ -10,10 +10,10 @@ import sys from pathlib import Path -from gambit.seq import SequenceFile from gambit.query import QueryParams, QueryResults, query_parse from gambit.results import ResultsArchiveWriter from gambit.util.misc import zip_strict +from gambit.util.io import FilePath THISDIR = Path(__file__).parent @@ -30,7 +30,7 @@ } -def check_results(queries: list[TestQueryGenome], query_files: list[SequenceFile], results: QueryResults): +def check_results(queries: list[TestQueryGenome], query_files: list[FilePath], results: QueryResults): """Check query results object against queries.csv table before exporting.""" strict = results.params.classify_strict @@ -40,7 +40,7 @@ def check_results(queries: list[TestQueryGenome], 
query_files: list[SequenceFile clsresult = item.classifier_result predicted = clsresult.predicted_taxon - assert item.file == query_file.path + assert item.file == Path(query_file) # Check if warnings expected (only if in strict mode) assert bool(clsresult.warnings) == (strict and query['warnings']) diff --git a/tests/sigs/test_calc.py b/tests/sigs/test_calc.py index eb902a3..1110d26 100644 --- a/tests/sigs/test_calc.py +++ b/tests/sigs/test_calc.py @@ -11,9 +11,9 @@ from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures, \ dense_to_sparse, sparse_to_dense from gambit.kmers import KmerSpec, index_to_kmer -from gambit.seq import SEQ_TYPES, revcomp, SequenceFile -import gambit.util.io as ioutil +from gambit.seq import SEQ_TYPES, revcomp from gambit.sigs import sigarray_eq, KmerSignature +from gambit.util.io import open_compressed from gambit.util.progress import check_progress from ..common import fill_bytearray, make_kmer_seq, make_kmer_seqs, convert_seq @@ -137,26 +137,26 @@ def record_sets(self): return items - @pytest.fixture(scope='class', params=[None, 'gzip']) + @pytest.fixture(scope='class', params=['none', 'gzip']) def compression(self, request): return request.param @pytest.fixture() - def files(self, record_sets: RecordSets, tmp_path: Path, compression: Optional[str]): + def files(self, record_sets: RecordSets, tmp_path: Path, compression: str): files = [] for i, (records, sig) in enumerate(record_sets): - file = SequenceFile(tmp_path / f'{i + 1}.fasta', 'fasta', compression) + file = tmp_path / f'{i + 1}.fasta' - with file.open('wt') as f: + with open_compressed(file, 'wt', compression) as f: SeqIO.write(records, f, 'fasta') files.append(file) return files - def test_calc_file_signature(self, record_sets: RecordSets, files: list[SequenceFile]): + def test_calc_file_signature(self, record_sets: RecordSets, files: list[Path]): """Test the calc_file_signature function.""" for file, (records, sig) in zip(files, record_sets): @@ 
-164,7 +164,7 @@ def test_calc_file_signature(self, record_sets: RecordSets, files: list[Sequence assert np.array_equal(result, sig) @pytest.mark.parametrize('concurrency', [None, 'threads', 'processes']) - def test_calc_file_signatures(self, record_sets: RecordSets, files: list[SequenceFile], concurrency: Optional[str]): + def test_calc_file_signatures(self, record_sets: RecordSets, files: list[Path], concurrency: Optional[str]): """Test the calc_file_signatures function.""" sigs = [sig for records, sig in record_sets] diff --git a/tests/test_query.py b/tests/test_query.py index 4de1477..7562b83 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -53,5 +53,5 @@ def test_query_parse(self, testdb: TestDB, strict: bool): self.check_results(results, ref_results) for file, item in zip_strict(query_files, results.items): - assert item.file == file.path - assert item.label == str(file.path) + assert item.file == file + assert item.label == str(file) diff --git a/tests/test_seq.py b/tests/test_seq.py index 7ed4543..ee20b4b 100644 --- a/tests/test_seq.py +++ b/tests/test_seq.py @@ -8,7 +8,7 @@ import numpy as np from Bio import Seq, SeqIO -from gambit.seq import SequenceFile, revcomp, parse_seqs +from gambit.seq import revcomp, parse_seqs from gambit.kmers import nkmers, index_to_kmer from gambit.util.misc import zip_strict from gambit.util.io import open_compressed @@ -90,167 +90,6 @@ def test_validate_dna_seq_bytes(): # TODO -class TestSequenceFile: - """Test the SequenceFile class.""" - - @pytest.fixture(params=['fasta'], scope='class') - def format(self, request): - """SequenceFile.format attribute.""" - return request.param - - @pytest.fixture(params=[None, 'gzip'], scope='class') - def compression(self, request): - """SequenceFile.compression attribute.""" - return request.param - - @pytest.fixture() - def seqfile(self, tmpdir, format, compression): - """A SequenceFile instance pointing to a file in a test temporary directory. - - File does not yet exist. 
- """ - path = tmpdir.join('test.' + format).strpath - return SequenceFile(path, format, compression) - - @pytest.fixture(scope='class') - def seqrecords(self): - """A collection of random Bio.SeqIO.SeqRecord's.""" - np.random.seed(0) - records = [] - - for i in range(20): - seq = Seq.Seq(random_seq(1000).decode('ascii')) - id_ = 'seq{}'.format(i + 1) - descr = 'Test sequence {}'.format(i + 1) - records.append(SeqIO.SeqRecord(seq, id=id_, description=descr)) - - return tuple(records) - - @pytest.fixture - def file_contents(self, format, seqrecords): - """String contents of a file containing the sequence records.""" - buf = StringIO() - SeqIO.write(seqrecords, buf, format) - return buf.getvalue() - - def test_constructor(self): - """Test constructor.""" - - seqfile = SequenceFile('foo.fasta', 'fasta') - assert seqfile == SequenceFile('foo.fasta', 'fasta', None) - assert seqfile.path == Path('foo.fasta') - - def test_eq(self): - """Test equality checking of instances.""" - seqfiles = [ - SequenceFile(p, format, comp) - for p in ['foo', 'bar'] - for format in ['fasta', 'genbank'] - for comp in [None, 'gzip'] - ] - - for i, seqfile1 in enumerate(seqfiles): - for j, seqfile2 in enumerate(seqfiles): - if i == j: - # Try with different instance - assert seqfile1 == SequenceFile(seqfile1.path, seqfile1.format, seqfile1.compression) - else: - assert seqfile1 != seqfile2 - - def test_special_methods(self, seqfile): - assert str(seqfile) == str(seqfile.path) - assert os.fspath(seqfile) == str(seqfile) - - # Check os.PathLike interface - text = 'foo' - with open(seqfile, 'w') as f: - f.write(text) - with open(seqfile, 'r') as f: - read = f.read() - assert read == text - - @pytest.mark.parametrize('binary', [False, True]) - def test_open(self, seqfile, file_contents, binary): - """Test sequence file is readable and writable.""" - - to_write = file_contents.encode() if binary else file_contents - - # Write data to file - with seqfile.open('wb' if binary else 'wt') as fobj: - 
fobj.write(to_write) - - # Read it back and make sure it's the same - with seqfile.open('rb' if binary else 'rt') as fobj: - read = fobj.read() - - assert read == to_write - - def test_parse(self, seqfile, seqrecords, file_contents): - """Test the parse() method, ensure we get the right records back.""" - - # Write pre-formatted contents to file - with seqfile.open('wt') as fobj: - fobj.write(file_contents) - - # Parse the sequences from it - parsed = list(seqfile.parse()) - - # Check they match - assert len(parsed) == len(seqrecords) - - for parsed_req, orig_req in zip_strict(parsed, seqrecords): - assert isinstance(parsed_req, SeqIO.SeqRecord) - assert parsed_req.seq == orig_req.seq - assert parsed_req.id == orig_req.id - - # This is something stupid BioPython does - when writing a SeqRecord - # as FASTA it writes the .id attributed followed by a space and then - # the .description attribute on the description line. When reading, - # the entire line is used as the description attribute and so - # includes the ID - assert parsed_req.description == orig_req.id + ' ' + orig_req.description - - def test_path_arg(self): - """Test the "path" argument to the constructor.""" - - path = Path('foo/bar.fasta') - - seqfile1 = SequenceFile(path, 'fasta') - assert isinstance(seqfile1, SequenceFile) and seqfile1.path == path - - seqfile2 = SequenceFile(str(path), 'fasta') - assert isinstance(seqfile2, SequenceFile) and seqfile2.path == path - - def test_absolute(self): - """Test the absolute() method.""" - - relseqfile = SequenceFile('foo/bar.fasta', 'fasta') - assert not relseqfile.path.is_absolute() - - absseqfile = relseqfile.absolute() - assert absseqfile.path.is_absolute() - assert absseqfile.path == relseqfile.path.absolute() - - absseqfile2 = absseqfile.absolute() - assert absseqfile2 == absseqfile - - def test_from_paths(self, format, compression): - """Test the from_paths() class method.""" - - # List of unique path strings - paths = ['foo/bar{}.{}'.format(i, format) 
for i in range(20)] - - seqfiles = SequenceFile.from_paths(paths, format, compression) - - assert len(paths) == len(seqfiles) - - for path, seqfile in zip_strict(paths, seqfiles): - assert isinstance(seqfile, SequenceFile) - assert str(seqfile.path) == path - assert seqfile.format == format - assert seqfile.compression == compression - - @pytest.fixture(scope='module') def seqrecords(): """Random SeqRecord instances.""" diff --git a/tests/testdb.py b/tests/testdb.py index b0b92fe..e26d5cd 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -11,7 +11,6 @@ from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from gambit.seq import SequenceFile from gambit.kmers import KmerSpec from gambit.sigs import load_signatures, AnnotatedSignatures from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset @@ -195,7 +194,7 @@ def _ensure_gz(cls, file: Path, file_gz: Path): with gzip.open(file_gz, 'wt') as f: f.write(content) - def _get_genome_files(self, base: Path, names: list[str], gzipped: bool, relative: bool) -> list[SequenceFile]: + def _get_genome_files(self, base: Path, names: list[str], gzipped: bool, relative: bool) -> list[Path]: base2 = base.relative_to(self.paths.root) if relative else base files = [] @@ -210,11 +209,11 @@ def _get_genome_files(self, base: Path, names: list[str], gzipped: bool, relativ else: path = base2 / fname - files.append(SequenceFile(path, 'fasta', 'gzip' if gzipped else None)) + files.append(path) return files - def get_query_files(self, gzipped: bool = False, relative: bool = False) -> list[SequenceFile]: + def get_query_files(self, gzipped: bool = False, relative: bool = False) -> list[Path]: return self._get_genome_files( self.paths.query_genomes_dir, [genome['name'] for genome in self.query_genomes], @@ -222,7 +221,7 @@ def get_query_files(self, gzipped: bool = False, relative: bool = False) -> list relative=relative, ) - def get_ref_files(self, gzipped: bool = False, relative: bool = False) -> 
list[SequenceFile]: + def get_ref_files(self, gzipped: bool = False, relative: bool = False) -> list[Path]: return self._get_genome_files( self.paths.ref_genomes_dir, [genome['name'] for genome in self.ref_genomes], From 44872be6d216019030e87b97306f635221a8bdee Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 00:57:51 -0800 Subject: [PATCH 77/86] query command --pretty flag --- src/gambit/cli/query.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/gambit/cli/query.py b/src/gambit/cli/query.py index e3f04f3..5d4f495 100644 --- a/src/gambit/cli/query.py +++ b/src/gambit/cli/query.py @@ -12,15 +12,15 @@ from gambit._cython.threads import omp_set_num_threads -def get_exporter(outfmt: str): +def get_exporter(outfmt: str, pretty: bool): if outfmt == 'csv': return CSVResultsExporter() if outfmt == 'json': - return JSONResultsExporter() + return JSONResultsExporter(pretty=pretty) if outfmt == 'archive': - return ResultsArchiveWriter() + return ResultsArchiveWriter(pretty=pretty) raise ValueError(f'Invalid output format: {outfmt!r}') @@ -51,6 +51,11 @@ def get_exporter(outfmt: str): type=common.filepath(exists=True), help='File containing query signatures, to use in place of GENOMES.', ) +@click.option( + '--pretty/--no-pretty', + default=False, + hidden=True, +) @common.progress_param() @common.cores_param() @click.pass_context @@ -62,6 +67,7 @@ def query_cmd(ctx: click.Context, output: TextIO, outfmt: str, strict: bool, + pretty: bool, progress: bool, cores: Optional[int], ): @@ -71,7 +77,7 @@ def query_cmd(ctx: click.Context, db = ctx.obj.get_database() params = QueryParams(classify_strict=strict) - exporter = get_exporter(outfmt) + exporter = get_exporter(outfmt, pretty) pconf = progress_config('click' if progress else None) if cores is not None: From c10630519837ed75d163a02465d9f4ef3dbc7dda Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 01:04:10 -0800 Subject: [PATCH 78/86] Fix docstring --- 
src/gambit/seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gambit/seq.py b/src/gambit/seq.py index 0c9b4f0..3786f7d 100644 --- a/src/gambit/seq.py +++ b/src/gambit/seq.py @@ -98,7 +98,7 @@ def parse_seqs(path: FilePath, format String describing the file format as interpreted by :func:`Bio.SeqIO.parse`. compression - String describing compression method of the file, e.g. ``'gzip'``. None means no + String describing compression method of the file, e.g. ``'gzip'``. ``none`` means no compression. Default is to determine compression automatically (can only detect gzip or none). See :func:`gambit.util.io.open_compressed`. kwargs From 8c44e129e510a5241bb4300e49f98684ad562419 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 01:09:41 -0800 Subject: [PATCH 79/86] Test on Python 3.13 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7f7468..173041c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9, '3.10', 3.11, 3.12] + python-version: [3.9, '3.10', 3.11, 3.12, 3.13] steps: - uses: actions/checkout@v4 From aee8039c7a189b0b18e43e976e715642437a9d39 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 02:01:39 -0800 Subject: [PATCH 80/86] Use common func for creating SQLAlchemy Sessions --- src/gambit/cli/common.py | 17 +++-------------- src/gambit/db/__init__.py | 2 +- src/gambit/db/sqla.py | 33 +++++++++++++++++++++++---------- tests/cli/test_common.py | 2 -- tests/db/test_refdb.py | 6 +++--- tests/db/test_sqla.py | 2 +- tests/testdb.py | 12 +++--------- 7 files changed, 34 insertions(+), 40 deletions(-) diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py index 9316d59..37612f2 100644 --- a/src/gambit/cli/common.py +++ b/src/gambit/cli/common.py @@ -4,12 +4,10 @@ from collections import Counter import 
click -from sqlalchemy import create_engine -from sqlalchemy.engine import Engine from sqlalchemy.orm import sessionmaker from gambit.kmers import KmerSpec, DEFAULT_KMERSPEC -from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset, DatabaseLoadError +from gambit.db import ReferenceDatabase, only_genomeset, DatabaseLoadError, file_sessionmaker from gambit.sigs.base import ReferenceSignatures, load_signatures from gambit.util.io import FilePath, read_lines from gambit.util.misc import join_list_human @@ -37,8 +35,6 @@ class CLIContext: Whether reference signatures are available. has_database Whether reference genome metadata and reference signatures are both available. - engine - SQLAlchemy engine connecting to genomes database. Session SQLAlchemy session maker for genomes database. signatures @@ -49,7 +45,6 @@ class CLIContext: has_genomes: bool has_signatures: bool has_database: bool - engine: Optional[Engine] Session: Optional[sessionmaker] signatures: Optional[ReferenceSignatures] @@ -121,16 +116,10 @@ def require_signatures(self): self.require_database() def _init_genomes(self): - if self._engine is not None or not self.has_genomes: + if self._Session is not None or not self.has_genomes: return - self._engine = create_engine(f'sqlite:///{self._genomes_path}') - self._Session = sessionmaker(self.engine, class_=ReadOnlySession) - - @property - def engine(self): - self._init_genomes() - return self._engine + self._Session = file_sessionmaker(self._genomes_path) @property def Session(self): diff --git a/src/gambit/db/__init__.py b/src/gambit/db/__init__.py index a32c494..7e9f270 100644 --- a/src/gambit/db/__init__.py +++ b/src/gambit/db/__init__.py @@ -1,3 +1,3 @@ from .models import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, reportable_taxon, only_genomeset from .refdb import ReferenceDatabase, load_genomeset, DatabaseLoadError -from .sqla import file_sessionmaker, ReadOnlySession +from .sqla import default_sessionmaker, file_sessionmaker, 
ReadOnlySession diff --git a/src/gambit/db/sqla.py b/src/gambit/db/sqla.py index 18a9050..fabdef9 100644 --- a/src/gambit/db/sqla.py +++ b/src/gambit/db/sqla.py @@ -1,5 +1,6 @@ """Custom types and other utilities for SQLAlchemy.""" import os +from typing import Optional from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, Session @@ -37,22 +38,34 @@ def process_result_value(self, value, dialect): return None if value is None else gjson.loads(value) -def file_sessionmaker(path: 'FilePath', readonly: bool = True, cls: type = None, **kw) -> sessionmaker: +def default_sessionmaker(bind, *, readonly: bool = True, class_: Optional[type] = None, **kw) -> sessionmaker: + """Create an SQLAlchemy ``sessionmaker`` using some common default settings. + + Parameters + ---------- + bind + First argument to :class:`sqlalchemy.orm.sessionmaker`. + readonly + Sets the default value for the ``class_`` keyword argument (:class:`.ReadOnlySession` if True, + otherwise uses the standard SQLAlchemy session type). + \\**kw + Additional keyword arguments to :class:`sqlalchemy.orm.sessionmaker`. + """ + if class_ is None: + class_ = ReadOnlySession if readonly else Session + return sessionmaker(bind, class_=class_, **kw) + + +def file_sessionmaker(path: 'FilePath', **kw) -> sessionmaker: """Get an SQLAlchemy ``sessionmaker`` for an sqlite database file. Parameters ---------- path Path to database file. - readonly - Sets the default value for ``class_``. - cls - SQLAlchemy ``Session`` subclass to use. Defaults to :class:`gambit.db.sqla.ReadOnlySession` - if ``readonly=True``, otherwise uses the standard SQLAlchemy session type. \\**kw - Additional keyword arguments to :class:`sqlalchemy.orm.sessionmaker`. + Additional keyword arguments to :func:`.default_sessionmaker` / + :class:`sqlalchemy.orm.sessionmaker`. 
""" - if cls is None: - cls = ReadOnlySession if readonly else Session engine = create_engine(f'sqlite:///{os.fspath(path)}') - return sessionmaker(engine, class_=cls, **kw) + return default_sessionmaker(engine, **kw) diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py index fee607d..0145108 100644 --- a/tests/cli/test_common.py +++ b/tests/cli/test_common.py @@ -34,7 +34,6 @@ def test_no_db(self): assert not ctx.has_database assert not ctx.has_genomes assert not ctx.has_signatures - assert ctx.engine is None assert ctx.Session is None assert ctx.signatures is None @@ -65,7 +64,6 @@ def test_with_db(self, method: str, testdb: TestDB): assert ctx.has_database assert ctx.has_genomes assert ctx.has_signatures - assert ctx.engine is not None assert ctx.Session is not None assert ctx.signatures is not None diff --git a/tests/db/test_refdb.py b/tests/db/test_refdb.py index 6dc6c16..7229bb2 100644 --- a/tests/db/test_refdb.py +++ b/tests/db/test_refdb.py @@ -4,10 +4,10 @@ from pathlib import Path import pytest -from sqlalchemy.orm import sessionmaker from gambit.db import refdb -from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, ReferenceDatabase, DatabaseLoadError +from gambit.db import Genome, ReferenceGenomeSet, AnnotatedGenome, Taxon, ReferenceDatabase, \ + DatabaseLoadError, default_sessionmaker from ..testdb import TestDB @@ -27,7 +27,7 @@ class TestGenomeIDMapping: def session(self, make_empty_db): """In-memory database containing genomes which have values for all ID attributes.""" engine = make_empty_db() - Session = sessionmaker(engine) + Session = default_sessionmaker(engine, readonly=False) session = Session() gset = ReferenceGenomeSet( diff --git a/tests/db/test_sqla.py b/tests/db/test_sqla.py index 318d7d5..5603f35 100644 --- a/tests/db/test_sqla.py +++ b/tests/db/test_sqla.py @@ -16,5 +16,5 @@ def test_file_sessionmaker(testdb: TestDB): assert isinstance(maker(), Session) for cls in [Session, ReadOnlySession]: - maker = 
file_sessionmaker(db_file, cls=cls) + maker = file_sessionmaker(db_file, class_=cls) assert isinstance(maker(), cls) diff --git a/tests/testdb.py b/tests/testdb.py index e26d5cd..d0d5787 100644 --- a/tests/testdb.py +++ b/tests/testdb.py @@ -9,11 +9,10 @@ import gzip from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker from gambit.kmers import KmerSpec from gambit.sigs import load_signatures, AnnotatedSignatures -from gambit.db import ReferenceDatabase, ReadOnlySession, only_genomeset +from gambit.db import ReferenceDatabase, only_genomeset, file_sessionmaker, default_sessionmaker from gambit.results import ResultsArchiveReader from gambit.query import QueryResults from gambit.util.io import FilePath @@ -118,15 +117,10 @@ def __init__(self, root: FilePath): results=root / 'results/', ) - @lazy - def engine(self): - """SQLAlchemy engine connected to genome database.""" - return create_engine(f'sqlite:///{self.paths.ref_genomes}') - @lazy def Session(self): """Sessionmaker for the reference genome database.""" - return sessionmaker(self.engine, class_=ReadOnlySession) + return file_sessionmaker(self.paths.ref_genomes) def copy_session(self): """Create an in-memory copy of the test database.""" @@ -134,7 +128,7 @@ def copy_session(self): memory = sqlite3.connect(':memory:') src.backup(memory) engine = create_engine('sqlite://', creator=lambda: memory) - return sessionmaker(engine)() + return default_sessionmaker(engine)() @lazy def ref_signatures(self) -> AnnotatedSignatures: From c0a0d9f7050d6938580476307cf4910fe79dbc53 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 02:35:57 -0800 Subject: [PATCH 81/86] SQLAlchemy 2.0 compatibility --- setup.cfg | 2 +- src/gambit/db/models.py | 17 +++-------------- src/gambit/db/sqla.py | 3 ++- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/setup.cfg b/setup.cfg index cccad4d..c626c66 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ python_requires = >= 3.9 
install_requires = numpy~=1.13 - sqlalchemy~=1.1 + sqlalchemy>=1.4 # Seq stores data as bytes biopython~=1.79 attrs>=20 diff --git a/src/gambit/db/models.py b/src/gambit/db/models.py index 9e3e720..043cf83 100644 --- a/src/gambit/db/models.py +++ b/src/gambit/db/models.py @@ -5,9 +5,8 @@ import sqlalchemy as sa from sqlalchemy import Column, Integer, String, Boolean, Float from sqlalchemy import ForeignKey, UniqueConstraint -from sqlalchemy.orm import Session, relationship, backref, deferred +from sqlalchemy.orm import Session, relationship, backref, deferred, declarative_base from sqlalchemy.ext.hybrid import hybrid_property -from sqlalchemy.ext.declarative import declarative_base, declared_attr from sqlalchemy.exc import MultipleResultsFound, NoResultFound from .sqla import JsonString @@ -68,16 +67,11 @@ class Genome(Base): """ __tablename__ = 'genomes' + __table_args__= (UniqueConstraint('ncbi_db', 'ncbi_id'),) #: Attributes which serve as unique IDs. ID_ATTRS = ('key', 'genbank_acc', 'refseq_acc', 'ncbi_id') - @declared_attr - def __table_args__(cls): - return ( - UniqueConstraint('ncbi_db', 'ncbi_id'), - ) - id = Column(Integer(), primary_key=True) key = Column(String(), unique=True, nullable=False) description = Column(String(), nullable=False) @@ -136,12 +130,7 @@ class ReferenceGenomeSet(Base): for this genome set. 
""" __tablename__ = 'genome_sets' - - @declared_attr - def __table_args__(cls): - return ( - UniqueConstraint('key', 'version'), - ) + __table_args__ = (UniqueConstraint('key', 'version'),) id = Column(Integer(), primary_key=True) key = Column(String(), index=True, nullable=False) diff --git a/src/gambit/db/sqla.py b/src/gambit/db/sqla.py index fabdef9..0ad2957 100644 --- a/src/gambit/db/sqla.py +++ b/src/gambit/db/sqla.py @@ -53,7 +53,8 @@ def default_sessionmaker(bind, *, readonly: bool = True, class_: Optional[type] """ if class_ is None: class_ = ReadOnlySession if readonly else Session - return sessionmaker(bind, class_=class_, **kw) + # future=True - forwards compatibility with SQLAlchemy 2.0 + return sessionmaker(bind, class_=class_, future=True, **kw) def file_sessionmaker(path: 'FilePath', **kw) -> sessionmaker: From 229d0270330e4dd24320d0a5d1b094876c56b5f1 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 03:27:53 -0800 Subject: [PATCH 82/86] CI test SQLAlchemy 1.4 and 2.0 --- .github/workflows/ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 173041c..4661a52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,6 +12,11 @@ jobs: fail-fast: false matrix: python-version: [3.9, '3.10', 3.11, 3.12, 3.13] + sqla-version: ["~=2.0"] + include: + # Also test with SQLALchemy >=1.4, <2 + - python-version: 3.12 + - sqla-version: "~=1.4" steps: - uses: actions/checkout@v4 @@ -27,7 +32,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install setuptools pytest + python -m pip install setuptools pytest sqlalchemy{{ matrix.sqla-version }} - name: Build package run: | From bd8f076624ba968c6d01257d91e6d2dcabdb91c9 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 03:30:13 -0800 Subject: [PATCH 83/86] Bump version to 1.1.0 --- docs/source/conf.py | 4 ++-- src/gambit/__init__.py | 2 +- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d3c5d3a..aefc29b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'GAMBIT' -copyright = '2021 - 2023, Jared Lumpe' +copyright = '2021 - 2024, Jared Lumpe' author = 'Jared Lumpe' # The full version, including alpha/beta/rc tags -release = '1.0.1' +release = '1.1.0' # -- General configuration --------------------------------------------------- diff --git a/src/gambit/__init__.py b/src/gambit/__init__.py index 7a9aa80..78d441a 100644 --- a/src/gambit/__init__.py +++ b/src/gambit/__init__.py @@ -5,4 +5,4 @@ Author email: jared@jaredlumpe.com """ -__version__ = '1.0.1' +__version__ = '1.1.0' From a1dc58686dd46949b5d7993f25c5fb867d85d6a8 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 03:33:54 -0800 Subject: [PATCH 84/86] Fix CI setup --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4661a52..fc8a255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: include: # Also test with SQLALchemy >=1.4, <2 - python-version: 3.12 - - sqla-version: "~=1.4" + sqla-version: "~=1.4" steps: - uses: actions/checkout@v4 @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install setuptools pytest sqlalchemy{{ matrix.sqla-version }} + python -m pip install setuptools pytest sqlalchemy${{ matrix.sqla-version }} - name: Build package run: | From c1a5e6381dd26292fa1ebcb01baa699ddc923238 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 16:26:44 -0800 Subject: [PATCH 85/86] setup.cfg formatting --- setup.cfg | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/setup.cfg b/setup.cfg index 
c626c66..ad43822 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,24 +18,25 @@ include_package_data = true python_requires = >= 3.9 install_requires = - numpy~=1.13 - sqlalchemy>=1.4 + # Test failures on 2.x + numpy ~= 1.13 + sqlalchemy >= 1.4 # Seq stores data as bytes - biopython~=1.79 - attrs>=20 + biopython ~= 1.79 + attrs >= 20 # Minimum for 3.12, also introduces potentially breaking changes - cattrs>=23.2 - click>=7.0 - h5py~=3.0 - scipy~=1.7 - typing-extensions>=4.0 + cattrs >= 23.2 + click >= 7.0 + h5py ~= 3.0 + scipy ~= 1.7 + typing-extensions >= 4.0 tests_require = pytest [options.packages.find] -where=src +where = src [options.entry_points] From f5ad92d57a0507541b0c52301e4ee730537cc504 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sun, 1 Dec 2024 19:15:56 -0800 Subject: [PATCH 86/86] Update changelog --- CHANGELOG.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db7c1a5..b8c19ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,15 @@ # Changelog -## DEV - -* CLI - * Add additional details to `signatures info` command. -* Other - * Require Python 3.9 +## 1.1.0 + +* Command line interface: + * Better error reporting when database file(s) not found + * Add more details to output of `gambit signatures info` command. +* Major overhaul of internal Python API and tests (see full release notes on GitHub) + * Many fixes to API documentation +* Increase minimum Python version to 3.9 +* Make compatible with SQLAlchemy 2.0 ## 1.0.1