diff --git a/collectoss/application/cli/_cli_util.py b/collectoss/application/cli/_cli_util.py index 0e1a7e1aa..4198fc248 100644 --- a/collectoss/application/cli/_cli_util.py +++ b/collectoss/application/cli/_cli_util.py @@ -4,6 +4,7 @@ import psutil import signal from urllib.parse import urlparse +import sqlalchemy as s from collectoss.tasks.init.redis_connection import get_redis_connection @@ -69,4 +70,21 @@ def raise_open_file_limit(num_files): resource.setrlimit(resource.RLIMIT_NOFILE, (num_files, current_hard)) - return \ No newline at end of file + return + +def get_db_version(engine): + """ a helper function to return the database version + used in print_db_version and upgrade_db_version in the db cli module + """ + + db_version_sql = s.sql.text( + """ + SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' + """ + ) + + with engine.connect() as connection: + result = int(connection.execute(db_version_sql).fetchone()[2]) + + engine.dispose() + return result \ No newline at end of file diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index fd5db52cf..237644ba3 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -29,6 +29,8 @@ process_repo_group_csv, ) +from ._cli_util import get_db_version + logger = logging.getLogger(__name__) @@ -258,21 +260,6 @@ def add_github_org(ctx, organization_name): controller.add_cli_org(organization_name) -# get_db_version is a helper function to print_db_version and upgrade_db_version -def get_db_version(engine): - db_version_sql = s.sql.text( - """ - SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' - """ - ) - - with engine.connect() as connection: - result = int(connection.execute(db_version_sql).fetchone()[2]) - - engine.dispose() - return result - - @cli.command("print-db-version") @test_connection @test_db_connection diff --git a/collectoss/application/cli/selftest.py b/collectoss/application/cli/selftest.py new file mode 100644 index 000000000..587d939f6 --- /dev/null +++ b/collectoss/application/cli/selftest.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: MIT +import logging +from typing import Union +import click +from sqlalchemy import Constraint, ForeignKeyConstraint, Index, select, func, Table + +from collectoss.application.cli import ( + test_db_connection, + with_database, + DatabaseContext, +) +from collectoss.application.db.models.augur_data import Commit + +from alembic.config import Config +from alembic import command +from alembic.util.exc import AutogenerateDiffsDetected + + +logger = logging.getLogger(__name__) + + +@click.group("selftest", short_help="CollectOSS self-testing utilities") +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() + + +def check_database_parity_with_versioned_model() -> list: + """Checks for alignent between the versioned SQLAlchemy models and the actual database state + + Returns: + list: a list of diff entries in alembic's format + """ + alembic_cfg = Config("alembic.ini") + try: + # 2. Call the 'check' command + # This will raise an AutogenerateDiffsDetected error if + # the database is out of sync with the models. + command.check(alembic_cfg) + return [] + + + except AutogenerateDiffsDetected as e: + return list(e.diffs) + + +def humanize_alembic_diff(diff: list[Union[tuple, list]], debug=False) -> list[str]: + """transform an alembic diff for printing in a more intuitive way + + Args: + diff (list): A diff list created by Alembic + + Returns: + list[str]: A list of findings as strings + """ + output: list[str] = [] + + if len(diff) == 0: + return output + + # Alembic diffs use their own bespoke format, documented in https://alembic.sqlalchemy.org/en/latest/api/autogenerate.html#alembic.autogenerate.compare_metadata + for item in diff: + # coerce every item into a mini-diff list to make processing easier + changes_to_process = item if isinstance(item, list) else [item] + + for change in changes_to_process: + finding = "" + + if debug: + # its not the most ideal to do this here rather than bypassing the whole function, + # but debug mode still benefits from the type coercion above + finding = str(change) + output.append(finding) + + continue + + action, category = change[0].split("_") + + action_text = "" + + if action == "add": + action_text = "found in models but not database" + elif action == "modify": + action_text = "modified" + elif action == "remove": + action_text = "found in database but not models" + + + if category == "table": + table_def:Table = change[1] + finding += f"Table {table_def.name} in schema {table_def.schema}" + + elif category == "column": + _, _, table_name, col_def = change + finding += f"Column {col_def.name} in table {table_name}" + + elif category == "fk": + fk_def:ForeignKeyConstraint = change[1] + fk_name = f" called '{fk_def.name}'" if fk_def.name else "" + finding += f"Foreign Key{fk_name} on `{fk_def.table.columns[0]}` linking to `{fk_def.table}`" + + attrs = [] + if fk_def.onupdate: + attrs.append(f"ON UPDATE {fk_def.onupdate}") + if fk_def.ondelete: + attrs.append(f"ON DELETE {fk_def.ondelete}") + if fk_def.deferrable: + attrs.append("DEFERRABLE") + if fk_def.initially: + attrs.append(f"INITIALLY {fk_def.initially}") + + if attrs: + finding += f"\n\tRules: {' | '.join(attrs)}" + elif category == "index": + index_def:Index = change[1] + cols = [c.name for c in index_def.columns] + finding += f"Index {index_def.name} on '{index_def.table}' covering {','.join(cols)}" + + elif category == "constraint": + constraint_def:Constraint = change[1] + constraint_name = f" called '{constraint_def.name}'" if constraint_def.name else "" + constraint_type = constraint_def.__class__.__name__ or "Constraint" + constraint_covers = [c.name for c in constraint_def.columns] + finding += f"{constraint_type}{constraint_name} covering {','.join(constraint_covers)}" + + elif category == "nullable": + _, schema, table_name, col_name, _, old, new = change + print(f"Column {col_name} in table {table_name} nullable changed: {old} -> {new}") + + else: + finding += "unknown type found in alembic diff:\n" + finding += str(change) + + finding += f"\n\t{action_text}" + output.append(finding) + + return output + + +@cli.command("report") +@click.option("--debug", is_flag=True, default=False, help="Show more detailed information for debuging purposes") +@test_db_connection +@with_database +@click.pass_context +def run_selftest_report(ctx, debug): + """ + Run queries to evaluate various aspects of the collectoss system's functioning and produce a report + """ + click.echo('Generating CollectOSS selftest report....') + + model_diffs = check_database_parity_with_versioned_model() + + cmt_author_name_issue_233_query = ( + select(func.count()) + .select_from(Commit) + .where(Commit.cmt_author_name == '') + ) + cmt_author_name_issue_233_count = None + + with ctx.obj.engine.begin() as connection: + cmt_author_name_issue_233_count = connection.execute(cmt_author_name_issue_233_query).scalar_one() + click.echo(f'Issue 233 count: {cmt_author_name_issue_233_count} commit changes in the `commits` table contain authors with an empty string as their name') + + model_diff_findings = humanize_alembic_diff(model_diffs, debug=debug) + + if len(model_diff_findings) == 0: + print("✅ No drift detected. Database is in sync with models.") + else: + print(f"❌ Database drift detected:") + for f in model_diff_findings: + print(f) \ No newline at end of file