From 1a4373ec9b36286c661aa8d15b0ee66d06331ef6 Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Thu, 17 Apr 2025 16:13:29 +0200 Subject: [PATCH 1/7] Added commands to setup Oxigraph Added all the commands necessary to setup SPARQL endpoint for oxigraph natively and containerized --- pyproject.toml | 2 + src/qoxigraph/__init__.py | 0 src/qoxigraph/commands/__init__.py | 0 src/qoxigraph/commands/example_queries.py | 12 ++ src/qoxigraph/commands/extract_queries.py | 1 + src/qoxigraph/commands/get_data.py | 1 + src/qoxigraph/commands/index.py | 109 ++++++++++++++ src/qoxigraph/commands/log.py | 50 +++++++ src/qoxigraph/commands/query.py | 54 +++++++ src/qoxigraph/commands/setup_config.py | 109 ++++++++++++++ src/qoxigraph/commands/start.py | 175 ++++++++++++++++++++++ src/qoxigraph/commands/status.py | 19 +++ src/qoxigraph/commands/stop.py | 71 +++++++++ 13 files changed, 603 insertions(+) create mode 100644 src/qoxigraph/__init__.py create mode 100644 src/qoxigraph/commands/__init__.py create mode 100644 src/qoxigraph/commands/example_queries.py create mode 120000 src/qoxigraph/commands/extract_queries.py create mode 120000 src/qoxigraph/commands/get_data.py create mode 100644 src/qoxigraph/commands/index.py create mode 100644 src/qoxigraph/commands/log.py create mode 100644 src/qoxigraph/commands/query.py create mode 100644 src/qoxigraph/commands/setup_config.py create mode 100644 src/qoxigraph/commands/start.py create mode 100644 src/qoxigraph/commands/status.py create mode 100644 src/qoxigraph/commands/stop.py diff --git a/pyproject.toml b/pyproject.toml index b053fe62..12c82f47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,8 @@ Github = "https://github.com/ad-freiburg/qlever" [project.scripts] "qlever" = "qlever.qlever_main:main" +"qoxigraph" = "qlever.qlever_main:main" +"qlever-old" = "qlever.qlever_old:main" [tool.setuptools] license-files = ["LICENSE"] diff --git a/src/qoxigraph/__init__.py b/src/qoxigraph/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/src/qoxigraph/commands/__init__.py b/src/qoxigraph/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/qoxigraph/commands/example_queries.py b/src/qoxigraph/commands/example_queries.py new file mode 100644 index 00000000..d62982a8 --- /dev/null +++ b/src/qoxigraph/commands/example_queries.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from qlever.commands.example_queries import ( + ExampleQueriesCommand as QleverExampleQueriesCommand, +) + + +class ExampleQueriesCommand(QleverExampleQueriesCommand): + def execute(self, args) -> bool: + if not args.sparql_endpoint: + args.sparql_endpoint = f"localhost:{args.port}/query" + return super().execute(args) diff --git a/src/qoxigraph/commands/extract_queries.py b/src/qoxigraph/commands/extract_queries.py new file mode 120000 index 00000000..5667cc52 --- /dev/null +++ b/src/qoxigraph/commands/extract_queries.py @@ -0,0 +1 @@ +../../qlever/commands/extract_queries.py \ No newline at end of file diff --git a/src/qoxigraph/commands/get_data.py b/src/qoxigraph/commands/get_data.py new file mode 120000 index 00000000..4900dbb8 --- /dev/null +++ b/src/qoxigraph/commands/get_data.py @@ -0,0 +1 @@ +../../qlever/commands/get_data.py \ No newline at end of file diff --git a/src/qoxigraph/commands/index.py b/src/qoxigraph/commands/index.py new file mode 100644 index 00000000..128b9a82 --- /dev/null +++ b/src/qoxigraph/commands/index.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import glob +import shlex +from pathlib import Path + +from qlever.command import QleverCommand +from qlever.containerize import Containerize +from qlever.log import log +from qlever.util import binary_exists, run_command + + +class IndexCommand(QleverCommand): + def __init__(self): + self.script_name = "qoxigraph" + + def description(self) -> str: + return "Build the index for a given RDF dataset" + + def should_have_qleverfile(self) -> bool: + return True + + def 
relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return { + "data": ["name", "format"], + "index": ["input_files"], + "runtime": ["system", "image", "index_container"], + } + + def additional_arguments(self, subparser): + subparser.add_argument( + "--index-binary", + type=str, + default="oxigraph", + help=( + "The binary for building the index (default: oxigraph) " + "(this requires that you have oxigraph-cli installed " + "on your machine)" + ), + ) + + @staticmethod + def wrap_cmd_in_container(args, cmd: str) -> str: + return Containerize().containerize_command( + cmd=cmd, + container_system=args.system, + run_subcommand="run --rm", + image_name=args.image, + container_name=args.index_container, + volumes=[("$(pwd)", "/index")], + working_directory="/index", + use_bash=False, + ) + + def execute(self, args) -> bool: + index_cmd = f"load --location . --file {args.input_files}" + index_cmd += f" |& tee {args.name}.index-log.txt" + + index_cmd = ( + f"{args.index_binary} {index_cmd}" + if args.system == "native" + else self.wrap_cmd_in_container(args, index_cmd) + ) + + # Show the command line. + self.show(index_cmd, only_show=args.show) + if args.show: + return True + + # Check if all of the input files exist. + for pattern in shlex.split(args.input_files): + if len(glob.glob(pattern)) == 0: + log.error(f'No file matching "{pattern}" found') + log.info("") + log.info( + f"Did you call `{self.script_name} get-data`? If you did, " + "check GET_DATA_CMD and INPUT_FILES in the Qleverfile" + ) + return False + + # When running natively, check if the binary exists and works. + if args.system == "native": + if not binary_exists(args.index_binary, "index-binary"): + return False + else: + if Containerize().is_running(args.system, args.index_container): + log.info( + f"{args.system} container {args.index_container} is still up, " + "which means that data loading is in progress. Please wait..." 
+ ) + return False + + if len([p.name for p in Path.cwd().glob("*.sst")]) != 0: + log.error( + "Index files (*.sst) found in current directory " + "which shows presence of a previous index" + ) + log.info("") + log.info("Aborting the index operation...") + return False + + # Run the index command. + try: + run_command(index_cmd, show_output=True, show_stderr=True) + except Exception as e: + log.error(f"Building the index failed: {e}") + return False + + return True diff --git a/src/qoxigraph/commands/log.py b/src/qoxigraph/commands/log.py new file mode 100644 index 00000000..a90d2228 --- /dev/null +++ b/src/qoxigraph/commands/log.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from qlever.commands.log import LogCommand as QleverLogCommand +from qlever.containerize import Containerize +from qlever.log import log +from qlever.util import run_command + + +class LogCommand(QleverLogCommand): + def __init__(self): + self.script_name = "qoxigraph" + + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return { + "data": ["name"], + "runtime": [ + "system", + "image", + "server_container", + ], + } + + def execute(self, args) -> bool: + if args.system == "native": + return super().execute(args) + + log_cmd = f"{args.system} logs " + + if not args.from_beginning: + log_cmd += f"-n {args.tail_num_lines} " + if not args.no_follow: + log_cmd += "-f " + + log_cmd += args.server_container + + # Show the command line. 
+ self.show(log_cmd, only_show=args.show) + if args.show: + return True + + if not Containerize().is_running(args.system, args.server_container): + log.error(f"No server container {args.server_container} found!\n") + log.info(f"Are you sure you called `{self.script_name} start`?") + return False + + try: + run_command(log_cmd, show_output=True, show_stderr=True) + except Exception as e: + log.error(f"Cannot display container logs - {e}") + return True diff --git a/src/qoxigraph/commands/query.py b/src/qoxigraph/commands/query.py new file mode 100644 index 00000000..6518905f --- /dev/null +++ b/src/qoxigraph/commands/query.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from qlever.commands.query import QueryCommand as QleverQueryCommand + + +class QueryCommand(QleverQueryCommand): + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "query", + type=str, + nargs="?", + default="SELECT * WHERE { ?s ?p ?o } LIMIT 10", + help="SPARQL query to send", + ) + subparser.add_argument( + "--predefined-query", + type=str, + choices=self.predefined_queries.keys(), + help="Use a predefined query", + ) + subparser.add_argument( + "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint" + ) + subparser.add_argument( + "--accept", + type=str, + choices=[ + "text/tab-separated-values", + "text/csv", + "application/sparql-results+json", + "application/sparql-results+xml", + ], + default="text/tab-separated-values", + help="Accept header for the SPARQL query", + ) + subparser.add_argument( + "--get", + action="store_true", + default=False, + help="Use GET request instead of POST", + ) + subparser.add_argument( + "--no-time", + action="store_true", + default=False, + help="Do not print the (end-to-end) time taken", + ) + + def execute(self, args) -> bool: + if not args.sparql_endpoint: + args.sparql_endpoint = f"localhost:{args.port}/query" + args.pin_to_cache = None + args.access_token = None + super().execute(args) diff --git 
a/src/qoxigraph/commands/setup_config.py b/src/qoxigraph/commands/setup_config.py new file mode 100644 index 00000000..b6d9225b --- /dev/null +++ b/src/qoxigraph/commands/setup_config.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from configparser import RawConfigParser +from pathlib import Path + +from qlever.command import QleverCommand +from qlever.log import log +from qlever.qleverfile import Qleverfile + + +class SetupConfigCommand(QleverCommand): + IMAGE = "ghcr.io/oxigraph/oxigraph" + + FILTER_CRITERIA = { + "data": [], + "index": ["INPUT_FILES"], + "server": ["PORT"], + "runtime": ["SYSTEM", "IMAGE"], + "ui": ["UI_CONFIG"], + } + + def __init__(self): + self.qleverfiles_path = ( + Path(__file__).parent.parent.parent / "qlever" / "Qleverfiles" + ) + self.qleverfile_names = [ + p.name.split(".")[1] + for p in self.qleverfiles_path.glob("Qleverfile.*") + ] + + def description(self) -> str: + return "Get a pre-configured Qleverfile" + + def should_have_qleverfile(self) -> bool: + return False + + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return {} + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "config_name", + type=str, + choices=self.qleverfile_names, + help="The name of the pre-configured Qleverfile to create", + ) + + def validate_qleverfile_setup( + self, args, qleverfile_path: Path + ) -> bool | None: + # Construct the command line and show it. + setup_config_show = ( + f"Creating Qleverfile for {args.config_name} using " + f"Qleverfile.{args.config_name} file in {self.qleverfiles_path}" + ) + self.show(setup_config_show, only_show=args.show) + if args.show: + return True + + # If there is already a Qleverfile in the current directory, exit. 
+ if qleverfile_path.exists(): + log.error("`Qleverfile` already exists in current directory") + log.info("") + log.info( + "If you want to create a new Qleverfile using " + "`qlever setup-config`, delete the existing Qleverfile " + "first" + ) + return False + return None + + def get_filtered_qleverfile_parser( + self, config_name: str + ) -> RawConfigParser: + qleverfile_config_path = ( + self.qleverfiles_path / f"Qleverfile.{config_name}" + ) + qleverfile_parser = Qleverfile.filter( + qleverfile_config_path, self.FILTER_CRITERIA + ) + if qleverfile_parser.has_section("runtime"): + qleverfile_parser.set("runtime", "IMAGE", self.IMAGE) + return qleverfile_parser + + def execute(self, args) -> bool: + qleverfile_path = Path("Qleverfile") + exit_status = self.validate_qleverfile_setup(args, qleverfile_path) + if exit_status is not None: + return exit_status + + qleverfile_parser = self.get_filtered_qleverfile_parser( + args.config_name + ) + # Copy the Qleverfile to the current directory. + try: + with qleverfile_path.open("w") as f: + qleverfile_parser.write(f) + except Exception as e: + log.error( + f'Could not copy "{qleverfile_path}" to current directory: {e}' + ) + return False + + # If we get here, everything went well. 
+ log.info( + f'Created Qleverfile for config "{args.config_name}"' + f" in current directory" + ) + return True diff --git a/src/qoxigraph/commands/start.py b/src/qoxigraph/commands/start.py new file mode 100644 index 00000000..8a038344 --- /dev/null +++ b/src/qoxigraph/commands/start.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +import subprocess +import time +from pathlib import Path + +from qlever.command import QleverCommand +from qlever.containerize import Containerize +from qlever.log import log +from qlever.util import binary_exists, is_server_alive, run_command + + +class StartCommand(QleverCommand): + def __init__(self): + self.script_name = "qoxigraph" + + def description(self) -> str: + return ( + "Start the server for Oxigraph (requires that you have built an " + "index before)" + ) + + def should_have_qleverfile(self) -> bool: + return True + + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return { + "data": ["name"], + "server": ["host_name", "port"], + "runtime": ["system", "image", "server_container"], + } + + def additional_arguments(self, subparser): + subparser.add_argument( + "--run-in-foreground", + action="store_true", + default=False, + help=( + "Run the start command in the foreground " + "(default: run in the background)" + ), + ) + subparser.add_argument( + "--server-binary", + type=str, + default="oxigraph", + help=( + "The binary for starting the server (default: oxigraph) " + "(this requires that you have oxigraph-cli installed " + "on your machine)" + ), + ) + + @staticmethod + def wrap_cmd_in_container(args, cmd: str) -> str: + run_subcommand = "run --restart=unless-stopped" + if not args.run_in_foreground: + run_subcommand += " -d" + return Containerize().containerize_command( + cmd=cmd, + container_system=args.system, + run_subcommand=run_subcommand, + image_name=args.image, + container_name=args.server_container, + volumes=[("$(pwd)", "/index")], + ports=[(args.port, args.port)], + 
working_directory="/index", + use_bash=False, + ) + + def execute(self, args) -> bool: + bind = ( + f"{args.host_name}:{args.port}" + if args.system == "native" + else f"0.0.0.0:{args.port}" + ) + start_cmd = f"serve-read-only --location . --bind={bind}" + + if args.system == "native": + start_cmd = f"{args.server_binary} {start_cmd}" + if not args.run_in_foreground: + start_cmd = ( + f"nohup {start_cmd} > {args.name}.server-log.txt 2>&1 &" + ) + else: + start_cmd = self.wrap_cmd_in_container(args, start_cmd) + + # Show the command line. + self.show(start_cmd, only_show=args.show) + if args.show: + return True + + endpoint_url = f"http://{args.host_name}:{args.port}/query" + + # When running natively, check if the binary exists and works. + if args.system == "native": + if not binary_exists(args.server_binary, "server-binary"): + return False + else: + if Containerize().is_running(args.system, args.server_container): + log.error( + f"Server container {args.server_container} already exists!\n" + ) + log.info( + f"To kill the existing server, use `{self.script_name} stop`" + ) + return False + + # Check if index files (*.sst) present in cwd + if len([p.name for p in Path.cwd().glob("*.sst")]) == 0: + log.error(f"No Oxigraph index files for {args.name} found!\n") + log.info( + f"Did you call `{self.script_name} index`? If you did, check " + "if .sst index files are present in current working directory." 
+ ) + return False + + if is_server_alive(url=endpoint_url): + log.error(f"Oxigraph server already running on {endpoint_url}\n") + log.info( + f"To kill the existing server, use `{self.script_name} stop`" + ) + return False + + try: + process = run_command( + start_cmd, + use_popen=args.run_in_foreground, + ) + except Exception as e: + log.error(f"Starting the Oxigraph server failed ({e})") + return False + + # Tail the server log until the server is ready (note that the `exec` + # is important to make sure that the tail process is killed and not + # just the bash process). + if args.run_in_foreground: + log.info( + "Follow the server logs as long as the server is" + " running (Ctrl-C stops the server)" + ) + else: + log.info( + "Follow the server logs until the server is ready" + " (Ctrl-C stops following the log, but NOT the server)" + ) + log.info("") + if args.system == "native": + log_cmd = f"exec tail -f {args.name}.server-log.txt" + else: + time.sleep(2) + log_cmd = f"exec {args.system} logs -f {args.server_container}" + log_proc = subprocess.Popen(log_cmd, shell=True) + while not is_server_alive(endpoint_url): + time.sleep(1) + + log.info( + f"Oxigraph server webapp for {args.name} will be available at " + f"http://{args.host_name}:{args.port} and the sparql endpoint for " + f"queries is {endpoint_url} when the server is ready" + ) + + # Kill the log process + if not args.run_in_foreground: + log_proc.terminate() + + # With `--run-in-foreground`, wait until the server is stopped. 
+ if args.run_in_foreground: + try: + process.wait() + except KeyboardInterrupt: + process.terminate() + log_proc.terminate() + + return True diff --git a/src/qoxigraph/commands/status.py b/src/qoxigraph/commands/status.py new file mode 100644 index 00000000..eb2de86c --- /dev/null +++ b/src/qoxigraph/commands/status.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from qlever.commands.status import StatusCommand as QleverStatusCommand + + +class StatusCommand(QleverStatusCommand): + DEFAULT_REGEX = "oxigraph\\s+serve-read-only" + + def description(self) -> str: + return "Show Oxigraph processes running on this machine" + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "--cmdline-regex", + default=self.DEFAULT_REGEX, + help=( + "Show only processes where the command line matches this regex" + ), + ) diff --git a/src/qoxigraph/commands/stop.py b/src/qoxigraph/commands/stop.py new file mode 100644 index 00000000..ed9c1036 --- /dev/null +++ b/src/qoxigraph/commands/stop.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from qlever.command import QleverCommand +from qlever.commands import stop as qlever_stop +from qlever.log import log +from qoxigraph.commands.status import StatusCommand + + +class StopCommand(QleverCommand): + # Override this with StatusCommand from child class for execute + # method to work as intended + STATUS_COMMAND = StatusCommand() + DEFAULT_REGEX = "oxigraph\\s+serve-read-only.*:%%PORT%%" + + def __init__(self): + pass + + def description(self) -> str: + return "Stop Oxigraph server for a given dataset or port" + + def should_have_qleverfile(self) -> bool: + return True + + def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + return { + "data": ["name"], + "server": ["port"], + "runtime": ["system", "server_container"], + } + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "--cmdline-regex", + default=self.DEFAULT_REGEX, + help="Show only 
processes where the command " + "line matches this regex", + ) + + def execute(self, args) -> bool: + cmdline_regex = args.cmdline_regex.replace("%%PORT%%", str(args.port)) + description = ( + f'Checking for processes matching "{cmdline_regex}"' + if args.system == "native" + else f"Checking for container with name {args.server_container}" + ) + + self.show(description, only_show=args.show) + if args.show: + return True + + if args.system == "native": + stop_process_results = ( + qlever_stop.StopCommand().stop_process_with_regex( + cmdline_regex + ) + ) + if stop_process_results is None: + return False + if len(stop_process_results) > 0: + return all(stop_process_results) + + # If no matching process found, show a message and the output of the + # status command. + log.error("No matching process found") + args.cmdline_regex = self.STATUS_COMMAND.DEFAULT_REGEX + log.info("") + StatusCommand().execute(args) + return True + + # First check if container is running and if yes, stop and remove it + return qlever_stop.stop_container(args.server_container) From 956a59df7e60f71f57315173c36ca81acec13ba8 Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Mon, 19 May 2025 23:51:47 +0200 Subject: [PATCH 2/7] Fix host_name bug in example_querie and add util stop_with_regex command to stop --- src/qoxigraph/commands/example_queries.py | 2 +- src/qoxigraph/commands/stop.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/qoxigraph/commands/example_queries.py b/src/qoxigraph/commands/example_queries.py index d62982a8..4ef76c24 100644 --- a/src/qoxigraph/commands/example_queries.py +++ b/src/qoxigraph/commands/example_queries.py @@ -8,5 +8,5 @@ class ExampleQueriesCommand(QleverExampleQueriesCommand): def execute(self, args) -> bool: if not args.sparql_endpoint: - args.sparql_endpoint = f"localhost:{args.port}/query" + args.sparql_endpoint = f"{args.host_name}:{args.port}/query" return super().execute(args) diff --git a/src/qoxigraph/commands/stop.py 
b/src/qoxigraph/commands/stop.py index ed9c1036..dedd1ff2 100644 --- a/src/qoxigraph/commands/stop.py +++ b/src/qoxigraph/commands/stop.py @@ -3,6 +3,7 @@ from qlever.command import QleverCommand from qlever.commands import stop as qlever_stop from qlever.log import log +from qlever.util import stop_process_with_regex from qoxigraph.commands.status import StatusCommand @@ -49,11 +50,7 @@ def execute(self, args) -> bool: return True if args.system == "native": - stop_process_results = ( - qlever_stop.StopCommand().stop_process_with_regex( - cmdline_regex - ) - ) + stop_process_results = stop_process_with_regex(cmdline_regex) if stop_process_results is None: return False if len(stop_process_results) > 0: From b1310ad42b07ab8ba9df177da06c4c2fb1e585c6 Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Fri, 27 Mar 2026 15:59:39 +0100 Subject: [PATCH 3/7] Update branch code and have only qoxigraph latest code --- README.md | 80 +- pyproject.toml | 14 +- src/qlever/Qleverfiles/Qleverfile.dblp | 5 +- src/qlever/Qleverfiles/Qleverfile.default | 2 +- src/qlever/Qleverfiles/Qleverfile.fbeasy | 6 +- src/qlever/Qleverfiles/Qleverfile.freebase | 2 +- src/qlever/Qleverfiles/Qleverfile.imdb | 7 +- src/qlever/Qleverfiles/Qleverfile.ohm-planet | 5 +- src/qlever/Qleverfiles/Qleverfile.osm-country | 49 +- src/qlever/Qleverfiles/Qleverfile.osm-planet | 15 +- .../Qleverfile.osm-planet-from-pbf | 42 + src/qlever/Qleverfiles/Qleverfile.pubchem | 4 +- src/qlever/Qleverfiles/Qleverfile.uniprot | 7 +- src/qlever/Qleverfiles/Qleverfile.wikidata | 2 +- .../Qleverfiles/Qleverfile.wikidata-munged | 47 + .../Qleverfiles/Qleverfile.wikipathways | 2 +- src/qlever/command.py | 2 +- src/qlever/commands/add_text_index.py | 17 +- src/qlever/commands/benchmark_queries.py | 1241 ++++++++++++++++ src/qlever/commands/cache_stats.py | 86 +- src/qlever/commands/clear_cache.py | 32 +- src/qlever/commands/extract_queries.py | 19 +- src/qlever/commands/get_data.py | 2 +- src/qlever/commands/index.py | 37 +- 
src/qlever/commands/index_stats.py | 501 ++++--- src/qlever/commands/log.py | 2 +- src/qlever/commands/materialized_view.py | 110 ++ src/qlever/commands/query.py | 5 +- src/qlever/commands/rebuild_index.py | 337 +++++ src/qlever/commands/reset_updates.py | 59 + src/qlever/commands/settings.py | 113 +- src/qlever/commands/setup_config.py | 2 +- src/qlever/commands/start.py | 46 +- src/qlever/commands/status.py | 4 +- src/qlever/commands/stop.py | 6 +- src/qlever/commands/system_info.py | 2 +- src/qlever/commands/ui.py | 20 +- src/qlever/commands/update.py | 90 ++ src/qlever/commands/update_wikidata.py | 1306 +++++++++++++++++ src/qlever/commands/warmup.py | 2 +- src/qlever/config.py | 10 + src/qlever/containerize.py | 14 +- src/qlever/qlever_main.py | 3 +- src/qlever/qleverfile.py | 103 +- src/qlever/util.py | 66 +- src/qoxigraph/commands/benchmark_queries.py | 17 + src/qoxigraph/commands/index.py | 168 ++- src/qoxigraph/commands/index_stats.py | 72 + src/qoxigraph/commands/log.py | 18 +- src/qoxigraph/commands/query.py | 12 +- src/qoxigraph/commands/setup_config.py | 120 +- src/qoxigraph/commands/start.py | 183 ++- src/qoxigraph/commands/status.py | 4 +- src/qoxigraph/commands/stop.py | 31 +- src/qoxigraph/qleverfile.py | 77 + .../test_benchmark_queries_methods.py | 454 ++++++ .../commands/test_cache_stats_execute.py | 34 +- .../test_cache_stats_other_methods.py | 4 +- test/qlever/commands/test_index_execute.py | 29 +- .../commands/test_index_other_methods.py | 4 + .../commands/test_index_stats_methods.py | 266 ++++ test/qlever/commands/test_start_execute.py | 20 +- .../commands/test_start_other_methods.py | 4 +- test/qlever/commands/test_status_execute.py | 2 +- .../commands/test_status_other_methods.py | 2 +- test/qlever/commands/test_stop_execute.py | 14 +- .../commands/test_stop_other_methods.py | 2 +- test/qlever/conftest.py | 16 + 68 files changed, 5314 insertions(+), 765 deletions(-) create mode 100644 src/qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf 
create mode 100644 src/qlever/Qleverfiles/Qleverfile.wikidata-munged create mode 100644 src/qlever/commands/benchmark_queries.py create mode 100644 src/qlever/commands/materialized_view.py create mode 100644 src/qlever/commands/rebuild_index.py create mode 100644 src/qlever/commands/reset_updates.py create mode 100644 src/qlever/commands/update.py create mode 100644 src/qlever/commands/update_wikidata.py create mode 100644 src/qoxigraph/commands/benchmark_queries.py create mode 100644 src/qoxigraph/commands/index_stats.py create mode 100644 src/qoxigraph/qleverfile.py create mode 100644 test/qlever/commands/test_benchmark_queries_methods.py create mode 100644 test/qlever/commands/test_index_stats_methods.py create mode 100644 test/qlever/conftest.py diff --git a/README.md b/README.md index bb0765b5..0ef5bac9 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,34 @@ # QLever -QLever is a very fast SPARQL engine, much faster than most existing engines. It -can handle graphs with more than hundred billion triples on a single machine -with moderate resources. See https://qlever.cs.uni-freiburg.de for more -information and many public SPARQL endpoints that use QLever +This repository provides a self-documenting and easy-to-use command-line tool +for QLever (pronounced "Clever"), a graph database implementing the +[RDF](https://www.w3.org/TR/rdf11-concepts/) and +[SPARQL](https://www.w3.org/TR/sparql11-overview/) standards. +For a detailed description of what QLever is and what it can do, see +[here](https://github.com/ad-freiburg/qlever). -This project provides a Python script that can control everything that QLever -does, in particular, creating SPARQL endpoints for arbitrary RDF datasets. It -is supposed to be very easy to use and self-explanatory as you use it. In -particular, the tool provides context-sensitive autocompletion of all its -commands and options. 
If you use a container system (like Docker or Podman), -you don't even have to download any QLever code, but the script will download -the required image for you. +# Documentation -NOTE: There has been a major update on 24.03.2024, which changed some of the -Qleverfile variables and command-line options (all for the better, of course). -If you encounter any problems, please contact us by opening an issue on -https://github.com/ad-freiburg/qlever-control/issues. +View the latest documentation at . # Installation -Simply do `pip install qlever` and make sure that the directory where pip -installs the package is in your `PATH`. Typically, `pip` will warn you when -that is not the case and tell you what to do. +There are native packages available for +- [Debian and Ubuntu](https://docs.qlever.dev/quickstart/#debian-and-ubuntu) +- [macOS](https://docs.qlever.dev/quickstart/#macos-apple-silicon) -# Usage - -Create an empty directory, with a name corresponding to the dataset you want to -work with. For the following example, take `olympics`. Go to that directory -and do the following. After the first call, `qlever` will tell you how to -activate autocompletion for all its commands and options (it's very easy, but -`pip` cannot do that automatically). - -``` -qlever setup-config olympics # Get Qleverfile (config file) for this dataset -qlever get-data # Download the dataset -qlever index # Build index data structures for this dataset -qlever start # Start a QLever server using that index -qlever example-queries # Launch some example queries -qlever ui # Launch the QLever UI -``` - -This will create a SPARQL endpoint for the [120 Years of -Olympics](https://github.com/wallscope/olympics-rdf) dataset. It is a great -dataset for getting started because it is small, but not trivial (around 2 -million triples), and the downloading and indexing should only take a few -seconds. - -Each command will also show you the command line it uses. 
That way you can -learn, on the side, how QLever works internally. If you just want to know the -command line for a particular command, without executing it, you can append -`--show` like this: - -``` -qlever index --show -``` - -There are many more commands and options, see `qlever --help` for general help, -`qlever --help` for help on a specific command, or just the -autocompletion. +On other platforms simply install the `qlever` command-line +[python package using `pipx`/`uv`](https://docs.qlever.dev/quickstart/#others). +Note: QLever will be executed in a container which will come with a performance penalty. # Use with your own dataset -To use QLever with your own dataset, you should also write a `Qleverfile`, like -in the example above. The easiest way to write a `Qleverfile` is to get one of -the existing ones (using `qlever setup-config ...` as explained above) and then -change it according to your needs (the variable names should be self-explanatory). -Pick one for a dataset that is similar to yours and when in doubt, pick `olympics`. +To use QLever with your own dataset, you need a `Qleverfile`, like in the +example above. The easiest way to write a `Qleverfile` is to get one of the +existing ones (using `qlever setup-config ...`) and then +change it according to your needs. Pick one for a dataset that is similar to +yours and when in doubt, pick `olympics`. A +[reference of all options](https://docs.qlever.dev/qleverfile/) is available. # For developers @@ -84,7 +46,7 @@ pip install -e . Then you can use `qlever` just as if you had installed it via `pip install qlever`. Note that you don't have to rerun `pip install -e .` when you modify any of the `*.py` files and not even when you add new commands in -`src/qlever/commands`. The exceutable created by `pip` simply links and refers +`src/qlever/commands`. The executable created by `pip` simply links and refers to the files in your working copy. 
If you have bug fixes or new useful features or commands, please open a pull diff --git a/pyproject.toml b/pyproject.toml index 12c82f47..531f98aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,12 +5,13 @@ build-backend = "setuptools.build_meta" [project] name = "qlever" description = "Command-line tool for using the QLever graph database" -version = "0.5.23" +version = "0.5.45" authors = [ { name = "Hannah Bast", email = "bast@cs.uni-freiburg.de" } ] readme = "README.md" -license = { text = "Apache-2.0" } +license = "Apache-2.0" +license-files = ["LICENSE"] requires-python = ">=3.8" keywords = ["Graph database", "Triplestore", "Knowledge graphs", "SPARQL", "RDF"] @@ -20,18 +21,19 @@ classifiers = [ "Topic :: Database :: Front-Ends" ] -dependencies = [ "psutil", "termcolor", "argcomplete", "pyyaml" ] +dependencies = [ "psutil", "termcolor", "argcomplete", "pyyaml", "rdflib", "requests-sse", "tqdm" ] [project.urls] -Github = "https://github.com/ad-freiburg/qlever" +homepage = "https://github.com/ad-freiburg/qlever" +documentation = "https://docs.qlever.dev" +repository = "https://github.com/ad-freiburg/qlever.git" +bugtracker = "https://github.com/ad-freiburg/qlever/issues" [project.scripts] "qlever" = "qlever.qlever_main:main" "qoxigraph" = "qlever.qlever_main:main" -"qlever-old" = "qlever.qlever_old:main" [tool.setuptools] -license-files = ["LICENSE"] package-data = { "qlever" = ["Qleverfiles/*"] } [tool.pytest.ini_options] diff --git a/src/qlever/Qleverfiles/Qleverfile.dblp b/src/qlever/Qleverfiles/Qleverfile.dblp index 639fe31d..e82b3acb 100644 --- a/src/qlever/Qleverfiles/Qleverfile.dblp +++ b/src/qlever/Qleverfiles/Qleverfile.dblp @@ -12,13 +12,14 @@ DATA_TARFILE = dblp_KG_with_associated_data.tar GET_DATA_URL = https://sparql.dblp.org/download/${DATA_TARFILE} GET_DATA_CMD = (curl -LROC - ${GET_DATA_URL} && tar -xf ${DATA_TARFILE}) 2>&1 | tee ${NAME}.download-log.txt && rm -f ${DATA_TARFILE} VERSION = $$(date -r dblp.ttl.gz +"%d.%m.%Y %H:%M" || echo
"NO_DATE") -DESCRIPTION = DBLP computer science bibliography + citations from OpenCitations, data from ${GET_DATA_URL} (version ${VERSION}) +DESCRIPTION = DBLP computer science bibliography + citations from OpenCitations, data from ${GET_DATA_URL}, version ${VERSION} FORMAT = ttl [index] INPUT_FILES = *.gz MULTI_INPUT_JSON = { "cmd": "zcat {}", "for-each": "*.gz" } -SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 5000000, "prefixes-external": [""] } +SETTINGS_JSON = { "num-triples-per-batch": 5000000 } +STXXL_MEMORY = 5G [server] PORT = 7015 diff --git a/src/qlever/Qleverfiles/Qleverfile.default b/src/qlever/Qleverfiles/Qleverfile.default index eb452b2e..dc4075f3 100644 --- a/src/qlever/Qleverfiles/Qleverfile.default +++ b/src/qlever/Qleverfiles/Qleverfile.default @@ -36,7 +36,7 @@ ACCESS_TOKEN = # Use SYSTEM = docker to run QLever inside a docker container; the Docker image # will be downloaded automatically. Use SYSTEM = native to use self-compiled -# binaries `IndexBuilderMain` and `ServerMain` (which should be in you PATH). +# binaries `qlever-index` and `qlever-server` (which should be in your PATH).
[runtime] SYSTEM = docker IMAGE = docker.io/adfreiburg/qlever:latest diff --git a/src/qlever/Qleverfiles/Qleverfile.fbeasy b/src/qlever/Qleverfiles/Qleverfile.fbeasy index 36320816..26ac0b3a 100644 --- a/src/qlever/Qleverfiles/Qleverfile.fbeasy +++ b/src/qlever/Qleverfiles/Qleverfile.fbeasy @@ -8,13 +8,13 @@ NAME = fbeasy DATA_URL = https://freebase-easy.cs.uni-freiburg.de GET_DATA_CMD = wget -nc ${DATA_URL}/dump/fbeasy.nt -DESCRIPTION = RDF data from ${DATA_URL}, latest version from 18.07.2019 -TEXT_DESCRIPTION = Sentences from Wikipedia that mention at least one Freebase entity +DESCRIPTION = Freebase Easy, RDF NT from ${DATA_URL}, latest version from 18.07.2019 +TEXT_DESCRIPTION = Sentences from the English Wikipedia that mention at least one Freebase entity [index] INPUT_FILES = fbeasy.nt CAT_INPUT_FILES = cat ${INPUT_FILES} -SETTINGS_JSON = { "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 } +SETTINGS_JSON = { "num-triples-per-batch": 10000000 } [server] PORT = 7003 diff --git a/src/qlever/Qleverfiles/Qleverfile.freebase b/src/qlever/Qleverfiles/Qleverfile.freebase index 381ee716..4f19075b 100644 --- a/src/qlever/Qleverfiles/Qleverfile.freebase +++ b/src/qlever/Qleverfiles/Qleverfile.freebase @@ -8,7 +8,7 @@ NAME = freebase DATA_URL = http://commondatastorage.googleapis.com/freebase-public/rdf/freebase-rdf-latest.gz GET_DATA_CMD = wget -nc ${DATA_URL} -DESCRIPTION = RDF data from ${DATA_URL}, latest (and final) version from 09.08.2015 +DESCRIPTION = Freebase, RDF NT from ${DATA_URL}, latest (and final) version from 09.08.2015 [index] INPUT_FILES = freebase-rdf-latest.gz diff --git a/src/qlever/Qleverfiles/Qleverfile.imdb b/src/qlever/Qleverfiles/Qleverfile.imdb index f4965cbc..dd5e7b79 100644 --- a/src/qlever/Qleverfiles/Qleverfile.imdb +++ b/src/qlever/Qleverfiles/Qleverfile.imdb @@ -13,13 +13,14 @@ GET_PREFIXES = echo "@prefix imdb: ." 
GET_IMDB_BASICS = FILE=title.basics.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ gsub("\\\\", "\\\\", $$3); gsub("\"", "\\\"", $$3); printf "imdb:%s imdb:id \"%s\" ; imdb:type \"%s\" ; imdb:title \"%s\" .\n", $$1, $$1, $$2, $$3 }'; rm -f $${FILE} GET_IMDB_RATINGS = FILE=title.ratings.tsv.gz; curl -sLO -C - ${IMDB_DATA_URL}/$${FILE}; zcat $${FILE} | sed 1d | awk -F'\t' '{ printf "imdb:%s imdb:averageRating %s ; imdb:numVotes %s .\n", $$1, $$2, $$3 }'; rm -f $${FILE} GET_DATA_CMD = (${GET_PREFIXES}; ${GET_IMDB_BASICS}; ${GET_IMDB_RATINGS}) > ${NAME}.ttl -DESCRIPTION = RDF data derived from ${IMDB_DATA_URL} -TEXT_DESCRIPTION = All literals, search with FILTER CONTAINS(?var, "...") +VERSION = $$(date -r imdb.ttl +"%d.%m.%Y %H:%M" || echo "NO_DATE") +DESCRIPTION = IMDb, CSV from ${IMDB_DATA_URL}, converted to RDF TTL using awk, version ${VERSION} +TEXT_DESCRIPTION = All literals, search with [ ql:contains-word "..."; ql:contains-entity ?literal ] [index] INPUT_FILES = ${data:NAME}.ttl CAT_INPUT_FILES = cat ${INPUT_FILES} -SETTINGS_JSON = { "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 } +SETTINGS_JSON = { "num-triples-per-batch": 1000000 } TEXT_INDEX = from_literals [server] diff --git a/src/qlever/Qleverfiles/Qleverfile.ohm-planet b/src/qlever/Qleverfiles/Qleverfile.ohm-planet index f20d3b94..5585f48b 100644 --- a/src/qlever/Qleverfiles/Qleverfile.ohm-planet +++ b/src/qlever/Qleverfiles/Qleverfile.ohm-planet @@ -11,10 +11,10 @@ NAME = ohm-planet GET_DATA_URL = https://planet.openhistoricalmap.org/planet CHECK_BINARIES = osm2rdf -h > /dev/null || (echo "osm2rdf not found, make sure that it's installed and in your PATH" && exit 1) GET_DATA_CMD_1 = unbuffer wget -O ${NAME}.pbf $$(curl -s ${GET_DATA_URL}/state.txt) 2>&1 | tee ${NAME}.download-log.txt -GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --output-compression gz --store-locations=disk-dense --cache . 
--num-threads 12 --add-way-node-order --no-untagged-nodes-geometric-relations 2>&1 | tee ${NAME}.osm2rdf-log.txt +GET_DATA_CMD_2 = osm2rdf ${NAME}.pbf -o ${NAME}.ttl --source-dataset OHM --output-compression gz --store-locations=disk-dense --cache . --num-threads 12 --iri-prefix-for-untagged-nodes http://www.openhistoricalmap.org/node/ 2>&1 | tee ${NAME}.osm2rdf-log.txt GET_DATA_CMD = ${CHECK_BINARIES} && ${GET_DATA_CMD_1} && echo && ${GET_DATA_CMD_2} VERSION = $$(date -r ${NAME}.pbf +%d.%m.%Y || echo "NO_DATE") -DESCRIPTION = OHM Planet, data from ${GET_DATA_URL} version ${VERSION} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects) +DESCRIPTION = OHM from ${GET_DATA_URL} (with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects), version ${VERSION} [index] INPUT_FILES = ${data:NAME}.ttl.gz @@ -22,6 +22,7 @@ MULTI_INPUT_JSON = { "cmd": "zcat ${INPUT_FILES}", "parallel": "true" } STXXL_MEMORY = 5G PARSER_BUFFER_SIZE = 50M SETTINGS_JSON = { "num-triples-per-batch": 5000000 } +ENCODE_AS_ID = https://www.openhistoricalmap.org/node/ http://www.openhistoricalmap.org/node/ https://www.openhistoricalmap.org/way/ https://www.openhistoricalmap.org/relation/ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#ohmnode_tagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#ohmnode_untagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#ohmway_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#ohmrel_ https://www.openstreetmap.org/changeset/ [server] PORT = 7037 diff --git a/src/qlever/Qleverfiles/Qleverfile.osm-country b/src/qlever/Qleverfiles/Qleverfile.osm-country index 1b2c334f..45e7c86d 100644 --- a/src/qlever/Qleverfiles/Qleverfile.osm-country +++ b/src/qlever/Qleverfiles/Qleverfile.osm-country @@ -1,48 +1,39 @@ -# Qleverfile for OSM of some country, use with https://github.com/ad-freiburg/qlever-control +# Qleverfile for OSM of some country, use with `qlever` CLI # -# qlever get-data # downloads .pbf file from Geofabrik und builds .ttl.bz2 using osm2rdf -# qlever index # 
for example Germany takes ~30 minutes and ~10 GB RAM (on an AMD Ryzen 9 5900X) -# qlever start # starts the server -# -# Make sure that osm2rdf is in your path. Set CONTINENT and COUNTRY such that -# the link under GET_DATA_CMD exists (the names are usually the canonical -# names). The time for osm2rdf is around the same as that for "qlever index". +# Make sure that `osm2rdf` is in your path. Set CONTINENT and COUNTRY +# such that the link under GET_DATA_CMD exists (the names are usually +# the canonical names). -# Dataset settings [data] -CONTINENT = europe -COUNTRY = switzerland -NAME = osm-${COUNTRY} -PBF = ${NAME}.pbf -WITH_TEXT = false -VERSION = $$(ls -l --time-style=+%d.%m.%Y ${PBF} 2> /dev/null | cut -d' ' -f6) -GET_DATA_CMD = wget -nc -O ${PBF} https://download.geofabrik.de/${CONTINENT}/${COUNTRY}-latest.osm.pbf; rm -f ${NAME}.*.bz2; ( time osm2rdf ${PBF} -o ${NAME}.ttl --cache . ) 2>&1 | tee ${NAME}.osm2rdf-log.txt; rm -f spatial-* -DESCRIPTION = OSM ${COUNTRY}, dump from ${VERSION} with ogc:sfContains +CONTINENT = europe +COUNTRY = switzerland +NAME = osm-${COUNTRY} +GET_DATA_URL = https://download.geofabrik.de/${CONTINENT}/${COUNTRY}-latest.osm.pbf +GET_DATA_CMD = wget -nc -O ${NAME}.pbf ${GET_DATA_URL}; (time osm2rdf ${NAME}.pbf -o ${NAME}.ttl --output-compression gz --cache . 
--iri-prefix-for-untagged-nodes http://www.openstreetmap.org/node/) 2>&1 | tee ${NAME}.osm2rdf-log.txt +VERSION = $$(ls -l --time-style=+%d.%m.%Y ${NAME}.pbf 2> /dev/null | cut -d' ' -f6) +DESCRIPTION = OSM ${COUNTRY}, PBF from ${GET_DATA_URL}, converted to RDF with osm2rdf, version ${VERSION} -# Indexer settings [index] -INPUT_FILES = ${data:NAME}.ttl.bz2 -CAT_INPUT_FILES = bzcat ${data:NAME}.ttl.bz2 -STXXL_MEMORY = 10G -SETTINGS_JSON = { "prefixes-external": [ "\"LINESTRING(", "\"MULTIPOLYGON(", "\"POLYGON(" ], "ascii-prefixes-only": false, "num-triples-per-batch": 1000000 } +INPUT_FILES = ${data:NAME}.ttl.gz +CAT_INPUT_FILES = zcat ${data:NAME}.ttl.gz +PARALLEL_PARSING = true +VOCABULARY_TYPE = on-disk-compressed-geo-split +STXXL_MEMORY = 10G +SETTINGS_JSON = { "num-triples-per-batch": 10000000 } +ENCODE_AS_ID = https://www.openstreetmap.org/node/ http://www.openstreetmap.org/node/ https://www.openstreetmap.org/way/ https://www.openstreetmap.org/relation/ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmnode_tagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmnode_untagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmway_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmrel_ https://www.openstreetmap.org/changeset/ -# Server settings [server] -HOSTNAME = localhost PORT = 7025 -ACCESS_TOKEN = ${data:NAME}_%RANDOM% +ACCESS_TOKEN = ${data:NAME} MEMORY_FOR_QUERIES = 20G CACHE_MAX_SIZE = 10G CACHE_MAX_SIZE_SINGLE_ENTRY = 5G -CACHE_MAX_NUM_ENTRIES = 100 TIMEOUT = 100s -# Runtime to use [runtime] SYSTEM = docker IMAGE = docker.io/adfreiburg/qlever:latest -# Qlever UI [ui] UI_PORT = 7000 -UI_CONFIG = osm +UI_CONFIG = osm-planet diff --git a/src/qlever/Qleverfiles/Qleverfile.osm-planet b/src/qlever/Qleverfiles/Qleverfile.osm-planet index 8ccfb8c5..051d39c1 100644 --- a/src/qlever/Qleverfiles/Qleverfile.osm-planet +++ b/src/qlever/Qleverfiles/Qleverfile.osm-planet @@ -8,19 +8,20 @@ [data] NAME = osm-planet -DATA_URL = 
https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2 -GET_DATA_CMD = unbuffer wget -O ${NAME}.ttl.bz2 ${DATA_URL} | tee ${NAME}.download-log.txt +GET_DATA_URL = https://osm2rdf.cs.uni-freiburg.de/ttl/planet.osm.ttl.bz2 +GET_DATA_CMD = unbuffer wget -O ${NAME}.ttl.bz2 ${GET_DATA_URL} | tee ${NAME}.download-log.txt VERSION = $$(date -r ${NAME}.ttl.bz2 +"%d.%m.%Y" || echo "NO_DATE") -DESCRIPTION = OSM Planet, data from ${DATA_URL} version ${VERSION} (complete OSM data, with GeoSPARQL predicates ogc:sfContains and ogc:sfIntersects) +DESCRIPTION = OpenStreetMap, RDF TTL from ${GET_DATA_URL} including DE-9IM triples, version ${VERSION} [index] INPUT_FILES = ${data:NAME}.ttl.bz2 -CAT_INPUT_FILES = lbzcat -n 2 ${INPUT_FILES} -PARALLEL_PARSING = true +MULTI_INPUT_JSON = { "cmd": "lbzcat -n 2 ${INPUT_FILES}", "parallel": "true" } +VOCABULARY_TYPE = on-disk-compressed-geo-split PARSER_BUFFER_SIZE = 100M -STXXL_MEMORY = 40G +STXXL_MEMORY = 60G SETTINGS_JSON = { "num-triples-per-batch": 10000000 } -ULIMIT = 10000 +ULIMIT = 50000 +ENCODE_AS_ID = https://www.openstreetmap.org/node/ http://www.openstreetmap.org/node/ https://www.openstreetmap.org/way/ https://www.openstreetmap.org/relation/ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmnode_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmway_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmrel_ https://www.openstreetmap.org/changeset/ [server] PORT = 7007 diff --git a/src/qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf b/src/qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf new file mode 100644 index 00000000..f26daf8e --- /dev/null +++ b/src/qlever/Qleverfiles/Qleverfile.osm-planet-from-pbf @@ -0,0 +1,42 @@ +# Qleverfile for OSM Planet, use with the QLever CLI (`pip install qlever`) +# +# qlever get-data # download ~100 GB (pbf), convert with osm2rdf, ~200B triples +# qlever index # ~40 hours, ~60 GB RAM, ~2.5 TB index size on disk +# qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed +# +# Measured on an 
AMD Ryzen 9 9950X with 128 GB RAM and 4 x 8 TB NVMe (02.10.2025) + +[data] +NAME = osm-planet +PLANET_PBF = planet-250929.osm.pbf +GET_DATA_URL = https://planet.openstreetmap.org/pbf/${PLANET_PBF} +GET_PBF_CMD = unbuffer wget -O ${PLANET_PBF} ${GET_DATA_URL} +OSM2RDF_CMD = unbuffer osm2rdf ${PLANET_PBF} -o ${NAME}.ttl --num-threads 20 --output-compression gz --cache . --store-locations disk-dense --iri-prefix-for-untagged-nodes http://www.openstreetmap.org/node/ --split-tag-key-by-semicolon ref --split-tag-key-by-semicolon service +GET_DATA_CMD = ${GET_PBF_CMD} && ${OSM2RDF_CMD} 2>&1 | tee ${NAME}.osm2rdf-log.txt +VERSION = $$(date -r ${PLANET_PBF} +%d.%m.%Y || echo "NO_DATE") +DESCRIPTION = OpenStreetMap, PBF from ${GET_DATA_URL}, converted to RDF TTL and enhanced by DE-9IM triples using osm2rdf + +[index] +INPUT_FILES = ${data:NAME}.ttl.gz +MULTI_INPUT_JSON = { "cmd": "zcat ${INPUT_FILES}", "parallel": "true" } +VOCABULARY_TYPE = on-disk-compressed-geo-split +PARSER_BUFFER_SIZE = 100M +STXXL_MEMORY = 60G +SETTINGS_JSON = { "num-triples-per-batch": 10000000 } +ULIMIT = 50000 +ENCODE_AS_ID = https://www.openstreetmap.org/node/ http://www.openstreetmap.org/node/ https://www.openstreetmap.org/way/ https://www.openstreetmap.org/relation/ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmnode_tagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmnode_untagged_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmway_ https://osm2rdf.cs.uni-freiburg.de/rdf/geom#osmrel_ https://www.openstreetmap.org/changeset/ + +[server] +PORT = 7007 +ACCESS_TOKEN = ${data:NAME} +MEMORY_FOR_QUERIES = 40G +CACHE_MAX_SIZE = 20G +CACHE_MAX_SIZE_SINGLE_ENTRY = 10G +TIMEOUT = 600s + +[runtime] +SYSTEM = docker +IMAGE = docker.io/adfreiburg/qlever:latest + +[ui] +UI_CONFIG = osm-planet diff --git a/src/qlever/Qleverfiles/Qleverfile.pubchem b/src/qlever/Qleverfiles/Qleverfile.pubchem index 25aef1ca..fa7894dd 100644 --- a/src/qlever/Qleverfiles/Qleverfile.pubchem +++ 
b/src/qlever/Qleverfiles/Qleverfile.pubchem @@ -24,11 +24,11 @@ ONTOLOGIES_DIR = RDF.ontologies PUBCHEM_DIR = RDF.pubchem ONTOLOGIES_CSV = ontologies.csv CHECK_REQUIREMENTS = for CMD in docker parallel; do $$CMD --version >/dev/null 2>&1 || (echo "Requires \"$$CMD\", please install it"; false); done -GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." && curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi' +GET_DATA_CMD_1 = mkdir -p ${ONTOLOGIES_DIR} && (cd ${ONTOLOGIES_DIR} && cat ${ONTOLOGIES_CSV} | parallel --colsep "," 'FILE={2} && URL={3} && ERRFILE=$${FILE%.*}.jena-stderr; echo "Processing $$URL ($$FILE) ..." 
&& curl -sLRo $$FILE $$URL && docker run --rm -v $$(pwd):/data stain/jena riot --output=NT /data/$$FILE 2> $$ERRFILE | gzip -c > $${FILE%.*}.nt.gz && rm -f $$FILE; if [ -s $$ERRFILE ]; then grep -q "ERROR *riot" $$ERRFILE && echo "riot ERRORs in $$FILE, check $$ERRFILE"; else rm $$ERRFILE; fi') GET_DATA_CMD_2 = mkdir -p ${PUBCHEM_DIR} && wget -r -nv -nH --cut-dirs=2 --no-parent -P ${PUBCHEM_DIR} ${GET_DATA_URL} GET_DATA_CMD = ${CHECK_REQUIREMENTS} && ${GET_DATA_CMD_1} 2>&1 | tee pubchem.get-data-log.txt; ${GET_DATA_CMD_2} 2>&1 | tee -a pubchem.get-data-log.txt VERSION = $$(date -r void.ttl +%d.%m.%Y || echo "NO_DATE") -DESCRIPTION = PubChem RDF from ${GET_DATA_URL} (version ${VERSION}) + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo) +DESCRIPTION = PubChem, RDF TTL from ${GET_DATA_URL} + associated ontologies (bao, bfo, biopax-level3, chebi, cheminf, cito, dublin_core_terms, fabio, go, iao, ncit, obi, pr, ro, sio, skos, so, uo), version ${data:VERSION} MAKE_ONTOLOGIES_CSV = $$(mkdir -p ${ONTOLOGIES_DIR} && echo "BAO - BioAssay Ontology,bao.owl,https://data.bioontology.org/ontologies/BAO/submissions/56/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nBFO - Basic Formal Ontology,bfo.owl,http://purl.obolibrary.org/obo/bfo.owl\nBioPAX - biological pathway data,bp.owl,http://www.biopax.org/release/biopax-level3.owl\nCHEMINF - Chemical Information Ontology,cheminf.owl,http://purl.obolibrary.org/obo/cheminf.owl\nChEBI - Chemical Entities of Biological Interest,chebi.owl,http://purl.obolibrary.org/obo/chebi.owl\nCiTO,cito.nt,http://purl.org/spar/cito.nt\nDCMI Terms,dcterms.nt,https://www.dublincore.org/specifications/dublin-core/dcmi-terms/dublin_core_terms.nt\nFaBiO,fabio.nt,http://purl.org/spar/fabio.nt\nGO - Gene Ontology,go.owl,http://purl.obolibrary.org/obo/go.owl\nIAO - Information Artifact 
Ontology,iao.owl,http://purl.obolibrary.org/obo/iao.owl\nNCIt,ncit.owl,http://purl.obolibrary.org/obo/ncit.owl\nNDF-RT,ndfrt.owl,https://data.bioontology.org/ontologies/NDF-RT/submissions/1/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\nOBI - Ontology for Biomedical Investigations,obi.owl,http://purl.obolibrary.org/obo/obi.owl\nOWL,owl.ttl,http://www.w3.org/2002/07/owl.ttl\nPDBo,pdbo.owl,http://rdf.wwpdb.org/schema/pdbx-v40.owl\nPR - PRotein Ontology (PRO),pr.owl,http://purl.obolibrary.org/obo/pr.owl\nRDF Schema,rdfs.ttl,https://www.w3.org/2000/01/rdf-schema.ttl\nRDF,rdf.ttl,http://www.w3.org/1999/02/22-rdf-syntax-ns.ttl\nRO - Relation Ontology,ro.owl,http://purl.obolibrary.org/obo/ro.owl\nSIO - Semanticscience Integrated Ontology,sio.owl,http://semanticscience.org/ontology/sio.owl\nSKOS,skos.rdf,http://www.w3.org/TR/skos-reference/skos.rdf\nSO - Sequence types and features ontology,so.owl,http://purl.obolibrary.org/obo/so.owl\nUO - Units of measurement ontology,uo.owl,http://purl.obolibrary.org/obo/uo.owl" > ${ONTOLOGIES_DIR}/${ONTOLOGIES_CSV}) [index] diff --git a/src/qlever/Qleverfiles/Qleverfile.uniprot b/src/qlever/Qleverfiles/Qleverfile.uniprot index 295bf4ab..c3626d45 100644 --- a/src/qlever/Qleverfiles/Qleverfile.uniprot +++ b/src/qlever/Qleverfiles/Qleverfile.uniprot @@ -12,7 +12,7 @@ [data] NAME = uniprot -DATE = 2024-11-27 +DATE = 2025-06-18 RDFXML_DIR = rdf.${DATE} TTL_DIR = ttl.${DATE} UNIPROT_URL = https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf @@ -22,7 +22,7 @@ GET_EXAMPLES_CMD = mkdir -p ${TTL_DIR} && git clone ${EXAMPLES_URL} && (cd sparq GET_RDFXML_CMD = mkdir -p ${RDFXML_DIR} && (echo "${RHEA_URL}/chebi.owl.gz"; echo "${RHEA_URL}/rhea.rdf.gz"; curl -s ${UNIPROT_URL}/RELEASE.meta4 | sed "s//" | xmllint --xpath "/metalink/files/file/url[@location=\"ch\"]/text()" -) | while read URL; do wget --no-verbose -P ${RDFXML_DIR} $$URL 2>&1 | tee -a uniprot.download-log; done RDFXML2TTL_CMD = mkdir -p ${TTL_DIR} && for RDFXML in 
${RDFXML_DIR}/*.{owl,owl.xz,rdf,rdf.xz}; do echo "xzcat -f $$RDFXML | rdfxml --output=ttl -q 2> ${TTL_DIR}/$$(basename $$RDFXML).stderr | gzip -c > ${TTL_DIR}/$$(basename $$RDFXML | sed 's/\(rdf\|rdf.xz\|owl\|owl.xz\)$$/ttl.gz/') && echo 'DONE converting $$RDFXML'"; done | parallel GET_DATA_CMD = date > ${NAME}.get-data.begin-date && ${GET_EXAMPLES_CMD} && ${GET_RDFXML_CMD} && ${RDFXML2TTL_CMD} && date > ${NAME}.get-data.end-date -DESCRIPTION = Complete UniProt data from ${UNIPROT_URL}, with additional data from ${RHEA_URL} and ${EXAMPLES_URL} +DESCRIPTION = UniProt, RDF XML from ${UNIPROT_URL} + additional data from ${RHEA_URL} and ${EXAMPLES_URL}, version ${DATE} [index] INPUT_FILES = ${data:TTL_DIR}/*.ttl.gz @@ -55,7 +55,8 @@ MULTI_INPUT_JSON = [{ "cmd": "zcat {}", "graph": "http://sparql.uniprot.org/unip { "cmd": "zcat ${data:TTL_DIR}/examples_uniprot.ttl.gz", "graph": "http://sparql.uniprot.org/.well-known/sparql-examples" }, { "cmd": "zcat ${data:TTL_DIR}/core.ttl.gz", "graph": "http://purl.uniprot.org/core" }] SETTINGS_JSON = { "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 25000000 } -STXXL_MEMORY = 60G +STXXL_MEMORY = 80G +ULIMIT = 50000 [server] PORT = 7018 diff --git a/src/qlever/Qleverfiles/Qleverfile.wikidata b/src/qlever/Qleverfiles/Qleverfile.wikidata index e6ec6f6c..5e9c4cd8 100644 --- a/src/qlever/Qleverfiles/Qleverfile.wikidata +++ b/src/qlever/Qleverfiles/Qleverfile.wikidata @@ -16,7 +16,7 @@ GET_DATA_URL = https://dumps.wikimedia.org/wikidatawiki/entities GET_DATA_CMD = curl -LRC - -O ${GET_DATA_URL}/latest-all.ttl.bz2 -O ${GET_DATA_URL}/latest-lexemes.ttl.bz2 2>&1 | tee wikidata.download-log.txt && curl -sL ${GET_DATA_URL}/dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot --syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt DATE_WIKIDATA = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo 
"NO_DATE") DATE_WIKIPEDIA = $$(date -r wikipedia-abstracts.nt +%d.%m.%Y || echo "NO_DATE") -DESCRIPTION = Full Wikidata dump from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2, version ${DATE_WIKIDATA}) +DESCRIPTION = Complete Wikidata, from ${GET_DATA_URL} (latest-all.ttl.bz2 and latest-lexemes.ttl.bz2), version ${DATE_WIKIDATA} [index] INPUT_FILES = latest-all.ttl.bz2 latest-lexemes.ttl.bz2 dcatap.nt diff --git a/src/qlever/Qleverfiles/Qleverfile.wikidata-munged b/src/qlever/Qleverfiles/Qleverfile.wikidata-munged new file mode 100644 index 00000000..834cd364 --- /dev/null +++ b/src/qlever/Qleverfiles/Qleverfile.wikidata-munged @@ -0,0 +1,47 @@ +# Qleverfile for Wikidata MUNGED, use with `qlever` CLI (`pipx install qlever`) +# +# qlever get-data # ~24 hours, ~110 GB (compressed), ~18 billion triples +# qlever index # ~4 hours, ~20 GB RAM, ~500 GB index size on disk +# qlever start # a few seconds, adjust MEMORY_FOR_QUERIES as needed +# +# Measured on an AMD Ryzen 9 9950X with 128 GB RAM, and NVMe SSD (17.01.2026) + +[DEFAULT] +NAME = wikidata + +[data] +GET_DATA_URL_BASE = https://dumps.wikimedia.org/wikidatawiki/entities +GET_DATA_URL_ALL = ${GET_DATA_URL_BASE}/20260112/wikidata-20260112-all-BETA.ttl.bz2 +GET_DATA_URL_LEXEMES = ${GET_DATA_URL_BASE}/20260116/wikidata-20260116-lexemes-BETA.ttl.bz2 +GET_DATA_WDQS_VER = 0.3.156 +GET_DATA_WGET_CMD = unbuffer wget -q --show-progress +GET_DATA_CMD_1 = ${GET_DATA_WGET_CMD} -O service-${GET_DATA_WDQS_VER}-dist.tar.gz https://archiva.wikimedia.org/repository/releases/org/wikidata/query/rdf/service/${GET_DATA_WDQS_VER}/service-${GET_DATA_WDQS_VER}-dist.tar.gz | tee wikidata.download-log.txt && tar -xzf service-${GET_DATA_WDQS_VER}-dist.tar.gz && rm service-${GET_DATA_WDQS_VER}-dist.tar.gz +GET_DATA_CMD_2 = ${GET_DATA_WGET_CMD} -O dcatap.rdf https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf | tee -a wikidata.download-log.txt && cat dcatap.rdf | docker run -i --rm -v $$(pwd):/data stain/jena riot 
--syntax=RDF/XML --output=NT /dev/stdin > dcatap.nt && rm dcatap.rdf && ${GET_DATA_WGET_CMD} -O latest-lexemes.ttl.bz2 ${GET_DATA_URL_LEXEMES} 2>&1 | tee -a wikidata.download-log.txt && ${GET_DATA_WGET_CMD} -O latest-all.ttl.bz2 ${GET_DATA_URL_ALL} 2>&1 | tee -a wikidata.download-log.txt +GET_DATA_CMD_3 = service-${GET_DATA_WDQS_VER}/munge.sh -f latest-all.ttl.bz2 -d . -c 150000000 && mv wikidump-000000001.ttl.gz latest-all.MUNGED.ttl.gz && touch -r latest-all.ttl.bz2 latest-all.MUNGED.ttl.gz +GET_DATA_CMD = ${GET_DATA_CMD_1} && ${GET_DATA_CMD_2} && ${GET_DATA_CMD_3} +DATE_WIKIDATA_ALL = $$(date -r latest-all.ttl.bz2 +%d.%m.%Y || echo "NO_DATE") +DATE_WIKIDATA_LEXEMES = $$(date -r latest-lexemes.ttl.bz2 +%d.%m.%Y || echo "NO_DATE") +DESCRIPTION = Complete Wikidata, TTL from ${GET_DATA_URL_BASE} (latest-all.ttl.bz2 from ${DATE_WIKIDATA_ALL} munged, latest-lexemes.ttl.bz2 from ${DATE_WIKIDATA_LEXEMES}), updated LIVE + +[index] +INPUT_FILES = latest-all.MUNGED.ttl.gz latest-lexemes.ttl.bz2 dcatap.nt +MULTI_INPUT_JSON = { "cmd": "zcat latest-all.MUNGED.ttl.gz", "format": "ttl", "parallel": "true" } + { "cmd": "lbzcat -n 1 latest-lexemes.ttl.bz2", "format": "ttl", "parallel": "false" } + { "cmd": "cat dcatap.nt", "format": "nt", "parallel": "false" } +SETTINGS_JSON = { "num-triples-per-batch": 5000000, "languages-internal": [], "prefixes-external": [""], "locale": { "language": "en", "country": "US", "ignore-punctuation": true } } +STXXL_MEMORY = 10G + +[server] +PORT = 7001 +ACCESS_TOKEN = ${data:NAME} +MEMORY_FOR_QUERIES = 20G +CACHE_MAX_SIZE = 15G +CACHE_MAX_SIZE_SINGLE_ENTRY = 5G +TIMEOUT = 600s + +[runtime] +SYSTEM = docker +IMAGE = adfreiburg/qlever:latest + +[ui] +UI_CONFIG = wikidata diff --git a/src/qlever/Qleverfiles/Qleverfile.wikipathways b/src/qlever/Qleverfiles/Qleverfile.wikipathways index 7e9d0432..2cfb6c6b 100644 --- a/src/qlever/Qleverfiles/Qleverfile.wikipathways +++ b/src/qlever/Qleverfiles/Qleverfile.wikipathways @@ -8,7 +8,7 @@ [data] NAME = 
wikipathways -RELEASE = 20240810 +RELEASE = current GET_DATA_URL = https://data.wikipathways.org/${RELEASE}/rdf GET_DATA_CMD = wget -O wikipathways-rdf-void.ttl ${GET_DATA_URL}/wikipathways-rdf-void.ttl && \ wget ${GET_DATA_URL}/wikipathways-${RELEASE}-rdf-wp.zip && \ diff --git a/src/qlever/command.py b/src/qlever/command.py index e48b3d17..a4d95923 100644 --- a/src/qlever/command.py +++ b/src/qlever/command.py @@ -42,7 +42,7 @@ def should_have_qleverfile(self) -> bool: pass @abstractmethod - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: """ Retun the arguments relevant for this command. This must be a subset of the names of `all_arguments` defined in `QleverConfig`. Only these diff --git a/src/qlever/commands/add_text_index.py b/src/qlever/commands/add_text_index.py index 943c701f..df250287 100644 --- a/src/qlever/commands/add_text_index.py +++ b/src/qlever/commands/add_text_index.py @@ -5,7 +5,7 @@ from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log -from qlever.util import get_existing_index_files, run_command +from qlever.util import binary_exists, get_existing_index_files, run_command class AddTextIndexCommand(QleverCommand): @@ -22,7 +22,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name"], "index": [ @@ -80,17 +80,8 @@ def execute(self, args) -> bool: if args.show: return True - # When running natively, check if the binary exists and works. 
- if args.system == "native": - try: - run_command(f"{args.index_binary} --help") - except Exception as e: - log.error( - f'Running "{args.index_binary}" failed ({e}), ' - f"set `--index-binary` to a different binary or " - f"use `--container_system`" - ) - return False + if not binary_exists(args.index_binary, "index-binary", args): + return False # Check if text index files already exist. existing_text_index_files = get_existing_index_files( diff --git a/src/qlever/commands/benchmark_queries.py b/src/qlever/commands/benchmark_queries.py new file mode 100644 index 00000000..05a3647d --- /dev/null +++ b/src/qlever/commands/benchmark_queries.py @@ -0,0 +1,1241 @@ +from __future__ import annotations + +import csv +import json +import re +import shlex +import subprocess +import time +import traceback +from io import StringIO +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import rdflib +import yaml +from termcolor import colored + +from qlever import command_objects, engine_name, script_name +from qlever.command import QleverCommand +from qlever.commands.clear_cache import ClearCacheCommand +from qlever.commands.ui import dict_to_yaml +from qlever.containerize import Containerize +from qlever.log import log, mute_log +from qlever.util import run_command, run_curl_command + + +def pretty_printed_query( + query: str, show_prefixes: bool, system: str = "docker" +) -> str: + """ + Pretty-print a SPARQL query using the sparql-formatter Docker image. + Optionally strips PREFIX declarations from the output. + Argument `system` can either be docker or podman. 
+ """ + if system not in Containerize.supported_systems(): + system = "docker" + remove_prefixes_cmd = " | sed '/^PREFIX /Id'" if not show_prefixes else "" + pretty_print_query_cmd = ( + f"echo {shlex.quote(query)}" + f" | {system} run -i --rm docker.io/sparqling/sparql-formatter" + f"{remove_prefixes_cmd} | grep -v '^$'" + ) + try: + query_pretty_printed = run_command( + pretty_print_query_cmd, return_output=True + ) + return query_pretty_printed.rstrip() + except Exception as e: + log.debug( + f"Failed to pretty-print query, returning original query: {e}" + ) + return query.rstrip() + + +def sparql_query_type(query: str) -> str: + """ + Determine the SPARQL query type (SELECT, ASK, CONSTRUCT, DESCRIBE) + from the query string. Returns "UNKNOWN" if no type is found. + """ + match = re.search( + r"(SELECT|ASK|CONSTRUCT|DESCRIBE)\s", query, re.IGNORECASE + ) + if match: + return match.group(1).upper() + else: + return "UNKNOWN" + + +def filter_queries( + queries: list[tuple[str, str, str]], query_ids: str, query_regex: str +) -> list[tuple[str, str, str]]: + """ + Given a list of queries (tuple of query name, desc and full sparql query), + filter them and keep the ones which are a part of query_ids + and match with query_regex (if provided). 
+ """ + # Parse query_ids into a list of indices + total_queries = len(queries) + query_indices = [] + for part in query_ids.split(","): + part = part.strip() + if not part: + continue + try: + if "-" in part: + start, end = part.split("-", 1) + if end == "$": + end = total_queries + query_indices.extend(range(int(start) - 1, int(end))) + else: + idx = (int(part) if part != "$" else total_queries) - 1 + query_indices.append(idx) + except ValueError as exc: + log.error(f"Invalid query ID '{part}': {exc}") + return [] + + # Check for duplicate indices + seen = set() + for idx in query_indices: + if idx in seen: + log.error(f"Duplicate query ID {idx + 1} in '{query_ids}'") + return [] + seen.add(idx) + + # Filter by regex and collect results + try: + filtered_queries = [] + pattern = ( + re.compile(query_regex, re.IGNORECASE) if query_regex else None + ) + for query_idx in query_indices: + if query_idx < 0 or query_idx >= total_queries: + continue + + name, description, query = queries[query_idx] + + # Only include queries that match the query_regex if present + if pattern and not ( + pattern.search(name) + or pattern.search(description) + or pattern.search(query) + ): + continue + + filtered_queries.append((name, description, query)) + return filtered_queries + except Exception as exc: + log.error(f"Error filtering queries: {exc}") + return [] + + +def parse_queries_tsv(queries_cmd: str) -> list[tuple[str, str, str]]: + """ + Execute the given bash command to fetch tsv queries and return a + list of queries i.e. tuple(query_name, "", full_sparql_query) + Note: query_description is returned as empty to match the return + structure of parse_queries_yml. 
+ """ + try: + tsv_queries_str = run_command(queries_cmd, return_output=True) + if len(tsv_queries_str) == 0: + log.error("No queries found in the TSV queries file") + return [] + return [ + (query_name, "", sparql_query) + for line in tsv_queries_str.strip().splitlines() + for query_name, sparql_query in [line.split("\t", 1)] + ] + except Exception as exc: + log.error(f"Failed to read the TSV queries file: {exc}") + return [] + + +def parse_queries_yml( + queries_file: str, +) -> tuple[str | None, str | None, list[tuple[str, str, str]]]: + """ + Parse a YML file, validate its structure and return a tuple of + (benchmark_name, benchmark_description, queries) where queries is a + list of tuple(query_name, query_description, full_sparql_query). + """ + with open(queries_file, "r", encoding="utf-8") as q_file: + try: + data = yaml.safe_load(q_file) + except yaml.YAMLError as exc: + log.error(f"Error parsing {queries_file} file: {exc}") + return None, None, [] + + # Validate the structure + if not isinstance(data, dict) or "queries" not in data: + log.error("Error: YAML file must contain a top-level 'queries' key") + return None, None, [] + + if not isinstance(data["queries"], list): + log.error("Error: 'queries' key in YML file must hold a list.") + return None, None, [] + + queries = [] + for query in data["queries"]: + if ( + not isinstance(query, dict) + or "query" not in query + or "name" not in query + ): + log.error( + "Error: Each item in 'queries' must contain " + "'name' and 'query' keys." 
+ ) + return None, None, [] + queries.append( + (query["name"], query.get("description", ""), query["query"]) + ) + return data.get("name"), data.get("description"), queries + + +def get_result_size( + count_only: bool, + query_type: str, + accept_header: str, + result_file: str, +) -> tuple[int, dict[str, str] | None]: + """ + Get the result size and error_msg dict (if query failed) for + different accept headers + """ + + def get_json_error_msg(e: Exception) -> dict[str, str]: + error_msg = { + "short": "Malformed JSON", + "long": "curl returned with code 200, " + "but the JSON is malformed: " + re.sub(r"\s+", " ", str(e)), + } + return error_msg + + result_size = 0 + error_msg = None + # CASE 0: The result is empty despite a 200 HTTP code (not a + # problem for CONSTRUCT and DESCRIBE queries). + if Path(result_file).stat().st_size == 0 and ( + not query_type == "CONSTRUCT" and not query_type == "DESCRIBE" + ): + result_size = 0 + error_msg = { + "short": "Empty result", + "long": "curl returned with code 200, but the result is empty", + } + + # CASE 1: Just counting the size of the result (TSV or JSON). + elif count_only: + if accept_header in ("text/tab-separated-values", "text/csv"): + result_size = run_command( + f"sed 1d {result_file}", return_output=True + ) + elif accept_header == "application/qlever-results+json": + try: + # sed cmd to get the number between 2nd and 3rd double_quotes + result_size = run_command( + f"jq '.res[0]' {result_file}" + " | sed 's/[^0-9]*\\([0-9]*\\).*/\\1/'", + return_output=True, + ) + except Exception as e: + error_msg = get_json_error_msg(e) + else: + try: + result_size = run_command( + f'jq -r ".results.bindings[0]' + f" | to_entries[0].value.value" + f' | tonumber" {result_file}', + return_output=True, + ) + except Exception as e: + error_msg = get_json_error_msg(e) + + # CASE 2: Downloading the full result (TSV, CSV, Turtle, JSON). 
+ else: + if accept_header in ("text/tab-separated-values", "text/csv"): + result_size = run_command( + f"sed 1d {result_file} | wc -l", return_output=True + ) + elif accept_header == "text/turtle": + result_size = run_command( + f"sed '1d;/^@prefix/d;/^\\s*$/d' {result_file} | wc -l", + return_output=True, + ) + elif accept_header == "application/qlever-results+json": + try: + result_size = run_command( + f'jq -r ".resultsize" {result_file}', + return_output=True, + ) + except Exception as e: + error_msg = get_json_error_msg(e) + else: + try: + result_size = int( + run_command( + f'jq -r ".results.bindings | length" {result_file}', + return_output=True, + ).rstrip() + ) + except Exception as e: + error_msg = get_json_error_msg(e) + return int(result_size), error_msg + + +def get_single_int_result(result_file: str) -> int | None: + """ + When downloading the full result of a query with accept header as + application/sparql-results+json and result_size == 1, get the single + integer result value (if any). + """ + single_int_result = None + try: + single_int_result = int( + run_command( + f'jq -e -r ".results.bindings[0][] | .value" {result_file}', + return_output=True, + ).rstrip() + ) + except Exception: + pass + return single_int_result + + +def restart_server(start_only: bool = False) -> bool: + """ + Restart the SPARQL server after the server hangs i.e. doesn't return + results after timeout + 30s + Extremely useful for benchmarking oxigraph (doesn't have timeout implemented) + and blazegraph (sometimes doesn't terminate query execution at timeout) + Only useful when Qleverfile in CWD and configured properly i.e. 
no command + line args needed to call stop and start commands + """ + stop_cmd = f"{script_name} stop" + start_cmd = f"{script_name} start" + if not start_only: + try: + run_command(stop_cmd) + time.sleep(2) + except Exception as e: + log.warning(f"{script_name} process could not be stopped!: {e}") + try: + run_command(start_cmd) + time.sleep(5) + log.info(f"Successfully restarted {engine_name} server after hang!") + return True + except Exception as e: + log.warning( + f"{script_name} server could not be restarted. This might affect " + f"the benchmark process!: {e}" + ) + return False + + +def resolve_benchmark_metadata( + cli_name: str | None, + cli_description: str | None, + yml_name: str | None, + yml_description: str | None, + dataset: str | None, +) -> tuple[str | None, str | None]: + """ + Resolve benchmark name and description using priority: + 1. CLI args (highest priority) + 2. YML file fields + 3. Default values derived from dataset name + """ + dataset_name = dataset.capitalize() if dataset else None + default_description = ( + f"{dataset_name} benchmark ran using {script_name} benchmark-queries" + if dataset_name + else None + ) + benchmark_name = cli_name or yml_name or dataset_name + benchmark_description = ( + cli_description or yml_description or default_description + ) + return benchmark_name, benchmark_description + + +def compute_index_stats() -> tuple[float | None, float | None]: + """ + Compute the index size (Bytes) and time (seconds) if available + """ + index_stats = command_objects["index-stats"] + index_time = index_size = None + index_log_file = next(Path.cwd().glob("*.index-log.txt"), None) + + if index_log_file: + index_args = SimpleNamespace( + time_unit="s", + size_unit="B", + ignore_text_index=False, + name=index_log_file.name.split(".")[0], + ) + durations = index_stats.execute_time(index_args, index_log_file.name) + if len(durations) > 0 and "TOTAL time" in durations: + index_time = durations["TOTAL time"][0] + sizes = 
index_stats.execute_space(index_args) + if len(sizes) > 0 and "TOTAL size" in sizes: + index_size = sizes["TOTAL size"][0] + + return index_time, index_size + + +def get_query_results( + result_file: str, result_size: int, accept_header: str +) -> tuple[list[str], list[list[str]]]: + """ + Return headers and query results as a tuple for various accept headers + """ + if accept_header in ("text/tab-separated-values", "text/csv"): + separator = "," if accept_header == "text/csv" else "\t" + get_result_cmd = f"sed -n '1,{result_size + 1}p' {result_file}" + results_str = run_command(get_result_cmd, return_output=True) + results = results_str.splitlines() + reader = csv.reader(StringIO(results_str), delimiter=separator) + headers = next(reader) + results = [row for row in reader] + return headers, results + + elif accept_header == "application/qlever-results+json": + get_result_cmd = ( + f"jq '{{headers: .selected, results: .res[0:{result_size}]}}' " + f"{result_file}" + ) + results_str = run_command(get_result_cmd, return_output=True) + results_json = json.loads(results_str) + return results_json["headers"], results_json["results"] + + elif accept_header == "application/sparql-results+json": + get_result_cmd = ( + f"jq '{{headers: .head.vars, " + f"bindings: .results.bindings[0:{result_size}]}}' " + f"{result_file}" + ) + results_str = run_command(get_result_cmd, return_output=True) + results_json = json.loads(results_str) + results = [] + bindings = results_json.get("bindings", []) + for binding in bindings: + result = [] + if not binding or not isinstance(binding, dict): + results.append([]) + continue + for obj in binding.values(): + value = '"' + obj["value"] + '"' + if obj["type"] == "uri": + value = "<" + value.strip('"') + ">" + elif "datatype" in obj: + value += "^^<" + obj["datatype"] + ">" + elif "xml:lang" in obj: + value += "@" + obj["xml:lang"] + result.append(value) + results.append(result) + return results_json["headers"], results + + else: # text/turtle 
+ graph = rdflib.Graph() + graph.parse(result_file, format="turtle") + headers = ["?subject", "?predicate", "?object"] + results = [] + for i, (s, p, o) in enumerate(graph): + if i >= result_size: + break + results.append([str(s), str(p), str(o)]) + return headers, results + + +def get_result_yml_query_record( + name: str, + description: str, + query: str, + client_time: float, + result: str | dict[str, str], + result_size: int | None, + max_result_size: int, + accept_header: str, + server_restarted: bool, +) -> dict[str, Any]: + """ + Construct a dictionary with query information for output result yaml file + """ + record = { + "name": name, + "description": description, + "query": query, + "runtime_info": {}, + "server_restarted": server_restarted, + } + headers = results = [] + if result_size is None and isinstance(result, dict): + results = f"{result['short']}: {result['long']}" + headers = [] + if result_size and isinstance(result, str): + record["result_size"] = result_size + result_size = ( + max_result_size if result_size > max_result_size else result_size + ) + headers, results = get_query_results( + result, result_size, accept_header + ) + if accept_header == "application/qlever-results+json": + runtime_info_cmd = ( + f"jq 'if .runtimeInformation then" + f" .runtimeInformation else" + f' "null" end\' {result}' + ) + runtime_info_str = run_command( + runtime_info_cmd, return_output=True + ) + if runtime_info_str != "null": + record["runtime_info"] = json.loads(runtime_info_str) + record["runtime_info"]["client_time"] = client_time + record["headers"] = headers + record["results"] = results + return record + + +def write_query_records_to_result_file( + query_data: dict[str, list[dict[str, Any]]], out_file: Path +) -> None: + """ + Write yaml record for all queries to output yaml file + """ + config_yaml = dict_to_yaml(query_data) + with open(out_file, "w") as eval_yaml_file: + eval_yaml_file.write(config_yaml) + log.info("") + log.info( + f"Generated result 
yaml file: {out_file.stem}{out_file.suffix} " + f"in the directory {out_file.parent.resolve()}" + ) + + +class BenchmarkQueriesCommand(QleverCommand): + """ + Class for running a given sequence of benchmark or example queries and + showing their processing times and result sizes. + """ + + def __init__(self): + pass + + def description(self) -> str: + return ( + "Run the given benchmark or example queries and show their " + "processing times and result sizes. Optionally, store the " + "benchmark results in a YML file." + ) + + def should_have_qleverfile(self) -> bool: + return False + + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: + return { + "server": ["host_name", "port", "timeout"], + "runtime": ["system"], + "ui": ["ui_config"], + } + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint" + ) + subparser.add_argument( + "--sparql-endpoint-preset", + choices=[ + "https://qlever.dev/api/wikidata", + "https://qlever.dev/api/uniprot", + "https://qlever.dev/api/pubchem", + "https://qlever.dev/api/osm-planet", + "https://wikidata.demo.openlinksw.com/sparql", + "https://sparql.uniprot.org/sparql", + ], + help="SPARQL endpoint from fixed list (to save typing)", + ) + subparser.add_argument( + "--queries-tsv", + type=str, + default=None, + help=( + "Path to a TSV file containing the benchmark queries " + "(short_query_name, full_sparql_query)" + ), + ) + subparser.add_argument( + "--queries-yml", + type=str, + default=None, + help=( + "Path to a YML file containing the benchmark queries. 
" + "The YML file must follow this structure -> " + "name: , " + "description: , " + "queries: where each query contains: " + "name: , " + "description , " + "query: " + ), + ) + subparser.add_argument( + "--query-ids", + type=str, + default="1-$", + help="Query IDs as comma-separated list of " + "ranges (e.g., 1-5,7,12-$)", + ) + subparser.add_argument( + "--query-regex", + type=str, + help="Only consider example queries matching " + "this regex (using grep -Pi)", + ) + subparser.add_argument( + "--example-queries", + action="store_true", + default=False, + help=( + "Run the example queries for the given --ui-config " + "instead of the benchmark queries from a TSV or YML file" + ), + ) + subparser.add_argument( + "--download-or-count", + choices=["download", "count"], + default="download", + help="Whether to download the full result " + "or just compute the size of the result", + ) + subparser.add_argument( + "--limit", type=int, help="Limit on the number of results" + ) + subparser.add_argument( + "--remove-offset-and-limit", + action="store_true", + default=False, + help="Remove OFFSET and LIMIT from the query", + ) + subparser.add_argument( + "--accept", + type=str, + choices=[ + "text/tab-separated-values", + "text/csv", + "application/sparql-results+json", + "application/qlever-results+json", + "application/octet-stream", + "text/turtle", + "AUTO", + ], + default="application/sparql-results+json", + help="Accept header for the SPARQL query; AUTO means " + "`text/turtle` for CONSTRUCT AND DESCRIBE queries, " + "`application/sparql-results+json` for all others", + ) + subparser.add_argument( + "--clear-cache", + choices=["yes", "no"], + default="no", + help="Clear the cache before each query (only works for QLever)", + ) + subparser.add_argument( + "--width-query-name", + type=int, + default=70, + help="Width for printing the query name", + ) + subparser.add_argument( + "--width-error-message", + type=int, + default=50, + help="Width for printing the error 
message (0 = no limit)", + ) + subparser.add_argument( + "--width-result-size", + type=int, + default=14, + help="Width for printing the result size", + ) + subparser.add_argument( + "--add-query-type-to-description", + action="store_true", + default=False, + help="Add the query type (SELECT, ASK, CONSTRUCT, DESCRIBE, " + "UNKNOWN) to the query description", + ) + subparser.add_argument( + "--show-query", + choices=["always", "never", "on-error"], + default="never", + help="Show the queries that will be executed (always, never, on error)", + ) + subparser.add_argument( + "--show-prefixes", + action="store_true", + default=False, + help="When showing the query, also show the prefixes", + ) + subparser.add_argument( + "--results-dir", + type=str, + default=".", + help=( + "The directory where the YML result file would be saved " + "for the evaluation web app (Default = current working directory)" + ), + ) + subparser.add_argument( + "--result-file", + type=str, + default=None, + help=( + "Base name used for the result YML file, should be of the " + "form `.`, e.g., `wikidata.qlever`" + ), + ) + subparser.add_argument( + "--max-results-output-file", + type=int, + default=5, + help=( + "Maximum number of results per query in the output result " + "YML file (Default = 5)" + ), + ) + subparser.add_argument( + "--benchmark-name", + type=str, + default=None, + help=( + "Benchmark name to be saved in result YML file (This will " + "override the 'name' field in --queries-yml file). This benchmark " + "name would be displayed as header title when comparing RDF Graph " + "Databases on the evaluation web app. Only relevant " + "when --result-file argument is passed." + ), + ) + subparser.add_argument( + "--benchmark-description", + type=str, + default=None, + help=( + "Benchmark description to be saved in result YML file (This " + "will override the 'description' field in --queries-yml file). 
" + "This benchmark description would be displayed as additional " + "help text on the evaluation web app for the given benchmark. " + "Only relevant when --result-file argument is passed." + ), + ) + subparser.add_argument( + "--restart-on-hang", + action="store_true", + help=( + "Enable automatic server recovery during benchmarking. " + "If a query continues running for more than 30 seconds past the " + "configured timeout, the benchmark runner will assume the SPARQL " + "server is stuck. It will then stop and restart the server for " + "the current engine, and resume execution with the next query. " + "NOTE: This only works if all the server parameters for start and " + "stop are configured in the Qleverfile and no arguments are needed " + f"for the {script_name} start and {script_name} stop commands." + ), + ) + + def execute(self, args) -> bool: + # We can't have both `--remove-offset-and-limit` and `--limit`. + if args.remove_offset_and_limit and args.limit: + log.error("Cannot have both --remove-offset-and-limit and --limit") + return False + + # Extract dataset and sparql_engine name from result file + dataset = engine = None + if args.result_file is not None: + result_file_parts = args.result_file.split(".") + if len(result_file_parts) != 2: + log.error( + "The argument of --result-file should be of the form " + "`.`, e.g., `wikidata.qlever`" + ) + return False + dataset, engine = result_file_parts + + # Make sure results_dir is a directory path and if it doesn't + # exist, create the directory + results_dir_path = Path(args.results_dir) + if results_dir_path.exists(): + if not results_dir_path.is_dir(): + log.error( + f"{results_dir_path} exists but is not a directory" + ) + return False + else: + log.info( + f"Creating results directory: {results_dir_path.absolute()}" + ) + results_dir_path.mkdir(parents=True, exist_ok=True) + + # If `args.accept` is `application/sparql-results+json` or + # `application/qlever-results+json` or `AUTO`, we need `jq`. 
+ if args.accept in ( + "application/sparql-results+json", + "application/qlever-results+json", + "AUTO", + ): + try: + subprocess.run( + "jq --version", + shell=True, + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + except Exception as e: + log.error(f"Please install `jq` for {args.accept} ({e})") + return False + + # Ensure unique source for benchmark queries + if not any((args.queries_tsv, args.queries_yml, args.example_queries)): + log.error( + "No benchmark or example queries to read! Either pass benchmark " + "queries using --queries-tsv or --queries-yml, or pass the " + "argument --example-queries to run example queries for the " + f"given ui_config {args.ui_config}" + ) + return False + + if all((args.queries_tsv, args.queries_yml)): + log.error("Cannot have both --queries-tsv and --queries-yml") + return False + + if any((args.queries_tsv, args.queries_yml)) and args.example_queries: + queries_file_arg = "tsv" if args.queries_tsv else "yml" + log.error( + f"Cannot have both --queries-{queries_file_arg} and " + "--example-queries" + ) + return False + + # Handle shortcuts for SPARQL endpoint. + if args.sparql_endpoint_preset: + args.sparql_endpoint = args.sparql_endpoint_preset + + # Limit only works with full result. + if args.limit and args.download_or_count == "count": + log.error("Limit only works with full result") + return False + + # Clear cache only works for QLever. + is_qlever = ( + not args.sparql_endpoint + or args.sparql_endpoint.startswith("https://qlever") + ) + if engine is not None: + is_qlever = is_qlever or "qlever" in engine.lower() + if args.clear_cache == "yes": + if is_qlever: + log.warning( + "Clearing the cache before each query" + " (only works for QLever)" + ) + else: + log.warning( + "Clearing the cache only works for QLever" + ", option `--clear-cache` is ignored" + ) + args.clear_cache = "no" + + # Show what the command will do. 
+ example_queries_cmd = ( + f"curl -sv https://qlever.dev/api/examples/{args.ui_config}" + ) + sparql_endpoint = ( + args.sparql_endpoint or f"{args.host_name}:{args.port}" + ) + + self.show( + f"Obtain queries via: {args.queries_yml or args.queries_tsv or example_queries_cmd}\n" + f"SPARQL endpoint: {sparql_endpoint}\n" + f"Accept header: {args.accept}\n" + f"Download result for each query or just count:" + f" {args.download_or_count.upper()}" + + (f" with LIMIT {args.limit}" if args.limit else ""), + only_show=args.show, + ) + if args.show: + return True + + # Parse queries and extract benchmark name/description from YML. + yml_name = yml_description = None + if args.queries_yml: + yml_name, yml_description, queries = parse_queries_yml( + args.queries_yml + ) + elif args.queries_tsv: + queries = parse_queries_tsv(f"cat {args.queries_tsv}") + else: + queries = parse_queries_tsv(example_queries_cmd) + + filtered_queries = filter_queries( + queries, args.query_ids, args.query_regex + ) + + if len(filtered_queries) == 0 or not filtered_queries[0]: + log.error("No queries to process!") + return False + + # We want the width of the query description to be an uneven number (in + # case we have to truncated it, in which case we want to have a " ... " + # in the middle). + width_query_name_half = args.width_query_name // 2 + width_query_name = 2 * width_query_name_half + 1 + + try: + timeout = int(args.timeout[:-1]) + except ValueError: + timeout = None + + benchmark_name, benchmark_description = resolve_benchmark_metadata( + args.benchmark_name, + args.benchmark_description, + yml_name, + yml_description, + dataset, + ) + + # Launch the queries one after the other and for each print: the + # description, the result size (number of rows), and the query + # processing time (seconds). 
+ query_times = [] + result_sizes = [] + result_yml_query_records = { + "name": benchmark_name, + "description": benchmark_description, + "queries": [], + } + if args.result_file: + if timeout: + result_yml_query_records["timeout"] = timeout + + index_time, index_size = compute_index_stats() + result_yml_query_records["index_time"] = index_time + result_yml_query_records["index_size"] = index_size + + num_failed = 0 + for name, description, query in filtered_queries: + if len(query) == 0: + log.error( + "Could not parse name, description and query, line is:" + ) + log.info("") + log.info(f"{name}\t{description}\t{query}") + return False + query_type = sparql_query_type(query) + if args.add_query_type_to_description or args.accept == "AUTO": + # If no query description, use name and append query type to it + description = f"{description or name} [{query_type}]" + + # Clear the cache. + if args.clear_cache == "yes": + args.server_url = sparql_endpoint + args.complete = False + clear_cache_successful = False + with mute_log(): + clear_cache_successful = ClearCacheCommand().execute(args) + if not clear_cache_successful: + log.warn("Failed to clear the cache") + + # Remove OFFSET and LIMIT (after the last closing bracket). + if args.remove_offset_and_limit or args.limit: + closing_bracket_idx = query.rfind("}") + regexes = [ + re.compile(r"OFFSET\s+\d+\s*", re.IGNORECASE), + re.compile(r"LIMIT\s+\d+\s*", re.IGNORECASE), + ] + for regex in regexes: + match = re.search(regex, query[closing_bracket_idx:]) + if match: + query = ( + query[: closing_bracket_idx + match.start()] + + query[closing_bracket_idx + match.end() :] + ) + + # Limit query. + if args.limit: + query += f" LIMIT {args.limit}" + + # Count query. + if args.download_or_count == "count": + # First find out if there is a FROM clause. 
+ regex_from_clause = re.compile( + r"\s*FROM\s+<[^>]+>\s*", re.IGNORECASE + ) + match_from_clause = re.search(regex_from_clause, query) + from_clause = " " + if match_from_clause: + from_clause = match_from_clause.group(0) + query = ( + query[: match_from_clause.start()] + + " " + + query[match_from_clause.end() :] + ) + # Now we can add the outer SELECT COUNT(*). + query = ( + re.sub( + r"SELECT ", + "SELECT (COUNT(*) AS ?qlever_count_)" + + from_clause + + "WHERE { SELECT ", + query, + count=1, + flags=re.IGNORECASE, + ) + + " }" + ) + + # A bit of pretty-printing. + query = re.sub(r"\s+", " ", query) + query = re.sub(r"\s*\.\s*\}", " }", query) + if args.show_query == "always": + log.info("") + log.info( + colored( + pretty_printed_query( + query, args.show_prefixes, args.system + ), + "cyan", + ) + ) + + # Accept header. For "AUTO", use `text/turtle` for CONSTRUCT + # queries and `application/sparql-results+json` for all others. + accept_header = args.accept + if accept_header == "AUTO": + if query_type == "CONSTRUCT" or query_type == "DESCRIBE": + accept_header = "text/turtle" + else: + accept_header = "application/sparql-results+json" + + # Launch query. 
+ curl_cmd = ( + f"curl -Ls {sparql_endpoint}" + f' -w "HTTP code: %{{http_code}}\\n"' + f' -H "Accept: {accept_header}"' + f" --data-urlencode query={shlex.quote(query)}" + ) + log.debug(curl_cmd) + result_file = ( + f"qlever.example_queries.result.{abs(hash(curl_cmd))}.tmp" + ) + result_size = 0 + single_int_result = None + start_time = time.time() + server_restarted = False + try: + max_time = None + if args.restart_on_hang and timeout: + max_time = timeout + 30 + http_code = run_curl_command( + sparql_endpoint, + headers={"Accept": accept_header}, + params={"query": query}, + result_file=result_file, + max_time=max_time, + ).strip() + time_seconds = time.time() - start_time + if http_code == "200": + error_msg = None + else: + error_msg = { + "short": f"HTTP code: {http_code}", + "long": re.sub( + r"\s+", " ", Path(result_file).read_text() + ), + } + except Exception as e: + time_seconds = time.time() - start_time + + # If curl timed out after hitting max_time = 30s + if "exit code 28" in str(e) and args.restart_on_hang: + server_restarted = restart_server() + # If server is not responding and has crashed + elif ( + "exit code 52" in str(e) or "exit code 7" in str(e) + ) and args.restart_on_hang: + server_restarted = restart_server(start_only=True) + + if args.log_level == "DEBUG": + traceback.print_exc() + error_msg = { + "short": "Exception", + "long": re.sub(r"\s+", " ", str(e)), + } + + # Get result size (via the command line, in order to avoid loading + # a potentially large JSON file into Python, which is slow). 
+ if error_msg is None: + result_size, error_msg = get_result_size( + args.download_or_count == "count", + query_type, + accept_header, + result_file, + ) + if ( + result_size == 1 + and accept_header == "application/sparql-results+json" + and args.download_or_count == "download" + ): + single_int_result = get_single_int_result(result_file) + + # Get the result yaml record if output file needs to be generated + if args.result_file is not None: + result_length = None if error_msg is not None else 1 + result_length = ( + result_size + if args.download_or_count == "download" + and result_length is not None + else result_length + ) + query_results = ( + error_msg if error_msg is not None else result_file + ) + query_record = get_result_yml_query_record( + name=name, + description=description, + query=pretty_printed_query( + query, args.show_prefixes, args.system + ), + client_time=time_seconds, + result=query_results, + result_size=result_length, + max_result_size=args.max_results_output_file, + accept_header=accept_header, + server_restarted=server_restarted, + ) + result_yml_query_records["queries"].append(query_record) + + # Print name, time, result in tabular form. + if len(name) > width_query_name: + name = ( + name[: width_query_name_half - 2] + + " ... " + + name[-width_query_name_half + 2 :] + ) + if error_msg is None: + result_size = int(result_size) + single_int_result = ( + f" [single int result: {single_int_result:,}]" + if single_int_result is not None + else "" + ) + log.info( + f"{name:<{width_query_name}} " + f"{time_seconds:6.2f} s " + f"{result_size:>{args.width_result_size},}" + f"{single_int_result}" + ) + query_times.append(time_seconds) + result_sizes.append(result_size) + else: + num_failed += 1 + if ( + args.width_error_message > 0 + and len(error_msg["long"]) > args.width_error_message + and args.log_level != "DEBUG" + and args.show_query != "on-error" + ): + error_msg["long"] = ( + error_msg["long"][: args.width_error_message - 3] + + "..." 
+ ) + seperator_short_long = ( + "\n" if args.show_query == "on-error" else " " + ) + log.info( + f"{name:<{width_query_name}} " + f"{colored('FAILED ', 'red')}" + f"{colored(error_msg['short'], 'red'):>{args.width_result_size}}" + f"{seperator_short_long}" + f"{colored(error_msg['long'], 'red')}" + ) + if args.show_query == "on-error": + log.info( + colored( + pretty_printed_query( + query, args.show_prefixes, args.system + ), + "cyan", + ) + ) + log.info("") + + # Remove the result file (unless in debug mode). + if args.log_level != "DEBUG": + Path(result_file).unlink(missing_ok=True) + + # Check that each query has a time and a result size, or it failed. + assert len(result_sizes) == len(query_times) + assert len(query_times) + num_failed == len(filtered_queries) + + if args.result_file: + if len(result_yml_query_records["queries"]) != 0: + outfile_name = f"{dataset}.{engine}.results.yaml" + outfile = Path(args.results_dir) / outfile_name + write_query_records_to_result_file( + query_data=result_yml_query_records, + out_file=outfile, + ) + else: + log.error( + f"Nothing to write to output result YML file: {args.result_file}" + ) + + # Show statistics. 
+ if len(query_times) > 0: + n = len(query_times) + total_query_time = sum(query_times) + average_query_time = total_query_time / n + median_query_time = sorted(query_times)[n // 2] + total_result_size = sum(result_sizes) + average_result_size = round(total_result_size / n) + median_result_size = sorted(result_sizes)[n // 2] + query_or_queries = "query" if n == 1 else "queries" + description = f"TOTAL for {n} {query_or_queries}" + log.info("") + log.info( + f"{description:<{width_query_name}} " + f"{total_query_time:6.2f} s " + f"{total_result_size:>14,}" + ) + description = f"AVERAGE for {n} {query_or_queries}" + log.info( + f"{description:<{width_query_name}} " + f"{average_query_time:6.2f} s " + f"{average_result_size:>14,}" + ) + description = f"MEDIAN for {n} {query_or_queries}" + log.info( + f"{description:<{width_query_name}} " + f"{median_query_time:6.2f} s " + f"{median_result_size:>14,}" + ) + + # Show number of failed queries. + if num_failed > 0: + log.info("") + description = "Number of FAILED queries" + num_failed_string = f"{num_failed:>6}" + if num_failed == len(filtered_queries): + num_failed_string += " [all]" + log.info( + colored( + f"{description:<{width_query_name}} {num_failed:>24}", + "red", + ) + ) + + # Return success (has nothing to do with how many queries failed). 
+ return True diff --git a/src/qlever/commands/cache_stats.py b/src/qlever/commands/cache_stats.py index 846bf756..481032c0 100644 --- a/src/qlever/commands/cache_stats.py +++ b/src/qlever/commands/cache_stats.py @@ -17,45 +17,58 @@ def __init__(self): pass def description(self) -> str: - return ("Show how much of the cache is currently being used") + return "Show how much of the cache is currently being used" def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"server": ["host_name", "port"]} def additional_arguments(self, subparser) -> None: - subparser.add_argument("--server-url", - help="URL of the QLever server, default is " - "{host_name}:{port}") - subparser.add_argument("--detailed", - action="store_true", - default=False, - help="Show detailed statistics and settings") + subparser.add_argument( + "--sparql-endpoint", + help="URL of the SPARQL endpoint, default is {host_name}:{port}", + ) + subparser.add_argument( + "--detailed", + action="store_true", + default=False, + help="Show detailed statistics and settings", + ) def execute(self, args) -> bool: # Construct the two curl commands. - server_url = (args.server_url if args.server_url - else f"{args.host_name}:{args.port}") - cache_stats_cmd = (f"curl -s {server_url} " - f"--data-urlencode \"cmd=cache-stats\"") - cache_settings_cmd = (f"curl -s {server_url} " - f"--data-urlencode \"cmd=get-settings\"") + sparql_endpoint = ( + args.sparql_endpoint + if args.sparql_endpoint + else f"{args.host_name}:{args.port}" + ) + cache_stats_cmd = ( + f'curl -s {sparql_endpoint} --data-urlencode "cmd=cache-stats"' + ) + cache_settings_cmd = ( + f'curl -s {sparql_endpoint} --data-urlencode "cmd=get-settings"' + ) # Show them. 
- self.show("\n".join([cache_stats_cmd, cache_settings_cmd]), - only_show=args.show) + self.show( + "\n".join([cache_stats_cmd, cache_settings_cmd]), + only_show=args.show, + ) if args.show: return True # Execute them. try: cache_stats = subprocess.check_output(cache_stats_cmd, shell=True) - cache_settings = subprocess.check_output(cache_settings_cmd, - shell=True) + cache_settings = subprocess.check_output( + cache_settings_cmd, shell=True + ) cache_stats_dict = json.loads(cache_stats) cache_settings_dict = json.loads(cache_settings) + if isinstance(cache_settings_dict, list): + cache_settings_dict = cache_settings_dict[0] except Exception as e: log.error(f"Failed to get cache stats and settings: {e}") return False @@ -64,27 +77,35 @@ def execute(self, args) -> bool: if not args.detailed: cache_size = cache_settings_dict["cache-max-size"] if not cache_size.endswith(" GB"): - log.error(f"Cache size {cache_size} is not in GB, " - f"QLever should return bytes instead") + log.error( + f"Cache size {cache_size} is not in GB, " + f"QLever should return bytes instead" + ) return False else: cache_size = float(cache_size[:-3]) - pinned_size = cache_stats_dict["pinned-size"] / 1e9 - non_pinned_size = cache_stats_dict["non-pinned-size"] / 1e9 + pinned_size = cache_stats_dict["cache-size-pinned"] / 1e9 + non_pinned_size = cache_stats_dict["cache-size-unpinned"] / 1e9 cached_size = pinned_size + non_pinned_size free_size = cache_size - cached_size if cached_size == 0: log.info(f"Cache is empty, all {cache_size:.1f} GB available") else: - log.info(f"Pinned queries : " - f"{pinned_size:5.1f} GB of {cache_size:5.1f} GB" - f" [{pinned_size / cache_size:5.1%}]") - log.info(f"Non-pinned queries : " - f"{non_pinned_size:5.1f} GB of {cache_size:5.1f} GB" - f" [{non_pinned_size / cache_size:5.1%}]") - log.info(f"FREE : " - f"{free_size:5.1f} GB of {cache_size:5.1f} GB" - f" [{1 - cached_size / cache_size:5.1%}]") + log.info( + f"Pinned queries : " + f"{pinned_size:5.1f} GB of 
{cache_size:5.1f} GB" + f" [{pinned_size / cache_size:5.1%}]" + ) + log.info( + f"Non-pinned queries : " + f"{non_pinned_size:5.1f} GB of {cache_size:5.1f} GB" + f" [{non_pinned_size / cache_size:5.1%}]" + ) + log.info( + f"FREE : " + f"{free_size:5.1f} GB of {cache_size:5.1f} GB" + f" [{1 - cached_size / cache_size:5.1%}]" + ) return True # Complete version. @@ -96,6 +117,7 @@ def show_dict_as_table(key_value_pairs): if re.match(r"^\d+\.\d+$", value): value = "{:.2f}".format(float(value)) log.info(f"{key.ljust(max_key_len)} : {value}") + show_dict_as_table(cache_stats_dict.items()) log.info("") show_dict_as_table(cache_settings_dict.items()) diff --git a/src/qlever/commands/clear_cache.py b/src/qlever/commands/clear_cache.py index a92713f4..30251cf9 100644 --- a/src/qlever/commands/clear_cache.py +++ b/src/qlever/commands/clear_cache.py @@ -1,11 +1,11 @@ from __future__ import annotations import re -import subprocess from qlever.command import QleverCommand from qlever.commands.cache_stats import CacheStatsCommand from qlever.log import log +from qlever.util import run_command class ClearCacheCommand(QleverCommand): @@ -22,12 +22,12 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"server": ["host_name", "port", "access_token"]} def additional_arguments(self, subparser) -> None: subparser.add_argument( - "--server-url", + "--sparql-endpoint", help="URL of the QLever server, default is {host_name}:{port}", ) subparser.add_argument( @@ -38,17 +38,19 @@ def additional_arguments(self, subparser) -> None: ) def execute(self, args) -> bool: + # Determine SPARQL endpoint. + sparql_endpoint = ( + args.sparql_endpoint + if args.sparql_endpoint + else (f"{args.host_name}:{args.port}") + ) + # Construct command line and show it. 
- clear_cache_cmd = "curl -s" - if args.server_url: - clear_cache_cmd += f" {args.server_url}" - else: - clear_cache_cmd += f" {args.host_name}:{args.port}" - cmd_val = "clear-cache-complete" if args.complete else "clear-cache" - clear_cache_cmd += f' --data-urlencode "cmd={cmd_val}"' + clear_cache_cmd = f"curl -s {sparql_endpoint} -d cmd=clear-cache" if args.complete: clear_cache_cmd += ( - f" --data-urlencode access-token=" f'"{args.access_token}"' + f"-complete" + f' --data-urlencode access-token="{args.access_token}"' ) self.show(clear_cache_cmd, only_show=args.show) if args.show: @@ -57,13 +59,7 @@ def execute(self, args) -> bool: # Execute the command. try: clear_cache_cmd += ' -w " %{http_code}"' - result = subprocess.run( - clear_cache_cmd, - shell=True, - capture_output=True, - text=True, - check=True, - ).stdout + result = run_command(clear_cache_cmd, return_output=True) match = re.match(r"^(.*) (\d+)$", result, re.DOTALL) if not match: raise Exception(f"Unexpected output:\n{result}") diff --git a/src/qlever/commands/extract_queries.py b/src/qlever/commands/extract_queries.py index a9802029..edea5648 100644 --- a/src/qlever/commands/extract_queries.py +++ b/src/qlever/commands/extract_queries.py @@ -20,7 +20,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name"]} def additional_arguments(self, subparser) -> None: @@ -43,6 +43,12 @@ def additional_arguments(self, subparser) -> None: default="log-queries.txt", help="Output file for the extracted queries (default: `log-queries.txt`)", ) + subparser.add_argument( + "--use-alive-check-tag-as-description-base", + action="store_true", + help="Use the tag from 'Alive check' messages" + " as the base for query descriptions (default: False)", + ) def execute(self, args) -> bool: # Show what the command does. 
@@ -74,11 +80,12 @@ def execute(self, args) -> bool: for line in log_file: # An "Alive check" message contains a tag, which we use as the base # name of the query description. - alive_check_regex = r"Alive check with message \"(.*)\"" - match = re.search(alive_check_regex, line) - if match: - description_base = match.group(1) - continue + if args.use_alive_check_tag_as_description_base: + alive_check_regex = r"Alive check with message \"(.*)\"" + match = re.search(alive_check_regex, line) + if match: + description_base = match.group(1) + continue # A new query in the log. if "Processing the following SPARQL query" in line: diff --git a/src/qlever/commands/get_data.py b/src/qlever/commands/get_data.py index b27eca5f..d77071f0 100644 --- a/src/qlever/commands/get_data.py +++ b/src/qlever/commands/get_data.py @@ -21,7 +21,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name", "get_data_cmd"], "index": ["input_files"]} def additional_arguments(self, subparser) -> None: diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index 41c25fff..b18bb02d 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -30,19 +30,23 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name", "format"], "index": [ "input_files", "cat_input_files", + "encode_as_id", "multi_input_json", "parallel_parsing", "settings_json", + "materialized_views", + "vocabulary_type", "index_binary", "only_pso_and_pos_permutations", "ulimit", "use_patterns", + "add_has_word_triples", "text_index", "stxxl_memory", "parser_buffer_size", @@ -98,8 +102,7 @@ def get_input_options_for_json(self, 
args) -> str: # Check that `input_spec` is a dictionary. if not isinstance(input_spec, dict): raise self.InvalidInputJson( - f"Element {i} in `MULTI_INPUT_JSON` must be a JSON " - "object", + f"Element {i} in `MULTI_INPUT_JSON` must be a JSON object", input_spec, ) # For each `input_spec`, we must have a command. @@ -184,6 +187,7 @@ def execute(self, args) -> bool: index_cmd = ( f"{args.cat_input_files} | {args.index_binary}" f" -i {args.name} -s {args.name}.settings.json" + f" --vocabulary-type {args.vocabulary_type}" f" -F {args.format} -f -" ) if args.parallel_parsing: @@ -199,6 +203,7 @@ def execute(self, args) -> bool: index_cmd = ( f"{args.index_binary}" f" -i {args.name} -s {args.name}.settings.json" + f" --vocabulary-type {args.vocabulary_type}" f" {input_options}" ) else: @@ -212,17 +217,20 @@ def execute(self, args) -> bool: return False # Add remaining options. + if args.encode_as_id: + index_cmd += f" --encode-as-id {args.encode_as_id}" if args.only_pso_and_pos_permutations: - index_cmd += " --only-pso-and-pos-permutations --no-patterns" - if not args.use_patterns: + index_cmd += " --only-pso-and-pos-permutations" + if args.use_patterns == "no": index_cmd += " --no-patterns" + if args.add_has_word_triples: + index_cmd += " --add-has-word-triples" if args.text_index in [ "from_text_records", "from_text_records_and_literals", ]: index_cmd += ( - f" -w {args.name}.wordsfile.tsv" - f" -d {args.name}.docsfile.tsv" + f" -w {args.name}.wordsfile.tsv -d {args.name}.docsfile.tsv" ) if args.text_index in [ "from_literals", @@ -233,7 +241,11 @@ def execute(self, args) -> bool: index_cmd += f" --stxxl-memory {args.stxxl_memory}" if args.parser_buffer_size: index_cmd += f" --parser-buffer-size {args.parser_buffer_size}" - index_cmd += f" | tee {args.name}.index-log.txt" + if args.materialized_views: + index_cmd += ( + f" --materialized-views {shlex.quote(args.materialized_views)}" + ) + index_cmd += f" 2>&1 | tee {args.name}.index-log.txt" # If the total file size 
is larger than 10 GB, set ulimit (such that a # large number of open files is allowed). @@ -266,10 +278,8 @@ def execute(self, args) -> bool: if args.show: return True - # When running natively, check if the binary exists and works. - if args.system == "native": - if not binary_exists(args.index_binary, "index-binary"): - return False + if not binary_exists(args.index_binary, "index-binary", args): + return False # Check if all of the input files exist. for pattern in shlex.split(args.input_files): @@ -300,8 +310,7 @@ def execute(self, args) -> bool: ): if Containerize.is_running(args.system, args.index_container): log.info( - "Another index process is running, trying to stop " - "it ..." + "Another index process is running, trying to stop it ..." ) log.info("") try: diff --git a/src/qlever/commands/index_stats.py b/src/qlever/commands/index_stats.py index b997b8c7..d1c96b76 100644 --- a/src/qlever/commands/index_stats.py +++ b/src/qlever/commands/index_stats.py @@ -9,6 +9,261 @@ from qlever.util import get_total_file_size +def compute_durations( + lines: list[str], + time_unit: str, + ignore_text_index: bool, +) -> dict[str, tuple[float | None, str]]: + """ + Parse index build log lines and compute the duration of each + indexing phase. Returns a dict mapping phase names (e.g. + "Parse input", "TOTAL time") to (duration, unit) tuples. The + duration is None if the phase timestamps are missing. Returns + an empty dict on error. + """ + + # Helper function that finds the next line matching the given `regex`, + # starting from `current_line`, and extracts the time. Returns a tuple + # of the time and the regex match object. + # + # If `update_current_line` is `False`, then `current_line` will not be + # updated by this call. + # + # Otherwise, and this is the default behavior, `current_line` will be + # updated to the line after the first match, or one beyond the last + # line if no match is found. 
+ current_line = 0 + + def find_next_line(regex: str, update_current_line: bool = True): + nonlocal lines + nonlocal current_line + current_line_backup = current_line + # Find starting from `current_line`. + while current_line < len(lines): + line = lines[current_line] + current_line += 1 + timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" + timestamp_format = "%Y-%m-%d %H:%M:%S" + regex_match = re.search(regex, line) + if regex_match: + try: + return datetime.strptime( + re.match(timestamp_regex, line).group(), + timestamp_format, + ), regex_match + except Exception as e: + log.error( + f"Could not parse timestamp of form " + f'"{timestamp_regex}" from line ' + f' "{line.rstrip()}" ({e})' + ) + # If we get here, we did not find a matching line. + if not update_current_line: + current_line = current_line_backup + return None, None + + # Find the lines matching the key_lines_regex and extract the time + # information from them. + overall_begin, _ = find_next_line(r"INFO:\s*Processing") + merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab") + convert_begin, _ = find_next_line(r"INFO:\s*Converting triples") + perm_begin_and_info = [] + while True: + # Find the next line that starts a permutation. + # + # NOTE: Should work for the old and new format of the index log + # file (old format: "Creating a pair" + names of permutations in + # line "Writing meta data for ..."; new format: name of + # permutations already in line "Creating permutations ..."). 
+ perm_begin, _ = find_next_line( + r"INFO:\s*Creating a pair", update_current_line=False + ) + if perm_begin is None: + perm_begin, perm_info = find_next_line( + r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)", + update_current_line=False, + ) + else: + _, perm_info = find_next_line( + r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)", + update_current_line=False, + ) + if perm_info is None: + break + perm_begin_and_info.append((perm_begin, perm_info)) + convert_end = ( + perm_begin_and_info[0][0] if len(perm_begin_and_info) > 0 else None + ) + normal_end, _ = find_next_line(r"INFO:\s*Index build completed") + text_begin, _ = find_next_line( + r"INFO:\s*Adding text index", update_current_line=False + ) + text_end, _ = find_next_line( + r"INFO:\s*Text index build comp", update_current_line=False + ) + if ignore_text_index: + text_begin = text_end = None + + # Check whether at least the first phase is done. + if overall_begin is None: + log.error("Missing line that index build has started") + return {} + if overall_begin and not merge_begin: + log.error( + "According to the log file, the index build " + "has started, but is still in its first " + "phase (parsing the input)" + ) + return {} + + def duration( + start_end_pairs: list[tuple[datetime | None, datetime | None]], + ) -> float | None: + """ + Compute the total duration across all valid (start, end) pairs, + converted to `resolved_time_unit`. Returns None if no pair has + both timestamps available. + """ + nonlocal resolved_time_unit + num_start_end_pairs = 0 + diff_seconds = 0 + for start, end in start_end_pairs: + if start and end: + diff_seconds += (end - start).total_seconds() + num_start_end_pairs += 1 + if num_start_end_pairs > 0: + return diff_seconds / get_time_unit_factor(resolved_time_unit) + return None + + # Determine the time unit based on the duration of the first phase + # (parsing), unless explicitly specified. 
+ parse_duration = None + if merge_begin and overall_begin: + parse_duration = (merge_begin - overall_begin).total_seconds() + resolved_time_unit = get_time_unit(time_unit, parse_duration) + + # Compute durations for each indexing phase. Each entry maps a + # phase name to (duration_in_time_unit, time_unit). + durations = {} + durations["Parse input"] = ( + duration([(overall_begin, merge_begin)]), + resolved_time_unit, + ) + durations["Build vocabularies"] = ( + duration([(merge_begin, convert_begin)]), + resolved_time_unit, + ) + durations["Convert to global IDs"] = ( + duration([(convert_begin, convert_end)]), + resolved_time_unit, + ) + for i in range(len(perm_begin_and_info)): + perm_begin, perm_info = perm_begin_and_info[i] + perm_end = ( + perm_begin_and_info[i + 1][0] + if i + 1 < len(perm_begin_and_info) + else normal_end + ) + perm_info_text = ( + perm_info.group(1).replace(" and ", " & ") + if perm_info + else f"#{i + 1}" + ) + perm_key = f"Permutation {perm_info_text}" + if perm_key in durations: + suffix = 2 + while f"{perm_key} ({suffix})" in durations: + suffix += 1 + perm_key = f"{perm_key} ({suffix})" + durations[perm_key] = ( + duration([(perm_begin, perm_end)]), + resolved_time_unit, + ) + durations["Text index"] = ( + duration([(text_begin, text_end)]), + resolved_time_unit, + ) + # TOTAL includes the text index time if it was built separately. + if text_begin and text_end: + durations["TOTAL time"] = ( + duration([(overall_begin, normal_end), (text_begin, text_end)]), + resolved_time_unit, + ) + elif normal_end: + durations["TOTAL time"] = ( + duration([(overall_begin, normal_end)]), + resolved_time_unit, + ) + return durations + + +def get_time_unit(time_unit: str, parse_duration: float | None) -> str: + """ + Resolve the time unit. If `time_unit` is not "auto", return it + as-is. Otherwise, pick a unit based on how long the parse phase + took (seconds if < 200s, minutes if < 1h, hours otherwise). 
+ """ + if time_unit != "auto": + return time_unit + time_unit = "h" + if parse_duration is not None: + if parse_duration < 200: + time_unit = "s" + elif parse_duration < 3600: + time_unit = "min" + return time_unit + + +def get_time_unit_factor(time_unit: str) -> int: + """Return the number of seconds per `time_unit`.""" + return {"s": 1, "min": 60, "h": 3600}[time_unit] + + +def compute_sizes( + raw_sizes: dict[str, int], size_unit: str +) -> dict[str, tuple[float, str]]: + """ + Convert raw byte sizes into display-ready (size, unit) tuples. + `raw_sizes` maps category names ("index", "vocabulary", "text", + "total") to sizes in bytes. Returns a dict mapping display labels + (e.g. "Files index.*", "TOTAL size") to (converted_size, unit). + """ + size_unit = get_size_unit(size_unit, raw_sizes["total"]) + unit_factor = get_size_unit_factor(size_unit) + sizes = {k: v / unit_factor for k, v in raw_sizes.items()} + + sizes_to_show = {} + sizes_to_show["Files index.*"] = (sizes["index"], size_unit) + sizes_to_show["Files vocabulary.*"] = (sizes["vocabulary"], size_unit) + if sizes["text"] > 0: + sizes_to_show["Files text.*"] = (sizes["text"], size_unit) + sizes_to_show["TOTAL size"] = (sizes["total"], size_unit) + return sizes_to_show + + +def get_size_unit(size_unit: str, total_size: int) -> str: + """ + Resolve the size unit. If `size_unit` is not "auto", return it + as-is. Otherwise, pick the largest unit that keeps the total + size >= 1 in that unit. + """ + if size_unit != "auto": + return size_unit + size_unit = "TB" + if total_size < 1e6: + size_unit = "B" + elif total_size < 1e9: + size_unit = "MB" + elif total_size < 1e12: + size_unit = "GB" + return size_unit + + +def get_size_unit_factor(size_unit: str) -> int | float: + """Return the number of bytes per `size_unit`.""" + return {"B": 1, "MB": 1e6, "GB": 1e9, "TB": 1e12}[size_unit] + + class IndexStatsCommand(QleverCommand): """ Class for executing the `index-stats` command. 
@@ -23,7 +278,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name"]} def additional_arguments(self, subparser) -> None: @@ -58,9 +313,13 @@ def additional_arguments(self, subparser) -> None: help="The size unit", ) - def execute_time(self, args, log_file_name) -> bool: + def execute_time( + self, args, log_file_name: str + ) -> dict[str, tuple[float | None, str]]: """ - Part of `execute` that shows the time used. + Read the index build log file(s) and delegate to + `compute_durations` for the actual parsing and computation. + Returns an empty dict on I/O error. """ # Read the content of `log_file_name` into a list of lines. @@ -69,213 +328,40 @@ def execute_time(self, args, log_file_name) -> bool: lines = log_file.readlines() except Exception as e: log.error(f"Problem reading index log file {log_file_name}: {e}") - return False + return {} # If there is a separate `add-text-index-log.txt` file, append those # lines. + text_log_file_name = f"{args.name}.text-index-log.txt" try: - text_log_file_name = f"{args.name}.text-index-log.txt" if Path(text_log_file_name).exists(): with open(text_log_file_name, "r") as text_log_file: lines.extend(text_log_file.readlines()) except Exception as e: log.error( - f"Problem reading text index log file " f"{text_log_file_name}: {e}" + f"Problem reading text index log file " + f"{text_log_file_name}: {e}" ) - return False + return {} - # Helper function that finds the next line matching the given `regex`, - # starting from `current_line`, and extracts the time. Returns a tuple - # of the time and the regex match object. - # - # If `update_current_line` is `False`, then `current_line` will not be - # updated by this call. 
- # - # Otherwise, and this is the default behavior, `current_line` will be - # updated to the line after the first match, or one beyond the last - # line if no match is found. - current_line = 0 - - def find_next_line(regex, update_current_line=True): - nonlocal lines - nonlocal current_line - current_line_backup = current_line - # Find starting from `current_line`. - while current_line < len(lines): - line = lines[current_line] - current_line += 1 - timestamp_regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" - timestamp_format = "%Y-%m-%d %H:%M:%S" - regex_match = re.search(regex, line) - if regex_match: - try: - return datetime.strptime( - re.match(timestamp_regex, line).group(), timestamp_format - ), regex_match - except Exception as e: - log.error( - f"Could not parse timestamp of form " - f'"{timestamp_regex}" from line ' - f' "{line.rstrip()}" ({e})' - ) - # If we get here, we did not find a matching line. - if not update_current_line: - current_line = current_line_backup - return None, None - - # Find the lines matching the key_lines_regex and extract the time - # information from them. - overall_begin, _ = find_next_line(r"INFO:\s*Processing") - merge_begin, _ = find_next_line(r"INFO:\s*Merging partial vocab") - convert_begin, _ = find_next_line(r"INFO:\s*Converting triples") - perm_begin_and_info = [] - while True: - # Find the next line that starts a permutation. - # - # NOTE: Should work for the old and new format of the index log - # file (old format: "Creating a pair" + names of permutations in - # line "Writing meta data for ..."; new format: name of - # permutations already in line "Creating permutations ..."). 
- perm_begin, _ = find_next_line( - r"INFO:\s*Creating a pair", update_current_line=False - ) - if perm_begin is None: - perm_begin, perm_info = find_next_line( - r"INFO:\s*Creating permutations ([A-Z]+ and [A-Z]+)", - update_current_line=False, - ) - else: - _, perm_info = find_next_line( - r"INFO:\s*Writing meta data for ([A-Z]+ and [A-Z]+)", - update_current_line=False, - ) - if perm_info is None: - break - perm_begin_and_info.append((perm_begin, perm_info)) - convert_end = ( - perm_begin_and_info[0][0] if len(perm_begin_and_info) > 0 else None - ) - normal_end, _ = find_next_line(r"INFO:\s*Index build completed") - text_begin, _ = find_next_line( - r"INFO:\s*Adding text index", update_current_line=False - ) - text_end, _ = find_next_line( - r"INFO:\s*Text index build comp", update_current_line=False - ) - if args.ignore_text_index: - text_begin = text_end = None - - # Check whether at least the first phase is done. - if overall_begin is None: - log.error("Missing line that index build has started") - return False - if overall_begin and not merge_begin: - log.error( - "According to the log file, the index build " - "has started, but is still in its first " - "phase (parsing the input)" - ) - return False - - # Helper function that shows the duration for a phase (if the start and - # end timestamps are available). - def show_duration(heading, start_end_pairs): - nonlocal time_unit - num_start_end_pairs = 0 - diff_seconds = 0 - for start, end in start_end_pairs: - if start and end: - diff_seconds += (end - start).total_seconds() - num_start_end_pairs += 1 - if num_start_end_pairs > 0: - if time_unit == "h": - diff = diff_seconds / 3600 - elif time_unit == "min": - diff = diff_seconds / 60 - else: - diff = diff_seconds - log.info(f"{heading:<21} : {diff:>6.1f} {time_unit}") - - # Get the times of the various phases (hours or minutes, depending on - # how long the first phase took). 
- time_unit = args.time_unit - if time_unit == "auto": - time_unit = "h" - if merge_begin and overall_begin: - parse_duration = (merge_begin - overall_begin).total_seconds() - if parse_duration < 200: - time_unit = "s" - elif parse_duration < 3600: - time_unit = "min" - show_duration("Parse input", [(overall_begin, merge_begin)]) - show_duration("Build vocabularies", [(merge_begin, convert_begin)]) - show_duration("Convert to global IDs", [(convert_begin, convert_end)]) - for i in range(len(perm_begin_and_info)): - perm_begin, perm_info = perm_begin_and_info[i] - perm_end = ( - perm_begin_and_info[i + 1][0] - if i + 1 < len(perm_begin_and_info) - else normal_end - ) - perm_info_text = ( - perm_info.group(1).replace(" and ", " & ") if perm_info else f"#{i + 1}" - ) - show_duration(f"Permutation {perm_info_text}", [(perm_begin, perm_end)]) - show_duration("Text index", [(text_begin, text_end)]) - if text_begin and text_end: - log.info("") - show_duration( - "TOTAL time", [(overall_begin, normal_end), (text_begin, text_end)] - ) - elif normal_end: - log.info("") - show_duration("TOTAL time", [(overall_begin, normal_end)]) - return True + return compute_durations(lines, args.time_unit, args.ignore_text_index) - def execute_space(self, args) -> bool: + def execute_space(self, args) -> dict[str, tuple[float, str]]: """ - Part of `execute` that shows the space used. + Compute the disk space used by each group of index files. Returns + a dict mapping display labels (e.g. "Files index.*", "TOTAL size") + to (size, unit) tuples, where size is already converted to `unit`. """ - - # Get the sizes for the various groups of index files. - index_size = get_total_file_size([f"{args.name}.index.*"]) - vocab_size = get_total_file_size([f"{args.name}.vocabulary.*"]) - text_size = get_total_file_size([f"{args.name}.text.*"]) + # Collect raw sizes in bytes. 
+ sizes = {} + for size_type in ["index", "vocabulary", "text"]: + sizes[size_type] = get_total_file_size( + [f"{args.name}.{size_type}.*"] + ) if args.ignore_text_index: - text_size = 0 - total_size = index_size + vocab_size + text_size - - # Determing the proper unit for the size. - size_unit = args.size_unit - if size_unit == "auto": - size_unit = "TB" - if total_size < 1e6: - size_unit = "B" - elif total_size < 1e9: - size_unit = "MB" - elif total_size < 1e12: - size_unit = "GB" - - # Helper function for showing the size in a uniform way. - def show_size(heading, size): - nonlocal size_unit - if size_unit == "GB": - size /= 1e9 - elif size_unit == "MB": - size /= 1e6 - elif size_unit == "TB": - size /= 1e12 - if size_unit == "B": - log.info(f"{heading:<21} : {size:,} {size_unit}") - else: - log.info(f"{heading:<21} : {size:>6.1f} {size_unit}") - - show_size("Files index.*", index_size) - show_size("Files vocabulary.*", vocab_size) - if text_size > 0: - show_size("Files text.*", text_size) - log.info("") - show_size("TOTAL size", total_size) - return True + sizes["text"] = 0 + sizes["total"] = sum(sizes.values()) + + return compute_sizes(sizes, args.size_unit) def execute(self, args) -> bool: return_value = True @@ -290,7 +376,17 @@ def execute(self, args) -> bool: only_show=args.show, ) if not args.show: - return_value &= self.execute_time(args, log_file_name) + durations = self.execute_time(args, log_file_name) + # Display each phase duration, skipping phases with + # missing timestamps (duration is None). 
+ for heading, (duration, time_unit) in durations.items(): + if duration is not None: + if heading == "TOTAL time": + log.info("") + log.info( + f"{heading:<25} : {duration:>6.1f} {time_unit}" + ) + return_value &= len(durations) != 0 if not args.only_time: log.info("") @@ -301,6 +397,15 @@ def execute(self, args) -> bool: only_show=args.show, ) if not args.show: - return_value &= self.execute_space(args) + sizes = self.execute_space(args) + # Display the disk space used by each group of index files. + for heading, (size, size_unit) in sizes.items(): + if heading == "TOTAL size": + log.info("") + if size_unit == "B": + log.info(f"{heading:<25} : {size:,} {size_unit}") + else: + log.info(f"{heading:<25} : {size:>6.1f} {size_unit}") + return_value &= len(sizes) != 0 return return_value diff --git a/src/qlever/commands/log.py b/src/qlever/commands/log.py index 816072bc..34942ec0 100644 --- a/src/qlever/commands/log.py +++ b/src/qlever/commands/log.py @@ -20,7 +20,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name"]} def additional_arguments(self, subparser) -> None: diff --git a/src/qlever/commands/materialized_view.py b/src/qlever/commands/materialized_view.py new file mode 100644 index 00000000..68e6d3f1 --- /dev/null +++ b/src/qlever/commands/materialized_view.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import json +import re +import shlex +import time + +from qlever.command import QleverCommand +from qlever.log import log +from qlever.util import ( + run_command, +) + + +class MaterializedViewCommand(QleverCommand): + """ + Class for executing the `materialized-view` command. 
+ """ + + def __init__(self): + self.materialized_view_name_regex = r"^[A-Za-z0-9-]+$" + pass + + def description(self) -> str: + return "Create a materialized view from the given query" + + def should_have_qleverfile(self) -> bool: + return True + + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: + return { + "data": ["name"], + "server": ["host_name", "port", "access_token"], + } + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "view_name", + type=str, + help="Name of the materialized view", + ) + subparser.add_argument( + "view_query", + type=str, + help="SPARQL query from which to create the materialized view", + ) + subparser.add_argument( + "--sparql-endpoint", + type=str, + help="URL of the SPARQL endpoint (default: :)", + ) + + def execute(self, args) -> bool: + # SPARQL endpoint to use. + sparql_endpoint = ( + args.sparql_endpoint + if args.sparql_endpoint is not None + else f"{args.host_name}:{args.port}" + ) + + # Check that the name of the materialized view is valid. + if not re.match(self.materialized_view_name_regex, args.view_name): + log.error( + f"The name for the materialized view must match " + f"the regex {self.materialized_view_name_regex}" + ) + return False + + # Command for building the materialized view. + url = ( + f"{sparql_endpoint}" + f"?cmd=write-materialized-view" + f"&view-name={args.view_name}" + ) + materialized_view_cmd = ( + f"curl -s {shlex.quote(url)} " + f"-H 'Authorization: Bearer {args.access_token}' " + f"-H 'Content-type: application/sparql-query' " + f"-d {shlex.quote(args.view_query)}" + ) + self.show(materialized_view_cmd, only_show=args.show) + if args.show: + return True + + # Run the command (and time it). + time_start = time.monotonic() + try: + log.info("Creating the materialized view ... 
" + "(this may take a while, depending on the complexity " + "of the query and the size of the result)") + log.info("") + result = run_command(materialized_view_cmd, return_output=True) + except Exception as e: + log.error(f"Creating the materialized view failed: {e}") + return False + time_end = time.monotonic() + duration_seconds = round(time_end - time_start) + + # Try to parse the result (should be JSON). + try: + result_json = json.loads(result) + view_name = result_json.get("materialized-view-written") + log.info( + f"Materialized view '{view_name}' created successfully " + f"in {duration_seconds:,} seconds" + ) + except Exception as e: + log.error(f'Failed to parse JSON from "{result}": {e}') + + return True diff --git a/src/qlever/commands/query.py b/src/qlever/commands/query.py index c8c0d71f..fa727c4a 100644 --- a/src/qlever/commands/query.py +++ b/src/qlever/commands/query.py @@ -37,7 +37,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"server": ["host_name", "port", "access_token"]} def additional_arguments(self, subparser) -> None: @@ -72,6 +72,7 @@ def additional_arguments(self, subparser) -> None: "application/sparql-results+json", "application/sparql-results+xml", "application/qlever-results+json", + "application/octet-stream", ], default="text/tab-separated-values", help="Accept header for the SPARQL query", @@ -94,7 +95,7 @@ def execute(self, args) -> bool: if args.pin_to_cache: args.accept = "application/qlever-results+json" curl_cmd_additions = ( - f" --data pinresult=true --data send=0" + f" --data pin-result=true --data send=0" f" --data access-token=" f"{shlex.quote(args.access_token)}" f" | jq .resultsize | numfmt --grouping" diff --git a/src/qlever/commands/rebuild_index.py b/src/qlever/commands/rebuild_index.py new file mode 100644 index 00000000..b3bb7062 
class RebuildIndexCommand(QleverCommand):
    """
    Class for executing the `rebuild-index` command.

    The command sends `cmd=rebuild-index` to the running server, which
    writes a new index into a fresh directory. Afterwards, depending on the
    arguments, the current index is moved away, the new index is moved to
    its final location, the server is restarted, and old index directories
    are pruned.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        return "Rebuild the index from the current data (including updates)"

    def should_have_qleverfile(self) -> bool:
        return True

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        return {
            "data": ["name"],
            "server": ["host_name", "port", "access_token"],
            "runtime": ["server_container"],
        }

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "--new-index-dir",
            type=str,
            help="Target directory for the new index (default: not set, "
            "move the old index instead; see `--old-index-dir`)",
        )
        subparser.add_argument(
            "--old-index-dir",
            type=str,
            help="Directory where to move the current index once the rebuild "
            "is finished (default: subdirectory "
            "`previous.YYYY-MM-DDTHH:MM:SSZ`, where the timestamp is the "
            "time of the earliest index file)",
        )
        subparser.add_argument(
            "--new-index-dir-basename",
            type=str,
            default="rebuild.",
            help="Basename prefix for the new index directory when "
            "`--new-index-dir` is not specified (default: `rebuild.`)",
        )
        subparser.add_argument(
            "--old-index-dir-basename",
            type=str,
            default="previous.",
            help="Basename prefix for the old index directory when "
            "`--old-index-dir` is not specified (default: `previous.`)",
        )
        subparser.add_argument(
            "--keep-old-index-dirs",
            choices=["all", "none", "oldest", "newest"],
            default="oldest",
            help="Which old index directories to keep: all (keep all), "
            "none (delete all), oldest (keep only oldest), "
            "newest (keep only newest) (default: oldest)",
        )
        subparser.add_argument(
            "--index-name",
            type=str,
            help="Base name of the files of the new index (default: use "
            "the same basename as for the current index)",
        )
        subparser.add_argument(
            "--restart-when-finished",
            action="store_true",
            default=False,
            help="When the rebuild is finished, stop the server with the old "
            "index and start it again with the new index",
        )

    def execute(self, args) -> bool:
        """
        Rebuild the index and, if requested, swap it in and restart.

        Returns `True` on success (or when only showing the commands) and
        `False` as soon as one of the steps fails; every failure is logged.
        """
        # Either `--new-index-dir` or `--old-index-dir`.
        if args.new_index_dir is not None and args.old_index_dir is not None:
            log.error(
                "Please specify either --new-index-dir (the target directory "
                "for the new index) or --old-index-dir (the directory where "
                "to move the current index), but not both"
            )
            return False

        # Get the list of all files from the current index and get the date of
        # the earliest one (in UTC). Add the `Qleverfile` as well.
        old_index_files = get_existing_index_files(
            args.name, add_non_essential=True
        )
        if not old_index_files:
            # Without this check, `min()` below fails with an unhelpful
            # `ValueError` about an empty sequence.
            log.error(
                f"No existing index files found for '{args.name}', "
                "there is nothing to rebuild"
            )
            return False
        old_index_date = time.strftime(
            "%Y-%m-%dT%H:%M:%SZ",
            time.gmtime(min(Path(f).stat().st_mtime for f in old_index_files)),
        )
        new_index_date = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        old_index_files.append("Qleverfile")

        # Default values for arguments.
        #
        # NOTE 1: When `--old-index-dir` is specified but not `--new-index-dir`,
        # we nevertheless first build the new index in a temporary directory,
        # and only when that is successful do we move the current index to the
        # directory specified by `--old-index-dir` and move the new index to
        # the current index directory. That way, if the rebuild fails, we still
        # have the current index in its original location.
        #
        # NOTE 2: As a consequence of this logic, `args.new_index_dir` is
        # always defined after this block, even when it was not specified on
        # the command line. `args.old_index_dir` stays `None` when
        # `--new-index-dir` was given, which means "leave the old index
        # where it is".
        if args.index_name is None:
            args.index_name = args.name
        if args.new_index_dir is None:
            args.new_index_dir = (
                f"{args.new_index_dir_basename}{new_index_date}.tmp"
            )
            if args.old_index_dir is None:
                # Check if this is the first rebuild (no previous.*
                # directories exist).
                existing_previous_dirs = list(
                    Path(".").glob(f"{args.old_index_dir_basename}*")
                )
                is_first_rebuild = not existing_previous_dirs
                args.old_index_dir = (
                    f"{args.old_index_dir_basename}{old_index_date}"
                    + (".ORIGINAL" if is_first_rebuild else "")
                )
        if args.new_index_dir.endswith("/"):
            args.new_index_dir = args.new_index_dir[:-1]

        # Check that the new index directory either does not exist or is empty.
        # Same for the old index directory, if specified.
        new_index_path = Path(args.new_index_dir)
        if new_index_path.exists() and any(new_index_path.iterdir()):
            log.error(
                f"The target directory '{args.new_index_dir}' for the new "
                "index already exists and is not empty; please specify an "
                "empty or non-existing directory"
            )
            return False
        if args.old_index_dir is not None:
            old_index_path = Path(args.old_index_dir)
            if old_index_path.exists() and any(old_index_path.iterdir()):
                log.error(
                    f"The target directory '{args.old_index_dir}' for the "
                    "old index already exists and is not empty; please "
                    "specify an empty or non-existing directory"
                )
                return False

        # Split `new_index_dir` into path and dir name. For example, if
        # `new_index_dir` is `path/to/index`, then the path is `path/to` and
        # the dir name is `index`.
        #
        # NOTE: We keep this separate because we can always create a
        # subdirectory in the current directory (even when running in a
        # container), but not necessarily a directory at an arbitrary path. If
        # a path outside the current directory is desired, we move the index
        # there after it has been built.
        new_index_dir_path = str(Path(args.new_index_dir).parent)
        new_index_dir_name = str(Path(args.new_index_dir).name)
        log_file_name = f"{args.index_name}.rebuild-index-log.txt"

        # Note which indexes we have to move when done.
        move_new_index_when_done = new_index_dir_path != "."
        move_old_index_when_done = args.old_index_dir is not None

        # Shell-quoted variants of everything that ends up on a command
        # line (`shlex.quote` leaves harmless strings unchanged).
        quoted_dir_name = shlex.quote(new_index_dir_name)
        quoted_index_base = shlex.quote(
            f"{new_index_dir_name}/{args.index_name}"
        )
        quoted_log_file = shlex.quote(f"{new_index_dir_name}/{log_file_name}")

        # Command for rebuilding the index.
        mkdir_cmd = (
            f"mkdir -p {quoted_dir_name} && "
            f"cp -a Qleverfile {quoted_dir_name}"
        )
        rebuild_index_cmd = (
            f"curl -s {args.host_name}:{args.port} "
            f"-d cmd=rebuild-index "
            f"-d index-name={quoted_index_base} "
            f"-d access-token={shlex.quote(args.access_token)}"
        )
        move_new_index_cmd = (
            f"mv {quoted_dir_name} {shlex.quote(new_index_dir_path)}"
        )
        move_old_index_cmd = (
            f"mkdir -p {shlex.quote(args.old_index_dir)} && "
            f"mv {' '.join(shlex.quote(f) for f in old_index_files)} "
            f"{shlex.quote(args.old_index_dir)} && "
            f"mv {quoted_dir_name}/* . && "
            f"rmdir {quoted_dir_name}"
        )
        restart_server_cmd = "qlever stop && qlever start"
        if not move_old_index_when_done:
            # The new index stays in its own directory, so restart there.
            # (No `$` before the placeholder: that would make the shell
            # expand the undefined variable `$qlever`.)
            restart_server_cmd = (
                f"cd {shlex.quote(args.new_index_dir)} && "
                f"{restart_server_cmd}"
            )

        # Show the command lines.
        cmds_to_show = [mkdir_cmd, rebuild_index_cmd]
        if move_old_index_when_done:
            cmds_to_show.append(move_old_index_cmd)
        if move_new_index_when_done:
            cmds_to_show.append(move_new_index_cmd)
        if args.restart_when_finished:
            cmds_to_show.append(restart_server_cmd)
        self.show("\n".join(cmds_to_show), only_show=args.show)
        if args.show:
            return True

        # Create the index directory and the log file.
        try:
            run_command(mkdir_cmd)
        except Exception as e:
            log.error(f"Creating the index directory failed: {e}")
            return False

        # Show the server log while rebuilding the index.
        #
        # NOTE: This will only work satisfactorily when no other queries are
        # being processed at the same time. It would be better if QLever
        # logged the rebuild-index output to a separate log file.
        tail_cmd = (
            f"touch {quoted_log_file} && "
            f"exec tail -n 0 -f {quoted_log_file}"
        )
        tail_proc = subprocess.Popen(tail_cmd, shell=True)

        # Run the index rebuild command (and time it). The `finally` makes
        # sure the `tail` process is stopped on every path.
        try:
            time_start = time.monotonic()
            try:
                run_command(rebuild_index_cmd, show_output=False)
            except Exception as e:
                log.error(f"Rebuilding the index failed: {e}")
                return False
            time_end = time.monotonic()
            duration_seconds = round(time_end - time_start)
            log.info("")
            rebuild_done_msg = f"Rebuilt index in {duration_seconds:,} seconds"
            if new_index_dir_path == ".":
                rebuild_done_msg += (
                    f", in the new directory '{args.new_index_dir}'"
                )
            log.info(rebuild_done_msg)
        finally:
            tail_proc.terminate()
            tail_proc.wait()

        # Move the old index to the specified directory, if needed.
        if move_old_index_when_done:
            try:
                log.info(f"Moving the old index to {args.old_index_dir}")
                run_command(move_old_index_cmd)
            except Exception as e:
                log.error(f"Moving the old index failed: {e}")
                return False

        # Move the new index to the specified directory, if needed.
        if move_new_index_when_done:
            try:
                log.info(f"Moving the new index to {args.new_index_dir}")
                run_command(move_new_index_cmd)
            except Exception as e:
                log.error(f"Moving the new index failed: {e}")
                return False

        # Restart the server with the new index, if requested.
        if args.restart_when_finished:
            try:
                log.info("Restarting the server with the new index ...")
                log.info("")
                log.info(colored("Command: start", attrs=["bold"]))
                log.info("")
                run_command(restart_server_cmd, show_output=True)
            except Exception as e:
                log.error(f"Restarting the server failed: {e}")
                return False

        # Clean up old index directories according to `--keep-old-index-dirs`.
        if move_old_index_when_done:
            self._prune_old_index_dirs(args)

        return True

    def _prune_old_index_dirs(self, args) -> None:
        """
        Keep or delete the old index directories in the current directory.

        Considers all subdirectories whose name starts with
        `args.old_index_dir_basename`, ordered by `st_ctime` (per the
        Python docs: creation time on Windows, time of the last metadata
        change on Unix), and applies the `--keep-old-index-dirs` policy.
        Failed deletions are logged, not raised.
        """
        old_index_dirs = sorted(
            (
                path
                for path in Path(".").iterdir()
                if path.is_dir()
                and path.name.startswith(args.old_index_dir_basename)
            ),
            key=lambda path: path.stat().st_ctime,
        )
        if not old_index_dirs:
            return

        log.info("")
        log.info(
            colored(
                "Iterate over old index directories (oldest to "
                "newest), and check which ones to keep or delete "
                f"(keep_old_index_dirs = {args.keep_old_index_dirs}):",
                color="blue",
            )
        )
        last_position = len(old_index_dirs) - 1
        for position, old_dir in enumerate(old_index_dirs):
            keep = {
                "all": True,
                "none": False,
                "oldest": position == 0,
                "newest": position == last_position,
            }[args.keep_old_index_dirs]
            action = "KEEP" if keep else "DELETE"
            log.info(f" {old_dir.name:<50} {action}")

            # Actually perform the deletion.
            if not keep:
                try:
                    shutil.rmtree(old_dir)
                    log.info(f" → Deleted {old_dir.name}")
                except Exception as e:
                    log.error(f" → Failed to delete {old_dir.name}: {e}")

        log.info("")
+ """ + + def __init__(self): + pass + + def description(self) -> str: + return "Reset the updates on the server" + + def should_have_qleverfile(self) -> bool: + return True + + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: + return {"server": ["host_name", "port", "access_token"]} + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "--sparql-endpoint", + help="URL of the QLever server, default is {host_name}:{port}", + ) + + def execute(self, args) -> bool: + reset_cmd = "curl -s" + if args.sparql_endpoint: + reset_cmd += f" {args.sparql_endpoint}" + else: + reset_cmd += f" {args.host_name}:{args.port}" + reset_cmd += f' --data-urlencode "cmd=clear-delta-triples" --data-urlencode "access-token={args.access_token}"' + self.show(reset_cmd, only_show=args.show) + if args.show: + return True + + try: + reset_cmd += ' -w " %{http_code}"' + result = run_command(reset_cmd, return_output=True) + match = re.match(r"^(.*) (\d+)$", result, re.DOTALL) + if not match: + raise Exception(f"Unexpected output:\n{result}") + error_message = match.group(1).strip() + status_code = match.group(2) + if status_code != "200": + raise Exception(error_message) + message = "Updates reset successfully" + log.info(message) + return True + except Exception as e: + log.error(e) + return False diff --git a/src/qlever/commands/settings.py b/src/qlever/commands/settings.py index dcf52e37..3c19358d 100644 --- a/src/qlever/commands/settings.py +++ b/src/qlever/commands/settings.py @@ -6,6 +6,7 @@ from qlever.command import QleverCommand from qlever.log import log +from qlever.qleverfile import Qleverfile from qlever.util import run_command @@ -23,36 +24,19 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"server": ["port", "host_name", "access_token"]} def 
additional_arguments(self, subparser) -> None: - all_keys = [ - "always-multiply-unions", - "cache-max-num-entries", - "cache-max-size", - "cache-max-size-single-entry", - "cache-service-results", - "default-query-timeout", - "group-by-disable-index-scan-optimizations", - "group-by-hash-map-enabled", - "lazy-index-scan-max-size-materialization", - "lazy-index-scan-num-threads", - "lazy-index-scan-queue-size", - "lazy-result-max-cache-size", - "query-planning-budget", - "request-body-limit", - "service-max-value-rows", - "sort-estimate-cancellation-factor", - "throw-on-unbound-variables", - "use-binsearch-transitive-path", - ] subparser.add_argument( - "runtime_parameter", - nargs="?", - help="Set the given runtime parameter (key=value)" - "; if no argument is given, show all settings", - ).completer = lambda **kwargs: [f"{key}=" for key in all_keys] + "runtime_parameters", + nargs="*", + help="Space-separated list of runtime parameters to set " + "in the form `key=value`; afterwards shows all settings, " + "with the changed ones highlighted", + ).completer = lambda **kwargs: [ + f"{key}=" for key in Qleverfile.SERVER_RUNTIME_PARAMETERS + ] subparser.add_argument( "--endpoint_url", type=str, @@ -67,46 +51,65 @@ def execute(self, args) -> bool: else: endpoint_url = f"http://{args.host_name}:{args.port}" - # Construct the `curl` command for getting or setting. - if args.runtime_parameter: - try: - parameter_key, parameter_value = args.runtime_parameter.split( - "=" + # Construct the `curl` commands for setting and getting. 
+ curl_cmds_setting = [] + keys_set = set() + if args.runtime_parameters: + for key_value_pair in args.runtime_parameters: + try: + key, value = key_value_pair.split("=") + except ValueError: + log.error("Runtime parameter must be given as `key=value`") + return False + curl_cmds_setting.append( + f"curl -s {endpoint_url} -w %{{http_code}}" + f' --data-urlencode "{key}={value}"' + f' --data-urlencode "access-token={args.access_token}"' ) - except ValueError: - log.error("Runtime parameter must be given as `key=value`") - return False - - curl_cmd = ( - f"curl -s {endpoint_url}" - f' --data-urlencode "{parameter_key}={parameter_value}"' - f' --data-urlencode "access-token={args.access_token}"' - ) - else: - curl_cmd = ( - f"curl -s {endpoint_url}" f" --data-urlencode cmd=get-settings" - ) - parameter_key, parameter_value = None, None - self.show(curl_cmd, only_show=args.show) + keys_set.add(key) + curl_cmd_getting = ( + f"curl -s {endpoint_url} -w %{{http_code}}" + f" --data-urlencode cmd=get-settings" + ) + self.show( + "\n".join(curl_cmds_setting + [curl_cmd_getting]), + only_show=args.show, + ) if args.show: return True - # Execute the `curl` command. Note that the `get-settings` command - # returns all settings in both scencarios (that is, also when setting a - # parameter). + # Execute the `curl` commands for setting the key-value pairs if any. + for curl_cmd in curl_cmds_setting: + try: + curl_result = run_command(curl_cmd, return_output=True) + body, http_code = curl_result[:-3], curl_result[-3:] + if http_code != "200": + raise Exception(body) + except Exception as e: + log.error( + f"curl command for setting key-value pair failed: {e}" + ) + return False + + # Execute the `curl` commands for getting the settings. 
try: - settings_json = run_command(curl_cmd, return_output=True) - settings_dict = json.loads(settings_json) + curl_result = run_command(curl_cmd_getting, return_output=True) + body, http_code = curl_result[:-3], curl_result[-3:] + if http_code != "200": + raise Exception(body) + settings_dict = json.loads(body) + if isinstance(settings_dict, list): + settings_dict = settings_dict[0] except Exception as e: - log.error(f"setting command failed: {e}") + log.error(f"curl command for getting settings failed: {e}") return False for key, value in settings_dict.items(): print( colored( f"{key:<45}: {value}", - "blue" - if parameter_key and key == parameter_key - else None, + "blue" if key in keys_set else None, ) ) + + # That's it. return True diff --git a/src/qlever/commands/setup_config.py b/src/qlever/commands/setup_config.py index 0eff3b23..1a53d604 100644 --- a/src/qlever/commands/setup_config.py +++ b/src/qlever/commands/setup_config.py @@ -26,7 +26,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {} def additional_arguments(self, subparser) -> None: diff --git a/src/qlever/commands/start.py b/src/qlever/commands/start.py index a6811c6f..f55abe6d 100644 --- a/src/qlever/commands/start.py +++ b/src/qlever/commands/start.py @@ -5,11 +5,13 @@ from qlever.command import QleverCommand from qlever.commands.cache_stats import CacheStatsCommand +from qlever.commands.settings import SettingsCommand from qlever.commands.status import StatusCommand from qlever.commands.stop import StopCommand from qlever.commands.warmup import WarmupCommand from qlever.containerize import Containerize from qlever.log import log +from qlever.qleverfile import Qleverfile from qlever.util import binary_exists, is_qlever_server_alive, run_command @@ -34,7 +36,7 @@ def construct_command(args) -> str: start_cmd += " 
--persist-updates" if args.only_pso_and_pos_permutations: start_cmd += " --only-pso-and-pos-permutations" - if not args.use_patterns: + if args.use_patterns == "no": start_cmd += " --no-patterns" if args.use_text_index == "yes": start_cmd += " -t" @@ -45,7 +47,7 @@ def construct_command(args) -> str: # Kill existing server on the same port. Trust that StopCommand() works? # Maybe return StopCommand().execute(args) and handle it with a try except? def kill_existing_server(args) -> bool: - args.cmdline_regex = f"^ServerMain.* -p {args.port}" + args.cmdline_regex = f"^qlever-server.* -p {args.port}" args.no_containers = True if not StopCommand().execute(args): log.error("Stopping the existing server failed") @@ -120,7 +122,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name", "description", "text_description"], "server": [ @@ -165,14 +167,25 @@ def additional_arguments(self, subparser) -> None: help="Run the server in the foreground " "(default: run in the background with `nohup`)", ) + subparser.add_argument( + "runtime_parameters", + nargs="*", + help="Space-separated list of runtime parameters to set " + "(in the form `key=value`) once the server is running", + ).completer = lambda **kwargs: [ + f"{key}=" for key in Qleverfile.SERVER_RUNTIME_PARAMETERS + ] def execute(self, args) -> bool: + # Set the endpoint URL. + args.endpoint_url = f"http://{args.host_name}:{args.port}" + # Kill existing server with the same name if so desired. # # TODO: This is currently disabled because I never used it once over # the past weeks and it is not clear to me what the use case is. 
if False: # or args.kill_existing_with_same_name: - args.cmdline_regex = f"^ServerMain.* -i {args.name}" + args.cmdline_regex = f"^qlever-server.* -i {args.name}" args.no_containers = True StopCommand().execute(args) log.info("") @@ -200,17 +213,17 @@ def execute(self, args) -> bool: # Show the command line. self.show(start_cmd, only_show=args.show) if args.show: + if args.runtime_parameters: + log.info("") + SettingsCommand().execute(args) return True - # When running natively, check if the binary exists and works. - if args.system == "native": - if not binary_exists(args.server_binary, "server-binary"): - return False + if not binary_exists(args.server_binary, "server-binary", args): + return False # Check if a QLever server is already running on this port. - endpoint_url = f"http://{args.host_name}:{args.port}" - if is_qlever_server_alive(endpoint_url): - log.error(f"QLever server already running on {endpoint_url}") + if is_qlever_server_alive(args.endpoint_url): + log.error(f"QLever server already running on {args.endpoint_url}") log.info("") log.info( "To kill the existing server, use `qlever stop` " @@ -219,7 +232,7 @@ def execute(self, args) -> bool: ) # Show output of status command. - args.cmdline_regex = f"^ServerMain.* -p *{args.port}" + args.cmdline_regex = f"^qlever-server.* -p *{args.port}" log.info("") StatusCommand().execute(args) return False @@ -269,7 +282,7 @@ def execute(self, args) -> bool: log.info("") tail_cmd = f"exec tail -f {args.name}.server-log.txt" tail_proc = subprocess.Popen(tail_cmd, shell=True) - while not is_qlever_server_alive(endpoint_url): + while not is_qlever_server_alive(args.endpoint_url): time.sleep(1) # Set the description for the index and text. @@ -302,9 +315,14 @@ def execute(self, args) -> bool: if not args.run_in_foreground: log.info("") args.detailed = False - args.server_url = None + args.sparql_endpoint = None CacheStatsCommand().execute(args) + # Apply settings if any. 
+ if args.runtime_parameters: + log.info("") + SettingsCommand().execute(args) + # With `--run-in-foreground`, wait until the server is stopped. if args.run_in_foreground: try: diff --git a/src/qlever/commands/status.py b/src/qlever/commands/status.py index a8efed54..f1683321 100644 --- a/src/qlever/commands/status.py +++ b/src/qlever/commands/status.py @@ -20,12 +20,12 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return False - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {} def additional_arguments(self, subparser) -> None: subparser.add_argument("--cmdline-regex", - default="^(ServerMain|IndexBuilderMain)", + default="^(qlever-server|qlever-index)", help="Show only processes where the command " "line matches this regex") diff --git a/src/qlever/commands/stop.py b/src/qlever/commands/stop.py index a68d433b..bcd57f6d 100644 --- a/src/qlever/commands/stop.py +++ b/src/qlever/commands/stop.py @@ -38,14 +38,14 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"data": ["name"], "server": ["port"], "runtime": ["server_container"]} def additional_arguments(self, subparser) -> None: subparser.add_argument("--cmdline-regex", - default="ServerMain.* -i [^ ]*%%NAME%%", + default="qlever-server.* -i [^ ]*%%NAME%%", help="Show only processes where the command " "line matches this regex") subparser.add_argument("--no-containers", action="store_true", @@ -84,7 +84,7 @@ def execute(self, args) -> bool: message = "No matching process found" if args.no_containers else \ "No matching process or container found" log.error(message) - args.cmdline_regex = "^ServerMain.* -i [^ ]*" + args.cmdline_regex = "^qlever-server.* -i [^ ]*" log.info("") StatusCommand().execute(args) return True 
diff --git a/src/qlever/commands/system_info.py b/src/qlever/commands/system_info.py index acf12ac8..e8b71a2b 100644 --- a/src/qlever/commands/system_info.py +++ b/src/qlever/commands/system_info.py @@ -48,7 +48,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"runtime": ["system", "image", "server_container"]} def additional_arguments(self, subparser) -> None: diff --git a/src/qlever/commands/ui.py b/src/qlever/commands/ui.py index fb49ee78..5c12e1d1 100644 --- a/src/qlever/commands/ui.py +++ b/src/qlever/commands/ui.py @@ -13,13 +13,16 @@ # Return a YAML string for the given dictionary. Format values with # newlines using the "|" style. -def dict_to_yaml(dictionary): - # Custom representer for yaml, which uses the "|" style only for - # multiline strings. - # - # NOTE: We replace all `\r\n` with `\n` because otherwise the `|` style - # does not work as expected. - class MultiLineDumper(yaml.Dumper): +def dict_to_yaml(dictionary: dict) -> str: + """ + Custom representer for yaml, which uses the "|" style only for + multiline strings. + + NOTE: We replace all `\r\n` with `\n` because otherwise the `|` style + does not work as expected. 
class UpdateCommand(QleverCommand):
    """
    Class for executing a SPARQL UPDATE against a SPARQL endpoint.

    The command accepts the update either directly on the command line or
    via a file path provided with --update-file. If both are given, the
    update from the command line is used.
    """

    def __init__(self):
        pass

    def description(self) -> str:
        return "Send an update to a SPARQL endpoint"

    def should_have_qleverfile(self) -> bool:
        return False

    def relevant_qleverfile_arguments(self) -> dict[str, list[str]]:
        return {"server": ["host_name", "port", "access_token"]}

    def additional_arguments(self, subparser) -> None:
        subparser.add_argument(
            "update",
            type=str,
            nargs="?",
            default=None,
            help="SPARQL UPDATE to send (use --update-file to send from a file)",
        )
        subparser.add_argument(
            "--update-file",
            type=str,
            help="Path to a file containing the SPARQL UPDATE to send",
        )
        subparser.add_argument(
            "--sparql-endpoint", type=str, help="URL of the SPARQL endpoint"
        )

    def execute(self, args) -> bool:
        """
        Send the update with `curl` and log the end-to-end processing time.

        Returns `True` on success (or when only showing the command) and
        `False` when no update was provided or the command failed.
        """
        sparql_endpoint = (
            args.sparql_endpoint or f"{args.host_name}:{args.port}"
        )

        # The update text takes precedence over `--update-file`.
        if args.update:
            payload = shlex.quote(args.update)
        elif args.update_file:
            # `@file` makes `curl` read the request body from that file.
            payload = f"@{shlex.quote(args.update_file)}"
        else:
            log.error(
                "No SPARQL UPDATE provided. Pass it as an argument or via "
                "--update-file."
            )
            return False

        # Quote the endpoint and the whole header value, so that special
        # characters (in particular in the access token) cannot break out
        # of the command line.
        auth_header = shlex.quote(
            f"Authorization: Bearer {args.access_token}"
        )
        curl_cmd = (
            f"curl -s {shlex.quote(sparql_endpoint)} -X POST "
            f"-H {auth_header} "
            f"-H 'Content-Type: application/sparql-update' "
            f"--data-binary {payload}"
        )

        # Show and exit if requested.
        self.show(curl_cmd, only_show=args.show)
        if args.show:
            return True

        # Execute the update. Use a monotonic clock for the duration, so
        # that adjustments of the system time cannot distort it.
        try:
            start_time = time.monotonic()
            run_command(curl_cmd)
            time_msecs = round(1000 * (time.monotonic() - start_time))
            if args.log_level != "NO_LOG":
                log.info("")
                log.info(
                    f"Update processing time (end-to-end): {time_msecs:,d} ms"
                )
        except Exception as e:
            if args.log_level == "DEBUG":
                traceback.print_exc()
            log.error(e)
            return False

        return True
def get_next_offset_from_endpoint(sparql_endpoint):
    """Query the endpoint for the next stream offset.

    Args:
        sparql_endpoint: URL of the SPARQL endpoint

    Returns:
        int: The offset value from the endpoint

    Raises:
        Exception: If the query fails or returns no results
    """
    # NOTE(review): the IRIs of this query were missing in the reviewed
    # text (empty `PREFIX` IRI, empty subject), which is not valid SPARQL.
    # The standard Wikibase ontology prefix and the subject `wikibase:Dump`
    # (as in the "updates complete until" query of `UpdateWikidataCommand`)
    # are assumed here; please confirm against the server's data.
    sparql_query_offset = (
        "PREFIX wikibase: <http://wikiba.se/ontology#> "
        "SELECT (MAX(?offset) AS ?maxOffset) WHERE { "
        "wikibase:Dump "
        "wikibase:updateStreamNextOffset ?offset "
        "}"
    )
    curl_cmd_check_offset = (
        f"curl -s {sparql_endpoint}"
        f' -H "Accept: text/csv"'
        f' -H "Content-type: application/sparql-query"'
        f' --data "{sparql_query_offset}"'
    )
    # The result is CSV; `sed 1d` drops the header line, so that only the
    # value remains.
    result = run_command(
        f"{curl_cmd_check_offset} | sed 1d",
        return_output=True,
    ).strip()
    if not result:
        raise Exception("Query returned no results")
    # The value may be enclosed in double quotes.
    return int(result.strip('"'))
+ self.sparql_updates_complete_until_query = ( + "PREFIX wikibase: " + "PREFIX schema: " + "SELECT * WHERE { " + "{ SELECT (MIN(?date_modified) AS ?updates_complete_until) { " + "wikibase:Dump schema:dateModified ?date_modified } } " + "UNION { wikibase:Dump wikibase:updatesCompleteUntil ?updates_complete_until } " + "} ORDER BY DESC(?updates_complete_until) LIMIT 1" + ) + # URL of the Wikidata SSE stream. + self.wikidata_update_stream_url = ( + "https://stream.wikimedia.org/v2/" + "stream/rdf-streaming-updater.mutation.v2" + ) + # Remember if Ctrl+C was pressed, so we can handle it gracefully. + self.ctrl_c_pressed = Event() + # Set to `True` when finished. + self.finished = False + + def description(self) -> str: + return "Update from given SSE stream" + + def should_have_qleverfile(self) -> bool: + return True + + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: + return {"server": ["host_name", "port", "access_token"]} + + def additional_arguments(self, subparser) -> None: + subparser.add_argument( + "sse_stream_url", + nargs="?", + type=str, + default=self.wikidata_update_stream_url, + help="URL of the SSE stream to update from", + ) + subparser.add_argument( + "--batch-size", + type=int, + default=100000, + help="Group this many messages together into one update " + "(default: one update for each message); NOTE: this simply " + "concatenates the `rdf_added_data` and `rdf_deleted_data` fields, " + "which is not 100%% correct; as soon as chaining is supported, " + "this will be fixed", + ) + subparser.add_argument( + "--lag-seconds", + type=int, + default=1, + help="When a message is encountered that is within this many " + "seconds of the current time, finish the current batch " + "(and show a warning that this happened)", + ) + subparser.add_argument( + "--since", + type=str, + help="Consume stream messages since this date " + "(default: determine automatically from the SPARQL endpoint)", + ) + subparser.add_argument( + "--until", + type=str, 
+ help="Stop consuming stream messages when reaching this date " + "(default: continue indefinitely)", + ) + subparser.add_argument( + "--offset", + type=int, + help="Consume stream messages starting from this offset " + "(default: not set)", + ) + subparser.add_argument( + "--topic", + type=str, + default="eqiad.rdf-streaming-updater.mutation", + help="The topic to consume from the SSE stream (default: " + "eqiad.rdf-streaming-updater.mutation)", + ) + subparser.add_argument( + "--partition", + type=int, + default=0, + help="The partition to consume from the SSE stream (default: 0)", + ) + subparser.add_argument( + "--wait-between-batches", + type=int, + default=5, + help="Wait this many seconds between batches that were " + "finished due to a message that is within `lag_seconds` of " + "the current time (default: 5 seconds)", + ) + subparser.add_argument( + "--num-messages", + type=int, + help="Process exactly this many messages and then exit " + "(default: no bound on the number of messages)", + ) + subparser.add_argument( + "--verbose", + choices=["no", "yes"], + default="yes", + help='Verbose logging, "yes" or "no" (default: "yes")', + ) + subparser.add_argument( + "--use-cached-sparql-queries", + action="store_true", + help="Use cached SPARQL query files if they exist with matching " + "offset and target batch size (default: off)", + ) + subparser.add_argument( + "--check-offset-before-each-batch", + choices=["yes", "no"], + default="yes", + help="Before each batch, verify that the stream offset matches the " + "offset from the endpoint (default: yes)", + ) + subparser.add_argument( + "--rewind-to-earlier-offset", + choices=["yes", "no"], + default="yes", + help="When the stream offset is later than the offset from the " + "endpoint (e.g., after a server restart), rewind to the endpoint " + "offset and reprocess messages (default: yes)", + ) + subparser.add_argument( + "--num-retries", + type=int, + default=10, + help="Number of retries for offset 
verification queries when they fail " + "(default: 10)", + ) + subparser.add_argument( + "--keep-update-requests", + choices=["none", "all", "last", "last-three"], + default="last", + help="Which update request files (update.*.{sparql,meta,result}) to keep: " + "none (delete all), all (keep all), last (keep only the most recent), " + "last-three (keep the three most recent) (default: last)", + ) + + def retry_with_backoff(self, operation, operation_name, max_retries): + """ + Retry an operation with exponential backoff, see backoff intervals below + (in seconds). Returns the result of the operation if successful, or raises + the last exception. + """ + backoff_intervals = [5, 10, 30, 60, 300, 900, 1800, 3600] + + for attempt in range(max_retries): + try: + return operation() + except Exception as e: + if self.ctrl_c_pressed.is_set(): + raise KeyboardInterrupt() + if attempt < max_retries - 1: + # Use the appropriate backoff interval (once we get to the end + # of the list, keep using the last interval). + retry_delay = ( + backoff_intervals[attempt] + if attempt < len(backoff_intervals) + else backoff_intervals[-1] + ) + # Show the delay as seconds, minutes, or hours. + if retry_delay >= 3600: + delay_str = f"{retry_delay // 3600}h" + elif retry_delay >= 60: + delay_str = f"{retry_delay // 60}min" + else: + delay_str = f"{retry_delay}s" + log.warn( + f"{operation_name} failed (attempt {attempt + 1}/{max_retries}): {e}. " + f"Retrying in {delay_str} ..." + ) + # Returns true if the wait ended because of the flag being set. + if self.ctrl_c_pressed.wait(timeout=retry_delay): + raise KeyboardInterrupt() + else: + # If this was the last attempt, re-raise the exception. + raise + + # Handle Ctrl+C gracefully by finishing the current batch and then exiting. 
+ def handle_ctrl_c(self, signal_received, frame): + if self.ctrl_c_pressed.is_set(): + pass + # log.warn("\rCtrl+C pressed again, watch your blood pressure") + else: + self.ctrl_c_pressed.set() + + def determine_batch_size_for_cached_update(self, offset: int, batch_size: int) -> int | None: + options = list(Path.cwd().glob(f"update.{offset}.*.sparql")) + if len(options) == 0: + log.warn(f"Found no cached SPARQL update. Continuing with update stream.") + return None + elif len(options) > 1: + log.warn(f"Found {len(options)} candidates for cached SPARQL update. Using {options[0].name}.") + return int(re.search(r"update\.\d+\.(\d+)\.sparql", options[0].name).group(1)) + + def determine_next_cached_update(self, first_offset_in_batch: int, batch_size: int) -> tuple[str, int] | None: + batch_size = self.determine_batch_size_for_cached_update(first_offset_in_batch, batch_size) + if batch_size is None: + return None + cached_file_name = ( + f"update.{first_offset_in_batch}.{batch_size}.sparql" + ) + cached_meta_file_name = ( + f"update.{first_offset_in_batch}.{batch_size}.meta" + ) + + # Try to read metadata file for date range + cached_date_range = None + if os.path.exists(cached_meta_file_name): + try: + with open(cached_meta_file_name, "r") as f: + cached_date_range = f.read().strip() + except Exception: + pass + + log_msg = f"Using cached SPARQL query file: {cached_file_name}" + if cached_date_range: + log_msg += f" [date range: {cached_date_range}]" + log.debug(colored(log_msg, "cyan")) + + return cached_file_name, batch_size + + def execute(self, args) -> bool: + # cURL command to get the date until which the updates of the + # SPARQL endpoint are complete. + sparql_endpoint = f"http://{args.host_name}:{args.port}" + curl_cmd_updates_complete_until = ( + f"curl -s {sparql_endpoint}" + f' -H "Accept: text/csv"' + f' -H "Content-type: application/sparql-query"' + f' --data "{self.sparql_updates_complete_until_query}"' + ) + + # Construct the command and show it. 
+ cmd_description = [] + if args.since: + cmd_description.append(f"SINCE={args.since}") + else: + cmd_description.append( + f"SINCE=$({curl_cmd_updates_complete_until} | sed 1d)" + ) + if args.until: + cmd_description.append(f"UNTIL={args.until}") + cmd_description.append( + f"Process SSE stream from {args.sse_stream_url} " + f"in batches of up to {args.batch_size:,} messages " + ) + self.show("\n".join(cmd_description), only_show=args.show) + if args.show: + return True + + # Compute the `since` date if not given. + if args.since: + since = args.since + else: + try: + since = run_command( + f"{curl_cmd_updates_complete_until} | sed 1d", + return_output=True, + ).strip() + except Exception as e: + log.error( + f"Error running `{curl_cmd_updates_complete_until}`: {e}" + ) + return False + + # Special handling of Ctrl+C, see `handle_ctrl_c` above. + signal.signal(signal.SIGINT, self.handle_ctrl_c) + log.warn("Press Ctrl+C to finish and exit gracefully") + log.info("") + + # If no `--offset` is provided, try to get the offset from + # the endpoint. + if args.offset is None: + try: + args.offset = get_next_offset_from_endpoint(sparql_endpoint) + log.info(f"Resuming from offset from endpoint: {args.offset}") + except Exception as e: + log.debug( + f"Could not retrieve offset from endpoint: {e}. " + f"Will determine offset from date instead." + ) + + # If the offset was neither provided via `--offset` nor could + # be retrieved from the endpoint, determine it by reading a + # single message from the SSE stream at the `since` date. 
+ if args.offset is None: + try: + source = self.retry_with_backoff( + lambda: connect_to_sse_stream( + args.sse_stream_url, since=since + ), + "SSE stream connection", + args.num_retries, + ) + offset = None + for event in source: + if event.type == "message" and event.data: + event_data = json.loads(event.data) + event_topic = event_data.get("meta").get("topic") + if event_topic == args.topic: + offset = event_data.get("meta").get("offset") + log.debug( + f"Determined offset from date: {since} -> {offset}" + ) + break + source.close() + if offset is None: + raise Exception( + f"No event with topic {args.topic} found in stream" + ) + args.offset = offset + except KeyboardInterrupt: + log.warn( + "\rCtrl+C pressed while determine current state, exiting" + ) + return True + except Exception as e: + log.error(f"Error determining offset from stream: {e}") + return False + + # Initialize all the statistics variables. + batch_count = 0 + total_num_messages = 0 + total_update_time = 0 + start_time = time.perf_counter() + wait_before_next_batch = False + event_id_for_next_batch = ( + [ + { + "topic": args.topic, + "partition": args.partition, + "offset": args.offset, + } + ] + if args.offset is not None + else None + ) + + # Track whether this is the first batch (to skip offset check) + first_batch = True + + # Main event loop: Either resume from `event_id_for_next_batch` (if set), + # or start a new connection to `args.sse_stream_url` (with URL + # parameter `?since=`). + while True: + # Optionally wait before processing the next batch (make sure that + # the wait is interruptible by Ctrl+C). 
+ if wait_before_next_batch: + log.info( + f"Waiting {args.wait_between_batches} " + f"second{'s' if args.wait_between_batches > 1 else ''} " + f"before processing the next batch" + ) + log.info("") + wait_before_next_batch = False + self.ctrl_c_pressed.wait(args.wait_between_batches) + if self.ctrl_c_pressed.is_set(): + log.warn( + "\rCtrl+C pressed while waiting in between batches, " + "exiting" + ) + break + + # Start stream from either `event_id_for_next_batch` or `since`. + # We'll extract the offset for first_offset_in_batch later. + if event_id_for_next_batch: + event_id_json = json.dumps(event_id_for_next_batch) + if args.verbose == "yes": + log.info( + colored( + f"Consuming stream from event ID: {event_id_json}", + attrs=["dark"], + ) + ) + else: + if args.verbose == "yes": + log.info( + colored( + f"Consuming stream from date: {since}", + attrs=["dark"], + ) + ) + + # Connect to the SSE stream with retry logic + try: + source = self.retry_with_backoff( + lambda: connect_to_sse_stream( + args.sse_stream_url, + since=since if not event_id_for_next_batch else None, + event_id=event_id_for_next_batch, + ), + "SSE stream connection for batch processing", + args.num_retries, + ) + except KeyboardInterrupt: + log.warn( + "\rCtrl+C pressed while while connecting to stream, " + "exiting" + ) + break + except Exception as e: + log.error( + f"Failed to connect to SSE stream after " + f"{args.num_retries} retry attempts, last error: {e}" + ) + break + + # Next comes the inner loop, which processes exactly one "batch" of + # messages. The batch is completed (simply using `break`) when either + # `args.batch_size` messages have been processed, or when one of a + # variety of conditions occur (Ctrl+C pressed, message within + # `args.lag_seconds` of current time, delete operation followed by + # insert of triple with that entity as subject). + + # Initialize all the batch variables. 
+ current_batch_size = 0 + # Extract the offset from the event ID to use as the starting offset + # for this batch. This is set before processing any messages. + if event_id_for_next_batch: + first_offset_in_batch = event_id_for_next_batch[0]["offset"] + event_id_for_next_batch = None + else: + # This should not happen since we now always determine the offset + # before starting, but keep as fallback + first_offset_in_batch = None + + # Check that the stream offset matches the offset from the + # endpoint, unless disabled or this is the first batch. + if ( + args.check_offset_before_each_batch == "yes" + and not first_batch + and first_offset_in_batch is not None + ): + # Verify offset with retry logic + try: + endpoint_offset = self.retry_with_backoff( + lambda: get_next_offset_from_endpoint(sparql_endpoint), + "Offset verification", + args.num_retries, + ) + except KeyboardInterrupt: + log.warn( + "\rCtrl+C pressed while while verifying state, exiting" + ) + break + except Exception as e: + log.error( + f"Failed to retrieve offset from endpoint " + f"after {args.num_retries} retries: {e}. " + f"This might be the first update, or the offset triple is missing." 
+ ) + return False + + if endpoint_offset < first_offset_in_batch: + # Stream offset is LATER than endpoint offset + if args.rewind_to_earlier_offset == "yes": + log.info( + colored( + f"Stream offset {first_offset_in_batch} is later " + f"than offset {endpoint_offset} from endpoint; " + f"this can happen after a server restart; " + f"rewinding to offset {endpoint_offset} from endpoint", + "cyan", + ) + ) + log.info("") + # Reconnect from the endpoint offset + event_id_for_next_batch = [ + { + "topic": args.topic, + "partition": args.partition, + "offset": endpoint_offset, + } + ] + continue # Skip this batch and reconnect + else: + log.error( + f"Offset mismatch: stream offset {first_offset_in_batch} " + f"is later than offset {endpoint_offset} from endpoint; " + f"rewind disabled by --rewind-to-earlier-offset=no" + ) + return False + elif endpoint_offset > first_offset_in_batch: + # Stream offset is EARLIER than endpoint offset - this is bad + log.error( + f"Offset mismatch: stream offset {first_offset_in_batch} " + f"is earlier than offset {endpoint_offset} from endpoint; " + f"this indicates that updates may have been applied " + f"out of order or some updates are missing" + ) + return False + + date_list = [] + delete_entity_ids = set() + delta_to_now_list = [] + batch_assembly_start_time = time.perf_counter() + insert_triples = set() + delete_triples = set() + + # Check if we can use a cached SPARQL query file + use_cached_file = False + cached_file_name = None + if ( + args.use_cached_sparql_queries + and first_offset_in_batch is not None + ): + cached_update = self.determine_next_cached_update(first_offset_in_batch, + args.batch_size) + if cached_update is not None: + cached_file_name, current_batch_size = cached_update + use_cached_file = True + + # Process one event at a time (unless using cached file). 
+ if not use_cached_file: + with tqdm_logging_redirect( + loggers=[logging.getLogger("qlever")], + desc="Batch", + total=args.batch_size, + leave=False, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}{postfix}", + ) as pbar: + for event in source: + # Skip events that are not of type `message` (should not + # happen), have no field `data` (should not happen either), or + # where the topic differs from `args.topic` (one topic by itself + # should provide all relevant updates). + if event.type != "message" or not event.data: + continue + event_data = json.loads(event.data) + topic = event_data.get("meta").get("topic") + if topic != args.topic: + continue + + try: + # Extract offset, topic, and partition from the message metadata + # to construct a precise event ID for resuming. + meta = event_data.get("meta") + offset = meta.get("offset") + topic = meta.get("topic") + partition = meta.get("partition") + + # Get the date (rounded *down* to seconds). + date = meta.get("dt") + date = re.sub(r"\.\d*Z$", "Z", date) + + # Get the other relevant fields from the message. + entity_id = event_data.get("entity_id") + operation = event_data.get("operation") + rdf_added_data = event_data.get("rdf_added_data") + rdf_deleted_data = event_data.get( + "rdf_deleted_data" + ) + rdf_linked_shared_data = event_data.get( + "rdf_linked_shared_data" + ) + # rdf_unlinked_shared_data = event_data.get( + # "rdf_unlinked_shared_data" + # ) + + # Check batch completion conditions BEFORE processing the + # data of this message. If any of the conditions is met, + # we finish the batch and resume from the LAST PROCESSED + # message (not the current one that triggered the break). + # + # NOTE: We will update event_id_for_next_batch AFTER + # successfully processing each message (see below), so that + # when we break, it contains the last processed event ID. + since = None + + # Condition 1: Delete followed by insert for same entity.
+ operation_adds_data = ( + rdf_added_data is not None + or rdf_linked_shared_data is not None + ) + if ( + operation_adds_data + and entity_id in delete_entity_ids + ): + if args.verbose == "yes": + log.warn( + f"Encountered operation that adds data for " + f"an entity ID ({entity_id}) that was deleted " + f"earlier in this batch; finishing batch and " + f"resuming from this message in the next batch" + ) + break + + # Condition 2: Batch size or limit on number of + # messages reached. + if current_batch_size >= args.batch_size or ( + args.num_messages is not None + and total_num_messages >= args.num_messages + ): + break + + # Condition 3: Message close to current time. + date_obj = datetime.strptime( + date, "%Y-%m-%dT%H:%M:%SZ" + ).replace(tzinfo=timezone.utc) + date_as_epoch_s = date_obj.timestamp() + + now_as_epoch_s = time.time() + delta_to_now_s = now_as_epoch_s - date_as_epoch_s + if ( + delta_to_now_s < args.lag_seconds + and current_batch_size > 0 + ): + if args.verbose == "yes": + log.warn( + f"Encountered message with date {date}, which is within " + f"{args.lag_seconds} " + f"second{'s' if args.lag_seconds > 1 else ''} " + f"of the current time, finishing the current batch" + ) + wait_before_next_batch = ( + args.wait_between_batches is not None + and args.wait_between_batches > 0 + ) + break + + # Condition 4: Reached `--until` date and at least one + # message was processed. + if ( + args.until + and date >= args.until + and current_batch_size > 0 + ): + log.warn( + f"Reached --until date {args.until} " + f"(message date: {date}), that's it folks" + ) + self.finished = True + break + + # Delete operations are postponed until the end of the + # batch, so remember the entity ID here. + if operation == "delete": + delete_entity_ids.add(entity_id) + + # Replace each occurrence of `\\` by `\u005C\u005C` + # (which is twice the Unicode for backslash). 
+ # + # NOTE: Strictly speaking, it would be enough to do + # this for two backslashes followed by a `u`, but + # doing it for all double backslashes does not + # harm. When parsing a SPARQL query, then according + # to the standard, first all occurrences of `\uxxxx` + # (where `xxxx` are four hex digits) are replaced + # by the corresponding Unicode character. That is a + # problem when `\\uxxxx` occurs in a literal, + # because then it would be replaced by `\` followed + # by the Unicode character, which is invalid + # SPARQL. The substitution avoids that problem. + def node_to_sparql(node: rdflib.term.Node) -> str: + return node.n3().replace( + "\\\\", "\\u005C\\u005C" + ) + + # Process the to-be-deleted triples. + # + # NOTE: The triples from `rdf_unlinked_shared_data` + # must not be deleted, because they are only + # unlinked from the current entity, but may still + # be linked from other entities. If they are not + # linked from any other entity, they will be + # orphaned, but we don't mind that. + for rdf_to_be_deleted in (rdf_deleted_data,): + if rdf_to_be_deleted is not None: + try: + rdf_to_be_deleted_data = ( + rdf_to_be_deleted.get("data") + ) + graph = Graph() + log.debug( + f"RDF to_be_deleted data: {rdf_to_be_deleted_data}" + ) + graph.parse( + data=rdf_to_be_deleted_data, + format="turtle", + ) + for s, p, o in graph: + triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}" + # NOTE: In case there was a previous `insert` of that + # triple, it is safe to remove that `insert`, but not + # the `delete` (in case the triple is contained in the + # original data). + if triple in insert_triples: + insert_triples.remove(triple) + delete_triples.add(triple) + except Exception as e: + log.error( + f"Error reading `rdf_to_be_deleted_data`: {e}" + ) + return False + + # Process the to-be-added triples.
+ for rdf_to_be_added in ( + rdf_added_data, + rdf_linked_shared_data, + ): + if rdf_to_be_added is not None: + try: + rdf_to_be_added_data = ( + rdf_to_be_added.get("data") + ) + graph = Graph() + log.debug( + "RDF to be added data: {rdf_to_be_added_data}" + ) + graph.parse( + data=rdf_to_be_added_data, + format="turtle", + ) + for s, p, o in graph: + triple = f"{s.n3()} {p.n3()} {node_to_sparql(o)}" + # NOTE: In case there was a previous `delete` of that + # triple, it is safe to remove that `delete`, but not + # the `insert` (in case the triple is not contained in + # the original data). + if triple in delete_triples: + delete_triples.remove(triple) + insert_triples.add(triple) + except Exception as e: + log.error( + f"Error reading `rdf_to_be_added_data`: {e}" + ) + return False + + except Exception as e: + log.error(f"Error reading data from message: {e}") + log.info(event) + continue + + # Message was successfully processed, update batch tracking + current_batch_size += 1 + total_num_messages += 1 + pbar_update_frequency = 100 + if (current_batch_size % pbar_update_frequency) == 0: + pbar.set_postfix( + { + "Time": date_obj.strftime( + "%Y-%m-%d %H:%M:%S" + ) + } + ) + pbar.update(pbar_update_frequency) + log.debug( + f"DATE: {date_as_epoch_s:.0f} [{date}], " + f"NOW: {now_as_epoch_s:.0f}, " + f"DELTA: {now_as_epoch_s - date_as_epoch_s:.0f}" + ) + date_list.append(date) + delta_to_now_list.append(delta_to_now_s) + + # Update the event ID for the next batch. We increment the + # offset by 1 so that the next batch starts with the next + # message (not re-processing the current one). + event_id_for_next_batch = [ + { + "topic": topic, + "partition": partition, + "offset": offset + 1, + } + ] + + # Ctrl+C finishes the current batch (this should come at the + # end of the inner event loop so that always at least one + # message is processed). 
+ if self.ctrl_c_pressed.is_set(): + log.warn( + "\rCtrl+C pressed while processing a batch, " + "finishing it and exiting" + ) + break + else: + # Using cached file - set batch size and calculate next offset + total_num_messages += current_batch_size + event_id_for_next_batch = [ + { + "topic": args.topic, + "partition": args.partition, + "offset": first_offset_in_batch + current_batch_size, + } + ] + + # Process the current batch of messages (or skip if using cached). + batch_count += 1 + if not use_cached_file: + batch_assembly_end_time = time.perf_counter() + batch_assembly_time_ms = int( + 1000 + * (batch_assembly_end_time - batch_assembly_start_time) + ) + date_list.sort() + delta_to_now_list.sort() + min_delta_to_now_s = delta_to_now_list[0] + if min_delta_to_now_s < 10: + min_delta_to_now_s = f"{min_delta_to_now_s:.1f}" + else: + min_delta_to_now_s = f"{int(min_delta_to_now_s):,}" + log.info( + f"Assembled batch #{batch_count}, " + f"#messages: {current_batch_size:2,}, " + f"date range: {date_list[0]} - {date_list[-1]} " + f"[assembly time: {batch_assembly_time_ms:3,}ms, " + f"min delta to NOW: {min_delta_to_now_s}s]" + ) + + # Add a triples `wikibase:Dump wikibase:updatesCompleteUntil + # DATE` and `wikibase:Dump wikibase:updateStreamNextOffset + # OFFSET`. + insert_triples.add( + f" " + f" " + f'"{date_list[-1]}"' + f"^^" + ) + insert_triples.add( + " " + " " + f'"{event_id_for_next_batch[0]["offset"]}"' + ) + + # Construct UPDATE operation. + delete_block = " . \n ".join(delete_triples) + insert_block = " . \n ".join(insert_triples) + delete_insert_operation = ( + f"DELETE {{\n {delete_block} \n}} " + f"INSERT {{\n {insert_block} \n}} " + f"WHERE {{ }}\n" + ) + + # If `delete_entity_ids` is non-empty, add a `DELETE WHERE` + # operation that deletes all triples that are associated with only + # those entities. 
+ delete_entity_ids_as_values = " ".join( + [f"wd:{qid}" for qid in delete_entity_ids] + ) + if len(delete_entity_ids) > 0: + delete_where_operation = ( + f"PREFIX rdf: \n" + f"PREFIX wikibase: \n" + f"PREFIX wd: \n" + f"DELETE {{\n" + f" ?s ?p ?o .\n" + f"}} WHERE {{\n" + f" {{\n" + f" VALUES ?s {{ {delete_entity_ids_as_values} }}\n" + f" ?s ?p ?o .\n" + f" }} UNION {{\n" + f" VALUES ?_1 {{ {delete_entity_ids_as_values} }}\n" + f" ?_1 ?_2 ?s .\n" + f" ?s ?p ?o .\n" + f" ?s rdf:type wikibase:Statement .\n" + f" }}\n" + f"}}\n" + ) + delete_insert_operation += ";\n" + delete_where_operation + + # Construct curl command. For batch size 1, send the operation via + # `--data-urlencode`, otherwise write to file and send via `--data-binary`. + curl_cmd = ( + f"curl -s -X POST" + f' "{sparql_endpoint}?access-token={args.access_token}"' + f" -H 'Content-Type: application/sparql-update'" + ) + if use_cached_file: + # Use the cached file instead of writing a new one + update_arg_file_name = cached_file_name + else: + # Write the constructed SPARQL update to a file + update_arg_file_name = f"update.{first_offset_in_batch}.{current_batch_size}.sparql" + with open(update_arg_file_name, "w") as f: + f.write(delete_insert_operation) + # Write metadata file with date range + meta_file_name = ( + f"update.{first_offset_in_batch}.{current_batch_size}.meta" + ) + with open(meta_file_name, "w") as f: + f.write(f"{date_list[0]} - {date_list[-1]}") + curl_cmd += f" --data-binary @{update_arg_file_name}" + if args.verbose == "yes": + log.info(colored(curl_cmd, "blue")) + + # Send the UPDATE request. If it fails, reset to the beginning + # of this batch and retry in the next iteration of the outer + # loop. If this was a transient error, this makes sure that the + # batch is re-assembled and not lost. If the server has + # restarted, the offset check at the beginning of the next + # iteration will detect the mismatch and rewind. 
+ try: + result = run_command(curl_cmd, return_output=True) + except Exception: + if self.ctrl_c_pressed.is_set(): + log.warn( + "\r \nCtrl+C pressed while executing update, exiting" + ) + return True + else: + log.warn( + "\r \nUpdate request failed; will reconnect and retry" + ) + event_id_for_next_batch = [ + { + "topic": args.topic, + "partition": args.partition, + "offset": first_offset_in_batch, + } + ] + continue + result_file_name = ( + f"update.{first_offset_in_batch}.{current_batch_size}.result" + ) + with open(result_file_name, "w") as f: + f.write(result) + + # Clean up old update request files according to --keep-update-requests + if args.keep_update_requests != "all": + # Find all update.*.{sparql,meta,result} files + update_files = {} + for ext in ["sparql", "meta", "result"]: + for file_path in glob.glob(f"update.*.*.{ext}"): + # Extract offset from filename (update.OFFSET.SIZE.ext) + parts = Path(file_path).stem.split(".") + if len(parts) >= 3: + offset = parts[1] + if offset not in update_files: + update_files[offset] = [] + update_files[offset].append(file_path) + + # Sort by offset (newest last) + sorted_offsets = sorted( + update_files.keys(), key=lambda x: int(x) + ) + + # Determine which to keep + if args.keep_update_requests == "none": + files_to_keep = [] + elif args.keep_update_requests == "last": + files_to_keep = ( + update_files[sorted_offsets[-1]] + if sorted_offsets + else [] + ) + elif args.keep_update_requests == "last-three": + files_to_keep = [] + for offset in sorted_offsets[-3:]: + files_to_keep.extend(update_files[offset]) + + # Delete files not in the keep list + for offset, files in update_files.items(): + for file_path in files: + if file_path not in files_to_keep: + try: + os.remove(file_path) + except Exception: + pass # Ignore errors during cleanup + + # Results should be a JSON, parse it. + try: + result = json.loads(result) + except Exception as e: + log.error( + f"Error parsing JSON result: {e}. 
" + f"The first 1000 characters are: {result[:1000]}" + ) + return False + + # Check if the result contains a QLever exception. + if "exception" in result: + error_msg = result["exception"] + log.error(f"QLever exception: {error_msg}") + log.info("") + continue + + # Helper function for getting the value of `stats["time"][...]` + # without the "ms" suffix. If the extraction fails, return 0 + + # (and optionally log the failure). + class FailureMode(Enum): + LOG_ERROR = auto() + SILENTLY_RETURN_ZERO = auto() + THROW_EXCEPTION = auto() + + def get_time_ms( + stats, *keys: str, failure_mode=FailureMode.LOG_ERROR + ) -> int: + try: + value = stats["time"] + for key in keys: + value = value[key] + value = int(value) + except Exception: + if failure_mode == FailureMode.THROW_EXCEPTION: + raise + elif failure_mode == FailureMode.LOG_ERROR: + log.error( + f"Error extracting time from JSON statistics, " + f"keys: {keys}" + ) + value = 0 + return value + + # Check for old JSON format (no `operations` or `time` on top level). + old_json_message_template = ( + "Result JSON does not contain `{}` field, you are " + "probably using an old version of QLever" + ) + for field in ["operations", "time"]: + if field not in result: + raise RuntimeError(old_json_message_template.format(field)) + + # Get the per-operation statistics. 
+ for i, stats in enumerate(result["operations"]): + try: + ins_after = stats["delta-triples"]["after"]["inserted"] + del_after = stats["delta-triples"]["after"]["deleted"] + ops_after = stats["delta-triples"]["after"]["total"] + num_ins = int( + stats["delta-triples"]["operation"]["inserted"] + ) + num_del = int( + stats["delta-triples"]["operation"]["deleted"] + ) + num_ops = int(stats["delta-triples"]["operation"]["total"]) + time_op_total = get_time_ms(stats, "total") + time_us_per_op = ( + int(1000 * time_op_total / num_ops) + if num_ops > 0 + else 0 + ) + if args.verbose == "yes": + log.info( + colored( + f"TRIPLES: {num_ops:+10,} -> {ops_after:10,}, " + f"INS: {num_ins:+10,} -> {ins_after:10,}, " + f"DEL: {num_del:+10,} -> {del_after:10,}, " + f"TIME: {time_op_total:7,}ms, " + f"TIME/TRIPLE: {time_us_per_op:6,}µs", + attrs=["bold"], + ) + ) + + time_planning = get_time_ms(stats, "planning") + time_compute_ids = get_time_ms( + stats, + "execution", + "computeIds", + "total", + ) + time_where = get_time_ms( + stats, + "execution", + "evaluateWhere", + ) + time_metadata = get_time_ms( + stats, + "updateMetadata", + ) + time_insert = get_time_ms( + stats, + "execution", + "insertTriples", + "total", + failure_mode=FailureMode.SILENTLY_RETURN_ZERO, + ) + time_delete = get_time_ms( + stats, + "execution", + "deleteTriples", + "total", + failure_mode=FailureMode.SILENTLY_RETURN_ZERO, + ) + time_unaccounted = time_op_total - ( + time_planning + + time_compute_ids + + time_where + + time_metadata + + time_delete + + time_insert + ) + if args.verbose == "yes": + log.info( + f"METADATA: {100 * time_metadata / time_op_total:2.0f}%, " + f"PLANNING: {100 * time_planning / time_op_total:2.0f}%, " + f"WHERE: {100 * time_where / time_op_total:2.0f}%, " + f"IDS: {100 * time_compute_ids / time_op_total:2.0f}%, " + f"DELETE: {100 * time_delete / time_op_total:2.0f}%, " + f"INSERT: {100 * time_insert / time_op_total:2.0f}%, " + f"UNACCOUNTED: {100 * time_unaccounted / 
time_op_total:2.0f}%", + ) + + except Exception as e: + log.warn( + f"Error extracting statistics: {e}, " + f"curl command was: {curl_cmd}" + ) + # Show traceback for debugging. + import traceback + + traceback.print_exc() + log.info("") + continue + + # Get times for the whole request (not per operation). + time_parsing = get_time_ms( + result, + "parsing", + ) + time_metadata = get_time_ms( + result, + "metadataUpdateForSnapshot", + ) + time_snapshot = get_time_ms( + result, + "snapshotCreation", + ) + time_writeback = get_time_ms( + result, + "diskWriteback", + ) + time_operations = get_time_ms( + result, + "operations", + ) + time_total = get_time_ms( + result, + "total", + ) + time_unaccounted = time_total - ( + time_parsing + + time_metadata + + time_snapshot + + time_writeback + + time_operations + ) + + # Update the totals. + total_update_time += time_total / 1000.0 + total_elapsed_time = time.perf_counter() - start_time + + # Show statistics for the completed batch. + if args.verbose == "yes": + log.info( + colored( + f"TOTAL UPDATE TIME SO FAR: {total_update_time:4.0f}s, " + f"TOTAL ELAPSED TIME SO FAR: {total_elapsed_time:4.0f}s, " + f"TOTAL TIME FOR THIS UPDATE REQUEST: {time_total:7,}ms, ", + attrs=["bold"], + ) + ) + log.info( + f"PARSING: {100 * time_parsing / time_total:2.0f}%, " + f"OPERATIONS: {100 * time_operations / time_total:2.0f}%, " + f"METADATA: {100 * time_metadata / time_total:2.0f}%, " + f"SNAPSHOT: {100 * time_snapshot / time_total:2.0f}%, " + f"WRITEBACK: {100 * time_writeback / time_total:2.0f}%, " + f"UNACCOUNTED: {100 * time_unaccounted / time_total:2.0f}%", + ) + log.info("") + + # Close the source connection (for each batch, we open a new one, + # either from `event_id_for_next_batch` or from `since`). + source.close() + + # After the first batch is processed, enable offset checking for + # subsequent batches. 
+ first_batch = False + + # If Ctrl+C was pressed, we reached `--until`, or we processed + # exactly `--num-messages`, finish. + if ( + self.ctrl_c_pressed.is_set() + or self.finished + or ( + args.num_messages is not None + and total_num_messages >= args.num_messages + ) + ): + break + + # Final message after all batches have been processed. + log.info( + f"Processed {batch_count} " + f"{'batches' if batch_count > 1 else 'batch'} " + f"terminating update command" + ) + return True diff --git a/src/qlever/commands/warmup.py b/src/qlever/commands/warmup.py index 49150262..12d2c376 100644 --- a/src/qlever/commands/warmup.py +++ b/src/qlever/commands/warmup.py @@ -20,7 +20,7 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str: list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return {"server": ["port", "warmup_cmd"]} def additional_arguments(self, subparser) -> None: diff --git a/src/qlever/config.py b/src/qlever/config.py index a9b4bdf5..dd6d48a1 100644 --- a/src/qlever/config.py +++ b/src/qlever/config.py @@ -220,4 +220,14 @@ def add_qleverfile_option(parser): "arguments on the command line. This is possible, " "but not recommended.") + # Warn if the old binary names are still being used. + if "IndexBuilderMain" in getattr(args, "index_binary", ""): + log.warning("The index binary has been renamed from " + "`IndexBuilderMain` to `qlever-index`. Please update " + "your Qleverfile or other configuration.") + if "ServerMain" in getattr(args, "server_binary", ""): + log.warning("The server binary has been renamed from " + "`ServerMain` to `qlever-server`. 
Please update " + "your Qleverfile or other configuration.") + return args diff --git a/src/qlever/containerize.py b/src/qlever/containerize.py index 8fb12343..11150ba7 100644 --- a/src/qlever/containerize.py +++ b/src/qlever/containerize.py @@ -66,7 +66,12 @@ def containerize_command( # Options for mounting volumes, setting ports, and setting the working # dir. - volume_options = "".join([f" -v {v1}:{v2}" for v1, v2 in volumes]) + volume_options = "".join( + [ + f' --mount type=bind,src="{v1}",target={v2}' + for v1, v2 in volumes + ] + ) port_options = "".join([f" -p {p1}:{p2}" for p1, p2 in ports]) working_directory_option = ( f" -w {working_directory}" if working_directory is not None else "" @@ -97,7 +102,8 @@ def is_running(container_system: str, container_name: str) -> bool: # Note: the `{{{{` and `}}}}` result in `{{` and `}}`, respectively. containers = ( run_command( - f'{container_system} ps --format="{{{{.Names}}}}"', return_output=True + f'{container_system} ps --format="{{{{.Names}}}}"', + return_output=True, ) .strip() .splitlines() @@ -105,7 +111,9 @@ def is_running(container_system: str, container_name: str) -> bool: return container_name in containers @staticmethod - def stop_and_remove_container(container_system: str, container_name: str) -> bool: + def stop_and_remove_container( + container_system: str, container_name: str + ) -> bool: """ Stop the container with the given name using the given system. 
Return `True` if a container with that name was found and stopped, `False` diff --git a/src/qlever/qlever_main.py b/src/qlever/qlever_main.py index 5a16d5fa..c9d76fae 100644 --- a/src/qlever/qlever_main.py +++ b/src/qlever/qlever_main.py @@ -40,8 +40,7 @@ def main(): if not command_successful: exit(1) except KeyboardInterrupt: - log.info("") - log.info("Ctrl-C pressed, exiting ...") + log.warn("\rCtrl-C pressed, exiting ...") log.info("") exit(1) except Exception as e: diff --git a/src/qlever/qleverfile.py b/src/qlever/qleverfile.py index 803b971c..83ed8794 100644 --- a/src/qlever/qleverfile.py +++ b/src/qlever/qleverfile.py @@ -21,6 +21,41 @@ class Qleverfile: Qleverfile + functions for parsing. """ + # Runtime parameters (for `settings` and `start` commands). + SERVER_RUNTIME_PARAMETERS = [ + "cache-max-num-entries", + "cache-max-size", + "cache-max-size-single-entry", + "cache-service-results", + "default-query-timeout", + "division-by-zero-is-undef", + "enable-distributive-union", + "enable-prefilter-on-index-scans", + "group-by-disable-index-scan-optimizations", + "group-by-hash-map-enabled", + "lazy-index-scan-max-size-materialization", + "lazy-index-scan-num-threads", + "lazy-index-scan-queue-size", + "lazy-result-max-cache-size", + "permutation-writer-num-threads", + "query-planning-budget", + "request-body-limit", + "service-allowed-iri-prefixes", + "service-max-redirects", + "service-max-value-rows", + "sort-estimate-cancellation-factor", + "sort-in-memory-threshold", + "sparql-results-json-with-time", + "spatial-join-prefilter-max-size", + "spatial-join-max-num-threads", + "strip-columns", + "syntax-test-mode", + "throw-on-unbound-variables", + "treat-default-graph-as-named-graph", + "use-binsearch-transitive-path", + "websocket-updates-enabled", + ] + @staticmethod def all_arguments(): """ @@ -61,7 +96,7 @@ def arg(*args, **kwargs): "--text-description", type=str, default=None, - help="A concise description of the additional text data" " if any", + 
help="A concise description of the additional text data if any", ) data_args["format"] = arg( "--format", @@ -108,6 +143,15 @@ def arg(*args, **kwargs): default="{}", help="The `.settings.json` file for the index", ) + index_args["materialized_views"] = arg( + "--materialized-views", + type=str, + default=None, + help="JSON to specify materialized views to be created at the " + 'end of the index build, of the form `{ "view_name": ' + '"SPARQL query", ... }`; default: do not create any ' + "materialized views", + ) index_args["ulimit"] = arg( "--ulimit", type=int, @@ -116,10 +160,24 @@ def arg(*args, **kwargs): "files (default: 1048576 when the total size of the input files " "is larger than 10 GB)", ) + index_args["vocabulary_type"] = arg( + "--vocabulary-type", + type=str, + choices=[ + "on-disk-compressed", + "on-disk-uncompressed", + "in-memory-compressed", + "in-memory-uncompressed", + "on-disk-compressed-geo-split", + ], + default="on-disk-compressed", + help="The type of the vocabulary to use for the index " + " (default: `on-disk-compressed`)", + ) index_args["index_binary"] = arg( "--index-binary", type=str, - default="IndexBuilderMain", + default="qlever-index", help="The binary for building the index (this requires " "that you have compiled QLever on your machine)", ) @@ -137,6 +195,14 @@ def arg(*args, **kwargs): "large enough to contain the end of at least one statement " "(default: 10M)", ) + index_args["encode_as_id"] = arg( + "--encode-as-id", + type=str, + help="Space-separated list of IRI prefixes (without angle " + "brackets); IRIs that start with one of these prefixes, followed " + "by a sequence of digits, do not require a vocabulary entry but " + "are directly encoded in the ID (default: none)", + ) index_args["only_pso_and_pos_permutations"] = arg( "--only-pso-and-pos-permutations", action="store_true", @@ -145,12 +211,19 @@ def arg(*args, **kwargs): ) index_args["use_patterns"] = arg( "--use-patterns", - action="store_true", - default=True, - 
help="Precompute so-called patterns needed for fast processing" - " of queries like SELECT ?p (COUNT(DISTINCT ?s) AS ?c) " + choices=["yes", "no"], + default="yes", + help="Whether to precompute the so-called patterns used for fast " + "processing of queries like SELECT ?p (COUNT(DISTINCT ?s) AS ?c) " "WHERE { ?s ?p [] ... } GROUP BY ?p", ) + index_args["add_has_word_triples"] = arg( + "--add-has-word-triples", + action="store_true", + default=False, + help="Whether to add `ql:has-word` triples for text literals " + "(which can then be used for custom text search queries)", + ) index_args["text_index"] = arg( "--text-index", choices=[ @@ -181,14 +254,13 @@ def arg(*args, **kwargs): server_args["server_binary"] = arg( "--server-binary", type=str, - default="ServerMain", + default="qlever-server", help="The binary for starting the server (this requires " "that you have compiled QLever on your machine)", ) server_args["host_name"] = arg( "--host-name", type=str, - default="localhost", help="The name of the host on which the server listens for " "requests", ) @@ -261,10 +333,10 @@ def arg(*args, **kwargs): ) server_args["use_patterns"] = arg( "--use-patterns", - action="store_true", - default=True, - help="Use the patterns precomputed during the index build" - " (see `qlever index --help` for their utility)", + choices=["yes", "no"], + default="yes", + help="Whether to use the patterns precomputed during the index " + "build (see `qlever index --help` for their utility)", ) server_args["use_text_index"] = arg( "--use-text-index", @@ -416,10 +488,15 @@ def read(qleverfile_path): server = config["server"] if index.get("text_index", "none") != "none": server["use_text_index"] = "yes" + if index.get("only_pso_and_pos_permutations", "false") == "true": + index["use_patterns"] = "no" + if index.get("use_patterns", None) == "no": + server["use_patterns"] = "no" # Add other non-trivial default values. 
try: - config["server"]["host_name"] = socket.gethostname() + if config["server"].get("host_name") is None: + config["server"]["host_name"] = socket.gethostname() except Exception: log.warning( "Could not get the hostname, using `localhost` as default" diff --git a/src/qlever/util.py b/src/qlever/util.py index b5c8f35d..60006e40 100644 --- a/src/qlever/util.py +++ b/src/qlever/util.py @@ -76,7 +76,7 @@ def run_command( else: raise Exception( f"Command failed with exit code {result.returncode}, " - f" nothing written to stderr" + f" nothing written to stderr (stdout: {result.stdout})" ) # Optionally, return what was written to `stdout`. if return_output: @@ -88,6 +88,7 @@ def run_curl_command( headers: dict[str, str] = {}, params: dict[str, str] = {}, result_file: Optional[str] = None, + max_time: int | None = None, ) -> str: """ Run `curl` with the given `url`, `headers`, and `params`. If `result_file` @@ -99,7 +100,7 @@ def run_curl_command( default_result_file = "/tmp/qlever.curl.result" actual_result_file = result_file if result_file else default_result_file curl_cmd = ( - f'curl -s -o "{actual_result_file}"' + f'curl -Ls -o "{actual_result_file}"' f' -w "%{{http_code}}\n" {url}' + "".join([f' -H "{key}: {value}"' for key, value in headers.items()]) + "".join( @@ -109,6 +110,8 @@ def run_curl_command( ] ) ) + if max_time is not None: + curl_cmd += f" --max-time {int(max_time)}" result = subprocess.run( curl_cmd, shell=True, @@ -154,17 +157,28 @@ def is_qlever_server_alive(endpoint_url: str) -> bool: return False -def get_existing_index_files(basename: str) -> list[str]: +def get_existing_index_files(basename: str, add_non_essential: bool = False) -> list[str]: """ Helper function that returns a list of all index files for `basename` in the current working directory. """ + + # Essential index files. 
existing_index_files = [] existing_index_files.extend(Path.cwd().glob(f"{basename}.index.*")) + existing_index_files.extend(Path.cwd().glob(f"{basename}.internal.index.*")) existing_index_files.extend(Path.cwd().glob(f"{basename}.text.*")) existing_index_files.extend(Path.cwd().glob(f"{basename}.vocabulary.*")) existing_index_files.extend(Path.cwd().glob(f"{basename}.meta-data.json")) existing_index_files.extend(Path.cwd().glob(f"{basename}.prefixes")) + + # Non-essential index files. + if add_non_essential: + existing_index_files.extend(Path.cwd().glob(f"{basename}.view.*")) + existing_index_files.extend(Path.cwd().glob(f"{basename}.settings.json")) + existing_index_files.extend(Path.cwd().glob(f"{basename}.index-log.txt")) + existing_index_files.extend(Path.cwd().glob(f"{basename}.server-log.txt")) + # Return only the file names, not the full paths. return [path.name for path in existing_index_files] @@ -287,8 +301,9 @@ def stop_process_with_regex(cmdline_regex: str) -> list[bool] | None: ) cmdline = " ".join(pinfo["cmdline"]) except Exception as e: + # For some processes (e.g., zombies), getting info may fail. log.debug(f"Error getting process info: {e}") - return None + continue if re.search(cmdline_regex, cmdline): log.info( f"Found process {pinfo['pid']} from user " @@ -299,19 +314,46 @@ def stop_process_with_regex(cmdline_regex: str) -> list[bool] | None: return stop_process_results -def binary_exists(binary: str, cmd_arg: str) -> bool: +def binary_exists(binary: str, cmd_arg: str, args) -> bool: """ - When a command is run natively, check if the binary exists on the system + Check if the binary exists on the user's system. If running inside a + container, check if the binary exists inside the container system. 
""" + from qlever.containerize import Containerize + + is_containerized = args.system in Containerize.supported_systems() + cmd = f"{binary} --help" + if is_containerized: + cmd = Containerize().containerize_command( + cmd, + args.system, + "run --rm", + args.image, + "qlever.check-binary", + volumes=[("$(pwd)", "/index")], + working_directory="/index", + ) + try: - run_command(f"{binary} --help") + run_command(cmd) return True except Exception as e: - log.error( - f'Running "{binary}" failed, ' - f"set `--{cmd_arg}` to a different binary or " - f"set `--system to a container system`" - ) + if is_containerized and ( + binary == "qlever-index" or binary == "qlever-server" + ): + log.error( + f'Running "{binary}" failed. ' + f"This might be because you are using a newer version of " + f"the `qlever` command-line tool together with an older " + f"Docker image; in that case update with " + f"`{args.system} pull {args.image}` " + ) + else: + log.error( + f'Running "{binary}" failed, ' + f"set `--{cmd_arg}` to a different binary or " + f"set `--system to a container system`" + ) log.info("") log.info(f"The error message was: {e}") return False diff --git a/src/qoxigraph/commands/benchmark_queries.py b/src/qoxigraph/commands/benchmark_queries.py new file mode 100644 index 00000000..4f285520 --- /dev/null +++ b/src/qoxigraph/commands/benchmark_queries.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from qlever.commands.benchmark_queries import ( + BenchmarkQueriesCommand as QleverBenchmarkQueriesCommand, +) + + +class BenchmarkQueriesCommand(QleverBenchmarkQueriesCommand): + """ + Run benchmark queries against the Oxigraph SPARQL endpoint. + Overrides the default endpoint to use Oxigraph's /query path. 
+ """ + + def execute(self, args) -> bool: + if not args.sparql_endpoint: + args.sparql_endpoint = f"{args.host_name}:{args.port}/query" + return super().execute(args) diff --git a/src/qoxigraph/commands/index.py b/src/qoxigraph/commands/index.py index 128b9a82..82135914 100644 --- a/src/qoxigraph/commands/index.py +++ b/src/qoxigraph/commands/index.py @@ -1,18 +1,48 @@ from __future__ import annotations -import glob import shlex +import time from pathlib import Path +import qlever.util as util from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log -from qlever.util import binary_exists, run_command + + +def wrap_cmd_in_container(args, cmd: str, ulimit: int | None = None) -> str: + """ + Wrap an indexing command in a container that is automatically removed + after the process exits (`--rm`) Use `use_bash=False` as Oxigraph image + doesn't support bash entrypoint. + """ + run_subcommand = "run --rm" + if ulimit: + run_subcommand += f" --ulimit nofile={ulimit}:{ulimit}" + return Containerize().containerize_command( + cmd=cmd, + container_system=args.system, + run_subcommand=run_subcommand, + image_name=args.image, + container_name=args.index_container, + volumes=[("$(pwd)", "/opt")], + working_directory="/opt", + use_bash=False, + ) class IndexCommand(QleverCommand): + """ + Build an Oxigraph index for an RDF dataset. The indexing workflow is: + 1. Run `oxigraph load` to import input files into a RocksDB store. + 2. Optionally run `oxigraph optimize` to compact storage for read-only use. + + For large datasets (>5 GB), the file descriptor ulimit is raised + automatically because RocksDB opens many .sst files concurrently. 
+ """ + def __init__(self): - self.script_name = "qoxigraph" + pass def description(self) -> str: return "Build the index for a given RDF dataset" @@ -20,90 +50,122 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name", "format"], - "index": ["input_files"], + "index": [ + "input_files", + "ulimit", + "index_binary", + "lenient", + "extra_args", + ], + "server": ["read_only"], "runtime": ["system", "image", "index_container"], } def additional_arguments(self, subparser): - subparser.add_argument( - "--index-binary", - type=str, - default="oxigraph", - help=( - "The binary for building the index (default: oxigraph) " - "(this requires that you have oxigraph-cli installed " - "on your machine)" - ), - ) - - @staticmethod - def wrap_cmd_in_container(args, cmd: str) -> str: - return Containerize().containerize_command( - cmd=cmd, - container_system=args.system, - run_subcommand="run --rm", - image_name=args.image, - container_name=args.index_container, - volumes=[("$(pwd)", "/index")], - working_directory="/index", - use_bash=False, - ) + pass def execute(self, args) -> bool: - index_cmd = f"load --location . --file {args.input_files}" - index_cmd += f" |& tee {args.name}.index-log.txt" - + cmds_to_execute = [] index_cmd = ( - f"{args.index_binary} {index_cmd}" - if args.system == "native" - else self.wrap_cmd_in_container(args, index_cmd) + f"load {'--lenient ' if args.lenient == 'yes' else ''}" + f"--location {args.name}_index/ --file {args.input_files} " + f"{args.extra_args} |& tee {args.name}.index-log.txt" + ) + + ulimit = args.ulimit + # RocksDB opens many .sst files concurrently. For datasets larger + # than 5 GB, raise the file descriptor limit so the process does + # not hit the default OS soft limit. 
+ total_file_size = util.get_total_file_size( + shlex.split(args.input_files) ) + if not ulimit and total_file_size > 5e9: + ulimit = 500_000 + if args.system in Containerize.supported_systems(): + index_cmd = wrap_cmd_in_container(args, index_cmd, ulimit) + else: + index_cmd = f"{args.index_binary} {index_cmd}" + if ulimit: + index_cmd = f"ulimit -Sn {ulimit} && {index_cmd}" + + cmds_to_execute.append(index_cmd) + + # Compact the RocksDB storage for read-only serving. This reduces + # disk usage and speeds up queries but makes the index immutable. + optimize_cmd = None + if args.read_only == "yes": + optimize_cmd = f"optimize -l {args.name}_index/" + if args.system in Containerize.supported_systems(): + optimize_cmd = wrap_cmd_in_container(args, optimize_cmd) + else: + optimize_cmd = f"{args.index_binary} {optimize_cmd}" + cmds_to_execute.append(optimize_cmd) # Show the command line. - self.show(index_cmd, only_show=args.show) + self.show("\n".join(cmds_to_execute), only_show=args.show) if args.show: return True - # Check if all of the input files exist. - for pattern in shlex.split(args.input_files): - if len(glob.glob(pattern)) == 0: - log.error(f'No file matching "{pattern}" found') - log.info("") - log.info( - f"Did you call `{self.script_name} get-data`? If you did, " - "check GET_DATA_CMD and INPUT_FILES in the Qleverfile" - ) - return False + if not util.input_files_exist(args.input_files): + return False # When running natively, check if the binary exists and works. - if args.system == "native": - if not binary_exists(args.index_binary, "index-binary"): - return False - else: + if args.system in Containerize.supported_systems(): if Containerize().is_running(args.system, args.index_container): log.info( f"{args.system} container {args.index_container} is still up, " "which means that data loading is in progress. Please wait..." 
) return False + else: + if not util.binary_exists(args.index_binary, "index-binary", args): + return False - if len([p.name for p in Path.cwd().glob("*.sst")]) != 0: + # Abort if a previous index already exists. RocksDB .sst files in + # the index directory indicate an existing store. + if ( + len([p.name for p in Path(f"{args.name}_index").glob("*.sst")]) + != 0 + ): log.error( - "Index files (*.sst) found in current directory " + f"Index files (*.sst) found in {args.name}_index directory " "which shows presence of a previous index" ) log.info("") log.info("Aborting the index operation...") return False - # Run the index command. + # Run the index command and record the elapsed time in the log + # file. Oxigraph's progress output is unreliable (may not print a + # final summary line when loading multiple files), so we measure + # the time externally. + log_file_name = f"{args.name}.index-log.txt" try: - run_command(index_cmd, show_output=True, show_stderr=True) + start_time = time.time() + util.run_command(index_cmd, show_output=True, show_stderr=True) + elapsed_s = time.time() - start_time + with open(log_file_name, "a") as f: + f.write(f"Total elapsed time: {elapsed_s:.0f}s\n") except Exception as e: log.error(f"Building the index failed: {e}") return False + if optimize_cmd: + try: + log.info("") + log.info("Optimizing read-only database storage:") + self.show(optimize_cmd) + util.run_command( + optimize_cmd, show_output=True, show_stderr=True + ) + except Exception as e: + log.error(f"Optimizing the database storage failed: {e}") + log.info( + f"Please run manually: " + f"{args.index_binary} optimize -l {args.name}_index/" + ) + return True diff --git a/src/qoxigraph/commands/index_stats.py b/src/qoxigraph/commands/index_stats.py new file mode 100644 index 00000000..672f4a53 --- /dev/null +++ b/src/qoxigraph/commands/index_stats.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import re + +import qlever.util as util +from qlever.commands.index_stats 
import ( + IndexStatsCommand as QleverIndexStatsCommand, +) +from qlever.commands.index_stats import ( + get_size_unit, + get_size_unit_factor, + get_time_unit, + get_time_unit_factor, +) +from qlever.log import log + + +class IndexStatsCommand(QleverIndexStatsCommand): + """ + Show index build time and disk space usage for an Oxigraph dataset. + Time is read from the "Total elapsed time" line appended to the + index log by the index command; space is the sum of all .sst files. + """ + + def execute_time( + self, args, log_file_name: str + ) -> dict[str, tuple[float | None, str]]: + """Parse total index build time from the index log file.""" + try: + # Read the last few lines of the log file (the total time is + # always near the end). + log_text = util.run_command( + f"tail {log_file_name}", return_output=True + ) + except Exception as e: + log.error(f"Problem reading index log file {log_file_name}: {e}") + return {} + + stats = {} + # Pattern: "Total elapsed time: s" (total time, always last) + total_pattern = re.compile(r"Total elapsed time: ([\d,]+)s$") + + for line in log_text.splitlines(): + match = total_pattern.search(line) + if not match: + continue + + try: + value_s = float(match.group(1).replace(",", "")) + except (ValueError, TypeError): + continue + + time_unit = get_time_unit(args.time_unit, value_s) + unit_factor = get_time_unit_factor(time_unit) + + stats["TOTAL time"] = (value_s / unit_factor, time_unit) + break + + return stats + + def execute_space(self, args) -> dict[str, tuple[float, str]]: + """ + Return the space used by the index files (*.sst) along with the unit. 
+ """ + index_size = util.get_total_file_size([f"{args.name}_index/*.sst"]) + + size_unit = get_size_unit(args.size_unit, index_size) + unit_factor = get_size_unit_factor(size_unit) + + index_size /= unit_factor + + return {"TOTAL size": (index_size, size_unit)} diff --git a/src/qoxigraph/commands/log.py b/src/qoxigraph/commands/log.py index a90d2228..401d2148 100644 --- a/src/qoxigraph/commands/log.py +++ b/src/qoxigraph/commands/log.py @@ -1,5 +1,6 @@ from __future__ import annotations +from qlever import script_name from qlever.commands.log import LogCommand as QleverLogCommand from qlever.containerize import Containerize from qlever.log import log @@ -7,10 +8,16 @@ class LogCommand(QleverLogCommand): + """ + Show server logs for Oxigraph. For native execution, tails the log + file as usual. For containers, uses `docker/podman logs` as it is + not possible to redirect oxigraph logs to a log file. + """ + def __init__(self): - self.script_name = "qoxigraph" + pass - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name"], "runtime": [ @@ -21,9 +28,12 @@ def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: } def execute(self, args) -> bool: - if args.system == "native": + if args.system not in Containerize.supported_systems(): return super().execute(args) + # Handle container logging using docker/podman logs command instead of tail + # This is because we don't have .server-log.txt for + # containerized execution log_cmd = f"{args.system} logs " if not args.from_beginning: @@ -40,7 +50,7 @@ def execute(self, args) -> bool: if not Containerize().is_running(args.system, args.server_container): log.error(f"No server container {args.server_container} found!\n") - log.info(f"Are you sure you called `{self.script_name} start`?") + log.info(f"Are you sure you called `{script_name} start`?") return False try: diff --git a/src/qoxigraph/commands/query.py 
b/src/qoxigraph/commands/query.py index 6518905f..bc3fb35c 100644 --- a/src/qoxigraph/commands/query.py +++ b/src/qoxigraph/commands/query.py @@ -4,6 +4,12 @@ class QueryCommand(QleverQueryCommand): + """ + Send a SPARQL query to the Oxigraph server. Extends the base query + command with Oxigraph's /query endpoint and supported result formats. + This class is used as the base QueryCommand by all the other new engines. + """ + def additional_arguments(self, subparser) -> None: subparser.add_argument( "query", @@ -47,8 +53,10 @@ def additional_arguments(self, subparser) -> None: ) def execute(self, args) -> bool: + # Oxigraph's SPARQL endpoint is at /query. if not args.sparql_endpoint: - args.sparql_endpoint = f"localhost:{args.port}/query" + args.sparql_endpoint = f"{args.host_name}:{args.port}/query" + # These QLever-specific options are not supported by Oxigraph. args.pin_to_cache = None args.access_token = None - super().execute(args) + return super().execute(args) diff --git a/src/qoxigraph/commands/setup_config.py b/src/qoxigraph/commands/setup_config.py index b6d9225b..ed286269 100644 --- a/src/qoxigraph/commands/setup_config.py +++ b/src/qoxigraph/commands/setup_config.py @@ -3,14 +3,24 @@ from configparser import RawConfigParser from pathlib import Path -from qlever.command import QleverCommand +from qlever.commands.setup_config import ( + SetupConfigCommand as QleverSetupConfigCommand, +) from qlever.log import log from qlever.qleverfile import Qleverfile -class SetupConfigCommand(QleverCommand): +class SetupConfigCommand(QleverSetupConfigCommand): + """ + Create a Qleverfile for Oxigraph from a dataset template from `src/qlever/Qleverfiles`. + Filters the template to keep only the relevant sections and adds Oxigraph-specific + defaults (read-only mode, query timeout). + This class is used as the base SetupConfigCommand by all the other new engines. 
+ """ + IMAGE = "ghcr.io/oxigraph/oxigraph" + # Sections and keys to retain when filtering a Qleverfile template. FILTER_CRITERIA = { "data": [], "index": ["INPUT_FILES"], @@ -19,91 +29,63 @@ class SetupConfigCommand(QleverCommand): "ui": ["UI_CONFIG"], } - def __init__(self): - self.qleverfiles_path = ( - Path(__file__).parent.parent.parent / "qlever" / "Qleverfiles" - ) - self.qleverfile_names = [ - p.name.split(".")[1] - for p in self.qleverfiles_path.glob("Qleverfile.*") - ] - - def description(self) -> str: - return "Get a pre-configured Qleverfile" - - def should_have_qleverfile(self) -> bool: - return False - - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: - return {} - - def additional_arguments(self, subparser) -> None: - subparser.add_argument( - "config_name", - type=str, - choices=self.qleverfile_names, - help="The name of the pre-configured Qleverfile to create", - ) + @staticmethod + def construct_engine_specific_params(args) -> dict[str, dict[str, str]]: + """Return Oxigraph-specific defaults to inject into the Qleverfile.""" + return {"server": {"READ_ONLY": "yes", "TIMEOUT": "60s"}} + + @staticmethod + def add_engine_specific_option_values( + qleverfile_parser: RawConfigParser, + engine_specific_params: dict[str, dict[str, str]], + ) -> None: + """Merge engine-specific parameters into the Qleverfile parser.""" + for section, option_dict in engine_specific_params.items(): + if qleverfile_parser.has_section(section): + for option, value in option_dict.items(): + qleverfile_parser.set(section, option, value) - def validate_qleverfile_setup( - self, args, qleverfile_path: Path - ) -> bool | None: + def execute(self, args) -> bool: # Construct the command line and show it. 
+ template_path = ( + self.qleverfiles_path / f"Qleverfile.{args.config_name}" + ) setup_config_show = ( - f"Creating Qleverfile for {args.config_name} using " - f"Qleverfile.{args.config_name} file in {self.qleverfiles_path}" + f"Qleverfile for {args.config_name} will be created using " + f"Qleverfile.{args.config_name} file in {template_path}" ) self.show(setup_config_show, only_show=args.show) if args.show: return True # If there is already a Qleverfile in the current directory, exit. - if qleverfile_path.exists(): - log.error("`Qleverfile` already exists in current directory") - log.info("") - log.info( - "If you want to create a new Qleverfile using " - "`qlever setup-config`, delete the existing Qleverfile " - "first" - ) + if self.check_qleverfile_exists(): return False - return None - def get_filtered_qleverfile_parser( - self, config_name: str - ) -> RawConfigParser: - qleverfile_config_path = ( - self.qleverfiles_path / f"Qleverfile.{config_name}" - ) - qleverfile_parser = Qleverfile.filter( - qleverfile_config_path, self.FILTER_CRITERIA - ) - if qleverfile_parser.has_section("runtime"): - qleverfile_parser.set("runtime", "IMAGE", self.IMAGE) - return qleverfile_parser - - def execute(self, args) -> bool: qleverfile_path = Path("Qleverfile") - exit_status = self.validate_qleverfile_setup(args, qleverfile_path) - if exit_status is not None: - return exit_status - qleverfile_parser = self.get_filtered_qleverfile_parser( - args.config_name - ) - # Copy the Qleverfile to the current directory. 
try: + qleverfile_parser = Qleverfile.filter( + template_path, self.FILTER_CRITERIA + ) + qleverfile_parser.set("runtime", "IMAGE", self.IMAGE) + params = self.construct_engine_specific_params(args) + self.add_engine_specific_option_values(qleverfile_parser, params) + for section, arg_name in self.override_args: + if arg_value := getattr(args, arg_name, None): + qleverfile_parser.set( + section, arg_name.upper(), str(arg_value) + ) with qleverfile_path.open("w") as f: qleverfile_parser.write(f) + + log.info( + f'Created Qleverfile for config "{args.config_name}"' + f" in current directory" + ) + return True except Exception as e: log.error( f'Could not copy "{qleverfile_path}" to current directory: {e}' ) return False - - # If we get here, everything went well. - log.info( - f'Created Qleverfile for config "{args.config_name}"' - f" in current directory" - ) - return True diff --git a/src/qoxigraph/commands/start.py b/src/qoxigraph/commands/start.py index 8a038344..7fbfd81f 100644 --- a/src/qoxigraph/commands/start.py +++ b/src/qoxigraph/commands/start.py @@ -4,15 +4,65 @@ import time from pathlib import Path +from qlever import script_name from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log -from qlever.util import binary_exists, is_server_alive, run_command +from qlever.util import ( + binary_exists, + is_server_alive, + run_command, + tail_log_file, +) +from qoxigraph.commands.stop import StopCommand + + +def timeout_supported(args, serve_ps: str) -> bool: + """Check whether the oxigraph server binary supports query timeouts.""" + help_cmd = f"{serve_ps} --help" + if args.system in Containerize.supported_systems(): + help_cmd = f"{args.system} run --rm {args.image} {help_cmd}" + else: + help_cmd = f"{args.server_binary} {help_cmd}" + try: + help_output = run_command(help_cmd, return_output=True) + return "timeout-s" in help_output + except Exception as e: + log.warning( + "Could not determine if query 
timeouts are supported by this version " + f"of Oxigraph! Falling back to no timeouts. Error: {e}", + ) + return False + + +def wrap_cmd_in_container(args, cmd: str) -> str: + """Wrap the server start command in a container with restart policy.""" + run_subcommand = "run --restart=unless-stopped" + if not args.run_in_foreground: + run_subcommand += " -d" + return Containerize().containerize_command( + cmd=cmd, + container_system=args.system, + run_subcommand=run_subcommand, + image_name=args.image, + container_name=args.server_container, + volumes=[("$(pwd)", "/opt")], + ports=[(args.port, args.port)], + working_directory="/opt", + use_bash=False, + ) class StartCommand(QleverCommand): + """ + Start the Oxigraph SPARQL server for an already-indexed dataset. + Supports both native and containerized execution, with an option + to run in the foreground. Uses `serve-read-only` or `serve` + depending on the read_only setting. + """ + def __init__(self): - self.script_name = "qoxigraph" + pass def description(self) -> str: return ( @@ -23,10 +73,17 @@ def description(self) -> str: def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name"], - "server": ["host_name", "port"], + "server": [ + "host_name", + "port", + "read_only", + "server_binary", + "timeout", + "extra_args", + ], "runtime": ["system", "image", "server_container"], } @@ -40,50 +97,44 @@ def additional_arguments(self, subparser): "(default: run in the background)" ), ) - subparser.add_argument( - "--server-binary", - type=str, - default="oxigraph", - help=( - "The binary for starting the server (default: oxigraph) " - "(this requires that you have oxigraph-cli installed " - "on your machine)" - ), - ) - - @staticmethod - def wrap_cmd_in_container(args, cmd: str) -> str: - run_subcommand = "run --restart=unless-stopped" - if not args.run_in_foreground: - 
run_subcommand += " -d" - return Containerize().containerize_command( - cmd=cmd, - container_system=args.system, - run_subcommand=run_subcommand, - image_name=args.image, - container_name=args.server_container, - volumes=[("$(pwd)", "/index")], - ports=[(args.port, args.port)], - working_directory="/index", - use_bash=False, - ) def execute(self, args) -> bool: + # Inside a container, bind to 0.0.0.0 so the port mapping is + # reachable from the host; natively, bind to the configured host. bind = ( - f"{args.host_name}:{args.port}" - if args.system == "native" - else f"0.0.0.0:{args.port}" + f"0.0.0.0:{args.port}" + if args.system in Containerize.supported_systems() + else f"{args.host_name}:{args.port}" ) - start_cmd = f"serve-read-only --location . --bind={bind}" - - if args.system == "native": - start_cmd = f"{args.server_binary} {start_cmd}" - if not args.run_in_foreground: - start_cmd = ( - f"nohup {start_cmd} > {args.name}.server-log.txt 2>&1 &" + process = "serve-read-only" if args.read_only == "yes" else "serve" + timeout_str = "" + if timeout_supported(args, process): + try: + timeout_s = int(args.timeout[:-1]) + except ValueError as e: + log.warning( + f"Invalid timeout value {args.timeout}. Error: {e}" ) + log.info("Setting timeout to 60s!") + timeout_s = 60 + timeout_str = f"--timeout-s {timeout_s}" else: - start_cmd = self.wrap_cmd_in_container(args, start_cmd) + log.info( + f"Ignoring the set timeout value of {args.timeout} as your " + "version of Oxigraph doesn't currently support query timeouts!" + ) + + start_cmd = ( + f"{process} --location {args.name}_index/ {args.extra_args} " + f"{timeout_str} --bind={bind}" + ) + + if args.system in Containerize.supported_systems(): + start_cmd = wrap_cmd_in_container(args, start_cmd) + else: + start_cmd = f"{args.server_binary} {start_cmd} > {args.name}.server-log.txt 2>&1" + if not args.run_in_foreground: + start_cmd = f"nohup {start_cmd} &" # Show the command line. 
self.show(start_cmd, only_show=args.show) @@ -93,35 +144,34 @@ def execute(self, args) -> bool: endpoint_url = f"http://{args.host_name}:{args.port}/query" # When running natively, check if the binary exists and works. - if args.system == "native": - if not binary_exists(args.server_binary, "server-binary"): - return False - else: - if Containerize().is_running(args.system, args.server_container): - log.error( - f"Server container {args.server_container} already exists!\n" - ) - log.info( - f"To kill the existing server, use `{self.script_name} stop`" - ) + if args.system not in Containerize.supported_systems(): + if not binary_exists(args.server_binary, "server-binary", args): return False - # Check if index files (*.sst) present in cwd - if len([p.name for p in Path.cwd().glob("*.sst")]) == 0: + # Check if index files (*.sst) present in index directory + if ( + len([p.name for p in Path(f"{args.name}_index/").glob("*.sst")]) + == 0 + ): log.error(f"No Oxigraph index files for {args.name} found!\n") log.info( - f"Did you call `{self.script_name} index`? If you did, check " - "if .sst index files are present in current working directory." + f"Did you call `{script_name} index`? If you did, check " + "if .sst index files are present in index directory." ) return False + # Check if server already alive at endpoint url from a previous run if is_server_alive(url=endpoint_url): log.error(f"Oxigraph server already running on {endpoint_url}\n") log.info( - f"To kill the existing server, use `{self.script_name} stop`" + f"To kill the existing server, use `{script_name} stop`" ) return False + # Remove old log file so that tail starts clean. 
+ log_file = Path(f"{args.name}.server-log.txt") + log_file.unlink(missing_ok=True) + try: process = run_command( start_cmd, @@ -145,12 +195,17 @@ def execute(self, args) -> bool: " (Ctrl-C stops following the log, but NOT the server)" ) log.info("") - if args.system == "native": - log_cmd = f"exec tail -f {args.name}.server-log.txt" - else: + # For containers, use `docker/podman logs -f` as Oxigraph doesn't + # support redirecting logs to a log file. A short delay ensures + # the container is up before attaching. + if args.system in Containerize.supported_systems(): time.sleep(2) log_cmd = f"exec {args.system} logs -f {args.server_container}" - log_proc = subprocess.Popen(log_cmd, shell=True) + log_proc = subprocess.Popen(log_cmd, shell=True) + else: + log_proc = tail_log_file(log_file) + if log_proc is None: + return False while not is_server_alive(endpoint_url): time.sleep(1) @@ -165,11 +220,15 @@ def execute(self, args) -> bool: log_proc.terminate() # With `--run-in-foreground`, wait until the server is stopped. + # On Ctrl-C, terminate the process and clean up the container. 
if args.run_in_foreground: try: process.wait() except KeyboardInterrupt: process.terminate() + if args.system in Containerize.supported_systems(): + args.cmdline_regex = StopCommand.DEFAULT_REGEX + StopCommand().execute(args) log_proc.terminate() return True diff --git a/src/qoxigraph/commands/status.py b/src/qoxigraph/commands/status.py index eb2de86c..d73548dc 100644 --- a/src/qoxigraph/commands/status.py +++ b/src/qoxigraph/commands/status.py @@ -4,7 +4,9 @@ class StatusCommand(QleverStatusCommand): - DEFAULT_REGEX = "oxigraph\\s+serve-read-only" + """Show Oxigraph server processes running on this machine.""" + + DEFAULT_REGEX = "oxigraph\\s+serve" def description(self) -> str: return "Show Oxigraph processes running on this machine" diff --git a/src/qoxigraph/commands/stop.py b/src/qoxigraph/commands/stop.py index dedd1ff2..47308284 100644 --- a/src/qoxigraph/commands/stop.py +++ b/src/qoxigraph/commands/stop.py @@ -2,30 +2,37 @@ from qlever.command import QleverCommand from qlever.commands import stop as qlever_stop +from qlever.containerize import Containerize from qlever.log import log from qlever.util import stop_process_with_regex from qoxigraph.commands.status import StatusCommand class StopCommand(QleverCommand): + """ + Stop the Oxigraph server for a given dataset. For native execution, + finds and kills processes matching the dataset-name regex. For + containers, stops and removes the server container. 
+ """ + # Override this with StatusCommand from child class for execute # method to work as intended STATUS_COMMAND = StatusCommand() - DEFAULT_REGEX = "oxigraph\\s+serve-read-only.*:%%PORT%%" + # %%NAME%% is replaced at runtime with the dataset name from the Qleverfile + DEFAULT_REGEX = "oxigraph\\s+serve.*%%NAME%%_index" def __init__(self): pass def description(self) -> str: - return "Stop Oxigraph server for a given dataset or port" + return "Stop Oxigraph server for a given dataset" def should_have_qleverfile(self) -> bool: return True - def relevant_qleverfile_arguments(self) -> dict[str : list[str]]: + def relevant_qleverfile_arguments(self) -> dict[str, list[str]]: return { "data": ["name"], - "server": ["port"], "runtime": ["system", "server_container"], } @@ -38,18 +45,24 @@ def additional_arguments(self, subparser) -> None: ) def execute(self, args) -> bool: - cmdline_regex = args.cmdline_regex.replace("%%PORT%%", str(args.port)) + # Substitute the dataset name into the regex template so we only + # match the server running for this dataset. 
+ cmdline_regex = args.cmdline_regex + if "%%NAME%%" in args.cmdline_regex and hasattr(args, "name"): + cmdline_regex = args.cmdline_regex.replace( + "%%NAME%%", str(args.name) + ) description = ( - f'Checking for processes matching "{cmdline_regex}"' - if args.system == "native" - else f"Checking for container with name {args.server_container}" + f"Checking for container with name {args.server_container}" + if args.system in Containerize.supported_systems() + else f'Checking for processes matching "{cmdline_regex}"' ) self.show(description, only_show=args.show) if args.show: return True - if args.system == "native": + if args.system not in Containerize.supported_systems(): stop_process_results = stop_process_with_regex(cmdline_regex) if stop_process_results is None: return False diff --git a/src/qoxigraph/qleverfile.py b/src/qoxigraph/qleverfile.py new file mode 100644 index 00000000..467b77fa --- /dev/null +++ b/src/qoxigraph/qleverfile.py @@ -0,0 +1,77 @@ +from __future__ import annotations + + +def qleverfile_args(all_args: dict[str, dict[str, tuple]]) -> None: + """Define additional oxigraph specific Qleverfile parameters""" + + def arg(*args, **kwargs): + return (args, kwargs) + + index_args = all_args["index"] + server_args = all_args["server"] + + index_args["index_binary"] = arg( + "--index-binary", + type=str, + default="oxigraph", + help=( + "The binary for building the index (default: oxigraph) " + "(this requires that you have oxigraph-cli installed " + "on your machine)" + ), + ) + index_args["lenient"] = arg( + "--lenient", + type=str, + choices=["yes", "no"], + default="no", + help="Attempt to keep loading even if the data file is invalid", + ) + index_args["extra_args"] = arg( + "--extra-args", + type=str, + default="", + help=( + "Additional arguments to pass directly to the oxigraph load process. " + "This allows advanced users to specify options not exposed in " + "Qleverfile. The string is appended verbatim to the command." 
+ ), + ) + + server_args["server_binary"] = arg( + "--server-binary", + type=str, + default="oxigraph", + help=( + "The binary for starting the server (default: oxigraph) " + "(this requires that you have oxigraph-cli installed " + "on your machine)" + ), + ) + server_args["read_only"] = arg( + "--read-only", + type=str, + choices=["yes", "no"], + default="yes", + help=( + "The HTTP server will not permit mutation operations in " + "read-only mode" + ), + ) + server_args["timeout"] = arg( + "--timeout", + type=str, + default="60s", + help="The maximal time in seconds a query is allowed to run", + ) + server_args["extra_args"] = arg( + "--extra-args", + type=str, + default="", + help=( + "Additional arguments to pass directly to the oxigraph " + "serve/serve-read-only. This allows advanced users to specify " + "options not exposed in Qleverfile. The string is appended " + "verbatim to the command." + ), + ) diff --git a/test/qlever/commands/test_benchmark_queries_methods.py b/test/qlever/commands/test_benchmark_queries_methods.py new file mode 100644 index 00000000..9676f175 --- /dev/null +++ b/test/qlever/commands/test_benchmark_queries_methods.py @@ -0,0 +1,454 @@ +import pytest + +from qlever.commands.benchmark_queries import ( + filter_queries, + get_result_size, + get_single_int_result, + parse_queries_tsv, + parse_queries_yml, + resolve_benchmark_metadata, + sparql_query_type, +) + +MODULE = "qlever.commands.benchmark_queries" + +JSON_ACCEPT_HEADERS_AND_RESULT_FILES = [ + ("application/sparql-results+json", "result.json"), + ("application/qlever-results+json", "result.json"), +] + +ALL_ACCEPT_HEADERS_AND_RESULT_FILES = [ + ("text/csv", "result.csv"), + ("text/tab-separated-values", "result.tsv"), + *JSON_ACCEPT_HEADERS_AND_RESULT_FILES, +] + + +@pytest.mark.parametrize("download_or_count", ["count", "download"]) +@pytest.mark.parametrize( + "accept_header, result_file", ALL_ACCEPT_HEADERS_AND_RESULT_FILES +) +def test_empty_result_non_construct_describe( + 
mock_command, + download_or_count, + accept_header, + result_file, +): + mock_path_stat = mock_command(MODULE, "Path.stat") + mock_path_stat.return_value.st_size = 0 + run_cmd_mock = mock_command(MODULE, "run_command") + + size, err = get_result_size( + count_only=download_or_count == "count", + query_type="SELECT", + accept_header=accept_header, + result_file=result_file, + ) + + assert size == 0 + assert err["short"] == "Empty result" + assert ( + err["long"] == "curl returned with code 200, but the result is empty" + ) + run_cmd_mock.assert_not_called() + + +@pytest.mark.parametrize("download_or_count", ["count", "download"]) +@pytest.mark.parametrize( + "accept_header, result_file", ALL_ACCEPT_HEADERS_AND_RESULT_FILES +) +@pytest.mark.parametrize("query_type", ["CONSTRUCT", "DESCRIBE"]) +def test_empty_result_construct_describe( + mock_command, + download_or_count, + query_type, + accept_header, + result_file, +): + mock_path_stat = mock_command(MODULE, "Path.stat") + mock_path_stat.return_value.st_size = 0 + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "42" + + size, err = get_result_size( + count_only=download_or_count == "count", + query_type=query_type, + accept_header=accept_header, + result_file=result_file, + ) + + assert size == 42 + assert err is None + + +@pytest.mark.parametrize("download_or_count", ["count", "download"]) +@pytest.mark.parametrize( + "accept_header, result_file", ALL_ACCEPT_HEADERS_AND_RESULT_FILES +) +def test_count_and_download_success( + mock_command, + download_or_count, + accept_header, + result_file, +): + mock_path_stat = mock_command(MODULE, "Path.stat") + mock_path_stat.return_value.st_size = 100 + + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "42" + + size, err = get_result_size( + count_only=download_or_count == "count", + query_type="SELECT", + accept_header=accept_header, + result_file=result_file, + ) + + run_cmd_mock.assert_called_once() + assert 
size == 42 + assert err is None + + +def test_download_turtle_success(mock_command): + mock_path_stat = mock_command(MODULE, "Path.stat") + mock_path_stat.return_value.st_size = 100 + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "42" + + size, err = get_result_size( + count_only=False, + query_type="SELECT", + accept_header="text/turtle", + result_file="result.ttl", + ) + + run_cmd_mock.assert_called_once() + assert size == 42 + assert err is None + + +@pytest.mark.parametrize("download_or_count", ["count", "download"]) +@pytest.mark.parametrize( + "accept_header, result_file", JSON_ACCEPT_HEADERS_AND_RESULT_FILES +) +def test_download_and_count_json_malformed( + mock_command, + download_or_count, + accept_header, + result_file, +): + mock_path_stat = mock_command(MODULE, "Path.stat") + mock_path_stat.return_value.st_size = 100 + + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.side_effect = Exception("jq failed") + + size, err = get_result_size( + count_only=download_or_count == "count", + query_type="SELECT", + accept_header=accept_header, + result_file=result_file, + ) + + run_cmd_mock.assert_called_once() + assert size == 0 + assert err["short"] == "Malformed JSON" + assert ( + "curl returned with code 200, but the JSON is malformed: " + in err["long"] + ) + assert "jq failed" in err["long"] + + +def test_single_int_result_success(mock_command): + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "123" + + single_int_result = get_single_int_result("result.json") + + run_cmd_mock.assert_called_once() + assert single_int_result == 123 + + +def test_single_int_result_non_int_fail(mock_command): + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "abc" + + single_int_result = get_single_int_result("result.json") + + run_cmd_mock.assert_called_once() + assert single_int_result is None + + +def test_single_int_result_failure(mock_command): + 
run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.side_effect = Exception("jq failed") + + single_int_result = get_single_int_result("result.json") + + run_cmd_mock.assert_called_once() + assert single_int_result is None + + +@pytest.mark.parametrize( + "query, expected", + [ + # Basic types + ("SELECT ?x WHERE { ?x ?y ?z }", "SELECT"), + ("ASK { ?x ?y ?z }", "ASK"), + ("CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }", "CONSTRUCT"), + ("DESCRIBE ", "DESCRIBE"), + # Case insensitivity + ("Select ?x WHERE { ?x ?y ?z }", "SELECT"), + ("ask { ?x ?y ?z }", "ASK"), + ("construct { ?s ?p ?o } WHERE { ?s ?p ?o }", "CONSTRUCT"), + ("Describe ", "DESCRIBE"), + # With prefixes (first match wins) + ( + "PREFIX ex: SELECT ?x WHERE { ?x ex:p ?y }", + "SELECT", + ), + # First keyword wins when multiple present + ("SELECT ?x WHERE { ?x ?y ?z } CONSTRUCT { ?a ?b ?c }", "SELECT"), + # Unknown types + ("DELETE WHERE { ?x ?y ?z }", "UNKNOWN"), + ("", "UNKNOWN"), + ("SELECTED ?x WHERE { ?x ?y ?z }", "UNKNOWN"), + ], +) +def test_sparql_query_type(query, expected): + assert sparql_query_type(query) == expected + + +SAMPLE_QUERIES = [ + ("q1", "cities query", "SELECT ?x WHERE { ?x a :City }"), + ("q2", "countries", "SELECT ?c WHERE { ?c a :Country }"), + ("q3", "people", "SELECT ?p WHERE { ?p a :Person }"), + ("q4", "rivers", "CONSTRUCT { ?r ?p ?o } WHERE { ?r a :River }"), + ("q5", "mountains", "ASK { ?m a :Mountain }"), +] + + +@pytest.mark.parametrize( + "query_ids, expected_names", + [ + # Single ID + ("2", ["q2"]), + # Range + ("1-3", ["q1", "q2", "q3"]), + # $ as end of range + ("3-$", ["q3", "q4", "q5"]), + # $ as single value (last query) + ("$", ["q5"]), + # Comma-separated mixed + ("1,3,5", ["q1", "q3", "q5"]), + ("1-2,4-5", ["q1", "q2", "q4", "q5"]), + # All queries + ("1-$", ["q1", "q2", "q3", "q4", "q5"]), + # Out-of-range indices skipped + ("99", []), + ("4-7", ["q4", "q5"]), + # Leading/trailing commas (empty parts skipped) + (",1,2", ["q1", "q2"]), + ("1,2,", 
["q1", "q2"]), + # Whitespace around parts + (" 1 , 2 ", ["q1", "q2"]), + ], +) +def test_filter_queries_by_ids(query_ids, expected_names): + result = filter_queries(SAMPLE_QUERIES, query_ids, None) + assert [name for name, _, _ in result] == expected_names + + +@pytest.mark.parametrize( + "query_ids", + [ + # Negative range start → int("") raises ValueError + "-2", + # Non-numeric + "abc", + # Duplicate via single IDs + "1,1,2", + "1-3,2,4,$", + # Duplicate via overlapping ranges + "1-3,2-4", + ], +) +def test_filter_queries_invalid_ids(query_ids): + assert filter_queries(SAMPLE_QUERIES, query_ids, None) == [] + + +def test_filter_queries_empty_input(): + assert filter_queries([], "1-$", None) == [] + + +@pytest.mark.parametrize( + "query_regex, expected_names", + [ + # Match on name + ("q1", ["q1"]), + # Match on description + ("cities", ["q1"]), + # Match on query body + ("CONSTRUCT", ["q4"]), + # Case-insensitive + ("CITIES", ["q1"]), + # Regex matching multiple queries + ("Country|Person", ["q2", "q3"]), + # No match + ("abcd", []), + ], +) +def test_filter_queries_by_regex(query_regex, expected_names): + result = filter_queries(SAMPLE_QUERIES, "1-$", query_regex) + assert [name for name, _, _ in result] == expected_names + + +@pytest.mark.parametrize( + "query_ids, query_regex, expected_names", + [ + ("1-3", "Country", ["q2"]), + ("2,3", "cities", []), + ], +) +def test_filter_queries_ids_and_regex_combined( + query_ids, query_regex, expected_names +): + result = filter_queries(SAMPLE_QUERIES, query_ids, query_regex) + assert [name for name, _, _ in result] == expected_names + + +def test_filter_queries_invalid_regex(): + assert filter_queries(SAMPLE_QUERIES, "1-$", "[invalid") == [] + + +VALID_YML = """\ +name: My Benchmark +description: A test benchmark +queries: + - name: q1 + description: first query + query: SELECT ?x WHERE { ?x ?y ?z } + - name: q2 + query: ASK { ?x ?y ?z } +""" + + +def test_parse_queries_yml_valid(tmp_path): + yml_file = tmp_path / 
"test.yml" + yml_file.write_text(VALID_YML) + name, description, queries = parse_queries_yml(str(yml_file)) + assert name == "My Benchmark" + assert description == "A test benchmark" + assert queries == [ + ("q1", "first query", "SELECT ?x WHERE { ?x ?y ?z }"), + ("q2", "", "ASK { ?x ?y ?z }"), + ] + + +def test_parse_queries_yml_no_top_level_name(tmp_path): + yml_file = tmp_path / "test.yml" + yml_file.write_text("queries:\n - name: q1\n query: SELECT 1\n") + name, description, queries = parse_queries_yml(str(yml_file)) + assert name is None + assert description is None + assert queries == [("q1", "", "SELECT 1")] + + +@pytest.mark.parametrize( + "yml_content", + [ + # Missing top-level 'queries' key + "name: test\n", + # 'queries' is not a list + "queries: not_a_list\n", + # Query item missing 'name' + "queries:\n - query: SELECT 1\n", + # Query item missing 'query' + "queries:\n - name: q1\n", + # Query item is not a dict + "queries:\n - just a string\n", + # Not a dict at top level + "- item1\n- item2\n", + # Invalid YAML syntax + ":\n bad: [yaml\n", + ], +) +def test_parse_queries_yml_invalid(tmp_path, yml_content): + yml_file = tmp_path / "test.yml" + yml_file.write_text(yml_content) + assert parse_queries_yml(str(yml_file)) == (None, None, []) + + +def test_parse_queries_tsv_valid(mock_command): + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = ( + "q1\tSELECT ?x WHERE { ?x ?y ?z }\nq2\tASK { ?x ?y ?z }\n" + ) + result = parse_queries_tsv("cat queries.tsv") + assert result == [ + ("q1", "", "SELECT ?x WHERE { ?x ?y ?z }"), + ("q2", "", "ASK { ?x ?y ?z }"), + ] + + +def test_parse_queries_tsv_tab_in_query(mock_command): + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "q1\tSELECT ?x\tWHERE { ?x ?y ?z }\n" + result = parse_queries_tsv("cat queries.tsv") + assert result == [("q1", "", "SELECT ?x\tWHERE { ?x ?y ?z }")] + + +def test_parse_queries_tsv_empty_output(mock_command): + run_cmd_mock = 
mock_command(MODULE, "run_command") + run_cmd_mock.return_value = "" + assert parse_queries_tsv("cat queries.tsv") == [] + + +def test_parse_queries_tsv_command_failure(mock_command): + run_cmd_mock = mock_command(MODULE, "run_command") + run_cmd_mock.side_effect = Exception("command failed") + assert parse_queries_tsv("cat queries.tsv") == [] + + +@pytest.mark.parametrize( + "case", + [ + pytest.param( + dict(cli=("CLI", "CLI Desc"), yml=("YML", "YML Desc"), + dataset="wikidata", expected=("CLI", "CLI Desc")), + id="cli-takes-priority", + ), + pytest.param( + dict(cli=(None, None), yml=("YML", "YML Desc"), + dataset="wikidata", expected=("YML", "YML Desc")), + id="yml-over-default", + ), + pytest.param( + dict(cli=(None, None), yml=(None, None), + dataset="wikidata", expected=("Wikidata", "auto")), + id="default-from-dataset", + ), + pytest.param( + dict(cli=(None, None), yml=(None, None), + dataset=None, expected=(None, None)), + id="all-none", + ), + pytest.param( + dict(cli=("CLI", None), yml=(None, "YML Desc"), + dataset="wikidata", expected=("CLI", "YML Desc")), + id="cli-name-yml-desc", + ), + ], +) +def test_resolve_benchmark_metadata(case): + name, desc = resolve_benchmark_metadata( + *case["cli"], *case["yml"], case["dataset"] + ) + exp_name, exp_desc = case["expected"] + assert name == exp_name + if exp_desc == "auto": + assert case["dataset"].capitalize() in desc + assert "benchmark-queries" in desc + else: + assert desc == exp_desc diff --git a/test/qlever/commands/test_cache_stats_execute.py b/test/qlever/commands/test_cache_stats_execute.py index 7dd716a0..3b6b5f6d 100644 --- a/test/qlever/commands/test_cache_stats_execute.py +++ b/test/qlever/commands/test_cache_stats_execute.py @@ -20,7 +20,7 @@ def test_execute_successful_basic_cache_stats( ): # Mock arguments for basic cache stats args = MagicMock() - args.server_url = None + args.sparql_endpoint = None args.host_name = "localhorst" args.port = 1234 args.show = False @@ -29,13 +29,13 @@ def 
test_execute_successful_basic_cache_stats( # Mock `subprocess.check_output` and `json.loads` as encoded bytes mock_check_output.side_effect = [ # Mock cache_stats - b'{"pinned-size": 1e9, "non-pinned-size": 3e9}', + b'{"cache-size-pinned": 1e9, "cache-size-unpinned": 3e9}', # Mock cache_settings b'{"cache-max-size": "10 GB"}', ] # mock cache_stats_dict and cache_settings_dict as a dictionary mock_json_loads.side_effect = [ - {"pinned-size": 1e9, "non-pinned-size": 3e9}, + {"cache-size-pinned": 1e9, "cache-size-unpinned": 3e9}, {"cache-max-size": "10 GB"}, ] @@ -77,20 +77,20 @@ def test_execute_detailed_cache_stats( ): # Mock arguments for detailed cache stats args = MagicMock() - args.server_url = "http://testlocalhost:1234" + args.sparql_endpoint = "http://testlocalhost:1234" args.show = False args.detailed = True # Mock the responses from `subprocess.check_output` and `json.loads` mock_check_output.side_effect = [ - b'{"pinned-size": 2e9, "non-pinned-size": 1e9, "test-stat": 500}', + b'{"cache-size-pinned": 2e9, "cache-size-unpinned": 1e9, "test-stat": 500}', b'{"cache-max-size": "10 GB", "test-setting": 1000}', ] # CAREFUL: if value is float you will get an error in re.match mock_json_loads.side_effect = [ { - "pinned-size": int(2e9), - "non-pinned-size": int(1e9), + "cache-size-pinned": int(2e9), + "cache-size-unpinned": int(1e9), "test-stat": 500, }, {"cache-max-size": "10 GB", "test-setting": 1000}, @@ -101,10 +101,10 @@ def test_execute_detailed_cache_stats( # Assertions expected_stats_call = ( - f"curl -s {args.server_url} " f'--data-urlencode "cmd=cache-stats"' + f"curl -s {args.sparql_endpoint} " f'--data-urlencode "cmd=cache-stats"' ) expected_settings_call = ( - f"curl -s {args.server_url} " + f"curl -s {args.sparql_endpoint} " f'--data-urlencode "cmd=get-settings"' ) @@ -112,10 +112,10 @@ def test_execute_detailed_cache_stats( mock_check_output.assert_any_call(expected_settings_call, shell=True) # Verify that detailed stats and settings were logged as 
a table - mock_log.info.assert_any_call("pinned-size : 2,000,000,000") - mock_log.info.assert_any_call("non-pinned-size : 1,000,000,000") - mock_log.info.assert_any_call("test-stat : 500") mock_log.info.assert_any_call("cache-max-size : 10 GB") + mock_log.info.assert_any_call("cache-size-pinned : 2,000,000,000") + mock_log.info.assert_any_call("cache-size-unpinned : 1,000,000,000") + mock_log.info.assert_any_call("test-stat : 500") mock_log.info.assert_any_call("test-setting : 1,000") self.assertTrue(result) @@ -127,7 +127,7 @@ def test_execute_detailed_cache_stats( def test_execute_failed_cache_stats(self, mock_log, mock_check_output): # Mock arguments for basic cache stats args = MagicMock() - args.server_url = "http://testlocalhost:1234" + args.sparql_endpoint = "http://testlocalhost:1234" args.show = False args.detailed = False @@ -153,7 +153,7 @@ def test_execute_invalid_cache_size_format( ): # Mock arguments for basic cache stats args = MagicMock() - args.server_url = None + args.sparql_endpoint = None args.port = 1234 args.show = False args.detailed = False @@ -189,18 +189,18 @@ def test_execute_empty_cache_size( ): # Mock arguments for basic cache stats args = MagicMock() - args.server_url = None + args.sparql_endpoint = None args.port = 1234 args.show = False args.detailed = False # Mock the responses with empty cache size mock_check_output.side_effect = [ - b'{"pinned-size": 0, "non-pinned-size": 0}', + b'{"cache-size-pinned": 0, "cache-size-unpinned": 0}', b'{"cache-max-size": "10 GB"}', ] mock_json_loads.side_effect = [ - {"pinned-size": 0, "non-pinned-size": 0}, + {"cache-size-pinned": 0, "cache-size-unpinned": 0}, {"cache-max-size": "10 GB"}, ] diff --git a/test/qlever/commands/test_cache_stats_other_methods.py b/test/qlever/commands/test_cache_stats_other_methods.py index e07bee6e..85f27eac 100644 --- a/test/qlever/commands/test_cache_stats_other_methods.py +++ b/test/qlever/commands/test_cache_stats_other_methods.py @@ -34,12 +34,12 @@ def 
test_additional_arguments(self): # Test that the default value for server-url is set correctly """Why is there no default="localhost:{port}"? """ - self.assertEqual(args.server_url, None) + self.assertEqual(args.sparql_endpoint, None) # Test that the help text for server-url is correctly set argument_help = subparser._group_actions[-2].help self.assertEqual( - "URL of the QLever server, default is {host_name}:{port}", + "URL of the SPARQL endpoint, default is {host_name}:{port}", argument_help, ) diff --git a/test/qlever/commands/test_index_execute.py b/test/qlever/commands/test_index_execute.py index 30484970..cff13ab9 100644 --- a/test/qlever/commands/test_index_execute.py +++ b/test/qlever/commands/test_index_execute.py @@ -35,16 +35,20 @@ def test_execute_successful_indexing_without_extras( args.only_pso_and_pos_permutations = False args.use_patterns = True args.parallel_parsing = False + args.add_has_word_triples = False args.text_index = "Test Index" args.stxxl_memory = False args.system = "native" args.show = False args.overwrite_existing = False + args.vocabulary_type = "on-disk-compressed" args.index_container = "test_container" args.image = "test_image" args.multi_input_json = False args.ulimit = None + args.encode_as_id = None args.parser_buffer_size = None + args.materialized_views = None # Mock glob, get_total_file_size, get_existing_index_files, # run_command and containerize @@ -61,7 +65,8 @@ def test_execute_successful_indexing_without_extras( expected_index_cmd = ( f"{args.cat_input_files} | {args.index_binary}" f" -i {args.name} -s {args.name}.settings.json" - f" -F {args.format} -f - | tee" + f" --vocabulary-type {args.vocabulary_type}" + f" -F {args.format} -f - 2>&1 | tee" f" {args.name}.index-log.txt" ) index_cmd_call = call(expected_index_cmd, show_output=True) @@ -113,6 +118,7 @@ def test_execute_indexing_with_already_existing_files( args.input_files = "*.nt" args.only_pso_and_pos_permutations = False args.use_patterns = True + 
args.add_has_word_triples = False args.text_index = None args.stxxl_memory = None args.system = "native" @@ -121,6 +127,7 @@ def test_execute_indexing_with_already_existing_files( args.index_container = "test_container" args.image = "test_image" args.multi_input_json = False + args.materialized_views = None # Mock glob, get_total_file_size, get_existing_index_files, # run_command and containerize @@ -174,6 +181,7 @@ def test_execute_fails_if_no_indexing_binary_is_found( args.input_files = "*.nt" args.only_pso_and_pos_permutations = False args.use_patterns = True + args.add_has_word_triples = False args.text_index = None args.stxxl_memory = None args.system = "native" @@ -182,6 +190,7 @@ def test_execute_fails_if_no_indexing_binary_is_found( args.index_container = "test_container" args.image = "test_image" args.multi_input_json = False + args.materialized_views = None # Mock glob, get_total_file_size, get_existing_index_files, # run_command and containerize @@ -238,16 +247,20 @@ def test_execute_total_file_size_greater_than_ten_gb( args.only_pso_and_pos_permutations = False args.use_patterns = True args.parallel_parsing = False + args.add_has_word_triples = False args.text_index = None args.stxxl_memory = None args.system = "native" args.show = False args.overwrite_existing = False + args.vocabulary_type = "on-disk-compressed" args.index_container = "test_container" args.image = "test_image" args.multi_input_json = False args.ulimit = None + args.encode_as_id = None args.parser_buffer_size = None + args.materialized_views = None # Mock glob, get_total_file_size, get_existing_index_files, # run_command and containerize @@ -264,8 +277,9 @@ def test_execute_total_file_size_greater_than_ten_gb( expected_index_cmd = ( f"ulimit -Sn 500000 && {args.cat_input_files} | {args.index_binary}" f" -i {args.name} -s {args.name}.settings.json" + f" --vocabulary-type {args.vocabulary_type}" f" -F {args.format} -f -" - f" | tee {args.name}.index-log.txt" + f" 2>&1 | tee 
{args.name}.index-log.txt" ) mock_util_run_command.assert_called_once_with( f"{args.index_binary} --help" @@ -344,15 +358,19 @@ def test_execute_successful_indexing_with_extras_and_show( args.multi_input_json = True args.cat_input_files = False args.only_pso_and_pos_permutations = True - args.use_patterns = False + args.use_patterns = "no" + args.add_has_word_triples = False args.text_index = "from_text_records_and_literals" args.stxxl_memory = True args.input_files = "*.nt" args.system = "native" args.settings_json = '{"example": "settings"}' + args.vocabulary_type = "on-disk-compressed" args.show = True args.ulimit = None + args.encode_as_id = None args.parser_buffer_size = None + args.materialized_views = None # Mock get_input_options_for_json mock_input_json.return_value = "test_input_stream" @@ -364,13 +382,14 @@ def test_execute_successful_indexing_with_extras_and_show( expected_index_cmd = ( f"{args.index_binary}" f" -i {args.name} -s {args.name}.settings.json" + f" --vocabulary-type {args.vocabulary_type}" f" {mock_input_json.return_value}" - f" --only-pso-and-pos-permutations --no-patterns" + f" --only-pso-and-pos-permutations" f" --no-patterns -w {args.name}.wordsfile.tsv" f" -d {args.name}.docsfile.tsv" f" --text-words-from-literals" f" --stxxl-memory {args.stxxl_memory}" - f" | tee {args.name}.index-log.txt" + f" 2>&1 | tee {args.name}.index-log.txt" ) settings_json_cmd = ( f"echo {shlex.quote(args.settings_json)} " diff --git a/test/qlever/commands/test_index_other_methods.py b/test/qlever/commands/test_index_other_methods.py index afc2b34c..2a808ad4 100644 --- a/test/qlever/commands/test_index_other_methods.py +++ b/test/qlever/commands/test_index_other_methods.py @@ -33,13 +33,17 @@ def test_relevant_qleverfile_arguments(self): "index": [ "input_files", "cat_input_files", + "encode_as_id", "multi_input_json", "parallel_parsing", "settings_json", + "materialized_views", + "vocabulary_type", "index_binary", "only_pso_and_pos_permutations", "ulimit", 
"use_patterns", + "add_has_word_triples", "text_index", "stxxl_memory", "parser_buffer_size", diff --git a/test/qlever/commands/test_index_stats_methods.py b/test/qlever/commands/test_index_stats_methods.py new file mode 100644 index 00000000..db27405b --- /dev/null +++ b/test/qlever/commands/test_index_stats_methods.py @@ -0,0 +1,266 @@ +import pytest + +from qlever.commands.index_stats import ( + compute_durations, + compute_sizes, + get_size_unit, + get_time_unit, +) + + +@pytest.mark.parametrize("explicit_unit", ["s", "min", "h"]) +@pytest.mark.parametrize("parse_duration", [None, 0, 50, 500, 5000]) +def test_get_time_unit_explicit(explicit_unit, parse_duration): + """Explicit time unit is returned as-is regardless of parse_duration.""" + assert get_time_unit(explicit_unit, parse_duration) == explicit_unit + + +@pytest.mark.parametrize( + "parse_duration, expected_unit", + [ + (None, "h"), + (0, "s"), + (199, "s"), + (200, "min"), + (3599, "min"), + (3600, "h"), + (10000, "h"), + ], +) +def test_get_time_unit_auto(parse_duration, expected_unit): + """Auto mode picks unit based on parse_duration thresholds.""" + assert get_time_unit("auto", parse_duration) == expected_unit + + +@pytest.mark.parametrize("explicit_unit", ["B", "MB", "GB", "TB"]) +@pytest.mark.parametrize("total_size", [0, 500, int(1e7), int(1e13)]) +def test_get_size_unit_explicit(explicit_unit, total_size): + """Explicit size unit is returned as-is regardless of total_size.""" + assert get_size_unit(explicit_unit, total_size) == explicit_unit + + +@pytest.mark.parametrize( + "total_size, expected_unit", + [ + (0, "B"), + (999_999, "B"), + (1_000_000, "MB"), + (999_999_999, "MB"), + (1_000_000_000, "GB"), + (999_999_999_999, "GB"), + (1_000_000_000_000, "TB"), + (5_000_000_000_000, "TB"), + ], +) +def test_get_size_unit_auto(total_size, expected_unit): + """Auto mode picks unit based on total_size thresholds.""" + assert get_size_unit("auto", total_size) == expected_unit + + +def 
test_compute_sizes_text_omitted_when_zero(): + """Text index entry is excluded from result when text index size is zero.""" + raw_sizes = {"index": 500, "vocabulary": 300, "text": 0, "total": 800} + result = compute_sizes(raw_sizes, "B") + assert "Files text.*" not in result + assert list(result.keys()) == [ + "Files index.*", + "Files vocabulary.*", + "TOTAL size", + ] + + +def test_compute_sizes_text_included_when_nonzero(): + """Text index entry is included in result when text index size is nonzero.""" + raw_sizes = { + "index": 500, + "vocabulary": 300, + "text": 200, + "total": 1000, + } + result = compute_sizes(raw_sizes, "B") + assert "Files text.*" in result + assert list(result.keys()) == [ + "Files index.*", + "Files vocabulary.*", + "Files text.*", + "TOTAL size", + ] + + +def test_compute_sizes_all_zero(): + """All sizes zero: auto resolves to 'B', text index is omitted.""" + raw_sizes = {"index": 0, "vocabulary": 0, "text": 0, "total": 0} + result = compute_sizes(raw_sizes, "auto") + assert result["Files index.*"] == (0, "B") + assert result["Files vocabulary.*"] == (0, "B") + assert result["TOTAL size"] == (0, "B") + assert "Files text.*" not in result + + +@pytest.mark.parametrize( + "size_unit, divisor", + [("B", 1), ("MB", 1e6), ("GB", 1e9), ("TB", 1e12)], +) +def test_compute_sizes_conversion(size_unit, divisor): + """Raw byte sizes are correctly divided by the unit factor.""" + raw_sizes = { + "index": 5_000_000_000, + "vocabulary": 1_000_000_000, + "text": 500_000_000, + "total": 6_500_000_000, + } + result = compute_sizes(raw_sizes, size_unit) + assert result["Files index.*"] == (5_000_000_000 / divisor, size_unit) + assert result["Files vocabulary.*"] == (1_000_000_000 / divisor, size_unit) + assert result["Files text.*"] == (500_000_000 / divisor, size_unit) + assert result["TOTAL size"] == (6_500_000_000 / divisor, size_unit) + + +def test_compute_sizes_auto_unit_propagated(): + """Auto-resolved unit is applied consistently to all 
entries.""" + raw_sizes = { + "index": 2_000_000_000, + "vocabulary": 500_000_000, + "text": 100_000_000, + "total": 2_600_000_000, + } + result = compute_sizes(raw_sizes, "auto") + # total is 2.6e9 -> auto resolves to GB + for _, (_, unit) in result.items(): + assert unit == "GB" + + +def log_line(time: str, message: str) -> str: + """Build a timestamped log line matching the real log format.""" + return f"2025-01-15 {time}.000 - INFO: {message}\n" + + +# A complete log with all phases (new format) +COMPLETE_LOG_LINES = [ + log_line("10:00:00", "Processing triples from single input stream"), + log_line("10:01:00", "Merging partial vocabularies ..."), + log_line( + "10:02:00", "Converting triples from local IDs to global IDs ..." + ), + log_line("10:03:00", "Creating permutations SPO and SOP ..."), + log_line("10:05:00", "Creating permutations OSP and OPS ..."), + log_line("10:07:00", "Creating permutations PSO and POS ..."), + log_line("10:09:00", "Index build completed"), +] + + +def test_compute_durations_complete_build(): + """All phases present: every phase has a duration, TOTAL is computed.""" + result = compute_durations(COMPLETE_LOG_LINES, "s", False) + assert result["Parse input"] == (60.0, "s") + assert result["Build vocabularies"] == (60.0, "s") + assert result["Convert to global IDs"] == (60.0, "s") + assert result["Permutation SPO & SOP"] == (120.0, "s") + assert result["Permutation OSP & OPS"] == (120.0, "s") + assert result["Permutation PSO & POS"] == (120.0, "s") + assert result["Text index"] == (None, "s") + assert result["TOTAL time"] == (540.0, "s") + + +def test_compute_durations_empty_lines(): + """Empty input: no 'Processing' line found, returns empty dict.""" + result = compute_durations([], "s", False) + assert result == {} + + +def test_compute_durations_only_processing(): + """Only 'Processing' line, no 'Merging': build still in first phase, + returns empty dict.""" + lines = [ + log_line("10:00:00", "Processing triples from single input 
stream") + ] + result = compute_durations(lines, "s", False) + assert result == {} + + +def test_compute_durations_partial_build(): + """Parse and merge done, but no convert or permutations yet: those + phases have None durations.""" + lines = [ + log_line("10:00:00", "Processing triples from single input stream"), + log_line("10:01:00", "Merging partial vocabularies ..."), + ] + result = compute_durations(lines, "s", False) + assert result["Parse input"] == (60.0, "s") + assert result["Build vocabularies"] == (None, "s") + assert result["Convert to global IDs"] == (None, "s") + assert result["Text index"] == (None, "s") + assert "TOTAL time" not in result + + +def test_compute_durations_with_text_index(): + """Separate text index built after main build: TOTAL includes both.""" + lines = COMPLETE_LOG_LINES + [ + log_line("11:00:00", "Adding text index"), + log_line("11:10:00", "Text index build completed"), + ] + result = compute_durations(lines, "s", False) + assert result["Text index"] == (600.0, "s") + # TOTAL = main build (540s) + text index (600s) + assert result["TOTAL time"] == (540.0 + 600.0, "s") + + +def test_compute_durations_ignore_text_index(): + """ignore_text_index=True: text index duration is None, TOTAL excludes + text time.""" + lines = COMPLETE_LOG_LINES + [ + log_line("11:00:00", "Adding text index"), + log_line("11:10:00", "Text index build completed"), + ] + result = compute_durations(lines, "s", True) + assert result["Text index"] == (None, "s") + assert result["TOTAL time"] == (540.0, "s") + + +def test_compute_durations_old_log_format(): + """Old format uses 'Creating a pair' + 'Writing meta data for ...' + instead of 'Creating permutations ...'.""" + lines = [ + log_line("10:00:00", "Processing triples from single input stream"), + log_line("10:01:00", "Merging partial vocabularies ..."), + log_line( + "10:02:00", "Converting triples from local IDs to global IDs ..." 
+ ), + log_line("10:03:00", "Creating a pair of permutations ..."), + log_line("10:03:30", "Writing meta data for SPO and SOP ..."), + log_line("10:05:00", "Creating a pair of permutations ..."), + log_line("10:05:30", "Writing meta data for OSP and OPS ..."), + log_line("10:07:00", "Index build completed"), + ] + result = compute_durations(lines, "s", False) + assert "Permutation SPO & SOP" in result + assert "Permutation OSP & OPS" in result + assert result["Permutation SPO & SOP"] == (120.0, "s") + assert result["Permutation OSP & OPS"] == (120.0, "s") + + +def test_compute_durations_time_unit_conversion(): + """Explicit time unit 'min': all durations converted from seconds.""" + result = compute_durations(COMPLETE_LOG_LINES, "min", False) + assert result["Parse input"] == (1.0, "min") + assert result["TOTAL time"] == (9.0, "min") + + +def test_compute_durations_auto_time_unit(): + """Auto time unit resolved based on parse phase duration (60s < 200 + -> 's').""" + result = compute_durations(COMPLETE_LOG_LINES, "auto", False) + # Parse phase is 60s which is < 200, so auto resolves to "s" + for _, (_, unit) in result.items(): + assert unit == "s" + + +def test_compute_durations_no_index_build_completed(): + """Missing 'Index build completed' line: last permutation end and + TOTAL are None.""" + lines = COMPLETE_LOG_LINES[:-1] + result = compute_durations(lines, "s", False) + assert result["Permutation SPO & SOP"] == (120.0, "s") + assert result["Permutation OSP & OPS"] == (120.0, "s") + assert result["Permutation PSO & POS"] == (None, "s") + assert "TOTAL time" not in result diff --git a/test/qlever/commands/test_start_execute.py b/test/qlever/commands/test_start_execute.py index fac4357c..c5e23505 100644 --- a/test/qlever/commands/test_start_execute.py +++ b/test/qlever/commands/test_start_execute.py @@ -22,7 +22,7 @@ def test_construct_command_with_if(): args.persist_updates = False args.access_token = True args.only_pso_and_pos_permutations = True - 
args.use_patterns = False + args.use_patterns = "no" args.use_text_index = "yes" # Execute the function @@ -125,11 +125,12 @@ def test_check_binary_success(mock_run_cmd): # Setup args args = MagicMock() args.server_binary = "/test/path/server_binary" + args.system = "native" # mock run_cmd as successful mock_run_cmd.return_value = "Command works" # Execute the function - result = qlever.util.binary_exists(args.server_binary, "server-binary") + result = qlever.util.binary_exists(args.server_binary, "server-binary", args) # check if run_cmd was called once with mock_run_cmd.assert_called_once_with(f"{args.server_binary} --help") assert result @@ -143,12 +144,13 @@ def test_check_binary_exception(mock_log, mock_run_cmd): # Setup args args = MagicMock() args.server_binary = "false_binary" + args.system = "native" # Simulate an exception when run_command is called mock_run_cmd.side_effect = Exception("Mocked command failure") # Execute the function - result = qlever.util.binary_exists(args.server_binary, "server-binary") + result = qlever.util.binary_exists(args.server_binary, "server-binary", args) # check if run_cmd was called once with mock_run_cmd.assert_called_once_with(f"{args.server_binary} --help") @@ -331,7 +333,7 @@ def test_execute_kills_existing_server_on_same_port( args.persist_updates = False args.access_token = True args.only_pso_and_pos_permutations = True - args.use_patterns = False + args.use_patterns = "no" args.use_text_index = "yes" # Mock CacheStatsCommand @@ -397,7 +399,7 @@ def test_execute_fails_due_to_existing_server( args.kill_existing_with_same_port = False args.port = "localhorst" args.port = 1234 - args.cmdline_regex = f"^ServerMain.* -p {args.port}" + args.cmdline_regex = f"^qlever-server.* -p {args.port}" args.no_containers = True args.server_binary = "/test/path/server_binary" args.name = "TestName" @@ -552,9 +554,7 @@ def test_execute_server_with_warmup( ) # Check warmup was called - mock_run.assert_called_once_with( - args.warmup_cmd, 
shell=True, check=True - ) + mock_run.assert_any_call(args.warmup_cmd, shell=True, check=True) # Assertions # Ensure the server status was checked @@ -573,8 +573,10 @@ def test_execute_server_with_warmup( @patch("qlever.commands.start.Containerize.supported_systems") @patch("qlever.commands.start.wrap_command_in_container") @patch("qlever.commands.start.construct_command") + @patch("qlever.commands.start.binary_exists") def test_execute_containerize_and_description( self, + mock_binary_exists, mock_construct_cl, mock_run_containerize, mock_containerize, @@ -623,6 +625,8 @@ def test_execute_containerize_and_description( # Mock Containerize mock_containerize.return_value = ["test1", "test2"] + mock_binary_exists.return_value = True + # Instantiate the StartCommand sc = StartCommand() diff --git a/test/qlever/commands/test_start_other_methods.py b/test/qlever/commands/test_start_other_methods.py index d3c3efa3..2848dbb3 100644 --- a/test/qlever/commands/test_start_other_methods.py +++ b/test/qlever/commands/test_start_other_methods.py @@ -65,7 +65,7 @@ def test_additional_arguments(self): # Test that the help text for # --kill-existing-with-same-port is correctly set - argument_help = subparser._group_actions[-3].help + argument_help = subparser._group_actions[-4].help self.assertEqual( argument_help, "If a QLever server is already running " @@ -77,5 +77,5 @@ def test_additional_arguments(self): self.assertEqual(args.no_warmup, False) # Test that the help text for --no-warmup is correctly set - argument_help = subparser._group_actions[-2].help + argument_help = subparser._group_actions[-3].help self.assertEqual(argument_help, "Do not execute the warmup command") diff --git a/test/qlever/commands/test_status_execute.py b/test/qlever/commands/test_status_execute.py index 7f993c81..56a2edb3 100644 --- a/test/qlever/commands/test_status_execute.py +++ b/test/qlever/commands/test_status_execute.py @@ -9,7 +9,7 @@ def get_mock_args(only_show): args = MagicMock() - 
args.cmdline_regex = "^(ServerMain|IndexBuilderMain)" + args.cmdline_regex = "^(qlever-server|qlever-index)" args.show = only_show return [args, args.cmdline_regex, args.show] diff --git a/test/qlever/commands/test_status_other_methods.py b/test/qlever/commands/test_status_other_methods.py index c1954000..430a53bc 100644 --- a/test/qlever/commands/test_status_other_methods.py +++ b/test/qlever/commands/test_status_other_methods.py @@ -31,7 +31,7 @@ def test_additional_arguments(self): args = parser.parse_args([]) # Test that the default value is set correctly - self.assertEqual(args.cmdline_regex, "^(ServerMain|IndexBuilderMain)") + self.assertEqual(args.cmdline_regex, "^(qlever-server|qlever-index)") # Test that the help text is correctly set argument_help = subparser._group_actions[-1].help diff --git a/test/qlever/commands/test_stop_execute.py b/test/qlever/commands/test_stop_execute.py index 41090172..b037d3fb 100644 --- a/test/qlever/commands/test_stop_execute.py +++ b/test/qlever/commands/test_stop_execute.py @@ -20,7 +20,7 @@ def test_execute_no_matching_processes_or_containers( ): # Setup args args = MagicMock() - args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.cmdline_regex = "qlever-server.* -i [^ ]*%%NAME%%" args.name = "TestName" args.no_containers = True args.server_container = "test_container" @@ -61,7 +61,7 @@ def test_execute_with_matching_process( ): # Setup args args = MagicMock() - args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.cmdline_regex = "qlever-server.* -i [^ ]*%%NAME%%" args.name = "TestName" args.no_containers = True args.server_container = "test_container" @@ -75,7 +75,7 @@ def test_execute_with_matching_process( # to test with real psutil.process objects use this: mock_process.as_dict.return_value = { - "cmdline": ["ServerMain", "-i", "/some/path/TestName"], + "cmdline": ["qlever-server", "-i", "/some/path/TestName"], "pid": 1234, "username": "test_user", } @@ -115,7 +115,7 @@ def test_execute_with_containers( 
): # Setup args args = MagicMock() - args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.cmdline_regex = "qlever-server.* -i [^ ]*%%NAME%%" args.name = "TestName" args.no_containers = False args.server_container = "test_container" @@ -157,7 +157,7 @@ def test_execute_with_no_containers_and_no_matching_process( ): # Setup args args = MagicMock() - args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.cmdline_regex = "qlever-server.* -i [^ ]*%%NAME%%" args.name = "TestName" args.no_containers = False args.server_container = "test_container" @@ -204,7 +204,7 @@ def test_execute_with_error_killing_process( ): # Setup args args = MagicMock() - args.cmdline_regex = "ServerMain.* -i [^ ]*%%NAME%%" + args.cmdline_regex = "qlever-server.* -i [^ ]*%%NAME%%" args.name = "TestName" args.no_containers = True args.server_container = "test_container" @@ -216,7 +216,7 @@ def test_execute_with_error_killing_process( # Creating mock psutil.Process objects with necessary attributes mock_process = MagicMock() mock_process.as_dict.return_value = { - "cmdline": ["ServerMain", "-i", "/some/path/TestName"], + "cmdline": ["qlever-server", "-i", "/some/path/TestName"], "pid": 1234, "create_time": 1234567890, "memory_info": MagicMock(rss=1024 * 1024 * 512), diff --git a/test/qlever/commands/test_stop_other_methods.py b/test/qlever/commands/test_stop_other_methods.py index 7675788b..b69d7b40 100644 --- a/test/qlever/commands/test_stop_other_methods.py +++ b/test/qlever/commands/test_stop_other_methods.py @@ -38,7 +38,7 @@ def test_additional_arguments(self): args = parser.parse_args([]) # Test that the default value for cmdline_regex is set correctly - self.assertEqual(args.cmdline_regex, "ServerMain.* -i [^ ]*%%NAME%%") + self.assertEqual(args.cmdline_regex, "qlever-server.* -i [^ ]*%%NAME%%") # Test that the help text for cmdline_regex is correctly set argument_help = subparser._group_actions[-2].help diff --git a/test/qlever/conftest.py b/test/qlever/conftest.py new file mode 
100644 index 00000000..9f825cb1 --- /dev/null +++ b/test/qlever/conftest.py @@ -0,0 +1,16 @@ +from unittest.mock import MagicMock + +import pytest + + +@pytest.fixture +def mock_command(monkeypatch): + def _mock(module_name: str, function_name: str, override=None): + if override: + monkeypatch.setattr(f"{module_name}.{function_name}", override) + return override + mock = MagicMock(name=f"{function_name}_mock") + monkeypatch.setattr(f"{module_name}.{function_name}", mock) + return mock + + return _mock From 833e06f1434770fbdf20a9339d0d57d04a44280c Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Wed, 1 Apr 2026 13:34:34 +0200 Subject: [PATCH 4/7] Removed extract_queries.py --- src/qoxigraph/commands/extract_queries.py | 1 - 1 file changed, 1 deletion(-) delete mode 120000 src/qoxigraph/commands/extract_queries.py diff --git a/src/qoxigraph/commands/extract_queries.py b/src/qoxigraph/commands/extract_queries.py deleted file mode 120000 index 5667cc52..00000000 --- a/src/qoxigraph/commands/extract_queries.py +++ /dev/null @@ -1 +0,0 @@ -../../qlever/commands/extract_queries.py \ No newline at end of file From 274fff973c726f4376a8de285387e3e891d9a3ac Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Wed, 8 Apr 2026 14:36:53 +0200 Subject: [PATCH 5/7] Add memory monitor to qoxigraph along with separate time log for load and optimization. 
--- src/qlever/commands/index.py | 10 +- src/qlever/memory_monitor.py | 164 ++++++++++++++++++++++++++ src/qoxigraph/commands/index.py | 62 ++++++---- src/qoxigraph/commands/index_stats.py | 47 +++++--- 4 files changed, 246 insertions(+), 37 deletions(-) create mode 100644 src/qlever/memory_monitor.py diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index d47f616f..8b2415d7 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -8,6 +8,7 @@ from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log +from qlever.memory_monitor import MemoryMonitor from qlever.util import ( binary_exists, get_existing_index_files, @@ -322,7 +323,14 @@ def execute(self, args) -> bool: # Run the index command. try: - run_command(index_cmd, show_output=True) + with MemoryMonitor( + engine="qlever", + dataset=args.name, + cmdline_regex=args.index_binary, + container=args.index_container, + system=args.system, + ): + run_command(index_cmd, show_output=True) except Exception as e: log.error(f"Building the index failed: {e}") return False diff --git a/src/qlever/memory_monitor.py b/src/qlever/memory_monitor.py new file mode 100644 index 00000000..60442625 --- /dev/null +++ b/src/qlever/memory_monitor.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import json +import re +import threading +import time +from datetime import datetime +from pathlib import Path + +import psutil + +from qlever.containerize import Containerize +from qlever.log import log +from qlever.util import format_size, run_command + + +def parse_container_mem_usage(usage: str) -> int: + """ + Parse a memory usage string from `docker stats` / `podman stats` + like "4.2GiB", "150MiB", "512KiB" into bytes. 
+ """ + usage = usage.strip() + units = { + "TIB": 1024**4, + "GIB": 1024**3, + "MIB": 1024**2, + "KIB": 1024, + "B": 1, + } + for suffix, multiplier in units.items(): + if usage.upper().endswith(suffix): + number = float(usage[: len(usage) - len(suffix)]) + return int(number * multiplier) + return 0 + + +class MemoryMonitor: + """ + Monitor memory usage of an index-building process. Works in both + native mode (via psutil) and container mode (via docker/podman stats). + + Usage as a context manager: + + with MemoryMonitor(engine="qlever", dataset="wikidata", + cmdline_regex=r"qlever-index"): + run_command(cmd, show_output=True) + + # For container mode: + with MemoryMonitor(engine="qlever", dataset="wikidata", + cmdline_regex=r"qlever-index", + container="qlever.index.wikidata", + system="docker"): + run_command(cmd, show_output=True) + """ + + def __init__( + self, + engine: str, + dataset: str, + cmdline_regex: str, + container: str | None = None, + system: str | None = None, + interval: float = 1.0, + output_dir: Path = Path.cwd(), + ): + self.engine = engine + self.dataset = dataset + self.cmdline_regex = cmdline_regex + self.container = container + self.system = system + self.interval = interval + self.output_dir = Path(output_dir) + self.peak_rss = 0 + self.samples = [] + self.stop_event = threading.Event() + self.thread = None + self.start_time = 0 + + def sample_native(self) -> int: + """ + Find the index process among our children by matching its + command line, then sum RSS of that process and all its + descendants. 
+ """ + me = psutil.Process() + for child in me.children(recursive=True): + try: + cmdline = " ".join(child.cmdline()) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + if re.search(self.cmdline_regex, cmdline): + rss = child.memory_info().rss + for grandchild in child.children(recursive=True): + try: + rss += grandchild.memory_info().rss + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return rss + return 0 + + def sample_container(self) -> int: + """ + Query the container runtime for the memory usage of the + index container. + """ + try: + output = run_command( + f"{self.system} stats --no-stream" + f" --format '{{{{.MemUsage}}}}' {self.container}", + return_output=True, + ) + usage = output.strip().split("/")[0].strip() + return parse_container_mem_usage(usage) + except Exception: + return 0 + + def run_loop(self): + sample = ( + self.sample_container + if self.system in Containerize.supported_systems() + else self.sample_native + ) + while not self.stop_event.is_set(): + rss = sample() + self.peak_rss = max(self.peak_rss, rss) + elapsed = time.monotonic() - self.start_time + self.samples.append((elapsed, rss)) + self.stop_event.wait(self.interval) + + def save(self): + path = ( + self.output_dir / f"{self.engine}.{self.dataset}.memory-log.json" + ) + data = { + "engine": self.engine, + "dataset": self.dataset, + "start_time": datetime.fromtimestamp( + time.time() - (time.monotonic() - self.start_time) + ).isoformat(timespec="seconds"), + "peak_rss_bytes": self.peak_rss, + "peak_rss_human": format_size(self.peak_rss), + "elapsed_s": ( + round(self.samples[-1][0], 1) if self.samples else 0 + ), + "samples": [ + {"elapsed_s": round(t, 1), "rss_bytes": r} + for t, r in self.samples + ], + } + with open(path, "w") as f: + json.dump(data, f, indent=2) + + def __enter__(self): + self.start_time = time.monotonic() + self.thread = threading.Thread(target=self.run_loop, daemon=True) + self.thread.start() + return self + + def 
__exit__(self, exc_type, exc_val, exc_tb): + self.stop_event.set() + self.thread.join() + self.save() + log.info(f"Peak memory usage: {format_size(self.peak_rss)}") + return False diff --git a/src/qoxigraph/commands/index.py b/src/qoxigraph/commands/index.py index 82135914..e3915ec9 100644 --- a/src/qoxigraph/commands/index.py +++ b/src/qoxigraph/commands/index.py @@ -8,6 +8,7 @@ from qlever.command import QleverCommand from qlever.containerize import Containerize from qlever.log import log +from qlever.memory_monitor import MemoryMonitor def wrap_cmd_in_container(args, cmd: str, ulimit: int | None = None) -> str: @@ -142,30 +143,51 @@ def execute(self, args) -> bool: # file. Oxigraph's progress output is unreliable (may not print a # final summary line when loading multiple files), so we measure # the time externally. + # + # The MemoryMonitor wraps both the load and optimize steps so + # that peak RSS is tracked across the entire indexing workflow. log_file_name = f"{args.name}.index-log.txt" - try: - start_time = time.time() - util.run_command(index_cmd, show_output=True, show_stderr=True) - elapsed_s = time.time() - start_time - with open(log_file_name, "a") as f: - f.write(f"Total elapsed time: {elapsed_s:.0f}s\n") - except Exception as e: - log.error(f"Building the index failed: {e}") - return False - - if optimize_cmd: + with MemoryMonitor( + engine="qoxigraph", + dataset=args.name, + cmdline_regex=args.index_binary, + container=args.index_container, + system=args.system, + ): try: - log.info("") - log.info("Optimizing read-only database storage:") - self.show(optimize_cmd) + load_start = time.time() util.run_command( - optimize_cmd, show_output=True, show_stderr=True + index_cmd, show_output=True, show_stderr=True ) + load_s = time.time() - load_start except Exception as e: - log.error(f"Optimizing the database storage failed: {e}") - log.info( - f"Please run manually: " - f"{args.index_binary} optimize -l {args.name}_index/" - ) + log.error(f"Building the 
index failed: {e}") + return False + + optimize_s = 0.0 + if optimize_cmd: + try: + log.info("") + log.info("Optimizing read-only database storage:") + self.show(optimize_cmd) + optimize_start = time.time() + util.run_command( + optimize_cmd, show_output=True, show_stderr=True + ) + optimize_s = time.time() - optimize_start + except Exception as e: + log.error(f"Optimizing the database storage failed: {e}") + log.info( + f"Please run manually: " + f"{args.index_binary} optimize -l {args.name}_index/" + ) + + with open(log_file_name, "a") as f: + f.write(f"Load time: {load_s:.0f}s\n") + if optimize_cmd: + f.write(f"Optimize time: {optimize_s:.0f}s\n") + f.write( + f"Total elapsed time: {load_s + optimize_s:.0f}s\n" + ) return True diff --git a/src/qoxigraph/commands/index_stats.py b/src/qoxigraph/commands/index_stats.py index 672f4a53..5e386bf6 100644 --- a/src/qoxigraph/commands/index_stats.py +++ b/src/qoxigraph/commands/index_stats.py @@ -25,9 +25,9 @@ class IndexStatsCommand(QleverIndexStatsCommand): def execute_time( self, args, log_file_name: str ) -> dict[str, tuple[float | None, str]]: - """Parse total index build time from the index log file.""" + """Parse index build times from the index log file.""" try: - # Read the last few lines of the log file (the total time is + # Read the last few lines of the log file (the times are # always near the end). 
log_text = util.run_command( f"tail {log_file_name}", return_output=True @@ -36,25 +36,40 @@ def execute_time( log.error(f"Problem reading index log file {log_file_name}: {e}") return {} - stats = {} - # Pattern: "Total elapsed time: s" (total time, always last) - total_pattern = re.compile(r"Total elapsed time: ([\d,]+)s$") + patterns = { + "Load time": re.compile(r"Load time: ([\d,]+)s$"), + "Optimize time": re.compile(r"Optimize time: ([\d,]+)s$"), + "TOTAL time": re.compile(r"Total elapsed time: ([\d,]+)s$"), + } + raw_seconds = {} for line in log_text.splitlines(): - match = total_pattern.search(line) - if not match: - continue + for name, pattern in patterns.items(): + match = pattern.search(line) + if match: + try: + raw_seconds[name] = float( + match.group(1).replace(",", "") + ) + except (ValueError, TypeError): + pass + + if not raw_seconds: + return {} - try: - value_s = float(match.group(1).replace(",", "")) - except (ValueError, TypeError): - continue + # Pick a time unit based on the total time. 
+ total_s = raw_seconds.get("TOTAL time") + time_unit = get_time_unit(args.time_unit, total_s) + unit_factor = get_time_unit_factor(time_unit) - time_unit = get_time_unit(args.time_unit, value_s) - unit_factor = get_time_unit_factor(time_unit) + stats = {} + for name in ["Load time", "Optimize time", "TOTAL time"]: + if name in raw_seconds: + stats[name] = (raw_seconds[name] / unit_factor, time_unit) - stats["TOTAL time"] = (value_s / unit_factor, time_unit) - break + # If there was no optimize step, Load and TOTAL are identical + if "Optimize time" not in stats: + stats.pop("Load time", None) return stats From 17e66ca7a0556b706c6504f6c4579262645378df Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Wed, 8 Apr 2026 18:18:44 +0200 Subject: [PATCH 6/7] Take new memory monitor changes --- src/qlever/commands/index.py | 1 - src/qlever/memory_monitor.py | 48 +++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/qlever/commands/index.py b/src/qlever/commands/index.py index 8b2415d7..5f0cb4b5 100644 --- a/src/qlever/commands/index.py +++ b/src/qlever/commands/index.py @@ -324,7 +324,6 @@ def execute(self, args) -> bool: # Run the index command. try: with MemoryMonitor( - engine="qlever", dataset=args.name, cmdline_regex=args.index_binary, container=args.index_container, diff --git a/src/qlever/memory_monitor.py b/src/qlever/memory_monitor.py index 60442625..ea33df2c 100644 --- a/src/qlever/memory_monitor.py +++ b/src/qlever/memory_monitor.py @@ -9,6 +9,7 @@ import psutil +from qlever import engine_name from qlever.containerize import Containerize from qlever.log import log from qlever.util import format_size, run_command @@ -16,15 +17,20 @@ def parse_container_mem_usage(usage: str) -> int: """ - Parse a memory usage string from `docker stats` / `podman stats` - like "4.2GiB", "150MiB", "512KiB" into bytes. + Parse a memory usage string from ``docker stats`` or ``podman stats`` + into bytes. 
Docker reports binary units (GiB, MiB) while Podman + reports decimal units (GB, MB). """ usage = usage.strip() units = { "TIB": 1024**4, + "TB": 1000**4, "GIB": 1024**3, + "GB": 1000**3, "MIB": 1024**2, + "MB": 1000**2, "KIB": 1024, + "KB": 1000, "B": 1, } for suffix, multiplier in units.items(): @@ -41,13 +47,12 @@ class MemoryMonitor: Usage as a context manager: - with MemoryMonitor(engine="qlever", dataset="wikidata", - cmdline_regex=r"qlever-index"): + with MemoryMonitor(dataset="wikidata", cmdline_regex="qlever-index"): run_command(cmd, show_output=True) # For container mode: - with MemoryMonitor(engine="qlever", dataset="wikidata", - cmdline_regex=r"qlever-index", + with MemoryMonitor(dataset="wikidata", + cmdline_regex="qlever-index", container="qlever.index.wikidata", system="docker"): run_command(cmd, show_output=True) @@ -55,7 +60,6 @@ class MemoryMonitor: def __init__( self, - engine: str, dataset: str, cmdline_regex: str, container: str | None = None, @@ -63,7 +67,21 @@ def __init__( interval: float = 1.0, output_dir: Path = Path.cwd(), ): - self.engine = engine + """ + Args: + dataset: Name of the dataset being indexed. + cmdline_regex: Regex matched against child process command + lines to identify the index process (native + mode only). + container: Container name to query for memory stats. + When set together with ``system``, sampling + uses ``docker/podman stats`` instead of + psutil. + system: Container runtime ("docker" or "podman"). + interval: Seconds between samples (default 1.0). + output_dir: Directory for the JSON memory log file. + """ + self.engine = engine_name self.dataset = dataset self.cmdline_regex = cmdline_regex self.container = container @@ -115,6 +133,11 @@ def sample_container(self) -> int: return 0 def run_loop(self): + """ + Polling loop that runs on a background thread. Selects the + appropriate sampling method (native or container) and collects + (elapsed_seconds, rss_bytes) tuples until the stop event is set. 
+ """ sample = ( self.sample_container if self.system in Containerize.supported_systems() @@ -128,8 +151,13 @@ def run_loop(self): self.stop_event.wait(self.interval) def save(self): + """ + Write all collected samples and metadata to a JSON file at + ``{output_dir}/{engine}.{dataset}.memory-log.json``. + """ path = ( - self.output_dir / f"{self.engine}.{self.dataset}.memory-log.json" + self.output_dir + / f"{self.engine.lower()}.{self.dataset.lower()}.memory-log.json" ) data = { "engine": self.engine, @@ -151,12 +179,14 @@ def save(self): json.dump(data, f, indent=2) def __enter__(self): + """Start the background sampling thread.""" self.start_time = time.monotonic() self.thread = threading.Thread(target=self.run_loop, daemon=True) self.thread.start() return self def __exit__(self, exc_type, exc_val, exc_tb): + """Stop sampling, persist results, and log peak memory usage.""" self.stop_event.set() self.thread.join() self.save() From bb59bc9de51a0bd41af69e33a02985aa409e95f9 Mon Sep 17 00:00:00 2001 From: tanmay-9 Date: Wed, 8 Apr 2026 18:26:17 +0200 Subject: [PATCH 7/7] Remove redundant example_queries and change symlinked get-data to a simple import --- src/qoxigraph/commands/example_queries.py | 12 ------------ src/qoxigraph/commands/get_data.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 src/qoxigraph/commands/example_queries.py mode change 120000 => 100644 src/qoxigraph/commands/get_data.py diff --git a/src/qoxigraph/commands/example_queries.py b/src/qoxigraph/commands/example_queries.py deleted file mode 100644 index 4ef76c24..00000000 --- a/src/qoxigraph/commands/example_queries.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from qlever.commands.example_queries import ( - ExampleQueriesCommand as QleverExampleQueriesCommand, -) - - -class ExampleQueriesCommand(QleverExampleQueriesCommand): - def execute(self, args) -> bool: - if not args.sparql_endpoint: - args.sparql_endpoint = f"{args.host_name}:{args.port}/query" - return 
super().execute(args) diff --git a/src/qoxigraph/commands/get_data.py b/src/qoxigraph/commands/get_data.py deleted file mode 120000 index 4900dbb8..00000000 --- a/src/qoxigraph/commands/get_data.py +++ /dev/null @@ -1 +0,0 @@ -../../qlever/commands/get_data.py \ No newline at end of file diff --git a/src/qoxigraph/commands/get_data.py b/src/qoxigraph/commands/get_data.py new file mode 100644 index 00000000..29bba0e2 --- /dev/null +++ b/src/qoxigraph/commands/get_data.py @@ -0,0 +1 @@ +from qlever.commands.get_data import GetDataCommand # noqa