From bed833a610b31a529ed4b3475505fe9634145411 Mon Sep 17 00:00:00 2001 From: Julian Hill Date: Fri, 29 May 2026 14:32:48 +0200 Subject: [PATCH 1/2] Improve Yara disk image scanning and logging --- workers/openrelik-worker-yara/Dockerfile | 2 + workers/openrelik-worker-yara/pyproject.toml | 2 +- workers/openrelik-worker-yara/src/tasks.py | 122 +++++++++-- .../openrelik-worker-yara/tests/test_tasks.py | 203 +++++++++++++++++- 4 files changed, 310 insertions(+), 19 deletions(-) diff --git a/workers/openrelik-worker-yara/Dockerfile b/workers/openrelik-worker-yara/Dockerfile index 7464a30..f19d425 100644 --- a/workers/openrelik-worker-yara/Dockerfile +++ b/workers/openrelik-worker-yara/Dockerfile @@ -13,6 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ fdisk \ qemu-utils \ ntfs-3g \ + ewf-tools \ + btrfs-progs \ && rm -rf /var/lib/apt/lists/* # Configure debugging diff --git a/workers/openrelik-worker-yara/pyproject.toml b/workers/openrelik-worker-yara/pyproject.toml index fb2d733..8729491 100644 --- a/workers/openrelik-worker-yara/pyproject.toml +++ b/workers/openrelik-worker-yara/pyproject.toml @@ -10,7 +10,7 @@ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ "celery[redis]>=5.4.0,<6", - "openrelik-worker-common>=0.17.1,<1.0.0", + "openrelik-worker-common>=0.17.3,<1.0.0", "openrelik-common>=0.7.0,<1.0.0", ] diff --git a/workers/openrelik-worker-yara/src/tasks.py b/workers/openrelik-worker-yara/src/tasks.py index 22f6a06..3f1e2d6 100644 --- a/workers/openrelik-worker-yara/src/tasks.py +++ b/workers/openrelik-worker-yara/src/tasks.py @@ -98,6 +98,33 @@ class YaraMatch: score: int +def validate_scan_target(path: str, display_name: str = "input") -> None: + """Reject scan targets that would silently scan the worker container.""" + if not path: + raise RuntimeError("Input file path is empty.") + if os.path.abspath(path) == os.path.sep: + raise RuntimeError( + "Refusing to scan filesystem root as Yara input. " + "Select a file or folder input instead." + ) + if not os.path.exists(path): + raise RuntimeError(f"Input path does not exist: {path}") + if not os.path.isfile(path) and not os.path.isdir(path): + raise RuntimeError( + f"Unsupported Yara scan target: {display_name}. " + "The Yara worker can scan regular files and directories. " + "Disk images must be mounted before scanning." + ) + + +def send_progress(task, status: str, progress: str | None = None) -> None: + """Send task progress text for the OpenRelik UI.""" + data = {"status": status} + if progress: + data["progress"] = progress + task.send_event("task-progress", data=data) + + def cleanup_fraken_output_log(logfile: OutputFile) -> None: """Cleanup fraken-x output to be one entry per line. @@ -195,6 +222,7 @@ def command( telemetry.add_attribute_to_current_span("workflow_id", workflow_id) output_files = [] + task_files = [] all_patterns = "" global_yara = task_config.get("Global Yara rules", "") @@ -253,9 +281,17 @@ def command( fraken_output = create_output_file( output_path, display_name="fraken_out.jsonl", data_type="yara:yara-scan:jsonl" ) + fraken_stderr = create_output_file( + output_path, + display_name="fraken_stderr.log", + data_type="yara:yara-scan:log", + ) output_files.append(fraken_output.to_dict()) + task_files.append(fraken_stderr.to_dict()) - input_files = get_input_files(pipe_result, input_files) + input_files = get_input_files(pipe_result, input_files or []) + if not input_files: + raise RuntimeError("No input files were provided to Yara scan.") input_files_map = {} for input_file in input_files: @@ -275,37 +311,88 @@ def command( continue input_file_path = input_file.get("path") + display_name = input_file.get("display_name", input_file_path) + validate_scan_target(input_file_path, display_name) + # Check if disk image, mount and add mountpoints to scan - if mount_disk_images and is_disk_image(input_file): - bd = BlockDevice(input_file_path, min_partition_size=1) - bd.setup() - mountpoints = bd.mount() - disks_mounted.append(bd) + if is_disk_image(input_file): + if not mount_disk_images: + raise RuntimeError( + "Disk image input is not supported in regular scan mode: " + f"{display_name}. Enable mount_disk_images to scan files " + "inside supported disk image filesystems." + ) + + try: + send_progress(self, "Mounting disk image", display_name) + bd = BlockDevice(input_file_path, min_partition_size=1) + bd.setup() + disks_mounted.append(bd) + mountpoints = bd.mount() + send_progress( + self, + "Mounted disk image", + f"{display_name}: {len(mountpoints)} mountpoint(s)", + ) + except RuntimeError as e: + logger.error( + "Error mounting disk image %s (%s): %s", + display_name, + input_file_path, + str(e), + ) + raise RuntimeError( + "Disk image input is not supported or could not be " + f"mounted by the Yara worker: {display_name}." + ) from None if not mountpoints: - logger.info( - "No mountpoints returned for input file %s", - input_file.get("display_name"), + raise RuntimeError( + "No mountpoints returned for input file " + f"{input_file.get('display_name')}" ) for mountpoint in mountpoints: + validate_scan_target(mountpoint, display_name) folders_and_files.append("--folder") folders_and_files.append(mountpoint) else: folders_and_files.append("--folder") folders_and_files.append(input_file_path) + if not folders_and_files: + raise RuntimeError("No scan targets were produced from input files.") + cmd = ["fraken"] + folders_and_files + [f"{all_yara.path}"] + logger.info( + "fraken-x scan targets: %s", + folders_and_files[1::2], + ) logger.debug(f"fraken-x command: {cmd}") - with open(fraken_output.path, "w+", encoding="utf-8") as log: - self.send_event("task-progress") - process = subprocess.Popen(cmd, stdout=log, stderr=subprocess.PIPE) - process.wait() - if process.stderr: - # Note: fraken-x uses the eprintln! Rust macro to print progress, - # this outputs to stderr.... - logger.info(f"fraken-x: {process.stderr.readlines()}") + with ( + open(fraken_output.path, "w+", encoding="utf-8") as log, + open(fraken_stderr.path, "w+", encoding="utf-8") as stderr_log, + ): + send_progress(self, "Running Yara scan") + process = subprocess.run( + cmd, + stdout=log, + stderr=stderr_log, + check=False, + text=True, + ) + + if os.path.getsize(fraken_stderr.path) > 0: + logger.info(f"fraken-x stderr written to {fraken_stderr.path}") + + if process.returncode != 0: + raise RuntimeError( + "An error occurred while running fraken-x. " + f"Exit code: {process.returncode}. " + f"See stderr log for details: {fraken_stderr.path}" + ) except RuntimeError as e: logger.error("Error encountered: %s", str(e)) + raise finally: for blockdevice in disks_mounted: if blockdevice: @@ -347,6 +434,7 @@ def command( return create_task_result( output_files=output_files, + task_files=task_files, workflow_id=workflow_id, command="fraken", task_report=report.to_dict(), diff --git a/workers/openrelik-worker-yara/tests/test_tasks.py b/workers/openrelik-worker-yara/tests/test_tasks.py index 0d7fa1e..ed18333 100644 --- a/workers/openrelik-worker-yara/tests/test_tasks.py +++ b/workers/openrelik-worker-yara/tests/test_tasks.py @@ -1,7 +1,9 @@ +import base64 import json import os import pytest import shutil +from types import SimpleNamespace from unittest.mock import MagicMock, patch, mock_open from src.tasks import cleanup_fraken_output_log, command @@ -114,7 +116,7 @@ def test_command_empty_rules_collected(): # Mock os.path.isfile and os.path.isdir to return False for everything with patch("os.path.isfile", return_value=False), patch( "os.path.isdir", return_value=False - ): + ), patch.object(command, "send_event"): task_config = {"Global Yara rules": "/non/existent/path"} with pytest.raises(ValueError, match="No Yara rules were collected"): @@ -124,3 +126,202 @@ def test_command_empty_rules_collected(): input_files=[], output_path="/tmp", ) + + +def test_command_fails_without_input_files(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + + with patch.dict(os.environ, {}, clear=True), patch.object(command, "send_event"): + with pytest.raises(RuntimeError, match="No input files"): + command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[], + output_path=str(tmp_path), + ) + + +def test_command_refuses_root_input_path(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + + with patch.dict(os.environ, {}, clear=True), patch.object(command, "send_event"): + with pytest.raises(RuntimeError, match="Refusing to scan filesystem root"): + command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[{"path": "/", "display_name": "root"}], + output_path=str(tmp_path), + ) + + +def _mock_output_file(path): + output_file = MagicMock() + output_file.path = str(path) + output_file.to_dict.return_value = {"path": str(path)} + return output_file + + +def test_command_writes_fraken_stderr_to_log_file(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + input_file = tmp_path / "input.txt" + input_file.write_text("test") + + all_yara = _mock_output_file(tmp_path / "all.yara") + fraken_output = _mock_output_file(tmp_path / "fraken_out.jsonl") + fraken_stderr = _mock_output_file(tmp_path / "fraken_stderr.log") + report_file = _mock_output_file(tmp_path / "yara-scan-report.md") + + with patch.dict(os.environ, {}, clear=True), patch( + "src.tasks.create_output_file", + side_effect=[all_yara, fraken_output, fraken_stderr, report_file], + ), patch( + "src.tasks.subprocess.run", + return_value=SimpleNamespace(returncode=0), + ) as mock_run, patch.object(command, "send_event") as mock_send_event: + result = command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[ + {"path": str(input_file), "display_name": input_file.name}, + ], + output_path=str(tmp_path), + ) + + _, kwargs = mock_run.call_args + assert kwargs["stderr"].name == str(fraken_stderr.path) + assert kwargs["stdout"].name == str(fraken_output.path) + assert kwargs["stderr"].name != kwargs["stdout"].name + decoded_result = json.loads(base64.b64decode(result).decode("utf-8")) + assert decoded_result["output_files"] == [ + fraken_output.to_dict.return_value, + report_file.to_dict.return_value, + ] + assert decoded_result["task_files"] == [fraken_stderr.to_dict.return_value] + statuses = [ + call.kwargs["data"]["status"] + for call in mock_send_event.call_args_list + if "data" in call.kwargs + ] + assert statuses == ["Running Yara scan"] + + +def test_command_reports_fraken_failure_without_stderr_tail(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + input_file = tmp_path / "input.txt" + input_file.write_text("test") + + all_yara = _mock_output_file(tmp_path / "all.yara") + fraken_output = _mock_output_file(tmp_path / "fraken_out.jsonl") + fraken_stderr = _mock_output_file(tmp_path / "fraken_stderr.log") + + def _run_with_stderr(*_, **kwargs): + kwargs["stderr"].write("fraken exploded\n") + return SimpleNamespace(returncode=2) + + with patch.dict(os.environ, {}, clear=True), patch( + "src.tasks.create_output_file", + side_effect=[all_yara, fraken_output, fraken_stderr], + ), patch("src.tasks.subprocess.run", side_effect=_run_with_stderr), patch.object( + command, "send_event" + ): + with pytest.raises(RuntimeError) as e: + command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[ + {"path": str(input_file), "display_name": input_file.name}, + ], + output_path=str(tmp_path), + ) + + assert "An error occurred while running fraken-x" in str(e.value) + assert str(fraken_stderr.path) in str(e.value) + assert "fraken exploded" not in str(e.value) + with open(fraken_stderr.path, encoding="utf-8") as fh: + assert fh.read() == "fraken exploded\n" + + +def test_command_rejects_disk_image_without_mount_disk_images(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + input_file = tmp_path / "disk.E01" + input_file.write_text("disk") + + all_yara = _mock_output_file(tmp_path / "all.yara") + fraken_output = _mock_output_file(tmp_path / "fraken_out.jsonl") + fraken_stderr = _mock_output_file(tmp_path / "fraken_stderr.log") + + with patch.dict(os.environ, {}, clear=True), patch( + "src.tasks.create_output_file", + side_effect=[all_yara, fraken_output, fraken_stderr], + ), patch("src.tasks.is_disk_image", return_value=True), patch( + "src.tasks.subprocess.run" + ) as mock_run, patch.object(command, "send_event"): + with pytest.raises(RuntimeError, match="not supported in regular scan mode"): + command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[ + {"path": str(input_file), "display_name": input_file.name}, + ], + output_path=str(tmp_path), + ) + + mock_run.assert_not_called() + + +def test_command_mounts_disk_image_before_scanning(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + input_file = tmp_path / "disk.img" + input_file.write_text("disk") + mountpoint = tmp_path / "mounted-partition" + mountpoint.mkdir() + + all_yara = _mock_output_file(tmp_path / "all.yara") + fraken_output = _mock_output_file(tmp_path / "fraken_out.jsonl") + fraken_stderr = _mock_output_file(tmp_path / "fraken_stderr.log") + report_file = _mock_output_file(tmp_path / "yara-scan-report.md") + block_device = MagicMock() + block_device.mount.return_value = [str(mountpoint)] + + with patch.dict(os.environ, {}, clear=True), patch( + "src.tasks.create_output_file", + side_effect=[all_yara, fraken_output, fraken_stderr, report_file], + ), patch("src.tasks.is_disk_image", return_value=True), patch( + "src.tasks.BlockDevice", return_value=block_device + ) as mock_block_device, patch( + "src.tasks.subprocess.run", + return_value=SimpleNamespace(returncode=0), + ) as mock_run, patch.object( + command, "send_event" + ): + command.run( + None, + task_config={ + "Global Yara rules": str(rule_file), + "mount_disk_images": True, + }, + input_files=[ + { + "path": str(input_file), + "display_name": input_file.name, + }, + ], + output_path=str(tmp_path), + ) + + mock_block_device.assert_called_once_with(str(input_file), min_partition_size=1) + block_device.setup.assert_called_once_with() + block_device.mount.assert_called_once_with() + block_device.umount.assert_called_once_with() + assert mock_run.call_args.args[0] == [ + "fraken", + "--folder", + str(mountpoint), + str(all_yara.path), + ] From 3edf54e93f238412a253b1bdff267cd724a340aa Mon Sep 17 00:00:00 2001 From: Julian Hill Date: Mon, 1 Jun 2026 14:49:20 +0200 Subject: [PATCH 2/2] Better handle skipped Yara inputs without failing completely with mixed input --- workers/openrelik-worker-yara/src/tasks.py | 115 +++++++++++++++--- .../openrelik-worker-yara/tests/test_tasks.py | 57 +++++++++ 2 files changed, 155 insertions(+), 17 deletions(-) diff --git a/workers/openrelik-worker-yara/src/tasks.py b/workers/openrelik-worker-yara/src/tasks.py index 3f1e2d6..8de2642 100644 --- a/workers/openrelik-worker-yara/src/tasks.py +++ b/workers/openrelik-worker-yara/src/tasks.py @@ -98,6 +98,14 @@ class YaraMatch: score: int +@dataclass +class SkippedInput: + """Dataclass to store skipped Yara input information.""" + + input_name: str + reason: str + + def validate_scan_target(path: str, display_name: str = "input") -> None: """Reject scan targets that would silently scan the worker container.""" if not path: @@ -109,7 +117,7 @@ def validate_scan_target(path: str, display_name: str = "input") -> None: ) if not os.path.exists(path): raise RuntimeError(f"Input path does not exist: {path}") - if not os.path.isfile(path) and not os.path.isdir(path): + if not (os.path.isfile(path) or os.path.isdir(path)): raise RuntimeError( f"Unsupported Yara scan target: {display_name}. " "The Yara worker can scan regular files and directories. " @@ -160,16 +168,24 @@ def cleanup_fraken_output_log(logfile: OutputFile) -> None: json.dump(extracted_dicts, f) -def generate_report_from_matches(matches: list[YaraMatch]) -> Report: +def generate_report_from_matches( + matches: list[YaraMatch], skipped_inputs: list[SkippedInput] | None = None +) -> Report: """Generate a report from Yara matches. Args: matches: List of YaraMatch objects. + skipped_inputs: List of skipped input files and the reason they were skipped. Returns: Report object. """ + skipped_inputs = skipped_inputs or [] report = Report("Yara scan report") + report.summary = f"{len(matches)} Yara match(es) found." + if skipped_inputs: + report.summary += f" {len(skipped_inputs)} input(s) skipped." + matches_section = report.add_section() matches_section.add_paragraph("List of Yara matches found in the scanned files.") if matches: @@ -189,6 +205,17 @@ def generate_report_from_matches(matches: list[YaraMatch]) -> Report: matches_section.add_table(match_table) + if skipped_inputs: + skipped_section = report.add_section() + skipped_section.add_header("Skipped inputs") + skipped_section.add_paragraph( + "Inputs that were not scanned because they were not valid scan targets." + ) + skipped_table = MarkdownTable(["input", "reason"]) + for skipped_input in skipped_inputs: + skipped_table.add_row([skipped_input.input_name, skipped_input.reason]) + skipped_section.add_table(skipped_table) + return report @@ -302,9 +329,16 @@ def command( disks_mounted = [] try: folders_and_files = [] + skipped_inputs = [] bd = None for input_file in input_files: if "path" not in input_file: + skipped_inputs.append( + SkippedInput( + input_file.get("display_name", "UNKNOWN FILE"), + "Input does not have a path.", + ) + ) logger.warning( "Skipping file %s as it does not have an path", input_file ) @@ -312,16 +346,33 @@ def command( input_file_path = input_file.get("path") display_name = input_file.get("display_name", input_file_path) - validate_scan_target(input_file_path, display_name) + try: + validate_scan_target(input_file_path, display_name) + except RuntimeError as e: + skipped_inputs.append(SkippedInput(display_name, str(e))) + logger.error( + "Skipping Yara scan input %s (%s): %s", + display_name, + input_file_path, + str(e), + ) + continue # Check if disk image, mount and add mountpoints to scan if is_disk_image(input_file): if not mount_disk_images: - raise RuntimeError( - "Disk image input is not supported in regular scan mode: " - f"{display_name}. Enable mount_disk_images to scan files " - "inside supported disk image filesystems." + reason = ( + "Disk image input is not supported in regular scan mode. " + "Enable mount_disk_images to scan files inside supported " + "disk image filesystems." ) + skipped_inputs.append(SkippedInput(display_name, reason)) + logger.error( + "%s: %s", + display_name, + reason, + ) + continue try: send_progress(self, "Mounting disk image", display_name) @@ -341,18 +392,41 @@ def command( input_file_path, str(e), ) - raise RuntimeError( - "Disk image input is not supported or could not be " - f"mounted by the Yara worker: {display_name}." - ) from None + skipped_inputs.append( + SkippedInput( + display_name, + "Disk image input is not supported or could not be " + "mounted by the Yara worker.", + ) + ) + continue if not mountpoints: - raise RuntimeError( - "No mountpoints returned for input file " - f"{input_file.get('display_name')}" + skipped_inputs.append( + SkippedInput( + input_file.get("display_name", input_file_path), + "No mountpoints returned for input file.", + ) + ) + logger.error( + "No mountpoints returned for input file %s", + input_file.get("display_name"), ) + continue for mountpoint in mountpoints: - validate_scan_target(mountpoint, display_name) + try: + validate_scan_target(mountpoint, display_name) + except RuntimeError as e: + skipped_inputs.append( + SkippedInput(f"{display_name}: {mountpoint}", str(e)) + ) + logger.error( + "Skipping Yara scan mountpoint %s for %s: %s", + mountpoint, + display_name, + str(e), + ) + continue folders_and_files.append("--folder") folders_and_files.append(mountpoint) else: @@ -360,7 +434,14 @@ def command( folders_and_files.append(input_file_path) if not folders_and_files: - raise RuntimeError("No scan targets were produced from input files.") + message = "No scan targets were produced from input files." + if skipped_inputs: + last_skipped_input = skipped_inputs[-1] + message = ( + f"{message} Last skipped input error: " + f"{last_skipped_input.input_name}: {last_skipped_input.reason}" + ) + raise RuntimeError(message) cmd = ["fraken"] + folders_and_files + [f"{all_yara.path}"] logger.info( @@ -421,7 +502,7 @@ def command( cleanup_fraken_output_log(fraken_output) - report = generate_report_from_matches(all_matches) + report = generate_report_from_matches(all_matches, skipped_inputs) report_file = create_output_file( output_path, display_name="yara-scan-report.md", diff --git a/workers/openrelik-worker-yara/tests/test_tasks.py b/workers/openrelik-worker-yara/tests/test_tasks.py index ed18333..7abc2cf 100644 --- a/workers/openrelik-worker-yara/tests/test_tasks.py +++ b/workers/openrelik-worker-yara/tests/test_tasks.py @@ -274,6 +274,63 @@ def test_command_rejects_disk_image_without_mount_disk_images(tmp_path): mock_run.assert_not_called() +def test_command_skips_disk_image_without_mount_and_scans_other_inputs(tmp_path): + rule_file = tmp_path / "rule.yara" + rule_file.write_text('rule test { strings: $ = "test" condition: true }') + disk_file = tmp_path / "disk.E01" + disk_file.write_text("disk") + regular_file = tmp_path / "input.txt" + regular_file.write_text("test") + + all_yara = _mock_output_file(tmp_path / "all.yara") + fraken_output = _mock_output_file(tmp_path / "fraken_out.jsonl") + fraken_stderr = _mock_output_file(tmp_path / "fraken_stderr.log") + report_file = _mock_output_file(tmp_path / "yara-scan-report.md") + + def _is_disk_image(input_file): + return input_file["path"] == str(disk_file) + + with patch.dict(os.environ, {}, clear=True), patch( + "src.tasks.create_output_file", + side_effect=[all_yara, fraken_output, fraken_stderr, report_file], + ), patch("src.tasks.is_disk_image", side_effect=_is_disk_image), patch( + "src.tasks.subprocess.run", + return_value=SimpleNamespace(returncode=0), + ) as mock_run, patch("src.tasks.logger") as mock_logger, patch.object( + command, "send_event" + ): + result = command.run( + None, + task_config={"Global Yara rules": str(rule_file)}, + input_files=[ + {"path": str(disk_file), "display_name": disk_file.name}, + {"path": str(regular_file), "display_name": regular_file.name}, + ], + output_path=str(tmp_path), + ) + + assert any( + "Disk image input is not supported in regular scan mode" in str(call) + for call in mock_logger.error.call_args_list + ) + assert mock_run.call_args.args[0] == [ + "fraken", + "--folder", + str(regular_file), + str(all_yara.path), + ] + with open(report_file.path, encoding="utf-8") as fh: + report_content = fh.read() + assert "Skipped inputs" in report_content + assert disk_file.name in report_content + assert "Disk image input is not supported in regular scan mode" in report_content + + decoded_result = json.loads(base64.b64decode(result).decode("utf-8")) + task_report = decoded_result["task_report"] + assert "1 input(s) skipped" in task_report["summary"] + assert "Skipped inputs" in task_report["content"] + + def test_command_mounts_disk_image_before_scanning(tmp_path): rule_file = tmp_path / "rule.yara" rule_file.write_text('rule test { strings: $ = "test" condition: true }')