From bea7e143bb2b8e06242229e074945bcb0d7fe195 Mon Sep 17 00:00:00 2001 From: Benjamin Gwynn Date: Wed, 9 Jul 2025 17:35:36 +0100 Subject: [PATCH 1/4] Replace bad UTF-8 bytes when parsing git blame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles bad UTF-8 bytes by replacing them with the � character, fixing output parsing for files with invalid byte sequences. This fixes https://github.com/microsoft/sarif-tools/issues/97 --- sarif/operations/blame_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sarif/operations/blame_op.py b/sarif/operations/blame_op.py index f90f976..6b0d828 100644 --- a/sarif/operations/blame_op.py +++ b/sarif/operations/blame_op.py @@ -148,7 +148,7 @@ def _run_git_blame_on_files( for line_bytes in git_blame_output: # Convert byte sequence to string and remove trailing LF - line_string = line_bytes.decode("utf-8")[:-1] + line_string = line_bytes.decode("utf-8", errors="replace")[:-1] # Now parse output from git blame --porcelain if commit_hash: if line_string.startswith("\t"): From 26f04b46c4b3121ebe35f703ec0a2b873f7ad399 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Wed, 16 Jul 2025 14:22:44 -0700 Subject: [PATCH 2/4] Unit test --- tests/ops/blame/test_blame.py | 89 +++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/tests/ops/blame/test_blame.py b/tests/ops/blame/test_blame.py index d301549..4cf03bb 100644 --- a/tests/ops/blame/test_blame.py +++ b/tests/ops/blame/test_blame.py @@ -4,7 +4,7 @@ import jsonschema import os import tempfile -from typing import List +from typing import Callable, List from sarif.operations import blame_op from sarif import sarif_file @@ -110,7 +110,7 @@ def test_blame_no_blame_info(): assert not os.path.isfile(output_file_path) -def test_blame_success(): +def blame_test(run_git_blame: Callable[[str, str], List[bytes]], expected_blame_properties: dict[str, dict[str, str]]): input_sarif_file = sarif_file.SarifFile( "SARIF_FILE", SARIF_FILE, mtime=datetime.datetime.now() ) @@ -122,17 +122,17 @@ def test_blame_success(): os.makedirs(repo_path) output_file_path = os.path.join(tmp, "blamed.json") - def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: + def run_git_blame_wrapper(blame_repo_path: str, blame_file_path: str) -> List[bytes]: assert blame_repo_path == repo_path assert blame_file_path == ERROR_FILE_ABSOLUTE_PATH - return [x.encode() for x in GIT_BLAME_OUTPUT] + return run_git_blame(blame_repo_path, blame_file_path) blame_op.enhance_with_blame( input_sarif_file_set, repo_path, output_file_path, output_multiple_files=False, - run_git_blame=run_git_blame, + run_git_blame=run_git_blame_wrapper, ) with open(output_file_path, "rb") as f_out: @@ -140,19 +140,68 @@ def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: jsonschema.validate(output_sarif, schema=get_sarif_schema()) expected_sarif = deepcopy(input_sarif_file.data) - blame_properties = { - "blame": { - "author": "Taylor Developer", - "author-mail": "", - "author-time": "1699272533", - "author-tz": "+0000", - "committer": "GitHub", - "committer-mail": "", - "committer-time": "1699272533", - "committer-tz": "+0000", - "summary": "Commit message 1", - "filename": ERROR_FILE_RELATIVE_PATH, - } - } - expected_sarif["runs"][0]["results"][0]["properties"] = blame_properties + expected_sarif["runs"][0]["results"][0]["properties"] = expected_blame_properties assert output_sarif == expected_sarif + + +def test_blame_success(): + def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: + return [x.encode() for x in GIT_BLAME_OUTPUT] + + expected_blame_properties = { + "blame": { + "author": "Taylor Developer", + "author-mail": "", + "author-time": "1699272533", + "author-tz": "+0000", + "committer": "GitHub", + "committer-mail": "", + "committer-time": "1699272533", + "committer-tz": "+0000", + "summary": "Commit message 1", + "filename": ERROR_FILE_RELATIVE_PATH, + } + } + + blame_test(run_git_blame, expected_blame_properties) + + +GIT_BLAME_OUTPUT_WITH_INVALID_UTF8 = [ + b"f9db03438aba52affc5c3fcdb619afa620ad603a 1 1 7\n", + b"author Taylor Developer\n", + b"author-mail \n", + b"author-time 1699272533\n", + b"author-tz +0000\n", + b"committer GitHub\n", + b"committer-mail \n", + b"committer-time 1699272533\n", + b"committer-tz +0000\n", + b"summary Commit message \x80\n", + b"filename " + ERROR_FILE_RELATIVE_PATH.encode() + b"\n", + b"\tFile text line 1\n", + b"f9db03438aba52affc5c3fcdb619afa620ad603a 2 2\n", + b"\tFile text line 2\n", + b"f9db03438aba52affc5c3fcdb619afa620ad603a 3 3\n", + b"\tFile text line 3\n", + b"eec0471db074a037d820abdda1f210f8a8c987ca 4 4 1\n"] + +def test_blame_invalid_utf8(): + def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: + return GIT_BLAME_OUTPUT_WITH_INVALID_UTF8 + + expected_blame_properties = { + "blame": { + "author": "Taylor Developer", + "author-mail": "", + "author-time": "1699272533", + "author-tz": "+0000", + "committer": "GitHub", + "committer-mail": "", + "committer-time": "1699272533", + "committer-tz": "+0000", + "summary": "Commit message �", + "filename": ERROR_FILE_RELATIVE_PATH, + } + } + + blame_test(run_git_blame, expected_blame_properties) \ No newline at end of file From a91cdea3042850de7cf475e682fa4e3fec4e3517 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Wed, 16 Jul 2025 14:25:56 -0700 Subject: [PATCH 3/4] Ruff hygiene --- tests/ops/blame/test_blame.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/ops/blame/test_blame.py b/tests/ops/blame/test_blame.py index 4cf03bb..57644b5 100644 --- a/tests/ops/blame/test_blame.py +++ b/tests/ops/blame/test_blame.py @@ -110,7 +110,10 @@ def test_blame_no_blame_info(): assert not os.path.isfile(output_file_path) -def blame_test(run_git_blame: Callable[[str, str], List[bytes]], expected_blame_properties: dict[str, dict[str, str]]): +def blame_test( + run_git_blame: Callable[[str, str], List[bytes]], + expected_blame_properties: dict[str, dict[str, str]], +): input_sarif_file = sarif_file.SarifFile( "SARIF_FILE", SARIF_FILE, mtime=datetime.datetime.now() ) @@ -122,7 +125,9 @@ def blame_test(run_git_blame: Callable[[str, str], List[bytes]], expected_blame_ os.makedirs(repo_path) output_file_path = os.path.join(tmp, "blamed.json") - def run_git_blame_wrapper(blame_repo_path: str, blame_file_path: str) -> List[bytes]: + def run_git_blame_wrapper( + blame_repo_path: str, blame_file_path: str + ) -> List[bytes]: assert blame_repo_path == repo_path assert blame_file_path == ERROR_FILE_ABSOLUTE_PATH return run_git_blame(blame_repo_path, blame_file_path) @@ -140,7 +145,9 @@ def run_git_blame_wrapper(blame_repo_path: str, blame_file_path: str) -> List[by jsonschema.validate(output_sarif, schema=get_sarif_schema()) expected_sarif = deepcopy(input_sarif_file.data) - expected_sarif["runs"][0]["results"][0]["properties"] = expected_blame_properties + expected_sarif["runs"][0]["results"][0]["properties"] = ( + expected_blame_properties + ) assert output_sarif == expected_sarif @@ -183,7 +190,9 @@ def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: b"\tFile text line 2\n", b"f9db03438aba52affc5c3fcdb619afa620ad603a 3 3\n", b"\tFile text line 3\n", - b"eec0471db074a037d820abdda1f210f8a8c987ca 4 4 1\n"] + b"eec0471db074a037d820abdda1f210f8a8c987ca 4 4 1\n", +] + def test_blame_invalid_utf8(): def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: @@ -204,4 +213,4 @@ def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]: } } - blame_test(run_git_blame, expected_blame_properties) \ No newline at end of file + blame_test(run_git_blame, expected_blame_properties) From afdc9dbea3c574e918bff520b60fb283391cd906 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Wed, 16 Jul 2025 14:31:37 -0700 Subject: [PATCH 4/4] Fix dict usage on 3.8 --- tests/ops/blame/test_blame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ops/blame/test_blame.py b/tests/ops/blame/test_blame.py index 57644b5..cf93c29 100644 --- a/tests/ops/blame/test_blame.py +++ b/tests/ops/blame/test_blame.py @@ -4,7 +4,7 @@ import jsonschema import os import tempfile -from typing import Callable, List +from typing import Callable, Dict, List from sarif.operations import blame_op from sarif import sarif_file @@ -112,7 +112,7 @@ def test_blame_no_blame_info(): def blame_test( run_git_blame: Callable[[str, str], List[bytes]], - expected_blame_properties: dict[str, dict[str, str]], + expected_blame_properties: Dict[str, Dict[str, str]], ): input_sarif_file = sarif_file.SarifFile( "SARIF_FILE", SARIF_FILE, mtime=datetime.datetime.now()