-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_diff_commit_pairs.py
More file actions
254 lines (219 loc) · 7.18 KB
/
extract_diff_commit_pairs.py
File metadata and controls
254 lines (219 loc) · 7.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python3
"""
Extract (diff, commit_message) pairs from a git repository's history.
Uses `git log` and `git diff` against the parent commit (or the empty tree for
root commits). Large diffs are compressed per DEVELOPER_GUIDE §5.3 (full patch;
``git diff --stat`` plus first N lines of patch; or ``--stat`` only).
Writes JSONL with ``commit``, ``diff``, ``commit_message``, ``diff_lines`` (raw
patch line count), and ``diff_tier`` (``full`` | ``stat_plus_head`` |
``stat_only``).
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
# Git's canonical empty tree object id; used as the diff base for commits that
# have no parent (e.g. the first commit in a repo).
_EMPTY_TREE = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
# Large-diff handling (see DEVELOPER_GUIDE §5.3): full patch, stat+head, or stat-only.
# Default for --full-max-lines: diffs with fewer raw lines than this are stored in full.
_DEFAULT_FULL_DIFF_MAX_LINES = 300
# Default for --patch-head-lines: leading patch lines kept after the --stat summary.
_DEFAULT_PATCH_HEAD_LINES = 300
# Default for --stat-only-above: diffs with more raw lines than this keep only --stat.
_DEFAULT_STAT_ONLY_ABOVE_LINES = 1000
def _run_git(
    repo: Path,
    *args: str,
    check: bool = True,
) -> subprocess.CompletedProcess[str]:
    """Run ``git -C <repo> <args...>`` and return the completed process.

    Both stdout and stderr are captured and decoded as UTF-8; bytes that do
    not decode are replaced instead of raising.

    Args:
        repo: Directory handed to ``git -C``.
        *args: Git subcommand followed by its arguments.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
    """
    argv = ["git", "-C", str(repo)]
    argv.extend(args)
    return subprocess.run(
        argv,
        check=check,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
def _commit_hashes(
    repo: Path,
    no_merges: bool,
    rev: str,
    max_count: int | None,
) -> list[str]:
    """List the commit hashes that ``git log <rev>`` reports.

    Args:
        repo: Repository to query.
        no_merges: Pass ``--no-merges`` to ``git log`` when true.
        rev: Revision (or range) handed to ``git log``.
        max_count: If not ``None``, forwarded as ``-n <max_count>``.

    Returns:
        Full hashes in the order ``git log`` printed them, blank lines dropped.

    Raises:
        subprocess.CalledProcessError: If ``git log`` exits non-zero.
    """
    log_args = ["log"]
    if no_merges:
        log_args.append("--no-merges")
    log_args += [rev, "--format=%H"]
    if max_count is not None:
        log_args += ["-n", str(max_count)]

    listing = _run_git(repo, *log_args, check=True).stdout
    candidates = (raw.strip() for raw in listing.splitlines())
    return [sha for sha in candidates if sha]
def _parent_or_none(repo: Path, commit: str) -> str | None:
    """Resolve ``<commit>^`` with ``git rev-parse``.

    Returns the stripped output on success, or ``None`` when ``rev-parse``
    exits non-zero (which is how a commit without a parent shows up here).
    """
    result = _run_git(repo, "rev-parse", f"{commit}^", check=False)
    return result.stdout.strip() if result.returncode == 0 else None
def _commit_message(repo: Path, commit: str, subject_only: bool) -> str:
    """Fetch the message of ``commit`` via ``git log -1``.

    Uses the ``%s`` format (subject line) when ``subject_only`` is true and
    ``%B`` (raw body) otherwise. Trailing newlines are removed.

    Raises:
        subprocess.CalledProcessError: If ``git log`` exits non-zero.
    """
    pretty = "%B"
    if subject_only:
        pretty = "%s"
    result = _run_git(repo, "log", "-1", "--format=" + pretty, commit, check=True)
    return result.stdout.rstrip("\n")
def _diff_base(repo: Path, commit: str) -> str:
    """Return the revision ``commit`` should be diffed against.

    That is the commit's parent when one resolves, otherwise git's empty tree.
    """
    parent_rev = _parent_or_none(repo, commit)
    if parent_rev is None:
        return _EMPTY_TREE
    return parent_rev
def _raw_patch(repo: Path, base: str, commit: str) -> str:
    """Return the uncoloured ``git diff <base> <commit>`` output, unmodified.

    Raises:
        subprocess.CalledProcessError: If ``git diff`` exits non-zero.
    """
    diff_args = ("diff", "--no-color", base, commit)
    return _run_git(repo, *diff_args, check=True).stdout
def _diff_stat(repo: Path, base: str, commit: str) -> str:
    """Return ``git diff --stat <base> <commit>`` without trailing newlines.

    Raises:
        subprocess.CalledProcessError: If ``git diff`` exits non-zero.
    """
    stat_args = ("diff", "--no-color", "--stat", base, commit)
    summary = _run_git(repo, *stat_args, check=True).stdout
    return summary.rstrip("\n")
def _shape_diff_for_size(
    repo: Path,
    base: str,
    commit: str,
    full_patch: str,
    full_max_lines: int,
    patch_head_lines: int,
    stat_only_above_lines: int,
) -> tuple[str, int, str]:
    """
    Apply §5.3 rules. Returns (stored_diff, raw_line_count, tier).

    tier is ``full``, ``stat_plus_head``, or ``stat_only``:

    * ``full`` - the patch has fewer than ``full_max_lines`` lines and is
      returned untouched.
    * ``stat_only`` - the patch has more than ``stat_only_above_lines`` lines;
      only the ``git diff --stat`` text (newline-terminated when non-empty)
      is returned.
    * ``stat_plus_head`` - everything in between: the ``--stat`` text, a blank
      line, then the first ``patch_head_lines`` lines of the patch.

    Lines are counted by newline characters only, which is how git terminates
    patch lines. Other characters that Python treats as line boundaries
    (carriage return, form feed, vertical tab, U+2028, ...) may legitimately
    occur inside a patch line and must neither inflate the count nor be
    rewritten in the stored excerpt.

    Args:
        repo: Repository used to compute ``--stat`` when it is needed.
        base: Revision the commit is diffed against.
        commit: Commit being described.
        full_patch: Raw ``git diff`` output for ``base..commit``.
        full_max_lines: Patches shorter than this are stored in full.
        patch_head_lines: Number of leading patch lines kept for medium diffs.
        stat_only_above_lines: Patches longer than this keep only ``--stat``.

    Raises:
        subprocess.CalledProcessError: If computing ``--stat`` fails.
    """
    # str.splitlines() would also break on \r, \f, \v, U+2028 etc., which
    # miscounts lines and corrupts the rebuilt head; split on "\n" only.
    lines = full_patch.split("\n")
    if lines[-1] == "":
        # Drop the empty tail left by a final newline (or by an empty patch).
        lines.pop()
    n = len(lines)
    if n < full_max_lines:
        return full_patch, n, "full"
    stat_text = _diff_stat(repo, base, commit)
    if n > stat_only_above_lines:
        out = stat_text + ("\n" if stat_text else "")
        return out, n, "stat_only"
    head = "\n".join(lines[:patch_head_lines])
    combined = stat_text + "\n\n" + head + "\n"
    return combined, n, "stat_plus_head"
def _diff_for_commit(
    repo: Path,
    commit: str,
    *,
    full_max_lines: int,
    patch_head_lines: int,
    stat_only_above_lines: int,
) -> tuple[str, int, str]:
    """Produce the stored diff for a single commit.

    Diffs ``commit`` against its base revision (parent or empty tree), then
    hands the raw patch to ``_shape_diff_for_size`` together with the three
    size thresholds.

    Returns:
        ``(stored_diff, raw_line_count, tier)`` as returned by
        ``_shape_diff_for_size``.

    Raises:
        subprocess.CalledProcessError: If a required git command fails.
    """
    against = _diff_base(repo, commit)
    patch_text = _raw_patch(repo, against, commit)
    thresholds = (full_max_lines, patch_head_lines, stat_only_above_lines)
    return _shape_diff_for_size(repo, against, commit, patch_text, *thresholds)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser used by ``main``."""
    cli = argparse.ArgumentParser(
        description="Dump diff + commit message pairs from git history as JSONL.",
    )
    cli.add_argument(
        "repo",
        nargs="?",
        default=".",
        type=Path,
        help="Path to git repository (default: current directory)",
    )
    cli.add_argument(
        "-o",
        "--output",
        type=Path,
        help="Write JSONL to this file (default: stdout)",
    )
    cli.add_argument(
        "--include-merges",
        action="store_true",
        help="Include merge commits (default: exclude them)",
    )
    cli.add_argument(
        "--subject-only",
        action="store_true",
        help="Use only the commit subject line as commit_message",
    )
    cli.add_argument(
        "-n",
        "--max-count",
        type=int,
        metavar="N",
        help="Limit to the N most recent commits",
    )
    cli.add_argument(
        "--rev",
        default="HEAD",
        help="Revision range to walk (default: HEAD)",
    )
    cli.add_argument(
        "--full-max-lines",
        type=int,
        default=_DEFAULT_FULL_DIFF_MAX_LINES,
        metavar="N",
        help=(
            "Use full patch when raw diff has fewer than N lines "
            f"(default: {_DEFAULT_FULL_DIFF_MAX_LINES})"
        ),
    )
    cli.add_argument(
        "--patch-head-lines",
        type=int,
        default=_DEFAULT_PATCH_HEAD_LINES,
        metavar="N",
        help=(
            "With medium-sized diffs, keep this many leading patch lines after "
            f"--stat (default: {_DEFAULT_PATCH_HEAD_LINES})"
        ),
    )
    cli.add_argument(
        "--stat-only-above",
        type=int,
        default=_DEFAULT_STAT_ONLY_ABOVE_LINES,
        metavar="N",
        help=(
            "If raw diff has more than N lines, store only git diff --stat "
            f"(default: {_DEFAULT_STAT_ONLY_ABOVE_LINES})"
        ),
    )
    return cli


def main() -> int:
    """Command-line entry point: write one JSONL record per selected commit.

    Each record carries ``commit``, ``diff``, ``commit_message``,
    ``diff_lines`` and ``diff_tier``. Records go to ``--output`` when given,
    otherwise to stdout.

    Returns:
        ``0`` on success, ``1`` when the target is not a git repository, or
        the failing git command's exit status when a git call errors out.
    """
    opts = _build_arg_parser().parse_args()
    repo_dir = opts.repo.resolve()

    # Cheap probe: this succeeds only inside a git repository.
    probe = _run_git(repo_dir, "rev-parse", "--git-dir", check=False)
    if probe.returncode != 0:
        print(f"error: not a git repository: {repo_dir}", file=sys.stderr)
        return 1

    skip_merges = not opts.include_merges
    try:
        shas = _commit_hashes(repo_dir, skip_merges, opts.rev, opts.max_count)
    except subprocess.CalledProcessError as err:
        print(err.stderr or str(err), file=sys.stderr)
        return err.returncode

    to_file = bool(opts.output)
    sink = open(opts.output, "w", encoding="utf-8") if to_file else sys.stdout
    try:
        for sha in shas:
            try:
                stored, raw_count, tier = _diff_for_commit(
                    repo_dir,
                    sha,
                    full_max_lines=opts.full_max_lines,
                    patch_head_lines=opts.patch_head_lines,
                    stat_only_above_lines=opts.stat_only_above,
                )
                message = _commit_message(repo_dir, sha, opts.subject_only)
            except subprocess.CalledProcessError as err:
                print(err.stderr or str(err), file=sys.stderr)
                return err.returncode
            row = json.dumps(
                {
                    "commit": sha,
                    "diff": stored,
                    "commit_message": message,
                    "diff_lines": raw_count,
                    "diff_tier": tier,
                },
                ensure_ascii=False,
            )
            sink.write(row + "\n")
    finally:
        # Only close what this function opened; stdout stays open.
        if to_file:
            sink.close()
    return 0
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())