-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
102 lines (80 loc) · 3.31 KB
/
utils.py
File metadata and controls
102 lines (80 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
from configs.paths_config import TEACHER_EXTRACTED_ISSUES_PATH, REQUIRED_KEYS
from configs.paths_config import ISSUES2INDICES_PATH, INDICES2ISSUES_PATH
def load_jsonl(path: str) -> list[dict]:
    """Load a JSONL file and return its records as a list.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: save_jsonl writes with ensure_ascii=False,
    # so the file may hold non-ASCII text and the platform default encoding
    # cannot be relied on.
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a stray empty line at the end of the file) are
        # skipped rather than handed to json.loads, which would raise.
        return [json.loads(line) for line in f if line.strip()]
def save_jsonl(path: str, data: list[dict]) -> None:
    """Write records to a JSONL file, one JSON document per line.

    Any existing file at ``path`` is overwritten.

    Args:
        path: Destination file path.
        data: Records to serialize; each must be JSON-serializable.

    Raises:
        TypeError: If a record contains a value json.dumps cannot serialize.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding may
    # be unable to represent them.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in data
        )
def read_teacher_issues_scores() -> list[dict[str, str | dict[str, int]]]:
    """Read the teacher-extracted issues from TEACHER_EXTRACTED_ISSUES_PATH.

    The file is read as JSONL (one JSON document per line). Each record is
    expected to have the structure:

        {
            "title": str,
            "issues": {
                "issue_name": int,  # severity score
                ...
            }
        }

    Only records whose key set is exactly REQUIRED_KEYS are kept; blank
    lines are ignored and every other record is skipped and counted.
    (REQUIRED_KEYS is defined in configs.paths_config; presumably it is
    {"title", "issues"} -- confirm there.)

    Returns:
        The records that passed the key check, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Normalize to a set so the equality test below also works if
    # REQUIRED_KEYS is declared as a list or tuple (a set never compares
    # equal to a list, which would silently drop every record).
    required_keys = set(REQUIRED_KEYS)
    extracted_issues: list[dict[str, str | dict[str, int]]] = []
    skipped = 0
    with open(TEACHER_EXTRACTED_ISSUES_PATH, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # Tolerate blank / trailing empty lines.
            issues_dict = json.loads(line)
            # Non-dict JSON values (lists, numbers, ...) have no keys and
            # are treated as malformed records instead of raising.
            if isinstance(issues_dict, dict) and set(issues_dict) == required_keys:
                extracted_issues.append(issues_dict)
            else:
                skipped += 1
    if skipped:
        # Make dropped records visible instead of discarding them silently.
        print(f"Skipped {skipped} record(s) whose keys do not match REQUIRED_KEYS.")
    print("Successfully read teacher extracted issues.")
    return extracted_issues
def vectorize_issues_scores(
    extracted_issues: list[dict[str, str | dict[str, int]]]
) -> list[dict[str, str | list[int]]]:
    """Vectorize the issues and severity scores from the teacher model output.

    Builds a vocabulary of every distinct issue name seen across all
    records, writes the issue -> index map to ISSUES2INDICES_PATH and the
    index -> issue map to INDICES2ISSUES_PATH (both as JSON), then encodes
    each record as two fixed-length vectors over that vocabulary.

    Args:
        extracted_issues: Records with a "title" and an "issues" mapping of
            issue name -> severity score.

    Returns:
        One dict per input record, in input order, with:
            - title: str, copied from the record
            - issues: list[int], 1 where the issue is present, else 0
            - severity: list[int], the record's score for each present
              issue, 0 where the issue is absent
    """
    # Collect all distinct issues; sorting keeps the index assignment
    # deterministic regardless of record order.
    occurred_issues = sorted(
        {issue for item in extracted_issues for issue in item["issues"]}
    )

    issues2indices = {issue: i for i, issue in enumerate(occurred_issues)}
    # Context managers guarantee the files are flushed and closed; passing a
    # bare open(...) to json.dump leaves that to garbage collection.
    with open(ISSUES2INDICES_PATH, "w") as f:
        json.dump(issues2indices, f)
    print("Successfully saved issues to indices map.")

    indices2issues = dict(enumerate(occurred_issues))
    # NOTE: JSON object keys are always strings, so the integer indices are
    # stored as "0", "1", ... and must be converted back when loading.
    with open(INDICES2ISSUES_PATH, "w") as f:
        json.dump(indices2issues, f)
    print("Successfully saved indices to issues map.")

    vocab_size = len(occurred_issues)
    encoded_issues_scores: list[dict[str, str | list[int]]] = []
    for item in extracted_issues:
        issues_vector = [0] * vocab_size
        severity_vector = [0] * vocab_size
        for issue, score in item["issues"].items():
            idx = issues2indices[issue]
            issues_vector[idx] = 1  # Presence label (0 or 1).
            # Score as given by the teacher; presumably 1 ~ 10 -- the range
            # is not enforced here.
            severity_vector[idx] = score
        encoded_issues_scores.append(
            {
                "title": item["title"],
                "issues": issues_vector,
                "severity": severity_vector,
            }
        )

    print("Successfully vectorized issues and scores.")
    print(f"Total unique issues: {vocab_size}.")
    return encoded_issues_scores
if __name__ == "__main__":
    # Script entry point: read the teacher output, then vectorize it (the
    # vectorize step also writes the two index-map JSON files).
    records = read_teacher_issues_scores()
    vectorize_issues_scores(records)