Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions report-generation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
JDK API Changes Report Generation

Tina Xia, tzxia@uwaterloo.ca for issues or questions
Friday, April 2, 2025

This project automates the review process for newly introduced methods across subsequent JDK versions.

-----------

Configuration

PERFORMING MINIMIZATION AND STORING RESULTS
1. Create a sub-directory called "results" in the "report-generation" directory

2. Update the absolute paths in config.py to match your environment:

JDK_REPO_PATH: path to your local JDK repository.
CHECKER_FRAMEWORK_REPO_PATH: path to your local Checker Framework repository.
RESULTS_BASE_DIR: path to the "results" sub-directory you just created

3. run minimization script:
- python3 perform-minimization.py
- This will generate and store the minimized files for each jdk version in "results"

GENERATE DIFF JSONS
4. run JSON generation script:
- python3 generate-json.py
- This will compare and store differences between files in a sub-directory called json_files in a directory called "report-files"
- This script can be modified to generate JSONs for only a defined subset of the minimized files by altering the versions array in the processing portion

GENERATE CSVs
5. run CSV generation script:
- python3 generate-csv.py
- This will generate a csv tracker for the review status of classes by iterating over the JSON files generated in the previous step
- NOTE: when introducing a new jdk version, this file must be modified to generate a CSV for ONLY the newly introduced jdk version, or past CSVs will be completely reset.

GENERATE HTML Report
6. run HTML report generation script:

- python3 generate-report.py
- this creates and opens a report in your browser, loading in information from the CSVs and JSONs generated in the previous step

-----------

Use

Once a method has been reviewed, update the final column ("checked") of its row in the CSV by changing the entry from "False" to "True". The next time the report generation script is run, a tag will appear on the report that indicates its updated status. You must re-run the report generation script "python3 generate-report.py" to see the changes.

-----------

Future Updates

New JDK Versions:
When a new JDK version is released, update the branch list in the scripts to include only the newly introduced versions. This ensures that processing a new branch does not overwrite progress in previous versions.
4 changes: 4 additions & 0 deletions report-generation/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Absolute paths consumed by the report-generation scripts.
# Edit them so they point at the checkouts on your machine.
JDK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/jdk"
CHECKER_FRAMEWORK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/checker-framework"
# The "results" sub-directory lives in the JDK checkout's report-generation folder.
RESULTS_BASE_DIR = JDK_REPO_PATH + "/report-generation/results"
62 changes: 62 additions & 0 deletions report-generation/generate-csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import json
import os
import csv

#folder the per-version review trackers are written to
csv_folder = "../report-files/csv_reports"
#folder holding the per-version diff JSONs that are read as input
json_folder = "../report-files/json_files"

#this list must be modified for newly added jdk versions, so that past versions are not overwritten with every method review status set to False
versions_to_process = ['jdk-18', 'jdk-19', 'jdk-20', 'jdk-21', 'jdk-22', 'jdk-23', 'jdk-24']

#column order of every generated csv
fieldnames = ["version", "file_url", "method", "checked"]


def load_version_data(folder, versions):
    """Load the diff JSON of every requested version found in *folder*.

    A file is considered only when its name ends in ".json" and its name
    without the extension is listed in *versions*. Each file is expected to
    hold a top-level key equal to its version name; a missing key raises
    KeyError.

    Returns a dict mapping version name to the value stored under that key.
    """
    loaded = {}
    #sorted so that versions are always processed in the same order
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue
        version = os.path.splitext(filename)[0]
        #processes only the versions in the versions list
        if version not in versions:
            continue
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            loaded[version] = json.load(f)[version]
    return loaded


def write_version_csv(folder, version, version_data):
    """Write "<version>_methods.csv" into *folder* and return its path.

    *version_data* maps a file_url to a dict that may carry a "new_changes"
    entry. For every file_url one row with an empty "method" column is
    written, followed by one row per entry of new_changes["new_classes"]
    whose "method" column holds that entry's "declaration". Every row starts
    with "checked" set to "False". An existing csv at the same path is
    overwritten.
    """
    current_csv = os.path.join(folder, f"{version}_methods.csv")
    with open(current_csv, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        #since changes are grouped by file_url in the json
        for file_url, file_data in version_data.items():
            new_classes = file_data.get("new_changes", {}).get("new_classes", [])

            #file-level row: tracks the review status of the file itself
            writer.writerow({
                "version": version,
                "file_url": file_url,
                "checked": "False"
            })

            #store checked status for newly introduced classes
            for cls in new_classes:
                writer.writerow({
                    "version": version,
                    "file_url": file_url,
                    "method": cls.get("declaration"),
                    "checked": "False"
                })
    return current_csv


def main():
    """Regenerate the csv tracker of every version in versions_to_process."""
    os.makedirs(csv_folder, exist_ok=True)
    data = load_version_data(json_folder, versions_to_process)
    for version, version_data in data.items():
        write_version_csv(csv_folder, version, version_data)
        print(f"Created csv for {version}")


if __name__ == "__main__":
    main()
154 changes: 154 additions & 0 deletions report-generation/generate-json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import os
import json
import difflib
import re

#matches for empty lines, "import", any comment "/*", and closing braces (})
#it is applied to already-stripped text; "//" line comments are not matched
pattern = re.compile(r'^(?:\s*$|import\b|.*\/\*.*|\s*\*|\s*\})')

#matches the "class" keyword as a whole word anywhere in a line
_CLASS_KEYWORD = re.compile(r'\bclass\b')


def _is_comment_line(line):
    """Return True when *line* starts, after indentation, with a comment marker."""
    return line.lstrip().startswith(('/*', '*', '//'))


def extract_new_changes(old_lines, new_lines):
    """Group the lines added between two versions of a file by enclosing class.

    Args:
        old_lines: lines of the older file (empty list when the file is new).
        new_lines: lines of the newer file.

    Returns:
        A dict with two keys:
        "new_classes": one {"declaration": str, "methods": [str]} entry per
            added line containing the word "class", in file order; "methods"
            holds the added lines attributed to that class.
        "new_methods": every other added line, i.e. those whose nearest
            preceding class line was not itself added.

    Added lines matching the module-level ``pattern`` (blank lines, imports,
    block-comment lines, closing braces) are ignored. The parent of an added
    line is simply the closest class line above it; brace nesting is not
    analysed.
    """
    #walk the diff once, tracking the index each line has in new_lines
    new_changes = [] #list of tuples (new_line_index, content)
    new_index = 0
    for line in difflib.ndiff(old_lines, new_lines):
        if line.startswith('+ '):
            content = line[2:].strip()
            if not pattern.match(content):
                new_changes.append((new_index, content))

        #unchanged and added lines both exist in new_lines; removed and "? " hint lines do not
        if line.startswith((' ', '+ ')):
            new_index += 1

    #index of every class line in the new file; comment lines that merely
    #mention the word "class" are skipped so they cannot become a parent
    class_lines = [
        i for i, line in enumerate(new_lines)
        if _CLASS_KEYWORD.search(line) and not _is_comment_line(line)
    ]

    new_classes = {
        idx: {"declaration": content, "methods": []}
        for idx, content in new_changes
        if _CLASS_KEYWORD.search(content)
    }
    #for new methods that have been added to an existing class
    new_methods = []

    #new_changes and class_lines are both ascending, so a single forward
    #cursor finds the closest class line at or above each change
    cursor = 0
    parent_class_idx = None
    for idx, content in new_changes:
        if idx in new_classes:
            continue #already handled

        while cursor < len(class_lines) and class_lines[cursor] <= idx:
            parent_class_idx = class_lines[cursor]
            cursor += 1

        if parent_class_idx in new_classes:
            new_classes[parent_class_idx]["methods"].append(content)
        else:
            new_methods.append(content)

    return {
        "new_classes": list(new_classes.values()),
        "new_methods": new_methods
    }

#https://stackoverflow.com/questions/8625991/use-python-os-walk-to-identify-a-list-of-files
#get all java files in every subdirectory of results (after minimization is performed)
def list_java_files(directory):
    """Return the sorted relative paths of every ".java" file below *directory*.

    Files with any other extension (for example ".DS_Store") are skipped so
    that only Java sources are later read and diffed. Paths are relative to
    *directory*. A directory that does not exist yields an empty list,
    because os.walk produces nothing for it.
    """
    java_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".java"):
                continue
            full_path = os.path.join(root, file)
            java_files.append(os.path.relpath(full_path, directory))

    #os.walk order is arbitrary; sorting keeps the generated reports stable
    java_files.sort()
    return java_files

def read_file_lines(filepath):
    """Return the lines of the UTF-8 text file at *filepath*, line endings kept."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return list(handle)

#https://docs.python.org/3/library/difflib.html
#diffs two versions of a file with python's difflib and keeps the added lines
def extract_new_lines(old_lines, new_lines):
    """Return the stripped text of every added line that ``pattern`` does not reject."""
    #ndiff prefixes lines that exist only in new_lines with "+ "
    added = (
        entry[2:].strip()
        for entry in difflib.ndiff(old_lines, new_lines)
        if entry.startswith('+ ')
    )
    #drop whatever the module-level regex filters out
    return [text for text in added if not pattern.match(text)]

#builds the per-file report of additions between two minimized JDK version directories
def compare_version_pair(old_version_dir, new_version_dir):
    """Map a GitHub URL to the new changes of each file in *new_version_dir*.

    Every file listed for the new version is compared with the file at the
    same relative path in *old_version_dir*; a file with no older counterpart
    is compared against an empty file. Only files with at least one new class
    or new method appear in the result, stored as {"new_changes": changes}.
    """
    #the last component of the new directory is used as the branch name in the URL
    branch = os.path.basename(new_version_dir)
    branch_url = f"https://github.com/eisop/jdk/tree/{branch}"

    report = {}
    for relative in list_java_files(new_version_dir):
        #the first component of the relative path is left out of the URL
        link = branch_url + '/' + '/'.join(relative.split(os.path.sep)[1:])

        previous_path = os.path.join(old_version_dir, relative)
        previous = read_file_lines(previous_path) if os.path.exists(previous_path) else []
        current = read_file_lines(os.path.join(new_version_dir, relative))

        delta = extract_new_changes(previous, current)
        if delta["new_classes"] or delta["new_methods"]:
            report[link] = {"new_changes": delta}
    return report

#branches to diff, oldest first; this list must be modified for newly added jdk versions
versions = ['master', 'jdk-18', 'jdk-19', 'jdk-20', 'jdk-21', 'jdk-22', 'jdk-23', 'jdk-24']

base_path = './results/'

json_folder = "../report-files/json_files"
os.makedirs(json_folder, exist_ok=True)

#diffs every version against its predecessor and writes one json file per newer version
for old_version, new_version in zip(versions, versions[1:]):
    current_diff = compare_version_pair(
        os.path.join(base_path, old_version),
        os.path.join(base_path, new_version),
    )

    #the json is keyed by the newer version and stored as "<newer version>.json"
    current_json = os.path.join(json_folder, f"{new_version}.json")
    with open(current_json, 'w', encoding='utf-8') as out_f:
        json.dump({new_version: current_diff}, out_f, indent=2)

    print(f"Created json for {old_version} -> {new_version}")
Loading