diff --git a/report-generation/README.md b/report-generation/README.md new file mode 100644 index 00000000000..61c1b411cfd --- /dev/null +++ b/report-generation/README.md @@ -0,0 +1,54 @@ +JDK API Changes Report Generation + +Tina Xia, tzxia@uwaterloo.ca for issues or questions +Friday, April 2, 2025 + +This project automates the review process for newly introduced methods across subsequent JDK versions. + +----------- + +Configuration + +PERFORMING MINIMIZATION AND STORING RESULTS +1. Create a sub-directory called "results" in the "report-generation" directory + +2. Update the absolute paths in config.py to match your environment: + +JDK_REPO_PATH: path to your local JDK repository. +CHECKER_FRAMEWORK_REPO_PATH: path to your local Checker Framework repository. +RESULTS_BASE_DIR: path to the "results" sub-directory you just created + +3. run minimization script: +- python3 perform-minimization.py +- This will generate and store the minimized files for each jdk version in "results" + +GENERATE DIFF JSONS +4. run JSON generation script: +- python3 generate-json.py +- This will compare and store differences between files in a sub-directory called json_files in a directory called "report-files" +- This script can be modified to only generate JSON's for a defined sub-set of minimized files by altering the versions array in the processing portion + +GENERATE CSVs +5. run CSV generation script: +- python3 generate-csv.py +- This will generate a csv tracker for the review status of classes by iterating over the JSON files generated in the previous step +- NOTE: when introducing a new jdk version, this file must be modified to generate a CSV for ONLY the newly introduced jdk version, or past CSVs will be completely reset. + +GENERATE HTML Report +6. 
run HTML report generation script: + +- python3 generate-report.py +- this creates and opens a report in your browser, loading in information from the CSVs and JSONs generated in the previous step + +----------- + +Use + +Once a method has been reviewed, update the final column in each row "checked" by changing the entry from "False" to "True". The next time the report generation script is run, a tag will appear on the report that indicates its updated status. You must re-run the report generation script "python3 generate-report.py" to see the changes. + +----------- + +Future Updates + +New JDK Versions: +When a new JDK version is released, update the branch list in the scripts to only include the newly introduced versions. This ensures that processing a new branch does not overwrite progress in previous versions. \ No newline at end of file diff --git a/report-generation/config.py b/report-generation/config.py new file mode 100644 index 00000000000..58986208afe --- /dev/null +++ b/report-generation/config.py @@ -0,0 +1,4 @@ +# Update these absolute paths for your environment +JDK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/jdk" +CHECKER_FRAMEWORK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/checker-framework" +RESULTS_BASE_DIR = "/Users/tinaxia/Desktop/eisop/jdk/report-generation/results" diff --git a/report-generation/generate-csv.py b/report-generation/generate-csv.py new file mode 100644 index 00000000000..de9c4df266d --- /dev/null +++ b/report-generation/generate-csv.py @@ -0,0 +1,62 @@ +import json +import os +import csv + +csv_folder = "../report-files/csv_reports" +os.makedirs(csv_folder, exist_ok=True) + +data = {} +json_folder = "../report-files/json_files" +os.makedirs(csv_folder, exist_ok=True) + +json_files = [f for f in os.listdir(json_folder) if f.endswith(".json")] + +#this list must be modified for newly added jdk versions, so that past versions are not overwritten with every method review status set to False +versions_to_process = ['jdk-18', 'jdk-19', 
import bisect
import difflib
import json
import os
import re

#matches for empty lines, "import", any comment "/*", and closing braces (})
pattern = re.compile(r'^(?:\s*$|import\b|.*\/\*.*|\s*\*|\s*\})')

#any line that mentions the `class` keyword is treated as a class declaration
_CLASS_KEYWORD = re.compile(r'\bclass\b')


def _added_lines(old_lines, new_lines):
    """Yield ``(new_index, content)`` for every relevant line added in *new_lines*.

    ``new_index`` is the 0-based position of the line inside *new_lines* and
    ``content`` is the stripped line text. Added lines that match ``pattern``
    (blank lines, imports, comment lines, closing braces) are not yielded,
    but they still advance the index.
    """
    new_index = 0
    for line in difflib.ndiff(old_lines, new_lines):
        if line.startswith('+ '):
            content = line[2:].strip()
            if not pattern.match(content):
                yield new_index, content
            new_index += 1
        elif line.startswith('  '):
            #unchanged lines exist in the new file too; '- ' and '? ' lines do not
            new_index += 1


def extract_new_changes(old_lines, new_lines):
    """Group the lines added between two versions of a file by owning class.

    Args:
        old_lines: lines of the previous version (empty list for a new file).
        new_lines: lines of the current version.

    Returns:
        A dict with two keys:
        ``"new_classes"`` - list of ``{"declaration": str, "methods": [str]}``,
        one entry per added class declaration, holding the added lines whose
        nearest preceding class declaration is that new class;
        ``"new_methods"`` - added lines that belong to a class that already
        existed (or to no class at all).
    """
    added = list(_added_lines(old_lines, new_lines))

    #position of every class declaration in the new file; lines rejected by
    #`pattern` (e.g. Javadoc text that merely mentions "class") are skipped so
    #they cannot be mistaken for the parent class of a following method
    class_positions = [
        i for i, line in enumerate(new_lines)
        if _CLASS_KEYWORD.search(line) and not pattern.match(line.strip())
    ]

    new_classes = {
        idx: {"declaration": content, "methods": []}
        for idx, content in added
        if _CLASS_KEYWORD.search(content)
    }
    #for new methods that have been added to an existing class
    new_methods = []

    for idx, content in added:
        if idx in new_classes:
            continue  #the declaration itself was already recorded

        #class_positions is sorted, so the nearest declaration at or before
        #idx is the element just left of the insertion point
        insert_at = bisect.bisect_right(class_positions, idx)
        parent_class_idx = class_positions[insert_at - 1] if insert_at else None

        if parent_class_idx in new_classes:
            new_classes[parent_class_idx]["methods"].append(content)
        else:
            new_methods.append(content)

    return {
        "new_classes": list(new_classes.values()),
        "new_methods": new_methods
    }


#https://stackoverflow.com/questions/8625991/use-python-os-walk-to-identify-a-list-of-files
#get all java files in every subdirectory of results (after minimization is performed)
def list_java_files(directory):
    """Return the sorted relative paths of all ``.java`` files under *directory*.

    Non-Java files (for example ``.DS_Store``) are ignored so that they are
    never handed to the UTF-8 text reader. Sorting makes the result, and
    everything generated from it, independent of the ``os.walk`` order.
    """
    java_files = []
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith('.java'):
                full_path = os.path.join(root, name)
                java_files.append(os.path.relpath(full_path, directory))

    java_files.sort()
    return java_files


def read_file_lines(filepath):
    """Return the lines of the UTF-8 text file *filepath*, line endings included."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.readlines()


#https://docs.python.org/3/library/difflib.html
def extract_new_lines(old_lines, new_lines):
    """Return the stripped text of every added line that ``pattern`` does not reject.

    Uses the same line-by-line ``difflib.ndiff`` comparison and the same
    filter as ``extract_new_changes``, without the grouping by class.
    """
    return [content for _, content in _added_lines(old_lines, new_lines)]


#returns a dictionary of new changes between subsequent JDK versions
def compare_version_pair(old_version_dir, new_version_dir):
    """Diff every Java file of *new_version_dir* against *old_version_dir*.

    Args:
        old_version_dir: directory holding the previous version's files.
        new_version_dir: directory holding the newer version's files; its
            last path component is used as the branch name in the URLs.

    Returns:
        A dict mapping a GitHub URL to ``{"new_changes": ...}`` (the result of
        ``extract_new_changes``) for every file that has at least one new
        class or new method. A file missing from the old version is compared
        against an empty file, so all of its relevant lines count as new.
    """
    #normpath drops a trailing separator, which would otherwise make
    #basename return '' and produce a URL without a branch
    branch = os.path.basename(os.path.normpath(new_version_dir))
    base_url = f"https://github.com/eisop/jdk/tree/{branch}"
    file_diff = {}

    for rel_path in list_java_files(new_version_dir):
        #generates github URL; the first path component is dropped
        #NOTE(review): presumably a wrapper directory created by the
        #minimization step - confirm against perform-minimization.py
        parts = rel_path.split(os.path.sep)
        url = base_url + '/' + '/'.join(parts[1:])

        new_file_path = os.path.join(new_version_dir, rel_path)
        old_file_path = os.path.join(old_version_dir, rel_path)

        if os.path.exists(old_file_path):
            old_lines = read_file_lines(old_file_path)
        else:
            old_lines = []

        new_lines = read_file_lines(new_file_path)

        changes = extract_new_changes(old_lines, new_lines)
        if changes["new_classes"] or changes["new_methods"]:
            file_diff[url] = {
                "new_changes": changes
            }
    return file_diff
os.path.join(json_folder, f"{versions[i]}.json") + + with open(current_json, 'w', encoding='utf-8') as out_f: + json.dump(results, out_f, indent=2) + + print(f"Created json for {old_version} -> {new_version}") \ No newline at end of file diff --git a/report-generation/generate-report.py b/report-generation/generate-report.py new file mode 100644 index 00000000000..3efefbf6280 --- /dev/null +++ b/report-generation/generate-report.py @@ -0,0 +1,181 @@ +import json +import os +import csv +import webbrowser +from html import escape + +data = {} +json_folder = "../report-files/json_files" + +#loads data from all present json files +for filename in os.listdir(json_folder): + if filename.endswith(".json"): + version = os.path.splitext(filename)[0] + file_path = os.path.join(json_folder, filename) + with open(file_path, "r", encoding="utf-8") as f: + file_data = json.load(f) + data[version] = file_data[version] + +checked_methods = {} +csv_folder = "../report-files/csv_reports" + +#loads the review status from the csv files +for version in data: + csv_file = os.path.join(csv_folder, f"{version}_methods.csv") + if os.path.exists(csv_file): + with open(csv_file, "r", encoding="utf-8") as f_csv: + reader = csv.DictReader(f_csv) + for row in reader: + key = (row["version"], row["file_url"], row["method"]) + checked_methods[key] = (row["checked"].strip().lower() == "true") + +output_dir = os.path.join("..", "report-files", "reports") +os.makedirs(output_dir, exist_ok=True) + +custom_css = """ + +""" + +# HTML for the index page +index_content = f""" + +
+ + +