diff --git a/report-generation/README.md b/report-generation/README.md new file mode 100644 index 00000000000..61c1b411cfd --- /dev/null +++ b/report-generation/README.md @@ -0,0 +1,54 @@ +JDK API Changes Report Generation + +Tina Xia, tzxia@uwaterloo.ca for issues or questions +Friday, April 2, 2025 + +This project automates the review process for newly introduced methods across subsequent JDK versions. + +----------- + +Configuration + +PERFORMING MINIMIZATION AND STORING RESULTS +1. Create a sub-directory called "results" in the "report-generation" directory + +2. Update the absolute paths in config.py to match your environment: + +JDK_REPO_PATH: path to your local JDK repository. +CHECKER_FRAMEWORK_REPO_PATH: path to your local Checker Framework repository. +RESULTS_BASE_DIR: path to the "results" sub-directory you just created + +3. Run the minimization script: +- python3 perform-minimization.py +- This will generate and store the minimized files for each JDK version in "results" + +GENERATE DIFF JSONS +4. Run the JSON generation script: +- python3 generate-json.py +- This will compare and store differences between files in a sub-directory called json_files in a directory called "report-files" +- This script can be modified to generate JSONs only for a defined subset of minimized files by altering the versions array in the processing portion + +GENERATE CSVs +5. Run the CSV generation script: +- python3 generate-csv.py +- This will generate a CSV tracker for the review status of classes by iterating over the JSON files generated in the previous step +- NOTE: when introducing a new JDK version, this file must be modified to generate a CSV for ONLY the newly introduced JDK version, or past CSVs will be completely reset. + +GENERATE HTML REPORT +6. 
Run the HTML report generation script: + +- python3 generate-report.py +- This creates and opens a report in your browser, loading in information from the CSVs and JSONs generated in the previous step + +----------- + +Use + +Once a method has been reviewed, update the final column ("checked") of its row by changing the entry from "False" to "True". The next time the report generation script is run, a tag will appear on the report that indicates its updated status. You must re-run the report generation script "python3 generate-report.py" to see the changes. + +----------- + +Future Updates + +New JDK Versions: +When a new JDK version is released, update the branch list in the scripts to only include the newly introduced versions. This ensures that processing a new branch does not overwrite progress in previous versions. \ No newline at end of file diff --git a/report-generation/config.py b/report-generation/config.py new file mode 100644 index 00000000000..58986208afe --- /dev/null +++ b/report-generation/config.py @@ -0,0 +1,4 @@ +# Update these absolute paths for your environment +JDK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/jdk" +CHECKER_FRAMEWORK_REPO_PATH = "/Users/tinaxia/Desktop/eisop/checker-framework" +RESULTS_BASE_DIR = "/Users/tinaxia/Desktop/eisop/jdk/report-generation/results" diff --git a/report-generation/generate-csv.py b/report-generation/generate-csv.py new file mode 100644 index 00000000000..de9c4df266d --- /dev/null +++ b/report-generation/generate-csv.py @@ -0,0 +1,62 @@ +import json +import os +import csv + +csv_folder = "../report-files/csv_reports" +os.makedirs(csv_folder, exist_ok=True) + +data = {} +json_folder = "../report-files/json_files" +os.makedirs(csv_folder, exist_ok=True) + +json_files = [f for f in os.listdir(json_folder) if f.endswith(".json")] + +#this list must be modified for newly added jdk versions, so that past versions are not overwritten with every method review status set to False +versions_to_process = ['jdk-18', 'jdk-19', 
"""Generate one CSV review tracker per JDK version from the JSON diff files.

For every ``<version>.json`` in ``json_folder`` whose version is listed in
``versions_to_process``, a ``<version>_methods.csv`` file is written to
``csv_folder`` with the columns ``version, file_url, method, checked``.
Each changed file gets one row with an empty ``method`` column, followed by
one row per newly introduced class declaration.
"""
import csv
import json
import os

csv_folder = "../report-files/csv_reports"
json_folder = "../report-files/json_files"

# Versions whose CSV is (re)generated. Rows that are already marked "True" in
# an existing CSV keep that status; every other row starts out as "False".
versions_to_process = ['jdk-18', 'jdk-19', 'jdk-20', 'jdk-21', 'jdk-22', 'jdk-23', 'jdk-24']

fieldnames = ["version", "file_url", "method", "checked"]


def load_version_data(folder, versions):
    """Return ``{version: diff data}`` for the JSON files found in *folder*.

    Only files named ``<version>.json`` with ``<version>`` in *versions* are
    read. Each file is expected to hold a single top-level key equal to its
    version name; a ``KeyError`` is raised when that key is absent.
    """
    data = {}
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue
        version = os.path.splitext(filename)[0]
        if version not in versions:
            continue
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            file_data = json.load(f)
        data[version] = file_data[version]
    return data


def load_checked_status(csv_path):
    """Return the set of ``(file_url, method)`` keys marked checked in *csv_path*.

    A missing file yields an empty set. The "checked" column is compared
    case-insensitively against "true".
    """
    checked = set()
    if not os.path.exists(csv_path):
        return checked
    with open(csv_path, "r", newline='', encoding="utf-8") as f_csv:
        for row in csv.DictReader(f_csv):
            if (row.get("checked") or "").strip().lower() == "true":
                checked.add((row.get("file_url") or "", row.get("method") or ""))
    return checked


def build_rows(version, version_data, checked=None):
    """Return the CSV rows for one version.

    *version_data* maps a file URL to ``{"new_changes": {...}}``. For each
    URL one file-level row (empty ``method``) is produced, then one row per
    entry of ``new_classes`` using its ``declaration``. Keys present in
    *checked* (a set of ``(file_url, method)``) are written as "True".
    """
    checked = checked or set()
    rows = []
    for file_url, file_data in version_data.items():
        new_classes = file_data.get("new_changes", {}).get("new_classes", [])
        # The empty string stands for the file-level row.
        methods = [""] + [cls.get("declaration") or "" for cls in new_classes]
        for method in methods:
            rows.append({
                "version": version,
                "file_url": file_url,
                "method": method,
                "checked": "True" if (file_url, method) in checked else "False",
            })
    return rows


def main():
    """Write ``<version>_methods.csv`` for every processed version."""
    if not os.path.isdir(json_folder):
        raise SystemExit(f"'{json_folder}' not found. Run generate-json.py first.")
    os.makedirs(csv_folder, exist_ok=True)

    data = load_version_data(json_folder, versions_to_process)
    for version, version_data in data.items():
        current_csv = os.path.join(csv_folder, f"{version}_methods.csv")
        # Read the old review status BEFORE the file is truncated for writing.
        rows = build_rows(version, version_data, load_checked_status(current_csv))
        with open(current_csv, "w", newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print(f"Created csv for {version}")


if __name__ == "__main__":
    main()
# Lines that never describe an API change: blank lines, "import" statements,
# any line containing "/*", comment continuation lines starting with "*",
# and lines starting with a closing brace.
pattern = re.compile(r'^(?:\s*$|import\b|.*\/\*.*|\s*\*|\s*\})')

# The keyword "class" as a whole word.
_CLASS_KEYWORD = re.compile(r'\bclass\b')


def extract_new_changes(old_lines, new_lines):
    """Classify the lines added between two versions of a file.

    Args:
        old_lines: lines of the old file (empty list when the file is new).
        new_lines: lines of the new file.

    Returns:
        A dict with two keys:
        ``"new_classes"`` - list of ``{"declaration": str, "methods": [str]}``
        for every added line containing the word ``class``; ``methods`` holds
        the added lines whose nearest preceding ``class`` line is that one.
        ``"new_methods"`` - every other added line, i.e. additions that fall
        under a class line that already existed (or under no class at all).
        Added lines matching ``pattern`` are ignored.
    """
    import bisect  # local import: not part of this file's top-level imports

    # (index in new_lines, stripped content) of each relevant added line.
    added = []
    new_index = 0
    for line in difflib.ndiff(old_lines, new_lines):
        if line.startswith('+ '):
            content = line[2:].strip()
            if not pattern.match(content):
                added.append((new_index, content))
        # Unchanged and added lines both exist in the new file; '- ' and '? '
        # lines do not, so they must not advance the new-file index.
        if line.startswith((' ', '+ ')):
            new_index += 1

    # Sorted indices of every line in the new file mentioning "class".
    class_positions = [
        i for i, line in enumerate(new_lines) if _CLASS_KEYWORD.search(line)
    ]

    new_classes = {
        idx: {"declaration": content, "methods": []}
        for idx, content in added
        if _CLASS_KEYWORD.search(content)
    }

    new_methods = []
    for idx, content in added:
        if idx in new_classes:
            continue  # the class declaration itself, already recorded
        # Nearest class line at or before this line, if any.
        slot = bisect.bisect_right(class_positions, idx) - 1
        parent = class_positions[slot] if slot >= 0 else None
        if parent in new_classes:
            new_classes[parent]["methods"].append(content)
        else:
            new_methods.append(content)

    return {
        "new_classes": list(new_classes.values()),
        "new_methods": new_methods
    }


#https://stackoverflow.com/questions/8625991/use-python-os-walk-to-identify-a-list-of-files
def list_java_files(directory):
    """Return the sorted relative paths of all ``.java`` files under *directory*.

    Non-Java files (for example ``.DS_Store``) are skipped so that callers
    never try to read them as UTF-8 source. Sorting makes the result
    independent of the order in which ``os.walk`` visits entries.
    """
    java_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.java'):
                continue
            full_path = os.path.join(root, file)
            java_files.append(os.path.relpath(full_path, directory))
    return sorted(java_files)


def read_file_lines(filepath):
    """Return the lines of the UTF-8 text file *filepath*, newlines included."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.readlines()
#https://docs.python.org/3/library/difflib.html
def extract_new_lines(old_lines, new_lines):
    """Return the stripped lines that ``difflib.ndiff`` reports as added.

    Added lines matching the module-level ``pattern`` are left out.
    """
    added = (
        line[2:].strip()
        for line in difflib.ndiff(old_lines, new_lines)
        if line.startswith('+ ')  # ndiff marks new lines with "+ "
    )
    return [content for content in added if not pattern.match(content)]


def compare_version_pair(old_version_dir, new_version_dir):
    """Return the new changes between two minimized JDK version directories.

    Args:
        old_version_dir: directory holding the older version's files.
        new_version_dir: directory holding the newer version's files; its
            final path component is used as the branch name in the URLs.

    Returns:
        ``{github_url: {"new_changes": changes}}`` for every file of the new
        version for which ``extract_new_changes`` reports at least one new
        class or method. A file absent from the old version is compared
        against an empty file, so all of its relevant lines count as new.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty branch name.
    branch = os.path.basename(os.path.normpath(new_version_dir))
    base_url = f"https://github.com/eisop/jdk/tree/{branch}"
    file_diff = {}

    for rel_path in list_java_files(new_version_dir):
        # The first path component is dropped from the URL.
        # NOTE(review): presumably the "annotated-jdk" folder created by the
        # minimization step - confirm against the results layout.
        parts = rel_path.split(os.path.sep)
        url = base_url + '/' + '/'.join(parts[1:])

        old_file_path = os.path.join(old_version_dir, rel_path)
        if os.path.exists(old_file_path):
            old_lines = read_file_lines(old_file_path)
        else:
            old_lines = []
        new_lines = read_file_lines(os.path.join(new_version_dir, rel_path))

        changes = extract_new_changes(old_lines, new_lines)
        if changes["new_classes"] or changes["new_methods"]:
            file_diff[url] = {
                "new_changes": changes
            }
    return file_diff


#this list must be modified for newly added jdk versions
versions = ['master', 'jdk-18', 'jdk-19', 'jdk-20', 'jdk-21', 'jdk-22', 'jdk-23', 'jdk-24']

base_path = './results/'

json_folder = "../report-files/json_files"


def main():
    """Write one ``<new_version>.json`` per pair of consecutive versions."""
    os.makedirs(json_folder, exist_ok=True)

    for old_version, new_version in zip(versions, versions[1:]):
        old_dir = os.path.join(base_path, old_version)
        new_dir = os.path.join(base_path, new_version)

        # Without the old directory every line would be reported as new, and
        # without the new one the JSON would be empty - skip instead of
        # writing a misleading file.
        missing = [d for d in (old_dir, new_dir) if not os.path.isdir(d)]
        if missing:
            print(f"Skipping {old_version} -> {new_version}: missing {', '.join(missing)}")
            continue

        results = {new_version: compare_version_pair(old_dir, new_dir)}

        current_json = os.path.join(json_folder, f"{new_version}.json")
        with open(current_json, 'w', encoding='utf-8') as out_f:
            json.dump(results, out_f, indent=2)

        print(f"Created json for {old_version} -> {new_version}")


if __name__ == "__main__":
    main()
+

JDK API Changes

+ +
"""Run the Checker Framework minimization task for every remote JDK branch.

For each branch listed by ``git ls-remote --heads origin`` in the JDK
repository, the branch is checked out and hard-reset to its remote state,
the ``copyAndMinimizeAnnotatedJdkFiles`` Gradle task is run in the Checker
Framework repository, and the generated ``annotated-jdk`` directory is moved
into ``RESULTS_BASE_DIR/<branch>/``. The branch that was checked out when the
script started is restored at the end, including when the run is aborted.
"""
import os
import platform
import shutil
import subprocess
import sys

import config

#update these absolute paths for your environment (see config.py)
JDK_REPO_PATH = config.JDK_REPO_PATH
CHECKER_FRAMEWORK_REPO_PATH = config.CHECKER_FRAMEWORK_REPO_PATH
RESULTS_BASE_DIR = config.RESULTS_BASE_DIR

if platform.system() == "Windows":
    gradle_task_cmd = ["cmd", "/c", "gradlew.bat", "copyAndMinimizeAnnotatedJdkFiles"]
else:
    gradle_task_cmd = ["sh", "./gradlew", "copyAndMinimizeAnnotatedJdkFiles"]


def check_required_directories():
    """Exit with status 1 when one of the configured directories is missing."""
    required = (
        (JDK_REPO_PATH, "Please ensure you have the JDK repo on your local machine"),
        (CHECKER_FRAMEWORK_REPO_PATH,
         "Please ensure you have the Checker Framework repo on your local machine"),
        (RESULTS_BASE_DIR, "Please create this directory in the report-generation directory."),
    )
    for path, hint in required:
        if not os.path.isdir(path):
            print(f"'{path}' not found. {hint}")
            sys.exit(1)


def run_git(*args, **kwargs):
    """Run ``git <args>`` inside the JDK repository; raise on a non-zero exit."""
    return subprocess.run(["git", *args], cwd=JDK_REPO_PATH, check=True, **kwargs)


def list_remote_branches():
    """Return the branch names reported by ``git ls-remote --heads origin``."""
    result = run_git("ls-remote", "--heads", "origin", capture_output=True, text=True)
    # Each output line is "<sha>\trefs/heads/<branch>".
    return [line.split("\t")[1].replace("refs/heads/", "")
            for line in result.stdout.splitlines() if line.strip()]


def minimize_branch(jdk_version):
    """Minimize one JDK branch and store the output under RESULTS_BASE_DIR.

    A branch that cannot be checked out, or whose Gradle output directory is
    missing, is skipped. A failing Gradle task aborts the whole run.
    """
    print(f"\nprocessing version: {jdk_version}")

    # NOTE: "reset --hard" discards local commits and changes on that branch.
    try:
        run_git("checkout", jdk_version)
        run_git("reset", "--hard", f"origin/{jdk_version}")
    except subprocess.CalledProcessError as e:
        # Output is not captured, so e.stderr is None; git already wrote its
        # own message to the terminal. Print the exception (command + code).
        print(f"Error checking out branch {jdk_version}: {e}")
        return

    print(f"Checked out JDK version: {jdk_version}")

    print(f"Running Gradle task for {jdk_version}...")
    try:
        subprocess.run(gradle_task_cmd, cwd=CHECKER_FRAMEWORK_REPO_PATH, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running for {jdk_version}: {e}")
        sys.exit(1)
    print(f"Gradle task done for {jdk_version}!")

    # Output directory of the Gradle task.
    output_dir = os.path.join(CHECKER_FRAMEWORK_REPO_PATH, "framework", "build",
                              "generated", "resources", "annotated-jdk")
    if not os.path.exists(output_dir):
        print("error finding otuput directory")
        return

    # Start from an empty results directory for this branch.
    result_dir = os.path.join(RESULTS_BASE_DIR, jdk_version)
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir)
    os.makedirs(result_dir, exist_ok=True)

    # result_dir exists, so the output lands at <result_dir>/annotated-jdk.
    shutil.move(output_dir, result_dir)
    print(f"Result stored in: {result_dir}")


def restore_branch(initial_branch):
    """Check out *initial_branch* again; report, but do not raise, on failure."""
    try:
        run_git("checkout", initial_branch)
        print(f"\nMoved back into the initial branch {initial_branch}")
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error switching back to the initial branch '{initial_branch}'. Error: {e}")


def main():
    """Minimize every remote branch, then return to the starting branch."""
    check_required_directories()

    initial_branch = run_git(
        "rev-parse", "--abbrev-ref", "HEAD", capture_output=True, text=True
    ).stdout.strip()

    try:
        run_git("fetch", "--all", "--prune")
        for jdk_version in list_remote_branches():
            minimize_branch(jdk_version)
        print("\nAll minimizations completed!")
    finally:
        # Runs on normal completion and on sys.exit()/errors alike.
        restore_branch(initial_branch)


if __name__ == "__main__":
    main()