From 41e130c5eeefc658305fe8f5f8fabfac3e241a61 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Feb 2026 15:42:37 +0100 Subject: [PATCH 1/2] Close file objects manually ... just to ensure that they're closed as early as possible and don't potentially leak any resources. --- .../evaluation/evaluate_mathvista.py | 3 +- .../evaluation/evaluate_ocrbench.py | 3 +- .../evaluation/evaluate_video_mvbench.py | 3 +- .../evaluate_video_phys_game_bench.py | 3 +- .../multimodal/evaluation/evaluate_vqav2.py | 3 +- .../evaluation/evaluation_datasets.py | 233 +++++++++++------- examples/multimodal/evaluation/mmmu_utils.py | 8 +- 7 files changed, 160 insertions(+), 96 deletions(-) diff --git a/examples/multimodal/evaluation/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py index cb6b2ebd236..48d1a0c37a9 100644 --- a/examples/multimodal/evaluation/evaluate_mathvista.py +++ b/examples/multimodal/evaluation/evaluate_mathvista.py @@ -73,7 +73,8 @@ def extract_answer(text): def compute_mathvista_accuracy(result_file): """Compute MathVista accuracy.""" - merged_results = json.load(open(result_file)) + with open(result_file, "r") as f: + merged_results = json.load(f) vqa = VQAEval(vqa=None, vqaRes=None) acc = 0 diff --git a/examples/multimodal/evaluation/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py index b43d195494f..a2af310705c 100644 --- a/examples/multimodal/evaluation/evaluate_ocrbench.py +++ b/examples/multimodal/evaluation/evaluate_ocrbench.py @@ -32,7 +32,8 @@ def merge_input_files(input_path): def compute_ocrbench_score(result_file): """Compute OCRBench score.""" - merged_results = json.load(open(result_file)) + with open(result_file, "r") as f: + merged_results = json.load(f) # OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1 # MIT License. Copyright (c) 2023 Yuliang Liu diff --git a/examples/multimodal/evaluation/evaluate_video_mvbench.py b/examples/multimodal/evaluation/evaluate_video_mvbench.py index 0efcdbedb14..d1a683a07b0 100644 --- a/examples/multimodal/evaluation/evaluate_video_mvbench.py +++ b/examples/multimodal/evaluation/evaluate_video_mvbench.py @@ -98,7 +98,8 @@ def combine_all_res(acc_dict): def mvbench_eval(input_path): result_file_path = merge_input_files(input_path) - merged_results = json.load(open(result_file_path)) + with open(result_file_path, "r") as f: + merged_results = json.load(f) acc_dict = create_result_dict(merged_results) return combine_all_res(acc_dict) diff --git a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py index feb4c558120..775a3c5006e 100644 --- a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py +++ b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py @@ -83,7 +83,8 @@ def compute_all_acc(result_list): def phys_game_bench_eval(input_path): result_file_path = merge_input_files(input_path) - merged_results = json.load(open(result_file_path)) + with open(result_file_path, "r") as f: + merged_results = json.load(f) return compute_all_acc(merged_results) diff --git a/examples/multimodal/evaluation/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py index 9789e30379b..ff8d57a3119 100644 --- a/examples/multimodal/evaluation/evaluate_vqav2.py +++ b/examples/multimodal/evaluation/evaluate_vqav2.py @@ -93,7 +93,8 @@ def is_number(n: str): def compute_vqa_accuracy(result_file, task): """Compute VQA accuracy.""" - merged_results = json.load(open(result_file)) + with open(result_file, "r") as f: + merged_results = json.load(f) vqa = VQAEval(vqa=None, vqaRes=None) all_acc = [] diff --git a/examples/multimodal/evaluation/evaluation_datasets.py b/examples/multimodal/evaluation/evaluation_datasets.py index ebf0e2e15ee..443d3635749 100644 --- a/examples/multimodal/evaluation/evaluation_datasets.py +++ b/examples/multimodal/evaluation/evaluation_datasets.py @@ -10,6 +10,7 @@ import torch from image_processing import ImageTransform from PIL import Image +from PIL.ImageFile import ImageFile from megatron.training import print_rank_0 @@ -44,7 +45,8 @@ def __init__( vision_model_type, split="validation" ): - samples = json.load(open(gt_path, encoding='utf-8')) + with open(gt_path, "r", encoding="utf-8") as f: + samples = json.load(f) if "data" in samples: samples = samples["data"] @@ -79,15 +81,21 @@ def __getitem__(self, idx): if not os.path.exists(img_file): img_file = img_file.replace('.jpg', '.png') - img = Image.open(img_file) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_file) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) tile_count = torch.tensor([len(imgs)], dtype=torch.int) @@ -133,7 +141,8 @@ def __init__( ) image_files = image_files[lb:ub] - gts = json.load(open(gt_path)) + with open(gt_path, "r") as f: + gts = json.load(f) answers = defaultdict(list) for gt in gts["annotations"]: answers[gt["image_id"]].append(gt['caption']) @@ -157,15 +166,21 @@ def __getitem__(self, idx): except: image_id = int(img_file.split("/")[-1].split(".")[0]) # flickr - img = Image.open(img_file) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_file) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) tile_count = torch.tensor([len(imgs)], dtype=torch.int) @@ -467,7 +482,8 @@ def __init__( num_frames, vision_model_type, ): - ground_truth_original = json.load(open(gt_path)) + with open(gt_path, "r") as f: + ground_truth_original = json.load(f) ground_truth = [] for gt in ground_truth_original: video_path = gt["url"] @@ -567,7 +583,8 @@ def __init__( use_thumbnail, vision_model_type, ): - gt = json.load(open(gt_path, encoding='utf-8')) + with open(gt_path, "r", encoding="utf-8") as f: + gt = json.load(f) if num_partitions > 0: start_idx, end_idx = _get_partition_bounds( @@ -590,15 +607,21 @@ def __len__(self): def __getitem__(self, idx): img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path']) - img = Image.open(img_path) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_path) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) tile_count = torch.tensor([len(imgs)], dtype=torch.int) @@ -759,15 +782,21 @@ def __len__(self): def __getitem__(self, idx): img_path = os.path.join(self._input_image_path, self._gt[idx]['image'].split("/")[-1]) - img = Image.open(img_path) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_path) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) tile_count = torch.tensor([len(imgs)], dtype=torch.int) @@ -831,15 +860,21 @@ def __len__(self): def __getitem__(self, idx): img_path = os.path.join(self._input_image_path, self._gt[idx]['image']) - img = Image.open(img_path) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_path) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) tile_count = torch.tensor([len(imgs)], dtype=torch.int) @@ -877,8 +912,8 @@ def __init__( use_thumbnail, vision_model_type, ): - gt = json.load(open(gt_path, encoding='utf-8')) - + with open(gt_path, "r", encoding="utf-8") as f: + gt = json.load(f) if num_partitions > 0: start_idx, end_idx = _get_partition_bounds( @@ -901,15 +936,21 @@ def __len__(self): def __getitem__(self, idx): img_path = os.path.join(self._input_image_path, self._gt[idx]['image']) - img = Image.open(img_path) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(img_path) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) question_id = int(self._gt[idx]['image'].replace(".webp", "")) @@ -1072,7 +1113,8 @@ def __init__( split ): - ground_truth_original = json.load(open(gt_path, encoding='utf-8')) + with open(gt_path, "r", encoding="utf-8") as f: + ground_truth_original = json.load(f) ground_truth = [] for gt in ground_truth_original: @@ -1309,28 +1351,37 @@ def __getitem__(self, idx): video_decode_func = self.decord_method[data['data_type']] - video_frames = video_decode_func(video_path, bound) - - imgs = [] - for img in video_frames: - from torchvision.transforms import ToPILImage - - if data['data_type'] == 'video': - to_pil = ToPILImage() - img = to_pil(img) - imgs += self._transform_img( - img, self._img_h, self._img_w, self._use_tiling, self._max_num_tiles, - self._use_thumbnail, augment=False - ) + try: + video_frames = video_decode_func(video_path, bound) + + imgs = [] + for img in video_frames: + from torchvision.transforms import ToPILImage + + if data['data_type'] == 'video': + to_pil = ToPILImage() + img = to_pil(img) + imgs += self._transform_img( + img, self._img_h, self._img_w, self._use_tiling, self._max_num_tiles, + self._use_thumbnail, augment=False + ) - num_tiles = torch.tensor([len(imgs)], dtype=torch.int) + num_tiles = torch.tensor([len(imgs)], dtype=torch.int) - q_id = data['question_id'] - metadata = {'task_type': data['task_type']} - question, answer = self.qa_template(data['data']) + q_id = data['question_id'] + metadata = {'task_type': data['task_type']} + question, answer = self.qa_template(data['data']) + tensor_imgs = torch.stack(imgs) + finally: + try: + for frame in video_frames: + if isinstance(frame, ImageFile): + frame.close() + except NameError: + pass return ( - torch.stack(imgs), + tensor_imgs, num_tiles, q_id, question, @@ -1378,15 +1429,21 @@ def __getitem__(self, idx): sample_imgs = [] sample_tile_count = [] for image_path in sample.get("image_paths", []): - img = Image.open(image_path) - imgs = self._transform_img( - img, - self._img_h, - self._img_w, - self._use_tiling, - self._max_num_tiles, - self._use_thumbnail, - augment=False, + with Image.open(image_path) as img: + imgs = self._transform_img( + img, + self._img_h, + self._img_w, + self._use_tiling, + self._max_num_tiles, + self._use_thumbnail, + augment=False, + ) + # If the returned elements are not tensors, we may still have to keep the `ImageFile` + # returned by `Image.open` open. + assert not imgs or isinstance(imgs[0], torch.Tensor), ( + "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does " + "not need to still be open or adjust its lifetime" ) sample_imgs.extend(imgs) diff --git a/examples/multimodal/evaluation/mmmu_utils.py b/examples/multimodal/evaluation/mmmu_utils.py index 61a876b067b..27fbb5e17bd 100644 --- a/examples/multimodal/evaluation/mmmu_utils.py +++ b/examples/multimodal/evaluation/mmmu_utils.py @@ -403,7 +403,8 @@ def calculate_ins_level_acc(results: Dict): def mmmu_main_eval(output_dict, task_cfg): - answer_dict = json.load(open(task_cfg["answer_dict"])) + with open(task_cfg["answer_dict"], "r") as f: + answer_dict = json.load(f) # group by category output_dict_w_cat = {} @@ -485,7 +486,8 @@ def mmmu_main_eval(output_dict, task_cfg): if __name__ == '__main__': - tasks = yaml.safe_load(open("eval_config/eval_mmmu_yi.yaml"))['datasets'] + with open("eval_config/eval_mmmu_yi.yaml", "r") as f: + tasks = yaml.safe_load(f)['datasets'] print(tasks) with open("eval_results.json") as f: @@ -532,4 +534,4 @@ def mmmu_main_eval(output_dict, task_cfg): x = mmmu_main_eval(eval_output_dict, task_cfg=tasks['mmmu']) - print(x) \ No newline at end of file + print(x) From bf910803f4bd379fc246d24e632a00ce25135d09 Mon Sep 17 00:00:00 2001 From: janEbert Date: Mon, 23 Feb 2026 23:10:50 +0100 Subject: [PATCH 2/2] Fix copyright headers --- examples/multimodal/evaluation/evaluate_mathvista.py | 2 ++ examples/multimodal/evaluation/evaluate_ocrbench.py | 2 ++ examples/multimodal/evaluation/evaluate_video_mvbench.py | 2 ++ .../multimodal/evaluation/evaluate_video_phys_game_bench.py | 2 ++ examples/multimodal/evaluation/evaluate_vqav2.py | 2 ++ examples/multimodal/evaluation/mmmu_utils.py | 2 ++ 6 files changed, 12 insertions(+) diff --git a/examples/multimodal/evaluation/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py index 48d1a0c37a9..2461da0cecf 100644 --- a/examples/multimodal/evaluation/evaluate_mathvista.py +++ b/examples/multimodal/evaluation/evaluate_mathvista.py @@ -1,3 +1,5 @@ +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import json import re diff --git a/examples/multimodal/evaluation/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py index a2af310705c..7fe5ce63698 100644 --- a/examples/multimodal/evaluation/evaluate_ocrbench.py +++ b/examples/multimodal/evaluation/evaluate_ocrbench.py @@ -1,3 +1,5 @@ +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import json diff --git a/examples/multimodal/evaluation/evaluate_video_mvbench.py b/examples/multimodal/evaluation/evaluate_video_mvbench.py index d1a683a07b0..7e7ec0270d3 100644 --- a/examples/multimodal/evaluation/evaluate_video_mvbench.py +++ b/examples/multimodal/evaluation/evaluate_video_mvbench.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import json diff --git a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py index 775a3c5006e..a2fe02dfaf5 100644 --- a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py +++ b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import json diff --git a/examples/multimodal/evaluation/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py index ff8d57a3119..8670d6ee2ad 100644 --- a/examples/multimodal/evaluation/evaluate_vqav2.py +++ b/examples/multimodal/evaluation/evaluate_vqav2.py @@ -1,3 +1,5 @@ +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import argparse import json from typing import List diff --git a/examples/multimodal/evaluation/mmmu_utils.py b/examples/multimodal/evaluation/mmmu_utils.py index 27fbb5e17bd..d5e5b31ed21 100644 --- a/examples/multimodal/evaluation/mmmu_utils.py +++ b/examples/multimodal/evaluation/mmmu_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # The following code is adapted from # https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/utils/data_utils.py, # which is licensed under the Apache License 2.0. More details on the license can be