From 41e130c5eeefc658305fe8f5f8fabfac3e241a61 Mon Sep 17 00:00:00 2001
From: janEbert <janpabloe@nvidia.com>
Date: Mon, 23 Feb 2026 15:42:37 +0100
Subject: [PATCH 1/2] Close file objects manually

Use context managers (and explicit `close()` calls where a `with` block does
not fit) so that file and image objects are closed as early as possible and
do not leak resources.
---
 .../evaluation/evaluate_mathvista.py          |   3 +-
 .../evaluation/evaluate_ocrbench.py           |   3 +-
 .../evaluation/evaluate_video_mvbench.py      |   3 +-
 .../evaluate_video_phys_game_bench.py         |   3 +-
 .../multimodal/evaluation/evaluate_vqav2.py   |   3 +-
 .../evaluation/evaluation_datasets.py         | 233 +++++++++++-------
 examples/multimodal/evaluation/mmmu_utils.py  |   8 +-
 7 files changed, 160 insertions(+), 96 deletions(-)

diff --git a/examples/multimodal/evaluation/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py
index cb6b2ebd236..48d1a0c37a9 100644
--- a/examples/multimodal/evaluation/evaluate_mathvista.py
+++ b/examples/multimodal/evaluation/evaluate_mathvista.py
@@ -73,7 +73,8 @@ def extract_answer(text):
 
 def compute_mathvista_accuracy(result_file):
     """Compute MathVista accuracy."""
-    merged_results = json.load(open(result_file))
+    with open(result_file, "r") as f:
+        merged_results = json.load(f)
 
     vqa = VQAEval(vqa=None, vqaRes=None)
     acc = 0
diff --git a/examples/multimodal/evaluation/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py
index b43d195494f..a2af310705c 100644
--- a/examples/multimodal/evaluation/evaluate_ocrbench.py
+++ b/examples/multimodal/evaluation/evaluate_ocrbench.py
@@ -32,7 +32,8 @@ def merge_input_files(input_path):
 
 def compute_ocrbench_score(result_file):
     """Compute OCRBench score."""
-    merged_results = json.load(open(result_file))
+    with open(result_file, "r") as f:
+        merged_results = json.load(f)
 
     # OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1
     # MIT License. Copyright (c) 2023 Yuliang Liu
diff --git a/examples/multimodal/evaluation/evaluate_video_mvbench.py b/examples/multimodal/evaluation/evaluate_video_mvbench.py
index 0efcdbedb14..d1a683a07b0 100644
--- a/examples/multimodal/evaluation/evaluate_video_mvbench.py
+++ b/examples/multimodal/evaluation/evaluate_video_mvbench.py
@@ -98,7 +98,8 @@ def combine_all_res(acc_dict):
 def mvbench_eval(input_path):
     result_file_path = merge_input_files(input_path)
     
-    merged_results = json.load(open(result_file_path))
+    with open(result_file_path, "r") as f:
+        merged_results = json.load(f)
     acc_dict = create_result_dict(merged_results)
     
     return combine_all_res(acc_dict)
diff --git a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
index feb4c558120..775a3c5006e 100644
--- a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
+++ b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
@@ -83,7 +83,8 @@ def compute_all_acc(result_list):
 def phys_game_bench_eval(input_path):
     result_file_path = merge_input_files(input_path)
     
-    merged_results = json.load(open(result_file_path))
+    with open(result_file_path, "r") as f:
+        merged_results = json.load(f)
     
     return compute_all_acc(merged_results)
 
diff --git a/examples/multimodal/evaluation/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py
index 9789e30379b..ff8d57a3119 100644
--- a/examples/multimodal/evaluation/evaluate_vqav2.py
+++ b/examples/multimodal/evaluation/evaluate_vqav2.py
@@ -93,7 +93,8 @@ def is_number(n: str):
 
 def compute_vqa_accuracy(result_file, task):
     """Compute VQA accuracy."""
-    merged_results = json.load(open(result_file))
+    with open(result_file, "r") as f:
+        merged_results = json.load(f)
 
     vqa = VQAEval(vqa=None, vqaRes=None)
     all_acc = []
diff --git a/examples/multimodal/evaluation/evaluation_datasets.py b/examples/multimodal/evaluation/evaluation_datasets.py
index ebf0e2e15ee..443d3635749 100644
--- a/examples/multimodal/evaluation/evaluation_datasets.py
+++ b/examples/multimodal/evaluation/evaluation_datasets.py
@@ -10,6 +10,7 @@
 import torch
 from image_processing import ImageTransform
 from PIL import Image
+from PIL.ImageFile import ImageFile
 
 from megatron.training import print_rank_0
 
@@ -44,7 +45,8 @@ def __init__(
         vision_model_type,
         split="validation"
     ):
-        samples = json.load(open(gt_path, encoding='utf-8'))
+        with open(gt_path, "r", encoding="utf-8") as f:
+            samples = json.load(f)
         if "data" in samples:
             samples = samples["data"]
 
@@ -79,15 +81,21 @@ def __getitem__(self, idx):
             if not os.path.exists(img_file):
                 img_file = img_file.replace('.jpg', '.png')
 
-        img = Image.open(img_file)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_file) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
         tile_count = torch.tensor([len(imgs)], dtype=torch.int)
 
@@ -133,7 +141,8 @@ def __init__(
             )
             image_files = image_files[lb:ub]
 
-        gts = json.load(open(gt_path))
+        with open(gt_path, "r") as f:
+            gts = json.load(f)
         answers = defaultdict(list)
         for gt in gts["annotations"]:
             answers[gt["image_id"]].append(gt['caption'])
@@ -157,15 +166,21 @@ def __getitem__(self, idx):
         except:
             image_id = int(img_file.split("/")[-1].split(".")[0])  # flickr
 
-        img = Image.open(img_file)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_file) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
 
         tile_count = torch.tensor([len(imgs)], dtype=torch.int)
@@ -467,7 +482,8 @@ def __init__(
         num_frames,
         vision_model_type,
     ):
-        ground_truth_original = json.load(open(gt_path))
+        with open(gt_path, "r") as f:
+            ground_truth_original = json.load(f)
         ground_truth = []
         for gt in ground_truth_original:
             video_path = gt["url"]
@@ -567,7 +583,8 @@ def __init__(
         use_thumbnail,
         vision_model_type,
     ):
-        gt = json.load(open(gt_path, encoding='utf-8'))
+        with open(gt_path, "r", encoding="utf-8") as f:
+            gt = json.load(f)
 
         if num_partitions > 0:
             start_idx, end_idx = _get_partition_bounds(
@@ -590,15 +607,21 @@ def __len__(self):
     def __getitem__(self, idx):
         img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
 
-        img = Image.open(img_path)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_path) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
 
         tile_count = torch.tensor([len(imgs)], dtype=torch.int)
@@ -759,15 +782,21 @@ def __len__(self):
     def __getitem__(self, idx):
         img_path = os.path.join(self._input_image_path, self._gt[idx]['image'].split("/")[-1])
 
-        img = Image.open(img_path)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_path) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
 
         tile_count = torch.tensor([len(imgs)], dtype=torch.int)
@@ -831,15 +860,21 @@ def __len__(self):
     def __getitem__(self, idx):
         img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
 
-        img = Image.open(img_path)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_path) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
 
         tile_count = torch.tensor([len(imgs)], dtype=torch.int)
@@ -877,8 +912,8 @@ def __init__(
         use_thumbnail,
         vision_model_type,
     ):
-        gt = json.load(open(gt_path, encoding='utf-8'))
-
+        with open(gt_path, "r", encoding="utf-8") as f:
+            gt = json.load(f)
 
         if num_partitions > 0:
             start_idx, end_idx = _get_partition_bounds(
@@ -901,15 +936,21 @@ def __len__(self):
 
     def __getitem__(self, idx):
         img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
-        img = Image.open(img_path)
-        imgs = self._transform_img(
-            img,
-            self._img_h,
-            self._img_w,
-            self._use_tiling,
-            self._max_num_tiles,
-            self._use_thumbnail,
-            augment=False,
+        with Image.open(img_path) as img:
+            imgs = self._transform_img(
+                img,
+                self._img_h,
+                self._img_w,
+                self._use_tiling,
+                self._max_num_tiles,
+                self._use_thumbnail,
+                augment=False,
+            )
+        # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+        # returned by `Image.open` open.
+        assert not imgs or isinstance(imgs[0], torch.Tensor), (
+            "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+            "not need to still be open or adjust its lifetime"
         )
 
         question_id = int(self._gt[idx]['image'].replace(".webp", ""))
@@ -1072,7 +1113,8 @@ def __init__(
         split
     ):
 
-        ground_truth_original = json.load(open(gt_path, encoding='utf-8'))
+        with open(gt_path, "r", encoding="utf-8") as f:
+            ground_truth_original = json.load(f)
 
         ground_truth = []
         for gt in ground_truth_original:
@@ -1309,28 +1351,37 @@ def __getitem__(self, idx):
 
         video_decode_func = self.decord_method[data['data_type']]
 
-        video_frames = video_decode_func(video_path, bound)
-
-        imgs = []
-        for img in video_frames:
-            from torchvision.transforms import ToPILImage
-
-            if data['data_type'] == 'video':
-                to_pil = ToPILImage()
-                img = to_pil(img)
-            imgs += self._transform_img(
-                img, self._img_h, self._img_w, self._use_tiling, self._max_num_tiles,
-                self._use_thumbnail, augment=False
-            )
+        try:
+            video_frames = video_decode_func(video_path, bound)
+
+            imgs = []
+            for img in video_frames:
+                from torchvision.transforms import ToPILImage
+
+                if data['data_type'] == 'video':
+                    to_pil = ToPILImage()
+                    img = to_pil(img)
+                imgs += self._transform_img(
+                    img, self._img_h, self._img_w, self._use_tiling, self._max_num_tiles,
+                    self._use_thumbnail, augment=False
+                )
 
-        num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
+            num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
 
-        q_id = data['question_id']
-        metadata = {'task_type': data['task_type']}
-        question, answer = self.qa_template(data['data'])
+            q_id = data['question_id']
+            metadata = {'task_type': data['task_type']}
+            question, answer = self.qa_template(data['data'])
 
+            tensor_imgs = torch.stack(imgs)
+        finally:
+            try:
+                for frame in video_frames:
+                    if isinstance(frame, ImageFile):
+                        frame.close()
+            except NameError:
+                pass
         return (
-            torch.stack(imgs),
+            tensor_imgs,
             num_tiles,
             q_id,
             question,
@@ -1378,15 +1429,21 @@ def __getitem__(self, idx):
         sample_imgs = []
         sample_tile_count = []
         for image_path in sample.get("image_paths", []):
-            img = Image.open(image_path)
-            imgs = self._transform_img(
-                img,
-                self._img_h,
-                self._img_w,
-                self._use_tiling,
-                self._max_num_tiles,
-                self._use_thumbnail,
-                augment=False,
+            with Image.open(image_path) as img:
+                imgs = self._transform_img(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    self._max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                )
+            # If the returned elements are not tensors, we may still have to keep the `ImageFile`
+            # returned by `Image.open` open.
+            assert not imgs or isinstance(imgs[0], torch.Tensor), (
+                "returned type is not expected list[torch.Tensor]; please ensure the `ImageFile` does "
+                "not need to still be open or adjust its lifetime"
             )
 
             sample_imgs.extend(imgs)
diff --git a/examples/multimodal/evaluation/mmmu_utils.py b/examples/multimodal/evaluation/mmmu_utils.py
index 61a876b067b..27fbb5e17bd 100644
--- a/examples/multimodal/evaluation/mmmu_utils.py
+++ b/examples/multimodal/evaluation/mmmu_utils.py
@@ -403,7 +403,8 @@ def calculate_ins_level_acc(results: Dict):
 
 
 def mmmu_main_eval(output_dict, task_cfg):
-    answer_dict = json.load(open(task_cfg["answer_dict"]))
+    with open(task_cfg["answer_dict"], "r") as f:
+        answer_dict = json.load(f)
 
     # group by category
     output_dict_w_cat = {}
@@ -485,7 +486,8 @@ def mmmu_main_eval(output_dict, task_cfg):
 
 
 if __name__ == '__main__':
-    tasks = yaml.safe_load(open("eval_config/eval_mmmu_yi.yaml"))['datasets']
+    with open("eval_config/eval_mmmu_yi.yaml", "r") as f:
+        tasks = yaml.safe_load(f)['datasets']
     print(tasks)
 
     with open("eval_results.json") as f:
@@ -532,4 +534,4 @@ def mmmu_main_eval(output_dict, task_cfg):
     x = mmmu_main_eval(eval_output_dict,
                    task_cfg=tasks['mmmu'])
 
-    print(x)
\ No newline at end of file
+    print(x)

From bf910803f4bd379fc246d24e632a00ce25135d09 Mon Sep 17 00:00:00 2001
From: janEbert <janpabloe@nvidia.com>
Date: Mon, 23 Feb 2026 23:10:50 +0100
Subject: [PATCH 2/2] Fix copyright headers

---
 examples/multimodal/evaluation/evaluate_mathvista.py            | 2 ++
 examples/multimodal/evaluation/evaluate_ocrbench.py             | 2 ++
 examples/multimodal/evaluation/evaluate_video_mvbench.py        | 2 ++
 .../multimodal/evaluation/evaluate_video_phys_game_bench.py     | 2 ++
 examples/multimodal/evaluation/evaluate_vqav2.py                | 2 ++
 examples/multimodal/evaluation/mmmu_utils.py                    | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/examples/multimodal/evaluation/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py
index 48d1a0c37a9..2461da0cecf 100644
--- a/examples/multimodal/evaluation/evaluate_mathvista.py
+++ b/examples/multimodal/evaluation/evaluate_mathvista.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import argparse
 import json
 import re
diff --git a/examples/multimodal/evaluation/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py
index a2af310705c..7fe5ce63698 100644
--- a/examples/multimodal/evaluation/evaluate_ocrbench.py
+++ b/examples/multimodal/evaluation/evaluate_ocrbench.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import argparse
 import json
 
diff --git a/examples/multimodal/evaluation/evaluate_video_mvbench.py b/examples/multimodal/evaluation/evaluate_video_mvbench.py
index d1a683a07b0..7e7ec0270d3 100644
--- a/examples/multimodal/evaluation/evaluate_video_mvbench.py
+++ b/examples/multimodal/evaluation/evaluate_video_mvbench.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import argparse
 import json
 
diff --git a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
index 775a3c5006e..a2fe02dfaf5 100644
--- a/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
+++ b/examples/multimodal/evaluation/evaluate_video_phys_game_bench.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import argparse
 import json
 
diff --git a/examples/multimodal/evaluation/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py
index ff8d57a3119..8670d6ee2ad 100644
--- a/examples/multimodal/evaluation/evaluate_vqav2.py
+++ b/examples/multimodal/evaluation/evaluate_vqav2.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import argparse
 import json
 from typing import List
diff --git a/examples/multimodal/evaluation/mmmu_utils.py b/examples/multimodal/evaluation/mmmu_utils.py
index 27fbb5e17bd..d5e5b31ed21 100644
--- a/examples/multimodal/evaluation/mmmu_utils.py
+++ b/examples/multimodal/evaluation/mmmu_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 # The following code is adapted from
 # https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/utils/data_utils.py,
 # which is licensed under the Apache License 2.0. More details on the license can be