From fffa1b8a94dcf6eaf8c566b947b62abcb8ccd912 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 17:43:50 +0000 Subject: [PATCH 01/18] Add vision genai inference path for multi-file VLM evaluation Adds _inference_vision_genai method to OnnxEvaluator that uses onnxruntime-genai for vision-language models (e.g., Qwen3-VL) with multi-file ONNX architectures (vision.onnx, text.onnx, embedding.onnx). The method is auto-detected when genai_config.json exists and contains a 'vision' field in the model config. This mirrors the existing auto-detection pattern used for speech models (whisper, nemotron_speech). For single-file ONNX VQA models, the existing _inference_vision path (classification-style single forward pass) is still used. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 140 ++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 3 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index bfa684acb..4c7169529 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -593,9 +593,26 @@ def _evaluate_onnx_accuracy( ) -> MetricResult: if _is_vision_metric(metric): _validate_vision_task_metric(metric) - inference_output, targets = self._inference_vision( - model, metric, dataloader, post_func, device, execution_providers - ) + # Auto-detect genai vision model by checking for genai_config.json with vision field + genai_config_path = Path(model.model_path).parent / "genai_config.json" + if genai_config_path.exists(): + import json + + with genai_config_path.open() as f: + genai_config = json.load(f) + model_config = genai_config.get("model", {}) + if model_config.get("vision"): + inference_output, targets = self._inference_vision_genai( + model, metric, dataloader, device, execution_providers + ) + else: + inference_output, targets = self._inference_vision( + model, metric, dataloader, post_func, device, 
execution_providers + ) + else: + inference_output, targets = self._inference_vision( + model, metric, dataloader, post_func, device, execution_providers + ) elif _is_text_based_metric(metric): # Auto-detect genai model by checking for genai_config.json genai_config_path = Path(model.model_path).parent / "genai_config.json" @@ -795,6 +812,123 @@ def _inference_vision( return OliveModelOutput(preds=all_preds, logits=None), all_targets + def _inference_vision_genai( + self, + model: ONNXModelHandler, + metric: Metric, + dataloader: "DataLoader", + device: Device = Device.CPU, + execution_providers: Optional[Union[str, list[str]]] = None, + ) -> tuple[OliveModelOutput, Any]: + """Vision-based inference for VQA/OCR metrics using onnxruntime-genai. + + Auto-detected when the model directory contains genai_config.json with a vision field. + Uses og.Model with multimodal processor for vision-language models (e.g., Qwen3-VL). + The dataloader must yield (input_dict, labels) where input_dict contains + 'image' (PIL Image) and 'question' (str), and labels are reference answer strings. + """ + try: + import onnxruntime_genai as og + except ImportError: + raise ImportError( + "onnxruntime-genai is required for genai-based vision evaluation. 
" + "Install it with: pip install onnxruntime-genai" + ) from None + + import json + import os + import tempfile + + from PIL import Image + + model_dir = str(Path(model.model_path).parent) + + # Read genai_config for search options + with (Path(model_dir) / "genai_config.json").open() as f: + genai_config = json.load(f) + + max_length = genai_config.get("search", {}).get("max_length", 2048) + + # Build og.Model with appropriate execution provider + config = og.Config(model_dir) + config.clear_providers() + if device == Device.GPU: + config.append_provider("cuda") + og_model = og.Model(config) + processor = og_model.create_multimodal_processor() + tokenizer = og.Tokenizer(og_model) + + all_preds = [] + all_targets = [] + + for batch in dataloader: + input_data, labels = OliveEvaluator.unpack_batch_for_accuracy(batch) + + # input_data is a dict with 'image' (PIL) and 'question' (str) + # or a list of such dicts for batch_size > 1 + items = [input_data] if isinstance(input_data, dict) else input_data + + for item in items: + pil_image = item.get("image") + question = item.get("question", "") + + if pil_image is None: + all_preds.append("") + continue + + # Ensure PIL Image + if not isinstance(pil_image, Image.Image): + pil_image = Image.open(pil_image).convert("RGB") + + # Build chat messages for the vision-language model + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": question}, + ], + } + ] + messages_json = json.dumps(messages) + + # Save image to temp file for og.Images + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + pil_image.save(f, format="PNG") + tmp_path = f.name + + try: + images = og.Images.open(tmp_path) + prompt = tokenizer.apply_chat_template(messages_json, add_generation_prompt=True) + inputs = processor(prompt, images=images) + + params = og.GeneratorParams(og_model) + params.set_search_options(max_length=max_length, do_sample=False) + + generator = og.Generator(og_model, 
params) + generator.set_inputs(inputs) + + tokens = [] + while not generator.is_done(): + generator.generate_next_token() + tokens.append(generator.get_next_tokens()[0]) + del generator + + pred = tokenizer.decode(tokens).strip() + all_preds.append(pred) + finally: + os.unlink(tmp_path) + + # Collect reference texts + if isinstance(labels, (list, tuple)): + all_targets.extend(labels) + else: + all_targets.append(labels) + + del og_model + + return OliveModelOutput(preds=all_preds, logits=None), all_targets + def _inference_text_genai( self, model: ONNXModelHandler, From 120d0e742c219711c2268750cdf764cdf21536e4 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 17:50:09 +0000 Subject: [PATCH 02/18] Fix task_type_components_map to apply all component overrides Previously, when a component (e.g., pre_process_data) specified a task type, only that same component's override was applied from the task map. This meant the vision-vqa dataloader override (vision_vqa_dataloader with custom collate_fn for PIL images) was never applied since it was a different component than the one specifying the task. Now, when any component specifies a task type, ALL component overrides from the task_type_components_map are applied. This ensures the custom dataloader with PIL-safe collation is used for vision tasks. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/data/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/olive/data/config.py b/olive/data/config.py index 89ed9ee17..f74339063 100644 --- a/olive/data/config.py +++ b/olive/data/config.py @@ -217,8 +217,9 @@ def _update_default_component_type_with_task_type(self, dc_cls, default_componen if config and config.params: task_type = config.params.get("task") if task_type: - task_specific_override = dc_cls.task_type_components_map.get( + task_overrides = dc_cls.task_type_components_map.get( task_type.replace("-with-past", ""), {} - ).get(component_name) - if task_specific_override: - default_components_type[component_name] = task_specific_override + ) + # Apply all component overrides for this task type + for override_component, override_type in task_overrides.items(): + default_components_type[override_component] = override_type From 5bf9c893baa684c43585b8c883113b844be5773d Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 17:52:49 +0000 Subject: [PATCH 03/18] Address review comments on vision genai inference - Simplify dispatch logic: use single boolean flag instead of duplicated fallback branches - Honor execution_providers parameter: map user-specified EPs to og.Config providers instead of only checking device - Use TemporaryDirectory instead of per-file NamedTemporaryFile to avoid I/O overhead and file leak risk - Add comment clarifying pred/target alignment when image is None Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 103 +++++++++++++++-------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 4c7169529..1dfd1eca9 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -594,21 +594,19 @@ def _evaluate_onnx_accuracy( if 
_is_vision_metric(metric): _validate_vision_task_metric(metric) # Auto-detect genai vision model by checking for genai_config.json with vision field + use_genai_vision = False genai_config_path = Path(model.model_path).parent / "genai_config.json" if genai_config_path.exists(): import json with genai_config_path.open() as f: genai_config = json.load(f) - model_config = genai_config.get("model", {}) - if model_config.get("vision"): - inference_output, targets = self._inference_vision_genai( - model, metric, dataloader, device, execution_providers - ) - else: - inference_output, targets = self._inference_vision( - model, metric, dataloader, post_func, device, execution_providers - ) + use_genai_vision = bool(genai_config.get("model", {}).get("vision")) + + if use_genai_vision: + inference_output, targets = self._inference_vision_genai( + model, metric, dataloader, device, execution_providers + ) else: inference_output, targets = self._inference_vision( model, metric, dataloader, post_func, device, execution_providers @@ -835,8 +833,8 @@ def _inference_vision_genai( "Install it with: pip install onnxruntime-genai" ) from None + import io import json - import os import tempfile from PIL import Image @@ -852,7 +850,12 @@ def _inference_vision_genai( # Build og.Model with appropriate execution provider config = og.Config(model_dir) config.clear_providers() - if device == Device.GPU: + if execution_providers: + # Honor user-specified execution providers + for ep in (execution_providers if isinstance(execution_providers, list) else [execution_providers]): + ep_lower = ep.lower().replace("executionprovider", "") + config.append_provider(ep_lower) + elif device == Device.GPU: config.append_provider("cuda") og_model = og.Model(config) processor = og_model.create_multimodal_processor() @@ -861,44 +864,46 @@ def _inference_vision_genai( all_preds = [] all_targets = [] - for batch in dataloader: - input_data, labels = OliveEvaluator.unpack_batch_for_accuracy(batch) + # Use a 
temporary directory for image files to avoid per-file create/delete overhead + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_img_path = Path(tmp_dir) / "input.png" - # input_data is a dict with 'image' (PIL) and 'question' (str) - # or a list of such dicts for batch_size > 1 - items = [input_data] if isinstance(input_data, dict) else input_data + for batch in dataloader: + input_data, labels = OliveEvaluator.unpack_batch_for_accuracy(batch) - for item in items: - pil_image = item.get("image") - question = item.get("question", "") + # input_data is a dict with 'image' (PIL) and 'question' (str) + # or a list of such dicts for batch_size > 1 + items = [input_data] if isinstance(input_data, dict) else input_data - if pil_image is None: - all_preds.append("") - continue + for idx, item in enumerate(items): + pil_image = item.get("image") + question = item.get("question", "") - # Ensure PIL Image - if not isinstance(pil_image, Image.Image): - pil_image = Image.open(pil_image).convert("RGB") - - # Build chat messages for the vision-language model - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": question}, - ], - } - ] - messages_json = json.dumps(messages) + if pil_image is None: + # Append empty pred to maintain alignment with targets + all_preds.append("") + continue - # Save image to temp file for og.Images - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - pil_image.save(f, format="PNG") - tmp_path = f.name + # Ensure PIL Image + if not isinstance(pil_image, Image.Image): + pil_image = Image.open(pil_image).convert("RGB") + + # Build chat messages for the vision-language model + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": question}, + ], + } + ] + messages_json = json.dumps(messages) + + # Save image to temp file for og.Images (reuse same path to minimize I/O) + pil_image.save(str(tmp_img_path), format="PNG") + images = 
og.Images.open(str(tmp_img_path)) - try: - images = og.Images.open(tmp_path) prompt = tokenizer.apply_chat_template(messages_json, add_generation_prompt=True) inputs = processor(prompt, images=images) @@ -916,14 +921,12 @@ def _inference_vision_genai( pred = tokenizer.decode(tokens).strip() all_preds.append(pred) - finally: - os.unlink(tmp_path) - # Collect reference texts - if isinstance(labels, (list, tuple)): - all_targets.extend(labels) - else: - all_targets.append(labels) + # Collect reference texts (aligned with preds including empty ones for None images) + if isinstance(labels, (list, tuple)): + all_targets.extend(labels) + else: + all_targets.append(labels) del og_model From bf8ca82d21a7788ed37ac810d0116e84cb7003ab Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 17:55:06 +0000 Subject: [PATCH 04/18] Fix EP mapping: skip CPUExecutionProvider for genai onnxruntime-genai uses CPU by default when no provider is appended. CPUExecutionProvider is not a recognized genai provider name, so skip it rather than trying to map it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 1dfd1eca9..c1078d58d 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -848,13 +848,15 @@ def _inference_vision_genai( max_length = genai_config.get("search", {}).get("max_length", 2048) # Build og.Model with appropriate execution provider + # Note: onnxruntime-genai uses CPU by default when no provider is appended. + # Only non-CPU providers need to be explicitly added. 
config = og.Config(model_dir) config.clear_providers() if execution_providers: - # Honor user-specified execution providers for ep in (execution_providers if isinstance(execution_providers, list) else [execution_providers]): - ep_lower = ep.lower().replace("executionprovider", "") - config.append_provider(ep_lower) + if ep == "CPUExecutionProvider": + continue + config.append_provider(ep) elif device == Device.GPU: config.append_provider("cuda") og_model = og.Model(config) From 398d65546f87b80754f885d0d5db1af2ee32fe42 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 18:07:25 +0000 Subject: [PATCH 05/18] Fix lint: remove unused import, unused loop var, use .values() Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/data/config.py | 6 ++---- olive/evaluator/olive_evaluator.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/olive/data/config.py b/olive/data/config.py index f74339063..866438b37 100644 --- a/olive/data/config.py +++ b/olive/data/config.py @@ -213,13 +213,11 @@ def to_data_container(self) -> "DataContainer": return dc_cls(config=self) def _update_default_component_type_with_task_type(self, dc_cls, default_components_type): - for component_name, config in self.components.items(): + for config in self.components.values(): if config and config.params: task_type = config.params.get("task") if task_type: - task_overrides = dc_cls.task_type_components_map.get( - task_type.replace("-with-past", ""), {} - ) + task_overrides = dc_cls.task_type_components_map.get(task_type.replace("-with-past", ""), {}) # Apply all component overrides for this task type for override_component, override_type in task_overrides.items(): default_components_type[override_component] = override_type diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index c1078d58d..d9100abe9 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -833,7 +833,6 @@ def 
_inference_vision_genai( "Install it with: pip install onnxruntime-genai" ) from None - import io import json import tempfile @@ -853,7 +852,7 @@ def _inference_vision_genai( config = og.Config(model_dir) config.clear_providers() if execution_providers: - for ep in (execution_providers if isinstance(execution_providers, list) else [execution_providers]): + for ep in execution_providers if isinstance(execution_providers, list) else [execution_providers]: if ep == "CPUExecutionProvider": continue config.append_provider(ep) @@ -877,7 +876,7 @@ def _inference_vision_genai( # or a list of such dicts for batch_size > 1 items = [input_data] if isinstance(input_data, dict) else input_data - for idx, item in enumerate(items): + for item in items: pil_image = item.get("image") question = item.get("question", "") From ab380c3c16f394956feb6f3dd77acadc1b5b4f70 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 18:27:49 +0000 Subject: [PATCH 06/18] Fix genai provider: use device field instead of ORT EP names onnxruntime-genai uses short provider names (e.g., 'cuda') not ORT-style names ('CUDAExecutionProvider'). Match the pattern used by the existing speech genai methods: only check device field for provider selection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index d9100abe9..9e562b764 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -848,15 +848,11 @@ def _inference_vision_genai( # Build og.Model with appropriate execution provider # Note: onnxruntime-genai uses CPU by default when no provider is appended. - # Only non-CPU providers need to be explicitly added. + # Only non-CPU providers need to be explicitly added using short names (e.g., "cuda"). 
+ # This follows the same pattern as _inference_text_genai and _inference_text_genai_streaming. config = og.Config(model_dir) config.clear_providers() - if execution_providers: - for ep in execution_providers if isinstance(execution_providers, list) else [execution_providers]: - if ep == "CPUExecutionProvider": - continue - config.append_provider(ep) - elif device == Device.GPU: + if device == Device.GPU: config.append_provider("cuda") og_model = og.Model(config) processor = og_model.create_multimodal_processor() From e25cd2e137bf810ae19c702444bb126252afedc4 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 19:36:32 +0000 Subject: [PATCH 07/18] Cap max_length to 128 for vision VQA generation The genai_config.json may specify max_length equal to the full context window (e.g., 262144) which causes near-infinite generation for VQA tasks where answers are typically 1-10 tokens. Cap at 128 tokens. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 9e562b764..15910325d 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -844,7 +844,10 @@ def _inference_vision_genai( with (Path(model_dir) / "genai_config.json").open() as f: genai_config = json.load(f) - max_length = genai_config.get("search", {}).get("max_length", 2048) + # Cap max_length for VQA tasks — answers are typically short (1-50 tokens). + # The genai_config may have a very large max_length (e.g., 262144 for context window) + # which is not appropriate for answer generation. + max_length = min(genai_config.get("search", {}).get("max_length", 2048), 128) # Build og.Model with appropriate execution provider # Note: onnxruntime-genai uses CPU by default when no provider is appended. 
From b7f46c92980667212a2340c8c6dcde965b6c0e77 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 19:38:53 +0000 Subject: [PATCH 08/18] Increase max_length cap to 1028 for vision genai inference max_length in genai is total sequence length (input + output). Vision inputs include image tokens which can be 200+ tokens, so 128 was too small. Use 1028 which accommodates input tokens plus short VQA answers while still preventing runaway generation from 262K context windows. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 15910325d..19b621a36 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -844,10 +844,11 @@ def _inference_vision_genai( with (Path(model_dir) / "genai_config.json").open() as f: genai_config = json.load(f) - # Cap max_length for VQA tasks — answers are typically short (1-50 tokens). - # The genai_config may have a very large max_length (e.g., 262144 for context window) - # which is not appropriate for answer generation. - max_length = min(genai_config.get("search", {}).get("max_length", 2048), 128) + # max_length in genai is total sequence length (input + output). + # Default to 1028 which accommodates image/prompt tokens (~200-500) plus answer tokens. + # Note: genai_config.json's search.max_length is typically the full context window + # (e.g., 262144) which is too large — the model will stop at EOS well before this cap. + max_length = 1028 # Build og.Model with appropriate execution provider # Note: onnxruntime-genai uses CPU by default when no provider is appended. 
From df8756bd3fc1fe61a2e9dd315b35d9efdf1bb500 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 20:02:33 +0000 Subject: [PATCH 09/18] Address all review comments and fix lint errors - Remove unused params (metric, execution_providers) from _inference_vision_genai signature - Remove unused genai_config variable (was loaded but not used) - Document that device drives GPU/CPU selection in genai - Rename local var to genai_cfg to avoid shadowing - Run ruff format Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/olive_evaluator.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 19b621a36..199114b18 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -594,19 +594,17 @@ def _evaluate_onnx_accuracy( if _is_vision_metric(metric): _validate_vision_task_metric(metric) # Auto-detect genai vision model by checking for genai_config.json with vision field - use_genai_vision = False genai_config_path = Path(model.model_path).parent / "genai_config.json" + use_genai_vision = False if genai_config_path.exists(): import json with genai_config_path.open() as f: - genai_config = json.load(f) - use_genai_vision = bool(genai_config.get("model", {}).get("vision")) + genai_cfg = json.load(f) + use_genai_vision = bool(genai_cfg.get("model", {}).get("vision")) if use_genai_vision: - inference_output, targets = self._inference_vision_genai( - model, metric, dataloader, device, execution_providers - ) + inference_output, targets = self._inference_vision_genai(model, dataloader, device) else: inference_output, targets = self._inference_vision( model, metric, dataloader, post_func, device, execution_providers @@ -813,10 +811,8 @@ def _inference_vision( def _inference_vision_genai( self, model: ONNXModelHandler, - metric: Metric, dataloader: "DataLoader", device: Device = Device.CPU, - 
execution_providers: Optional[Union[str, list[str]]] = None, ) -> tuple[OliveModelOutput, Any]: """Vision-based inference for VQA/OCR metrics using onnxruntime-genai. @@ -824,6 +820,9 @@ def _inference_vision_genai( Uses og.Model with multimodal processor for vision-language models (e.g., Qwen3-VL). The dataloader must yield (input_dict, labels) where input_dict contains 'image' (PIL Image) and 'question' (str), and labels are reference answer strings. + + Note: GPU/CPU selection is driven by the `device` parameter. onnxruntime-genai uses + short provider names internally (e.g., "cuda") which differ from ORT-style EP names. """ try: import onnxruntime_genai as og @@ -840,10 +839,6 @@ def _inference_vision_genai( model_dir = str(Path(model.model_path).parent) - # Read genai_config for search options - with (Path(model_dir) / "genai_config.json").open() as f: - genai_config = json.load(f) - # max_length in genai is total sequence length (input + output). # Default to 1028 which accommodates image/prompt tokens (~200-500) plus answer tokens. # Note: genai_config.json's search.max_length is typically the full context window From 23fa91b11abc42f384652c0fcaec04a9536a4ccd Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 20:16:07 +0000 Subject: [PATCH 10/18] Add system_prompt support for vision VQA evaluation Allow passing a system_prompt parameter in pre_process config to guide model responses (e.g., 'reply with only the option number'). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/data/component/pre_process_data.py | 8 +++++--- olive/evaluator/olive_evaluator.py | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index b61b39bf5..9923b4c6e 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -387,6 +387,7 @@ def vision_vqa_pre_process( image_col: str = "image", question_col: str = "question", answer_col: str = "answer", + system_prompt: str = "", max_samples: Optional[int] = None, limit: Optional[float] = None, seed: int = 42, @@ -438,11 +439,12 @@ class VisionVQADataset: Note: Use batch_size=1 in dataloader config as images have variable sizes. """ - def __init__(self, hf_dataset, image_column, question_column, answer_column): + def __init__(self, hf_dataset, image_column, question_column, answer_column, sys_prompt=""): self.dataset = hf_dataset self.image_column = image_column self.question_column = question_column self.answer_column = answer_column + self.system_prompt = sys_prompt def __len__(self): return len(self.dataset) @@ -456,7 +458,7 @@ def __getitem__(self, idx): # Join with | separator so metrics can match against any valid answer if isinstance(answer, (list, tuple)): answer = "|".join(str(a) for a in answer) if answer else "" - return {"image": image, "question": question}, str(answer) + return {"image": image, "question": question, "system_prompt": self.system_prompt}, str(answer) @staticmethod def collate_fn(batch): @@ -472,4 +474,4 @@ def collate_fn(batch): answers = [item[1] for item in batch] return (inputs, answers) - return VisionVQADataset(dataset, image_col, question_col, answer_col) + return VisionVQADataset(dataset, image_col, question_col, answer_col, system_prompt) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 199114b18..0aa2ddc39 100644 
--- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -874,6 +874,7 @@ def _inference_vision_genai( for item in items: pil_image = item.get("image") question = item.get("question", "") + sys_prompt = item.get("system_prompt", "") if pil_image is None: # Append empty pred to maintain alignment with targets @@ -885,7 +886,10 @@ def _inference_vision_genai( pil_image = Image.open(pil_image).convert("RGB") # Build chat messages for the vision-language model - messages = [ + messages = [] + if sys_prompt: + messages.append({"role": "system", "content": sys_prompt}) + messages.append( { "role": "user", "content": [ @@ -893,7 +897,7 @@ def _inference_vision_genai( {"type": "text", "text": question}, ], } - ] + ) messages_json = json.dumps(messages) # Save image to temp file for og.Images (reuse same path to minimize I/O) From a144c5e85ec8b3e18be05d85727c5872aed238c3 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 20:32:06 +0000 Subject: [PATCH 11/18] Add options_col support and extract leading number from predictions - Add options_col param to format multiple-choice options into the question - Extract leading number from model responses (e.g. '1. 
D' -> '1') - Add debug logging to vision_eval_debug.jsonl in model dir Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/data/component/pre_process_data.py | 14 ++++++++++++-- olive/evaluator/olive_evaluator.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index 9923b4c6e..f9aa20893 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -387,6 +387,7 @@ def vision_vqa_pre_process( image_col: str = "image", question_col: str = "question", answer_col: str = "answer", + options_col: str = "", system_prompt: str = "", max_samples: Optional[int] = None, limit: Optional[float] = None, @@ -439,11 +440,12 @@ class VisionVQADataset: Note: Use batch_size=1 in dataloader config as images have variable sizes. """ - def __init__(self, hf_dataset, image_column, question_column, answer_column, sys_prompt=""): + def __init__(self, hf_dataset, image_column, question_column, answer_column, options_column="", sys_prompt=""): self.dataset = hf_dataset self.image_column = image_column self.question_column = question_column self.answer_column = answer_column + self.options_column = options_column self.system_prompt = sys_prompt def __len__(self): @@ -454,6 +456,14 @@ def __getitem__(self, idx): image = item[self.image_column] question = item[self.question_column] answer = item[self.answer_column] + + # Format options into the question if options_col is specified + if self.options_column and self.options_column in item: + options = item[self.options_column] + if isinstance(options, (list, tuple)): + options_text = "\n".join(f"{i}. 
{opt}" for i, opt in enumerate(options)) + question = f"{question}\n{options_text}" + # Handle list/tuple answers (some datasets have multiple valid answers) # Join with | separator so metrics can match against any valid answer if isinstance(answer, (list, tuple)): @@ -474,4 +484,4 @@ def collate_fn(batch): answers = [item[1] for item in batch] return (inputs, answers) - return VisionVQADataset(dataset, image_col, question_col, answer_col, system_prompt) + return VisionVQADataset(dataset, image_col, question_col, answer_col, options_col, system_prompt) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 0aa2ddc39..835eb1b04 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -920,8 +920,18 @@ def _inference_vision_genai( del generator pred = tokenizer.decode(tokens).strip() + # Extract leading number from responses like "1. D" or "0. krill" + import re + num_match = re.match(r"^(\d+)", pred) + if num_match: + pred = num_match.group(1) all_preds.append(pred) + # Debug logging + debug_path = Path(model_dir) / "vision_eval_debug.jsonl" + with open(debug_path, "a") as f: + f.write(json.dumps({"prompt": messages, "pred": pred, "target": labels}) + "\n") + # Collect reference texts (aligned with preds including empty ones for None images) if isinstance(labels, (list, tuple)): all_targets.extend(labels) From d51af371367183114c20044bb9ef2642e0eb4bd2 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 20:55:59 +0000 Subject: [PATCH 12/18] Address review comments: extract helper, remove debug code, fix lint - Extract _load_genai_config helper to deduplicate config detection - Remove hardcoded number extraction (task-specific, not generic) - Remove debug logging (was dev instrumentation) - Use 'from e' instead of 'from None' in ImportError - Add missing docstring params for options_col and system_prompt Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- 
olive/data/component/pre_process_data.py | 4 +++ olive/evaluator/olive_evaluator.py | 45 ++++++++++-------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index f9aa20893..1ea765e6d 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -410,6 +410,10 @@ def vision_vqa_pre_process( image_col: Name of the image column. Defaults to "image". question_col: Name of the question column. Defaults to "question". answer_col: Name of the answer column. Defaults to "answer". + options_col: Name of the options column for multiple-choice questions. If specified, + options are formatted as numbered choices and appended to the question. Defaults to "". + system_prompt: System prompt to guide model responses (e.g., "Reply with only the + option number"). Passed through to the evaluator. Defaults to "". max_samples: Maximum number of samples (deprecated, use limit). Defaults to None. limit: Sampling limit following Olive convention: If >= 1: use first N samples. 
diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 835eb1b04..a648f4d5e 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -582,6 +582,17 @@ def _inference( dump_tuning_result(session.session, tuning_result_file) return OliveModelOutput(preds=preds, logits=logits), targets + @staticmethod + def _load_genai_config(model: ONNXModelHandler) -> Optional[dict]: + """Load genai_config.json from the model directory, or return None if not found.""" + genai_config_path = Path(model.model_path).parent / "genai_config.json" + if not genai_config_path.exists(): + return None + import json + + with genai_config_path.open() as f: + return json.load(f) + def _evaluate_onnx_accuracy( self, model: ONNXModelHandler, @@ -594,14 +605,8 @@ def _evaluate_onnx_accuracy( if _is_vision_metric(metric): _validate_vision_task_metric(metric) # Auto-detect genai vision model by checking for genai_config.json with vision field - genai_config_path = Path(model.model_path).parent / "genai_config.json" - use_genai_vision = False - if genai_config_path.exists(): - import json - - with genai_config_path.open() as f: - genai_cfg = json.load(f) - use_genai_vision = bool(genai_cfg.get("model", {}).get("vision")) + genai_cfg = self._load_genai_config(model) + use_genai_vision = bool(genai_cfg and genai_cfg.get("model", {}).get("vision")) if use_genai_vision: inference_output, targets = self._inference_vision_genai(model, dataloader, device) @@ -611,13 +616,9 @@ def _evaluate_onnx_accuracy( ) elif _is_text_based_metric(metric): # Auto-detect genai model by checking for genai_config.json - genai_config_path = Path(model.model_path).parent / "genai_config.json" - if genai_config_path.exists(): - import json - - with genai_config_path.open() as f: - genai_config = json.load(f) - model_type = genai_config.get("model", {}).get("type", "") + genai_cfg = self._load_genai_config(model) + if genai_cfg: + model_type = 
genai_cfg.get("model", {}).get("type", "") if model_type == "whisper": inference_output, targets = self._inference_text_genai( @@ -826,11 +827,11 @@ def _inference_vision_genai( """ try: import onnxruntime_genai as og - except ImportError: + except ImportError as e: raise ImportError( "onnxruntime-genai is required for genai-based vision evaluation. " "Install it with: pip install onnxruntime-genai" - ) from None + ) from e import json import tempfile @@ -920,18 +921,8 @@ def _inference_vision_genai( del generator pred = tokenizer.decode(tokens).strip() - # Extract leading number from responses like "1. D" or "0. krill" - import re - num_match = re.match(r"^(\d+)", pred) - if num_match: - pred = num_match.group(1) all_preds.append(pred) - # Debug logging - debug_path = Path(model_dir) / "vision_eval_debug.jsonl" - with open(debug_path, "a") as f: - f.write(json.dumps({"prompt": messages, "pred": pred, "target": labels}) + "\n") - # Collect reference texts (aligned with preds including empty ones for None images) if isinstance(labels, (list, tuple)): all_targets.extend(labels) From 7d0738ef03b543ff830890d14bcc8ac09fb528e0 Mon Sep 17 00:00:00 2001 From: David Fan Date: Mon, 1 Jun 2026 20:59:44 +0000 Subject: [PATCH 13/18] Add opt-in number extraction for multiple-choice VQA tasks When options_col is specified in pre_process config, set extract_number=True in the input dict. The evaluator uses this flag to extract the leading number from model responses (e.g. '1. D' -> '1'), which is needed for correct exact_match scoring on multiple-choice benchmarks like AI2D. This is not applied for OCR/ChartQA tasks where numeric predictions are valid. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/data/component/pre_process_data.py | 10 +++++++++- olive/evaluator/olive_evaluator.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/olive/data/component/pre_process_data.py b/olive/data/component/pre_process_data.py index 1ea765e6d..f818f1ac4 100644 --- a/olive/data/component/pre_process_data.py +++ b/olive/data/component/pre_process_data.py @@ -462,17 +462,25 @@ def __getitem__(self, idx): answer = item[self.answer_column] # Format options into the question if options_col is specified + has_options = False if self.options_column and self.options_column in item: options = item[self.options_column] if isinstance(options, (list, tuple)): options_text = "\n".join(f"{i}. {opt}" for i, opt in enumerate(options)) question = f"{question}\n{options_text}" + has_options = True # Handle list/tuple answers (some datasets have multiple valid answers) # Join with | separator so metrics can match against any valid answer if isinstance(answer, (list, tuple)): answer = "|".join(str(a) for a in answer) if answer else "" - return {"image": image, "question": question, "system_prompt": self.system_prompt}, str(answer) + input_dict = { + "image": image, + "question": question, + "system_prompt": self.system_prompt, + "extract_number": has_options, + } + return input_dict, str(answer) @staticmethod def collate_fn(batch): diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index a648f4d5e..a5e2a3f7e 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -834,6 +834,7 @@ def _inference_vision_genai( ) from e import json + import re import tempfile from PIL import Image @@ -876,6 +877,7 @@ def _inference_vision_genai( pil_image = item.get("image") question = item.get("question", "") sys_prompt = item.get("system_prompt", "") + extract_number = item.get("extract_number", False) if pil_image is None: # Append empty 
pred to maintain alignment with targets @@ -921,6 +923,12 @@ def _inference_vision_genai( del generator pred = tokenizer.decode(tokens).strip() + # For multiple-choice tasks, extract leading number from responses + # like "1. D" or "0. krill" to match the expected answer format + if extract_number: + num_match = re.match(r"^(\d+)", pred) + if num_match: + pred = num_match.group(1) all_preds.append(pred) # Collect reference texts (aligned with preds including empty ones for None images) From f3524f6dcda105649fab39bb7a2ac75c1c9bc2b5 Mon Sep 17 00:00:00 2001 From: David Fan Date: Tue, 2 Jun 2026 17:52:15 +0000 Subject: [PATCH 14/18] Re-trigger CI: flaky test_mnb_to_qdq failure unrelated to PR changes From fa7a23979c2d5e567974c9ef9a0478f450ceb349 Mon Sep 17 00:00:00 2001 From: David Fan Date: Tue, 2 Jun 2026 18:04:45 +0000 Subject: [PATCH 15/18] Address Copilot review: fix vision detection for empty dict, add unit tests - Use 'vision' in dict check instead of bool() to handle empty vision objects - Add TestOnnxEvaluatorGenaiVisionDetection test class with 8 tests covering: - _load_genai_config helper (present/missing) - Vision detection logic (with vision, empty vision, no vision, no config) - Dispatch routing (genai vs standard vision path) --- olive/evaluator/olive_evaluator.py | 2 +- test/evaluator/test_olive_evaluator.py | 148 +++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index a5e2a3f7e..440af30bf 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -606,7 +606,7 @@ def _evaluate_onnx_accuracy( _validate_vision_task_metric(metric) # Auto-detect genai vision model by checking for genai_config.json with vision field genai_cfg = self._load_genai_config(model) - use_genai_vision = bool(genai_cfg and genai_cfg.get("model", {}).get("vision")) + use_genai_vision = genai_cfg is not None and "vision" in 
genai_cfg.get("model", {}) if use_genai_vision: inference_output, targets = self._inference_vision_genai(model, dataloader, device) diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index 88c4c0d72..8aa2cc430 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -620,3 +620,151 @@ def test_validate_vision_task_metric_no_task_skips(self): metric = self._make_vision_metric(["exact_match"]) # No task specified, should not raise _validate_vision_task_metric(metric) + + +class TestOnnxEvaluatorGenaiVisionDetection: + """Tests for genai vision model detection and dispatch via the public evaluate() method.""" + + def _make_model_with_genai_config(self, tmp_path, genai_config_content): + """Create a mock ONNXModelHandler with a genai_config.json in its directory.""" + import json + + from olive.model.handler.onnx import ONNXModelHandler + + model_dir = tmp_path / "model" + model_dir.mkdir() + model_file = model_dir / "text.onnx" + model_file.write_text("") # dummy file + + if genai_config_content is not None: + config_path = model_dir / "genai_config.json" + config_path.write_text(json.dumps(genai_config_content)) + + model = MagicMock(spec=ONNXModelHandler) + model.model_path = str(model_file) + model.framework = "onnx" + return model + + def _make_vision_accuracy_metric(self): + """Create a metric that triggers the vision accuracy evaluation path.""" + metric = MagicMock() + metric.name = "accuracy" + metric.type = MetricType.ACCURACY + metric.sub_types = [MagicMock()] + metric.sub_types[0].name = "exact_match" + metric.data_config = None + metric.user_config = MagicMock() + metric.user_config.user_script = None + metric.user_config.script_dir = None + metric.user_config.data_dir = None + metric.user_config.batch_size = 1 + metric.user_config.dataloader_func = None + metric.user_config.post_processing_func = None + metric.user_config.evaluate_func = None + 
metric.user_config.input_names = None + metric.user_config.input_shapes = None + metric.backend = "huggingface_metrics" + return metric + + def test_genai_vision_detected_when_vision_field_present(self, tmp_path): + """Dispatch to genai vision path when genai_config.json has a vision field.""" + from olive.evaluator.olive_evaluator import OliveModelOutput + + config = {"model": {"vision": {"inputs": "pixel_values"}}} + model = self._make_model_with_genai_config(tmp_path, config) + + with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + mock_genai.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) + mock_compute.return_value = MagicMock() + metric = self._make_vision_accuracy_metric() + mock_gen.return_value = metric + mock_get_cfg.return_value = (MagicMock(), None, None) + + evaluator = OnnxEvaluator() + evaluator.evaluate(model, [metric], Device.CPU, None) + + mock_genai.assert_called_once() + mock_vision.assert_not_called() + + def test_genai_vision_detected_with_empty_vision_object(self, tmp_path): + """Dispatch to genai vision path even when vision value is an empty dict.""" + from olive.evaluator.olive_evaluator import OliveModelOutput + + config = {"model": {"vision": {}}} + model = self._make_model_with_genai_config(tmp_path, config) + + with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ + 
patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + mock_genai.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) + mock_compute.return_value = MagicMock() + metric = self._make_vision_accuracy_metric() + mock_gen.return_value = metric + mock_get_cfg.return_value = (MagicMock(), None, None) + + evaluator = OnnxEvaluator() + evaluator.evaluate(model, [metric], Device.CPU, None) + + mock_genai.assert_called_once() + mock_vision.assert_not_called() + + def test_standard_vision_when_no_vision_field(self, tmp_path): + """Dispatch to standard vision path when genai_config has no vision field.""" + from olive.evaluator.olive_evaluator import OliveModelOutput + + config = {"model": {"type": "whisper"}} + model = self._make_model_with_genai_config(tmp_path, config) + + with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + mock_vision.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) + mock_compute.return_value = MagicMock() + metric = self._make_vision_accuracy_metric() + 
mock_gen.return_value = metric + mock_get_cfg.return_value = (MagicMock(), None, None) + + evaluator = OnnxEvaluator() + evaluator.evaluate(model, [metric], Device.CPU, None) + + mock_vision.assert_called_once() + mock_genai.assert_not_called() + + def test_standard_vision_when_no_genai_config(self, tmp_path): + """Dispatch to standard vision path when genai_config.json is missing.""" + from olive.evaluator.olive_evaluator import OliveModelOutput + + model = self._make_model_with_genai_config(tmp_path, None) + + with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ + patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + mock_vision.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) + mock_compute.return_value = MagicMock() + metric = self._make_vision_accuracy_metric() + mock_gen.return_value = metric + mock_get_cfg.return_value = (MagicMock(), None, None) + + evaluator = OnnxEvaluator() + evaluator.evaluate(model, [metric], Device.CPU, None) + + mock_vision.assert_called_once() + mock_genai.assert_not_called() From 57fb8b3c772b92293a4dc120c10d5873b8da9d64 Mon Sep 17 00:00:00 2001 From: David Fan Date: Tue, 2 Jun 2026 18:34:44 +0000 Subject: [PATCH 16/18] Address review: add JSON error handling, guard PIL import, fix file handle leak - Wrap genai_config.json parsing in try/except JSONDecodeError with filepath in message - Guard PIL import with ImportError and helpful install message - Use context manager for Image.open() to close file handle promptly 
--- olive/evaluator/olive_evaluator.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 440af30bf..87e7f2676 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -590,8 +590,11 @@ def _load_genai_config(model: ONNXModelHandler) -> Optional[dict]: return None import json - with genai_config_path.open() as f: - return json.load(f) + try: + with genai_config_path.open(encoding="utf-8") as f: + return json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in genai config file: {genai_config_path}") from e def _evaluate_onnx_accuracy( self, @@ -837,7 +840,12 @@ def _inference_vision_genai( import re import tempfile - from PIL import Image + try: + from PIL import Image + except ImportError as e: + raise ImportError( + "Pillow is required for vision evaluation. Install it with: pip install Pillow" + ) from e model_dir = str(Path(model.model_path).parent) @@ -886,7 +894,8 @@ def _inference_vision_genai( # Ensure PIL Image if not isinstance(pil_image, Image.Image): - pil_image = Image.open(pil_image).convert("RGB") + with Image.open(pil_image) as img: + pil_image = img.convert("RGB") # Build chat messages for the vision-language model messages = [] From 9e6156ec51a0923ebebfd8a64ce07b1348727b51 Mon Sep 17 00:00:00 2001 From: David Fan Date: Tue, 2 Jun 2026 18:44:40 +0000 Subject: [PATCH 17/18] Fix formatting: use parenthesized context managers --- test/evaluator/test_olive_evaluator.py | 72 ++++++++++++++++---------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/test/evaluator/test_olive_evaluator.py b/test/evaluator/test_olive_evaluator.py index 8aa2cc430..1812f64dc 100644 --- a/test/evaluator/test_olive_evaluator.py +++ b/test/evaluator/test_olive_evaluator.py @@ -673,13 +673,17 @@ def test_genai_vision_detected_when_vision_field_present(self, tmp_path): config = {"model": 
{"vision": {"inputs": "pixel_values"}}} model = self._make_model_with_genai_config(tmp_path, config) - with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ - patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ - patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ - patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + with ( + patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, + patch( + "olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io" + ) as mock_gen, + ): mock_genai.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) mock_compute.return_value = MagicMock() metric = self._make_vision_accuracy_metric() @@ -699,13 +703,17 @@ def test_genai_vision_detected_with_empty_vision_object(self, tmp_path): config = {"model": {"vision": {}}} model = self._make_model_with_genai_config(tmp_path, config) - with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ - patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ - patch("olive.evaluator.olive_evaluator._is_vision_metric", 
return_value=True), \ - patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + with ( + patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, + patch( + "olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io" + ) as mock_gen, + ): mock_genai.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) mock_compute.return_value = MagicMock() metric = self._make_vision_accuracy_metric() @@ -725,13 +733,17 @@ def test_standard_vision_when_no_vision_field(self, tmp_path): config = {"model": {"type": "whisper"}} model = self._make_model_with_genai_config(tmp_path, config) - with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ - patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ - patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ - patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + with ( + patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, + patch.object(OnnxEvaluator, 
"_inference_vision") as mock_vision, + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, + patch( + "olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io" + ) as mock_gen, + ): mock_vision.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) mock_compute.return_value = MagicMock() metric = self._make_vision_accuracy_metric() @@ -750,13 +762,17 @@ def test_standard_vision_when_no_genai_config(self, tmp_path): model = self._make_model_with_genai_config(tmp_path, None) - with patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, \ - patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, \ - patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), \ - patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, \ - patch("olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io") as mock_gen: + with ( + patch.object(OnnxEvaluator, "_inference_vision_genai") as mock_genai, + patch.object(OnnxEvaluator, "_inference_vision") as mock_vision, + patch("olive.evaluator.olive_evaluator.OliveEvaluator.compute_accuracy") as mock_compute, + patch("olive.evaluator.olive_evaluator._is_vision_metric", return_value=True), + patch("olive.evaluator.olive_evaluator._validate_vision_task_metric"), + patch("olive.evaluator.olive_evaluator.OliveEvaluator.get_user_config") as mock_get_cfg, + patch( + 
"olive.evaluator.olive_evaluator.OliveEvaluator.generate_metric_user_config_with_model_io" + ) as mock_gen, + ): mock_vision.return_value = (OliveModelOutput(preds=["answer"], logits=None), ["answer"]) mock_compute.return_value = MagicMock() metric = self._make_vision_accuracy_metric() From d1b55a77dd18eb489cf0e587ea29c176d0af3fe8 Mon Sep 17 00:00:00 2001 From: David Fan Date: Tue, 2 Jun 2026 19:33:23 +0000 Subject: [PATCH 18/18] Fix formatting: collapse ImportError raise to single line --- olive/evaluator/olive_evaluator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 87e7f2676..d1238c037 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -843,9 +843,7 @@ def _inference_vision_genai( try: from PIL import Image except ImportError as e: - raise ImportError( - "Pillow is required for vision evaluation. Install it with: pip install Pillow" - ) from e + raise ImportError("Pillow is required for vision evaluation. Install it with: pip install Pillow") from e model_dir = str(Path(model.model_path).parent)