From 59facd22682704ddf68e64c93c2a6170114596cb Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Tue, 17 Mar 2026 09:44:20 +0800 Subject: [PATCH 1/8] [feature] add refcoco support --- .../configs/datasets/refcoco/refcoco_gen.py | 53 +++++++++ ais_bench/benchmark/datasets/__init__.py | 1 + .../benchmark/datasets/refcoco/__init__.py | 1 + .../benchmark/datasets/refcoco/refcoco.py | 91 +++++++++++++++ .../openicl/icl_evaluator/__init__.py | 1 + .../icl_evaluator/bbox_iou_evaluator.py | 106 ++++++++++++++++++ 6 files changed, 253 insertions(+) create mode 100644 ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py create mode 100644 ais_bench/benchmark/datasets/refcoco/__init__.py create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco.py create mode 100644 ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py new file mode 100644 index 00000000..e278ac18 --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCODataset +from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcoco_reader_cfg = dict( + input_columns=['ref_sentence', 'image'], + output_column='answer' +) + +refcoco_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': 'Locate every object that matches the description "{ref_sentence}" in the image. Report bbox coordinates in JSON format.'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcoco_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + ('RefCOCO_val', 'val'), + ('RefCOCO_test', 'test'), + ('RefCOCO_testA', 'testA'), + ('RefCOCO_testB', 'testB'), +] + +refcoco_datasets = [ + dict( + abbr=abbr, + type=RefCOCODataset, + path='ais_bench/datasets/RefCOCO/data', + split=split, + reader_cfg=refcoco_reader_cfg, + infer_cfg=refcoco_infer_cfg, + eval_cfg=refcoco_eval_cfg, + ) + for abbr, split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/__init__.py b/ais_bench/benchmark/datasets/__init__.py index 1581a2af..3f0d8425 100644 --- a/ais_bench/benchmark/datasets/__init__.py +++ b/ais_bench/benchmark/datasets/__init__.py @@ -53,3 +53,4 @@ from ais_bench.benchmark.datasets.mmstar import * # noqa: F401, F403 from ais_bench.benchmark.datasets.dapo_math import * # noqa: F401, F403 from ais_bench.benchmark.datasets.mooncake_trace import * # noqa: F401, F403 +from ais_bench.benchmark.datasets.refcoco import * # noqa: F401, F403 diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py new file mode 100644 index 00000000..b95ad54f --- /dev/null +++ b/ais_bench/benchmark/datasets/refcoco/__init__.py @@ -0,0 +1 @@ +from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess # noqa: F401 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py new file mode 100644 index 00000000..0e8707cf --- /dev/null +++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py @@ -0,0 +1,91 @@ +import glob +import io +import json +import os +import re + +import pandas as pd +from PIL import Image + +from datasets import Dataset + +from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from ais_bench.benchmark.datasets.utils.datasets import get_data_path +from ais_bench.benchmark.utils.image_process import pil_to_base64 +from ais_bench.benchmark.utils.logging import AISLogger + +from ..base import BaseDataset + +logger = AISLogger() + + +def _remove_leading_articles(text: str) -> str: + cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE) + return cleaned_text or text.strip() + + +def parse_float_sequence_within(input_str: str): + """Extract the first sequence of four floats inside square brackets.""" + pattern = r'\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]' + match = re.search(pattern, input_str) + if match: + return [float(match.group(i)) for i in range(1, 5)] + return [0.0, 0.0, 0.0, 0.0] # Default bbox if parsing fails + + +@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000') +def refcoco_bbox_postprocess(text) -> list: + if not isinstance(text, str): + raise ValueError('Prediction must be a string') + + stripped_text = text.strip() + bbox = parse_float_sequence_within(stripped_text) + + logger.debug(f'refcoco_bbox_postprocess: bbox={bbox}') + return bbox + + +@LOAD_DATASET.register_module() +class RefCOCODataset(BaseDataset): + + @staticmethod + def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + resolved_path = get_data_path(path) + shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet'))) + if not shard_paths: + raise FileNotFoundError( + f'No RefCOCO parquet shards found for split {split} in {resolved_path}' + ) + + logger.info(f'Loading RefCOCO split {split} from {len(shard_paths)} shard(s) in {resolved_path}') + data = pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True) + + rows = [] + for i in range(len(data)): + line = data.iloc[i] + img_field = line['image'] + if not isinstance(img_field, dict) or 'bytes' not in img_field: + raise ValueError(f'RefCOCO row {i} has invalid image payload: {type(img_field)}') + + pil_img = Image.open(io.BytesIO(img_field['bytes'])).convert('RGB') + width, height = pil_img.width, pil_img.height + image_b64 = pil_to_base64(pil_img, format='JPEG') + + x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in line['bbox']] + pixel_bbox = [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height] + + for answer_text in line['answer']: + ref_sentence = _remove_leading_articles(str(answer_text)) + answer = json.dumps({ + 'question_id': int(line['question_id']), + 'bbox': pixel_bbox, + 'image_width': width, + 'image_height': height, + }) + rows.append({ + 'ref_sentence': ref_sentence, + 'image': image_b64, + 'answer': answer, + }) + + return Dataset.from_list(rows) diff --git a/ais_bench/benchmark/openicl/icl_evaluator/__init__.py b/ais_bench/benchmark/openicl/icl_evaluator/__init__.py index 6a622d2a..8e72243b 100644 --- a/ais_bench/benchmark/openicl/icl_evaluator/__init__.py +++ b/ais_bench/benchmark/openicl/icl_evaluator/__init__.py @@ -1,4 +1,5 @@ from ais_bench.benchmark.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator # noqa +from ais_bench.benchmark.openicl.icl_evaluator.bbox_iou_evaluator import BBoxIoUEvaluator # noqa from ais_bench.benchmark.openicl.icl_evaluator.icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa from ais_bench.benchmark.openicl.icl_evaluator.math_evaluator import MATHEvaluator # noqa from ais_bench.benchmark.openicl.icl_evaluator.icl_hf_evaluator import * # noqa diff --git a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py new file mode 100644 index 00000000..6f608d7f --- /dev/null +++ b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py @@ -0,0 +1,106 @@ +import json + +from ais_bench.benchmark.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator +from ais_bench.benchmark.registry import ICL_EVALUATORS + + +def _compute_iou(box1: list, box2: list) -> float: + x_left = max(box1[0], box2[0]) + y_top = max(box1[1], box2[1]) + x_right = min(box1[2], box2[2]) + y_bottom = min(box1[3], box2[3]) + + inter = max(0.0, x_right - x_left) * max(0.0, y_bottom - y_top) + area1 = max(0.0, box1[2] - box1[0]) * max(0.0, box1[3] - box1[1]) + area2 = max(0.0, box2[2] - box2[0]) * max(0.0, box2[3] - box2[1]) + union = area1 + area2 - inter + return inter / union if union > 0 else 0.0 + + +@ICL_EVALUATORS.register_module() +class BBoxIoUEvaluator(BaseEvaluator): + + def __init__(self, + iou_threshold: float = 0.5, + coord_scale: float = 1000.0, + reference_bbox_key: str = 'bbox', + image_width_key: str = 'image_width', + image_height_key: str = 'image_height', + metric_prefix: str = 'Accuracy', + clip_to_image: bool = True) -> None: + super().__init__() + self.iou_threshold = iou_threshold + self.coord_scale = coord_scale + self.reference_bbox_key = reference_bbox_key + self.image_width_key = image_width_key + self.image_height_key = image_height_key + self.metric_prefix = metric_prefix + self.clip_to_image = clip_to_image + + def _scale_prediction(self, pred_box: list, image_width: float, image_height: float) -> list: + if len(pred_box) != 4: + raise ValueError('Predicted bbox must contain four coordinates') + + scaled_box = [ + float(pred_box[0]) / self.coord_scale * float(image_width), + float(pred_box[1]) / self.coord_scale * float(image_height), + float(pred_box[2]) / self.coord_scale * float(image_width), + float(pred_box[3]) / self.coord_scale * float(image_height), + ] + + if self.clip_to_image: + scaled_box = [ + min(max(scaled_box[0], 0.0), float(image_width)), + min(max(scaled_box[1], 0.0), float(image_height)), + min(max(scaled_box[2], 0.0), float(image_width)), + min(max(scaled_box[3], 0.0), float(image_height)), + ] + + if scaled_box[2] <= scaled_box[0] or scaled_box[3] <= scaled_box[1]: + raise ValueError('Predicted bbox is reversed or empty after scaling') + return scaled_box + + def score(self, predictions, references): # pyright: ignore[reportIncompatibleMethodOverride] + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + details = [] + scores = [] + for pred, ref in zip(predictions, references): + refer = json.loads(ref) if isinstance(ref, str) else ref + gt_box = [float(value) for value in refer[self.reference_bbox_key]] + image_width = float(refer[self.image_width_key]) + image_height = float(refer[self.image_height_key]) + + detail = { + 'pred': pred, + 'answer': ref, + 'correct': False, + 'coord_mode': f'0-{int(self.coord_scale)}', + } + + try: + pred_box_pixel = self._scale_prediction(pred, image_width, image_height) + iou = _compute_iou(pred_box_pixel, gt_box) + correct = iou >= self.iou_threshold + detail['correct'] = correct + detail['iou'] = iou + detail['pred_bbox_pixel'] = pred_box_pixel + scores.append(1 if correct else 0) + except (TypeError, ValueError, KeyError, json.JSONDecodeError) as error: + detail['iou'] = 0.0 + detail['pred_bbox_pixel'] = None + detail['invalid'] = True + detail['error'] = str(error) + scores.append(0) + + details.append(detail) + + return { + f'{self.metric_prefix}@{self.iou_threshold}': 100 * sum(scores) / len(scores) if scores else 0.0, + 'details': details, + } \ No newline at end of file From f003fbdd046a789777f57ac90973b9dc7bf88f3b Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Tue, 17 Mar 2026 15:06:01 +0800 Subject: [PATCH 2/8] [feat] save image to local disk instead of storing in share memory --- .../configs/datasets/refcoco/refcoco_gen.py | 6 +- .../benchmark/datasets/refcoco/refcoco.py | 148 ++++++++++++++---- 2 files changed, 121 insertions(+), 33 deletions(-) diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py index e278ac18..46e30b65 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py @@ -7,7 +7,7 @@ refcoco_reader_cfg = dict( - input_columns=['ref_sentence', 'image'], + input_columns=['question', 'image'], output_column='answer' ) @@ -17,8 +17,8 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': 'Locate every object that matches the description "{ref_sentence}" in the image. Report bbox coordinates in JSON format.'}, - 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, }) ] ) diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py index 0e8707cf..4e87d7f2 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py @@ -9,15 +9,20 @@ from datasets import Dataset +from ais_bench.benchmark.datasets.utils.datasets import get_content_str from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS from ais_bench.benchmark.datasets.utils.datasets import get_data_path -from ais_bench.benchmark.utils.image_process import pil_to_base64 from ais_bench.benchmark.utils.logging import AISLogger from ..base import BaseDataset logger = AISLogger() +REFCOCO_PROMPT_TEMPLATE = ( + 'Locate every object that matches the description "{ref_sentence}" ' + 'in the image. Report bbox coordinates in JSON format.' +) + def _remove_leading_articles(text: str) -> str: cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE) @@ -47,10 +52,18 @@ def refcoco_bbox_postprocess(text) -> list: @LOAD_DATASET.register_module() class RefCOCODataset(BaseDataset): + TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCO_images' @staticmethod - def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] - resolved_path = get_data_path(path) + def _generate_image_store_dir(resolved_path: str, split: str) -> str: + image_root_path = os.path.join( + os.path.dirname(resolved_path), + RefCOCODataset.TEMP_REFCOCO_IMAGE_STORE_DIR, + ) + return os.path.join(image_root_path, split) + + @staticmethod + def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame: shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet'))) if not shard_paths: raise FileNotFoundError( @@ -58,34 +71,109 @@ def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOver ) logger.info(f'Loading RefCOCO split {split} from {len(shard_paths)} shard(s) in {resolved_path}') - data = pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True) + return pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True) + + @staticmethod + def _persist_image_if_not_exist(image_payload, image_name: str, image_root_dir: str, row_index: int) -> tuple[str, int, int]: + if not isinstance(image_payload, dict) or 'bytes' not in image_payload: + raise ValueError(f'RefCOCO row {row_index} has invalid image payload: {type(image_payload)}') + + pil_img = Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB') + image_path = os.path.join(image_root_dir, image_name) + os.makedirs(os.path.dirname(image_path), exist_ok=True) + if not os.path.exists(image_path): + pil_img.save(image_path, format='JPEG') + return image_path, pil_img.width, pil_img.height + + @staticmethod + def _build_pixel_bbox(raw_bbox) -> list[float]: + x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox] + return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height] + + @staticmethod + def _build_prompt(answer_text) -> str: + ref_sentence = _remove_leading_articles(str(answer_text)) + return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence) + @staticmethod + def _build_answer_payload(question_id, pixel_bbox: list[float], width: int, height: int) -> str: + return json.dumps({ + 'question_id': int(question_id), + 'bbox': pixel_bbox, + 'image_width': width, + 'image_height': height, + }) + + @staticmethod + def _build_rows(sample, image_path: str, width: int, height: int, pixel_bbox: list[float]) -> list[dict]: rows = [] - for i in range(len(data)): - line = data.iloc[i] - img_field = line['image'] - if not isinstance(img_field, dict) or 'bytes' not in img_field: - raise ValueError(f'RefCOCO row {i} has invalid image payload: {type(img_field)}') - - pil_img = Image.open(io.BytesIO(img_field['bytes'])).convert('RGB') - width, height = pil_img.width, pil_img.height - image_b64 = pil_to_base64(pil_img, format='JPEG') - - x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in line['bbox']] - pixel_bbox = [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height] - - for answer_text in line['answer']: - ref_sentence = _remove_leading_articles(str(answer_text)) - answer = json.dumps({ - 'question_id': int(line['question_id']), - 'bbox': pixel_bbox, - 'image_width': width, - 'image_height': height, - }) - rows.append({ - 'ref_sentence': ref_sentence, - 'image': image_b64, - 'answer': answer, - }) + reference_answer = RefCOCODataset._build_answer_payload( + sample['question_id'], + pixel_bbox, + width, + height, + ) + + for answer_text in sample['answer']: + prompt = RefCOCODataset._build_prompt(answer_text) + content = get_content_str([ + {'type': 'image_url', 'image_url': image_path}, + {'type': 'text', 'text': prompt}, + ]) + rows.append({ + 'content': content, + 'question': prompt, + 'image': image_path, + 'answer': reference_answer, + }) + return rows + + @staticmethod + def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """Load a RefCOCO split and normalize it into benchmark rows. + + The source data is stored as parquet shards under ``path`` with shard + names matching ``-*.parquet``. Each source row contains an image + payload, a ground-truth bounding box in ``[x, y, w, h]`` format, and a + list of referring expressions. This loader persists each image to + ``RefCOCO_images//``, converts the bbox to + ``[x_min, y_min, x_max, y_max]``, and expands the answer list into one + benchmark row per referring expression. + + Args: + path: Dataset root containing RefCOCO parquet shards. + split: Split prefix to load, for example ``val`` or ``testA``. + **kwargs: Unused extra keyword arguments passed by the dataset + builder. + + Returns: + A HuggingFace ``Dataset`` whose rows contain ``content`` for + multimodal prompting and ``answer`` as the serialized reference + bbox payload used by evaluation. + """ + resolved_path = get_data_path(path) + image_root_dir = RefCOCODataset._generate_image_store_dir(resolved_path, split) + logger.info(f'Saving RefCOCO images to {image_root_dir}') + data = RefCOCODataset._load_split_dataframe(resolved_path, split) + os.makedirs(image_root_dir, exist_ok=True) + + rows = [] + for row_index, (_, sample) in enumerate(data.iterrows()): + image_path, width, height = RefCOCODataset._persist_image_if_not_exist( + sample['image'], + sample['file_name'], + image_root_dir, + row_index, + ) + pixel_bbox = RefCOCODataset._build_pixel_bbox(sample['bbox']) + rows.extend( + RefCOCODataset._build_rows( + sample, + image_path, + width, + height, + pixel_bbox, + ) + ) return Dataset.from_list(rows) From 224c731fcc8f043ab790aa7dff5c47bd3e6a84ec Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Tue, 17 Mar 2026 19:20:53 +0800 Subject: [PATCH 3/8] [feature] add refcoco plus support --- .../datasets/refcoco_plus/refcoco_plus_gen.py | 52 +++++++++++++++++++ .../benchmark/datasets/refcoco/__init__.py | 3 +- .../datasets/refcoco/refcoco_plus.py | 8 +++ 3 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco_plus.py diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py new file mode 100644 index 00000000..989f9230 --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py @@ -0,0 +1,52 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCOPlusDataset +from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcoco_plus_reader_cfg = dict( + input_columns=['content'], + output_column='answer' +) + +refcoco_plus_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcoco_plus_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + ('RefCOCOPlus_val', 'val'), + ('RefCOCOPlus_testA', 'testA'), + ('RefCOCOPlus_testB', 'testB'), +] + +refcoco_plus_datasets = [ + dict( + abbr=abbr, + type=RefCOCOPlusDataset, + path='ais_bench/datasets/RefCOCOplus/data', + split=split, + reader_cfg=refcoco_plus_reader_cfg, + infer_cfg=refcoco_plus_infer_cfg, + eval_cfg=refcoco_plus_eval_cfg, + ) + for abbr, split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py index b95ad54f..6e5e4fcd 100644 --- a/ais_bench/benchmark/datasets/refcoco/__init__.py +++ b/ais_bench/benchmark/datasets/refcoco/__init__.py @@ -1 +1,2 @@ -from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess # noqa: F401 \ No newline at end of file +from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess # noqa: F401 +from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset # noqa: F401 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py new file mode 100644 index 00000000..7415e65f --- /dev/null +++ b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py @@ -0,0 +1,8 @@ +from ais_bench.benchmark.registry import LOAD_DATASET + +from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset + + +@LOAD_DATASET.register_module() +class RefCOCOPlusDataset(RefCOCODataset): + TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOPlus_images' From 8d180d5b6236b5de76d7157fe693b60b19dffa09 Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Tue, 17 Mar 2026 19:28:14 +0800 Subject: [PATCH 4/8] [feature] add refcocog support --- .../configs/datasets/refcocog/refcocog_gen.py | 51 +++++++++++++++++++ .../benchmark/datasets/refcoco/__init__.py | 1 + .../benchmark/datasets/refcoco/refcoco_g.py | 8 +++ 3 files changed, 60 insertions(+) create mode 100644 ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco_g.py diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py new file mode 100644 index 00000000..c4429bb3 --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py @@ -0,0 +1,51 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCOgDataset +from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcocog_reader_cfg = dict( + input_columns=['content'], + output_column='answer' +) + +refcocog_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcocog_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + ('RefCOCOg_val', 'val'), + ('RefCOCOg_test', 'test'), +] + +refcocog_datasets = [ + dict( + abbr=abbr, + type=RefCOCOgDataset, + path='ais_bench/datasets/RefCOCOg/data', + split=split, + reader_cfg=refcocog_reader_cfg, + infer_cfg=refcocog_infer_cfg, + eval_cfg=refcocog_eval_cfg, + ) + for abbr, split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py index 6e5e4fcd..e4f94a76 100644 --- a/ais_bench/benchmark/datasets/refcoco/__init__.py +++ b/ais_bench/benchmark/datasets/refcoco/__init__.py @@ -1,2 +1,3 @@ from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess # noqa: F401 +from ais_bench.benchmark.datasets.refcoco.refcoco_g import RefCOCOgDataset # noqa: F401 from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset # noqa: F401 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py new file mode 100644 index 00000000..efee0302 --- /dev/null +++ b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py @@ -0,0 +1,8 @@ +from ais_bench.benchmark.registry import LOAD_DATASET + +from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset + + +@LOAD_DATASET.register_module() +class RefCOCOgDataset(RefCOCODataset): + TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOg_images' From 8aaa9fab3ccd06a1fe41e2f63a88284b9b503af4 Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Tue, 17 Mar 2026 20:20:07 +0800 Subject: [PATCH 5/8] [refactor] use the more general dir name for the saving images --- ais_bench/benchmark/datasets/refcoco/refcoco.py | 4 ++-- ais_bench/benchmark/datasets/refcoco/refcoco_g.py | 9 ++++++++- ais_bench/benchmark/datasets/refcoco/refcoco_plus.py | 10 +++++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py index 4e87d7f2..9bf4413f 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py @@ -52,13 +52,13 @@ def refcoco_bbox_postprocess(text) -> list: @LOAD_DATASET.register_module() class RefCOCODataset(BaseDataset): - TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCO_images' + TEMP_IMAGE_STORE_DIR = 'temp_save_images' @staticmethod def _generate_image_store_dir(resolved_path: str, split: str) -> str: image_root_path = os.path.join( os.path.dirname(resolved_path), - RefCOCODataset.TEMP_REFCOCO_IMAGE_STORE_DIR, + RefCOCODataset.TEMP_IMAGE_STORE_DIR, ) return os.path.join(image_root_path, split) diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py index efee0302..6e32abb7 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py @@ -5,4 +5,11 @@ @LOAD_DATASET.register_module() class RefCOCOgDataset(RefCOCODataset): - TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOg_images' + """ + RefCOCOg is a variant of RefCOCO with more complex referring expressions. + Because the dataset field is same as the RefCOCO dataset, we can reuse the loading and evaluation code. + The only difference is refcoco_g only has two splits: + - `val`: 7.57k rows + - `test`: 5.02k rows + """ + pass diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py index 7415e65f..026c222a 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py @@ -5,4 +5,12 @@ @LOAD_DATASET.register_module() class RefCOCOPlusDataset(RefCOCODataset): - TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOPlus_images' + """ + RefCOCOplus is a variant of RefCOCO with more complex referring expressions. + Because the dataset field is same as the RefCOCO dataset, we can reuse the loading and evaluation code. + The only difference is refcoco_plus only has three splits: + - `val`: 3.81k rows + - `testA`: 1.98k rows + - `testB`: 1.8k rows + """ + pass From b2f1baccb13cb1447c6c76f56b65aa92be4fa284 Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Thu, 19 Mar 2026 19:16:18 +0800 Subject: [PATCH 6/8] [feature] add refcoco/+/g base64 support --- .../configs/datasets/refcoco/refcoco_gen.py | 12 +- .../datasets/refcoco/refcoco_gen_base64.py | 54 ++++++ .../datasets/refcoco_plus/refcoco_plus_gen.py | 12 +- .../refcoco_plus/refcoco_plus_gen_base64.py | 53 ++++++ .../configs/datasets/refcocog/refcocog_gen.py | 10 +- .../datasets/refcocog/refcocog_gen_base64.py | 52 ++++++ .../benchmark/datasets/refcoco/__init__.py | 8 +- .../benchmark/datasets/refcoco/refcoco.py | 172 ++++++++++-------- 8 files changed, 282 insertions(+), 91 deletions(-) create mode 100644 ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py create mode 100644 ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py create mode 100644 ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py index 46e30b65..b4e2b311 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py @@ -33,15 +33,15 @@ ) _splits = [ - ('RefCOCO_val', 'val'), - ('RefCOCO_test', 'test'), - ('RefCOCO_testA', 'testA'), - ('RefCOCO_testB', 'testB'), + 'val', + 'test', + 'testA', + 'testB', ] refcoco_datasets = [ dict( - abbr=abbr, + abbr='RefCOCO_' + split, type=RefCOCODataset, path='ais_bench/datasets/RefCOCO/data', split=split, @@ -49,5 +49,5 @@ infer_cfg=refcoco_infer_cfg, eval_cfg=refcoco_eval_cfg, ) - for abbr, split in _splits + for split in _splits ] \ No newline at end of file diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py new file mode 100644 index 00000000..a261ab06 --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py @@ -0,0 +1,54 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCODataset +from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcoco_reader_cfg = dict( + input_columns=['question', 'image'], + output_column='answer' +) + +refcoco_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcoco_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + 'val', + 'test', + 'testA', + 'testB', +] + +refcoco_datasets = [ + dict( + abbr='RefCOCO_base64_' + split, + type=RefCOCODataset, + path='ais_bench/datasets/RefCOCO/data', + split=split, + image_type=IMAGE_BASE64_TYPE, + reader_cfg=refcoco_reader_cfg, + infer_cfg=refcoco_infer_cfg, + eval_cfg=refcoco_eval_cfg, + ) + for split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py index 989f9230..03b10fd4 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py @@ -7,7 +7,7 @@ refcoco_plus_reader_cfg = dict( - input_columns=['content'], + input_columns=['question', 'image'], output_column='answer' ) @@ -33,14 +33,14 @@ ) _splits = [ - ('RefCOCOPlus_val', 'val'), - ('RefCOCOPlus_testA', 'testA'), - ('RefCOCOPlus_testB', 'testB'), + 'val', + 'testA', + 'testB', ] refcoco_plus_datasets = [ dict( - abbr=abbr, + abbr='RefCOCOPlus_' + split, type=RefCOCOPlusDataset, path='ais_bench/datasets/RefCOCOplus/data', split=split, @@ -48,5 +48,5 @@ infer_cfg=refcoco_plus_infer_cfg, eval_cfg=refcoco_plus_eval_cfg, ) - for abbr, split in _splits + for split in _splits ] \ No newline at end of file diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py new file mode 100644 index 00000000..8b1b41b3 --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCOPlusDataset +from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcoco_plus_reader_cfg = dict( + input_columns=['question', 'image'], + output_column='answer' +) + +refcoco_plus_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcoco_plus_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + 'val', + 'testA', + 'testB', +] + +refcoco_plus_datasets = [ + dict( + abbr='RefCOCOPlus_base64_' + split, + type=RefCOCOPlusDataset, + path='ais_bench/datasets/RefCOCOplus/data', + split=split, + image_type=IMAGE_BASE64_TYPE, + reader_cfg=refcoco_plus_reader_cfg, + infer_cfg=refcoco_plus_infer_cfg, + eval_cfg=refcoco_plus_eval_cfg, + ) + for split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py index c4429bb3..c1504f7a 100644 --- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py @@ -7,7 +7,7 @@ refcocog_reader_cfg = dict( - input_columns=['content'], + input_columns=['question', 'image'], output_column='answer' ) @@ -33,13 +33,13 @@ ) _splits = [ - ('RefCOCOg_val', 'val'), - ('RefCOCOg_test', 'test'), + 'val', + 'test', ] refcocog_datasets = [ dict( - abbr=abbr, + abbr='RefCOCOg_' + split, type=RefCOCOgDataset, path='ais_bench/datasets/RefCOCOg/data', split=split, @@ -47,5 +47,5 @@ infer_cfg=refcocog_infer_cfg, eval_cfg=refcocog_eval_cfg, ) - for abbr, split in _splits + for split in _splits ] \ No newline at end of file diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py new file mode 100644 index 00000000..eedcda7a --- /dev/null +++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py @@ -0,0 +1,52 @@ +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate +from ais_bench.benchmark.datasets import RefCOCOgDataset +from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess +from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator + + +refcocog_reader_cfg = dict( + input_columns=['question', 'image'], + output_column='answer' +) + +refcocog_infer_cfg = dict( + prompt_template=dict( + type=MMPromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt_mm={ + 'text': {'type': 'text', 'text': '{question}'}, + 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, + }) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +refcocog_eval_cfg = dict( + evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0), + pred_postprocessor=dict(type=refcoco_bbox_postprocess), +) + +_splits = [ + 'val', + 'test', +] + +refcocog_datasets = [ + dict( + abbr='RefCOCOg_base64_' + split, + type=RefCOCOgDataset, + path='ais_bench/datasets/RefCOCOg/data', + split=split, + image_type=IMAGE_BASE64_TYPE, + reader_cfg=refcocog_reader_cfg, + infer_cfg=refcocog_infer_cfg, + eval_cfg=refcocog_eval_cfg, + ) + for split in _splits +] \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py index e4f94a76..1590279f 100644 --- a/ais_bench/benchmark/datasets/refcoco/__init__.py +++ b/ais_bench/benchmark/datasets/refcoco/__init__.py @@ -1,3 +1,9 @@ -from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess # noqa: F401 +from ais_bench.benchmark.datasets.refcoco.refcoco import ( # noqa: F401 + IMAGE_BASE64_TYPE, + IMAGE_PATH_TYPE, + TEMP_IMAGE_STORE_DIR, + RefCOCODataset, + refcoco_bbox_postprocess, +) from ais_bench.benchmark.datasets.refcoco.refcoco_g import RefCOCOgDataset # noqa: F401 from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset # noqa: F401 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py index 9bf4413f..c5aa814d 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py @@ -4,6 +4,9 @@ import os import re +from abc import ABC, abstractmethod +from typing import Any + import pandas as pd from PIL import Image @@ -12,56 +15,93 @@ from ais_bench.benchmark.datasets.utils.datasets import get_content_str from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS from ais_bench.benchmark.datasets.utils.datasets import get_data_path +from ais_bench.benchmark.utils.image_process import pil_to_base64 from ais_bench.benchmark.utils.logging import AISLogger from ..base import BaseDataset logger = AISLogger() +IMAGE_PATH_TYPE = 'image_path' +IMAGE_BASE64_TYPE = 'image_base64' + REFCOCO_PROMPT_TEMPLATE = ( 'Locate every object that matches the description "{ref_sentence}" ' 'in the image. Report bbox coordinates in JSON format.' ) +TEMP_IMAGE_STORE_DIR = 'temp_save_images' -def _remove_leading_articles(text: str) -> str: - cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE) - return cleaned_text or text.strip() - - -def parse_float_sequence_within(input_str: str): +def _parse_float_sequence_within(input_str: str) -> list[float]: """Extract the first sequence of four floats inside square brackets.""" pattern = r'\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]' match = re.search(pattern, input_str) if match: return [float(match.group(i)) for i in range(1, 5)] - return [0.0, 0.0, 0.0, 0.0] # Default bbox if parsing fails + return [0.0, 0.0, 0.0, 0.0] -@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000') -def refcoco_bbox_postprocess(text) -> list: - if not isinstance(text, str): - raise ValueError('Prediction must be a string') +def _remove_leading_articles(text: str) -> str: + cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE) + return cleaned_text or text.strip() + +@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000') +def refcoco_bbox_postprocess(text: str) -> list[float]: stripped_text = text.strip() - bbox = parse_float_sequence_within(stripped_text) + bbox = _parse_float_sequence_within(stripped_text) logger.debug(f'refcoco_bbox_postprocess: bbox={bbox}') return bbox -@LOAD_DATASET.register_module() -class RefCOCODataset(BaseDataset): - TEMP_IMAGE_STORE_DIR = 'temp_save_images' +class ImageResolver(ABC): + """Strategy interface for converting a PIL image into a transport value.""" - @staticmethod - def _generate_image_store_dir(resolved_path: str, split: str) -> str: - image_root_path = os.path.join( - os.path.dirname(resolved_path), - RefCOCODataset.TEMP_IMAGE_STORE_DIR, + @abstractmethod + def setup(self, resolved_path: str, split: str) -> None: + ... + + @abstractmethod + def resolve(self, pil_img: Image.Image, file_name: str) -> str: + ... + + +class PathImageResolver(ImageResolver): + def setup(self, resolved_path: str, split: str) -> None: + image_cache_path = os.path.join( + resolved_path, + TEMP_IMAGE_STORE_DIR, + split, ) - return os.path.join(image_root_path, split) + logger.info(f'Saving RefCOCO images to {image_cache_path}') + os.makedirs(image_cache_path, exist_ok=True) + self._cache_dir = image_cache_path + def resolve(self, pil_img: Image.Image, file_name: str) -> str: + image_path = os.path.join(self._cache_dir, file_name) + os.makedirs(os.path.dirname(image_path), exist_ok=True) + if not os.path.exists(image_path): + pil_img.save(image_path, format='JPEG') + return image_path + + +class Base64ImageResolver(ImageResolver): + def setup(self, resolved_path: str, split: str) -> None: + logger.info(f'Encoding RefCOCO images as base64 for split {split}') + + def resolve(self, pil_img: Image.Image, file_name: str) -> str: + return pil_to_base64(pil_img, format='JPEG') + + +IMAGE_RESOLVERS = { + IMAGE_PATH_TYPE: PathImageResolver, + IMAGE_BASE64_TYPE: Base64ImageResolver, +} + + +@LOAD_DATASET.register_module() +class RefCOCODataset(BaseDataset): @staticmethod def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame: shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet'))) @@ -74,77 +114,69 @@ def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame: return pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True) @staticmethod - def _persist_image_if_not_exist(image_payload, image_name: str, image_root_dir: str, row_index: int) -> tuple[str, int, int]: + def _decode_image_payload(image_payload: Any, row_index: int) -> Image.Image: if not isinstance(image_payload, dict) or 'bytes' not in image_payload: raise ValueError(f'RefCOCO row {row_index} has invalid image payload: {type(image_payload)}') - pil_img = Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB') - image_path = os.path.join(image_root_dir, image_name) - os.makedirs(os.path.dirname(image_path), exist_ok=True) - if not os.path.exists(image_path): - pil_img.save(image_path, format='JPEG') - return image_path, pil_img.width, pil_img.height + return Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB') @staticmethod - def _build_pixel_bbox(raw_bbox) -> list[float]: + def _build_pixel_bbox(raw_bbox: Any) -> list[float]: x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox] return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height] @staticmethod - def _build_prompt(answer_text) -> str: + def _build_prompt(answer_text: Any) -> str: ref_sentence = _remove_leading_articles(str(answer_text)) return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence) @staticmethod - def _build_answer_payload(question_id, pixel_bbox: list[float], width: int, height: int) -> str: - return json.dumps({ - 'question_id': int(question_id), - 'bbox': pixel_bbox, + def _build_rows( + sample: pd.Series, + image_value: str, + width: int, + height: int, + ) -> list[dict[str, str]]: + reference_answer = json.dumps({ + 'question_id': int(sample['question_id']), + 'bbox': RefCOCODataset._build_pixel_bbox(sample['bbox']), 'image_width': width, 'image_height': height, }) - @staticmethod - def _build_rows(sample, image_path: str, width: int, height: int, pixel_bbox: list[float]) -> list[dict]: - rows = [] - reference_answer = RefCOCODataset._build_answer_payload( - sample['question_id'], - pixel_bbox, - width, - height, - ) - + rows: list[dict[str, str]] = [] for answer_text in sample['answer']: prompt = RefCOCODataset._build_prompt(answer_text) content = get_content_str([ - {'type': 'image_url', 'image_url': image_path}, + {'type': 'image_url', 'image_url': image_value}, {'type': 'text', 'text': prompt}, ]) rows.append({ 'content': content, 'question': prompt, - 'image': image_path, + 'image': image_value, 'answer': reference_answer, }) return rows @staticmethod - def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + def load(path: str, split: str, **kwargs: Any) -> Dataset: # pyright: ignore[reportIncompatibleMethodOverride] """Load a RefCOCO split and normalize it into benchmark rows. The source data is stored as parquet shards under ``path`` with shard names matching ``-*.parquet``. Each source row contains an image payload, a ground-truth bounding box in ``[x, y, w, h]`` format, and a - list of referring expressions. This loader persists each image to - ``RefCOCO_images//``, converts the bbox to - ``[x_min, y_min, x_max, y_max]``, and expands the answer list into one - benchmark row per referring expression. + list of referring expressions. This loader can either persist each image + to a split-specific cache directory or encode it as base64, converts the + bbox to ``[x_min, y_min, x_max, y_max]``, and expands the answer list + into one benchmark row per referring expression. Args: path: Dataset root containing RefCOCO parquet shards. split: Split prefix to load, for example ``val`` or ``testA``. - **kwargs: Unused extra keyword arguments passed by the dataset - builder. + **kwargs: Extra keyword arguments passed by the dataset builder. + Supported key: ``image_type`` with values ``IMAGE_PATH_TYPE`` or + ``IMAGE_BASE64_TYPE``. Returns: A HuggingFace ``Dataset`` whose rows contain ``content`` for @@ -152,28 +184,22 @@ def load(path, split, **kwargs): # pyright: ignore[reportIncompatibleMethodOver bbox payload used by evaluation. """ resolved_path = get_data_path(path) - image_root_dir = RefCOCODataset._generate_image_store_dir(resolved_path, split) - logger.info(f'Saving RefCOCO images to {image_root_dir}') + image_type = kwargs.get('image_type', IMAGE_PATH_TYPE) + if image_type not in IMAGE_RESOLVERS: + raise ValueError( + f'Unsupported image_type: {image_type}. Expected one of {sorted(IMAGE_RESOLVERS)}' + ) data = RefCOCODataset._load_split_dataframe(resolved_path, split) - os.makedirs(image_root_dir, exist_ok=True) + resolver = IMAGE_RESOLVERS[image_type]() + resolver.setup(resolved_path, split) - rows = [] + rows: list[dict[str, str]] = [] for row_index, (_, sample) in enumerate(data.iterrows()): - image_path, width, height = RefCOCODataset._persist_image_if_not_exist( - sample['image'], - sample['file_name'], - image_root_dir, - row_index, - ) - pixel_bbox = RefCOCODataset._build_pixel_bbox(sample['bbox']) - rows.extend( - RefCOCODataset._build_rows( - sample, - image_path, - width, - height, - pixel_bbox, - ) - ) + pil_img = RefCOCODataset._decode_image_payload(sample['image'], row_index) + image_value = resolver.resolve(pil_img, sample['file_name']) + + width, height = pil_img.width, pil_img.height + sample_rows = RefCOCODataset._build_rows(sample, image_value, width, height) + rows.extend(sample_rows) return Dataset.from_list(rows) From 7d6802e1cd774bc4511b7356267bb52fcafa8be8 Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Fri, 20 Mar 2026 10:17:05 +0800 Subject: [PATCH 7/8] [refactor] avoid index error and type error to raise to user --- .../openicl/icl_evaluator/bbox_iou_evaluator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py index 6f608d7f..f514b2e1 100644 --- a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py +++ b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py @@ -71,11 +71,6 @@ def score(self, predictions, references): # pyright: ignore[reportIncompatibleM details = [] scores = [] for pred, ref in zip(predictions, references): - refer = json.loads(ref) if isinstance(ref, str) else ref - gt_box = [float(value) for value in refer[self.reference_bbox_key]] - image_width = float(refer[self.image_width_key]) - image_height = float(refer[self.image_height_key]) - detail = { 'pred': pred, 'answer': ref, @@ -84,14 +79,19 @@ def score(self, predictions, references): # pyright: ignore[reportIncompatibleM } try: + refer = json.loads(ref) if isinstance(ref, str) else ref + image_width = float(refer[self.image_width_key]) + image_height = float(refer[self.image_height_key]) pred_box_pixel = self._scale_prediction(pred, image_width, image_height) + gt_box = [float(value) for value in refer[self.reference_bbox_key]] + iou = _compute_iou(pred_box_pixel, gt_box) correct = iou >= self.iou_threshold detail['correct'] = correct detail['iou'] = iou detail['pred_bbox_pixel'] = pred_box_pixel scores.append(1 if correct else 0) - except (TypeError, ValueError, KeyError, json.JSONDecodeError) as error: + except (TypeError, ValueError, KeyError, json.JSONDecodeError, IndexError) as error: detail['iou'] = 0.0 detail['pred_bbox_pixel'] = None detail['invalid'] = True From 771f195617467651a775f449e1694fa17157c2e1 Mon Sep 17 00:00:00 2001 From: zhongzhoutan <1710115119@bjmu.edu.cn> Date: Fri, 20 Mar 2026 15:15:55 +0800 Subject: [PATCH 8/8] [refactor] remove unused image and question, also move the prompt to datasets config --- .../configs/datasets/refcoco/refcoco_gen.py | 5 ++- .../datasets/refcoco/refcoco_gen_base64.py | 5 ++- .../datasets/refcoco_plus/refcoco_plus_gen.py | 5 ++- .../refcoco_plus/refcoco_plus_gen_base64.py | 5 ++- .../configs/datasets/refcocog/refcocog_gen.py | 5 ++- .../datasets/refcocog/refcocog_gen_base64.py | 5 ++- .../benchmark/datasets/refcoco/refcoco.py | 31 +++++++++---------- 7 files changed, 38 insertions(+), 23 deletions(-) diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py index b4e2b311..008119cb 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, }) ] diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py index a261ab06..d807dbce 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py +++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, }) ] diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py index 03b10fd4..d505dd74 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, }) ] diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py index 8b1b41b3..5804cb61 100644 --- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py +++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, }) ] diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py index c1504f7a..50cbb852 100644 --- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py +++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}}, }) ] diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py index eedcda7a..cf6eb915 100644 --- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py +++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py @@ -17,7 +17,10 @@ template=dict( round=[ dict(role='HUMAN', prompt_mm={ - 'text': {'type': 'text', 'text': '{question}'}, + 'text': { + 'type': 'text', + 'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.' + }, 'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}, }) ] diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py index c5aa814d..319e4cfa 100644 --- a/ais_bench/benchmark/datasets/refcoco/refcoco.py +++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py @@ -25,11 +25,6 @@ IMAGE_PATH_TYPE = 'image_path' IMAGE_BASE64_TYPE = 'image_base64' -REFCOCO_PROMPT_TEMPLATE = ( - 'Locate every object that matches the description "{ref_sentence}" ' - 'in the image. Report bbox coordinates in JSON format.' -) - TEMP_IMAGE_STORE_DIR = 'temp_save_images' def _parse_float_sequence_within(input_str: str) -> list[float]: @@ -125,11 +120,6 @@ def _build_pixel_bbox(raw_bbox: Any) -> list[float]: x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox] return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height] - @staticmethod - def _build_prompt(answer_text: Any) -> str: - ref_sentence = _remove_leading_articles(str(answer_text)) - return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence) - @staticmethod def _build_rows( sample: pd.Series, @@ -146,15 +136,12 @@ def _build_rows( rows: list[dict[str, str]] = [] for answer_text in sample['answer']: - prompt = RefCOCODataset._build_prompt(answer_text) content = get_content_str([ {'type': 'image_url', 'image_url': image_value}, - {'type': 'text', 'text': prompt}, + {'type': 'text', 'text': answer_text}, ]) rows.append({ 'content': content, - 'question': prompt, - 'image': image_value, 'answer': reference_answer, }) return rows @@ -171,6 +158,14 @@ def load(path: str, split: str, **kwargs: Any) -> Dataset: # pyright: ignore[re bbox to ``[x_min, y_min, x_max, y_max]``, and expands the answer list into one benchmark row per referring expression. + Each output row has a ``content`` field that encodes the image and + referring expression together using ``AIS_CONTENT_TAG`` delimiters + (via :func:`get_content_str`). During inference the + :meth:`PromptList.format_mm` method splits ``content`` on + ``AIS_CONTENT_TAG`` and uses the ``AIS_IMAGE_START`` / + ``AIS_TEXT_START`` prefixes to populate the ``prompt_mm`` template + with the image URL and question text respectively. + Args: path: Dataset root containing RefCOCO parquet shards. split: Split prefix to load, for example ``val`` or ``testA``. @@ -179,9 +174,11 @@ def load(path: str, split: str, **kwargs: Any) -> Dataset: # pyright: ignore[re ``IMAGE_BASE64_TYPE``. Returns: - A HuggingFace ``Dataset`` whose rows contain ``content`` for - multimodal prompting and ``answer`` as the serialized reference - bbox payload used by evaluation. + A HuggingFace ``Dataset`` with columns: + - content: encoded multimodal string consumed by + ``format_mm`` to fill the ``prompt_mm`` template. + - answer: JSON-serialized reference bbox payload used by + evaluation. """ resolved_path = get_data_path(path) image_type = kwargs.get('image_type', IMAGE_PATH_TYPE)