From 59facd22682704ddf68e64c93c2a6170114596cb Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Tue, 17 Mar 2026 09:44:20 +0800
Subject: [PATCH 1/8] [feature] add refcoco support

---
 .../configs/datasets/refcoco/refcoco_gen.py   |  53 +++++++++
 ais_bench/benchmark/datasets/__init__.py      |   1 +
 .../benchmark/datasets/refcoco/__init__.py    |   1 +
 .../benchmark/datasets/refcoco/refcoco.py     |  91 +++++++++++++++
 .../openicl/icl_evaluator/__init__.py         |   1 +
 .../icl_evaluator/bbox_iou_evaluator.py       | 106 ++++++++++++++++++
 6 files changed, 253 insertions(+)
 create mode 100644 ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
 create mode 100644 ais_bench/benchmark/datasets/refcoco/__init__.py
 create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco.py
 create mode 100644 ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py

diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
new file mode 100644
index 00000000..e278ac18
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
@@ -0,0 +1,53 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCODataset
+from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcoco_reader_cfg = dict(
+    input_columns=['ref_sentence', 'image'],
+    output_column='answer'
+)
+
+refcoco_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': 'Locate every object that matches the description "{ref_sentence}" in the image. Report bbox coordinates in JSON format.'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcoco_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    ('RefCOCO_val', 'val'),
+    ('RefCOCO_test', 'test'),
+    ('RefCOCO_testA', 'testA'),
+    ('RefCOCO_testB', 'testB'),
+]
+
+refcoco_datasets = [
+    dict(
+        abbr=abbr,
+        type=RefCOCODataset,
+        path='ais_bench/datasets/RefCOCO/data',
+        split=split,
+        reader_cfg=refcoco_reader_cfg,
+        infer_cfg=refcoco_infer_cfg,
+        eval_cfg=refcoco_eval_cfg,
+    )
+    for abbr, split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/__init__.py b/ais_bench/benchmark/datasets/__init__.py
index 1581a2af..3f0d8425 100644
--- a/ais_bench/benchmark/datasets/__init__.py
+++ b/ais_bench/benchmark/datasets/__init__.py
@@ -53,3 +53,4 @@
 from ais_bench.benchmark.datasets.mmstar import * # noqa: F401, F403
 from ais_bench.benchmark.datasets.dapo_math import * # noqa: F401, F403
 from ais_bench.benchmark.datasets.mooncake_trace import * # noqa: F401, F403
+from ais_bench.benchmark.datasets.refcoco import *  # noqa: F401, F403
diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py
new file mode 100644
index 00000000..b95ad54f
--- /dev/null
+++ b/ais_bench/benchmark/datasets/refcoco/__init__.py
@@ -0,0 +1 @@
+from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess  # noqa: F401
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py
new file mode 100644
index 00000000..0e8707cf
--- /dev/null
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py
@@ -0,0 +1,91 @@
+import glob
+import io
+import json
+import os
+import re
+
+import pandas as pd
+from PIL import Image
+
+from datasets import Dataset
+
+from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+from ais_bench.benchmark.datasets.utils.datasets import get_data_path
+from ais_bench.benchmark.utils.image_process import pil_to_base64
+from ais_bench.benchmark.utils.logging import AISLogger
+
+from ..base import BaseDataset
+
+logger = AISLogger()
+
+
+def _remove_leading_articles(text: str) -> str:
+    cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE)
+    return cleaned_text or text.strip()
+
+
+def parse_float_sequence_within(input_str: str):
+    """Extract the first sequence of four floats inside square brackets."""
+    pattern = r'\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]'
+    match = re.search(pattern, input_str)
+    if match:
+        return [float(match.group(i)) for i in range(1, 5)]
+    return [0.0, 0.0, 0.0, 0.0]  # Default bbox if parsing fails
+
+
+@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000')
+def refcoco_bbox_postprocess(text) -> list:
+    if not isinstance(text, str):
+        raise ValueError('Prediction must be a string')
+
+    stripped_text = text.strip()
+    bbox = parse_float_sequence_within(stripped_text)
+
+    logger.debug(f'refcoco_bbox_postprocess: bbox={bbox}')
+    return bbox
+
+
+@LOAD_DATASET.register_module()
+class RefCOCODataset(BaseDataset):
+
+    @staticmethod
+    def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOverride]
+        resolved_path = get_data_path(path)
+        shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet')))
+        if not shard_paths:
+            raise FileNotFoundError(
+                f'No RefCOCO parquet shards found for split {split} in {resolved_path}'
+            )
+
+        logger.info(f'Loading RefCOCO split {split} from {len(shard_paths)} shard(s) in {resolved_path}')
+        data = pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True)
+
+        rows = []
+        for i in range(len(data)):
+            line = data.iloc[i]
+            img_field = line['image']
+            if not isinstance(img_field, dict) or 'bytes' not in img_field:
+                raise ValueError(f'RefCOCO row {i} has invalid image payload: {type(img_field)}')
+
+            pil_img = Image.open(io.BytesIO(img_field['bytes'])).convert('RGB')
+            width, height = pil_img.width, pil_img.height
+            image_b64 = pil_to_base64(pil_img, format='JPEG')
+
+            x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in line['bbox']]
+            pixel_bbox = [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height]
+
+            for answer_text in line['answer']:
+                ref_sentence = _remove_leading_articles(str(answer_text))
+                answer = json.dumps({
+                    'question_id': int(line['question_id']),
+                    'bbox': pixel_bbox,
+                    'image_width': width,
+                    'image_height': height,
+                })
+                rows.append({
+                    'ref_sentence': ref_sentence,
+                    'image': image_b64,
+                    'answer': answer,
+                })
+
+        return Dataset.from_list(rows)
diff --git a/ais_bench/benchmark/openicl/icl_evaluator/__init__.py b/ais_bench/benchmark/openicl/icl_evaluator/__init__.py
index 6a622d2a..8e72243b 100644
--- a/ais_bench/benchmark/openicl/icl_evaluator/__init__.py
+++ b/ais_bench/benchmark/openicl/icl_evaluator/__init__.py
@@ -1,4 +1,5 @@
 from ais_bench.benchmark.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator  # noqa
+from ais_bench.benchmark.openicl.icl_evaluator.bbox_iou_evaluator import BBoxIoUEvaluator  # noqa
 from ais_bench.benchmark.openicl.icl_evaluator.icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from ais_bench.benchmark.openicl.icl_evaluator.math_evaluator import MATHEvaluator # noqa
 from ais_bench.benchmark.openicl.icl_evaluator.icl_hf_evaluator import *  # noqa
diff --git a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py
new file mode 100644
index 00000000..6f608d7f
--- /dev/null
+++ b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py
@@ -0,0 +1,106 @@
+import json
+
+from ais_bench.benchmark.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
+from ais_bench.benchmark.registry import ICL_EVALUATORS
+
+
+def _compute_iou(box1: list, box2: list) -> float:
+    x_left = max(box1[0], box2[0])
+    y_top = max(box1[1], box2[1])
+    x_right = min(box1[2], box2[2])
+    y_bottom = min(box1[3], box2[3])
+
+    inter = max(0.0, x_right - x_left) * max(0.0, y_bottom - y_top)
+    area1 = max(0.0, box1[2] - box1[0]) * max(0.0, box1[3] - box1[1])
+    area2 = max(0.0, box2[2] - box2[0]) * max(0.0, box2[3] - box2[1])
+    union = area1 + area2 - inter
+    return inter / union if union > 0 else 0.0
+
+
+@ICL_EVALUATORS.register_module()
+class BBoxIoUEvaluator(BaseEvaluator):
+
+    def __init__(self,
+                 iou_threshold: float = 0.5,
+                 coord_scale: float = 1000.0,
+                 reference_bbox_key: str = 'bbox',
+                 image_width_key: str = 'image_width',
+                 image_height_key: str = 'image_height',
+                 metric_prefix: str = 'Accuracy',
+                 clip_to_image: bool = True) -> None:
+        super().__init__()
+        self.iou_threshold = iou_threshold
+        self.coord_scale = coord_scale
+        self.reference_bbox_key = reference_bbox_key
+        self.image_width_key = image_width_key
+        self.image_height_key = image_height_key
+        self.metric_prefix = metric_prefix
+        self.clip_to_image = clip_to_image
+
+    def _scale_prediction(self, pred_box: list, image_width: float, image_height: float) -> list:
+        if len(pred_box) != 4:
+            raise ValueError('Predicted bbox must contain four coordinates')
+
+        scaled_box = [
+            float(pred_box[0]) / self.coord_scale * float(image_width),
+            float(pred_box[1]) / self.coord_scale * float(image_height),
+            float(pred_box[2]) / self.coord_scale * float(image_width),
+            float(pred_box[3]) / self.coord_scale * float(image_height),
+        ]
+
+        if self.clip_to_image:
+            scaled_box = [
+                min(max(scaled_box[0], 0.0), float(image_width)),
+                min(max(scaled_box[1], 0.0), float(image_height)),
+                min(max(scaled_box[2], 0.0), float(image_width)),
+                min(max(scaled_box[3], 0.0), float(image_height)),
+            ]
+
+        if scaled_box[2] <= scaled_box[0] or scaled_box[3] <= scaled_box[1]:
+            raise ValueError('Predicted bbox is reversed or empty after scaling')
+        return scaled_box
+
+    def score(self, predictions, references):  # pyright: ignore[reportIncompatibleMethodOverride]
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                f'length. len(predictions): {len(predictions)}, '
+                f'len(references): {len(references)}'
+            }
+
+        details = []
+        scores = []
+        for pred, ref in zip(predictions, references):
+            refer = json.loads(ref) if isinstance(ref, str) else ref
+            gt_box = [float(value) for value in refer[self.reference_bbox_key]]
+            image_width = float(refer[self.image_width_key])
+            image_height = float(refer[self.image_height_key])
+
+            detail = {
+                'pred': pred,
+                'answer': ref,
+                'correct': False,
+                'coord_mode': f'0-{int(self.coord_scale)}',
+            }
+
+            try:
+                pred_box_pixel = self._scale_prediction(pred, image_width, image_height)
+                iou = _compute_iou(pred_box_pixel, gt_box)
+                correct = iou >= self.iou_threshold
+                detail['correct'] = correct
+                detail['iou'] = iou
+                detail['pred_bbox_pixel'] = pred_box_pixel
+                scores.append(1 if correct else 0)
+            except (TypeError, ValueError, KeyError, json.JSONDecodeError) as error:
+                detail['iou'] = 0.0
+                detail['pred_bbox_pixel'] = None
+                detail['invalid'] = True
+                detail['error'] = str(error)
+                scores.append(0)
+
+            details.append(detail)
+
+        return {
+            f'{self.metric_prefix}@{self.iou_threshold}': 100 * sum(scores) / len(scores) if scores else 0.0,
+            'details': details,
+        }
\ No newline at end of file

From f003fbdd046a789777f57ac90973b9dc7bf88f3b Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Tue, 17 Mar 2026 15:06:01 +0800
Subject: [PATCH 2/8] [feat] save images to local disk instead of storing them
 in shared memory

---
 .../configs/datasets/refcoco/refcoco_gen.py   |   6 +-
 .../benchmark/datasets/refcoco/refcoco.py     | 148 ++++++++++++++----
 2 files changed, 121 insertions(+), 33 deletions(-)

diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
index e278ac18..46e30b65 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
@@ -7,7 +7,7 @@
 
 
 refcoco_reader_cfg = dict(
-    input_columns=['ref_sentence', 'image'],
+    input_columns=['question', 'image'],
     output_column='answer'
 )
 
@@ -17,8 +17,8 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': 'Locate every object that matches the description "{ref_sentence}" in the image. Report bbox coordinates in JSON format.'},
-                    'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
                 })
             ]
         )
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py
index 0e8707cf..4e87d7f2 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py
@@ -9,15 +9,20 @@
 
 from datasets import Dataset
 
+from ais_bench.benchmark.datasets.utils.datasets import get_content_str
 from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
 from ais_bench.benchmark.datasets.utils.datasets import get_data_path
-from ais_bench.benchmark.utils.image_process import pil_to_base64
 from ais_bench.benchmark.utils.logging import AISLogger
 
 from ..base import BaseDataset
 
 logger = AISLogger()
 
+REFCOCO_PROMPT_TEMPLATE = (
+    'Locate every object that matches the description "{ref_sentence}" '
+    'in the image. Report bbox coordinates in JSON format.'
+)
+
 
 def _remove_leading_articles(text: str) -> str:
     cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE)
@@ -47,10 +52,18 @@ def refcoco_bbox_postprocess(text) -> list:
 
 @LOAD_DATASET.register_module()
 class RefCOCODataset(BaseDataset):
+    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCO_images'
 
     @staticmethod
-    def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOverride]
-        resolved_path = get_data_path(path)
+    def _generate_image_store_dir(resolved_path: str, split: str) -> str:
+        image_root_path = os.path.join(
+            os.path.dirname(resolved_path),
+            RefCOCODataset.TEMP_REFCOCO_IMAGE_STORE_DIR,
+        )
+        return os.path.join(image_root_path, split)
+
+    @staticmethod
+    def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame:
         shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet')))
         if not shard_paths:
             raise FileNotFoundError(
@@ -58,34 +71,109 @@ def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOver
             )
 
         logger.info(f'Loading RefCOCO split {split} from {len(shard_paths)} shard(s) in {resolved_path}')
-        data = pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True)
+        return pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True)
+
+    @staticmethod
+    def _persist_image_if_not_exist(image_payload, image_name: str, image_root_dir: str, row_index: int) -> tuple[str, int, int]:
+        if not isinstance(image_payload, dict) or 'bytes' not in image_payload:
+            raise ValueError(f'RefCOCO row {row_index} has invalid image payload: {type(image_payload)}')
+
+        pil_img = Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB')
+        image_path = os.path.join(image_root_dir, image_name)
+        os.makedirs(os.path.dirname(image_path), exist_ok=True)
+        if not os.path.exists(image_path):
+            pil_img.save(image_path, format='JPEG')
+        return image_path, pil_img.width, pil_img.height
+
+    @staticmethod
+    def _build_pixel_bbox(raw_bbox) -> list[float]:
+        x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox]
+        return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height]
+
+    @staticmethod
+    def _build_prompt(answer_text) -> str:
+        ref_sentence = _remove_leading_articles(str(answer_text))
+        return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence)
 
+    @staticmethod
+    def _build_answer_payload(question_id, pixel_bbox: list[float], width: int, height: int) -> str:
+        return json.dumps({
+            'question_id': int(question_id),
+            'bbox': pixel_bbox,
+            'image_width': width,
+            'image_height': height,
+        })
+
+    @staticmethod
+    def _build_rows(sample, image_path: str, width: int, height: int, pixel_bbox: list[float]) -> list[dict]:
         rows = []
-        for i in range(len(data)):
-            line = data.iloc[i]
-            img_field = line['image']
-            if not isinstance(img_field, dict) or 'bytes' not in img_field:
-                raise ValueError(f'RefCOCO row {i} has invalid image payload: {type(img_field)}')
-
-            pil_img = Image.open(io.BytesIO(img_field['bytes'])).convert('RGB')
-            width, height = pil_img.width, pil_img.height
-            image_b64 = pil_to_base64(pil_img, format='JPEG')
-
-            x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in line['bbox']]
-            pixel_bbox = [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height]
-
-            for answer_text in line['answer']:
-                ref_sentence = _remove_leading_articles(str(answer_text))
-                answer = json.dumps({
-                    'question_id': int(line['question_id']),
-                    'bbox': pixel_bbox,
-                    'image_width': width,
-                    'image_height': height,
-                })
-                rows.append({
-                    'ref_sentence': ref_sentence,
-                    'image': image_b64,
-                    'answer': answer,
-                })
+        reference_answer = RefCOCODataset._build_answer_payload(
+            sample['question_id'],
+            pixel_bbox,
+            width,
+            height,
+        )
+
+        for answer_text in sample['answer']:
+            prompt = RefCOCODataset._build_prompt(answer_text)
+            content = get_content_str([
+                {'type': 'image_url', 'image_url': image_path},
+                {'type': 'text', 'text': prompt},
+            ])
+            rows.append({
+                'content': content,
+                'question': prompt,
+                'image': image_path,
+                'answer': reference_answer,
+            })
+        return rows
+
+    @staticmethod
+    def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOverride]
+        """Load a RefCOCO split and normalize it into benchmark rows.
+
+        The source data is stored as parquet shards under ``path`` with shard
+        names matching ``<split>-*.parquet``. Each source row contains an image
+        payload, a ground-truth bounding box in ``[x, y, w, h]`` format, and a
+        list of referring expressions. This loader persists each image to
+        ``RefCOCO_images/<split>/<file_name>``, converts the bbox to
+        ``[x_min, y_min, x_max, y_max]``, and expands the answer list into one
+        benchmark row per referring expression.
+
+        Args:
+            path: Dataset root containing RefCOCO parquet shards.
+            split: Split prefix to load, for example ``val`` or ``testA``.
+            **kwargs: Unused extra keyword arguments passed by the dataset
+                builder.
+
+        Returns:
+            A HuggingFace ``Dataset`` whose rows contain ``content`` for
+            multimodal prompting and ``answer`` as the serialized reference
+            bbox payload used by evaluation.
+        """
+        resolved_path = get_data_path(path)
+        image_root_dir = RefCOCODataset._generate_image_store_dir(resolved_path, split)
+        logger.info(f'Saving RefCOCO images to {image_root_dir}')
+        data = RefCOCODataset._load_split_dataframe(resolved_path, split)
+        os.makedirs(image_root_dir, exist_ok=True)
+
+        rows = []
+        for row_index, (_, sample) in enumerate(data.iterrows()):
+            image_path, width, height = RefCOCODataset._persist_image_if_not_exist(
+                sample['image'],
+                sample['file_name'],
+                image_root_dir,
+                row_index,
+            )
+            pixel_bbox = RefCOCODataset._build_pixel_bbox(sample['bbox'])
+            rows.extend(
+                RefCOCODataset._build_rows(
+                    sample,
+                    image_path,
+                    width,
+                    height,
+                    pixel_bbox,
+                )
+            )
 
         return Dataset.from_list(rows)

From 224c731fcc8f043ab790aa7dff5c47bd3e6a84ec Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Tue, 17 Mar 2026 19:20:53 +0800
Subject: [PATCH 3/8] [feature] add refcoco plus support

---
 .../datasets/refcoco_plus/refcoco_plus_gen.py | 52 +++++++++++++++++++
 .../benchmark/datasets/refcoco/__init__.py    |  3 +-
 .../datasets/refcoco/refcoco_plus.py          |  8 +++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
 create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco_plus.py

diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
new file mode 100644
index 00000000..989f9230
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
@@ -0,0 +1,52 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCOPlusDataset
+from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcoco_plus_reader_cfg = dict(
+    input_columns=['content'],
+    output_column='answer'
+)
+
+refcoco_plus_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcoco_plus_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    ('RefCOCOPlus_val', 'val'),
+    ('RefCOCOPlus_testA', 'testA'),
+    ('RefCOCOPlus_testB', 'testB'),
+]
+
+refcoco_plus_datasets = [
+    dict(
+        abbr=abbr,
+        type=RefCOCOPlusDataset,
+        path='ais_bench/datasets/RefCOCOplus/data',
+        split=split,
+        reader_cfg=refcoco_plus_reader_cfg,
+        infer_cfg=refcoco_plus_infer_cfg,
+        eval_cfg=refcoco_plus_eval_cfg,
+    )
+    for abbr, split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py
index b95ad54f..6e5e4fcd 100644
--- a/ais_bench/benchmark/datasets/refcoco/__init__.py
+++ b/ais_bench/benchmark/datasets/refcoco/__init__.py
@@ -1 +1,2 @@
-from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess  # noqa: F401
\ No newline at end of file
+from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess  # noqa: F401
+from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset  # noqa: F401
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py
new file mode 100644
index 00000000..7415e65f
--- /dev/null
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py
@@ -0,0 +1,8 @@
+from ais_bench.benchmark.registry import LOAD_DATASET
+
+from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset
+
+
+@LOAD_DATASET.register_module()
+class RefCOCOPlusDataset(RefCOCODataset):
+    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOPlus_images'

From 8d180d5b6236b5de76d7157fe693b60b19dffa09 Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Tue, 17 Mar 2026 19:28:14 +0800
Subject: [PATCH 4/8] [feature] add refcocog support

---
 .../configs/datasets/refcocog/refcocog_gen.py | 51 +++++++++++++++++++
 .../benchmark/datasets/refcoco/__init__.py    |  1 +
 .../benchmark/datasets/refcoco/refcoco_g.py   |  8 +++
 3 files changed, 60 insertions(+)
 create mode 100644 ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
 create mode 100644 ais_bench/benchmark/datasets/refcoco/refcoco_g.py

diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
new file mode 100644
index 00000000..c4429bb3
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
@@ -0,0 +1,51 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCOgDataset
+from ais_bench.benchmark.datasets.refcoco import refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcocog_reader_cfg = dict(
+    input_columns=['content'],
+    output_column='answer'
+)
+
+refcocog_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcocog_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    ('RefCOCOg_val', 'val'),
+    ('RefCOCOg_test', 'test'),
+]
+
+refcocog_datasets = [
+    dict(
+        abbr=abbr,
+        type=RefCOCOgDataset,
+        path='ais_bench/datasets/RefCOCOg/data',
+        split=split,
+        reader_cfg=refcocog_reader_cfg,
+        infer_cfg=refcocog_infer_cfg,
+        eval_cfg=refcocog_eval_cfg,
+    )
+    for abbr, split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py
index 6e5e4fcd..e4f94a76 100644
--- a/ais_bench/benchmark/datasets/refcoco/__init__.py
+++ b/ais_bench/benchmark/datasets/refcoco/__init__.py
@@ -1,2 +1,3 @@
 from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess  # noqa: F401
+from ais_bench.benchmark.datasets.refcoco.refcoco_g import RefCOCOgDataset  # noqa: F401
 from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset  # noqa: F401
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py
new file mode 100644
index 00000000..efee0302
--- /dev/null
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py
@@ -0,0 +1,8 @@
+from ais_bench.benchmark.registry import LOAD_DATASET
+
+from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset
+
+
+@LOAD_DATASET.register_module()
+class RefCOCOgDataset(RefCOCODataset):
+    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOg_images'

From 8aaa9fab3ccd06a1fe41e2f63a88284b9b503af4 Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Tue, 17 Mar 2026 20:20:07 +0800
Subject: [PATCH 5/8] [refactor] use a more general dir name for the saved
 images

---
 ais_bench/benchmark/datasets/refcoco/refcoco.py      |  4 ++--
 ais_bench/benchmark/datasets/refcoco/refcoco_g.py    |  9 ++++++++-
 ais_bench/benchmark/datasets/refcoco/refcoco_plus.py | 10 +++++++++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py
index 4e87d7f2..9bf4413f 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py
@@ -52,13 +52,13 @@ def refcoco_bbox_postprocess(text) -> list:
 
 @LOAD_DATASET.register_module()
 class RefCOCODataset(BaseDataset):
-    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCO_images'
+    TEMP_IMAGE_STORE_DIR = 'temp_save_images'
 
     @staticmethod
     def _generate_image_store_dir(resolved_path: str, split: str) -> str:
         image_root_path = os.path.join(
             os.path.dirname(resolved_path),
-            RefCOCODataset.TEMP_REFCOCO_IMAGE_STORE_DIR,
+            RefCOCODataset.TEMP_IMAGE_STORE_DIR,
         )
         return os.path.join(image_root_path, split)
 
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py
index efee0302..6e32abb7 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco_g.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco_g.py
@@ -5,4 +5,11 @@
 
 @LOAD_DATASET.register_module()
 class RefCOCOgDataset(RefCOCODataset):
-    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOg_images'
+    """
+    RefCOCOg is a variant of RefCOCO with more complex referring expressions.
+    Because its dataset fields are the same as RefCOCO's, the loading and evaluation code is reused.
+    The only difference is that RefCOCOg has only two splits:
+    - `val`: 7.57k rows
+    - `test`: 5.02k rows
+    """
+    pass
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py
index 7415e65f..026c222a 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco_plus.py
@@ -5,4 +5,12 @@
 
 @LOAD_DATASET.register_module()
 class RefCOCOPlusDataset(RefCOCODataset):
-    TEMP_REFCOCO_IMAGE_STORE_DIR = 'RefCOCOPlus_images'
+    """
+    RefCOCO+ is a variant of RefCOCO whose referring expressions avoid absolute location words.
+    Because its dataset fields are the same as RefCOCO's, the loading and evaluation code is reused.
+    The only loading difference is that RefCOCO+ has only three splits:
+    - `val`: 3.81k rows
+    - `testA`: 1.98k rows
+    - `testB`: 1.8k rows
+    """
+    pass

From b2f1baccb13cb1447c6c76f56b65aa92be4fa284 Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Thu, 19 Mar 2026 19:16:18 +0800
Subject: [PATCH 6/8] [feature] add refcoco/+/g base64 support

---
 .../configs/datasets/refcoco/refcoco_gen.py   |  12 +-
 .../datasets/refcoco/refcoco_gen_base64.py    |  54 ++++++
 .../datasets/refcoco_plus/refcoco_plus_gen.py |  12 +-
 .../refcoco_plus/refcoco_plus_gen_base64.py   |  53 ++++++
 .../configs/datasets/refcocog/refcocog_gen.py |  10 +-
 .../datasets/refcocog/refcocog_gen_base64.py  |  52 ++++++
 .../benchmark/datasets/refcoco/__init__.py    |   8 +-
 .../benchmark/datasets/refcoco/refcoco.py     | 172 ++++++++++--------
 8 files changed, 282 insertions(+), 91 deletions(-)
 create mode 100644 ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
 create mode 100644 ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
 create mode 100644 ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py

diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
index 46e30b65..b4e2b311 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
@@ -33,15 +33,15 @@
 )
 
 _splits = [
-    ('RefCOCO_val', 'val'),
-    ('RefCOCO_test', 'test'),
-    ('RefCOCO_testA', 'testA'),
-    ('RefCOCO_testB', 'testB'),
+    'val',
+    'test',
+    'testA',
+    'testB',
 ]
 
 refcoco_datasets = [
     dict(
-        abbr=abbr,
+        abbr='RefCOCO_' + split,
         type=RefCOCODataset,
         path='ais_bench/datasets/RefCOCO/data',
         split=split,
@@ -49,5 +49,5 @@
         infer_cfg=refcoco_infer_cfg,
         eval_cfg=refcoco_eval_cfg,
     )
-    for abbr, split in _splits
+    for split in _splits
 ]
\ No newline at end of file
diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
new file mode 100644
index 00000000..a261ab06
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
@@ -0,0 +1,54 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCODataset
+from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcoco_reader_cfg = dict(
+    input_columns=['question', 'image'],
+    output_column='answer'
+)
+
+refcoco_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcoco_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    'val',
+    'test',
+    'testA',
+    'testB',
+]
+
+refcoco_datasets = [
+    dict(
+        abbr='RefCOCO_base64_' + split,
+        type=RefCOCODataset,
+        path='ais_bench/datasets/RefCOCO/data',
+        split=split,
+        image_type=IMAGE_BASE64_TYPE,
+        reader_cfg=refcoco_reader_cfg,
+        infer_cfg=refcoco_infer_cfg,
+        eval_cfg=refcoco_eval_cfg,
+    )
+    for split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
index 989f9230..03b10fd4 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
@@ -7,7 +7,7 @@
 
 
 refcoco_plus_reader_cfg = dict(
-    input_columns=['content'],
+    input_columns=['question', 'image'],
     output_column='answer'
 )
 
@@ -33,14 +33,14 @@
 )
 
 _splits = [
-    ('RefCOCOPlus_val', 'val'),
-    ('RefCOCOPlus_testA', 'testA'),
-    ('RefCOCOPlus_testB', 'testB'),
+    'val',
+    'testA',
+    'testB',
 ]
 
 refcoco_plus_datasets = [
     dict(
-        abbr=abbr,
+        abbr='RefCOCOPlus_' + split,
         type=RefCOCOPlusDataset,
         path='ais_bench/datasets/RefCOCOplus/data',
         split=split,
@@ -48,5 +48,5 @@
         infer_cfg=refcoco_plus_infer_cfg,
         eval_cfg=refcoco_plus_eval_cfg,
     )
-    for abbr, split in _splits
+    for split in _splits
 ]
\ No newline at end of file
diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
new file mode 100644
index 00000000..8b1b41b3
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
@@ -0,0 +1,53 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCOPlusDataset
+from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcoco_plus_reader_cfg = dict(
+    input_columns=['question', 'image'],
+    output_column='answer'
+)
+
+refcoco_plus_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcoco_plus_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    'val',
+    'testA',
+    'testB',
+]
+
+refcoco_plus_datasets = [
+    dict(
+        abbr='RefCOCOPlus_base64_' + split,
+        type=RefCOCOPlusDataset,
+        path='ais_bench/datasets/RefCOCOplus/data',
+        split=split,
+        image_type=IMAGE_BASE64_TYPE,
+        reader_cfg=refcoco_plus_reader_cfg,
+        infer_cfg=refcoco_plus_infer_cfg,
+        eval_cfg=refcoco_plus_eval_cfg,
+    )
+    for split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
index c4429bb3..c1504f7a 100644
--- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
@@ -7,7 +7,7 @@
 
 
 refcocog_reader_cfg = dict(
-    input_columns=['content'],
+    input_columns=['question', 'image'],
     output_column='answer'
 )
 
@@ -33,13 +33,13 @@
 )
 
 _splits = [
-    ('RefCOCOg_val', 'val'),
-    ('RefCOCOg_test', 'test'),
+    'val',
+    'test',
 ]
 
 refcocog_datasets = [
     dict(
-        abbr=abbr,
+        abbr='RefCOCOg_' + split,
         type=RefCOCOgDataset,
         path='ais_bench/datasets/RefCOCOg/data',
         split=split,
@@ -47,5 +47,5 @@
         infer_cfg=refcocog_infer_cfg,
         eval_cfg=refcocog_eval_cfg,
     )
-    for abbr, split in _splits
+    for split in _splits
 ]
\ No newline at end of file
diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py
new file mode 100644
index 00000000..eedcda7a
--- /dev/null
+++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py
@@ -0,0 +1,52 @@
+from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever
+from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer
+from ais_bench.benchmark.openicl.icl_prompt_template import MMPromptTemplate
+from ais_bench.benchmark.datasets import RefCOCOgDataset
+from ais_bench.benchmark.datasets.refcoco import IMAGE_BASE64_TYPE, refcoco_bbox_postprocess
+from ais_bench.benchmark.openicl.icl_evaluator import BBoxIoUEvaluator
+
+
+refcocog_reader_cfg = dict(
+    input_columns=['question', 'image'],
+    output_column='answer'
+)
+
+refcocog_infer_cfg = dict(
+    prompt_template=dict(
+        type=MMPromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt_mm={
+                    'text': {'type': 'text', 'text': '{question}'},
+                    'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
+                })
+            ]
+        )
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+refcocog_eval_cfg = dict(
+    evaluator=dict(type=BBoxIoUEvaluator, iou_threshold=0.5, coord_scale=1000.0),
+    pred_postprocessor=dict(type=refcoco_bbox_postprocess),
+)
+
+_splits = [
+    'val',
+    'test',
+]
+
+refcocog_datasets = [
+    dict(
+        abbr='RefCOCOg_base64_' + split,
+        type=RefCOCOgDataset,
+        path='ais_bench/datasets/RefCOCOg/data',
+        split=split,
+        image_type=IMAGE_BASE64_TYPE,
+        reader_cfg=refcocog_reader_cfg,
+        infer_cfg=refcocog_infer_cfg,
+        eval_cfg=refcocog_eval_cfg,
+    )
+    for split in _splits
+]
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/__init__.py b/ais_bench/benchmark/datasets/refcoco/__init__.py
index e4f94a76..1590279f 100644
--- a/ais_bench/benchmark/datasets/refcoco/__init__.py
+++ b/ais_bench/benchmark/datasets/refcoco/__init__.py
@@ -1,3 +1,9 @@
-from ais_bench.benchmark.datasets.refcoco.refcoco import RefCOCODataset, refcoco_bbox_postprocess  # noqa: F401
+from ais_bench.benchmark.datasets.refcoco.refcoco import (  # noqa: F401
+    IMAGE_BASE64_TYPE,
+    IMAGE_PATH_TYPE,
+    TEMP_IMAGE_STORE_DIR,
+    RefCOCODataset,
+    refcoco_bbox_postprocess,
+)
 from ais_bench.benchmark.datasets.refcoco.refcoco_g import RefCOCOgDataset  # noqa: F401
 from ais_bench.benchmark.datasets.refcoco.refcoco_plus import RefCOCOPlusDataset  # noqa: F401
\ No newline at end of file
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py
index 9bf4413f..c5aa814d 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py
@@ -4,6 +4,9 @@
 import os
 import re
 
+from abc import ABC, abstractmethod
+from typing import Any
+
 import pandas as pd
 from PIL import Image
 
@@ -12,56 +15,93 @@
 from ais_bench.benchmark.datasets.utils.datasets import get_content_str
 from ais_bench.benchmark.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
 from ais_bench.benchmark.datasets.utils.datasets import get_data_path
+from ais_bench.benchmark.utils.image_process import pil_to_base64
 from ais_bench.benchmark.utils.logging import AISLogger
 
 from ..base import BaseDataset
 
 logger = AISLogger()
 
+IMAGE_PATH_TYPE = 'image_path'
+IMAGE_BASE64_TYPE = 'image_base64'
+
 REFCOCO_PROMPT_TEMPLATE = (
     'Locate every object that matches the description "{ref_sentence}" '
     'in the image. Report bbox coordinates in JSON format.'
 )
 
+TEMP_IMAGE_STORE_DIR = 'temp_save_images'
 
-def _remove_leading_articles(text: str) -> str:
-    cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE)
-    return cleaned_text or text.strip()
-
-
-def parse_float_sequence_within(input_str: str):
+def _parse_float_sequence_within(input_str: str) -> list[float]:
     """Extract the first sequence of four floats inside square brackets."""
     pattern = r'\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]'
     match = re.search(pattern, input_str)
     if match:
         return [float(match.group(i)) for i in range(1, 5)]
-    return [0.0, 0.0, 0.0, 0.0]  # Default bbox if parsing fails
+    return [0.0, 0.0, 0.0, 0.0]
 
 
-@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000')
-def refcoco_bbox_postprocess(text) -> list:
-    if not isinstance(text, str):
-        raise ValueError('Prediction must be a string')
+def _remove_leading_articles(text: str) -> str:
+    cleaned_text = re.sub(r'^(a|an|the)\s+', '', text.strip(), flags=re.IGNORECASE)
+    return cleaned_text or text.strip()
 
+
+@TEXT_POSTPROCESSORS.register_module('refcoco_bbox_1000')
+def refcoco_bbox_postprocess(text: str) -> list[float]:
     stripped_text = text.strip()
-    bbox = parse_float_sequence_within(stripped_text)
+    bbox = _parse_float_sequence_within(stripped_text)
 
     logger.debug(f'refcoco_bbox_postprocess: bbox={bbox}')
     return bbox
 
 
-@LOAD_DATASET.register_module()
-class RefCOCODataset(BaseDataset):
-    TEMP_IMAGE_STORE_DIR = 'temp_save_images'
+class ImageResolver(ABC):
+    """Strategy interface for converting a PIL image into a transport value."""
 
-    @staticmethod
-    def _generate_image_store_dir(resolved_path: str, split: str) -> str:
-        image_root_path = os.path.join(
-            os.path.dirname(resolved_path),
-            RefCOCODataset.TEMP_IMAGE_STORE_DIR,
+    @abstractmethod
+    def setup(self, resolved_path: str, split: str) -> None:
+        ...
+
+    @abstractmethod
+    def resolve(self, pil_img: Image.Image, file_name: str) -> str:
+        ...
+
+
+class PathImageResolver(ImageResolver):
+    def setup(self, resolved_path: str, split: str) -> None:
+        image_cache_path = os.path.join(
+            resolved_path,
+            TEMP_IMAGE_STORE_DIR,
+            split,
         )
-        return os.path.join(image_root_path, split)
+        logger.info(f'Saving RefCOCO images to {image_cache_path}')
+        os.makedirs(image_cache_path, exist_ok=True)
+        self._cache_dir = image_cache_path
 
+    def resolve(self, pil_img: Image.Image, file_name: str) -> str:
+        image_path = os.path.join(self._cache_dir, file_name)
+        os.makedirs(os.path.dirname(image_path), exist_ok=True)
+        if not os.path.exists(image_path):
+            pil_img.save(image_path, format='JPEG')
+        return image_path
+
+
+class Base64ImageResolver(ImageResolver):
+    def setup(self, resolved_path: str, split: str) -> None:
+        logger.info(f'Encoding RefCOCO images as base64 for split {split}')
+
+    def resolve(self, pil_img: Image.Image, file_name: str) -> str:
+        return pil_to_base64(pil_img, format='JPEG')
+
+
+IMAGE_RESOLVERS = {
+    IMAGE_PATH_TYPE: PathImageResolver,
+    IMAGE_BASE64_TYPE: Base64ImageResolver,
+}
+
+
+@LOAD_DATASET.register_module()
+class RefCOCODataset(BaseDataset):
     @staticmethod
     def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame:
         shard_paths = sorted(glob.glob(os.path.join(resolved_path, f'{split}-*.parquet')))
@@ -74,77 +114,69 @@ def _load_split_dataframe(resolved_path: str, split: str) -> pd.DataFrame:
         return pd.concat([pd.read_parquet(shard_path) for shard_path in shard_paths], ignore_index=True)
 
     @staticmethod
-    def _persist_image_if_not_exist(image_payload, image_name: str, image_root_dir: str, row_index: int) -> tuple[str, int, int]:
+    def _decode_image_payload(image_payload: Any, row_index: int) -> Image.Image:
         if not isinstance(image_payload, dict) or 'bytes' not in image_payload:
             raise ValueError(f'RefCOCO row {row_index} has invalid image payload: {type(image_payload)}')
 
-        pil_img = Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB')
-        image_path = os.path.join(image_root_dir, image_name)
-        os.makedirs(os.path.dirname(image_path), exist_ok=True)
-        if not os.path.exists(image_path):
-            pil_img.save(image_path, format='JPEG')
-        return image_path, pil_img.width, pil_img.height
+        return Image.open(io.BytesIO(image_payload['bytes'])).convert('RGB')
 
     @staticmethod
-    def _build_pixel_bbox(raw_bbox) -> list[float]:
+    def _build_pixel_bbox(raw_bbox: Any) -> list[float]:
         x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox]
         return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height]
 
     @staticmethod
-    def _build_prompt(answer_text) -> str:
+    def _build_prompt(answer_text: Any) -> str:
         ref_sentence = _remove_leading_articles(str(answer_text))
         return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence)
 
     @staticmethod
-    def _build_answer_payload(question_id, pixel_bbox: list[float], width: int, height: int) -> str:
-        return json.dumps({
-            'question_id': int(question_id),
-            'bbox': pixel_bbox,
+    def _build_rows(
+        sample: pd.Series,
+        image_value: str,
+        width: int,
+        height: int,
+    ) -> list[dict[str, str]]:
+        reference_answer = json.dumps({
+            'question_id': int(sample['question_id']),
+            'bbox': RefCOCODataset._build_pixel_bbox(sample['bbox']),
             'image_width': width,
             'image_height': height,
         })
 
-    @staticmethod
-    def _build_rows(sample, image_path: str, width: int, height: int, pixel_bbox: list[float]) -> list[dict]:
-        rows = []
-        reference_answer = RefCOCODataset._build_answer_payload(
-            sample['question_id'],
-            pixel_bbox,
-            width,
-            height,
-        )
-
+        rows: list[dict[str, str]] = []
         for answer_text in sample['answer']:
             prompt = RefCOCODataset._build_prompt(answer_text)
             content = get_content_str([
-                {'type': 'image_url', 'image_url': image_path},
+                {'type': 'image_url', 'image_url': image_value},
                 {'type': 'text', 'text': prompt},
             ])
             rows.append({
                 'content': content,
                 'question': prompt,
-                'image': image_path,
+                'image': image_value,
                 'answer': reference_answer,
             })
         return rows
 
     @staticmethod
-    def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOverride]
+    def load(path: str, split: str, **kwargs: Any) -> Dataset:  # pyright: ignore[reportIncompatibleMethodOverride]
         """Load a RefCOCO split and normalize it into benchmark rows.
 
         The source data is stored as parquet shards under ``path`` with shard
         names matching ``<split>-*.parquet``. Each source row contains an image
         payload, a ground-truth bounding box in ``[x, y, w, h]`` format, and a
-        list of referring expressions. This loader persists each image to
-        ``RefCOCO_images/<split>/<file_name>``, converts the bbox to
-        ``[x_min, y_min, x_max, y_max]``, and expands the answer list into one
-        benchmark row per referring expression.
+        list of referring expressions. This loader can either persist each image
+        to a split-specific cache directory or encode it as base64, converts the
+        bbox to ``[x_min, y_min, x_max, y_max]``, and expands the answer list
+        into one benchmark row per referring expression.
 
         Args:
             path: Dataset root containing RefCOCO parquet shards.
             split: Split prefix to load, for example ``val`` or ``testA``.
-            **kwargs: Unused extra keyword arguments passed by the dataset
-                builder.
+            **kwargs: Extra keyword arguments passed by the dataset builder.
+                Supported key: ``image_type`` with values ``IMAGE_PATH_TYPE`` or
+                ``IMAGE_BASE64_TYPE``.
 
         Returns:
             A HuggingFace ``Dataset`` whose rows contain ``content`` for
@@ -152,28 +184,22 @@ def load(path, split, **kwargs):  # pyright: ignore[reportIncompatibleMethodOver
             bbox payload used by evaluation.
         """
         resolved_path = get_data_path(path)
-        image_root_dir = RefCOCODataset._generate_image_store_dir(resolved_path, split)
-        logger.info(f'Saving RefCOCO images to {image_root_dir}')
+        image_type = kwargs.get('image_type', IMAGE_PATH_TYPE)
+        if image_type not in IMAGE_RESOLVERS:
+            raise ValueError(
+                f'Unsupported image_type: {image_type}. Expected one of {sorted(IMAGE_RESOLVERS)}'
+            )
         data = RefCOCODataset._load_split_dataframe(resolved_path, split)
-        os.makedirs(image_root_dir, exist_ok=True)
+        resolver = IMAGE_RESOLVERS[image_type]()
+        resolver.setup(resolved_path, split)
 
-        rows = []
+        rows: list[dict[str, str]] = []
         for row_index, (_, sample) in enumerate(data.iterrows()):
-            image_path, width, height = RefCOCODataset._persist_image_if_not_exist(
-                sample['image'],
-                sample['file_name'],
-                image_root_dir,
-                row_index,
-            )
-            pixel_bbox = RefCOCODataset._build_pixel_bbox(sample['bbox'])
-            rows.extend(
-                RefCOCODataset._build_rows(
-                    sample,
-                    image_path,
-                    width,
-                    height,
-                    pixel_bbox,
-                )
-            )
+            pil_img = RefCOCODataset._decode_image_payload(sample['image'], row_index)
+            image_value = resolver.resolve(pil_img, sample['file_name'])
+
+            width, height = pil_img.width, pil_img.height
+            sample_rows = RefCOCODataset._build_rows(sample, image_value, width, height)
+            rows.extend(sample_rows)
 
         return Dataset.from_list(rows)

From 7d6802e1cd774bc4511b7356267bb52fcafa8be8 Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Fri, 20 Mar 2026 10:17:05 +0800
Subject: [PATCH 7/8] [refactor] avoid raising index and type errors to the
 user

---
 .../openicl/icl_evaluator/bbox_iou_evaluator.py      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py
index 6f608d7f..f514b2e1 100644
--- a/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py
+++ b/ais_bench/benchmark/openicl/icl_evaluator/bbox_iou_evaluator.py
@@ -71,11 +71,6 @@ def score(self, predictions, references):  # pyright: ignore[reportIncompatibleM
         details = []
         scores = []
         for pred, ref in zip(predictions, references):
-            refer = json.loads(ref) if isinstance(ref, str) else ref
-            gt_box = [float(value) for value in refer[self.reference_bbox_key]]
-            image_width = float(refer[self.image_width_key])
-            image_height = float(refer[self.image_height_key])
-
             detail = {
                 'pred': pred,
                 'answer': ref,
@@ -84,14 +79,19 @@ def score(self, predictions, references):  # pyright: ignore[reportIncompatibleM
             }
 
             try:
+                refer = json.loads(ref) if isinstance(ref, str) else ref
+                image_width = float(refer[self.image_width_key])
+                image_height = float(refer[self.image_height_key])
                 pred_box_pixel = self._scale_prediction(pred, image_width, image_height)
+                gt_box = [float(value) for value in refer[self.reference_bbox_key]]
+
                 iou = _compute_iou(pred_box_pixel, gt_box)
                 correct = iou >= self.iou_threshold
                 detail['correct'] = correct
                 detail['iou'] = iou
                 detail['pred_bbox_pixel'] = pred_box_pixel
                 scores.append(1 if correct else 0)
-            except (TypeError, ValueError, KeyError, json.JSONDecodeError) as error:
+            except (TypeError, ValueError, KeyError, json.JSONDecodeError, IndexError) as error:
                 detail['iou'] = 0.0
                 detail['pred_bbox_pixel'] = None
                 detail['invalid'] = True

From 771f195617467651a775f449e1694fa17157c2e1 Mon Sep 17 00:00:00 2001
From: zhongzhoutan <1710115119@bjmu.edu.cn>
Date: Fri, 20 Mar 2026 15:15:55 +0800
Subject: [PATCH 8/8] [refactor] remove unused image and question, also move
 the prompt to datasets config

---
 .../configs/datasets/refcoco/refcoco_gen.py   |  5 ++-
 .../datasets/refcoco/refcoco_gen_base64.py    |  5 ++-
 .../datasets/refcoco_plus/refcoco_plus_gen.py |  5 ++-
 .../refcoco_plus/refcoco_plus_gen_base64.py   |  5 ++-
 .../configs/datasets/refcocog/refcocog_gen.py |  5 ++-
 .../datasets/refcocog/refcocog_gen_base64.py  |  5 ++-
 .../benchmark/datasets/refcoco/refcoco.py     | 31 +++++++++----------
 7 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
index b4e2b311..008119cb 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
index a261ab06..d807dbce 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco/refcoco_gen_base64.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
index 03b10fd4..d505dd74 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
index 8b1b41b3..5804cb61 100644
--- a/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
+++ b/ais_bench/benchmark/configs/datasets/refcoco_plus/refcoco_plus_gen_base64.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
index c1504f7a..50cbb852 100644
--- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
+++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'file://{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py
index eedcda7a..cf6eb915 100644
--- a/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py
+++ b/ais_bench/benchmark/configs/datasets/refcocog/refcocog_gen_base64.py
@@ -17,7 +17,10 @@
         template=dict(
             round=[
                 dict(role='HUMAN', prompt_mm={
-                    'text': {'type': 'text', 'text': '{question}'},
+                    'text': {
+                        'type': 'text',
+                        'text': 'Locate every object that matches the description "{question}" in the image. Report bbox coordinates in JSON format.'
+                    },
                     'image': {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}},
                 })
             ]
diff --git a/ais_bench/benchmark/datasets/refcoco/refcoco.py b/ais_bench/benchmark/datasets/refcoco/refcoco.py
index c5aa814d..319e4cfa 100644
--- a/ais_bench/benchmark/datasets/refcoco/refcoco.py
+++ b/ais_bench/benchmark/datasets/refcoco/refcoco.py
@@ -25,11 +25,6 @@
 IMAGE_PATH_TYPE = 'image_path'
 IMAGE_BASE64_TYPE = 'image_base64'
 
-REFCOCO_PROMPT_TEMPLATE = (
-    'Locate every object that matches the description "{ref_sentence}" '
-    'in the image. Report bbox coordinates in JSON format.'
-)
-
 TEMP_IMAGE_STORE_DIR = 'temp_save_images'
 
 def _parse_float_sequence_within(input_str: str) -> list[float]:
@@ -125,11 +120,6 @@ def _build_pixel_bbox(raw_bbox: Any) -> list[float]:
         x_coord, y_coord, bbox_width, bbox_height = [float(value) for value in raw_bbox]
         return [x_coord, y_coord, x_coord + bbox_width, y_coord + bbox_height]
 
-    @staticmethod
-    def _build_prompt(answer_text: Any) -> str:
-        ref_sentence = _remove_leading_articles(str(answer_text))
-        return REFCOCO_PROMPT_TEMPLATE.format(ref_sentence=ref_sentence)
-
     @staticmethod
     def _build_rows(
         sample: pd.Series,
@@ -146,15 +136,12 @@ def _build_rows(
 
         rows: list[dict[str, str]] = []
         for answer_text in sample['answer']:
-            prompt = RefCOCODataset._build_prompt(answer_text)
             content = get_content_str([
                 {'type': 'image_url', 'image_url': image_value},
-                {'type': 'text', 'text': prompt},
+                {'type': 'text', 'text': answer_text},
             ])
             rows.append({
                 'content': content,
-                'question': prompt,
-                'image': image_value,
                 'answer': reference_answer,
             })
         return rows
@@ -171,6 +158,14 @@ def load(path: str, split: str, **kwargs: Any) -> Dataset:  # pyright: ignore[re
         bbox to ``[x_min, y_min, x_max, y_max]``, and expands the answer list
         into one benchmark row per referring expression.
 
+        Each output row has a ``content`` field that encodes the image and
+        referring expression together using ``AIS_CONTENT_TAG`` delimiters
+        (via :func:`get_content_str`). During inference the
+        :meth:`PromptList.format_mm` method splits ``content`` on
+        ``AIS_CONTENT_TAG`` and uses the ``AIS_IMAGE_START`` /
+        ``AIS_TEXT_START`` prefixes to populate the ``prompt_mm`` template
+        with the image URL and question text respectively.
+
         Args:
             path: Dataset root containing RefCOCO parquet shards.
             split: Split prefix to load, for example ``val`` or ``testA``.
@@ -179,9 +174,11 @@ def load(path: str, split: str, **kwargs: Any) -> Dataset:  # pyright: ignore[re
                 ``IMAGE_BASE64_TYPE``.
 
         Returns:
-            A HuggingFace ``Dataset`` whose rows contain ``content`` for
-            multimodal prompting and ``answer`` as the serialized reference
-            bbox payload used by evaluation.
+            A HuggingFace ``Dataset`` with columns:
+            - content: encoded multimodal string consumed by
+              ``format_mm`` to fill the ``prompt_mm`` template.
+            - answer: JSON-serialized reference bbox payload used by
+              evaluation.
         """
         resolved_path = get_data_path(path)
         image_type = kwargs.get('image_type', IMAGE_PATH_TYPE)