Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions libreyolo/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,14 @@ def _preprocess(self, image, effective_imgsz, color_format):
)
return tensor, img, size, 1.0
else:
tensor, img, size = preprocess_image(
# YOLOv9 (and yolo9_e2e / yolo9-seg) use a centered letterbox.
# preprocess_image now returns (tensor, img, size, ratio, pad);
# pack the resize gain + left/top padding into the shared ``ratio``
# slot so _parse_outputs can undo the even padding precisely.
tensor, img, size, ratio, pad = preprocess_image(
image, input_size=effective_imgsz, color_format=color_format
)
return tensor, img, size, 1.0
return tensor, img, size, (ratio, pad[0], pad[1])

@staticmethod
def _preprocess_rfdetr(image, input_size, color_format):
Expand Down Expand Up @@ -428,8 +432,14 @@ def _parse_outputs(
)
return boxes, scores, cls, None
else:
# ``ratio`` for the yolo9 family carries (resize_gain, pad_w, pad_h)
# from the centered-letterbox _preprocess; tolerate the legacy
# scalar form (no pad offset).
pad = None
if isinstance(ratio, (tuple, list)) and len(ratio) == 3:
pad = (float(ratio[1]), float(ratio[2]))
parsed = self._parse_yolo9(
all_outputs, effective_imgsz, orig_w, orig_h, conf
all_outputs, effective_imgsz, orig_w, orig_h, conf, pad=pad
)
if len(parsed) == 4:
return parsed
Expand Down Expand Up @@ -555,8 +565,16 @@ def _parse_damoyolo(self, all_outputs, effective_imgsz, orig_w, orig_h, conf):

return boxes, max_scores, class_ids

def _parse_yolo9(self, all_outputs, effective_imgsz, orig_w, orig_h, conf):
"""Parse YOLO9 output: (B, 4+nc, N) — xyxy + class_scores."""
def _parse_yolo9(
self, all_outputs, effective_imgsz, orig_w, orig_h, conf, pad=None
):
"""Parse YOLO9 output: (B, 4+nc, N) — xyxy + class_scores.

``pad`` is the ``(pad_w, pad_h)`` left/top padding from the centered
letterbox; it is subtracted before the ratio divide so box geometry
matches the eager predict path. ``None`` falls back to the legacy
top-left assumption (no pad offset).
"""
outputs = all_outputs[0][0].T # (N, 4+nc)

boxes_input = outputs[:, :4]
Expand All @@ -577,6 +595,9 @@ def _parse_yolo9(self, all_outputs, effective_imgsz, orig_w, orig_h, conf):
return boxes, max_scores, class_ids

ratio = min(effective_imgsz / orig_h, effective_imgsz / orig_w)
if pad is not None:
boxes[:, [0, 2]] -= pad[0]
boxes[:, [1, 3]] -= pad[1]
boxes[:, :4] /= ratio
boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, orig_w)
boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, orig_h)
Expand All @@ -594,6 +615,7 @@ def _parse_yolo9(self, all_outputs, effective_imgsz, orig_w, orig_h, conf):
input_shape=(effective_imgsz, effective_imgsz),
original_size=(orig_w, orig_h),
letterbox=True,
pad=pad,
).numpy()
return boxes, max_scores, class_ids, masks_out

Expand Down
13 changes: 10 additions & 3 deletions libreyolo/export/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,12 @@ def __init__(
batch: Batch size for calibration.
fraction: Fraction of dataset to use (0.0-1.0). Use smaller values
for faster calibration with slight accuracy tradeoff.
preprocess_fn: Callable ``(img_rgb_hwc, input_size) -> (chw_float32, ratio)``.
Obtained from ``model._get_preprocess_numpy()``.
preprocess_fn: Callable ``(img_rgb_hwc, input_size) -> tuple`` whose
first element is the preprocessed ``chw_float32`` array. The
remaining elements vary by family (``(ratio,)`` for most,
``(ratio, (pad_w, pad_h))`` for the centered-letterbox YOLOv9
family) and are ignored here. Obtained from
``model._get_preprocess_numpy()``.
allow_download_scripts: Allow embedded Python in dataset YAML downloads.
"""
self.imgsz = imgsz
Expand Down Expand Up @@ -97,7 +101,10 @@ def _preprocess(self, img_path: Path) -> np.ndarray:
raise FileNotFoundError(f"Cannot read image: {img_path}")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

result, _ = self._preprocess_fn(img_rgb, self.imgsz)
# preprocess_fn returns (chw_float32, ratio) for most families and
# (chw_float32, ratio, pad) for the centered-letterbox YOLOv9 family.
# Only the image array is needed for calibration, so discard the rest.
result = self._preprocess_fn(img_rgb, self.imgsz)[0]
return result

def __iter__(self) -> Iterator[np.ndarray]:
Expand Down
10 changes: 6 additions & 4 deletions libreyolo/models/picodet/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from typing import List, Sequence, Tuple

import cv2
import numpy as np
import torch
import torch.nn.functional as F
Expand Down Expand Up @@ -41,10 +42,11 @@ def preprocess_numpy(
non-letterbox resize but kept in the signature so it can flow through
the same postprocess pipeline as letterbox-based families.
"""
img = Image.fromarray(img_rgb_hwc).resize(
(input_size, input_size), Image.Resampling.BILINEAR
)
arr = np.array(img, dtype=np.float32)
# Upstream PaddleDetection / Bo's port resize with cv2.INTER_LINEAR.
# PIL's bilinear kernel differs and drifts ~0.3-0.5 mAP on COCO, so match cv2.
arr = cv2.resize(
img_rgb_hwc, (input_size, input_size), interpolation=cv2.INTER_LINEAR
).astype(np.float32)
arr -= np.array(IMAGENET_MEAN, dtype=np.float32)
arr /= np.array(IMAGENET_STD, dtype=np.float32)
return arr.transpose(2, 0, 1), 1.0
Expand Down
18 changes: 14 additions & 4 deletions libreyolo/models/yolo9/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,15 @@ def _preprocess(
image: ImageInput,
color_format: str = "auto",
input_size: Optional[int] = None,
) -> Tuple[torch.Tensor, Image.Image, Tuple[int, int], float]:
) -> Tuple[torch.Tensor, Image.Image, Tuple[int, int], Any]:
effective_size = input_size if input_size is not None else self._get_input_size()
tensor, img, size = preprocess_image(
tensor, img, size, ratio, pad = preprocess_image(
image, input_size=effective_size, color_format=color_format
)
return tensor, img, size, 1.0
# Pack the resize gain + centered-letterbox padding into the ``ratio``
# slot of the shared (tensor, img, size, ratio) contract so the
# matching ``_postprocess`` can undo the padding precisely.
return tensor, img, size, (ratio, pad[0], pad[1])

def _forward(self, input_tensor: torch.Tensor) -> Any:
return self.model(input_tensor)
Expand All @@ -229,10 +232,15 @@ def _postprocess(
iou_thres: float,
original_size: Tuple[int, int],
max_det: int = 300,
ratio: float = 1.0,
ratio: Any = 1.0,
**kwargs,
) -> Dict:
actual_input_size = kwargs.get("input_size", self._get_input_size())
# ``ratio`` carries (resize_gain, pad_w, pad_h) from ``_preprocess``.
# Tolerate the legacy scalar form (no centered-letterbox padding).
pad = None
if isinstance(ratio, (tuple, list)) and len(ratio) == 3:
pad = (float(ratio[1]), float(ratio[2]))
return postprocess(
output,
conf_thres=conf_thres,
Expand All @@ -241,6 +249,8 @@ def _postprocess(
original_size=original_size,
max_det=max_det,
letterbox=kwargs.get("letterbox", True),
pad=pad,
multi_label=kwargs.get("multi_label", True),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve backend multi-label parity

With this new default, eager YOLO9 emits one detection for every class above conf_thres, but exported runtimes still go through BaseBackend._parse_yolo9, which uses np.max/np.argmax and keeps only one class per anchor. In the same scenario covered by the new multi-label unit test (one anchor with two above-threshold classes), native predict/validation returns both detections while ONNX/TensorRT/OpenVINO-style backends return only the max class, so backend results no longer round-trip the model semantics.

Useful? React with 👍 / 👎.

)

# =========================================================================
Expand Down
122 changes: 93 additions & 29 deletions libreyolo/models/yolo9/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,35 +18,46 @@
def preprocess_numpy(
img_rgb_hwc: np.ndarray,
input_size: int = 640,
) -> Tuple[np.ndarray, float]:
) -> Tuple[np.ndarray, float, Tuple[float, float]]:
"""
Preprocess RGB HWC uint8 image for YOLOv9 inference.

Letterbox resize + normalize to 0-1 range.
Centered letterbox resize (matches the reference WongKinYiu/yolov9
``utils.augmentations.letterbox``: aspect-preserving resize, the
remaining padding split evenly between the two sides) + normalize to
0-1 range.

Args:
img_rgb_hwc: Input image as RGB HWC uint8 numpy array.
input_size: Target size for the model.

Returns:
Tuple of (preprocessed CHW float32 array in RGB 0-1, ratio).
Tuple of ``(preprocessed CHW float32 array in RGB 0-1, ratio,
(pad_w, pad_h))`` where ``ratio`` is the resize gain and
``(pad_w, pad_h)`` is the left/top padding applied (so postprocess
can undo it as ``(coord - pad) / ratio``).
"""
orig_h, orig_w = img_rgb_hwc.shape[:2]
ratio = min(input_size / orig_h, input_size / orig_w)
new_h = int(orig_h * ratio)
new_w = int(orig_w * ratio)
new_w = int(round(orig_w * ratio))
new_h = int(round(orig_h * ratio))

dw = (input_size - new_w) / 2.0
dh = (input_size - new_h) / 2.0

resized = cv2.resize(img_rgb_hwc, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
padded = np.full((input_size, input_size, 3), 114, dtype=np.uint8)
padded[:new_h, :new_w] = resized
top = int(round(dh - 0.1))
left = int(round(dw - 0.1))
padded[top : top + new_h, left : left + new_w] = resized
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Align YOLO9 validation with centered letterbox

For non-square YOLO9/YOLO9-E2E images, prediction now center-pads here, but YOLO9ValPreprocessor and the YOLO9 training preproc still put the resized image at the top-left and validation calls _postprocess without a pad. That means validation metrics and locally trained checkpoints are using a different input geometry than model.predict, so reported mAP no longer measures the inference path users actually run unless the train/val preprocessors are updated too.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Align training letterbox with centered inference

For non-square YOLO9 training images, this centered placement is now the inference/validation geometry, but the current training path still uses libreyolo/models/yolo9/transforms.py:41 to paste the resized image at the top-left and only scales labels by r without adding the new pad. Fresh evidence in this revision is that validation was updated to center-pad, while the training transform was not; locally trained YOLO9/YOLO9-seg checkpoints will learn boxes/masks in a different input canvas than predict/export uses, causing systematic offsets on non-square images.

Useful? React with 👍 / 👎.


arr = np.ascontiguousarray(padded, dtype=np.float32) / 255.0
return arr.transpose(2, 0, 1), ratio
return arr.transpose(2, 0, 1), ratio, (float(left), float(top))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Keep calibration preprocess functions two-valued

For INT8 export calibration, CalibrationDataLoader._preprocess obtains model._get_preprocess_numpy() and unpacks it as result, _ = self._preprocess_fn(...). This new three-value return makes every YOLO9 calibration image hit ValueError: too many values to unpack and get skipped, which breaks or empties INT8 calibration for YOLO9-family exports.

Useful? React with 👍 / 👎.



def preprocess_image(
image: ImageInput, input_size: int = 640, color_format: str = "auto"
) -> Tuple[torch.Tensor, Image.Image, Tuple[int, int]]:
) -> Tuple[torch.Tensor, Image.Image, Tuple[int, int], float, Tuple[float, float]]:
"""
Preprocess image for YOLOv9 inference.

Expand All @@ -56,15 +67,16 @@ def preprocess_image(
color_format: Color format hint ("auto", "rgb", "bgr")

Returns:
Tuple of (preprocessed_tensor, original_image, original_size)
Tuple of ``(preprocessed_tensor, original_image, original_size,
ratio, (pad_w, pad_h))``.
"""
img = ImageLoader.load(image, color_format=color_format)
original_size = img.size # (width, height)
original_img = img.copy()

img_chw, _ = preprocess_numpy(np.array(img), input_size)
img_chw, ratio, pad = preprocess_numpy(np.array(img), input_size)
img_tensor = torch.from_numpy(img_chw).unsqueeze(0)
return img_tensor, original_img, original_size
return img_tensor, original_img, original_size, ratio, pad
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve YOLO9 backend preprocessing arity

When an exported/backend YOLO9 or YOLO9-E2E model runs inference, BaseBackend._preprocess_image still calls this helper as tensor, img, size = preprocess_image(...) and then supplies its own ratio. Returning five values here raises ValueError: too many values to unpack before the backend can execute any prediction, so ONNX/TensorRT/OpenVINO-style YOLO9 backends are broken until the backend is updated or this helper keeps the old arity for that path.

Useful? React with 👍 / 👎.



def decode_boxes(
Expand Down Expand Up @@ -157,6 +169,7 @@ def _process_masks(
input_shape: Tuple[int, int],
original_size: Tuple[int, int] | None,
letterbox: bool = True,
pad: Tuple[float, float] | None = None,
) -> torch.Tensor:
if coeffs.numel() == 0:
h = original_size[1] if original_size is not None else input_shape[0]
Expand All @@ -175,15 +188,21 @@ def _process_masks(
if original_size is not None and letterbox:
orig_w, orig_h = original_size
ratio = min(input_h / orig_h, input_w / orig_w)
new_h = max(int(orig_h * ratio), 1)
new_w = max(int(orig_w * ratio), 1)
# Use int(round(...)) to match the centered letterbox in
# preprocess_numpy: the resized content occupies exactly
# [top:top+new_h, left:left+new_w], so truncating here would crop the
# mask 1px off from where the image was actually placed.
new_h = max(int(round(orig_h * ratio)), 1)
new_w = max(int(round(orig_w * ratio)), 1)
masks = F.interpolate(
masks[:, None],
size=(int(input_h), int(input_w)),
mode="bilinear",
align_corners=False,
)[:, 0]
masks = masks[:, :new_h, :new_w]
left = int(round(pad[0])) if pad is not None else 0
top = int(round(pad[1])) if pad is not None else 0
masks = masks[:, top : top + new_h, left : left + new_w]
out_h, out_w = orig_h, orig_w
elif original_size is not None:
out_h, out_w = original_size[1], original_size[0]
Expand All @@ -206,6 +225,8 @@ def postprocess(
original_size: Tuple[int, int] | None = None,
max_det: int = 300,
letterbox: bool = True,
pad: Tuple[float, float] | None = None,
multi_label: bool = True,
) -> Dict:
"""
Postprocess YOLOv9 model outputs to get final detections.
Expand All @@ -217,6 +238,15 @@ def postprocess(
input_size: Input image size (default: 640)
original_size: Original image size (width, height) for scaling
max_det: Maximum number of detections to return (default: 300)
letterbox: Whether the input was letterboxed (aspect-preserving).
pad: ``(pad_w, pad_h)`` left/top padding applied at preprocess time,
used to undo a centered letterbox. ``None`` falls back to the
legacy top-left-padding assumption (no pad offset).
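multi_label: When True (the reference WongKinYiu/yolov9 ``val.py``
default), every class whose score exceeds ``conf_thres`` emits a
detection for that anchor instead of only the argmax class. This
matches the original COCO eval protocol and is worth about +0.7 mAP
at the low conf thresholds used for benchmarking. Ignored when the
output carries both ``mask_coeffs`` and ``proto`` (segmentation):
that path always keeps only the best class per anchor, because each
anchor has a single mask-coefficient vector.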

Returns:
Dictionary with boxes, scores, classes, num_detections
Expand All @@ -231,31 +261,64 @@ def postprocess(
# Transpose to (total_anchors, 4+nc)
pred = pred.transpose(0, 1)

boxes_input = pred[:, :4] # xyxy format in model input pixels
boxes_all = pred[:, :4] # xyxy format in model input pixels
scores = pred[:, 4:] # class scores (already sigmoid applied in model)

max_scores, class_ids = torch.max(scores, dim=1)

mask = max_scores > conf_thres
if not mask.any():
return {"boxes": [], "scores": [], "classes": [], "num_detections": 0}

boxes_input = boxes_input[mask]
boxes = boxes_input.clone()
max_scores = max_scores[mask]
class_ids = class_ids[mask]

mask_coeffs = output.get("mask_coeffs")
proto = output.get("proto")
coeffs = None
coeffs_all = None
if mask_coeffs is not None and proto is not None:
coeffs_all = mask_coeffs[0].transpose(0, 1) if mask_coeffs.dim() == 3 else mask_coeffs
coeffs = coeffs_all[mask]
coeffs_all = (
mask_coeffs[0].transpose(0, 1) if mask_coeffs.dim() == 3 else mask_coeffs
)

# multi_label only helps when masks are not requested (segmentation uses
# one coeff vector per anchor, so stick to the best-class path there).
if multi_label and coeffs_all is None:
# Candidate guard (matches WongKinYiu/yolov9 non_max_suppression): first
# keep only anchors whose *best* class beats conf_thres. Every class
# above conf necessarily lives in such an anchor, so the result is
# identical to scanning the full 8400 x nc score matrix — but the
# ``(scores > conf_thres).nonzero()`` below then runs only on the
# candidate rows (typically a few hundred) instead of all ~600k score
# entries, bounding memory/time at conf=0.001.
cand = scores.amax(dim=1) > conf_thres
if not cand.any():
return {"boxes": [], "scores": [], "classes": [], "num_detections": 0}
cand_idx = cand.nonzero(as_tuple=True)[0]
scores_c = scores[cand_idx]
sub_anchor, class_ids = (scores_c > conf_thres).nonzero(as_tuple=True)
anchor_idx = cand_idx[sub_anchor]
boxes_input = boxes_all[anchor_idx]
boxes = boxes_input.clone()
max_scores = scores[anchor_idx, class_ids]
# Cap to max_nms candidates by score before NMS (upstream uses 30000),
# so a pathological frame cannot blow up the NMS pairwise IoU.
max_nms = 30000
if max_scores.numel() > max_nms:
topk = torch.topk(max_scores, max_nms).indices
boxes_input = boxes_input[topk]
boxes = boxes[topk]
max_scores = max_scores[topk]
class_ids = class_ids[topk]
coeffs = None
else:
max_scores, class_ids = torch.max(scores, dim=1)
mask = max_scores > conf_thres
if not mask.any():
return {"boxes": [], "scores": [], "classes": [], "num_detections": 0}
boxes_input = boxes_all[mask]
boxes = boxes_input.clone()
max_scores = max_scores[mask]
class_ids = class_ids[mask]
coeffs = coeffs_all[mask] if coeffs_all is not None else None

if original_size is not None:
if letterbox:
orig_w, orig_h = original_size
ratio = min(input_size / orig_h, input_size / orig_w)
if pad is not None:
boxes[:, [0, 2]] -= pad[0]
boxes[:, [1, 3]] -= pad[1]
boxes[:, :4] = boxes[:, :4] / ratio
else:
scale_x = original_size[0] / input_size
Expand Down Expand Up @@ -303,6 +366,7 @@ def postprocess(
input_shape=(input_size, input_size),
original_size=original_size,
letterbox=letterbox,
pad=pad,
)
result["masks"] = masks.detach().cpu()

Expand Down
Loading
Loading