diff --git a/demo_nerfstudio.py b/demo_nerfstudio.py
new file mode 100644
index 0000000..9eaf3ba
--- /dev/null
+++ b/demo_nerfstudio.py
@@ -0,0 +1,753 @@
+import torch
+import argparse
+from pi3.utils.basic import load_images_as_tensor
+from pi3.utils.geometry import depth_edge
+from pi3.models.pi3 import Pi3
+import os
+import json
+import struct
+import contextlib
+import numpy as np
+from PIL import Image
+
+
+def estimate_camera_intrinsics_from_pointcloud(
+    points: np.ndarray,
+    images: np.ndarray,
+    camera_poses: np.ndarray | None = None,
+    poses_are_cam_to_world: bool = True,
+    conf: np.ndarray | None = None,
+    valid_mask: np.ndarray | None = None,
+    pixel_center: float = 0.5,
+    max_points: int = 2_000_000,
+    huber_delta_px: float = 3.0,
+    irls_iters: int = 5,
+    z_min: float = 1e-6,
+) -> tuple[np.ndarray, dict]:
+    """
+    Estimate a single pinhole intrinsics matrix K from a set of frames that all
+    come from the same physical camera, using dense per-pixel 3D points.
+
+    The fitting uses the projection model (no skew, no distortion):
+        u = fx * Xc / Zc + cx
+        v = fy * Yc / Zc + cy
+    and solves for (fx, cx) and (fy, cy) with robust, confidence-weighted least squares.
+
+    Parameters
+    ----------
+    points : np.ndarray
+        Array of 3D points per pixel. Shape: (N, H, W, 3).
+        - If `camera_poses is None`, these are assumed to already be in each
+          frame's *camera coordinates* (Xc, Yc, Zc).
+        - If `camera_poses` is provided, these are assumed to be in *world
+          coordinates* and will be transformed into camera coordinates per frame.
+    images : np.ndarray
+        Corresponding images. Only used for H, W and pixel indexing.
+        Shape: (N, H, W, 3) or (N, H, W, C). Values are not used.
+    camera_poses : np.ndarray | None
+        If given, per-frame 4x4 or 3x4 SE(3) matrices describing the camera pose.
+        Shape: (N, 4, 4) or (N, 3, 4).
+        - If `poses_are_cam_to_world=True` (default), each pose maps camera -> world
+          (c2w) and is inverted per frame to get world -> camera.
+        - If False, poses are world -> camera already (w2c) and are used directly.
+    poses_are_cam_to_world : bool
+        See `camera_poses`. Ignored if `camera_poses is None`.
+    conf : np.ndarray | None
+        Optional confidence map. Shape: (N, H, W) or (N, H, W, 1).
+        Used as weights (clipped to [1e-6, 1]).
+    valid_mask : np.ndarray | None
+        Optional boolean mask of valid pixels. Shape: (N, H, W).
+        Combined with Zc > z_min and finiteness checks.
+    pixel_center : float
+        Offset added to pixel indices to model pixel centers (0.5 is common).
+    max_points : int
+        Downsample the correspondences to at most this many to keep memory in check.
+    huber_delta_px : float
+        Huber delta for robust IRLS, in *pixel* units.
+    irls_iters : int
+        Number of IRLS iterations.
+    z_min : float
+        Minimum positive depth (in camera coordinates) for a point to be valid.
+
+    Returns
+    -------
+    K : np.ndarray
+        3x3 intrinsics matrix:
+            [[fx, 0,  cx],
+             [0,  fy, cy],
+             [0,  0,  1 ]]
+    stats : dict
+        Diagnostics:
+            {
+                'num_used': int,
+                'rmse_u_px': float,
+                'rmse_v_px': float,
+                'fx': float, 'fy': float, 'cx': float, 'cy': float,
+                'H': int, 'W': int, 'N': int
+            }
+
+    Notes
+    -----
+    - This ignores lens distortion and skew. If your data has distortion, consider
+      pre-undistorting, or extend the solver to estimate k1/k2 with non-linear
+      least squares.
+    - If your `points` are already in camera coordinates per frame, pass
+      `camera_poses=None`.
+    - For your Pi3 predictions dict, a typical call is:
+
+          K, stats = estimate_camera_intrinsics_from_pointcloud(
+              points=predictions['points'],                 # (N,H,W,3)
+              images=predictions['images'],                 # (N,H,W,3), just for H, W
+              camera_poses=predictions.get('camera_poses'), # (N,4,4) if points are world-space
+              poses_are_cam_to_world=True,                  # Pi3 usually returns c2w
+              conf=predictions.get('conf'),                 # (N,H,W)
+          )
+    """
+    # ---- helper: robust weighted least squares (Huber IRLS) ----
+    def _irls(A: np.ndarray, y: np.ndarray, w0: np.ndarray) -> tuple[np.ndarray, float]:
+        # A: (M,2), y: (M,), w0: (M,)
+        w = w0.copy()
+        theta = np.linalg.lstsq(A * w[:, None], y * w, rcond=None)[0]
+        for _ in range(max(0, irls_iters - 1)):
+            r = y - A @ theta
+            # Huber weights in pixels
+            abs_r = np.abs(r)
+            w = np.where(abs_r <= huber_delta_px, 1.0, huber_delta_px / (abs_r + 1e-12))
+            # Re-apply base confidence as a minimum weight
+            w = np.maximum(w, w0)
+            theta = np.linalg.lstsq(A * w[:, None], y * w, rcond=None)[0]
+        # final RMSE
+        r = y - A @ theta
+        rmse = np.sqrt(np.mean(r ** 2))
+        return theta, float(rmse)
+
+    # ---- shapes and basic checks ----
+    if points.ndim != 4 or points.shape[-1] != 3:
+        raise ValueError("`points` must have shape (N, H, W, 3).")
+    if images.ndim < 4:
+        raise ValueError("`images` must have shape (N, H, W, C).")
+
+    N, H, W, _ = points.shape
+    # Build the pixel coordinate grid (u: x, v: y)
+    u_grid, v_grid = np.meshgrid(
+        np.arange(W, dtype=np.float64) + pixel_center,
+        np.arange(H, dtype=np.float64) + pixel_center,
+        indexing="xy"
+    )  # each (H, W)
+
+    # ---- get points in camera coordinates for each frame ----
+    # If camera_poses is provided, assume `points` are world-space and transform them.
+    if camera_poses is not None:
+        if camera_poses.shape[-2:] == (3, 4):
+            # upgrade to 4x4
+            last_row = np.array([0, 0, 0, 1], dtype=camera_poses.dtype)
+            last_row = np.broadcast_to(last_row, (*camera_poses.shape[:-2], 1, 4))
+            poses_4x4 = np.concatenate([camera_poses, last_row], axis=-2)
+        elif camera_poses.shape[-2:] == (4, 4):
+            poses_4x4 = camera_poses
+        else:
+            raise ValueError("`camera_poses` must have shape (N, 4, 4) or (N, 3, 4).")
+
+        if poses_are_cam_to_world:
+            # Invert to get w2c; for a rigid transform [R|t] the inverse is
+            # [R^T | -R^T t], so no full 4x4 inverse is needed.
+            R = poses_4x4[:, :3, :3]
+            t = poses_4x4[:, :3, 3:4]
+            Rt = np.transpose(R, (0, 2, 1))
+            t_inv = -Rt @ t
+            w2c = np.concatenate([np.concatenate([Rt, t_inv], axis=-1),
+                                  np.tile(np.array([0, 0, 0, 1.0])[None, None, :], (N, 1, 1))], axis=-2)
+        else:
+            w2c = poses_4x4  # already world -> camera
+
+        # Apply the transform: Xc = R^T (Xw - t), which is exactly what w2c does.
+        Pw = np.concatenate([points, np.ones((*points.shape[:-1], 1), dtype=np.float64)], axis=-1)  # (N,H,W,4)
+        Pw = Pw.reshape(N, -1, 4).transpose(0, 2, 1)              # (N,4,M)
+        Xc = (w2c @ Pw)                                           # (N,4,M)
+        Xc = Xc[:, :3, :].transpose(0, 2, 1).reshape(N, H, W, 3)  # (N,H,W,3)
+    else:
+        # Assume already camera-space
+        Xc = points.astype(np.float64, copy=False)
+
+    # ---- validity mask ----
+    Z = Xc[..., 2]
+    base_valid = np.isfinite(Xc).all(axis=-1)
+    base_valid &= (Z > z_min)
+
+    if valid_mask is not None:
+        if valid_mask.shape != (N, H, W):
+            raise ValueError("`valid_mask` must match (N,H,W).")
+        base_valid &= valid_mask
+
+    # Confidence weights
+    if conf is not None:
+        if conf.ndim == 4 and conf.shape[-1] == 1:
+            conf = conf[..., 0]
+        if conf.shape != (N, H, W):
+            raise ValueError("`conf` must have shape (N,H,W) or (N,H,W,1).")
+        w0 = np.clip(conf.astype(np.float64), 1e-6, 1.0)
+    else:
+        w0 = np.ones((N, H, W), dtype=np.float64)
+
+    valid = base_valid & (w0 > 0)
+    num_valid = int(valid.sum())
+    if num_valid < 10:
+        raise ValueError("Not enough valid correspondences to estimate intrinsics.")
+
+    # ---- build linear systems ----
+    # Flatten per-frame and stack across frames
+    X = Xc[..., 0][valid]
+    Y = Xc[..., 1][valid]
+    Z = Xc[..., 2][valid]
+    u = np.broadcast_to(u_grid, (N, H, W))[valid]
+    v = np.broadcast_to(v_grid, (N, H, W))[valid]
+    w = w0[valid]
+
+    # Optional downsampling to keep things light
+    if num_valid > max_points:
+        idx = np.random.default_rng(0).choice(num_valid, size=max_points, replace=False)
+        X, Y, Z, u, v, w = X[idx], Y[idx], Z[idx], u[idx], v[idx], w[idx]
+        num_valid = max_points
+
+    xnz = X / Z
+    ynz = Y / Z
+
+    # Two independent 2-parameter fits:
+    #   u = [xnz, 1] @ [fx, cx]
+    #   v = [ynz, 1] @ [fy, cy]
+    Au = np.stack([xnz, np.ones_like(xnz)], axis=1)  # (M,2)
+    Av = np.stack([ynz, np.ones_like(ynz)], axis=1)  # (M,2)
+
+    # Base weights incorporate confidence; Huber IRLS provides robustness
+    theta_u, rmse_u = _irls(Au, u, w0=w)
+    theta_v, rmse_v = _irls(Av, v, w0=w)
+
+    fx, cx = float(theta_u[0]), float(theta_u[1])
+    fy, cy = float(theta_v[0]), float(theta_v[1])
+
+    K = np.array([[fx, 0.0, cx],
+                  [0.0, fy, cy],
+                  [0.0, 0.0, 1.0]], dtype=np.float64)
+
+    stats = {
+        "num_used": num_valid,
+        "rmse_u_px": rmse_u,
+        "rmse_v_px": rmse_v,
+        "fx": fx, "fy": fy, "cx": cx, "cy": cy,
+        "H": H, "W": W, "N": N,
+    }
+    return K, stats
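+
+
+# A minimal, self-contained sanity check for the estimator above (illustrative,
+# not called anywhere in the pipeline): it synthesizes camera-space points from
+# a known K and verifies that the two 2-parameter fits recover it. The sizes,
+# tolerances, and default K values are arbitrary assumptions.
+def _selfcheck_intrinsics(fx=500.0, fy=510.0, cx=320.0, cy=240.0, H=480, W=640):
+    rng = np.random.default_rng(0)
+    # Back-project a pixel grid with random depths into camera-space points
+    u, v = np.meshgrid(np.arange(W) + 0.5, np.arange(H) + 0.5, indexing="xy")
+    Z = rng.uniform(1.0, 5.0, size=(1, H, W))
+    X = (u[None] - cx) / fx * Z
+    Y = (v[None] - cy) / fy * Z
+    pts = np.stack([X, Y, Z], axis=-1)             # (1,H,W,3), camera-space
+    imgs = np.zeros((1, H, W, 3), dtype=np.uint8)  # only H, W are used
+    K, stats = estimate_camera_intrinsics_from_pointcloud(pts, imgs, camera_poses=None)
+    assert abs(K[0, 0] - fx) < 1e-3 and abs(K[1, 1] - fy) < 1e-3, stats
+    return K, stats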
(N,H,W,1).") + w0 = np.clip(conf.astype(np.float64), 1e-6, 1.0) + else: + w0 = np.ones((N, H, W), dtype=np.float64) + + valid = base_valid & (w0 > 0) + num_valid = int(valid.sum()) + if num_valid < 10: + raise ValueError("Not enough valid correspondences to estimate intrinsics.") + + # ---- build linear systems ---- + # Flatten per-frame and stack across frames + X = Xc[..., 0][valid] + Y = Xc[..., 1][valid] + Z = Xc[..., 2][valid] + u = np.broadcast_to(u_grid, (N, H, W))[valid] + v = np.broadcast_to(v_grid, (N, H, W))[valid] + w = w0[valid] + + # Optional downsampling to keep things light + if num_valid > max_points: + idx = np.random.default_rng(0).choice(num_valid, size=max_points, replace=False) + X, Y, Z, u, v, w = X[idx], Y[idx], Z[idx], u[idx], v[idx], w[idx] + num_valid = max_points + + xnz = X / Z + ynz = Y / Z + + # Two independent 2-parameter fits: + # u = [xnz, 1] @ [fx, cx] + # v = [ynz, 1] @ [fy, cy] + Au = np.stack([xnz, np.ones_like(xnz)], axis=1) # (M,2) + Av = np.stack([ynz, np.ones_like(ynz)], axis=1) # (M,2) + + # Base weights incorporate confidence; Huber IRLS provides robustness + theta_u, rmse_u = _irls(Au, u, w0=w) + theta_v, rmse_v = _irls(Av, v, w0=w) + + fx, cx = float(theta_u[0]), float(theta_u[1]) + fy, cy = float(theta_v[0]), float(theta_v[1]) + + K = np.array([[fx, 0.0, cx], + [0.0, fy, cy], + [0.0, 0.0, 1.0]], dtype=np.float64) + + stats = { + "num_used": num_valid, + "rmse_u_px": rmse_u, + "rmse_v_px": rmse_v, + "fx": fx, "fy": fy, "cx": cx, "cy": cy, + "H": H, "W": W, "N": N, + } + return K, stats + + +def save_nerfstudio_format(predictions, intrinsics, path): + """ + Serialize a sequence of frames (poses + intrinsics) to a NeRF/Colmap-style JSON. + + This writes one shared pinhole intrinsics for all frames of a single camera and + a per-frame SE(3) transform. It is compatible with common loaders that expect + fields like `fl_x`, `fl_y`, `cx`, `cy`, `w`, `h`, and `camera_model="OPENCV"`. + + Parameters + ---------- + predictions : dict + Dictionary produced by your pipeline. Expected keys: + - 'points': np.ndarray of shape (N, H, W, 3). Used to infer N, H, W if images absent. + - 'camera_poses': np.ndarray of shape (N, 4, 4) or (N, 3, 4). Camera-to-world is typical. + - 'images' (optional): np.ndarray of shape (N, H, W, C). If present, used for H and W. + Any other keys in `predictions` are passed through unchanged (function returns the same dict). + + intrinsics : one of + - np.ndarray shape (3, 3): camera matrix K. + - tuple/list: (K, stats) as returned by `estimate_camera_intrinsics_from_pointcloud`, + where K is (3,3) and stats may include 'H' and 'W' used for automatic scaling. + - dict with keys 'fx', 'fy', 'cx', 'cy' (and optional 'H', 'W' for source resolution). + Optional distortion keys 'k1', 'k2', 'p1', 'p2', 'k3' are supported; if omitted they default to 0. + + If `intrinsics` contains a source resolution (stats['H']/['W'] or keys 'H'/'W'), + the focal lengths and principal point will be scaled to the current (H, W) of the frames. + + path : str + Output file path (e.g., "transforms.json"). + + Notes + ----- + - The JSON contains a list of `frames`, each with: + { + "file_path": "camera/XYZ.jpg", + "transform_matrix": 4x4 (list of lists), + "w": W, "h": H, + "fl_x": fx, "fl_y": fy, "cx": cx, "cy": cy, + "camera_model": "OPENCV", + "k1": 0, "k2": 0, "p1": 0, "p2": 0, "k3": 0 # included if available/assumed + } + File paths are templated as "camera/{i:03}.jpg". Adjust later if your filenames differ. 
+ """ + # ---- resolve N, H, W from predictions ---- + if 'images' in predictions and predictions['images'] is not None: + N, H, W = predictions['images'].shape[:3] + else: + N, H, W, _ = predictions['points'].shape + + # ---- parse intrinsics into fx, fy, cx, cy and (optional) distortion ---- + def _parse_intrinsics(_intr): + k1 = k2 = p1 = p2 = k3 = 0.0 + srcH = srcW = None + + if isinstance(_intr, (list, tuple)) and len(_intr) >= 1 and ( + isinstance(_intr[0], np.ndarray) and _intr[0].shape == (3, 3) + ): + K = _intr[0] + fx, fy, cx, cy = float(K[0, 0]), float(K[1, 1]), float(K[0, 2]), float(K[1, 2]) + # source resolution from stats if present + if len(_intr) > 1 and isinstance(_intr[1], dict): + srcH = _intr[1].get('H') + srcW = _intr[1].get('W') + elif isinstance(_intr, np.ndarray) and _intr.shape == (3, 3): + fx, fy, cx, cy = float(_intr[0, 0]), float(_intr[1, 1]), float(_intr[0, 2]), float(_intr[1, 2]) + elif isinstance(_intr, dict): + if 'K' in _intr and isinstance(_intr['K'], np.ndarray) and _intr['K'].shape == (3, 3): + K = _intr['K'] + fx, fy, cx, cy = float(K[0, 0]), float(K[1, 1]), float(K[0, 2]), float(K[1, 2]) + else: + fx = float(_intr['fx']); fy = float(_intr['fy']); cx = float(_intr['cx']); cy = float(_intr['cy']) + # optional distortion + k1 = float(_intr.get('k1', 0.0)); k2 = float(_intr.get('k2', 0.0)) + p1 = float(_intr.get('p1', 0.0)); p2 = float(_intr.get('p2', 0.0)) + k3 = float(_intr.get('k3', 0.0)) + # optional source resolution + srcH = _intr.get('H'); srcW = _intr.get('W') + if 'stats' in _intr and isinstance(_intr['stats'], dict): + srcH = _intr['stats'].get('H', srcH) + srcW = _intr['stats'].get('W', srcW) + else: + raise TypeError("Unsupported `intrinsics` type. Provide (K, stats), K, or a dict with fx/fy/cx/cy.") + + # scale intrinsics if they were estimated at a different resolution + if srcH is not None and srcW is not None and (srcH != H or srcW != W): + sx = W / float(srcW) + sy = H / float(srcH) + fx *= sx; cx *= sx + fy *= sy; cy *= sy + + return { + 'fx': fx, 'fy': fy, 'cx': cx, 'cy': cy, + 'k1': k1, 'k2': k2, 'p1': p1, 'p2': p2, 'k3': k3 + } + + intr = _parse_intrinsics(intrinsics) + + # ---- normalize/upgrade poses to 4x4 lists ---- + poses = predictions.get('camera_poses') + if poses is None: + # fall back to identity transforms if not provided + poses = np.tile(np.eye(4, dtype=float), (N, 1, 1)) + poses = np.asarray(poses) + + if poses.ndim != 3 or poses.shape[0] != N or poses.shape[1] not in (3, 4) or poses.shape[2] != 4: + raise ValueError("`predictions['camera_poses']` must have shape (N, 4, 4) or (N, 3, 4).") + + if poses.shape[1] == 3: # (N,3,4) -> (N,4,4) + bottom = np.array([0.0, 0.0, 0.0, 1.0], dtype=float)[None, None, :] + bottom = np.repeat(bottom, repeats=N, axis=0) + poses_4x4 = np.concatenate([poses, bottom], axis=1) + else: + poses_4x4 = poses.astype(float, copy=False) + + # ---- build JSON ---- + output = {"frames": []} + for i in range(N): + frame = { + "file_path": f"camera/{i:03}.jpg", + "transform_matrix": (poses_4x4[i] @ np.diag([1.0, -1.0, -1.0, 1.0])).tolist(), # convert OpenCV to OpenGL + "w": int(W), + "h": int(H), + "fl_x": float(intr['fx']), + "fl_y": float(intr['fy']), + "cx": float(intr['cx']), + "cy": float(intr['cy']), + "camera_model": "PINHOLE", + } + # include distortion terms if non-zero (or include all with zeros for compatibility) + frame.update({ + "k1": float(intr.get('k1', 0.0)), + "k2": float(intr.get('k2', 0.0)), + "p1": float(intr.get('p1', 0.0)), + "p2": float(intr.get('p2', 0.0)), + "k3": 
+
+
+def save_images(
+    predictions: dict,
+    out_dir: str,
+    image_quality: int = 95,
+) -> None:
+    """
+    Save per-frame RGB images.
+
+    Images are written as JPEG to `<out_dir>/camera/{i:03}.jpg`.
+
+    Parameters
+    ----------
+    predictions : dict
+        Expected key:
+        - 'images': (N, H, W, C) array in [0,1] or [0,255] (dtype float/uint8).
+    out_dir : str
+        Root folder where `camera/` is created.
+    image_quality : int
+        JPEG quality for image saving (1..100).
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    cam_dir = os.path.join(out_dir, "camera")
+    os.makedirs(cam_dir, exist_ok=True)
+
+    images = predictions["images"]
+    N = images.shape[0]
+
+    # ---- save per-frame ----
+    for i in range(N):
+        img = images[i]
+        if img.shape[-1] > 3:
+            img = img[..., :3]  # drop extra channels if any
+        if img.dtype != np.uint8:
+            # normalize into [0, 255]
+            finite = np.isfinite(img)
+            if img.max() <= 1.01 and img.min() >= -0.01:
+                img = np.clip(img, 0.0, 1.0) * 255.0
+            else:
+                img = np.clip(img, 0.0, 255.0)
+            img = np.where(finite, img, 0)
+            img = np.round(img).astype(np.uint8)
+        Image.fromarray(img).save(
+            os.path.join(cam_dir, f"{i:03}.jpg"),
+            quality=image_quality,
+            subsampling=0
+        )
+
+    return
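+
+
+# Usage sketch for the writers above (hypothetical `preds` dict; shapes as
+# documented: 'images' (N,H,W,3) float in [0,1], 'points' (N,H,W,3) world-space,
+# 'camera_poses' (N,4,4) c2w):
+#
+#     K, stats = estimate_camera_intrinsics_from_pointcloud(
+#         points=preds['points'], images=preds['images'],
+#         camera_poses=preds['camera_poses'])
+#     save_images(preds, out_dir="out")                       # out/camera/000.jpg, ...
+#     save_nerfstudio_format(preds, (K, stats), "out/transforms.json")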
+
+
+def save_local_points_pcd(
+    predictions: dict,
+    out_dir: str,
+    per_frame: bool = True,
+    poses_are_cam_to_world: bool = True,
+    stride: int = 1,
+    conf_thresh: float = 0.0,
+    z_min: float = 1e-6,
+    binary: bool = True,
+) -> None:
+    """
+    Save colored point clouds from `predictions` in PCD format.
+
+    This function exports **camera-space** point clouds using `local_points`
+    (preferred), or derives them by transforming `points` with `camera_poses`.
+    Colors are sampled from `images` and stored in the PCD `rgb` field
+    (packed float, PCL-compatible). You can save one PCD per frame or a single
+    merged PCD.
+
+    Parameters
+    ----------
+    predictions : dict
+        Expected keys:
+        - 'images': (N,H,W,3) uint8 or float array in [0,1] or [0,255].
+        - 'local_points' (preferred): (N,H,W,3) camera-space 3D points.
+        - 'points': (N,H,W,3) world- or camera-space 3D points (fallback).
+        - 'camera_poses' (optional): (N,4,4) or (N,3,4) extrinsics.
+          If `poses_are_cam_to_world=True`, poses are c2w and will be inverted
+          when deriving camera-space points from world points.
+        - 'conf' (optional): (N,H,W) per-pixel confidence in [0,1] used to filter.
+    out_dir : str
+        Directory to write PCD files into. Subfolder `pcd/` will be created.
+    per_frame : bool
+        If True, writes one PCD per frame: `pcd/{i:03}.pcd`.
+        If False, writes a single merged file: `pcd/all_local.pcd`.
+        The merged file is still in **camera coordinates per frame** (i.e., not a
+        global/world-aligned cloud). If you need a world-space merged cloud, transform
+        `local_points` with c2w first and then call this function with `per_frame=False`.
+    poses_are_cam_to_world : bool
+        How to interpret `camera_poses` when deriving local points from world points.
+    stride : int
+        Spatial subsampling step (e.g., 2 keeps every other pixel) to reduce file size.
+    conf_thresh : float
+        Drop points with confidence < conf_thresh if `conf` is provided.
+    z_min : float
+        Minimum positive depth (Z in camera coordinates) to keep a point.
+    binary : bool
+        If True, write `DATA binary` PCD (smaller, faster). If False, write ASCII.
+
+    Notes
+    -----
+    - PCD layout: FIELDS x y z rgb (x, y, z float32; rgb packed float32).
+    - Invalid / non-finite / behind-camera points are omitted.
+    - Colors are RGB from `images`; float images are scaled to [0..255].
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    pcd_dir = os.path.join(out_dir, "pcd")
+    os.makedirs(pcd_dir, exist_ok=True)
+
+    images = np.asarray(predictions["images"])
+    N, H, W = images.shape[:3]
+
+    # Resolve camera-space (local) points
+    if "local_points" in predictions and predictions["local_points"] is not None:
+        local = np.asarray(predictions["local_points"], dtype=np.float32)
+        assert local.shape[:3] == (N, H, W), "`local_points` shape must match images."
+    else:
+        pts = np.asarray(predictions["points"], dtype=np.float64)
+        assert pts.shape[:3] == (N, H, W), "`points` shape must match images."
+        poses = predictions.get("camera_poses", None)
+        if poses is None:
+            # Assume `points` are already in camera coordinates
+            local = pts.astype(np.float32, copy=False)
+        else:
+            poses = np.asarray(poses)
+            if poses.shape[-2:] == (3, 4):
+                bottom = np.array([0, 0, 0, 1.0], dtype=poses.dtype)[None, None, :]
+                bottom = np.repeat(bottom, repeats=N, axis=0)
+                poses = np.concatenate([poses, bottom], axis=1)
+            if poses.shape[-2:] != (4, 4):
+                raise ValueError("camera_poses must be (N,4,4) or (N,3,4).")
+
+            # world -> camera
+            if poses_are_cam_to_world:
+                R = poses[:, :3, :3]  # c2w
+                t = poses[:, :3, 3]
+                Rt = np.transpose(R, (0, 2, 1))
+                t_inv = -(Rt @ t[..., None])[..., 0]
+                R_w2c, t_w2c = Rt, t_inv
+            else:
+                R_w2c = poses[:, :3, :3]
+                t_w2c = poses[:, :3, 3]
+
+            local = np.empty_like(pts, dtype=np.float32)
+            for i in range(N):
+                Xw = pts[i].reshape(-1, 3)
+                Xc = (Xw @ R_w2c[i].T) + t_w2c[i][None, :]
+                local[i] = Xc.reshape(H, W, 3)
+
+    # Prepare masks
+    valid_z = local[..., 2] > z_min
+    finite = np.isfinite(local).all(axis=-1)
+    valid = valid_z & finite
+    if "conf" in predictions and predictions["conf"] is not None:
+        conf = np.asarray(predictions["conf"])
+        if conf.ndim == 4 and conf.shape[-1] == 1:
+            conf = conf[..., 0]
+        assert conf.shape[:3] == (N, H, W)
+        valid &= (conf >= conf_thresh)
+
+    # Normalize colors to uint8 RGB
+    imgs_u8 = images
+    if imgs_u8.dtype != np.uint8:
+        # auto-scale float images
+        if imgs_u8.max() <= 1.01 and imgs_u8.min() >= -0.01:
+            imgs_u8 = np.clip(imgs_u8, 0, 1) * 255.0
+        imgs_u8 = np.clip(imgs_u8, 0, 255).round().astype(np.uint8)
+
+    if stride < 1:
+        stride = 1
+
+    def _pack_rgb_float(r, g, b) -> float:
+        """Pack three uint8 channels into a single float32 'rgb' as PCL expects."""
+        u = (int(r) << 16) | (int(g) << 8) | int(b)
+        return struct.unpack('<f', struct.pack('<I', u))[0]
+
+    def _write_pcd_xyzrgb(path: str, xyz: np.ndarray, rgbf: np.ndarray, binary_flag: bool = True) -> None:
+        """Write an x/y/z/rgb PCD file (rgb as packed float32)."""
+        n = xyz.shape[0]
+        header = (
+            "# .PCD v0.7 - Point Cloud Data file format\n"
+            "VERSION 0.7\n"
+            "FIELDS x y z rgb\n"
+            "SIZE 4 4 4 4\n"
+            "TYPE F F F F\n"
+            "COUNT 1 1 1 1\n"
+            f"WIDTH {n}\n"
+            "HEIGHT 1\n"
+            "VIEWPOINT 0 0 0 1 0 0 0\n"
+            f"POINTS {n}\n"
+        )
+        data = np.column_stack([xyz.astype(np.float32), rgbf.astype(np.float32)])
+        if binary_flag:
+            with open(path, 'wb') as f:
+                f.write((header + "DATA binary\n").encode('ascii'))
+                f.write(data.tobytes())
+        else:
+            with open(path, 'w') as f:
+                f.write(header + "DATA ascii\n")
+                np.savetxt(f, data, fmt="%.6f %.6f %.6f %.9e")
+
+    # ---- write point clouds ----
+    if per_frame:
+        for i in range(N):
+            m = valid[i]
+            if stride > 1:
+                m_strided = np.zeros_like(m)
+                m_strided[::stride, ::stride] = True
+                m &= m_strided
+            if not np.any(m):
+                continue
+            pts = local[i][m].reshape(-1, 3)
+            cols = imgs_u8[i][m].reshape(-1, 3)  # RGB
+            rgbf = np.array([_pack_rgb_float(r, g, b) for r, g, b in cols], dtype=np.float32)
+            out_path = os.path.join(pcd_dir, f"{i:03}.pcd")
+            _write_pcd_xyzrgb(out_path, pts, rgbf, binary_flag=binary)
+    else:
+        xyz_list = []
+        rgb_list = []
+        for i in range(N):
+            m = valid[i]
+            if stride > 1:
+                m_strided = np.zeros_like(m)
+                m_strided[::stride, ::stride] = True
+                m &= m_strided
+            if not np.any(m):
+                continue
+            pts = local[i][m].reshape(-1, 3)
+            cols = imgs_u8[i][m].reshape(-1, 3)
+            rgbf = np.array([_pack_rgb_float(r, g, b) for r, g, b in cols], dtype=np.float32)
+            xyz_list.append(pts)
+            rgb_list.append(rgbf)
+        if xyz_list:
+            xyz = np.concatenate(xyz_list, axis=0)
+            rgbf = np.concatenate(rgb_list, axis=0)
+        else:
+            xyz = np.empty((0, 3), dtype=np.float32)
+            rgbf = np.empty((0,), dtype=np.float32)
+        out_path = os.path.join(pcd_dir, "all_local.pcd")
+        _write_pcd_xyzrgb(out_path, xyz, rgbf, binary_flag=binary)
+
+    return
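+
+
+# Round-trip sketch for the packed 'rgb' convention used above: PCL stores the
+# 24-bit color in the low bits of a uint32 and reinterprets those bits as a
+# float32. This vectorized unpacking is illustrative and not used elsewhere;
+# it assumes native little-endian layout, matching the '<I'/'<f' packing.
+def _unpack_rgb_float(rgbf: np.ndarray) -> np.ndarray:
+    u = np.asarray(rgbf, dtype=np.float32).view(np.uint32)
+    r = (u >> 16) & 0xFF
+    g = (u >> 8) & 0xFF
+    b = u & 0xFF
+    return np.stack([r, g, b], axis=-1).astype(np.uint8)  # (..., 3) RGB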
+
+
+if __name__ == '__main__':
+    # --- Argument Parsing ---
+    parser = argparse.ArgumentParser(description="Run inference with the Pi3 model.")
+    parser.add_argument("--data_path", type=str, required=True,
+                        help="Path to the input image directory or a video file.")
+    parser.add_argument("--save_path", type=str, required=True,
+                        help="Directory to save the outputs (transforms.json, images, optional PCDs).")
+    parser.add_argument("--interval", type=int, default=-1,
+                        help="Interval to sample images. Default: 1 for an image dir, 10 for a video.")
+    parser.add_argument("--ckpt", type=str, default=None,
+                        help="Path to the model checkpoint file. Default: None")
+    parser.add_argument("--device", type=str, default='cuda',
+                        help="Device to run inference on ('cuda' or 'cpu'). Default: 'cuda'")
+    parser.add_argument("--save_points", action='store_true',
+                        help="Save colorized local_points in PCD format.")
+
+    args = parser.parse_args()
+    if args.interval < 0:
+        args.interval = 10 if args.data_path.endswith('.mp4') else 1
+    print(f'Sampling interval: {args.interval}')
+
+    # 1. Prepare the model
+    print("Loading model...")
+    device = torch.device(args.device)
+    if args.ckpt is not None:
+        model = Pi3().to(device).eval()
+        if args.ckpt.endswith('.safetensors'):
+            from safetensors.torch import load_file
+            weight = load_file(args.ckpt)
+        else:
+            weight = torch.load(args.ckpt, map_location=device, weights_only=False)
+        model.load_state_dict(weight)
+    else:
+        model = Pi3.from_pretrained("yyfz233/Pi3").to(device).eval()
+
+    # 2. Prepare the input data
+    imgs = load_images_as_tensor(args.data_path, interval=args.interval).to(device)  # (N, 3, H, W)
+
+    # 3. Create the output directory
+    os.makedirs(args.save_path, exist_ok=True)
+
+    # 4. Infer
+    print("Running model inference...")
+    if device.type == 'cuda':
+        dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+        amp_ctx = torch.amp.autocast('cuda', dtype=dtype)
+    else:
+        amp_ctx = contextlib.nullcontext()
+    with torch.no_grad():
+        with amp_ctx:
+            predictions = model(imgs[None])  # add batch dimension
+
+    predictions['images'] = imgs[None].permute(0, 1, 3, 4, 2)
+    predictions['conf'] = torch.sigmoid(predictions['conf'])
+    edge = depth_edge(predictions['local_points'][..., 2], rtol=0.03)
+    predictions['conf'][edge] = 0.0
+    # note: keep 'local_points'; it is used by save_local_points_pcd below
+
+    # Convert tensors to numpy and drop the batch dimension
+    for key in predictions.keys():
+        if isinstance(predictions[key], torch.Tensor):
+            predictions[key] = predictions[key].cpu().numpy().squeeze(0)
+    print(predictions['camera_poses'].shape)
+    print(predictions['points'].shape)
+    print(predictions['conf'].shape)
+
+    # 5. Estimate intrinsic parameters
+    K, stats = estimate_camera_intrinsics_from_pointcloud(
+        points=predictions['points'],                  # (N,H,W,3)
+        images=predictions['images'],                  # (N,H,W,3)
+        camera_poses=predictions.get('camera_poses'),  # (N,4,4) if points are world-space
+        poses_are_cam_to_world=True,                   # Pi3 typically returns c2w poses
+        conf=predictions.get('conf'),                  # (N,H,W)
+    )
+    print("K =\n", K)
+    print("diagnostics:", stats)
+
+    # 6. Save predictions as transforms.json (nerfstudio format)
+    save_nerfstudio_format(predictions, K, os.path.join(args.save_path, "transforms.json"))
+
+    # 7. Save per-frame images
+    save_images(predictions, out_dir=args.save_path)
+
+    # 8. Optionally save colorized camera-space point clouds
+    if args.save_points:
+        save_local_points_pcd(
+            predictions,
+            out_dir=args.save_path,
+            per_frame=True,
+            poses_are_cam_to_world=True,
+            stride=2,
+            conf_thresh=0.3,  # 'conf' is in [0,1] after the sigmoid above
+            z_min=1e-6,
+            binary=True,
+        )
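+
+# Example invocation (paths are placeholders):
+#   python demo_nerfstudio.py --data_path ./frames --save_path ./out --save_points
+# which writes ./out/transforms.json, ./out/camera/*.jpg and, with --save_points,
+# ./out/pcd/*.pcd.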