157 changes: 141 additions & 16 deletions tinynav/core/models_trt.py
@@ -4,7 +4,7 @@
from codetiming import Timer
import platform
import asyncio
from tinynav.core.func import alru_cache_numpy
from tinynav.core.func import alru_cache_numpy, lru_cache_numpy

from cuda import cudart
import ctypes
@@ -126,20 +126,63 @@ async def run_graph(self):
results[out["name"]] = out["host"].copy()
return results

def run_graph_sync(self):
if "aarch64" not in platform.machine():
for inp in self.inputs:
cudart.cudaMemcpyAsync(
inp["device"],
inp["host"].ctypes.data,
inp["nbytes"],
cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
self.stream,
)

cudart.cudaGraphLaunch(self.graph_exec, self.stream)

if "aarch64" not in platform.machine():
for out in self.outputs:
cudart.cudaMemcpyAsync(
out["host"].ctypes.data,
out["device"],
out["nbytes"],
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
self.stream,
)

cudart.cudaStreamSynchronize(self.stream)

results = {}
for out in self.outputs:
results[out["name"]] = out["host"].copy()
return results


class SuperPointTRT(TRTBase):
def __init__(self, engine_path=f"/tinynav/tinynav/models/superpoint_fp16_dynamic_{platform.machine()}.plan"):
super().__init__(engine_path)
# model input [1,1,H,W]
self.input_shape = self.inputs[0]["shape"][2:4] # [H,W]

def _postprocess(self, input_image: np.ndarray, results: dict):
h_in, w_in = input_image.shape[0], input_image.shape[1]
h_net, w_net = self.input_shape[0], self.input_shape[1]
scale_x = w_in / w_net
scale_y = h_in / h_net
k = results["kpts"][0]
if k.shape[0] == 2:
k[0] = (k[0] + 0.5) * scale_x - 0.5
k[1] = (k[1] + 0.5) * scale_y - 0.5
else:
k[:, 0] = (k[:, 0] + 0.5) * scale_x - 0.5
k[:, 1] = (k[:, 1] + 0.5) * scale_y - 0.5
results["mask"] = results["mask"][:, :, None]
return results

# default threshold as in
# https://github.com/cvg/LightGlue/blob/746fac2c042e05d1865315b1413419f1c1e7ba55/lightglue/superpoint.py#L111
#
@alru_cache_numpy(maxsize=32)
async def infer(self, input_image:np.ndarray, threshold = np.array([[0.0005]], dtype=np.float32)):
# Resize to engine input size (may change aspect ratio for non-matching resolutions).
h_in, w_in = input_image.shape[0], input_image.shape[1]
h_net, w_net = self.input_shape[0], self.input_shape[1]
image = cv2.resize(input_image, (w_net, h_net))
image = image[None, None, :, :]
@@ -148,20 +191,19 @@ async def infer(self, input_image:np.ndarray, threshold = np.array([[0.0005]], d
np.copyto(self.inputs[1]["host"], threshold)

results = await self.run_graph()
return self._postprocess(input_image, results)

# Scale keypoints from network coords (h_net, w_net) back to input image coords (h_in, w_in).
# Use per-axis scale so Looper (640x544) and other resolutions match; img_shape is (width, height).
scale_x = w_in / w_net
scale_y = h_in / h_net
k = results["kpts"][0]
if k.shape[0] == 2:
k[0] = (k[0] + 0.5) * scale_x - 0.5
k[1] = (k[1] + 0.5) * scale_y - 0.5
else:
k[:, 0] = (k[:, 0] + 0.5) * scale_x - 0.5
k[:, 1] = (k[:, 1] + 0.5) * scale_y - 0.5
results["mask"] = results["mask"][:, :, None]
return results
@lru_cache_numpy(maxsize=32)
def infer_sync(self, input_image:np.ndarray, threshold = np.array([[0.0005]], dtype=np.float32)):
h_net, w_net = self.input_shape[0], self.input_shape[1]
image = cv2.resize(input_image, (w_net, h_net))
image = image[None, None, :, :]

np.copyto(self.inputs[0]["host"], image)
np.copyto(self.inputs[1]["host"], threshold)

results = self.run_graph_sync()
return self._postprocess(input_image, results)

class LightGlueTRT(TRTBase):
def __init__(self, engine_path=f"/tinynav/tinynav/models/lightglue_fp16_{platform.machine()}.plan"):
@@ -184,6 +226,20 @@ async def infer(self, kpts0, kpts1, desc0, desc1, mask0, mask1, img_shape0, img_

return await self.run_graph()

@lru_cache_numpy(maxsize=32)
def infer_sync(self, kpts0, kpts1, desc0, desc1, mask0, mask1, img_shape0, img_shape1, match_threshold = np.array([[0.1]], dtype=np.float32)):
np.copyto(self.inputs[0]["host"], kpts0)
np.copyto(self.inputs[1]["host"], kpts1)
np.copyto(self.inputs[2]["host"], desc0)
np.copyto(self.inputs[3]["host"], desc1)
np.copyto(self.inputs[4]["host"], mask0)
np.copyto(self.inputs[5]["host"], mask1)
np.copyto(self.inputs[6]["host"], img_shape0)
np.copyto(self.inputs[7]["host"], img_shape1)
np.copyto(self.inputs[8]["host"], match_threshold)

return self.run_graph_sync()

class Dinov2TRT(TRTBase):
def __init__(self, engine_path=f"/tinynav/tinynav/models/dinov2_base_224x224_fp16_{platform.machine()}.plan"):
super().__init__(engine_path)
@@ -206,6 +262,12 @@ async def infer(self, image):
results = await self.run_graph()
return results["last_hidden_state"][:, 0, :].squeeze(0)

def infer_sync(self, image):
image = self.preprocess_image(image)
np.copyto(self.inputs[0]["host"], image)
results = self.run_graph_sync()
return results["last_hidden_state"][:, 0, :].squeeze(0)


class StereoEngineTRT(TRTBase):
def _get_static_shape(self, name):
@@ -285,6 +347,47 @@ async def run_graph(self):
results[name] = np.array(arr).copy() if arr.ndim == 0 else arr.copy()
return results

def run_graph_sync(self):
input_shapes = self._current_input_shapes
if "aarch64" not in platform.machine():
cudart.cudaMemcpyAsync(self.inputs[0]["device"], self.inputs[0]["host"].ctypes.data,
self._current_input_nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
cudart.cudaMemcpyAsync(self.inputs[1]["device"], self.inputs[1]["host"].ctypes.data,
self._current_input_nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
for inp in self.inputs[2:]:
cudart.cudaMemcpyAsync(inp["device"], inp["host"].ctypes.data,
inp["nbytes"], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
self.context.set_optimization_profile_async(0, self.stream)
self.context.set_input_shape("left", input_shapes)
self.context.set_input_shape("right", input_shapes)
self.context.execute_async_v3(stream_handle=self.stream)
h_net, w_net = input_shapes[2], input_shapes[3]
if "aarch64" not in platform.machine():
for out in self.outputs:
if out["name"] in ("disp", "depth"):
nbytes = input_shapes[2] * input_shapes[3] * np.float32().itemsize
else:
nbytes = out["nbytes"]
cudart.cudaMemcpyAsync(
out["host"].ctypes.data,
out["device"],
nbytes,
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
self.stream,
)
cudart.cudaStreamSynchronize(self.stream)
results = {}
for out in self.outputs:
arr = out["host"]
name = out["name"]
if name in ("disp", "depth"):
flat = np.asarray(arr).reshape(-1)
needed = h_net * w_net
results[name] = flat[:needed].reshape(h_net, w_net).copy()
else:
results[name] = np.array(arr).copy() if arr.ndim == 0 else arr.copy()
return results

async def infer(self, left_img, right_img, baseline, focal_length):
h_in, w_in = left_img.shape[0], left_img.shape[1]

@@ -308,6 +411,28 @@ async def infer(self, left_img, right_img, baseline, focal_length):
)
return disp.astype(np.float32), depth.astype(np.float32)

def infer_sync(self, left_img, right_img, baseline, focal_length):
h_in, w_in = left_img.shape[0], left_img.shape[1]

self._current_input_shapes = (1, 1, h_in, w_in)
self._current_input_nbytes = h_in * w_in * np.uint8().itemsize

left_tensor = left_img.astype(np.uint8).ravel()
right_tensor = right_img.astype(np.uint8).ravel()
np.copyto(self.inputs[0]["host"].reshape(-1)[: left_tensor.size], left_tensor)
np.copyto(self.inputs[1]["host"].reshape(-1)[: right_tensor.size], right_tensor)
np.copyto(self.inputs[2]["host"], baseline)
np.copyto(self.inputs[3]["host"], focal_length)

results = self.run_graph_sync()
disp = results["disp"]
depth = results["depth"]
if disp.shape != (h_in, w_in) or depth.shape != (h_in, w_in):
raise RuntimeError(
f"StereoEngine output shape mismatch: got disp {disp.shape}, depth {depth.shape}, expected ({h_in}, {w_in})"
)
return disp.astype(np.float32), depth.astype(np.float32)


if __name__ == "__main__":
# Synthetic sanity test for both RealSense and Looper resolutions.
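For reviewers, here is a minimal sketch of how the new synchronous path could be exercised outside ROS, in the spirit of the synthetic sanity test above. The resolution, baseline, focal length, random inputs, and image-shape dtype below are illustrative assumptions, not values taken from this PR, and the default engine paths baked into the constructors are assumed to exist on disk.

import numpy as np
from tinynav.core.models_trt import SuperPointTRT, LightGlueTRT, StereoEngineTRT

# Assumed Looper-style resolution (width=640, height=544); RealSense would be 640x480.
left = (np.random.rand(544, 640) * 255).astype(np.uint8)
right = (np.random.rand(544, 640) * 255).astype(np.uint8)

superpoint = SuperPointTRT()
lightglue = LightGlueTRT()
stereo = StereoEngineTRT()

# Feature extraction and matching without an asyncio event loop.
feats0 = superpoint.infer_sync(left)
feats1 = superpoint.infer_sync(right)
img_shape = np.array([[640, 544]], dtype=np.float32)  # (width, height); dtype assumed
matches = lightglue.infer_sync(
    feats0["kpts"], feats1["kpts"],
    feats0["descps"], feats1["descps"],
    feats0["mask"], feats1["mask"],
    img_shape, img_shape,
)

# Stereo disparity/depth with placeholder baseline (m) and focal length (px).
disp, depth = stereo.infer_sync(left, right, np.array([[0.05]]), np.array([[450.0]]))
assert disp.shape == (544, 640) and depth.shape == (544, 640)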
23 changes: 11 additions & 12 deletions tinynav/core/perception_node.py
@@ -17,7 +17,6 @@
from math_utils import rot_from_two_vector, np2msg, np2tf, estimate_pose, se3_inv
from math_utils import uf_init, uf_union, uf_all_sets_list
from tf2_ros import TransformBroadcaster
import asyncio
import gtsam
import gtsam_unstable
from collections import deque
@@ -191,20 +190,20 @@ def images_callback(self, left_msg, right_msg):
self.last_processed_timestamp = current_timestamp
loop_start = time.perf_counter()
with Timer(name="Perception Loop", text="[{name}] Elapsed time: {milliseconds:.0f} ms\n\n", logger=self.logger.info):
processed = asyncio.run(self.process(left_msg, right_msg))
processed = self.process(left_msg, right_msg)
if processed:
loop_ms = (time.perf_counter() - loop_start) * 1000.0
self.stats_pub.publish(Float32MultiArray(data=[float(loop_ms)]))

async def process(self, left_msg, right_msg):
def process(self, left_msg, right_msg):
if self.K is None or self.T_body_last is None:
return False
self.process_cnt += 1
left_img = self.bridge.imgmsg_to_cv2(left_msg, "mono8")
right_img = self.bridge.imgmsg_to_cv2(right_msg, "mono8")
current_timestamp = stamp2second(left_msg.header.stamp)
if len(self.keyframe_queue) == 0: # first frame
disparity, depth = await self.stereo_engine.infer(left_img, right_img, np.array([[self.baseline]]), np.array([[self.K[0,0]]]))
disparity, depth = self.stereo_engine.infer_sync(left_img, right_img, np.array([[self.baseline]]), np.array([[self.K[0,0]]]))
self.keyframe_queue.append(
Keyframe(
timestamp=current_timestamp,
@@ -221,12 +220,12 @@ async def process(self, left_msg, right_msg):
return True

with Timer(name="[Stereo Inference]", text="[{name}] Elapsed time: {milliseconds:.0f} ms", logger=self.logger.debug):
disparity, depth = await self.stereo_engine.infer(left_img, right_img, np.array([[self.baseline]]), np.array([[self.K[0,0]]]))
disparity, depth = self.stereo_engine.infer_sync(left_img, right_img, np.array([[self.baseline]]), np.array([[self.K[0,0]]]))
kf_prev = self.keyframe_queue[-1]
prev_left_extract_result = await self.superpoint.infer(kf_prev.image)
current_left_extract_result = await self.superpoint.infer(left_img)
prev_left_extract_result = self.superpoint.infer_sync(kf_prev.image)
current_left_extract_result = self.superpoint.infer_sync(left_img)

match_result = await self.light_glue.infer(
match_result = self.light_glue.infer_sync(
prev_left_extract_result["kpts"],
current_left_extract_result["kpts"],
prev_left_extract_result["descps"],
@@ -337,7 +336,7 @@ async def process(self, left_msg, right_msg):
#current_i = len(self.keyframe_queue[-_N:])

with Timer(name="[init extract info]", text="[{name}] Elapsed time: {milliseconds:.0f} ms", logger=self.logger.debug):
extract_info = [await self.superpoint.infer(kf.image) for kf in self.keyframe_queue[-_N:]]
extract_info = [self.superpoint.infer_sync(kf.image) for kf in self.keyframe_queue[-_N:]]
parent, rank = uf_init(len(self.keyframe_queue[-_N:]) * _M)

self.logger.debug(f"Processing {len(self.keyframe_queue)} keyframes for data association.")
@@ -355,12 +354,12 @@ async def process(self, left_msg, right_msg):
self.logger.debug("timestamp prev: ", kf_prev.timestamp)
self.logger.debug("timestamp curr: ", kf_curr.timestamp)
with Timer(name="[cached result[1.1/3]]", text="[{name}] Elapsed time: {milliseconds:.03f} ms", logger=self.logger.debug):
prev_left_extract_result = await self.superpoint.infer(kf_prev.image)
prev_left_extract_result = self.superpoint.infer_sync(kf_prev.image)
with Timer(name="[cached result[1.2/3]]", text="[{name}] Elapsed time: {milliseconds:.03f} ms", logger=self.logger.debug):
current_left_extract_result = await self.superpoint.infer(kf_curr.image)
current_left_extract_result = self.superpoint.infer_sync(kf_curr.image)

with Timer(name="[cached result[1.3/3]]", text="[{name}] Elapsed time: {milliseconds:.03f} ms", logger=self.logger.debug):
match_result = await self.light_glue.infer(
match_result = self.light_glue.infer_sync(
prev_left_extract_result["kpts"],
current_left_extract_result["kpts"],
prev_left_extract_result["descps"],
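One design note on caching: the sync variants lean on @lru_cache_numpy (the synchronous sibling of @alru_cache_numpy) so that the repeated superpoint.infer_sync(kf.image) calls in the data-association loop stay cheap. The sketch below is only an assumption about how such an ndarray-aware LRU cache could be built; the actual tinynav.core.func implementation may differ.

import functools
import hashlib
import numpy as np

class _ArrayKey:
    """Hashable wrapper for an ndarray; identity is a digest of its bytes (illustrative only)."""
    def __init__(self, arr):
        self.arr = arr
        self._digest = hashlib.sha1(arr.tobytes()).digest()
    def __hash__(self):
        return hash(self._digest)
    def __eq__(self, other):
        return isinstance(other, _ArrayKey) and self._digest == other._digest

def lru_cache_numpy(maxsize=32):
    """Assumed behaviour: LRU-cache a function whose positional args may include ndarrays."""
    def decorator(func):
        @functools.lru_cache(maxsize=maxsize)
        def cached(*keys):
            # Unwrap array keys back to the original arrays before calling the real function.
            args = [k.arr if isinstance(k, _ArrayKey) else k for k in keys]
            return func(*args)
        @functools.wraps(func)
        def wrapper(*args):
            keys = tuple(_ArrayKey(a) if isinstance(a, np.ndarray) else a for a in args)
            return cached(*keys)
        return wrapper
    return decorator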