Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions crates/openshell-bootstrap/src/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,19 @@ pub async fn ensure_container(
}]);
}

// Tegra: the nvidia runtime / CDI spec generation picks up its host-file
// injection config from
// /etc/nvidia-container-runtime/host-files-for-container.d on the host.
// When that directory exists, expose it read-only at the same path inside
// the gateway, so the nvidia runtime / CDI spec generation that runs inside
// k3s (for sandbox pods) can apply the same config.
const HOST_FILES_DIR: &str = "/etc/nvidia-container-runtime/host-files-for-container.d";
if std::path::Path::new(HOST_FILES_DIR).is_dir() {
    // Append to the existing bind list, creating it first if none was set.
    host_config
        .binds
        .get_or_insert_with(Vec::new)
        .push(format!("{HOST_FILES_DIR}:{HOST_FILES_DIR}:ro"));
}

let mut cmd = vec![
"server".to_string(),
"--disable=traefik".to_string(),
Expand Down
19 changes: 19 additions & 0 deletions crates/openshell-sandbox/src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,26 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
target_os = "redox"
)))]
{
// Snapshot the container-level supplemental GIDs (e.g. injected by
// CDI for GPU device access) before initgroups replaces them.
// This read is best-effort: if it fails there is simply nothing to
// merge back afterwards.
// Exclude GID 0 (root) to avoid inadvertent privilege retention.
let root_gid = nix::unistd::Gid::from_raw(0);
let container_gids: Vec<nix::unistd::Gid> = nix::unistd::getgroups()
    .unwrap_or_default()
    .into_iter()
    .filter(|&g| g != root_gid)
    .collect();
nix::unistd::initgroups(user_cstr.as_c_str(), group.gid).into_diagnostic()?;
// Merge back any CDI-injected GIDs that initgroups dropped so that
// exec'd processes retain access to GPU devices (e.g. /dev/nvmap on
// Tegra requires the video GID).
//
// Unlike the snapshot above, this read must not fall back to an empty
// list: setgroups below replaces the whole supplementary set, so an
// empty starting point would discard the groups initgroups just
// installed for the target user. Propagate the error instead.
let mut merged: Vec<nix::unistd::Gid> = nix::unistd::getgroups().into_diagnostic()?;
for gid in container_gids {
    if !merged.contains(&gid) {
        merged.push(gid);
    }
}
nix::unistd::setgroups(&merged).into_diagnostic()?;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ spec:
targetNamespace: nvidia-device-plugin
createNamespace: true
valuesContent: |-
image:
repository: ghcr.io/nvidia/k8s-device-plugin
tag: "2ab68c16"
runtimeClassName: nvidia
gfd:
enabled: false
Expand Down
10 changes: 6 additions & 4 deletions e2e/python/test_sandbox_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ def test_gpu_sandbox_reports_available_gpu(
sandbox: Callable[..., Sandbox],
gpu_sandbox_spec: datamodel_pb2.SandboxSpec,
) -> None:
nvidia_smi_args = ["--query-gpu=name", "--format=csv,noheader"]
with sandbox(spec=gpu_sandbox_spec, delete_on_exit=True) as sb:
result = sb.exec(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
timeout_seconds=30,
)
result = sb.exec(["nvidia-smi", *nvidia_smi_args], timeout_seconds=30)
if result.exit_code != 0:
# On some platforms (e.g. Tegra/Jetson) nvidia-smi lives in
# /usr/sbin rather than /usr/bin and may not be on PATH.
result = sb.exec(["/usr/sbin/nvidia-smi", *nvidia_smi_args], timeout_seconds=30)

assert result.exit_code == 0, result.stderr
assert result.stdout.strip()
Loading