Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/container-runtime/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Compatibility Matrix
* - AMD Container Toolkit
- Docker Version
- Supported OS
* - 1.3.0
- 25.0+
- Ubuntu 22.04, Ubuntu 24.04
* - 1.2.0
- 25.0+
- Ubuntu 22.04, Ubuntu 24.04
Expand All @@ -31,6 +34,9 @@ Versioning Information
* - Version
- Release Date
- Highlights
* - v1.3.0
- May 2026
- ``amd-ctk gpu list``, GPU partition grouping, CDI and runtime improvements
* - v1.2.0
- November 2025
- GPU Tracker feature support, Docker Swarm Support
Expand All @@ -41,6 +47,47 @@ Versioning Information
- June 2025
- Initial Release with `amd-ctk`, CDI, Docker Integration, Ubuntu 22.04/24.04 Support

-------------------
v1.3.0 (May 2026)
-------------------

Overview
--------

This release extends the command-line tool with GPU discovery and tightens CDI and runtime behavior.

- ``amd-ctk gpu list``: A dedicated command to display GPU information from the host.
- **CDI hardening and usability:** Human-readable CDI spec formatting, support for custom CDI spec file names, and the ability for non-root users to generate and validate CDI specifications where appropriate.
- **GPU partition grouping:** Partitions of the same physical GPU are now grouped by PCI device topology (``location_id`` and ``domain``) instead of ``unique_id``, improving reliability on multi-GPU and partitioned systems.
- **Runtime and GPU Tracker:** Rootful-Docker checks and more consistent behavior and messaging where GPU Tracker and the CLI interact.

New Features
~~~~~~~~~~~~

- ``amd-ctk gpu list``

- Lists GPU details for AMD devices on the system, complementing existing ``amd-ctk cdi`` workflows.

- **CDI and runtime**

- CDI spec output is formatted for easier reading and review.
- Non-root users can run CDI spec generation and validation in supported configurations.
- Custom CDI spec file names (not limited to ``amd.json``) are supported when loading specs.

Improvements
------------

- **GPU partition grouping:** GPU partitions are now grouped by parent PCI device address derived from topology ``location_id`` and ``domain`` fields, rather than by ``unique_id``. This produces correct grouping on systems where partitions of the same GPU have distinct ``unique_id`` values.
- **Container runtime in rootless / non-rootful Docker:** The runtime exits early when the Docker setup is not rootful, matching supported deployment models.
- **Docker and CDI:** Using ``--gpus`` together with CDI-backed AMD devices requires Docker **29.3.0** or newer.

Upgrade Notes
-------------

- Regenerate or validate CDI specifications after upgrading; manage specs explicitly with ``amd-ctk cdi``.
- If you want to use ``--gpus`` with CDI-backed AMD GPUs, upgrade Docker to **29.3.0** or newer.
- Rootless (non-rootful) Docker configurations are not supported: the container runtime now exits early when the Docker setup is not rootful, so make sure the Docker engine runs in rootful mode before upgrading.

----------------------
v1.2.0 (November 2025)
----------------------
Expand Down
70 changes: 60 additions & 10 deletions internal/amdgpu/amdgpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {

renderDevIds := GetDevIdsFromTopology(fs)

// Map to store devices by unique_id to maintain grouping
uniqueIdDevices := make(map[string][]DeviceInfo)
// Map to store devices by parent dev ID to maintain grouping
devIdToDevices := make(map[string][]DeviceInfo)
var uniqueIds []string // To maintain order

// Process PCI devices
Expand Down Expand Up @@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {

if len(drmDevs) > 0 && renderMinor > 0 {
if devID, exists := renderDevIds[renderMinor]; exists {
if _, exists := uniqueIdDevices[devID]; !exists {
if _, exists := devIdToDevices[devID]; !exists {
uniqueIds = append(uniqueIds, devID)
}
uniqueIdDevices[devID] = append(uniqueIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
devIdToDevices[devID] = append(devIdToDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
}
}
}

// Sort devices within each unique_id group by render minor number
// Sort devices within each parent dev ID group by render minor number
for _, devID := range uniqueIds {
sort.Slice(uniqueIdDevices[devID], func(i, j int) bool {
sort.Slice(devIdToDevices[devID], func(i, j int) bool {
getRenderID := func(devInfo DeviceInfo) int {
devs := devInfo.DrmDevices
for _, dev := range devs {
Expand All @@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
}
return 0
}
return getRenderID(uniqueIdDevices[devID][i]) < getRenderID(uniqueIdDevices[devID][j])
return getRenderID(devIdToDevices[devID][i]) < getRenderID(devIdToDevices[devID][j])
})
}

// Combine all devices, preserving the order in which parent dev IDs were first seen
var devs []DeviceInfo
for _, devID := range uniqueIds {
devs = append(devs, uniqueIdDevices[devID]...)
devs = append(devs, devIdToDevices[devID]...)
}

return devs, nil
Expand Down Expand Up @@ -242,8 +242,10 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) {

var topoUniqueIdRe = regexp.MustCompile(`unique_id\s(\d+)`)
var renderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
var locationIdRe = regexp.MustCompile(`location_id\s(\d+)`)
var domainRe = regexp.MustCompile(`domain\s(\d+)`)

// GetDevIdsFromTopology returns a map of render minor numbers to unique_ids
// GetDevIdsFromTopology returns a map of render minor numbers to parent devID
func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
Expand All @@ -257,6 +259,54 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin
return renderDevIds
}

for _, nodeFile := range nodeFiles {
slog.Debug("Parsing topology node file", "file", nodeFile)
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
if err != nil {
slog.Debug("Error parsing render minor", "file", nodeFile, "error", err)
continue
}

if renderMinor <= 0 || renderMinor > math.MaxInt32 {
continue
}

locationId, e := ParseTopologyProperties(fs, nodeFile, locationIdRe)
if e != nil {
slog.Debug("Error parsing location_id", "file", nodeFile, "error", e)
continue
}

domain, e := ParseTopologyProperties(fs, nodeFile, domainRe)
if e != nil {
slog.Debug("Error parsing domain", "file", nodeFile, "error", e)
continue
}

dev := (locationId >> 3) & 0x1f
bus := (locationId >> 8) & 0xff
devID := fmt.Sprintf("%04x:%02x:%02x:0", domain, bus, dev)

renderDevIds[int(renderMinor)] = devID
}

return renderDevIds
}

// GetUniqueIdsFromTopology returns a map of render minor numbers to unique_ids
func GetUniqueIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
topoRoot = topoRootParam[0]
}

renderDevIds := make(map[int]string)
nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties")
if err != nil {
slog.Warn("Failed to glob topology nodes", "error", err)
return renderDevIds
}

for _, nodeFile := range nodeFiles {
slog.Debug("Parsing topology node file", "file", nodeFile)
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
Expand Down Expand Up @@ -331,7 +381,7 @@ func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error)
return nil, fmt.Errorf("getting AMD GPUs: %w", err)
}

renderDevIds := GetDevIdsFromTopology(fs)
renderDevIds := GetUniqueIdsFromTopology(fs)
uniqueIdToIndex := make(map[string][]int)

// Process each device group and assign index
Expand Down
36 changes: 20 additions & 16 deletions internal/amdgpu/amdgpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,32 +527,32 @@ func TestGetDevIdsFromTopology(t *testing.T) {
name: "single GPU topology",
testCase: "single_gpu",
expectedResult: map[int]string{
128: "1",
128: "0000:05:00:0",
},
},
{
name: "GPU with partition topology",
testCase: "gpu_with_partition",
expectedResult: map[int]string{
128: "1",
129: "1",
128: "0000:05:00:0",
129: "0000:05:00:0",
},
},
{
name: "multiple GPUs topology",
testCase: "multiple_gpus",
expectedResult: map[int]string{
128: "1",
130: "2",
128: "0000:05:00:0",
130: "0000:48:00:0",
},
},
{
name: "unordered partitions topology",
testCase: "unordered_partitions",
expectedResult: map[int]string{
128: "1",
129: "1",
130: "2",
128: "0000:05:00:0",
129: "0000:05:00:0",
130: "0000:48:00:0",
},
},
}
Expand Down Expand Up @@ -589,8 +589,10 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
name: "GPU with partition UUID mapping",
testCase: "gpu_with_partition",
expectedResult: map[string][]int{
"0x1": {0, 1},
"1": {0, 1},
"0x1": {0},
"0x2": {1},
"1": {0},
"2": {1},
},
expectedError: nil,
},
Expand All @@ -599,20 +601,22 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
testCase: "multiple_gpus",
expectedResult: map[string][]int{
"0x1": {0},
"0x3": {1},
"1": {0},
"0x2": {1},
"2": {1},
"3": {1},
},
expectedError: nil,
},
{
name: "unordered partitions UUID mapping",
testCase: "unordered_partitions",
expectedResult: map[string][]int{
"0x1": {0, 1},
"1": {0, 1},
"0x2": {2},
"2": {2},
"0x1": {0},
"0x2": {1},
"0x3": {2},
"1": {0},
"2": {1},
"3": {2},
},
expectedError: nil,
},
Expand Down
4 changes: 3 additions & 1 deletion tests/amdgpu/topology/nodes/0/properties
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 1280
domain 0
6 changes: 4 additions & 2 deletions tests/amdgpu/topology/nodes/1/properties
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
drm_render_minor 129
unique_id 1
unique_id 2
gfx_target_version 90402
cpu_cores_count 20
simd_count 0
mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 1281
domain 0
6 changes: 4 additions & 2 deletions tests/amdgpu/topology/nodes/2/properties
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
drm_render_minor 130
unique_id 2
unique_id 3
gfx_target_version 90402
cpu_cores_count 20
simd_count 0
mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 18432
domain 0