From 093e8f77b21137e0d97454dbdbd45ae9f706c60b Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Fri, 24 Apr 2026 14:19:18 +0000 Subject: [PATCH 1/4] update logic for grouping paritions --- internal/amdgpu/amdgpu.go | 66 ++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/internal/amdgpu/amdgpu.go b/internal/amdgpu/amdgpu.go index dd70585..d3b67ec 100644 --- a/internal/amdgpu/amdgpu.go +++ b/internal/amdgpu/amdgpu.go @@ -101,7 +101,7 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { renderDevIds := GetDevIdsFromTopology(fs) // Map to store devices by unique_id to maintain grouping - uniqueIdDevices := make(map[string][]DeviceInfo) + uniqueDevIdDevices := make(map[string][]DeviceInfo) var uniqueIds []string // To maintain order // Process PCI devices @@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { if len(drmDevs) > 0 && renderMinor > 0 { if devID, exists := renderDevIds[renderMinor]; exists { - if _, exists := uniqueIdDevices[devID]; !exists { + if _, exists := uniqueDevIdDevices[devID]; !exists { uniqueIds = append(uniqueIds, devID) } - uniqueIdDevices[devID] = append(uniqueIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType}) + uniqueDevIdDevices[devID] = append(uniqueDevIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType}) } } } // Sort devices within each unique_id group by render minor number for _, devID := range uniqueIds { - sort.Slice(uniqueIdDevices[devID], func(i, j int) bool { + sort.Slice(uniqueDevIdDevices[devID], func(i, j int) bool { getRenderID := func(devInfo DeviceInfo) int { devs := devInfo.DrmDevices for _, dev := range devs { @@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { } return 0 } - return getRenderID(uniqueIdDevices[devID][i]) < getRenderID(uniqueIdDevices[devID][j]) + return getRenderID(uniqueDevIdDevices[devID][i]) < 
getRenderID(uniqueDevIdDevices[devID][j]) }) } // Combine all devices maintaining the unique_id order var devs []DeviceInfo for _, devID := range uniqueIds { - devs = append(devs, uniqueIdDevices[devID]...) + devs = append(devs, uniqueDevIdDevices[devID]...) } return devs, nil @@ -242,8 +242,10 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) { var topoUniqueIdRe = regexp.MustCompile(`unique_id\s(\d+)`) var renderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`) +var locationIdRe = regexp.MustCompile(`location_id\s(\d+)`) +var domainRe = regexp.MustCompile(`domain\s(\d+)`) -// GetDevIdsFromTopology returns a map of render minor numbers to unique_ids +// GetDevIdsFromTopology returns a map of render minor numbers to parent devID func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string { topoRoot := "/sys/class/kfd/kfd" if len(topoRootParam) == 1 { @@ -257,6 +259,54 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin return renderDevIds } + for _, nodeFile := range nodeFiles { + slog.Debug("Parsing topology node file", "file", nodeFile) + renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe) + if err != nil { + slog.Debug("Error parsing render minor", "file", nodeFile, "error", err) + continue + } + + if renderMinor <= 0 || renderMinor > math.MaxInt32 { + continue + } + + locationId, e := ParseTopologyProperties(fs, nodeFile, locationIdRe) + if e != nil { + slog.Debug("Error parsing location_id", "file", nodeFile, "error", e) + continue + } + + domain, e := ParseTopologyProperties(fs, nodeFile, domainRe) + if e != nil { + slog.Debug("Error parsing domain", "file", nodeFile, "error", e) + continue + } + + dev := (locationId >> 3) & 0x1f + bus := (locationId >> 8) & 0xff + devID := fmt.Sprintf("%04x:%02x:%02x:0", domain, bus, dev) + + renderDevIds[int(renderMinor)] = devID + } + + return renderDevIds +} + +// GetUniqueIdsFromTopology returns a map of render minor numbers 
to unique_ids +func GetUniqueIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string { + topoRoot := "/sys/class/kfd/kfd" + if len(topoRootParam) == 1 { + topoRoot = topoRootParam[0] + } + + renderDevIds := make(map[int]string) + nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties") + if err != nil { + slog.Warn("Failed to glob topology nodes", "error", err) + return renderDevIds + } + for _, nodeFile := range nodeFiles { slog.Debug("Parsing topology node file", "file", nodeFile) renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe) @@ -331,7 +381,7 @@ func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error) return nil, fmt.Errorf("getting AMD GPUs: %w", err) } - renderDevIds := GetDevIdsFromTopology(fs) + renderDevIds := GetUniqueIdsFromTopology(fs) uniqueIdToIndex := make(map[string][]int) // Process each device group and assign index From e152d725cef9e05ef347892eb1baba5ac1c4d957 Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Fri, 24 Apr 2026 14:43:15 +0000 Subject: [PATCH 2/4] fix unit test cases --- internal/amdgpu/amdgpu_test.go | 16 ++++++++-------- tests/amdgpu/topology/nodes/0/properties | 4 +++- tests/amdgpu/topology/nodes/1/properties | 4 +++- tests/amdgpu/topology/nodes/2/properties | 4 +++- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/internal/amdgpu/amdgpu_test.go b/internal/amdgpu/amdgpu_test.go index f4984f6..83577fe 100644 --- a/internal/amdgpu/amdgpu_test.go +++ b/internal/amdgpu/amdgpu_test.go @@ -527,32 +527,32 @@ func TestGetDevIdsFromTopology(t *testing.T) { name: "single GPU topology", testCase: "single_gpu", expectedResult: map[int]string{ - 128: "1", + 128: "0000:05:00:0", }, }, { name: "GPU with partition topology", testCase: "gpu_with_partition", expectedResult: map[int]string{ - 128: "1", - 129: "1", + 128: "0000:05:00:0", + 129: "0000:28:00:0", }, }, { name: "multiple GPUs topology", testCase: "multiple_gpus", expectedResult: map[int]string{ - 
128: "1", - 130: "2", + 128: "0000:05:00:0", + 130: "0000:48:00:0", }, }, { name: "unordered partitions topology", testCase: "unordered_partitions", expectedResult: map[int]string{ - 128: "1", - 129: "1", - 130: "2", + 128: "0000:05:00:0", + 129: "0000:28:00:0", + 130: "0000:48:00:0", }, }, } diff --git a/tests/amdgpu/topology/nodes/0/properties b/tests/amdgpu/topology/nodes/0/properties index 0bf2078..c524b72 100644 --- a/tests/amdgpu/topology/nodes/0/properties +++ b/tests/amdgpu/topology/nodes/0/properties @@ -7,4 +7,6 @@ mem_banks_count 1 caches_count 0 io_links_count 1 cpu_core_id_base 0 -simd_id_base 0 \ No newline at end of file +simd_id_base 0 +location_id 1280 +domain 0 \ No newline at end of file diff --git a/tests/amdgpu/topology/nodes/1/properties b/tests/amdgpu/topology/nodes/1/properties index 2274403..bfa7b5d 100644 --- a/tests/amdgpu/topology/nodes/1/properties +++ b/tests/amdgpu/topology/nodes/1/properties @@ -7,4 +7,6 @@ mem_banks_count 1 caches_count 0 io_links_count 1 cpu_core_id_base 0 -simd_id_base 0 \ No newline at end of file +simd_id_base 0 +location_id 10240 +domain 0 \ No newline at end of file diff --git a/tests/amdgpu/topology/nodes/2/properties b/tests/amdgpu/topology/nodes/2/properties index 57b29f8..f44c7d6 100644 --- a/tests/amdgpu/topology/nodes/2/properties +++ b/tests/amdgpu/topology/nodes/2/properties @@ -7,4 +7,6 @@ mem_banks_count 1 caches_count 0 io_links_count 1 cpu_core_id_base 0 -simd_id_base 0 \ No newline at end of file +simd_id_base 0 +location_id 18432 +domain 0 \ No newline at end of file From f96c4a5f08faa545372b8261d6662626c660084d Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Mon, 27 Apr 2026 06:35:45 +0000 Subject: [PATCH 3/4] address review comments --- internal/amdgpu/amdgpu.go | 16 ++++++++-------- internal/amdgpu/amdgpu_test.go | 24 ++++++++++++++---------- tests/amdgpu/topology/nodes/0/properties | 2 +- tests/amdgpu/topology/nodes/1/properties | 6 +++--- tests/amdgpu/topology/nodes/2/properties | 4 ++-- 5 
files changed, 28 insertions(+), 24 deletions(-) diff --git a/internal/amdgpu/amdgpu.go b/internal/amdgpu/amdgpu.go index d3b67ec..de627b2 100644 --- a/internal/amdgpu/amdgpu.go +++ b/internal/amdgpu/amdgpu.go @@ -100,8 +100,8 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { renderDevIds := GetDevIdsFromTopology(fs) - // Map to store devices by unique_id to maintain grouping - uniqueDevIdDevices := make(map[string][]DeviceInfo) + // Map to store devices by parent dev ID to maintain grouping + devIdToDevices := make(map[string][]DeviceInfo) var uniqueIds []string // To maintain order // Process PCI devices @@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { if len(drmDevs) > 0 && renderMinor > 0 { if devID, exists := renderDevIds[renderMinor]; exists { - if _, exists := uniqueDevIdDevices[devID]; !exists { + if _, exists := devIdToDevices[devID]; !exists { uniqueIds = append(uniqueIds, devID) } - uniqueDevIdDevices[devID] = append(uniqueDevIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType}) + devIdToDevices[devID] = append(devIdToDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType}) } } } - // Sort devices within each unique_id group by render minor number + // Sort devices within each parent dev ID group by render minor number for _, devID := range uniqueIds { - sort.Slice(uniqueDevIdDevices[devID], func(i, j int) bool { + sort.Slice(devIdToDevices[devID], func(i, j int) bool { getRenderID := func(devInfo DeviceInfo) int { devs := devInfo.DrmDevices for _, dev := range devs { @@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) { } return 0 } - return getRenderID(uniqueDevIdDevices[devID][i]) < getRenderID(uniqueDevIdDevices[devID][j]) + return getRenderID(devIdToDevices[devID][i]) < getRenderID(devIdToDevices[devID][j]) }) } // Combine all devices maintaining the unique_id order var devs []DeviceInfo for _, devID := range 
uniqueIds { - devs = append(devs, uniqueDevIdDevices[devID]...) + devs = append(devs, devIdToDevices[devID]...) } return devs, nil diff --git a/internal/amdgpu/amdgpu_test.go b/internal/amdgpu/amdgpu_test.go index 83577fe..2b57988 100644 --- a/internal/amdgpu/amdgpu_test.go +++ b/internal/amdgpu/amdgpu_test.go @@ -535,7 +535,7 @@ func TestGetDevIdsFromTopology(t *testing.T) { testCase: "gpu_with_partition", expectedResult: map[int]string{ 128: "0000:05:00:0", - 129: "0000:28:00:0", + 129: "0000:05:00:0", }, }, { @@ -551,7 +551,7 @@ func TestGetDevIdsFromTopology(t *testing.T) { testCase: "unordered_partitions", expectedResult: map[int]string{ 128: "0000:05:00:0", - 129: "0000:28:00:0", + 129: "0000:05:00:0", 130: "0000:48:00:0", }, }, @@ -589,8 +589,10 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) { name: "GPU with partition UUID mapping", testCase: "gpu_with_partition", expectedResult: map[string][]int{ - "0x1": {0, 1}, - "1": {0, 1}, + "0x1": {0}, + "0x2": {1}, + "1": {0}, + "2": {1}, }, expectedError: nil, }, @@ -599,9 +601,9 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) { testCase: "multiple_gpus", expectedResult: map[string][]int{ "0x1": {0}, + "0x3": {1}, "1": {0}, - "0x2": {1}, - "2": {1}, + "3": {1}, }, expectedError: nil, }, @@ -609,10 +611,12 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) { name: "unordered partitions UUID mapping", testCase: "unordered_partitions", expectedResult: map[string][]int{ - "0x1": {0, 1}, - "1": {0, 1}, - "0x2": {2}, - "2": {2}, + "0x1": {0}, + "0x2": {1}, + "0x3": {2}, + "1": {0}, + "2": {1}, + "3": {2}, }, expectedError: nil, }, diff --git a/tests/amdgpu/topology/nodes/0/properties b/tests/amdgpu/topology/nodes/0/properties index c524b72..6680fb5 100644 --- a/tests/amdgpu/topology/nodes/0/properties +++ b/tests/amdgpu/topology/nodes/0/properties @@ -9,4 +9,4 @@ io_links_count 1 cpu_core_id_base 0 simd_id_base 0 location_id 1280 -domain 0 \ No newline at end of file +domain 0 diff --git 
a/tests/amdgpu/topology/nodes/1/properties b/tests/amdgpu/topology/nodes/1/properties index bfa7b5d..c45b736 100644 --- a/tests/amdgpu/topology/nodes/1/properties +++ b/tests/amdgpu/topology/nodes/1/properties @@ -1,5 +1,5 @@ drm_render_minor 129 -unique_id 1 +unique_id 2 gfx_target_version 90402 cpu_cores_count 20 simd_count 0 @@ -8,5 +8,5 @@ caches_count 0 io_links_count 1 cpu_core_id_base 0 simd_id_base 0 -location_id 10240 -domain 0 \ No newline at end of file +location_id 1281 +domain 0 diff --git a/tests/amdgpu/topology/nodes/2/properties b/tests/amdgpu/topology/nodes/2/properties index f44c7d6..f49e0c2 100644 --- a/tests/amdgpu/topology/nodes/2/properties +++ b/tests/amdgpu/topology/nodes/2/properties @@ -1,5 +1,5 @@ drm_render_minor 130 -unique_id 2 +unique_id 3 gfx_target_version 90402 cpu_cores_count 20 simd_count 0 @@ -9,4 +9,4 @@ io_links_count 1 cpu_core_id_base 0 simd_id_base 0 location_id 18432 -domain 0 \ No newline at end of file +domain 0 From 042f7ffee0e4f148708a1a19ec58ee1ad0f39a48 Mon Sep 17 00:00:00 2001 From: Shiv Tyagi Date: Mon, 27 Apr 2026 02:44:08 +0000 Subject: [PATCH 4/4] Add release notes for v1.3.0 Made-with: Cursor --- docs/container-runtime/release-notes.rst | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/docs/container-runtime/release-notes.rst b/docs/container-runtime/release-notes.rst index 4c5a5b7..97b3178 100644 --- a/docs/container-runtime/release-notes.rst +++ b/docs/container-runtime/release-notes.rst @@ -12,6 +12,9 @@ Compatibility Matrix * - AMD Container Toolkit - Docker Version - Supported OS + * - 1.3.0 + - 25.0+ + - Ubuntu 22.04, Ubuntu 24.04 * - 1.2.0 - 25.0+ - Ubuntu 22.04, Ubuntu 24.04 @@ -31,6 +34,9 @@ Versioning Information * - Version - Release Date - Highlights + * - v1.3.0 + - May 2026 + - ``amd-ctk gpu list``, GPU partition grouping, CDI and runtime improvements * - v1.2.0 - November 2025 - GPU Tracker feature support, Docker Swarm Support @@ -41,6 +47,47 @@ Versioning Information - 
June 2025 - Initial Release with `amd-ctk`, CDI, Docker Integration, Ubuntu 22.04/24.04 Support +------------------- +v1.3.0 (May 2026) +------------------- + +Overview +-------- + +This release extends the command-line tool with GPU discovery and tightens CDI and runtime behavior. + +- **``amd-ctk gpu list``:** A dedicated command to display GPU information from the host. +- **CDI hardening and usability:** Human-readable CDI spec formatting, support for custom CDI spec file names, and the ability for non-root users to generate and validate CDI specifications where appropriate. +- **GPU partition grouping:** Partitions of the same physical GPU are now grouped by PCI device topology (``location_id`` and ``domain``) instead of ``unique_id``, improving reliability on multi-GPU and partitioned systems. +- **Runtime and GPU Tracker:** Rootful-Docker checks and more consistent behavior and messaging where GPU Tracker and the CLI interact. + +New Features +~~~~~~~~~~~~ + +- **``amd-ctk gpu list``** + + - Lists GPU details for AMD devices on the system, complementing existing ``amd-ctk cdi`` workflows. + +- **CDI and runtime** + + - CDI spec output is formatted for easier reading and review. + - Non-root users can run CDI spec generation and validation in supported configurations. + - Custom CDI spec file names (not limited to ``amd.json``) are supported when loading specs. + +Improvements +------------ + +- **GPU partition grouping:** GPU partitions are now grouped by parent PCI device address derived from topology ``location_id`` and ``domain`` fields, rather than by ``unique_id``. This produces correct grouping on systems where partitions of the same GPU have distinct ``unique_id`` values. +- **Container runtime in rootless / non-rootful Docker:** The runtime exits early when the Docker setup is not rootful, matching supported deployment models. +- **Docker and CDI:** Using ``--gpus`` together with CDI-backed AMD devices requires Docker **29.3.0** or newer. 
+ Upgrade Notes ------------- - Regenerate or validate CDI specifications after upgrading; manage specs explicitly with ``amd-ctk cdi``. - If you want to use ``--gpus`` with CDI-backed AMD GPUs, upgrade Docker to **29.3.0** or newer. - Rootless or non-rootful Docker configurations are not supported by the container runtime; ensure a rootful Docker engine is in use wherever the toolkit expects one. ------------------- v1.2.0 (November 2025) -------------------