diff --git a/README.md b/README.md index b5c619a..77ba192 100644 --- a/README.md +++ b/README.md @@ -182,50 +182,27 @@ The AMD Container Toolkit supports GPU selection using unique identifiers (UUIDs ## Getting GPU UUIDs -GPU UUIDs can be obtained using different tools: +GPU UUIDs can be obtained using the `amd-ctk gpu list` command: -### Using ROCm SMI ```bash -rocm-smi --showuniqueid +amd-ctk gpu list ``` This will display output similar to: ``` -GPU[0] : Unique ID: 0xef2c1799a1f3e2ed -GPU[1] : Unique ID: 0x1234567890abcdef +Found 2 AMD GPU devices +--------------------------------------------------------------------------- +GPU Id UUID DRM Devices +--------------------------------------------------------------------------- +0 0xEF2C1799A1F3E2ED /dev/dri/renderD128 +1 0x1234567890ABCDEF /dev/dri/renderD129 ``` -### Using AMD-SMI -The `amd-smi` tool can also be used to get the ASIC_SERIAL, which serves as the GPU UUID: +Use the `UUID` value (e.g., `0xEF2C1799A1F3E2ED`) as the GPU UUID in container configurations. -```bash -amd-smi static -aB -``` - -This will display output similar to: -``` -GPU: 0 - ASIC: - MARKET_NAME: AMD Instinct MI210 - VENDOR_ID: 0x1002 - VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] - SUBVENDOR_ID: 0x1002 - DEVICE_ID: 0x740f - SUBSYSTEM_ID: 0x0c34 - REV_ID: 0x02 - ASIC_SERIAL: 0xD1CC3F11CFDD5112 - OAM_ID: N/A - NUM_COMPUTE_UNITS: 104 - TARGET_GRAPHICS_VERSION: gfx90a - BOARD: - MODEL_NUMBER: 102-D67302-00 - PRODUCT_SERIAL: 692231000131 - FRU_ID: 113-HPED67302000B.009 - PRODUCT_NAME: Instinct MI210 - MANUFACTURER_NAME: AMD -``` +If GPU Tracker is enabled, `amd-ctk gpu-tracker status` also displays UUIDs alongside container allocation and accessibility information. -Use the `ASIC_SERIAL` value (e.g., `0xD1CC3F11CFDD5112`) as the GPU UUID in container configurations. +**Note:** The UUID used by the AMD Container Toolkit is sourced from the KFD topology (`/sys/class/kfd/kfd/topology/nodes/*/properties`). 
This may differ from the `ASIC_SERIAL` reported by `amd-smi` or the Unique ID reported by `rocm-smi`. Always use the UUID shown by `amd-ctk gpu list` for container configurations. ## Using UUIDs with Environment Variables
diff --git a/cmd/amd-ctk/gpu/gpu.go b/cmd/amd-ctk/gpu/gpu.go
new file mode 100644
index 0000000..262e60b
--- /dev/null
+++ b/cmd/amd-ctk/gpu/gpu.go
@@ -0,0 +1,36 @@
+/**
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package gpu
+
+import (
+	"github.com/ROCm/container-toolkit/cmd/amd-ctk/gpu/list"
+	"github.com/urfave/cli/v2"
+)
+
+// AddNewCommand returns the top-level "gpu" command of amd-ctk. The command
+// sets no action of its own; it only groups GPU subcommands, which at
+// present is the single "list" subcommand.
+func AddNewCommand() *cli.Command {
+	return &cli.Command{
+		Name:      "gpu",
+		Usage:     "GPU related commands",
+		UsageText: "amd-ctk gpu [command] [options]",
+		Subcommands: []*cli.Command{
+			list.AddNewCommand(),
+		},
+	}
+}
diff --git a/cmd/amd-ctk/gpu/list/list.go b/cmd/amd-ctk/gpu/list/list.go
new file mode 100644
index 0000000..beaa05a
--- /dev/null
+++ b/cmd/amd-ctk/gpu/list/list.go
@@ -0,0 +1,89 @@
+/**
+# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package list
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/ROCm/container-toolkit/internal/amdgpu"
+	"github.com/urfave/cli/v2"
+)
+
+// AddNewCommand returns the "list" subcommand of "amd-ctk gpu".
+func AddNewCommand() *cli.Command {
+	return &cli.Command{
+		Name:      "list",
+		Usage:     "List AMD GPUs with their UUIDs",
+		UsageText: "amd-ctk gpu list",
+		Action:    performAction,
+	}
+}
+
+// normalizeUUID upper-cases uuid and gives it exactly one leading "0x".
+func normalizeUUID(uuid string) string {
+	return "0x" + strings.TrimPrefix(strings.ToUpper(uuid), "0X")
+}
+
+// performAction prints one table row per AMD GPU: index, UUID and the DRM
+// device nodes other than /dev/dri/card*. Failing to enumerate GPUs is an
+// error; failing to resolve UUIDs only warns on stderr and prints "N/A".
+func performAction(c *cli.Context) error {
+	devs, err := amdgpu.GetAMDGPUs()
+	if err != nil {
+		return fmt.Errorf("failed to list AMD devices: %w", err)
+	}
+
+	uuidToGPUIdMap, err := amdgpu.GetUniqueIdToDeviceIndexMap()
+	if err != nil {
+		// Best effort: the device list is still useful without UUIDs.
+		fmt.Fprintf(os.Stderr, "warning: failed to get GPU UUIDs: %v\n", err)
+		uuidToGPUIdMap = nil
+	}
+
+	gpuIdToUUIDMap := make(map[int]string)
+	for uuid, gpuIds := range uuidToGPUIdMap {
+		for _, gpuId := range gpuIds {
+			gpuIdToUUIDMap[gpuId] = normalizeUUID(uuid)
+		}
+	}
+
+	suffix := "devices"
+	if len(devs) == 1 {
+		suffix = "device"
+	}
+	fmt.Printf("Found %v AMD GPU %s\n", len(devs), suffix)
+
+	fmt.Println(strings.Repeat("-", 75))
+	fmt.Printf("%-10s%-25s%-40s\n", "GPU Id", "UUID", "DRM Devices")
+	fmt.Println(strings.Repeat("-", 75))
+	for idx, dev := range devs {
+		uuid, ok := gpuIdToUUIDMap[idx]
+		if !ok {
+			uuid = "N/A"
+		}
+		var renderDevs []string
+		for _, dd := range dev.DrmDevices {
+			if !strings.HasPrefix(dd, "/dev/dri/card") {
+				renderDevs = append(renderDevs, dd)
+			}
+		}
+		fmt.Printf("%-10v%-25s%-40s\n", idx, uuid, strings.Join(renderDevs, ", "))
+	}
+	return nil
+}
diff --git a/cmd/amd-ctk/main.go b/cmd/amd-ctk/main.go index 2bad27f..8abbc16 100644 --- a/cmd/amd-ctk/main.go +++ b/cmd/amd-ctk/main.go @@ -22,6 +22,7 @@ import ( "os" "github.com/ROCm/container-toolkit/cmd/amd-ctk/cdi" + "github.com/ROCm/container-toolkit/cmd/amd-ctk/gpu" gpuTracker "github.com/ROCm/container-toolkit/cmd/amd-ctk/gpu-tracker" "github.com/ROCm/container-toolkit/cmd/amd-ctk/runtime" "github.com/urfave/cli/v2" @@ -86,6 +87,7 @@ func main() { showVersion(), runtime.AddNewCommand(), cdi.AddNewCommand(), + gpu.AddNewCommand(), gpuTracker.AddNewCommand(), }