diff --git a/internal/modules/system/gpu.go b/internal/modules/system/gpu.go
new file mode 100644
index 0000000..79aaa0d
--- /dev/null
+++ b/internal/modules/system/gpu.go
@@ -0,0 +1,232 @@
+package system
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"nadir/internal/oscmd"
+)
+
+// GPUInfo describes one graphics processor found under the DRM sysfs class.
+// Memory and utilization fields are only populated for the amdgpu and nvidia
+// drivers; for every other driver they stay 0.
+type GPUInfo struct {
+	Model             string  `json:"model" example:"AMD Radeon RX 7900 XTX" doc:"GPU model name, or vendor:device hex ID if lspci unavailable"`
+	Vendor            string  `json:"vendor" example:"1002" doc:"PCI vendor ID (hex)"`
+	DeviceID          string  `json:"device_id" example:"744c" doc:"PCI device ID (hex)"`
+	Driver            string  `json:"driver" example:"amdgpu" doc:"Kernel driver in use"`
+	MemoryTotalBytes  uint64  `json:"memory_total_bytes" example:"8589934592" doc:"Total VRAM in bytes (driver-dependent; 0 if unavailable)"`
+	MemoryUsedBytes   uint64  `json:"memory_used_bytes" example:"27267072" doc:"Used VRAM in bytes (driver-dependent; 0 if unavailable)"`
+	UtilizationPct    float64 `json:"utilization_pct" example:"23.5" doc:"GPU compute utilization percentage (driver-dependent; 0 if unavailable)"`
+	MemUtilizationPct float64 `json:"mem_utilization_pct" example:"15.0" doc:"GPU memory controller utilization percentage (driver-dependent; 0 if unavailable)"`
+}
+
+// gpuInfo lists the GPUs found under /sys/class/drm.
+func gpuInfo() []GPUInfo {
+	return readGPUsFromSysfs("/sys/class/drm")
+}
+
+// readGPUsFromSysfs scans drmRoot for cardN entries and returns one GPUInfo per
+// distinct parent device. It returns nil when drmRoot cannot be read, and skips
+// any card whose parent device cannot be determined.
+func readGPUsFromSysfs(drmRoot string) []GPUInfo {
+	entries, err := os.ReadDir(drmRoot)
+	if err != nil {
+		return nil
+	}
+
+	seen := map[string]bool{}
+	var gpus []GPUInfo
+
+	for _, e := range entries {
+		if !isGPUCard(e.Name()) {
+			continue
+		}
+
+		pciAddr, vendor, device, driver := readGPUFromCard(drmRoot, e.Name())
+		if pciAddr == "" || seen[pciAddr] {
+			continue
+		}
+		seen[pciAddr] = true
+
+		// Fall back to the raw vendor:device pair when lspci yields no name.
+		model := vendor + ":" + device
+		if m := lookupGPUName(pciAddr); m != "" {
+			model = m
+		}
+
+		gpu := GPUInfo{
+			Model:    model,
+			Vendor:   vendor,
+			DeviceID: device,
+			Driver:   driver,
+		}
+
+		devPath := filepath.Join(drmRoot, e.Name(), "device")
+		enrichGPUInfo(&gpu, devPath, pciAddr, driver)
+		gpus = append(gpus, gpu)
+	}
+
+	return gpus
+}
+
+// enrichGPUInfo fills the memory and utilization fields of gpu using the
+// driver-specific source; drivers other than amdgpu and nvidia are left as is.
+func enrichGPUInfo(gpu *GPUInfo, devPath, pciAddr, driver string) {
+	switch driver {
+	case "amdgpu":
+		enrichAMDGPU(gpu, devPath)
+	case "nvidia":
+		enrichNvidiaGPU(gpu, pciAddr)
+	}
+}
+
+// enrichAMDGPU reads VRAM and busy-percent counters from the amdgpu sysfs files
+// under devPath. Values that fail to parse leave the corresponding field at 0.
+func enrichAMDGPU(gpu *GPUInfo, devPath string) {
+	if total := readUint64FromFile(filepath.Join(devPath, "mem_info_vram_total")); total > 0 {
+		gpu.MemoryTotalBytes = total
+		gpu.MemoryUsedBytes = readUint64FromFile(filepath.Join(devPath, "mem_info_vram_used"))
+	}
+	if pct := readIntFromFile(filepath.Join(devPath, "gpu_busy_percent")); pct >= 0 {
+		gpu.UtilizationPct = float64(pct)
+	}
+	if pct := readIntFromFile(filepath.Join(devPath, "mem_busy_percent")); pct >= 0 {
+		gpu.MemUtilizationPct = float64(pct)
+	}
+}
+
+// enrichNvidiaGPU queries nvidia-smi for the GPU at pciAddr and fills the memory
+// and utilization fields. Any command or parse failure leaves the affected
+// fields at 0.
+func enrichNvidiaGPU(gpu *GPUInfo, pciAddr string) {
+	out, err := oscmd.Run("nvidia-smi",
+		"-i", pciAddr,
+		"--query-gpu=memory.total,memory.used,utilization.gpu,utilization.memory",
+		"--format=csv,noheader,nounits",
+	)
+	if err != nil {
+		return
+	}
+	// Parse only the first line and trim each field, so extra output lines or a
+	// missing space after a comma do not shift or corrupt the columns.
+	line, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
+	parts := strings.Split(line, ",")
+	if len(parts) < 4 {
+		return
+	}
+	for i := range parts {
+		parts[i] = strings.TrimSpace(parts[i])
+	}
+	// The multiplication below treats the reported memory values as MiB.
+	if total, err := strconv.ParseUint(parts[0], 10, 64); err == nil && total > 0 {
+		gpu.MemoryTotalBytes = total * 1024 * 1024
+		if used, err := strconv.ParseUint(parts[1], 10, 64); err == nil {
+			gpu.MemoryUsedBytes = used * 1024 * 1024
+		}
+	}
+	if pct, err := strconv.ParseFloat(parts[2], 64); err == nil {
+		gpu.UtilizationPct = pct
+	}
+	if pct, err := strconv.ParseFloat(parts[3], 64); err == nil {
+		gpu.MemUtilizationPct = pct
+	}
+}
+
+// isGPUCard reports whether name is a DRM card node: "card" followed only by
+// digits (card0, card12). Names with any other suffix, such as card0-HDMI-1,
+// and unrelated names such as renderD128 are rejected.
+func isGPUCard(name string) bool {
+	if !strings.HasPrefix(name, "card") {
+		return false
+	}
+	index := name[len("card"):]
+	if index == "" {
+		return false
+	}
+	for i := 0; i < len(index); i++ {
+		if index[i] < '0' || index[i] > '9' {
+			return false
+		}
+	}
+	return true
+}
+
+// readGPUFromCard resolves the card symlink drmRoot/name and returns the name
+// of the directory that contains the card's "drm" directory (pciAddr) together
+// with the vendor ID, device ID and driver read from the card's device
+// directory. All results are empty when the link cannot be resolved or the
+// resolved path has no "drm" component.
+func readGPUFromCard(drmRoot, name string) (pciAddr, vendor, device, driver string) {
+	cardPath := filepath.Join(drmRoot, name)
+	devPath := filepath.Join(cardPath, "device")
+
+	resolved, err := filepath.EvalSymlinks(cardPath)
+	if err != nil {
+		return "", "", "", ""
+	}
+
+	// Search from the end: the "drm" directory holding the card is the last one
+	// in the path, and an ancestor that happens to be named "drm" must not win.
+	parts := strings.Split(resolved, "/")
+	for i := len(parts) - 1; i > 0; i-- {
+		if parts[i] == "drm" {
+			pciAddr = parts[i-1]
+			break
+		}
+	}
+	if pciAddr == "" {
+		return "", "", "", ""
+	}
+
+	vendor = strings.TrimPrefix(readTrim(filepath.Join(devPath, "vendor")), "0x")
+	device = strings.TrimPrefix(readTrim(filepath.Join(devPath, "device")), "0x")
+	driver = readDriver(devPath)
+	return pciAddr, vendor, device, driver
+}
+
+// readDriver returns the base name of the target of the devPath/driver symlink,
+// or "" when the link cannot be read.
+func readDriver(devPath string) string {
+	target, err := os.Readlink(filepath.Join(devPath, "driver"))
+	if err != nil {
+		return ""
+	}
+	return filepath.Base(target)
+}
+
+// lookupGPUName runs "lspci -nns pciAddr" and returns the output with the
+// leading slot field removed, or "" when lspci fails or prints no space.
+func lookupGPUName(pciAddr string) string {
+	out, err := oscmd.Run("lspci", "-nns", pciAddr)
+	if err != nil {
+		return ""
+	}
+	_, rest, ok := strings.Cut(out, " ")
+	if !ok {
+		return ""
+	}
+	return strings.TrimSpace(rest)
+}
+
+// readUint64FromFile parses readTrim(path) as a base-10 uint64, returning 0
+// when parsing fails.
+func readUint64FromFile(path string) uint64 {
+	v, err := strconv.ParseUint(readTrim(path), 10, 64)
+	if err != nil {
+		return 0
+	}
+	return v
+}
+
+// readIntFromFile parses readTrim(path) as a base-10 int, returning -1 when
+// parsing fails.
+func readIntFromFile(path string) int {
+	v, err := strconv.Atoi(readTrim(path))
+	if err != nil {
+		return -1
+	}
+	return v
+}
diff --git a/internal/modules/system/info.go b/internal/modules/system/info.go
index 0812918..f28cee7 100644
--- a/internal/modules/system/info.go
+++ b/internal/modules/system/info.go
@@ -11,7 +11,7 @@ import (
 )
 
 // SystemInfoBody is the dashboard overview: OS identity plus live CPU, memory,
-// disk, load, network, and temperature readings. Every section is best-effort —
+// disk, load, network, GPU, and temperature readings. Every section is best-effort —
 // a source that's unavailable (e.g. no thermal zones in a VM) yields a zero
 // value or empty list rather than failing the whole call.
 type SystemInfoBody struct {
@@ -24,6 +24,7 @@ type SystemInfoBody struct {
 	Disks             []DiskInfo     `json:"disks" doc:"Mounted block-device filesystems"`
 	NetworkInterfaces []NetInterface `json:"network_interfaces" doc:"Network interfaces and their addresses"`
 	Temperatures      []Temperature  `json:"temperatures" doc:"Thermal sensor readings in Celsius"`
+	GPUs              []GPUInfo      `json:"gpus" doc:"Graphics processors detected via DRM sysfs"`
 }
 
 type GetInfoOutput struct{ Body SystemInfoBody }
@@ -36,9 +37,9 @@ func registerInfo(api huma.API, sampler *Sampler) {
 		Summary:     "Get system information",
 		Description: "Returns an overview for a dashboard: OS/kernel identity, CPU, " +
 			"memory and swap, mounted disks, load averages, uptime, network " +
-			"interfaces, and temperatures. All values come from cheap local reads " +
-			"(/proc, /sys, syscalls) with no D-Bus dependency; each section is " +
-			"best-effort.",
+			"interfaces, temperatures, and GPU information. Values come from cheap " +
+			"local reads (/proc, /sys, syscalls) with no D-Bus dependency, plus " +
+			"lspci/nvidia-smi calls for GPU details; each section is best-effort.",
 		Tags:     []string{tagSystem},
 		Metadata: op("read"),
 		Errors:   readErrors,
@@ -54,6 +55,7 @@ func registerInfo(api huma.API, sampler *Sampler) {
 			Disks:             diskInfo(),
 			NetworkInterfaces: netInfo(),
 			Temperatures:      tempInfo(),
+			GPUs:              gpuInfo(),
 		}}, nil
 	})
 }
diff --git a/internal/modules/system/info_test.go b/internal/modules/system/info_test.go
index 2fae7df..3b11ec6 100644
--- a/internal/modules/system/info_test.go
+++ b/internal/modules/system/info_test.go
@@ -6,6 +6,122 @@ import (
 	"testing"
 )
 
+func TestReadGPUsFromSysfs(t *testing.T) {
+	root := t.TempDir()
+
+	// card0 — AMD GPU with VRAM and utilization files
+	pciDev := filepath.Join(root, "devices/pci0000:00/0000:00:02.0")
+	mkdirAll(t, pciDev)
+	write(t, pciDev, ".", "vendor", "0x1002")
+	write(t, pciDev, ".", "device", "0x7480")
+	write(t, pciDev, ".", "mem_info_vram_total", "8573157376")
+	write(t, pciDev, ".", "mem_info_vram_used", "27267072")
+	write(t, pciDev, ".", "gpu_busy_percent", "23")
+	write(t, pciDev, ".", "mem_busy_percent", "15")
+
+	driverDir := filepath.Join(root, "bus/pci/drivers/amdgpu")
+	mkdirAll(t, driverDir)
+	mustSymlink(t, driverDir, filepath.Join(pciDev, "driver"))
+
+	cardTarget := filepath.Join(pciDev, "drm", "card0")
+	mkdirAll(t, cardTarget)
+	mustSymlink(t, pciDev, filepath.Join(cardTarget, "device"))
+	mustSymlink(t, cardTarget, filepath.Join(root, "card0"))
+
+	// Distractors
+	write(t, root, ".", "card0-HDMI-1", "distract")
+	write(t, root, ".", "renderD128", "distract")
+
+	gpus := readGPUsFromSysfs(root)
+	if len(gpus) != 1 {
+		t.Fatalf("want 1 GPU, got %d: %+v", len(gpus), gpus)
+	}
+	if gpus[0].Vendor != "1002" {
+		t.Errorf("vendor = %q, want 1002", gpus[0].Vendor)
+	}
+	if gpus[0].DeviceID != "7480" {
+		t.Errorf("device_id = %q, want 7480", gpus[0].DeviceID)
+	}
+	if gpus[0].Driver != "amdgpu" {
+		t.Errorf("driver = %q, want amdgpu", gpus[0].Driver)
+	}
+	if gpus[0].MemoryTotalBytes != 8573157376 {
+		t.Errorf("MemoryTotalBytes = %d, want 8573157376", gpus[0].MemoryTotalBytes)
+	}
+	if gpus[0].MemoryUsedBytes != 27267072 {
+		t.Errorf("MemoryUsedBytes = %d, want 27267072", gpus[0].MemoryUsedBytes)
+	}
+	if gpus[0].UtilizationPct != 23.0 {
+		t.Errorf("UtilizationPct = %f, want 23.0", gpus[0].UtilizationPct)
+	}
+	if gpus[0].MemUtilizationPct != 15.0 {
+		t.Errorf("MemUtilizationPct = %f, want 15.0", gpus[0].MemUtilizationPct)
+	}
+}
+
+func TestReadGPUsFromSysfsNoEnrichment(t *testing.T) {
+	root := t.TempDir()
+
+	// i915 GPU with no VRAM or utilization files
+	pciDev := filepath.Join(root, "devices/pci0000:00/0000:00:02.0")
+	mkdirAll(t, pciDev)
+	write(t, pciDev, ".", "vendor", "0x8086")
+	write(t, pciDev, ".", "device", "0x46a6")
+
+	driverDir := filepath.Join(root, "bus/pci/drivers/i915")
+	mkdirAll(t, driverDir)
+	mustSymlink(t, driverDir, filepath.Join(pciDev, "driver"))
+
+	cardTarget := filepath.Join(pciDev, "drm", "card0")
+	mkdirAll(t, cardTarget)
+	mustSymlink(t, pciDev, filepath.Join(cardTarget, "device"))
+	mustSymlink(t, cardTarget, filepath.Join(root, "card0"))
+
+	gpus := readGPUsFromSysfs(root)
+	if len(gpus) != 1 {
+		t.Fatalf("want 1 GPU, got %d", len(gpus))
+	}
+	if gpus[0].MemoryTotalBytes != 0 || gpus[0].MemoryUsedBytes != 0 {
+		t.Errorf("expected 0 VRAM for i915, got total=%d used=%d", gpus[0].MemoryTotalBytes, gpus[0].MemoryUsedBytes)
+	}
+	if gpus[0].UtilizationPct != 0 || gpus[0].MemUtilizationPct != 0 {
+		t.Errorf("expected 0 utilization for i915, got gpu=%f mem=%f", gpus[0].UtilizationPct, gpus[0].MemUtilizationPct)
+	}
+}
+
+func TestReadGPUsFromSysfsMissingDir(t *testing.T) {
+	gpus := readGPUsFromSysfs("/nonexistent/drm")
+	if gpus != nil {
+		t.Errorf("expected nil, got %+v", gpus)
+	}
+}
+
+func TestReadGPUsFromSysfsSkipsNonGPU(t *testing.T) {
+	root := t.TempDir()
+	write(t, root, ".", "renderD128", "x")
+	write(t, root, ".", "card0-HDMI-1", "x")
+	gpus := readGPUsFromSysfs(root)
+	if len(gpus) != 0 {
+		t.Errorf("expected 0 GPUs, got %d", len(gpus))
+	}
+}
+
+func TestIsGPUCard(t *testing.T) {
+	cases := map[string]bool{
+		"card0":          true,
+		"card12":         true,
+		"card":           false,
+		"card0-HDMI-A-1": false,
+		"renderD128":     false,
+		"":               false,
+	}
+	for name, want := range cases {
+		if got := isGPUCard(name); got != want {
+			t.Errorf("isGPUCard(%q) = %v, want %v", name, got, want)
+		}
+	}
+}
+
 func TestReadHwmonTemps(t *testing.T) {
 	root := t.TempDir()
 	// k10temp: CPU, labelled Tctl.
@@ -33,6 +149,20 @@ func TestReadHwmonTemps(t *testing.T) {
 	}
 }
 
+func mkdirAll(t *testing.T, path string) {
+	t.Helper()
+	if err := os.MkdirAll(path, 0o755); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func mustSymlink(t *testing.T, target, link string) {
+	t.Helper()
+	if err := os.Symlink(target, link); err != nil {
+		t.Fatal(err)
+	}
+}
+
 func write(t *testing.T, root, chip, file, val string) {
 	t.Helper()
 	dir := filepath.Join(root, chip)
diff --git a/internal/modules/system/system_handler_test.go b/internal/modules/system/system_handler_test.go
index 01290ef..067936a 100644
--- a/internal/modules/system/system_handler_test.go
+++ b/internal/modules/system/system_handler_test.go
@@ -34,6 +34,14 @@ func TestSystemHandlers(t *testing.T) {
 		}
 		return oscmd.MockCommand{ExitCode: 1}
 	})
+	// Mock lspci to prevent real calls in case the test host has GPUs.
+	oscmd.SetMock("lspci", func(args []string) oscmd.MockCommand {
+		return oscmd.MockCommand{ExitCode: 1}
+	})
+	// Mock nvidia-smi: return failure so enrichment is a no-op.
+	oscmd.SetMock("nvidia-smi", func(args []string) oscmd.MockCommand {
+		return oscmd.MockCommand{ExitCode: 1}
+	})
 	defer oscmd.ClearMocks()
 
 	// 1. Test GET /api/system/info