diff --git a/pkg/agent/containerd/config_linux.go b/pkg/agent/containerd/config_linux.go
index 957af1226f..94b67b5d61 100644
--- a/pkg/agent/containerd/config_linux.go
+++ b/pkg/agent/containerd/config_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
 // +build linux
 
 package containerd
@@ -57,6 +58,7 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
 		DisableCgroup:         disableCgroup,
 		IsRunningInUserNS:     isRunningInUserNS,
 		PrivateRegistryConfig: privRegistries.Registry(),
+		ExtraRuntimes:         findNvidiaContainerRuntimes(os.DirFS(string(os.PathSeparator))),
 	}
 
 	selEnabled, selConfigured, err := selinuxStatus()
diff --git a/pkg/agent/containerd/nvidia.go b/pkg/agent/containerd/nvidia.go
new file mode 100644
index 0000000000..28b4a96857
--- /dev/null
+++ b/pkg/agent/containerd/nvidia.go
@@ -0,0 +1,73 @@
+//go:build linux
+// +build linux
+
+package containerd
+
+import (
+	"errors"
+	"io/fs"
+	"path/filepath"
+
+	"github.com/rancher/k3s/pkg/agent/templates"
+	"github.com/sirupsen/logrus"
+)
+
+// findNvidiaContainerRuntimes returns the nvidia container runtimes that are
+// available on the system, keyed by runtime name. It checks install locations
+// used by the nvidia gpu operator and by system package managers. The gpu
+// operator installation takes precedence over the system package manager
+// installation. Candidates that are directories, or that cannot be stat'ed,
+// are skipped.
+// The given fs.FS should represent the filesystem root directory to search in.
+func findNvidiaContainerRuntimes(root fs.FS) map[string]templates.ContainerdRuntimeConfig {
+	// Check these locations in order. The GPU operator's installation should
+	// take precedence over the package manager's installation.
+	locationsToCheck := []string{
+		"usr/local/nvidia/toolkit", // Path when installing via GPU Operator
+		"usr/bin",                  // Path when installing via package manager
+	}
+
+	// Fill in the binary location with just the name of the binary,
+	// and check against each of the possible locations. If a match is found,
+	// set the location to the full path.
+	potentialRuntimes := map[string]templates.ContainerdRuntimeConfig{
+		"nvidia": {
+			RuntimeType: "io.containerd.runc.v2",
+			BinaryName:  "nvidia-container-runtime",
+		},
+		"nvidia-experimental": {
+			RuntimeType: "io.containerd.runc.v2",
+			BinaryName:  "nvidia-container-runtime-experimental",
+		},
+	}
+	foundRuntimes := map[string]templates.ContainerdRuntimeConfig{}
+RUNTIME:
+	for runtimeName, runtimeConfig := range potentialRuntimes {
+		for _, location := range locationsToCheck {
+			binaryPath := filepath.Join(location, runtimeConfig.BinaryName)
+			logrus.Debugf("Searching for %s container runtime at /%s", runtimeName, binaryPath)
+			info, err := fs.Stat(root, binaryPath)
+			if err != nil {
+				if errors.Is(err, fs.ErrNotExist) {
+					logrus.Debugf("%s container runtime not found at /%s", runtimeName, binaryPath)
+				} else {
+					logrus.Errorf("Error searching for %s container runtime at /%s: %v", runtimeName, binaryPath, err)
+				}
+				// Nothing usable at this location; try the next one.
+				continue
+			}
+			if info.IsDir() {
+				logrus.Debugf("Found %s container runtime at /%s, but it is a directory. Skipping.", runtimeName, binaryPath)
+				continue
+			}
+			// runtimeConfig is a copy of the map value, so this does not
+			// modify potentialRuntimes.
+			runtimeConfig.BinaryName = filepath.Join("/", binaryPath)
+			logrus.Infof("Found %s container runtime at %s", runtimeName, runtimeConfig.BinaryName)
+			foundRuntimes[runtimeName] = runtimeConfig
+			// Skip to the next runtime to enforce precedence.
+			continue RUNTIME
+		}
+	}
+	return foundRuntimes
+}
diff --git a/pkg/agent/containerd/nvidia_test.go b/pkg/agent/containerd/nvidia_test.go
new file mode 100644
index 0000000000..8a9497e767
--- /dev/null
+++ b/pkg/agent/containerd/nvidia_test.go
@@ -0,0 +1,219 @@
+//go:build linux
+// +build linux
+
+package containerd
+
+import (
+	"io/fs"
+	"reflect"
+	"testing"
+	"testing/fstest"
+
+	"github.com/rancher/k3s/pkg/agent/templates"
+)
+
+func Test_UnitFindNvidiaContainerRuntimes(t *testing.T) {
+	executable := &fstest.MapFile{Mode: 0755}
+	type args struct {
+		root fs.FS
+	}
+	tests := []struct {
+		name string
+		args args
+		want map[string]templates.ContainerdRuntimeConfig
+	}{
+		{
+			name: "No runtimes",
+			args: args{
+				root: fstest.MapFS{},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{},
+		},
+		{
+			name: "Nvidia runtime in /usr/bin",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime",
+				},
+			},
+		},
+		{
+			name: "Nvidia runtime in /usr/local/nvidia/toolkit",
+			args: args{
+				root: fstest.MapFS{
+					"usr/local/nvidia/toolkit/nvidia-container-runtime": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime",
+				},
+			},
+		},
+		{
+			name: "Nvidia runtime in both directories",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime":                  executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime",
+				},
+			},
+		},
+		{
+			name: "Experimental runtime in /usr/bin",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime-experimental": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Same runtime in two directories",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime-experimental":                  executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime-experimental": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Both runtimes in /usr/bin",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime-experimental": executable,
+					"usr/bin/nvidia-container-runtime":              executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime",
+				},
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Both runtimes in both directories",
+			args: args{
+				root: fstest.MapFS{
+					"usr/local/nvidia/toolkit/nvidia-container-runtime":              executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime-experimental": executable,
+					"usr/bin/nvidia-container-runtime":                               executable,
+					"usr/bin/nvidia-container-runtime-experimental":                  executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime",
+				},
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Both runtimes in /usr/local/nvidia/toolkit",
+			args: args{
+				root: fstest.MapFS{
+					"usr/local/nvidia/toolkit/nvidia-container-runtime":              executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime-experimental": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime",
+				},
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Both runtimes in /usr/bin and one duplicate in /usr/local/nvidia/toolkit",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime":                               executable,
+					"usr/bin/nvidia-container-runtime-experimental":                  executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime-experimental": executable,
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime",
+				},
+				"nvidia-experimental": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/local/nvidia/toolkit/nvidia-container-runtime-experimental",
+				},
+			},
+		},
+		{
+			name: "Runtime is a directory",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime": &fstest.MapFile{
+						Mode: fs.ModeDir,
+					},
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{},
+		},
+		{
+			name: "Runtime in both directories, but one is a directory",
+			args: args{
+				root: fstest.MapFS{
+					"usr/bin/nvidia-container-runtime": executable,
+					"usr/local/nvidia/toolkit/nvidia-container-runtime": &fstest.MapFile{
+						Mode: fs.ModeDir,
+					},
+				},
+			},
+			want: map[string]templates.ContainerdRuntimeConfig{
+				"nvidia": {
+					RuntimeType: "io.containerd.runc.v2",
+					BinaryName:  "/usr/bin/nvidia-container-runtime",
+				},
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := findNvidiaContainerRuntimes(tt.args.root); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("findNvidiaContainerRuntimes() = %+v\nWant = %+v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/pkg/agent/templates/templates.go b/pkg/agent/templates/templates.go
index 06183eba22..21bdc2f9e1 100644
--- a/pkg/agent/templates/templates.go
+++ b/pkg/agent/templates/templates.go
@@ -6,9 +6,17 @@ import (
 	"github.com/rancher/k3s/pkg/daemons/config"
 )
 
+// ContainerdRuntimeConfig describes an additional containerd runtime entry
+// rendered into the containerd config template.
+type ContainerdRuntimeConfig struct {
+	RuntimeType string
+	BinaryName  string
+}
+
 type ContainerdConfig struct {
 	NodeConfig            *config.Node
 	DisableCgroup         bool
 	IsRunningInUserNS     bool
 	PrivateRegistryConfig *registries.Registry
+	ExtraRuntimes         map[string]ContainerdRuntimeConfig
 }
diff --git a/pkg/agent/templates/templates_linux.go b/pkg/agent/templates/templates_linux.go
index 72b93d78e7..cb920beaa8 100644
--- a/pkg/agent/templates/templates_linux.go
+++ b/pkg/agent/templates/templates_linux.go
@@ -112,6 +112,13 @@ enable_keychain = true
 {{end}}
 {{end}}
 {{end}}
+
+{{range $k, $v := .ExtraRuntimes}}
+[plugins.cri.containerd.runtimes."{{$k}}"]
+  runtime_type = "{{$v.RuntimeType}}"
+[plugins.cri.containerd.runtimes."{{$k}}".options]
+  BinaryName = "{{$v.BinaryName}}"
+{{end}}
 `
 
 func ParseTemplateFromConfig(templateBuffer string, config interface{}) (string, error) {