rootless: enable resource limitation (requires cgroup v2, systemd)

Now rootless mode can be used with cgroup v2 resource limitations.
A pod is executed in a cgroup like "/user.slice/user-1001.slice/user@1001.service/k3s-rootless.service/kubepods/podd0eb6921-c81a-4214-b36c-d3b9bb212fac/63b5a253a1fd4627da16bfce9bec58d72144cf30fe833e0ca9a6d60ebf837475".

This is accomplished by running `kubelet` in a cgroup namespace, and enabling `cgroupfs` driver for the cgroup hierarchy delegated by systemd.

To enable cgroup v2 resource limitation, `k3s server --rootless` needs to be launched as `systemctl --user` service.
Please see the comment lines in `k3s-rootless.service` for the usage.

Running `k3s server --rootless` via a terminal is not supported.
When it really needs to be launched via a terminal, `systemd-run --user -p Delegate --tty` needs to be prepended to create a systemd scope.

Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
This commit is contained in:
Akihiro Suda 2021-03-02 17:57:40 +09:00 committed by Brad Davidson
parent 11ef43011a
commit 6e8284e3d4
5 changed files with 117 additions and 14 deletions

45
k3s-rootless.service Normal file
View File

@ -0,0 +1,45 @@
# systemd unit file for k3s (rootless)
#
# Usage:
# - [Optional] Enable cgroup v2 delegation, see https://rootlesscontaine.rs/getting-started/common/cgroup2/ .
# This step is optional, but highly recommended for enabling CPU and memory resource limtitation.
#
# - Copy this file as `~/.config/systemd/user/k3s-rootless.service`.
# Installing this file as a system-wide service (`/etc/systemd/...`) is not supported.
# Depending on the path of `k3s` binary, you might need to modify the `ExecStart=/usr/local/bin/k3s ...` line of this file.
#
# - Run `systemctl --user daemon-reload`
#
# - Run `systemctl --user enable --now k3s-rootless`
#
# - Run `KUBECONFIG=~/.kube/k3s.yaml kubectl get pods -A`, and make sure the pods are running.
#
# Troubleshooting:
# - See `systemctl --user status k3s-rootless` to check the daemon status
# - See `journalctl --user -f -u k3s-rootless` to see the daemon log
# - See also https://rootlesscontaine.rs/
[Unit]
Description=k3s (Rootless)
[Service]
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# NOTE: Don't try to run `k3s server --rootless` on a terminal, as it doesn't enable cgroup v2 delegation.
# If you really need to try it on a terminal, prepend `systemd-run --user -p Delegate=yes --tty` to create a systemd scope.
ExecStart=/usr/local/bin/k3s server --rootless
ExecReload=/bin/kill -s HUP $MAINPID
TimeoutSec=0
RestartSec=2
Restart=always
StartLimitBurst=3
StartLimitInterval=60s
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
Delegate=yes
Type=simple
KillMode=mixed
[Install]
WantedBy=default.target

View File

@ -26,11 +26,13 @@ import (
"github.com/pkg/errors"
"github.com/rancher/k3s/pkg/agent/templates"
util2 "github.com/rancher/k3s/pkg/agent/util"
"github.com/rancher/k3s/pkg/daemons/agent"
"github.com/rancher/k3s/pkg/daemons/config"
"github.com/rancher/k3s/pkg/untar"
"github.com/rancher/k3s/pkg/version"
"github.com/rancher/wrangler/pkg/merr"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/grpc"
yaml "gopkg.in/yaml.v2"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
@ -336,10 +338,21 @@ func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
if err != nil {
return err
}
isRunningInUserNS := system.RunningInUserNS()
_, _, hasCFS, hasPIDs := agent.CheckCgroups()
// "/sys/fs/cgroup" is namespaced
cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable)
if disableCgroup {
logrus.Warn("cgroup v2 controllers are not delegated for rootless. Disabling cgroup.")
}
var containerdTemplate string
containerdConfig := templates.ContainerdConfig{
NodeConfig: cfg,
IsRunningInUserNS: system.RunningInUserNS(),
DisableCgroup: disableCgroup,
IsRunningInUserNS: isRunningInUserNS,
PrivateRegistryConfig: privRegistries,
}

View File

@ -9,6 +9,7 @@ import (
type ContainerdConfig struct {
NodeConfig *config.Node
DisableCgroup bool
IsRunningInUserNS bool
PrivateRegistryConfig *Registry
}
@ -22,8 +23,10 @@ const ContainerdConfigTemplate = `
stream_server_port = "10010"
enable_selinux = {{ .NodeConfig.SELinux }}
{{- if .IsRunningInUserNS }}
{{- if .DisableCgroup}}
disable_cgroup = true
{{end}}
{{- if .IsRunningInUserNS }}
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}

View File

@ -15,6 +15,7 @@ import (
"github.com/rancher/k3s/pkg/daemons/executor"
"github.com/rancher/k3s/pkg/version"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"k8s.io/apimachinery/pkg/util/net"
"k8s.io/component-base/logs"
"k8s.io/kubernetes/pkg/kubeapiserver/authorizer/modes"
@ -128,7 +129,7 @@ func startKubelet(cfg *config.Agent) error {
if err != nil || defaultIP.String() != cfg.NodeIP {
argsMap["node-ip"] = cfg.NodeIP
}
kubeletRoot, runtimeRoot, hasCFS, hasPIDs := checkCgroups()
kubeletRoot, runtimeRoot, hasCFS, hasPIDs := CheckCgroups()
if !hasCFS {
logrus.Warn("Disabling CPU quotas due to missing cpu.cfs_period_us")
argsMap["cpu-cfs-quota"] = "false"
@ -158,11 +159,20 @@ func startKubelet(cfg *config.Agent) error {
}
if cfg.Rootless {
// flags are from https://github.com/rootless-containers/usernetes/blob/v20190826.0/boot/kubelet.sh
argsMap["cgroup-driver"] = "none"
argsMap["feature-gates=SupportNoneCgroupDriver"] = "true"
argsMap["cgroups-per-qos"] = "false"
argsMap["enforce-node-allocatable"] = ""
// "/sys/fs/cgroup" is namespaced
cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
if hasCFS && hasPIDs && cgroupfsWritable {
logrus.Info("cgroup v2 controllers are delegated for rootless.")
// cgroupfs v2, delegated for rootless by systemd
argsMap["cgroup-driver"] = "cgroupfs"
} else {
logrus.Warn("cgroup v2 controllers are not delegated for rootless. Setting cgroup driver to \"none\".")
// flags are from https://github.com/rootless-containers/usernetes/blob/v20190826.0/boot/kubelet.sh
argsMap["cgroup-driver"] = "none"
argsMap["feature-gates=SupportNoneCgroupDriver"] = "true"
argsMap["cgroups-per-qos"] = "false"
argsMap["enforce-node-allocatable"] = ""
}
}
if cfg.ProtectKernelDefaults {
@ -182,7 +192,7 @@ func addFeatureGate(current, new string) string {
return current + "," + new
}
func checkCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
cgroupsModeV2 := cgroups.Mode() == cgroups.Unified
// For Unified (v2) cgroups we can directly check to see what controllers are mounted

View File

@ -8,8 +8,10 @@ import (
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/pkg/errors"
"github.com/rootless-containers/rootlesskit/pkg/child"
"github.com/rootless-containers/rootlesskit/pkg/copyup/tmpfssymlink"
@ -17,12 +19,14 @@ import (
"github.com/rootless-containers/rootlesskit/pkg/parent"
portbuiltin "github.com/rootless-containers/rootlesskit/pkg/port/builtin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
var (
pipeFD = "_K3S_ROOTLESS_FD"
childEnv = "_K3S_ROOTLESS_SOCK"
Sock = ""
pipeFD = "_K3S_ROOTLESS_FD"
childEnv = "_K3S_ROOTLESS_SOCK"
evacuateCgroup2Env = "_K3S_ROOTLESS_EVACUATE_CGROUP2" // boolean
Sock = ""
)
func Rootless(stateDir string) error {
@ -61,6 +65,9 @@ func Rootless(stateDir string) error {
}
os.Setenv(childEnv, filepath.Join(parentOpt.StateDir, parent.StateFileAPISock))
if parentOpt.EvacuateCgroup2 != "" {
os.Setenv(evacuateCgroup2Env, "1")
}
if err := parent.Parent(*parentOpt); err != nil {
logrus.Fatal(err)
}
@ -128,8 +135,26 @@ func createParentOpt(stateDir string) (*parent.Opt, error) {
}
opt := &parent.Opt{
StateDir: stateDir,
CreatePIDNS: true,
StateDir: stateDir,
CreatePIDNS: true,
CreateCgroupNS: true,
CreateUTSNS: true,
CreateIPCNS: true,
}
selfCgroupMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
if selfCgroup2 := selfCgroupMap[""]; selfCgroup2 == "" {
logrus.Warnf("enabling cgroup2 is highly recommended, see https://rootlesscontaine.rs/getting-started/common/cgroup2/")
} else {
selfCgroup2Dir := filepath.Join("/sys/fs/cgroup", selfCgroup2)
if unix.Access(selfCgroup2Dir, unix.W_OK) == nil {
opt.EvacuateCgroup2 = "k3s_evac"
} else {
logrus.Warn("cannot set cgroup2 evacuation, make sure to run k3s as a systemd unit")
}
}
mtu := 0
@ -177,5 +202,12 @@ func createChildOpt() (*child.Opt, error) {
opt.CopyUpDriver = tmpfssymlink.NewChildDriver()
opt.MountProcfs = true
opt.Reaper = true
if v := os.Getenv(evacuateCgroup2Env); v != "" {
var err error
opt.EvacuateCgroup2, err = strconv.ParseBool(v)
if err != nil {
return nil, err
}
}
return opt, nil
}