From 9391cd3d6d1cbb3c556dfd4149f0b929dadfb636 Mon Sep 17 00:00:00 2001
From: Fraser Tweedale
Date: Wed, 16 Jun 2021 15:30:42 +1000
Subject: [PATCH] chown cgroup to process uid in container namespace

Delegating cgroups to the container enables more complex workloads,
including systemd-based workloads. The OCI runtime-spec was recently
updated to explicitly admit such delegation, through specification of
cgroup ownership semantics:

https://github.com/opencontainers/runtime-spec/pull/1123

Pursuant to the updated OCI runtime-spec, change the ownership of the
container's cgroup directory and particular files therein, when using
cgroups v2 and when the cgroupfs is to be mounted read/write.

As a result of this change, systemd workloads can run in isolated user
namespaces on OpenShift when the sandbox's cgroupfs is mounted
read/write.

It might be possible to implement this feature in other cgroup
managers, but that work is deferred.

Signed-off-by: Fraser Tweedale
---
 libcontainer/cgroups/systemd/v2.go   | 39 ++++++++++++++++++++++++++++
 libcontainer/configs/cgroup_linux.go |  7 +++++
 2 files changed, 46 insertions(+)

diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go
index 49c0bfab6..c31f0ecfd 100644
--- a/libcontainer/cgroups/systemd/v2.go
+++ b/libcontainer/cgroups/systemd/v2.go
@@ -1,8 +1,10 @@
 package systemd
 
 import (
+	"bufio"
 	"fmt"
 	"math"
+	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -288,9 +290,46 @@ func (m *unifiedManager) Apply(pid int) error {
 	if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
 		return err
 	}
+
+	if c.OwnerUID != nil {
+		filesToChown, err := cgroupFilesToChown()
+		if err != nil {
+			return err
+		}
+
+		for _, v := range filesToChown {
+			err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
+			if err != nil {
+				return err
+			}
+		}
+	}
+
 	return nil
 }
 
+// The kernel exposes a list of files that should be chowned to the delegate
+// uid in /sys/kernel/cgroup/delegate. If the file is not present
+// (Linux < 4.15), use the initial values mentioned in cgroups(7).
+func cgroupFilesToChown() ([]string, error) {
+	filesToChown := []string{"."} // the directory itself must be chowned
+	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
+	f, err := os.Open(cgroupDelegateFile)
+	if err == nil {
+		defer f.Close()
+		scanner := bufio.NewScanner(f)
+		for scanner.Scan() {
+			filesToChown = append(filesToChown, scanner.Text())
+		}
+		if err := scanner.Err(); err != nil {
+			return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
+		}
+	} else {
+		filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
+	}
+	return filesToChown, nil
+}
+
 func (m *unifiedManager) Destroy() error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go
index 25424bdcb..2d4a89871 100644
--- a/libcontainer/configs/cgroup_linux.go
+++ b/libcontainer/configs/cgroup_linux.go
@@ -41,6 +41,13 @@ type Cgroup struct {
 
 	// Rootless tells if rootless cgroups should be used.
 	Rootless bool
+
+	// The host UID that should own the cgroup, or nil to accept
+	// the default ownership. This should only be set when the
+	// cgroupfs is to be mounted read/write.
+	// Not all cgroup manager implementations support changing
+	// the ownership.
+	OwnerUID *int `json:"owner_uid,omitempty"`
 }
 
 type Resources struct {