diff --git a/docs/isolated-cpu-affinity-transition.md b/docs/isolated-cpu-affinity-transition.md new file mode 100644 index 00000000000..d2f3b12e899 --- /dev/null +++ b/docs/isolated-cpu-affinity-transition.md @@ -0,0 +1,125 @@ +## Isolated CPU affinity transition + +The introduction of kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 +in 5.7 has affected a deterministic scheduling behavior by distributing tasks +across CPU cores within a cgroups cpuset. This means that `runc exec` might be +impacted under some circumstances, for example when a container has been +created within a cgroup cpuset entirely composed of isolated CPU cores, +usually set with the `nohz_full` and/or `isolcpus` kernel boot parameters. + +Some containerized real-time applications rely on this deterministic +behavior and use the first CPU core to run a slow thread while the other CPU +cores are fully used by the real-time threads with the SCHED_FIFO policy. +Such applications can prevent the runc process from joining a container when the +runc process is randomly scheduled on a CPU core owned by a real-time thread. + +Runc introduces a way to restore this behavior by adding the following +annotation to the container runtime spec (`config.json`): + +`org.opencontainers.runc.exec.isolated-cpu-affinity-transition` + +This annotation can take one of these values: + +* `temporary`: to temporarily set the runc process CPU affinity to the first +isolated CPU core of the container cgroup cpuset. +* `definitive`: to permanently set the runc process CPU affinity to the first +isolated CPU core of the container cgroup cpuset. + +For example: + +```json + "annotations": { + "org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary" + } +``` + +__WARNING:__ `definitive` requires a kernel >= 6.2; it also works with RHEL 9 and +above. + +### How does it work? 
+ +When enabled, during `runc exec`, runc looks for the `nohz_full` kernel +boot parameter value and considers the CPUs in the list as isolated. It doesn't +look for the `isolcpus` boot parameter; it just assumes that the `isolcpus` value is +identical to `nohz_full` when specified. If the `nohz_full` parameter is not found, +runc also attempts to read the list from `/sys/devices/system/cpu/nohz_full`. + +Once it gets the isolated CPU list, it returns an eligible CPU core within the +container cgroup cpuset based on these heuristics: + +* when there are no cpuset cores: no eligible CPU +* when there are no isolated cores: no eligible CPU +* when cpuset cores are not in the isolated core list: no eligible CPU +* when cpuset cores are all isolated cores: return the first CPU of the cpuset +* when cpuset cores are mixed between housekeeping/isolated cores: return the + first housekeeping CPU not in isolated CPUs. + +The returned CPU core is then used to set the `runc init` CPU affinity before +the container cgroup cpuset transition. + +#### Transition example + +`nohz_full` has the isolated cores `4-7`. A container has been created with +the cgroup cpuset `4-7` to only run on the isolated CPU cores 4 to 7. +`runc exec` is called by a process with CPU affinity set to `0-3`. + +* with `temporary` transition: + + runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4-7) + +* with `definitive` transition: + + runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4) + +The difference between `temporary` and `definitive` is the container process +affinity: `definitive` will constrain the container process to run on the +first isolated CPU core of the cgroup cpuset, while `temporary` restores the +CPU affinity to match the container cgroup cpuset. + +The `definitive` transition might be helpful when `nohz_full` is used without +`isolcpus`, to prevent runc and the container process from being noisy neighbours for +real-time applications. 
+ +### How to use it with Kubernetes? + +Kubernetes doesn't manage containers directly; instead, it uses the Container Runtime +Interface (CRI) to communicate with software that implements this interface and is responsible +for managing the lifecycle of containers. Popular CRI implementations include Containerd +and CRI-O. These implementations allow pod annotations to be passed to the container runtime +via the container runtime spec. Currently, runc is the default runtime for both. + +#### Containerd configuration + +Containerd CRI uses runc by default but requires an extra step to pass the annotation to runc. +You have to whitelist `org.opencontainers.runc.exec.isolated-cpu-affinity-transition` as a pod +annotation allowed to be passed to the container runtime in `/etc/containerd/config.toml`: + +```toml +[plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "runc" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + base_runtime_spec = "/etc/containerd/cri-base.json" + pod_annotations = ["org.opencontainers.runc.exec.isolated-cpu-affinity-transition"] +``` + +#### CRI-O configuration + +CRI-O doesn't require any extra step; however, some annotations could be excluded by +configuration. 
+ +#### Pod deployment example + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: demo-pod + annotations: + org.opencontainers.runc.exec.isolated-cpu-affinity-transition: "temporary" +spec: + containers: + - name: demo + image: registry.com/demo:latest +``` diff --git a/features.go b/features.go index eff04c1b2d2..190b7d10422 100644 --- a/features.go +++ b/features.go @@ -68,6 +68,7 @@ var featuresCommand = cli.Command{ "bundle", "org.systemd.property.", // prefix form "org.criu.config", + "org.opencontainers.runc.exec.isolated-cpu-affinity-transition", }, } diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index b9ba889b7a0..d97c874eab6 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -71,4 +71,8 @@ type Manager interface { // OOMKillCount reports OOM kill count for the cgroup. OOMKillCount() (uint64, error) + + // GetEffectiveCPUs returns the effective CPUs of the cgroup, an empty + // value means that the cgroups cpuset subsystem/controller is not enabled. + GetEffectiveCPUs() string } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index d2decb127ca..723d18b7637 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -4,6 +4,8 @@ import ( "errors" "fmt" "os" + "path/filepath" + "strings" "sync" "golang.org/x/sys/unix" @@ -263,3 +265,28 @@ func (m *Manager) OOMKillCount() (uint64, error) { return c, err } + +func (m *Manager) GetEffectiveCPUs() string { + return GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) +} + +func GetEffectiveCPUs(cpusetPath string, cgroups *configs.Cgroup) string { + // Fast path. + if cgroups.CpusetCpus != "" { + return cgroups.CpusetCpus + } else if !strings.HasPrefix(cpusetPath, defaultCgroupRoot) { + return "" + } + + // Iterates until it goes to the cgroup root path. + // It's required for containers in which cpuset controller + // is not enabled, in this case a parent cgroup is used. 
+ for path := cpusetPath; path != defaultCgroupRoot; path = filepath.Dir(path) { + cpus, err := fscommon.GetCgroupParamString(path, "cpuset.effective_cpus") + if err == nil { + return cpus + } + } + + return "" +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 0760be74b97..0a579b23e84 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -4,11 +4,13 @@ import ( "errors" "fmt" "os" + "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" ) type parseError = fscommon.ParseError @@ -32,6 +34,9 @@ func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) { if err != nil { return nil, err } + } else { + // Clean path for safety. + dirPath = utils.CleanPath(dirPath) } m := &Manager{ @@ -316,3 +321,26 @@ func CheckMemoryUsage(dirPath string, r *configs.Resources) error { return nil } + +func (m *Manager) GetEffectiveCPUs() string { + // Fast path. + if m.config.CpusetCpus != "" { + return m.config.CpusetCpus + } else if !strings.HasPrefix(m.dirPath, UnifiedMountpoint) { + return "" + } + + // Iterates until it goes outside of the cgroup root path. + // It's required for containers in which cpuset controller + // is not enabled, in this case a parent cgroup is used. 
+ outsidePath := filepath.Dir(UnifiedMountpoint) + + for path := m.dirPath; path != outsidePath; path = filepath.Dir(path) { + cpus, err := fscommon.GetCgroupParamString(path, "cpuset.cpus.effective") + if err == nil { + return cpus + } + } + + return "" +} diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 8c64a5887a9..e04e35682cd 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -411,3 +411,7 @@ func (m *LegacyManager) Exists() bool { func (m *LegacyManager) OOMKillCount() (uint64, error) { return fs.OOMKillCount(m.Path("memory")) } + +func (m *LegacyManager) GetEffectiveCPUs() string { + return fs.GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) +} diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index b28ec6b22f2..2c4a8c4c3ac 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -514,3 +514,7 @@ func (m *UnifiedManager) Exists() bool { func (m *UnifiedManager) OOMKillCount() (uint64, error) { return m.fsMgr.OOMKillCount() } + +func (m *UnifiedManager) GetEffectiveCPUs() string { + return m.fsMgr.GetEffectiveCPUs() +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 99908376562..c76178d808a 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -69,6 +69,10 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { return configs.Thawed, nil } +func (m *mockCgroupManager) GetEffectiveCPUs() string { + return "" +} + type mockProcess struct { _pid int started uint64 diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 6d51eada2b3..93c0e313e00 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,11 +1,13 @@ package libcontainer import ( + "bytes" "context" "encoding/json" "errors" "fmt" "io" + "io/fs" "net" "os" "os/exec" @@ -21,10 +23,12 @@ import 
( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/system/kernelparam" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -133,8 +137,58 @@ func (p *setnsProcess) start() (retErr error) { // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() - err := p.cmd.Start() - // close the child-side of the pipes (controlled by child) + + // When greater or equal to zero, it will set a temporary single CPU + // affinity before cgroup cpuset transition, this handles a corner + // case when joining a container having all the processes running + // exclusively on isolated CPU cores to force the kernel to schedule + // runc process on the first CPU core within the cgroups cpuset. + // The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 + // in 5.7 has affected this deterministic scheduling behavior by + // distributing tasks across CPU cores within the cgroups cpuset. + // Some intensive real-time application are relying on this + // deterministic behavior and use the first CPU core to run a slow + // thread while other CPU cores are fully used by real-time threads + // with SCHED_FIFO policy. Such applications prevent runc process + // from joining a container when the runc process is randomly + // scheduled on a CPU core owned by a real-time thread. + cpuAffinity := -1 + resetCPUAffinity := true + + if len(p.manager.GetPaths()) > 0 { + // Get the target container cgroup. + if cg, err := p.manager.GetCgroups(); err != nil { + // Close the pipe to not be blocked in the parent. 
+ p.comm.closeChild() + return fmt.Errorf("getting container cgroups: %w", err) + } else if cg.CpusetCpus != "" { + definitive := false + + _, annotations := utils.Annotations(p.config.Config.Labels) + cpuAffinity, definitive, err = isolatedCPUAffinityTransition( + os.DirFS("/"), + cg.CpusetCpus, + annotations, + ) + if err != nil { + // Close the pipe to not be blocked in the parent. + p.comm.closeChild() + return fmt.Errorf("getting CPU affinity: %w", err) + } else if definitive { + resetCPUAffinity = false + } + } + } + + var err error + + if cpuAffinity < 0 { + err = p.cmd.Start() + } else { + err = startCommandWithCPUAffinity(p.cmd, cpuAffinity) + } + + // Close the write-side of the pipes (controlled by child). p.comm.closeChild() if err != nil { return fmt.Errorf("error starting setns process: %w", err) @@ -193,6 +247,18 @@ func (p *setnsProcess) start() (retErr error) { } } } + + if resetCPUAffinity { + // Fix the container process CPU affinity to match container cgroup cpuset, + // since kernel 6.2, the runc CPU affinity might affect the container process + // CPU affinity after cgroup cpuset transition, by example if runc is running + // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the + // resulting container process CPU affinity will be 1 instead of 1-2. 
+ if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { + return fmt.Errorf("error resetting container process CPU affinity: %w", err) + } + } + if p.intelRdtPath != "" { // if Intel RDT "resource control" filesystem path exists _, err := os.Stat(p.intelRdtPath) @@ -744,6 +810,14 @@ func (p *initProcess) start() (retErr error) { if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) } + // Reset container process CPU affinity to match container cgroup cpuset, + // since kernel 6.2, the runc CPU affinity might affect the container process + // CPU affinity after cgroup cpuset transition, by example if runc is running + // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the + // resulting container process CPU affinity will be 1 instead of 1-2. + if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { + return fmt.Errorf("error resetting container process CPU affinity: %w", err) + } if p.intelRdtManager != nil { if err := p.intelRdtManager.Set(p.config.Config); err != nil { return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) @@ -995,5 +1069,196 @@ func setIOPriority(ioprio *configs.IOPriority) error { if errno != 0 { return fmt.Errorf("failed to set io priority: %w", errno) } + + return nil +} + +// isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics +// and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value. 
+func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) { + const ( + isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" + nohzFullParam = "nohz_full" + ) + + definitive := false + + transition := annotations[isolatedCPUAffinityTransitionAnnotation] + switch transition { + case "temporary": + case "definitive": + definitive = true + default: + if transition != "" { + return -1, false, fmt.Errorf( + "unknown transition value %q for annotation %s", + transition, isolatedCPUAffinityTransitionAnnotation, + ) + } + return -1, false, nil + } + + kernelParams, err := kernelparam.LookupKernelBootParameters( + rootFS, + nohzFullParam, + ) + if err != nil { + // If /proc/cmdline does not exist or isn't readable, continue to read + // nohz_full from sysfs below. + if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) { + return -1, false, err + } + } + + // First get nohz_full value from kernel boot params, if not + // present, get the value from sysfs, to cover the case where + // CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration + // tests not dependent on /sys/devices/system/cpu/nohz_full. + isolatedList := kernelParams[nohzFullParam] + if isolatedList == "" { + // Get the isolated CPU list, the error is not checked here because + // no matter what the error is, it returns without error the same way + // as with empty data. 
+ isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full") + isolatedList = string(bytes.TrimSpace(isolatedData)) + if isolatedList == "" || isolatedList == "(null)" { + return -1, false, nil + } + } + + cpu, err := getEligibleCPU(cpusetList, isolatedList) + if err != nil { + return -1, false, fmt.Errorf("getting eligible cpu: %w", err) + } else if cpu == -1 { + definitive = false + } + + return cpu, definitive, nil +} + +// getEligibleCPU returns the first eligible CPU for CPU affinity before +// entering in a cgroup cpuset: +// - when there is not cpuset cores: no eligible CPU (-1) +// - when there is not isolated cores: no eligible CPU (-1) +// - when cpuset cores are not in isolated cores: no eligible CPU (-1) +// - when cpuset cores are all isolated cores: return the first CPU of the cpuset +// - when cpuset cores are mixed between housekeeping/isolated cores: return the +// first housekeeping CPU not in isolated CPUs. +func getEligibleCPU(cpusetList, isolatedList string) (int, error) { + if isolatedList == "" || cpusetList == "" { + return -1, nil + } + + // The target container has a cgroup cpuset, get the bit range. + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + isolatedBits, err := systemd.RangeToBits(isolatedList) + if err != nil { + return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) + } + + eligibleCore := -1 + isolatedCores := 0 + + // Start from cpu core #0. + currentCore := 0 + // Handle mixed sets. + mixed := false + + // CPU core start from the first slice element and bits are read + // from the least to the most significant bit. + for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + if byteRange >= len(isolatedBits) { + // No more isolated cores. 
+ break + } + for bit := 0; bit < 8; bit++ { + if cpusetBits[byteRange]&(1<<bit) != 0 { + // Mark the first core of the cgroup cpuset as eligible. + if eligibleCore < 0 { + eligibleCore = currentCore + } + + // Isolated cores count. + if isolatedBits[byteRange]&(1<<bit) != 0 { + isolatedCores++ + } else if !mixed { + // Not an isolated core, mark the current core as eligible once. + mixed = true + eligibleCore = currentCore + } + if mixed && isolatedCores > 0 { + return eligibleCore, nil + } + } + currentCore++ + } + } + + // We have an eligible CPU if there is at least one isolated CPU in the cpuset. + if isolatedCores == 0 { + return -1, nil + } + + return eligibleCore, nil +} + +// startCommandWithCPUAffinity starts a command on a specific CPU if set. +func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error { + errCh := make(chan error) + defer close(errCh) + + // Use a goroutine to dedicate an OS thread. + go func() { + cpuSet := new(unix.CPUSet) + cpuSet.Zero() + cpuSet.Set(cpuAffinity) + + // Don't call runtime.UnlockOSThread to terminate the OS thread + // when goroutine exits. + runtime.LockOSThread() + + // Command inherits the CPU affinity. + if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil { + errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err) + return + } + + errCh <- cmd.Start() + }() + + return <-errCh +} + +// fixProcessCPUAffinity sets the CPU affinity of a container process +// to all CPUs allowed by container cgroup cpuset. +func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error { + cpusetList := manager.GetEffectiveCPUs() + if cpusetList == "" { + // If the cgroup cpuset is not present, the container will inherit + // this process CPU affinity, so it can return without further actions. 
+ return nil + } + + cpusetBits, err := systemd.RangeToBits(cpusetList) + if err != nil { + return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) + } + + processCPUSet := new(unix.CPUSet) + + for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { + for bit := 0; bit < 8; bit++ { + processCPUSet.Set(byteRange*8 + bit) + } + } + + if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil { + return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err) + } + return nil } diff --git a/libcontainer/process_linux_test.go b/libcontainer/process_linux_test.go new file mode 100644 index 00000000000..8303643967b --- /dev/null +++ b/libcontainer/process_linux_test.go @@ -0,0 +1,232 @@ +package libcontainer + +import ( + "io/fs" + "testing" + "testing/fstest" +) + +func TestIsolatedCPUAffinityTransition(t *testing.T) { + const isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" + + noAffinity := -1 + temporaryTransition := "temporary" + definitiveTransition := "definitive" + + tests := []struct { + name string + testFS fs.FS + cpuset string + expectedErr bool + expectedAffinityCore int + expectedDefinitiveTransition bool + annotations map[string]string + }{ + { + name: "no affinity", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: noAffinity, + expectedDefinitiveTransition: false, + }, + { + name: "affinity match with temporary transition", + cpuset: "3-4", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity match with temporary transition and nohz_full boot param", + cpuset: "3-4", + testFS: fstest.MapFS{ + "proc/cmdline": 
&fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity match with definitive transition", + cpuset: "3-4", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: true, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: definitiveTransition, + }, + }, + { + name: "affinity match with definitive transition and nohz_full boot param", + cpuset: "3-4", + testFS: fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, + }, + expectedAffinityCore: 3, + expectedDefinitiveTransition: true, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: definitiveTransition, + }, + }, + { + name: "affinity error with bad isolated set", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("bad_isolated_set\n")}, + }, + expectedErr: true, + expectedAffinityCore: noAffinity, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "affinity error with bad isolated set for nohz_full boot param", + cpuset: "0-15", + testFS: fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=bad_isolated_set\n")}, + }, + expectedErr: true, + expectedAffinityCore: noAffinity, + annotations: map[string]string{ + isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + { + name: "no affinity with null isolated set value", + cpuset: "0-15", + testFS: fstest.MapFS{ + "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("(null)\n")}, + }, + expectedAffinityCore: noAffinity, + expectedDefinitiveTransition: false, + annotations: map[string]string{ + 
isolatedCPUAffinityTransitionAnnotation: temporaryTransition, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, definitive, err := isolatedCPUAffinityTransition(tt.testFS, tt.cpuset, tt.annotations) + if err != nil && !tt.expectedErr { + t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedDefinitiveTransition != definitive { + t.Fatalf("expected reset affinity %t: got %t instead", tt.expectedDefinitiveTransition, definitive) + } else if tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} + +func TestGetEligibleCPU(t *testing.T) { + tests := []struct { + name string + cpuset string + isolset string + expectedErr bool + expectedAffinityCore int + expectedEligible bool + }{ + { + name: "no cpuset", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no isolated set", + cpuset: "0-15", + expectedEligible: false, + }, + { + name: "bad cpuset format", + cpuset: "core0 to core15", + isolset: "2-15,18-31,34-47", + expectedErr: true, + }, + { + name: "bad isolated set format", + cpuset: "0-15", + isolset: "core0 to core15", + expectedErr: true, + }, + { + name: "no eligible core", + cpuset: "0-1,16-17,32-33", + isolset: "2-15,18-31,34-47", + expectedEligible: false, + }, + { + name: "no eligible core inverted", + cpuset: "2-15,18-31,34-47", + isolset: "0-1,16-17,32-33", + expectedEligible: false, + }, + { + name: "eligible core mixed", + cpuset: "8-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 16, + }, + { + name: "eligible core #4", + cpuset: "4-7", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 4, + }, + { + name: "eligible core #40", + cpuset: "40-47", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 40, + }, + { 
+ name: "eligible core #24", + cpuset: "24-31", + isolset: "2-15,18-31,34-47", + expectedEligible: true, + expectedAffinityCore: 24, + }, + { + name: "no eligible core small isolated set", + cpuset: "60-63", + isolset: "0-1", + expectedEligible: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + affinityCore, err := getEligibleCPU(tt.cpuset, tt.isolset) + eligible := affinityCore >= 0 + if err != nil && !tt.expectedErr { + t.Fatalf("unexpected error: %s", err) + } else if err == nil && tt.expectedErr { + t.Fatalf("unexpected success") + } else if tt.expectedEligible && !eligible { + t.Fatalf("was expecting eligible core but no eligible core returned") + } else if !tt.expectedEligible && eligible { + t.Fatalf("was not expecting eligible core but got eligible core") + } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { + t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) + } + }) + } +} diff --git a/libcontainer/system/kernelparam/lookup.go b/libcontainer/system/kernelparam/lookup.go new file mode 100644 index 00000000000..4cf452412ff --- /dev/null +++ b/libcontainer/system/kernelparam/lookup.go @@ -0,0 +1,41 @@ +package kernelparam + +import ( + "io/fs" + "strings" +) + +func runeFilter(c rune) bool { + return c < '!' || c > '~' +} + +// LookupKernelBootParameters returns the selected kernel parameters specified +// in the kernel command line. The parameters are returned as a map of key-value pairs. 
+func LookupKernelBootParameters(rootFS fs.FS, lookupParameters ...string) (map[string]string, error) { + cmdline, err := fs.ReadFile(rootFS, "proc/cmdline") + if err != nil { + return nil, err + } + + kernelParameters := make(map[string]string) + remaining := len(lookupParameters) + + for _, parameter := range strings.FieldsFunc(string(cmdline), runeFilter) { + if remaining == 0 { + break + } + idx := strings.IndexByte(parameter, '=') + if idx == -1 { + continue + } + for _, lookupParam := range lookupParameters { + if lookupParam == parameter[:idx] { + kernelParameters[lookupParam] = parameter[idx+1:] + remaining-- + break + } + } + } + + return kernelParameters, nil +} diff --git a/libcontainer/system/kernelparam/lookup_test.go b/libcontainer/system/kernelparam/lookup_test.go new file mode 100644 index 00000000000..9d906301eb4 --- /dev/null +++ b/libcontainer/system/kernelparam/lookup_test.go @@ -0,0 +1,60 @@ +package kernelparam + +import ( + "testing" + "testing/fstest" +) + +func TestLookupKernelBootParameters(t *testing.T) { + for _, test := range []struct { + cmdline string + lookupParameters []string + expectedKernelParameters map[string]string + }{ + { + cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", + lookupParameters: []string{"root"}, + expectedKernelParameters: map[string]string{ + "root": "/dev/sda1", + }, + }, + { + cmdline: "ro runc.kernel_parameter=a_value console=ttyS0 console=tty0", + lookupParameters: []string{"runc.kernel_parameter"}, + expectedKernelParameters: map[string]string{ + "runc.kernel_parameter": "a_value", + }, + }, + { + cmdline: "ro runc.kernel_parameter_a=value_a runc.kernel_parameter_b=value_a:value_b", + lookupParameters: []string{ + "runc.kernel_parameter_a", + "runc.kernel_parameter_b", + }, + expectedKernelParameters: map[string]string{ + "runc.kernel_parameter_a": "value_a", + "runc.kernel_parameter_b": "value_a:value_b", + }, + }, + { + cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", + lookupParameters: 
[]string{"runc.kernel_parameter_a"}, + expectedKernelParameters: map[string]string{}, + }, + } { + params, err := LookupKernelBootParameters(fstest.MapFS{ + "proc/cmdline": &fstest.MapFile{Data: []byte(test.cmdline + "\n")}, + }, test.lookupParameters...) + if err != nil { + t.Fatalf("unexpected error: %s", err) + } + if len(params) != len(test.expectedKernelParameters) { + t.Fatalf("expected %d parameters, got %d", len(test.expectedKernelParameters), len(params)) + } + for k, v := range test.expectedKernelParameters { + if params[k] != v { + t.Fatalf("expected parameter %s to be %s, got %s", k, v, params[k]) + } + } + } +} diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index a92a237809d..c0ea7f5dce6 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -340,3 +340,139 @@ EOF [ ${#lines[@]} -eq 1 ] [[ ${lines[0]} = *"exec /run.sh: no such file or directory"* ]] } + +@test "runc exec with isolated cpus affinity temporary transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + # set temporary isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_temporary_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_temporary_transition grep "Cpus_allowed_list:" /proc/self/status + + umount /proc/cmdline + + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} + +@test "runc exec with isolated cpus affinity definitive transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local 
all_cpus + all_cpus="$(get_all_online_cpus)" + + # set definitive isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "definitive"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_definitive_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_definitive_transition grep "Cpus_allowed_list:" /proc/self/status + + umount /proc/cmdline + + [ "$status" -eq 0 ] + + load /etc/os-release + + # fix unbound variable in condition below + VERSION_ID=${VERSION_ID:-} + + allowed_cpus=$all_cpus + # use first cpu on systems with RHEL >= 9 or systems with kernel >= 6.2 + if [[ "${ID_LIKE:-}" =~ "rhel" && "${VERSION_ID%%.*}" -ge "9" ]] || is_kernel_gte 6.2; then + allowed_cpus="$(get_first_online_cpu)" + fi + + [[ "${lines[0]}" == "Cpus_allowed_list: $allowed_cpus" ]] +} + +@test "runc exec with isolated cpus affinity bad transition [cgroup cpuset]" { + requires root cgroups_cpuset + + tmp=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + # set a bad isolated CPU affinity transition + update_config '.annotations += {"org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "bad"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_isolated_bad_transition + [ "$status" -eq 0 ] + + # set all online cpus as isolated + echo "nohz_full=$all_cpus" >"$tmp/cmdline" + + mount --bind "$tmp/cmdline" /proc/cmdline + + runc exec test_isolated_bad_transition true + + umount /proc/cmdline + + [ "$status" -eq 255 ] +} + +@test "runc exec with taskset affinity [cgroup cpuset]" { + requires root cgroups_cpuset + + set_cgroup_cpuset_all_cpus + local all_cpus + all_cpus="$(get_all_online_cpus)" + + taskset -p -c "$(get_first_online_cpu)" $$ + + runc run -d --console-socket 
"$CONSOLE_SOCKET" test_with_taskset + [ "$status" -eq 0 ] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} + +@test "runc exec with taskset affinity [rootless cgroups_v2]" { + requires rootless cgroups_v2 + + local all_cpus + all_cpus="$(get_all_online_cpus)" + + taskset -p -c "$(get_first_online_cpu)" $$ + + runc run -d --console-socket "$CONSOLE_SOCKET" test_with_taskset + [ "$status" -eq 0 ] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/1/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] + + runc exec test_with_taskset grep "Cpus_allowed_list:" /proc/self/status + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "Cpus_allowed_list: $all_cpus" ]] +} diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index e8a2894b915..6b836f15d41 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -354,6 +354,27 @@ function set_cgroup_mount_writable() { update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' } +# Helper function to get all online cpus. +function get_all_online_cpus() { + cat /sys/devices/system/cpu/online +} + +# Helper function to get the first online cpu. +function get_first_online_cpu() { + [[ $(get_all_online_cpus) =~ [^0-9]*([0-9]+)([-,][0-9]+)? ]] && echo "${BASH_REMATCH[1]}" +} + +# Helper function to set all cpus/mems in container cgroup cpuset. 
+function set_cgroup_cpuset_all_cpus() { + update_config ".linux.resources.cpu.cpus = \"$(get_all_online_cpus)\"" + + local mems + mems="$(cat /sys/devices/system/node/online 2>/dev/null || true)" + if [[ -n $mems ]]; then + update_config ".linux.resources.cpu.mems = \"$mems\"" + fi +} + # Fails the current test, providing the error given. function fail() { echo "$@" >&2