diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ebb6072239..aae5d9f46d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 IDs before calling libcontainer; it is recommended to use Go package github.com/moby/sys/user for that. (#3999) +### Fixed + * `runc exec -p` no longer ignores specified `ioPriority` and `scheduler` + settings. Similarly, libcontainer's `Container.Start` and `Container.Run` + methods no longer ignore `Process.IOPriority` and `Process.Scheduler` + settings. (#4585) + ## [1.2.0] - 2024-10-22 > できるときにできることをやるんだ。それが今だ。 diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go index 8ed3cac0870..8bddc0007b7 100644 --- a/libcontainer/capabilities/capabilities.go +++ b/libcontainer/capabilities/capabilities.go @@ -47,6 +47,9 @@ func KnownCapabilities() []string { // printing a warning instead. func New(capConfig *configs.Capabilities) (*Caps, error) { var c Caps + if capConfig == nil { + return &c, nil + } _, err := capMap() if err != nil { @@ -103,6 +106,9 @@ type Caps struct { // ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. func (c *Caps) ApplyBoundingSet() error { + if c.pid == nil { + return nil + } c.pid.Clear(capability.BOUNDING) c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...) return c.pid.Apply(capability.BOUNDING) @@ -110,6 +116,9 @@ func (c *Caps) ApplyBoundingSet() error { // Apply sets all the capabilities for the current process in the config. 
func (c *Caps) ApplyCaps() error { + if c.pid == nil { + return nil + } c.pid.Clear(capability.CAPS | capability.BOUNDS) for _, g := range []capability.CapType{ capability.EFFECTIVE, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index a411d40813d..54a0eaafe06 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -689,6 +689,9 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm } func (c *Container) newInitConfig(process *Process) *initConfig { + // Set initial properties. For those properties that exist + // both in the container config and the process, use the ones + // from the container config first, and override them later. cfg := &initConfig{ Config: c.config, Args: process.Args, @@ -697,19 +700,25 @@ func (c *Container) newInitConfig(process *Process) *initConfig { GID: process.GID, AdditionalGroups: process.AdditionalGroups, Cwd: process.Cwd, - Capabilities: process.Capabilities, + Capabilities: c.config.Capabilities, PassedFilesCount: len(process.ExtraFiles), ContainerID: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, - RootlessEUID: c.config.RootlessEUID, - RootlessCgroups: c.config.RootlessCgroups, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, + IOPriority: c.config.IOPriority, + Scheduler: c.config.Scheduler, CreateConsole: process.ConsoleSocket != nil, ConsoleWidth: process.ConsoleWidth, ConsoleHeight: process.ConsoleHeight, } + + // Overwrite config properties with ones from process. 
+ + if process.Capabilities != nil { + cfg.Capabilities = process.Capabilities + } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges } @@ -722,6 +731,15 @@ func (c *Container) newInitConfig(process *Process) *initConfig { if len(process.Rlimits) > 0 { cfg.Rlimits = process.Rlimits } + if process.IOPriority != nil { + cfg.IOPriority = process.IOPriority + } + if process.Scheduler != nil { + cfg.Scheduler = process.Scheduler + } + + // Set misc properties. + if cgroups.IsCgroup2UnifiedMode() { cfg.Cgroup2Path = c.cgroupManager.Path("") } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index cff21e1bc66..f78e561755f 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -47,30 +47,54 @@ type network struct { TempVethPeerName string `json:"temp_veth_peer_name"` } -// initConfig is used for transferring parameters from Exec() to Init() +// initConfig is used for transferring parameters from Exec() to Init(). +// It contains: +// - original container config; +// - some [Process] properties; +// - set of properties merged from the container config ([configs.Config]) +// and the process ([Process]); +// - some properties that come from the container. +// +// When adding new fields, please make sure they go into the relevant section. 
type initConfig struct { - Args []string `json:"args"` - Env []string `json:"env"` - Cwd string `json:"cwd"` - Capabilities *configs.Capabilities `json:"capabilities"` - ProcessLabel string `json:"process_label"` - AppArmorProfile string `json:"apparmor_profile"` - NoNewPrivileges bool `json:"no_new_privileges"` - UID int `json:"uid"` - GID int `json:"gid"` - AdditionalGroups []int `json:"additional_groups"` - Config *configs.Config `json:"config"` - Networks []*network `json:"network"` - PassedFilesCount int `json:"passed_files_count"` - ContainerID string `json:"containerid"` - Rlimits []configs.Rlimit `json:"rlimits"` - CreateConsole bool `json:"create_console"` - ConsoleWidth uint16 `json:"console_width"` - ConsoleHeight uint16 `json:"console_height"` - RootlessEUID bool `json:"rootless_euid,omitempty"` - RootlessCgroups bool `json:"rootless_cgroups,omitempty"` - SpecState *specs.State `json:"spec_state,omitempty"` - Cgroup2Path string `json:"cgroup2_path,omitempty"` + // Config is the original container config. + Config *configs.Config `json:"config"` + + // Properties that are unique to and come from [Process]. + + Args []string `json:"args"` + Env []string `json:"env"` + UID int `json:"uid"` + GID int `json:"gid"` + AdditionalGroups []int `json:"additional_groups"` + Cwd string `json:"cwd"` + CreateConsole bool `json:"create_console"` + ConsoleWidth uint16 `json:"console_width"` + ConsoleHeight uint16 `json:"console_height"` + PassedFilesCount int `json:"passed_files_count"` + + // Properties that exist both in the container config and the process, + // as merged by [Container.newInitConfig] (process properties take precedence).
+ + AppArmorProfile string `json:"apparmor_profile"` + Capabilities *configs.Capabilities `json:"capabilities"` + NoNewPrivileges bool `json:"no_new_privileges"` + ProcessLabel string `json:"process_label"` + Rlimits []configs.Rlimit `json:"rlimits"` + IOPriority *configs.IOPriority `json:"io_priority,omitempty"` + Scheduler *configs.Scheduler `json:"scheduler,omitempty"` + + // Miscellaneous properties, filled in by [Container.newInitConfig] + // unless documented otherwise. + + ContainerID string `json:"containerid"` + Cgroup2Path string `json:"cgroup2_path,omitempty"` + + // Networks is filled in from container config by [initProcess.createNetworkInterfaces]. + Networks []*network `json:"network"` + + // SpecState is filled in by [initProcess.Start]. + SpecState *specs.State `json:"spec_state,omitempty"` } // Init is part of "runc init" implementation. @@ -300,13 +324,7 @@ func finalizeNamespace(config *initConfig) error { } } - caps := &configs.Capabilities{} - if config.Capabilities != nil { - caps = config.Capabilities - } else if config.Config.Capabilities != nil { - caps = config.Config.Capabilities - } - w, err := capabilities.New(caps) + w, err := capabilities.New(config.Capabilities) if err != nil { return err } @@ -456,7 +474,7 @@ func setupUser(config *initConfig) error { // There's nothing we can do about /etc/group entries, so we silently // ignore setting groups here (since the user didn't explicitly ask us to // set the group). 
- allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" + allowSupGroups := !config.Config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" if allowSupGroups { if err := unix.Setgroups(config.AdditionalGroups); err != nil { @@ -590,7 +608,7 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { return nil } -func setupScheduler(config *configs.Config) error { +func setupScheduler(config *initConfig) error { if config.Scheduler == nil { return nil } @@ -599,7 +617,7 @@ func setupScheduler(config *configs.Config) error { return err } if err := unix.SchedSetAttr(0, attr, 0); err != nil { - if errors.Is(err, unix.EPERM) && config.Cgroups.CpusetCpus != "" { + if errors.Is(err, unix.EPERM) && config.Config.Cgroups.CpusetCpus != "" { return errors.New("process scheduler can't be used together with AllowedCPUs") } return fmt.Errorf("error setting scheduler: %w", err) @@ -607,7 +625,7 @@ func setupScheduler(config *configs.Config) error { return nil } -func setupIOPriority(config *configs.Config) error { +func setupIOPriority(config *initConfig) error { const ioprioWhoPgrp = 1 ioprio := config.IOPriority diff --git a/libcontainer/process.go b/libcontainer/process.go index 09162be9a40..0e24c548ed8 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -17,8 +17,11 @@ type processOperations interface { pid() int } -// Process specifies the configuration and IO for a process inside -// a container. +// Process defines the configuration and IO for a process inside a container. +// +// Note that some Process properties are also present in container configuration +// ([configs.Config]). In all such cases, Process properties take precedence +// over container configuration ones. type Process struct { // The command to be run followed by any arguments. Args []string @@ -34,44 +37,54 @@ type Process struct { // in addition to those that the user belongs to. 
AdditionalGroups []int - // Cwd will change the processes current working directory inside the container's rootfs. + // Cwd will change the process's current working directory inside the container's rootfs. Cwd string - // Stdin is a pointer to a reader which provides the standard input stream. + // Stdin is a reader which provides the standard input stream. Stdin io.Reader - // Stdout is a pointer to a writer which receives the standard output stream. + // Stdout is a writer which receives the standard output stream. Stdout io.Writer - // Stderr is a pointer to a writer which receives the standard error stream. + // Stderr is a writer which receives the standard error stream. Stderr io.Writer - // ExtraFiles specifies additional open files to be inherited by the container + // ExtraFiles specifies additional open files to be inherited by the process. ExtraFiles []*os.File - // open handles to cloned binaries -- see dmz.CloneSelfExe for more details + // Open handles to cloned binaries -- see dmz.CloneSelfExe for more details. clonedExes []*os.File - // Initial sizings for the console + // Initial size for the console. ConsoleWidth uint16 ConsoleHeight uint16 - // Capabilities specify the capabilities to keep when executing the process inside the container - // All capabilities not specified will be dropped from the processes capability mask + // Capabilities specify the capabilities to keep when executing the process. + // All capabilities not specified will be dropped from the process's capability mask. + // + // If not nil, takes precedence over container's [configs.Config.Capabilities]. Capabilities *configs.Capabilities // AppArmorProfile specifies the profile to apply to the process and is - // changed at the time the process is execed + // changed at the time the process is executed. + // + // If not empty, takes precedence over container's [configs.Config.AppArmorProfile]. AppArmorProfile string - // Label specifies the label to apply to the process.
It is commonly used by selinux + // Label specifies the label to apply to the process. It is commonly used by selinux. + // + // If not empty, takes precedence over container's [configs.Config.ProcessLabel]. Label string // NoNewPrivileges controls whether processes can gain additional privileges. + // + // If not nil, takes precedence over container's [configs.Config.NoNewPrivileges]. NoNewPrivileges *bool - // Rlimits specifies the resource limits, such as max open files, to set in the container - // If Rlimits are not set, the container will inherit rlimits from the parent process + // Rlimits specifies the resource limits, such as max open files, to set for the process. + // If unset, the process will inherit rlimits from the parent process. + // + // If not empty, takes precedence over container's [configs.Config.Rlimits]. Rlimits []configs.Rlimit // ConsoleSocket provides the masterfd console. @@ -99,8 +112,14 @@ type Process struct { // For cgroup v2, the only key allowed is "". SubCgroupPaths map[string]string + // Scheduler represents the scheduling attributes for a process. + // + // If not nil, takes precedence over container's [configs.Config.Scheduler]. Scheduler *configs.Scheduler + // IOPriority is a process I/O priority. + // + // If not nil, takes precedence over container's [configs.Config.IOPriority].
IOPriority *configs.IOPriority } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 012b0506713..68e16b7920b 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -106,7 +106,7 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { root: config.Rootfs, label: config.MountLabel, cgroup2Path: iConfig.Cgroup2Path, - rootlessCgroups: iConfig.RootlessCgroups, + rootlessCgroups: config.RootlessCgroups, cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } for _, m := range config.Mounts { diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index d1885b3fdda..0a79f197e6d 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -71,11 +71,11 @@ func (l *linuxSetnsInit) Init() error { unix.Umask(int(*l.config.Config.Umask)) } - if err := setupScheduler(l.config.Config); err != nil { + if err := setupScheduler(l.config); err != nil { return err } - if err := setupIOPriority(l.config.Config); err != nil { + if err := setupIOPriority(l.config); err != nil { return err } // Tell our parent that we're ready to exec. This must be done before the diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 9517820bcad..384750bf837 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -155,11 +155,11 @@ func (l *linuxStandardInit) Init() error { } } - if err := setupScheduler(l.config.Config); err != nil { + if err := setupScheduler(l.config); err != nil { return err } - if err := setupIOPriority(l.config.Config); err != nil { + if err := setupIOPriority(l.config); err != nil { return err } diff --git a/tests/integration/ioprio.bats b/tests/integration/ioprio.bats index a907d782f01..9faa72d61ab 100644 --- a/tests/integration/ioprio.bats +++ b/tests/integration/ioprio.bats @@ -20,11 +20,25 @@ function teardown() { # Check the init process. 
runc exec test_ioprio ionice -p 1 [ "$status" -eq 0 ] - [[ "$output" = *'best-effort: prio 4'* ]] + [ "${lines[0]}" = 'best-effort: prio 4' ] - # Check the process made from the exec command. + # Check an exec process, which should derive ioprio from config.json. runc exec test_ioprio ionice [ "$status" -eq 0 ] - - [[ "$output" = *'best-effort: prio 4'* ]] + [ "${lines[0]}" = 'best-effort: prio 4' ] + + # Check an exec with a priority taken from process.json, + # which should override the ioprio in config.json. + proc=' +{ + "terminal": false, + "ioPriority": { + "class": "IOPRIO_CLASS_IDLE" + }, + "args": [ "/usr/bin/ionice" ], + "cwd": "/" +}' + runc exec --process <(echo "$proc") test_ioprio + [ "$status" -eq 0 ] + [ "${lines[0]}" = 'idle' ] } diff --git a/tests/integration/scheduler.bats b/tests/integration/scheduler.bats index b7cd96f8890..6c80d86426b 100644 --- a/tests/integration/scheduler.bats +++ b/tests/integration/scheduler.bats @@ -12,17 +12,49 @@ function teardown() { } @test "scheduler is applied" { - update_config ' .process.scheduler = {"policy": "SCHED_DEADLINE", "nice": 19, "priority": 0, "runtime": 42000, "deadline": 1000000, "period": 1000000, }' + update_config ' .process.scheduler = { + "policy": "SCHED_BATCH", + "priority": 0, + "nice": 19 + }' runc run -d --console-socket "$CONSOLE_SOCKET" test_scheduler [ "$status" -eq 0 ] + # Check init settings. runc exec test_scheduler chrt -p 1 [ "$status" -eq 0 ] + [[ "${lines[0]}" == *"scheduling policy: SCHED_BATCH" ]] + [[ "${lines[1]}" == *"priority: 0" ]] + + # Check exec settings derived from config.json. + runc exec test_scheduler sh -c 'chrt -p $$' + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *"scheduling policy: SCHED_BATCH" ]] + [[ "${lines[1]}" == *"priority: 0" ]] + + # Another exec, with different scheduler settings. 
+ proc=' +{ + "terminal": false, + "args": [ "/bin/sleep", "600" ], + "cwd": "/", + "scheduler": { + "policy": "SCHED_DEADLINE", + "flags": [ "SCHED_FLAG_RESET_ON_FORK" ], + "nice": 19, + "priority": 0, + "runtime": 42000, + "deadline": 100000, + "period": 1000000 + } +}' + __runc exec -d --pid-file pid.txt --process <(echo "$proc") test_scheduler - [[ "${lines[0]}" == *"scheduling policy: SCHED_DEADLINE" ]] + run chrt -p "$(cat pid.txt)" + [[ "${lines[0]}" == *"scheduling policy: SCHED_DEADLINE|SCHED_RESET_ON_FORK" ]] [[ "${lines[1]}" == *"priority: 0" ]] - [[ "${lines[2]}" == *"runtime/deadline/period parameters: 42000/1000000/1000000" ]] + [[ "${lines[2]}" == *"runtime/deadline/period parameters: 42000/100000/1000000" ]] } # Checks that runc emits a specific error when scheduling policy is used