diff --git a/go.mod b/go.mod index dcbb0ae3c16..4b2adfc2e80 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/cyphar/filepath-securejoin v0.2.4 github.com/docker/go-units v0.5.0 github.com/godbus/dbus/v5 v5.1.0 - github.com/moby/sys/mountinfo v0.6.2 + github.com/moby/sys/mountinfo v0.7.1 github.com/moby/sys/user v0.1.0 github.com/mrunalp/fileutils v0.5.1 github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 diff --git a/go.sum b/go.sum index 3b347e30a91..fbeb4d83575 100644 --- a/go.sum +++ b/go.sum @@ -26,8 +26,8 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vygl78= -github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= +github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g= +github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg= github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU= github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= diff --git a/libcontainer/apparmor/apparmor_linux.go b/libcontainer/apparmor/apparmor_linux.go index 8b1483c7de7..17d36ed15a3 100644 --- a/libcontainer/apparmor/apparmor_linux.go +++ b/libcontainer/apparmor/apparmor_linux.go @@ -26,14 +26,19 @@ func isEnabled() bool { } func setProcAttr(attr, value string) error { - // Under AppArmor you can only change your own attr, so use /proc/self/ - // instead of /proc// like libapparmor does - attrPath := "/proc/self/attr/apparmor/" + attr - if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) { + attr = utils.CleanPath(attr) + attrSubPath := "attr/apparmor/" + attr + if _, err := os.Stat("/proc/self/" + attrSubPath); errors.Is(err, os.ErrNotExist) { // fall back to the old convention - attrPath = "/proc/self/attr/" + attr + attrSubPath = "attr/" + attr } + // Under AppArmor you can only change your own attr, so there's no reason + // to not use /proc/thread-self/ (instead of /proc//, like libapparmor + // does). + attrPath, closer := utils.ProcThreadSelf(attrSubPath) + defer closer() + f, err := os.OpenFile(attrPath, os.O_WRONLY, 0) if err != nil { return err diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go index b31412f5a04..b7ca7b1837a 100644 --- a/libcontainer/cgroups/cgroups_test.go +++ b/libcontainer/cgroups/cgroups_test.go @@ -5,6 +5,9 @@ import ( ) func TestParseCgroups(t *testing.T) { + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { t.Fatal(err) diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index b0d6e33beb0..6c93ce4502d 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -136,13 +136,14 @@ func openFile(dir, file string, flags int) (*os.File, error) { // // TODO: if such usage will ever be common, amend this // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) - fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(cgroupFd)) + defer closer() + fdDest, _ := os.Readlink(fdPath) if fdDest != cgroupfsDir { // Wrap the error so it is clear that cgroupFd // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupFd %d unexpectedly opened to %s != %s: %w", + cgroupFd, fdDest, cgroupfsDir, err) } return nil, err } diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go index 9c949c91f08..8ac8312017b 100644 --- a/libcontainer/cgroups/fs2/defaultpath.go +++ b/libcontainer/cgroups/fs2/defaultpath.go @@ -55,6 +55,9 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { return filepath.Join(root, innerPath), nil } + // we don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. ownCgroup, err := parseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/libcontainer/cgroups/v1_utils.go b/libcontainer/cgroups/v1_utils.go index 8524c468434..81193e20983 100644 --- a/libcontainer/cgroups/v1_utils.go +++ b/libcontainer/cgroups/v1_utils.go @@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string { // expensive), so it is assumed that cgroup mounts are not being changed. func readCgroupMountinfo() ([]*mountinfo.Info, error) { readMountinfoOnce.Do(func() { + // mountinfo.GetMounts uses /proc/thread-self, so we can use it without + // issues. cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( mountinfo.FSTypeFilter("cgroup"), ) }) - return cgroupMountinfo, readMountinfoErr } @@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go index e39440fc08d..bfd356e497f 100644 --- a/libcontainer/configs/mount.go +++ b/libcontainer/configs/mount.go @@ -3,5 +3,5 @@ package configs const ( // EXT_COPYUP is a directive to copy up the contents of a directory when // a tmpfs is mounted over it. - EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning + EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning ) diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go index 3f489295d97..b69e9ab238e 100644 --- a/libcontainer/configs/mount_linux.go +++ b/libcontainer/configs/mount_linux.go @@ -2,6 +2,24 @@ package configs import "golang.org/x/sys/unix" +type MountIDMapping struct { + // Recursive indicates if the mapping needs to be recursive. + Recursive bool `json:"recursive"` + + // UserNSPath is a path to a user namespace that indicates the necessary + // id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and + // GIDMappings must be set to nil. + UserNSPath string `json:"userns_path,omitempty"` + + // UIDMappings is the uid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + UIDMappings []IDMap `json:"uid_mappings,omitempty"` + + // GIDMappings is the gid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + GIDMappings []IDMap `json:"gid_mappings,omitempty"` +} + type Mount struct { // Source path for the mount. Source string `json:"source"` @@ -34,17 +52,9 @@ type Mount struct { // Extensions are additional flags that are specific to runc. Extensions int `json:"extensions"` - // UIDMappings is used to changing file user owners w/o calling chown. - // Note that, the underlying filesystem should support this feature to be - // used. - // Every mount point could have its own mapping. - UIDMappings []IDMap `json:"uid_mappings,omitempty"` - - // GIDMappings is used to changing file group owners w/o calling chown. - // Note that, the underlying filesystem should support this feature to be - // used. - // Every mount point could have its own mapping. - GIDMappings []IDMap `json:"gid_mappings,omitempty"` + // Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil, + // the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings. + IDMapping *MountIDMapping `json:"id_mapping,omitempty"` } func (m *Mount) IsBind() bool { @@ -52,5 +62,5 @@ func (m *Mount) IsBind() bool { } func (m *Mount) IsIDMapped() bool { - return len(m.UIDMappings) > 0 || len(m.GIDMappings) > 0 + return m.IDMapping != nil } diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go index 5062432f8c3..898f96fd0f5 100644 --- a/libcontainer/configs/namespaces_linux.go +++ b/libcontainer/configs/namespaces_linux.go @@ -59,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } + // We don't need to use /proc/thread-self here because the list of + // namespace types is unrelated to the thread. This lets us avoid having to + // do runtime.LockOSThread. _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 2594d05c723..4708e1b773f 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -309,29 +309,32 @@ func checkBindOptions(m *configs.Mount) error { } func checkIDMapMounts(config *configs.Config, m *configs.Mount) error { + // Make sure MOUNT_ATTR_IDMAP is not set on any of our mounts. This + // attribute is handled differently to all other attributes (through + // m.IDMapping), so make sure we never store it in the actual config. This + // really shouldn't ever happen. + if m.RecAttr != nil && (m.RecAttr.Attr_set|m.RecAttr.Attr_clr)&unix.MOUNT_ATTR_IDMAP != 0 { + return errors.New("mount configuration cannot contain recAttr for MOUNT_ATTR_IDMAP") + } if !m.IsIDMapped() { return nil } - if !m.IsBind() { - return fmt.Errorf("gidMappings/uidMappings is supported only for mounts with the option 'bind'") + return errors.New("id-mapped mounts are only supported for bind-mounts") } if config.RootlessEUID { - return fmt.Errorf("gidMappings/uidMappings is not supported when runc is being launched with EUID != 0, needs CAP_SYS_ADMIN on the runc parent's user namespace") - } - if len(config.UIDMappings) == 0 || len(config.GIDMappings) == 0 { - return fmt.Errorf("not yet supported to use gidMappings/uidMappings in a mount without also using a user namespace") - } - if !sameMapping(config.UIDMappings, m.UIDMappings) { - return fmt.Errorf("not yet supported for the mount uidMappings to be different than user namespace uidMapping") - } - if !sameMapping(config.GIDMappings, m.GIDMappings) { - return fmt.Errorf("not yet supported for the mount gidMappings to be different than user namespace gidMapping") + return errors.New("id-mapped mounts are not supported for rootless containers") } - if !filepath.IsAbs(m.Source) { - return fmt.Errorf("mount source not absolute") + if m.IDMapping.UserNSPath == "" { + if len(m.IDMapping.UIDMappings) == 0 || len(m.IDMapping.GIDMappings) == 0 { + return errors.New("id-mapped mounts must have both uid and gid mappings specified") + } + } else { + if m.IDMapping.UIDMappings != nil || m.IDMapping.GIDMappings != nil { + // should never happen + return errors.New("[internal error] id-mapped mounts cannot have both userns_path and uid and gid mappings specified") + } } - return nil } @@ -356,27 +359,6 @@ func mountsStrict(config *configs.Config) error { return nil } -// sameMapping checks if the mappings are the same. If the mappings are the same -// but in different order, it returns false. -func sameMapping(a, b []configs.IDMap) bool { - if len(a) != len(b) { - return false - } - - for i := range a { - if a[i].ContainerID != b[i].ContainerID { - return false - } - if a[i].HostID != b[i].HostID { - return false - } - if a[i].Size != b[i].Size { - return false - } - } - return true -} - func isHostNetNS(path string) (bool, error) { const currentProcessNetns = "/proc/self/ns/net" diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index 755d2a07be8..5c12e08a1db 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -504,41 +504,56 @@ func TestValidateIDMapMounts(t *testing.T) { config *configs.Config }{ { - name: "idmap mount without bind opt specified", + name: "idmap non-bind mount", isErr: true, config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "/abs/path/", + Source: "/dev/sda1", Destination: "/abs/path/", - UIDMappings: mapping, - GIDMappings: mapping, + Device: "ext4", + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, }, }, }, }, { - name: "rootless idmap mount", + name: "idmap option non-bind mount", isErr: true, config: &configs.Config{ - RootlessEUID: true, - UIDMappings: mapping, - GIDMappings: mapping, Mounts: []*configs.Mount{ { - Source: "/abs/path/", + Source: "/dev/sda1", Destination: "/abs/path/", - Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, + Device: "ext4", + IDMapping: &configs.MountIDMapping{}, + }, + }, + }, + }, + { + name: "ridmap option non-bind mount", + isErr: true, + config: &configs.Config{ + Mounts: []*configs.Mount{ + { + Source: "/dev/sda1", + Destination: "/abs/path/", + Device: "ext4", + IDMapping: &configs.MountIDMapping{ + Recursive: true, + }, }, }, }, }, { - name: "idmap mount without userns mappings", + name: "idmap mount no uid mapping", isErr: true, config: &configs.Config{ Mounts: []*configs.Mount{ @@ -546,61 +561,51 @@ func TestValidateIDMapMounts(t *testing.T) { Source: "/abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, + IDMapping: &configs.MountIDMapping{ + GIDMappings: mapping, + }, }, }, }, }, { - name: "idmap mounts with different userns and mount mappings", + name: "idmap mount no gid mapping", isErr: true, config: &configs.Config{ - UIDMappings: mapping, - GIDMappings: mapping, Mounts: []*configs.Mount{ { Source: "/abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: []configs.IDMap{ - { - ContainerID: 10, - HostID: 10, - Size: 1, - }, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, }, - GIDMappings: mapping, }, }, }, }, { - name: "idmap mounts with different userns and mount mappings", + name: "rootless idmap mount", isErr: true, config: &configs.Config{ - UIDMappings: mapping, - GIDMappings: mapping, + RootlessEUID: true, + UIDMappings: mapping, + GIDMappings: mapping, Mounts: []*configs.Mount{ { Source: "/abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: []configs.IDMap{ - { - ContainerID: 10, - HostID: 10, - Size: 1, - }, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, }, }, }, }, }, { - name: "idmap mounts without abs source path", - isErr: true, + name: "idmap mounts without abs source path", config: &configs.Config{ UIDMappings: mapping, GIDMappings: mapping, @@ -609,8 +614,10 @@ func TestValidateIDMapMounts(t *testing.T) { Source: "./rel/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, }, }, }, @@ -625,8 +632,10 @@ func TestValidateIDMapMounts(t *testing.T) { Source: "/abs/path/", Destination: "./rel/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, }, }, }, @@ -641,8 +650,10 @@ func TestValidateIDMapMounts(t *testing.T) { Source: "/another-abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND, - UIDMappings: mapping, - GIDMappings: mapping, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, }, }, }, @@ -657,8 +668,104 @@ func TestValidateIDMapMounts(t *testing.T) { Source: "/another-abs/path/", Destination: "/abs/path/", Flags: unix.MS_BIND | unix.MS_RDONLY, - UIDMappings: mapping, - GIDMappings: mapping, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, + }, + }, + }, + }, + { + name: "idmap mount without userns mappings", + config: &configs.Config{ + Mounts: []*configs.Mount{ + { + Source: "/abs/path/", + Destination: "/abs/path/", + Flags: unix.MS_BIND, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: mapping, + }, + }, + }, + }, + }, + { + name: "idmap mounts with different userns and mount mappings", + config: &configs.Config{ + UIDMappings: mapping, + GIDMappings: mapping, + Mounts: []*configs.Mount{ + { + Source: "/abs/path/", + Destination: "/abs/path/", + Flags: unix.MS_BIND, + IDMapping: &configs.MountIDMapping{ + UIDMappings: []configs.IDMap{ + { + ContainerID: 10, + HostID: 10, + Size: 1, + }, + }, + GIDMappings: mapping, + }, + }, + }, + }, + }, + { + name: "idmap mounts with different userns and mount mappings", + config: &configs.Config{ + UIDMappings: mapping, + GIDMappings: mapping, + Mounts: []*configs.Mount{ + { + Source: "/abs/path/", + Destination: "/abs/path/", + Flags: unix.MS_BIND, + IDMapping: &configs.MountIDMapping{ + UIDMappings: mapping, + GIDMappings: []configs.IDMap{ + { + ContainerID: 10, + HostID: 10, + Size: 1, + }, + }, + }, + }, + }, + }, + }, + { + name: "mount with 'idmap' option but no mappings", + isErr: true, + config: &configs.Config{ + Mounts: []*configs.Mount{ + { + Source: "/abs/path/", + Destination: "/abs/path/", + Flags: unix.MS_BIND, + IDMapping: &configs.MountIDMapping{}, + }, + }, + }, + }, + { + name: "mount with 'ridmap' option but no mappings", + isErr: true, + config: &configs.Config{ + Mounts: []*configs.Mount{ + { + Source: "/abs/path/", + Destination: "/abs/path/", + Flags: unix.MS_BIND, + IDMapping: &configs.MountIDMapping{ + Recursive: true, + }, }, }, }, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index f36abaf1032..c79988e5ff0 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -2,7 +2,6 @@ package libcontainer import ( "bytes" - "encoding/json" "errors" "fmt" "io" @@ -510,14 +509,20 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { if dmz.IsSelfExeCloned() { // /proc/self/exe is already a cloned binary -- no need to do anything logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by + // definition. This lets us avoid having to do runtime.LockOSThread. exePath = "/proc/self/exe" } else { var err error if isDmzBinarySafe(c.config) { dmzExe, err = dmz.Binary(c.stateDir) if err == nil { - // We can use our own executable without cloning if we are using - // runc-dmz. + // We can use our own executable without cloning if we are + // using runc-dmz. We don't need to use /proc/thread-self here + // because the exe mm of a thread-group is guaranteed to be the + // same for all threads by definition. This lets us avoid + // having to do runtime.LockOSThread. exePath = "/proc/self/exe" p.clonedExes = append(p.clonedExes, dmzExe) logrus.Debug("runc-dmz: using runc-dmz") // used for tests @@ -629,112 +634,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { return c.newSetnsProcess(p, cmd, comm) } -// shouldSendMountSources says whether the child process must setup bind mounts with -// the source pre-opened (O_PATH) in the host user namespace. -// See https://github.com/opencontainers/runc/issues/2484 -func (c *Container) shouldSendMountSources() bool { - // Passing the mount sources via SCM_RIGHTS is only necessary when - // both userns and mntns are active. - if !c.config.Namespaces.Contains(configs.NEWUSER) || - !c.config.Namespaces.Contains(configs.NEWNS) { - return false - } - - // nsexec.c send_mountsources() requires setns(mntns) capabilities - // CAP_SYS_CHROOT and CAP_SYS_ADMIN. - if c.config.RootlessEUID { - return false - } - - // We need to send sources if there are non-idmap bind-mounts. - for _, m := range c.config.Mounts { - if m.IsBind() && !m.IsIDMapped() { - return true - } - } - - return false -} - -// shouldSendIdmapSources says whether the child process must setup idmap mounts with -// the mount_setattr already done in the host user namespace. -func (c *Container) shouldSendIdmapSources() bool { - // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in: - // * the user namespace the filesystem was mounted in; - // * the user namespace we're trying to idmap the mount to; - // * the owning user namespace of the mount namespace you're currently located in. - // - // See the comment from Christian Brauner: - // https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972 - // - // Let's just rule out rootless, we don't have those permission in the - // rootless case. - if c.config.RootlessEUID { - return false - } - - // For the time being we require userns to be in use. - if !c.config.Namespaces.Contains(configs.NEWUSER) { - return false - } - - // We need to send sources if there are idmap bind-mounts. - for _, m := range c.config.Mounts { - if m.IsBind() && m.IsIDMapped() { - return true - } - } - - return false -} - -func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error { - if !c.shouldSendMountSources() { - return nil - } - - return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { - return m.IsBind() && !m.IsIDMapped() - }) -} - -func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error { - if !c.shouldSendIdmapSources() { - return nil - } - - return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { - return m.IsBind() && m.IsIDMapped() - }) -} - -func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error { - // Elements on these slices will be paired with mounts (see StartInitialization() and - // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. - fds := make([]int, len(c.config.Mounts)) - for i, m := range c.config.Mounts { - if !condition(m) { - // The -1 fd is ignored later. - fds[i] = -1 - continue - } - - // The fd passed here will not be used: nsexec.c will overwrite it with - // dup3(). We just need to allocate a fd so that we know the number to - // pass in the environment variable. The fd must not be closed before - // cmd.Start(), so we reuse initSockChild because the lifecycle of that - // fd is already taken care of. - cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) - fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 - } - fdsJSON, err := json.Marshal(fds) - if err != nil { - return fmt.Errorf("Error creating %v: %w", envVar, err) - } - cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON)) - return nil -} - func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) @@ -743,16 +642,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) nsMaps[ns.Type] = ns.Path } } - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) if err != nil { return nil, err } - if err := c.sendMountSources(cmd, comm); err != nil { - return nil, err - } - if err := c.sendIdmapSources(cmd, comm); err != nil { - return nil, err - } init := &initProcess{ cmd: cmd, @@ -776,7 +669,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm } // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) + data, err := c.bootstrapData(0, state.NamespacePaths) if err != nil { return nil, err } @@ -1165,7 +1058,7 @@ type netlinkError struct{ error } // such as one that uses nsenter package to bootstrap the container's // init process correctly, i.e. with correct namespaces, uid/gid // mapping etc. -func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) { +func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) @@ -1267,48 +1160,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa Value: c.config.RootlessEUID, }) - // Bind mount source to open. - if it == initStandard && c.shouldSendMountSources() { - var mounts []byte - for _, m := range c.config.Mounts { - if m.IsBind() && !m.IsIDMapped() { - if strings.IndexByte(m.Source, 0) >= 0 { - return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) - } - mounts = append(mounts, []byte(m.Source)...) - } - mounts = append(mounts, byte(0)) - } - - r.AddData(&Bytemsg{ - Type: MountSourcesAttr, - Value: mounts, - }) - } - - // Idmap mount sources to open. - if it == initStandard && c.shouldSendIdmapSources() { - var mounts []byte - for _, m := range c.config.Mounts { - if m.IsBind() && m.IsIDMapped() { - // While other parts of the code check this too (like - // libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer - // users don't use those functions. - if strings.IndexByte(m.Source, 0) >= 0 { - return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) - } - - mounts = append(mounts, []byte(m.Source)...) - } - mounts = append(mounts, byte(0)) - } - - r.AddData(&Bytemsg{ - Type: IdmapSourcesAttr, - Value: mounts, - }) - } - // write boottime and monotonic time ns offsets. if c.config.TimeOffsets != nil { var offsetSpec bytes.Buffer diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index 360639e7fcd..0567152403f 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -526,6 +526,15 @@ func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { // restore using CRIU. This function is inspired from the code in // rootfs_linux.go. func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error { + me := mountEntry{Mount: m} + dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) + if err != nil { + return err + } + // TODO: pass srcFD? Not sure if criu is impacted by issue #2484. + if err := checkProcMount(c.config.Rootfs, dest, me); err != nil { + return err + } switch m.Device { case "cgroup": // No mount point(s) need to be created: @@ -537,21 +546,19 @@ func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error { // the mountpoint appears as soon as /sys is mounted return nil case "bind": - // The prepareBindMount() function checks if source - // exists. So it cannot be used for other filesystem types. - // TODO: pass srcFD? Not sure if criu is impacted by issue #2484. - if err := prepareBindMount(mountEntry{Mount: m}, c.config.Rootfs); err != nil { - return err - } - default: - // for all other filesystems just create the mountpoints - dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) + // For bind-mounts (unlike other filesystem types), we need to check if + // the source exists. + fi, _, err := me.srcStat() if err != nil { + // error out if the source of a bind mount does not exist as we + // will be unable to bind anything to it. return err } - if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { + if err := createIfNotExists(dest, fi.IsDir()); err != nil { return err } + default: + // for all other filesystems just create the mountpoints if err := os.MkdirAll(dest, 0o755); err != nil { return err } @@ -618,8 +625,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { // because during initial container creation mounts are // set up in the order they are configured. if m.Device == "bind" { - if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "") + if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") }); err != nil { return err } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index b0bf6142109..bc682f23ed8 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -214,18 +214,3 @@ func validateID(id string) error { return nil } - -func parseFdsFromEnv(envVar string) ([]int, error) { - fdsJSON := os.Getenv(envVar) - if fdsJSON == "" { - // Always return the nil slice if no fd is present. - return nil, nil - } - - var fds []int - if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil { - return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err) - } - - return fds, nil -} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index c02538d6847..0117ace5990 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -47,18 +47,6 @@ type network struct { TempVethPeerName string `json:"temp_veth_peer_name"` } -type mountFds struct { - // sourceFds are the fds to use as source when mounting. - // The slice size should be the same as container mounts, as it will be - // paired with them. - // The value -1 is used when no fd is needed for the mount. - // Can't have a valid fd in the same position that other slices in this struct. - // We need to use only one of these fds on any single mount. - sourceFds []int - // Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount(). - idmapFds []int -} - // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { Args []string `json:"args"` @@ -189,18 +177,6 @@ func startInitialization() (retErr error) { defer pidfdSocket.Close() } - // Get mount files (O_PATH). - mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS") - if err != nil { - return err - } - - // Get idmap fds. - idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS") - if err != nil { - return err - } - // Get runc-dmz fds. var dmzExe *os.File if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" { @@ -232,21 +208,16 @@ func startInitialization() (retErr error) { } // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe) } -func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { +func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File) error { if err := populateProcessEnvironment(config.Env); err != nil { return err } switch t { case initSetns: - // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. - if mountFds.sourceFds != nil || mountFds.idmapFds != nil { - return errors.New("mount and idmap fds must be nil; can't mount from exec") - } - i := &linuxSetnsInit{ pipe: pipe, consoleSocket: consoleSocket, @@ -266,7 +237,6 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock fifoFd: fifoFd, logFd: logFd, dmzExe: dmzExe, - mountFds: mountFds, } return i.Init() } @@ -510,6 +480,9 @@ func setupUser(config *initConfig) error { return err } + // We don't need to use /proc/thread-self here because setgroups is a + // per-userns file and thus is global to all threads in a thread-group. + // This lets us avoid having to do runtime.LockOSThread. setgroups, err := os.ReadFile("/proc/self/setgroups") if err != nil && !os.IsNotExist(err) { return err diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 1bc840116c2..9dd056ad502 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -19,6 +19,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/userns" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -1691,6 +1692,20 @@ func TestFdLeaksSystemd(t *testing.T) { testFdLeaks(t, true) } +func fdList(t *testing.T) []string { + procSelfFd, closer := utils.ProcThreadSelf("fd") + defer closer() + + fdDir, err := os.Open(procSelfFd) + ok(t, err) + defer fdDir.Close() + + fds, err := fdDir.Readdirnames(-1) + ok(t, err) + + return fds +} + func testFdLeaks(t *testing.T, systemd bool) { if testing.Short() { return @@ -1705,21 +1720,12 @@ func testFdLeaks(t *testing.T, systemd bool) { // - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups; // - dbus connection opened by getConnection in libct/cgroups/systemd. _ = runContainerOk(t, config, "true") - - pfd, err := os.Open("/proc/self/fd") - ok(t, err) - defer pfd.Close() - fds0, err := pfd.Readdirnames(0) - ok(t, err) - _, err = pfd.Seek(0, 0) - ok(t, err) + fds0 := fdList(t) _ = runContainerOk(t, config, "true") + fds1 := fdList(t) - fds1, err := pfd.Readdirnames(0) - ok(t, err) - - if len(fds1) == len(fds0) { + if reflect.DeepEqual(fds0, fds1) { return } // Show the extra opened files. @@ -1729,6 +1735,10 @@ func testFdLeaks(t *testing.T, systemd bool) { } count := 0 + + procSelfFd, closer := utils.ProcThreadSelf("fd/") + defer closer() + next_fd: for _, fd1 := range fds1 { for _, fd0 := range fds0 { @@ -1736,7 +1746,7 @@ next_fd: continue next_fd } } - dst, _ := os.Readlink("/proc/self/fd/" + fd1) + dst, _ := os.Readlink(filepath.Join(procSelfFd, fd1)) for _, ex := range excludedPaths { if ex == dst { continue next_fd diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 4e48826cdb5..2790f018d06 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -21,9 +21,7 @@ const ( RootlessEUIDAttr uint16 = 27287 UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 - MountSourcesAttr uint16 = 27290 - IdmapSourcesAttr uint16 = 27291 - TimeOffsetsAttr uint16 = 27292 + TimeOffsetsAttr uint16 = 27290 ) type Int32msg struct { diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index da11da68ea0..6b5edbb0c14 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -1,22 +1,48 @@ package libcontainer import ( + "errors" + "fmt" "io/fs" + "os" "strconv" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/userns" + "github.com/opencontainers/runc/libcontainer/utils" ) +// mountSourceType indicates what type of file descriptor is being returned. It +// is used to tell rootfs_linux.go whether or not to use move_mount(2) to +// install the mount. +type mountSourceType string + +const ( + // An open_tree(2)-style file descriptor that needs to be installed using + // move_mount(2) to install. + mountSourceOpenTree mountSourceType = "open_tree" + // A plain file descriptor that can be mounted through /proc/thread-self/fd. + mountSourcePlain mountSourceType = "plain-open" +) + +type mountSource struct { + Type mountSourceType `json:"type"` + file *os.File `json:"-"` +} + // mountError holds an error from a failed mount or unmount operation. type mountError struct { - op string - source string - srcFD *int - target string - dstFD string - flags uintptr - data string - err error + op string + source string + srcFile *mountSource + target string + dstFd string + flags uintptr + data string + err error } // Error provides a string error representation. @@ -25,13 +51,14 @@ func (e *mountError) Error() string { if e.source != "" { out += "src=" + e.source + ", " - if e.srcFD != nil { - out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", " + if e.srcFile != nil { + out += "srcType=" + string(e.srcFile.Type) + ", " + out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", " } } out += "dst=" + e.target - if e.dstFD != "" { - out += ", dstFD=" + e.dstFD + if e.dstFd != "" { + out += ", dstFd=" + e.dstFd } if e.flags != uintptr(0) { @@ -54,38 +81,69 @@ func (e *mountError) Unwrap() error { // mount is a simple unix.Mount wrapper, returning an error with more context // in case it failed. func mount(source, target, fstype string, flags uintptr, data string) error { - return mountViaFDs(source, nil, target, "", fstype, flags, data) + return mountViaFds(source, nil, target, "", fstype, flags, data) } -// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source, -// and dstFD instead of target, unless those are empty. -// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be -// constructed by this function. -// dstFD argument, if non-empty, is expected to be in the form of a path to an -// opened file descriptor on procfs (i.e. "/proc/self/fd/NN"). +// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source, +// and dstFd instead of target, unless those are empty. // -// If case an FD is used instead of a source or a target path, the -// corresponding path is only used to add context to an error in case -// the mount operation has failed. -func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error { - src := source - if srcFD != nil { - src = "/proc/self/fd/" + strconv.Itoa(*srcFD) +// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds +// will mount it according to the mountSourceType of the file descriptor. +// +// The dstFd argument, if non-empty, is expected to be in the form of a path to +// an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN"). +// +// If a file descriptor is used instead of a source or a target path, the +// corresponding path is only used to add context to an error in case the mount +// operation has failed. +func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error { + // MS_REMOUNT and srcFile don't make sense together. + if srcFile != nil && flags&unix.MS_REMOUNT != 0 { + logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile") + srcFile = nil } dst := target - if dstFD != "" { - dst = dstFD + if dstFd != "" { + dst = dstFd } - if err := unix.Mount(src, dst, fstype, flags, data); err != nil { + src := source + isMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree + if srcFile != nil { + // If we're going to use the /proc/thread-self/... path for classic + // mount(2), we need to get a safe handle to /proc/thread-self. This + // isn't needed for move_mount(2) because in that case the path is just + // a dummy string used for error info. + fdStr := strconv.Itoa(int(srcFile.file.Fd())) + if isMoveMount { + src = "/proc/self/fd/" + fdStr + } else { + var closer utils.ProcThreadSelfCloser + src, closer = utils.ProcThreadSelf("fd/" + fdStr) + defer closer() + } + } + + var op string + var err error + if isMoveMount { + op = "move_mount" + err = unix.MoveMount(int(srcFile.file.Fd()), "", + unix.AT_FDCWD, dstFd, + unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS) + } else { + op = "mount" + err = unix.Mount(src, dst, fstype, flags, data) + } + if err != nil { return &mountError{ - op: "mount", - source: source, - srcFD: srcFD, - target: target, - dstFD: dstFD, - flags: flags, - data: data, - err: err, + op: op, + source: source, + srcFile: srcFile, + target: target, + dstFd: dstFd, + flags: flags, + data: data, + err: err, } } return nil @@ -121,3 +179,97 @@ func syscallMode(i fs.FileMode) (o uint32) { // No mapping for Go's ModeTemporary (plan9 only). return } + +// mountFd creates a "mount source fd" (either through open_tree(2) or just +// open(O_PATH)) based on the provided configuration. This function must be +// called from within the container's mount namespace. +// +// In the case of idmapped mount configurations, the returned mount source will +// be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other +// bind-mounts, it will be an O_PATH. If the type of mount cannot be handled, +// the returned mountSource will be nil, indicating that the container init +// process will need to do an old-fashioned mount(2) themselves. +// +// This helper is only intended to be used by goCreateMountSources. +func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) { + if !m.IsBind() { + return nil, errors.New("new mount api: only bind-mounts are supported") + } + if nsHandles == nil { + nsHandles = new(userns.Handles) + defer nsHandles.Release() + } + + var mountFile *os.File + var sourceType mountSourceType + + // Ideally, we would use OPEN_TREE_CLONE for everything, because we can + // be sure that the file descriptor cannot be used to escape outside of + // the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive + // than open(2) because it requires doing mounts inside a new anonymous + // mount namespace. So we use open(2) for standard bind-mounts, and + // OPEN_TREE_CLONE when we need to set mount attributes here. + // + // While passing open(2)'d paths from the host rootfs isn't exactly the + // safest thing in the world, the files will not survive across + // execve(2) and "runc init" is non-dumpable so it should not be + // possible for a malicious container process to gain access to the + // file descriptors. We also don't do any of this for "runc exec", + // lessening the risk even further. + if m.IsIDMapped() { + flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC) + if m.Flags&unix.MS_REC == unix.MS_REC { + flags |= unix.AT_RECURSIVE + } + fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags) + if err != nil { + return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err} + } + mountFile = os.NewFile(uintptr(fd), m.Source) + sourceType = mountSourceOpenTree + + // Configure the id mapping. + var usernsFile *os.File + if m.IDMapping.UserNSPath == "" { + usernsFile, err = nsHandles.Get(userns.Mapping{ + UIDMappings: m.IDMapping.UIDMappings, + GIDMappings: m.IDMapping.GIDMappings, + }) + if err != nil { + return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err) + } + } else { + usernsFile, err = os.Open(m.IDMapping.UserNSPath) + if err != nil { + return nil, fmt.Errorf("failed to open existing userns for %s id-mapping: %w", m.Source, err) + } + } + defer usernsFile.Close() + + setAttrFlags := uint(unix.AT_EMPTY_PATH) + // If the mount has "ridmap" set, we apply the configuration + // recursively. This allows you to create "rbind" mounts where only + // the top-level mount has an idmapping. I'm not sure why you'd + // want that, but still... + if m.IDMapping.Recursive { + setAttrFlags |= unix.AT_RECURSIVE + } + if err := unix.MountSetattr(int(mountFile.Fd()), "", setAttrFlags, &unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFile.Fd()), + }); err != nil { + return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w", m.Source, err) + } + } else { + var err error + mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + sourceType = mountSourcePlain + } + return &mountSource{ + Type: sourceType, + file: mountFile, + }, nil +} diff --git a/libcontainer/nsenter/idmap.h b/libcontainer/nsenter/idmap.h deleted file mode 100644 index fa25a3bfb2f..00000000000 --- a/libcontainer/nsenter/idmap.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef IDMAP_H -#define IDMAP_H - -#include -// Centos-7 doesn't have this file nor the __has_include() directive, so let's -// just leave this commented out and we can uncomment when it hits EOL (2024-06-30). -//#include -#include -#include - -/* mount_setattr() */ -#ifndef MOUNT_ATTR_IDMAP -#define MOUNT_ATTR_IDMAP 0x00100000 -#endif - -#ifndef __NR_mount_setattr - #if defined _MIPS_SIM - #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ - #define __NR_mount_setattr (442 + 4000) - #endif - #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ - #define __NR_mount_setattr (442 + 6000) - #endif - #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ - #define __NR_mount_setattr (442 + 5000) - #endif - #else - #define __NR_mount_setattr 442 - #endif -#endif -#ifndef MOUNT_ATTR_SIZE_VER0 -struct mount_attr { - __u64 attr_set; - __u64 attr_clr; - __u64 propagation; - __u64 userns_fd; -}; -#endif - -/* open_tree() */ -#ifndef OPEN_TREE_CLONE -#define OPEN_TREE_CLONE 1 -#endif - -#ifndef OPEN_TREE_CLOEXEC -#define OPEN_TREE_CLOEXEC O_CLOEXEC -#endif - -#ifndef __NR_open_tree - #if defined _MIPS_SIM - #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ - #define __NR_open_tree 4428 - #endif - #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ - #define __NR_open_tree 6428 - #endif - #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ - #define __NR_open_tree 5428 - #endif - #else - #define __NR_open_tree 428 - #endif -#endif - -static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, struct mount_attr *attr, size_t size) -{ - return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); -} - -static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) -{ - return syscall(__NR_open_tree, dfd, filename, flags); -} - - -#endif /* IDMAP_H */ diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index f64cc41eb4a..bfc6a79a8e1 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -33,9 +33,6 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" -/* Get definitions for idmap sources */ -#include "idmap.h" - /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -44,12 +41,8 @@ enum sync_t { SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ - SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ - SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ - SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */ - SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */ - SYNC_TIMEOFFSETS_PLS = 0x50, /* Request parent to write timens offsets. */ - SYNC_TIMEOFFSETS_ACK = 0x51, /* Timens offsets were written. */ + SYNC_TIMEOFFSETS_PLS = 0x46, /* Request parent to write timens offsets. */ + SYNC_TIMEOFFSETS_ACK = 0x47, /* Timens offsets were written. */ }; #define STAGE_SETUP -1 @@ -99,14 +92,6 @@ struct nlconfig_t { char *gidmappath; size_t gidmappath_len; - /* Mount sources opened outside the container userns. */ - char *mountsources; - size_t mountsources_len; - - /* Idmap sources opened outside the container userns which will be id mapped. */ - char *idmapsources; - size_t idmapsources_len; - /* Time NS offsets. */ char *timensoffset; size_t timensoffset_len; @@ -126,9 +111,7 @@ struct nlconfig_t { #define ROOTLESS_EUID_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 -#define MOUNT_SOURCES_ATTR 27290 -#define IDMAP_SOURCES_ATTR 27291 -#define TIMENSOFFSET_ATTR 27292 +#define TIMENSOFFSET_ATTR 27290 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -446,14 +429,6 @@ static void nl_parse(int fd, struct nlconfig_t *config) case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; - case MOUNT_SOURCES_ATTR: - config->mountsources = current; - config->mountsources_len = payload_len; - break; - case IDMAP_SOURCES_ATTR: - config->idmapsources = current; - config->idmapsources_len = payload_len; - break; case TIMENSOFFSET_ATTR: config->timensoffset = current; config->timensoffset_len = payload_len; @@ -546,115 +521,6 @@ static inline int sane_kill(pid_t pid, int signum) return 0; } -/* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is - * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in - * the element of the env var. - */ -void receive_fd_sources(int sockfd, const char *env_var) -{ - char *fds, *endp; - long new_fd; - - // This env var must be a json array of ints. - fds = getenv(env_var); - - if (fds[0] != '[') { - bail("malformed %s env var: missing '['", env_var); - } - fds++; - - for (endp = fds; *endp != ']'; fds = endp + 1) { - new_fd = strtol(fds, &endp, 10); - if (endp == fds) { - bail("malformed %s env var: not a number", env_var); - } - if (*endp == '\0') { - bail("malformed %s env var: missing ]", env_var); - } - // The list contains -1 when no fd is needed. Ignore them. - if (new_fd == -1) { - continue; - } - - if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) { - bail("malformed %s env var: fds out of range", env_var); - } - - int recv_fd = receive_fd(sockfd); - if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) { - bail("cannot dup3 fd %d to %ld", recv_fd, new_fd); - } - if (close(recv_fd) < 0) { - bail("cannot close fd %d", recv_fd); - } - } -} - -void receive_mountsources(int sockfd) -{ - receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS"); -} - -void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len) -{ - char proc_path[PATH_MAX]; - int host_mntns_fd; - int container_mntns_fd; - int fd; - int ret; - - // container_linux.go shouldSendMountSources() decides if mount sources - // should be pre-opened (O_PATH) and passed via SCM_RIGHTS - if (mountsources == NULL) - return; - - host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); - if (host_mntns_fd == -1) - bail("failed to get current mount namespace"); - - if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0) - bail("failed to get mount namespace path"); - - container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC); - if (container_mntns_fd == -1) - bail("failed to get container mount namespace"); - - if (setns(container_mntns_fd, CLONE_NEWNS) < 0) - bail("failed to setns to container mntns"); - - char *mountsources_end = mountsources + mountsources_len; - while (mountsources < mountsources_end) { - if (mountsources[0] == '\0') { - mountsources++; - continue; - } - - fd = open(mountsources, O_PATH | O_CLOEXEC); - if (fd < 0) - bail("failed to open mount source %s", mountsources); - - write_log(DEBUG, "~> sending fd for: %s", mountsources); - if (send_fd(sockfd, fd) < 0) - bail("failed to send fd %d via unix socket %d", fd, sockfd); - - ret = close(fd); - if (ret != 0) - bail("failed to close mount source fd %d", fd); - - mountsources += strlen(mountsources) + 1; - } - - if (setns(host_mntns_fd, CLONE_NEWNS) < 0) - bail("failed to setns to host mntns"); - - ret = close(host_mntns_fd); - if (ret != 0) - bail("failed to close host mount namespace fd %d", host_mntns_fd); - ret = close(container_mntns_fd); - if (ret != 0) - bail("failed to close container mount namespace fd %d", container_mntns_fd); -} - void try_unshare(int flags, const char *msg) { write_log(DEBUG, "unshare %s", msg); @@ -674,89 +540,6 @@ void try_unshare(int flags, const char *msg) bail("failed to unshare %s", msg); } -void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len) -{ - char proc_user_path[PATH_MAX]; - - /* Open the userns fd only once. - * Currently we only support idmap mounts that use the same mapping than - * the userns. This is validated in libcontainer/configs/validate/validator.go, - * so if we reached here, we know the mapping for the idmap is the same - * as the userns. This is why we just open the userns_fd once from the - * PID of the child process that has the userns already applied. - */ - int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid); - if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) { - sane_kill(pid, SIGKILL); - bail("failed to create userns path string"); - } - - int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY); - if (userns_fd < 0) { - sane_kill(pid, SIGKILL); - bail("failed to get user namespace fd"); - } - - char *idmap_end = idmap_src + idmap_src_len; - while (idmap_src < idmap_end) { - if (idmap_src[0] == '\0') { - idmap_src++; - continue; - } - - int fd_tree = sys_open_tree(-EBADF, idmap_src, - OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | - AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT); - if (fd_tree < 0) { - sane_kill(pid, SIGKILL); - if (errno == ENOSYS) { - bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts"); - } else if (errno == EINVAL) { - bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts", - idmap_src); - } else { - bail("open_tree(2) failed with path: %s", idmap_src); - } - } - - struct mount_attr attr = { - .attr_set = MOUNT_ATTR_IDMAP, - .userns_fd = userns_fd, - }; - - ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); - if (ret < 0) { - sane_kill(pid, SIGKILL); - if (errno == ENOSYS) - bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts"); - else if (errno == EINVAL) - bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src); - else - bail("mount_setattr(2) failed with path: %s", idmap_src); - } - - write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path); - send_fd(sockfd, fd_tree); - - if (close(fd_tree) < 0) { - sane_kill(pid, SIGKILL); - bail("error closing fd_tree"); - } - - idmap_src += strlen(idmap_src) + 1; - } - - if (close(userns_fd) < 0) { - sane_kill(pid, SIGKILL); - bail("error closing userns fd"); - } -} - -void receive_idmapsources(int sockfd) -{ - receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS"); -} - static void update_timens_offsets(pid_t pid, char *map, size_t map_len) { if (map == NULL || map_len == 0) @@ -988,28 +771,6 @@ void nsexec(void) sane_kill(stage2_pid, SIGKILL); bail("failed to sync with runc: write(pid-JSON)"); } - break; - case SYNC_MOUNTSOURCES_PLS: - write_log(DEBUG, "stage-1 requested to open mount sources"); - send_mountsources(syncfd, stage1_pid, config.mountsources, - config.mountsources_len); - - s = SYNC_MOUNTSOURCES_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); - } - break; - case SYNC_MOUNT_IDMAP_PLS: - write_log(DEBUG, "stage-1 requested to open idmap sources"); - send_idmapsources(syncfd, stage1_pid, config.idmapsources, - config.idmapsources_len); - s = SYNC_MOUNT_IDMAP_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - sane_kill(stage1_pid, SIGKILL); - bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)"); - } - break; case SYNC_TIMEOFFSETS_PLS: write_log(DEBUG, "stage-1 requested timens offsets to be configured"); @@ -1186,38 +947,6 @@ void nsexec(void) bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s); } - /* Ask our parent to send the mount sources fds. */ - if (config.mountsources) { - write_log(DEBUG, "request stage-0 to send mount sources"); - s = SYNC_MOUNTSOURCES_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)"); - - /* Receive and install all mount sources fds. */ - receive_mountsources(syncfd); - - /* Parent finished to send the mount sources fds. */ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)"); - if (s != SYNC_MOUNTSOURCES_ACK) - bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); - } - - if (config.idmapsources) { - write_log(DEBUG, "request stage-0 to send idmap sources"); - s = SYNC_MOUNT_IDMAP_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)"); - - /* Receive and install all idmap fds. */ - receive_idmapsources(syncfd); - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)"); - if (s != SYNC_MOUNT_IDMAP_ACK) - bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s); - } - /* * TODO: What about non-namespace clone flags that we're dropping here? * diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index c1b60c49884..ba916571489 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "context" "encoding/json" "errors" "fmt" @@ -11,18 +12,21 @@ import ( "path/filepath" "runtime" "strconv" + "sync" "time" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) type parentProcess interface { @@ -208,6 +212,9 @@ func (p *setnsProcess) start() (retErr error) { case procHooks: // This shouldn't happen. panic("unexpected procHooks in setns") + case procMountPlease: + // This shouldn't happen. + panic("unexpected procMountPlease in setns") case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { return errors.New("seccomp listenerPath is not set") @@ -398,6 +405,110 @@ func (p *initProcess) waitForChildExit(childPid int) error { return nil } +type mountSourceRequestFn func(*configs.Mount) (*mountSource, error) + +// goCreateMountSources spawns a goroutine which creates open_tree(2)-style +// mountfds based on the requested configs.Mount configuration. The returned +// requestFn and cancelFn are used to interact with the goroutine. +// +// The caller of the returned mountSourceRequestFn is responsible for closing +// the returned file. +func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) { + type response struct { + src *mountSource + err error + } + + errCh := make(chan error, 1) + requestCh := make(chan *configs.Mount) + responseCh := make(chan response) + + ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute) + go func() { + // We lock this thread because we need to setns(2) here. There is no + // UnlockOSThread() here, to ensure that the Go runtime will kill this + // thread once this goroutine returns (ensuring no other goroutines run + // in this context). + runtime.LockOSThread() + + // Detach from the shared fs of the rest of the Go process in order to + // be able to CLONE_NEWNS. + if err := unix.Unshare(unix.CLONE_FS); err != nil { + err = os.NewSyscallError("unshare(CLONE_FS)", err) + errCh <- fmt.Errorf("mount source thread: %w", err) + return + } + + // Attach to the container's mount namespace. + nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid())) + if err != nil { + errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err) + return + } + defer nsFd.Close() + if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil { + err = os.NewSyscallError("setns", err) + errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err) + return + } + + // No errors during setup! + close(errCh) + logrus.Debugf("mount source thread: successfully running in container mntns") + + nsHandles := new(userns.Handles) + defer nsHandles.Release() + loop: + for { + select { + case m, ok := <-requestCh: + if !ok { + break loop + } + src, err := mountFd(nsHandles, m) + logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err) + responseCh <- response{ + src: src, + err: err, + } + case <-ctx.Done(): + break loop + } + } + logrus.Debugf("mount source thread: closing thread: %v", ctx.Err()) + close(responseCh) + }() + + // Check for setup errors. + err := <-errCh + if err != nil { + cancelFn() + return nil, nil, err + } + + // TODO: Switch to context.AfterFunc when we switch to Go 1.21. + var requestChCloseOnce sync.Once + requestFn := func(m *configs.Mount) (*mountSource, error) { + var err error + select { + case requestCh <- m: + select { + case resp, ok := <-responseCh: + if ok { + return resp.src, resp.err + } + case <-ctx.Done(): + err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err()) + } + case <-ctx.Done(): + err = fmt.Errorf("send mount request cancelled: %w", ctx.Err()) + } + requestChCloseOnce.Do(func() { close(requestCh) }) + return nil, err + } + return requestFn, cancelFn, nil +} + func (p *initProcess) start() (retErr error) { defer p.comm.closeParent() err := p.cmd.Start() @@ -487,6 +598,22 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("error waiting for our first child to exit: %w", err) } + // Spin up a goroutine to handle remapping mount requests by runc init. + // There is no point doing this for rootless containers because they cannot + // configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just + // service plain-open requests for plain bind-mounts but there's no need + // (rootless containers will never have permission issues on a source mount + // that the parent process can help with -- they are the same user). + var mountRequest mountSourceRequestFn + if !p.container.config.RootlessEUID { + request, cancel, err := p.goCreateMountSources(context.Background()) + if err != nil { + return fmt.Errorf("error spawning mount remapping thread: %w", err) + } + defer cancel() + mountRequest = request + } + if err := p.createNetworkInterfaces(); err != nil { return fmt.Errorf("error creating network interfaces: %w", err) } @@ -500,6 +627,35 @@ func (p *initProcess) start() (retErr error) { var seenProcReady bool ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { switch sync.Type { + case procMountPlease: + if mountRequest == nil { + return fmt.Errorf("cannot fulfil mount requests as a rootless user") + } + var m *configs.Mount + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &m); err != nil { + return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err) + } + mnt, err := mountRequest(m) + if err != nil { + return fmt.Errorf("failed to fulfil mount request: %w", err) + } + defer mnt.file.Close() + + arg, err := json.Marshal(mnt) + if err != nil { + return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err) + } + argMsg := json.RawMessage(arg) + if err := doWriteSync(p.comm.syncSockParent, syncT{ + Type: procMountFd, + Arg: &argMsg, + File: mnt.file, + }); err != nil { + return err + } case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { return errors.New("seccomp listenerPath is not set") diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index a2e41ea5638..89faea085a5 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "encoding/json" "errors" "fmt" "os" @@ -8,21 +9,23 @@ import ( "path/filepath" "strconv" "strings" + "syscall" "time" securejoin "github.com/cyphar/filepath-securejoin" "github.com/moby/sys/mountinfo" "github.com/mrunalp/fileutils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV @@ -39,16 +42,47 @@ type mountConfig struct { // mountEntry contains mount data specific to a mount point. type mountEntry struct { *configs.Mount - srcFD *int + srcFile *mountSource } -func (m *mountEntry) src() string { - if m.srcFD != nil { - return "/proc/self/fd/" + strconv.Itoa(*m.srcFD) +// srcName is only meant for error messages, it returns a "friendly" name. +func (m mountEntry) srcName() string { + if m.srcFile != nil { + return m.srcFile.file.Name() } return m.Source } +func (m mountEntry) srcStat() (os.FileInfo, *syscall.Stat_t, error) { + var ( + st os.FileInfo + err error + ) + if m.srcFile != nil { + st, err = m.srcFile.file.Stat() + } else { + st, err = os.Stat(m.Source) + } + if err != nil { + return nil, nil, err + } + return st, st.Sys().(*syscall.Stat_t), nil +} + +func (m mountEntry) srcStatfs() (*unix.Statfs_t, error) { + var st unix.Statfs_t + if m.srcFile != nil { + if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil { + return nil, os.NewSyscallError("fstatfs", err) + } + } else { + if err := unix.Statfs(m.Source, &st); err != nil { + return nil, &os.PathError{Op: "statfs", Path: m.Source, Err: err} + } + } + return &st, nil +} + // needsSetupDev returns true if /dev needs to be set up. func needsSetupDev(config *configs.Config) bool { for _, m := range config.Mounts { @@ -62,20 +96,12 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) } - if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) { - return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds)) - } - - if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) { - return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds)) - } - mountConfig := &mountConfig{ root: config.Rootfs, label: config.MountLabel, @@ -83,22 +109,53 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er rootlessCgroups: iConfig.RootlessCgroups, cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } - for i, m := range config.Mounts { + for _, m := range config.Mounts { entry := mountEntry{Mount: m} - // Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts). - // Therefore, we can access mountFds.sourceFds[i] without any concerns. - if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { - entry.srcFD = &mountFds.sourceFds[i] + // Figure out whether we need to request runc to give us an + // open_tree(2)-style mountfd. For idmapped mounts, this is always + // necessary. For bind-mounts, this is only necessary if we cannot + // resolve the parent mount (this is only hit if you are running in a + // userns -- but for rootless the host-side thread can't help). + wantSourceFile := m.IsIDMapped() + if m.IsBind() && !config.RootlessEUID { + if _, err := os.Stat(m.Source); err != nil { + wantSourceFile = true + } } - - // We validated before we can access mountFds.idmapFds[i]. - if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 { - if entry.srcFD != nil { - return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i) + if wantSourceFile { + // Request a source file from the host. + if err := writeSyncArg(pipe, procMountPlease, m); err != nil { + return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err) + } + sync, err := readSyncFull(pipe, procMountFd) + if err != nil { + return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err) + } + if sync.File == nil { + return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source) + } + defer sync.File.Close() + // Sanity-check to make sure we didn't get the wrong fd back. Note + // that while m.Source might contain symlinks, the (*os.File).Name + // is based on the path provided to os.OpenFile, not what it + // resolves to. So this should never happen. + if sync.File.Name() != m.Source { + return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name()) + } + // Unmarshal the procMountFd argument (the file is sync.File). + var src *mountSource + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) } - entry.srcFD = &mountFds.idmapFds[i] + if err := json.Unmarshal(*sync.Arg, &src); err != nil { + return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err) + } + if src == nil { + return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source) + } + src.file = sync.File + entry.srcFile = src } - if err := mountToRootfs(mountConfig, entry); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -224,32 +281,6 @@ func cleanupTmp(tmpdir string) { _ = os.RemoveAll(tmpdir) } -func prepareBindMount(m mountEntry, rootfs string) error { - source := m.src() - stat, err := os.Stat(source) - if err != nil { - // error out if the source of a bind mount does not exist as we will be - // unable to bind anything to it. - return err - } - // ensure that the destination of the bind mount is resolved of symlinks at mount time because - // any previous mounts can invalidate the next mount's destination. - // this can happen when a user specifies mounts within other mounts to cause breakouts or other - // evil stuff to try to escape the container's rootfs. - var dest string - if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { - return err - } - if err := checkProcMount(rootfs, dest, source); err != nil { - return err - } - if err := createIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - return nil -} - func mountCgroupV1(m *configs.Mount, c *mountConfig) error { binds, err := getCgroupMounts(m) if err != nil { @@ -281,7 +312,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(subsystemPath, 0o755); err != nil { return err } - if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error { + if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error { flags := defaultMountFlags if m.Flags&unix.MS_RDONLY != 0 { flags = flags | unix.MS_RDONLY @@ -294,7 +325,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { data = cgroups.CgroupNamePrefix + data source = "systemd" } - return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data) + return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data) }); err != nil { return err } @@ -325,8 +356,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(dest, 0o755); err != nil { return err } - err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data) + err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data) }) if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { return err @@ -347,7 +378,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { bindM.Source = c.cgroup2Path } // mountToRootfs() handles remounting for MS_RDONLY. - // No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate(). err = mountToRootfs(c, mountEntry{Mount: bindM}) if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed @@ -392,15 +422,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { } }() - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) { + return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) { // Copy the container data to the host tmpdir. We append "/" to force // CopyDirectory to resolve the symlink rather than trying to copy the // symlink itself. - if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil { - return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err) + if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil { + return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err) } // Now move the mount into the container. - if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil { + if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil { return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) } return nil @@ -469,6 +499,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // Do not use securejoin as it resolves symlinks. dest = filepath.Join(rootfs, dest) } + if err := checkProcMount(rootfs, dest, m); err != nil { + return err + } if fi, err := os.Lstat(dest); err != nil { if !os.IsNotExist(err) { return err @@ -488,6 +521,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { if err != nil { return err } + if err := checkProcMount(rootfs, dest, m); err != nil { + return err + } switch m.Device { case "mqueue": @@ -519,38 +555,18 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { return err case "bind": - if err := prepareBindMount(m, rootfs); err != nil { + fi, _, err := m.srcStat() + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. return err } - - if m.IsBind() && m.IsIDMapped() { - if m.srcFD == nil { - return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m) - } - if err := unix.MoveMount(*m.srcFD, "", unix.AT_FDCWD, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { - return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err) - } - - // In nsexec.c, we did not set the propagation field of mount_attr struct. - // So, let's deal with these flags right now! - if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error { - for _, pflag := range m.PropagationFlags { - // When using mount for setting propagations flags, the source, file - // system type and data arguments are ignored: - // https://man7.org/linux/man-pages/man2/mount.2.html - // We also ignore procfd because we want to act on dest. - if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil { - return err - } - } - return nil - }); err != nil { - return fmt.Errorf("change mount propagation through procfd: %w", err) - } - } else { - if err := mountPropagate(m, rootfs, mountLabel); err != nil { - return err - } + if err := createIfNotExists(dest, fi.IsDir()); err != nil { + return err + } + // open_tree()-related shenanigans are all handled in mountViaFds. + if err := mountPropagate(m, rootfs, mountLabel); err != nil { + return err } // The initial MS_BIND won't change the mount options, we need to do a @@ -563,7 +579,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // contrast to mount(8)'s current behaviour, but is what users probably // expect. See . if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 { - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT // The runtime-spec says we SHOULD map to the relevant mount(8) // behaviour. However, it's not clear whether we want the @@ -590,7 +606,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // different set of flags. This also has the mount(8) bug where // "nodiratime,norelatime" will result in a // "nodiratime,relatime" mount. - mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") + mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "") if mountErr == nil { return nil } @@ -603,11 +619,11 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // that we handle atimes correctly to make sure we error out if // we cannot fulfil the requested mount flags. - var st unix.Statfs_t - if err := unix.Statfs(m.src(), &st); err != nil { - return &os.PathError{Op: "statfs", Path: m.src(), Err: err} + st, err := m.srcStatfs() + if err != nil { + return err } - srcFlags := statfsToMountFlags(st) + srcFlags := statfsToMountFlags(*st) // If the user explicitly request one of the locked flags *not* // be set, we need to return an error to avoid producing mounts // that don't match the user's request. @@ -639,7 +655,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // Retry the mount with the existing lockable mount flags // applied. flags |= srcFlags & mntLockFlags - mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") + mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "") logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr) return mountErr }); err != nil { @@ -663,9 +679,6 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { } return mountCgroupV1(m.Mount, c) default: - if err := checkProcMount(rootfs, dest, m.Source); err != nil { - return err - } if err := os.MkdirAll(dest, 0o755); err != nil { return err } @@ -679,6 +692,9 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -707,11 +723,16 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { return binds, nil } +// Taken from . If a file is on a filesystem of type +// PROC_SUPER_MAGIC, we're guaranteed that only the root of the superblock will +// have this inode number. +const procRootIno = 1 + // checkProcMount checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. // -// if source is nil, don't stat the filesystem. This is used for restore of a checkpoint. -func checkProcMount(rootfs, dest, source string) error { +// If m is nil, don't stat the filesystem. This is used for restore of a checkpoint. +func checkProcMount(rootfs, dest string, m mountEntry) error { const procPath = "/proc" path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) if err != nil { @@ -722,18 +743,28 @@ func checkProcMount(rootfs, dest, source string) error { return nil } if path == "." { - // an empty source is pasted on restore - if source == "" { - return nil - } - // only allow a mount on-top of proc if it's source is "proc" - isproc, err := isProc(source) - if err != nil { - return err - } - // pass if the mount is happening on top of /proc and the source of - // the mount is a proc filesystem - if isproc { + // Only allow bind-mounts on top of /proc, and only if the source is a + // procfs mount. + if m.IsBind() { + fsSt, err := m.srcStatfs() + if err != nil { + return err + } + if fsSt.Type == unix.PROC_SUPER_MAGIC { + if _, uSt, err := m.srcStat(); err != nil { + return err + } else if uSt.Ino != procRootIno { + // We cannot error out in this case, because we've + // supported these kinds of mounts for a long time. + // However, we would expect users to bind-mount the root of + // a real procfs on top of /proc in the container. We might + // want to block this in the future. + logrus.Warnf("bind-mount %v (source %v) is of type procfs but is not the root of a procfs (inode %d). Future versions of runc might block this configuration -- please report an issue to if you see this warning.", dest, m.srcName(), uSt.Ino) + } + return nil + } + } else if m.Device == "proc" { + // Fresh procfs-type mounts are always safe to mount on top of /proc. return nil } return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) @@ -766,15 +797,10 @@ func checkProcMount(rootfs, dest, source string) error { return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) } -func isProc(path string) (bool, error) { - var s unix.Statfs_t - if err := unix.Statfs(path, &s); err != nil { - return false, &os.PathError{Op: "statfs", Path: path, Err: err} - } - return s.Type == unix.PROC_SUPER_MAGIC, nil -} - func setupDevSymlinks(rootfs string) error { + // In theory, these should be links to /proc/thread-self, but systems + // expect these to be /proc/self and this matches how most distributions + // work. links := [][2]string{ {"/proc/self/fd", "/dev/fd"}, {"/proc/self/fd/0", "/dev/stdin"}, @@ -857,8 +883,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { if f != nil { _ = f.Close() } - return utils.WithProcfd(rootfs, dest, func(dstFD string) error { - return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "") + return utils.WithProcfd(rootfs, dest, func(dstFd string) error { + return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "") }) } @@ -1251,17 +1277,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { // mutating underneath us, we verify that we are actually going to mount // inside the container with WithProcfd() -- mounting through a procfd // mounts on the target. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data) + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data) }); err != nil { return err } // We have to apply mount propagation flags in a separate WithProcfd() call // because the previous call invalidates the passed procfd -- the mount // target needs to be re-opened. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { for _, pflag := range m.PropagationFlags { - if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil { + if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil { return err } } diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go index 8709a5e47f7..6ce099d256d 100644 --- a/libcontainer/rootfs_linux_test.go +++ b/libcontainer/rootfs_linux_test.go @@ -4,45 +4,133 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" ) -func TestCheckMountDestOnProc(t *testing.T) { +func TestCheckMountDestInProc(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc/sys", + Source: "/proc/sys", + Device: "bind", + Flags: unix.MS_BIND, + }, + } dest := "/rootfs/proc/sys" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, m) if err == nil { t.Fatal("destination inside proc should return an error") } } -func TestCheckMountDestOnProcChroot(t *testing.T) { +func TestCheckProcMountOnProc(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc", + Source: "foo", + Device: "proc", + }, + } + dest := "/rootfs/proc/" + err := checkProcMount("/rootfs", dest, m) + if err != nil { + t.Fatalf("procfs type mount on /proc should not return an error: %v", err) + } +} + +func TestCheckBindMountOnProc(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc", + Source: "/proc/self", + Device: "bind", + Flags: unix.MS_BIND, + }, + } dest := "/rootfs/proc/" - err := checkProcMount("/rootfs", dest, "/proc") + err := checkProcMount("/rootfs", dest, m) if err != nil { - t.Fatal("destination inside proc when using chroot should not return an error") + t.Fatalf("bind-mount of procfs on top of /proc should not return an error (for now): %v", err) + } +} + +func TestCheckTrickyMountOnProc(t *testing.T) { + // Make a non-bind mount that looks like a bit like a bind-mount. + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc", + Source: "/proc", + Device: "overlay", + Data: "lowerdir=/tmp/fakeproc,upperdir=/tmp/fakeproc2,workdir=/tmp/work", + }, + } + dest := "/rootfs/proc/" + err := checkProcMount("/rootfs", dest, m) + if err == nil { + t.Fatalf("dodgy overlayfs mount on top of /proc should return an error") + } +} + +func TestCheckTrickyBindMountOnProc(t *testing.T) { + // Make a bind mount that looks like it might be a procfs mount. + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc", + Source: "/sys", + Device: "proc", + Flags: unix.MS_BIND, + }, + } + dest := "/rootfs/proc/" + err := checkProcMount("/rootfs", dest, m) + if err == nil { + t.Fatalf("dodgy bind-mount on top of /proc should return an error") } } func TestCheckMountDestInSys(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/sys/fs/cgroup", + Source: "tmpfs", + Device: "tmpfs", + }, + } dest := "/rootfs//sys/fs/cgroup" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, m) if err != nil { - t.Fatal("destination inside /sys should not return an error") + t.Fatalf("destination inside /sys should not return an error: %v", err) } } func TestCheckMountDestFalsePositive(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/sysfiles/fs/cgroup", + Source: "tmpfs", + Device: "tmpfs", + }, + } dest := "/rootfs/sysfiles/fs/cgroup" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, m) if err != nil { t.Fatal(err) } } func TestCheckMountDestNsLastPid(t *testing.T) { + m := mountEntry{ + Mount: &configs.Mount{ + Destination: "/proc/sys/kernel/ns_last_pid", + Source: "lxcfs", + Device: "fuse.lxcfs", + }, + } dest := "/rootfs/proc/sys/kernel/ns_last_pid" - err := checkProcMount("/rootfs", dest, "/proc") + err := checkProcMount("/rootfs", dest, m) if err != nil { - t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error") + t.Fatalf("/proc/sys/kernel/ns_last_pid should not return an error: %v", err) } } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 90275043eca..6f9f58ad1d0 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -38,6 +38,7 @@ var ( clear bool flag int } + complexFlags map[string]func(*configs.Mount) ) func initMaps() { @@ -126,7 +127,6 @@ func initMaps() { "rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME}, "rnosymfollow": {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 "rsymfollow": {true, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 - // No support for MOUNT_ATTR_IDMAP yet (needs UserNS FD) } extensionFlags = map[string]struct { @@ -135,6 +135,17 @@ func initMaps() { }{ "tmpcopyup": {false, configs.EXT_COPYUP}, } + + complexFlags = map[string]func(*configs.Mount){ + "idmap": func(m *configs.Mount) { + m.IDMapping = new(configs.MountIDMapping) + m.IDMapping.Recursive = false // noop + }, + "ridmap": func(m *configs.Mount) { + m.IDMapping = new(configs.MountIDMapping) + m.IDMapping.Recursive = true + }, + } }) } @@ -415,6 +426,19 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if err := setupUserNamespace(spec, config); err != nil { return nil, err } + // For idmap and ridmap mounts without explicit mappings, use the + // ones from the container's userns. If we are joining another + // userns, stash the path. + for _, m := range config.Mounts { + if m.IDMapping != nil && m.IDMapping.UIDMappings == nil && m.IDMapping.GIDMappings == nil { + if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + m.IDMapping.UserNSPath = path + } else { + m.IDMapping.UIDMappings = config.UIDMappings + m.IDMapping.GIDMappings = config.GIDMappings + } + } + } } config.MaskPaths = spec.Linux.MaskedPaths config.ReadonlyPaths = spec.Linux.ReadonlyPaths @@ -447,6 +471,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Domain: domain, } } + } // Set the host UID that should own the container's cgroup. @@ -552,8 +577,14 @@ func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) } } - mnt.UIDMappings = toConfigIDMap(m.UIDMappings) - mnt.GIDMappings = toConfigIDMap(m.GIDMappings) + if m.UIDMappings != nil || m.GIDMappings != nil { + if mnt.IDMapping == nil { + // Neither "idmap" nor "ridmap" were specified. + mnt.IDMapping = new(configs.MountIDMapping) + } + mnt.IDMapping.UIDMappings = toConfigIDMap(m.UIDMappings) + mnt.IDMapping.GIDMappings = toConfigIDMap(m.GIDMappings) + } // None of the mount arguments can contain a null byte. Normally such // strings would either cause some other failure or would just be truncated @@ -1036,20 +1067,24 @@ func parseMountOptions(options []string) *configs.Mount { } else if f, exists := recAttrFlags[o]; exists { if f.clear { recAttrClr |= f.flag + recAttrSet &= ^f.flag } else { recAttrSet |= f.flag + recAttrClr &= ^f.flag if f.flag&unix.MOUNT_ATTR__ATIME == f.flag { // https://man7.org/linux/man-pages/man2/mount_setattr.2.html // "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field." recAttrClr |= unix.MOUNT_ATTR__ATIME } } - } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { + } else if f, exists := extensionFlags[o]; exists { if f.clear { m.Extensions &= ^f.flag } else { m.Extensions |= f.flag } + } else if fn, exists := complexFlags[o]; exists { + fn(&m) } else { data = append(data, o) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 0f80cb33373..3096d0d81ee 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -27,7 +28,6 @@ type linuxStandardInit struct { fifoFd int logFd int dmzExe *os.File - mountFds mountFds config *initConfig } @@ -88,17 +88,7 @@ func (l *linuxStandardInit) Init() error { // initialises the labeling system selinux.GetEnabled() - // We don't need the mount nor idmap fds after prepareRootfs() nor if it fails. - err := prepareRootfs(l.pipe, l.config, l.mountFds) - for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) { - if m == -1 { - continue - } - if err := unix.Close(m); err != nil { - return fmt.Errorf("unable to close mountFds fds: %w", err) - } - } - + err := prepareRootfs(l.pipe, l.config) if err != nil { return err } @@ -258,11 +248,13 @@ func (l *linuxStandardInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } + fifoPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(l.fifoFd)) + defer closer() + // Wait for the FIFO to be opened on the other side before exec-ing the // user process. We open it through /proc/self/fd/$fd, because the fd that // was given to us was an O_PATH fd to the fifo itself. Linux allows us to // re-open an O_PATH fd through /proc. - fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd) fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0) if err != nil { return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err} diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 1894e870e53..0a54a4b81e0 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -21,6 +21,11 @@ type syncType string // // [ child ] <-> [ parent ] // +// procMountPlease --> [open(2) or open_tree(2) and configure mount] +// Arg: configs.Mount +// <-- procMountFd +// file: mountfd +// // procSeccomp --> [forward fd to listenerPath] // file: seccomp fd // --- no return synchronisation @@ -39,6 +44,8 @@ const ( procRun syncType = "procRun" procHooks syncType = "procHooks" procHooksDone syncType = "procHooksDone" + procMountPlease syncType = "procMountPlease" + procMountFd syncType = "procMountFd" procSeccomp syncType = "procSeccomp" procSeccompDone syncType = "procSeccompDone" ) diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go new file mode 100644 index 00000000000..721215619b4 --- /dev/null +++ b/libcontainer/userns/usernsfd_linux.go @@ -0,0 +1,156 @@ +package userns + +import ( + "fmt" + "os" + "sort" + "strings" + "sync" + "syscall" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Mapping struct { + UIDMappings []configs.IDMap + GIDMappings []configs.IDMap +} + +func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) { + for _, uid := range m.UIDMappings { + uids = append(uids, syscall.SysProcIDMap{ + ContainerID: uid.ContainerID, + HostID: uid.HostID, + Size: uid.Size, + }) + } + for _, gid := range m.GIDMappings { + gids = append(gids, syscall.SysProcIDMap{ + ContainerID: gid.ContainerID, + HostID: gid.HostID, + Size: gid.Size, + }) + } + return +} + +// id returns a unique identifier for this mapping, agnostic of the order of +// the uid and gid mappings (because the order doesn't matter to the kernel). +// The set of userns handles is indexed using this ID. +func (m Mapping) id() string { + var uids, gids []string + for _, idmap := range m.UIDMappings { + uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + for _, idmap := range m.GIDMappings { + gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + // We don't care about the sort order -- just sort them. + sort.Strings(uids) + sort.Strings(gids) + return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",") +} + +type Handles struct { + m sync.Mutex + maps map[string]*os.File +} + +// Release all resources associated with this Handle. All existing files +// returned from Get() will continue to work even after calling Release(). The +// same Handles can be re-used after calling Release(). +func (hs *Handles) Release() { + hs.m.Lock() + defer hs.m.Unlock() + + // Close the files for good measure, though GC will do that for us anyway. + for _, file := range hs.maps { + _ = file.Close() + } + hs.maps = nil +} + +func spawnProc(req Mapping) (*os.Process, error) { + // We need to spawn a subprocess with the requested mappings, which is + // unfortunately quite expensive. The "safe" way of doing this is natively + // with Go (and then spawning something like "sleep infinity"), but + // execve() is a waste of cycles because we just need some process to have + // the right mapping, we don't care what it's executing. The "unsafe" + // option of doing a clone() behind the back of Go is probably okay in + // theory as long as we just do kill(getpid(), SIGSTOP). However, if we + // tell Go to put the new process into PTRACE_TRACEME mode, we can avoid + // the exec and not have to faff around with the mappings. + // + // Note that Go's stdlib does not support newuidmap, but in the case of + // id-mapped mounts, it seems incredibly unlikely that the user will be + // requesting us to do a remapping as an unprivileged user with mappings + // they have privileges over. + logrus.Debugf("spawning dummy process for id-mapping %s", req.id()) + uidMappings, gidMappings := req.toSys() + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by definition. + // This lets us avoid having to do runtime.LockOSThread. + return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMappings, + GidMappings: gidMappings, + GidMappingsEnableSetgroups: false, + // Put the process into PTRACE_TRACEME mode to allow us to get the + // userns without having a proper execve() target. + Ptrace: true, + }, + }) +} + +func dupFile(f *os.File) (*os.File, error) { + newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err) + } + return os.NewFile(uintptr(newFd), f.Name()), nil +} + +// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested +// mapping. The processes spawned to produce userns nsfds are cached, so if +// equivalent user namespace mappings are requested, the same user namespace +// will be returned. The caller is responsible for closing the returned file +// descriptor. +func (hs *Handles) Get(req Mapping) (file *os.File, err error) { + hs.m.Lock() + defer hs.m.Unlock() + + if hs.maps == nil { + hs.maps = make(map[string]*os.File) + } + + file, ok := hs.maps[req.id()] + if !ok { + proc, err := spawnProc(req) + if err != nil { + return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err) + } + // Make sure we kill the helper process. We ignore errors because + // there's not much we can do about them anyway, and ultimately + defer func() { + _ = proc.Kill() + _, _ = proc.Wait() + }() + + // Stash away a handle to the userns file. This is neater than keeping + // the process alive, because Go's GC can handle files much better than + // leaked processes, and having long-living useless processes seems + // less than ideal. + file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, err + } + hs.maps[req.id()] = file + } + // Duplicate the file, to make sure the lifecycle of each *os.File we + // return is independent. + return dupFile(file) +} diff --git a/libcontainer/userns/usernsfd_linux_test.go b/libcontainer/userns/usernsfd_linux_test.go new file mode 100644 index 00000000000..23a3419fcae --- /dev/null +++ b/libcontainer/userns/usernsfd_linux_test.go @@ -0,0 +1,52 @@ +package userns + +import ( + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func BenchmarkSpawnProc(b *testing.B) { + if os.Geteuid() != 0 { + b.Skip("spawning user namespaced processes requires root") + } + + // We can reuse the mapping as we call spawnProc() directly. + mapping := Mapping{ + UIDMappings: []configs.IDMap{ + {ContainerID: 0, HostID: 1337, Size: 142}, + {ContainerID: 150, HostID: 0, Size: 1}, + {ContainerID: 442, HostID: 1111, Size: 12}, + {ContainerID: 1000, HostID: 9999, Size: 92}, + {ContainerID: 9999, HostID: 1000000, Size: 4}, + // Newer kernels support more than 5 entries, but stick to 5 here. + }, + GIDMappings: []configs.IDMap{ + {ContainerID: 1, HostID: 2337, Size: 142}, + {ContainerID: 145, HostID: 6, Size: 1}, + {ContainerID: 200, HostID: 1000, Size: 12}, + {ContainerID: 1000, HostID: 9888, Size: 92}, + {ContainerID: 8999, HostID: 1000000, Size: 4}, + // Newer kernels support more than 5 entries, but stick to 5 here. + }, + } + + procs := make([]*os.Process, 0, b.N) + b.Cleanup(func() { + for _, proc := range procs { + if proc != nil { + _ = proc.Kill() + _, _ = proc.Wait() + } + } + }) + + for i := 0; i < b.N; i++ { + proc, err := spawnProc(mapping) + if err != nil { + b.Error(err) + } + procs = append(procs, proc) + } +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 74d9d20c7f1..1b523d8ac54 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -3,15 +3,12 @@ package utils import ( "encoding/binary" "encoding/json" - "fmt" "io" "os" "path/filepath" - "strconv" "strings" "unsafe" - securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -102,39 +99,6 @@ func stripRoot(root, path string) string { return CleanPath("/" + path) } -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %w", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} - // SearchLabels searches through a list of key=value pairs for a given key, // returning its value, and the binary flag telling whether the key exist. func SearchLabels(labels []string, key string) (string, bool) { diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index e5f11523d1d..a48221b000a 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,9 +7,13 @@ import ( "fmt" "math" "os" + "path/filepath" + "runtime" "strconv" "sync" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -57,7 +61,10 @@ func CloseExecFrom(minFd int) error { return os.NewSyscallError("close_range", err) } - fdDir, err := os.Open("/proc/self/fd") + procSelfFd, closer := ProcThreadSelf("fd") + defer closer() + + fdDir, err := os.Open(procSelfFd) if err != nil { return err } @@ -98,3 +105,100 @@ func NewSockPair(name string) (parent, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + procSelfFd, closer := ProcThreadSelf("fd/") + defer closer() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) + // Double-check the path is the one we expected. + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + return fn(procfd) +} + +type ProcThreadSelfCloser func() + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. The caller _must_ call +// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) +// *only once* after it has finished using the returned path string. +func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + // We need to lock our thread until the caller is done with the path string + // because any non-atomic operation on the path (such as opening a file, + // then reading it) could be interrupted by the Go runtime where the + // underlying thread is swapped out and the original thread is killed, + // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In + // addition, the pre-3.17 fallback makes everything non-atomic because the + // same thing could happen between unix.Gettid() and the path operations. + // + // In theory, we don't need to lock in the atomic user case when using + // /proc/thread-self/, but it's better to be safe than sorry (and there are + // only one or two truly atomic users of /proc/thread-self/). + runtime.LockOSThread() + + threadSelf := "/proc/thread-self/" + if !haveProcThreadSelf { + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we + // are running inside the pid namespace of the container but /proc + // is the host's procfs. Unfortunately there is no real way to get + // the correct tid to use here (the kernel age means we cannot do + // things like set up a private fsopen("proc") -- even scanning + // NSpid in all of the tasks in /proc/self/task/*/status requires + // Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this + // one specific case. + if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + } + return threadSelf + subpath, runtime.UnlockOSThread +} diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats index 53784b55672..a816fe96863 100644 --- a/tests/integration/idmap.bats +++ b/tests/integration/idmap.bats @@ -3,167 +3,690 @@ load helpers function setup() { + OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)" + OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)" requires root requires_kernel 5.12 - requires_idmap_fs /tmp setup_debian + requires_idmap_fs . - # Prepare source folders for bind mount - mkdir -p source-{1,2}/ - touch source-{1,2}/foo.txt + # Prepare source folders for mounts. + mkdir -p source-{1,2,multi{1,2,3}}/ + touch source-{1,2,multi{1,2,3}}/foo.txt + touch source-multi{1,2,3}/{bar,baz}.txt - # Use other owner for source-2 + # Change the owners for everything other than source-1. chown 1:1 source-2/foo.txt - mkdir -p rootfs/tmp/mount-{1,2} - mkdir -p rootfs/mnt/bind-mount-{1,2} - - update_config ' .linux.namespaces += [{"type": "user"}] - | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] - | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] - | .process.args += ["-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"] - | .mounts += [ - { - "source": "source-1/", - "destination": "/tmp/mount-1", - "options": ["bind"], - "uidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ], - "gidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ] - } - ] ' + # A source with multiple users owning files. + chown 100:211 source-multi1/foo.txt + chown 101:222 source-multi1/bar.txt + chown 102:233 source-multi1/baz.txt - remap_rootfs + # Same gids as multi1, different uids. + chown 200:211 source-multi2/foo.txt + chown 201:222 source-multi2/bar.txt + chown 202:233 source-multi2/baz.txt + + # Even more users -- 1000 uids, 500 gids. + chown 5000528:6000491 source-multi3/foo.txt + chown 5000133:6000337 source-multi3/bar.txt + chown 5000999:6000444 source-multi3/baz.txt + + # Add a symlink-containing source. + ln -s source-multi1 source-multi1-symlink + + # Add some top-level files in the mount tree. + mkdir -p mnt-subtree/multi{1,2} + touch mnt-subtree/{foo,bar,baz}.txt + chown 100:211 mnt-subtree/foo.txt + chown 200:222 mnt-subtree/bar.txt + chown 300:233 mnt-subtree/baz.txt + + mounts_file="$PWD/.all-mounts" + echo -n >"$mounts_file" } function teardown() { + if [ -v mounts_file ]; then + xargs -n 1 -a "$mounts_file" -- umount -l + rm -f "$mounts_file" + fi teardown_bundle } -@test "simple idmap mount" { +function setup_host_bind_mount() { + src="$1" + dst="$2" + + mount --bind "$src" "$dst" + echo "$dst" >>"$mounts_file" +} + +function setup_idmap_userns() { + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]' + remap_rootfs +} + +function setup_bind_mount() { + mountname="${1:-1}" + update_config '.mounts += [ + { + "source": "source-'"$mountname"'/", + "destination": "/tmp/bind-mount-'"$mountname"'", + "options": ["bind"] + } + ]' +} + +function setup_idmap_single_mount() { + uidmap="$1" # ctr:host:size + gidmap="$2" # ctr:host:size + mountname="$3" + destname="${4:-$mountname}" + + read -r uid_containerID uid_hostID uid_size <<<"$(tr : ' ' <<<"$uidmap")" + read -r gid_containerID gid_hostID gid_size <<<"$(tr : ' ' <<<"$gidmap")" + + update_config '.mounts += [ + { + "source": "source-'"$mountname"'/", + "destination": "/tmp/mount-'"$destname"'", + "options": ["bind"], + "uidMappings": [{"containerID": '"$uid_containerID"', "hostID": '"$uid_hostID"', "size": '"$uid_size"'}], + "gidMappings": [{"containerID": '"$gid_containerID"', "hostID": '"$gid_hostID"', "size": '"$gid_size"'}] + } + ]' +} + +function setup_idmap_basic_mount() { + mountname="${1:-1}" + setup_idmap_single_mount 0:100000:65536 0:100000:65536 "$mountname" +} + +@test "simple idmap mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]' + runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"=0=0="* ]] } -@test "write to an idmap mount" { - update_config ' .process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' +@test "simple idmap mount [no userns]" { + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]' + + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=100000=100000="* ]] +} + +@test "write to an idmap mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' + runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"=0=0="* ]] } -@test "idmap mount with propagation flag" { - update_config ' .process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]' - # Add the shared option to the idmap mount - update_config ' .mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)' +@test "write to an idmap mount [no userns]" { + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]' + + runc run test_debian + # The write must fail because the user is unmapped. + [ "$status" -ne 0 ] + [[ "$output" == *"Value too large for defined data type"* ]] # ERANGE +} + +@test "idmap mount with propagation flag [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]' + # Add the shared option to the idmap mount. + update_config '.mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)' runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"shared"* ]] } -@test "idmap mount with relative path" { - update_config ' .mounts |= map((select(.source == "source-1/") | .destination = "tmp/mount-1") // .)' +@test "idmap mount with relative path [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + + update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]' + # Switch the mount to have a relative mount destination. + update_config '.mounts |= map((select(.source == "source-1/") | .destination = "tmp/mount-1") // .)' runc run test_debian [ "$status" -eq 0 ] [[ "$output" == *"=0=0="* ]] } -@test "idmap mount with bind mount" { - update_config ' .mounts += [ - { - "source": "source-2/", - "destination": "/tmp/mount-2", - "options": ["bind"] - } - ] ' +@test "idmap mount with bind mount [userns]" { + setup_idmap_userns + setup_idmap_basic_mount + setup_bind_mount + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]' runc run test_debian [ "$status" -eq 0 ] - [[ "$output" == *"=0=0="* ]] + [[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]] + [[ "$output" == *"=/tmp/bind-mount-1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] } -@test "two idmap mounts with two bind mounts" { - update_config ' .process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"] - | .mounts += [ - { - "source": "source-1/", - "destination": "/mnt/bind-mount-1", - "options": ["bind"] - }, - { - "source": "source-2/", - "destination": "/mnt/bind-mount-2", - "options": ["bind"] - }, - { - "source": "source-2/", - "destination": "/tmp/mount-2", - "options": ["bind"], - "uidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ], - "gidMappings": [ { - "containerID": 0, - "hostID": 100000, - "size": 65536 - } - ] - } - ] ' +@test "idmap mount with bind mount [no userns]" { + setup_idmap_basic_mount + setup_bind_mount + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]' runc run test_debian [ "$status" -eq 0 ] - [[ "$output" == *"=0=0="* ]] - # source-2/ is owned by 1:1, so we expect this with the idmap mount too. - [[ "$output" == *"=1=1="* ]] + [[ "$output" == *"=/tmp/mount-1/foo.txt:100000=100000="* ]] + [[ "$output" == *"=/tmp/bind-mount-1/foo.txt:0=0="* ]] } -@test "idmap mount without gidMappings fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | del(.gidMappings) ) // .)' +@test "two idmap mounts (same mapping) with two bind mounts [userns]" { + setup_idmap_userns + + setup_idmap_basic_mount 1 + setup_bind_mount 1 + setup_bind_mount 2 + setup_idmap_basic_mount 2 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-[12]/foo.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]] + [[ "$output" == *"=/tmp/mount-2/foo.txt:1=1="* ]] } -@test "idmap mount without uidMappings fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | del(.uidMappings) ) // .)' +@test "same idmap mount (different mappings) [userns]" { + setup_idmap_userns + + # Mount the same directory with different mappings. Make sure we also use + # different mappings for uids and gids. + setup_idmap_single_mount 100:100000:100 200:100000:100 multi1 + setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt + setup_idmap_single_mount 100:102000:100 200:103000:100 multi1-symlink multi1-alt-sym + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt{,-sym}}/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:0=11="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1=22="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:2=33="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:1000=2011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:1001=2022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:1002=2033="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/foo.txt:2000=3011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/bar.txt:2001=3022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/baz.txt:2002=3033="* ]] } -@test "idmap mount without bind fails" { - update_config ' .mounts |= map((select(.source == "source-1/") | .options = [""]) // .)' +@test "same idmap mount (different mappings) [no userns]" { + # Mount the same directory with different mappings. Make sure we also use + # different mappings for uids and gids. + setup_idmap_single_mount 100:100000:100 200:100000:100 multi1 + setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt + setup_idmap_single_mount 100:102000:100 200:103000:100 multi1-symlink multi1-alt-sym + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt{,-sym}}/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:100000=100011="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:100001=100022="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:100002=100033="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:101000=102011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:101001=102022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:101002=102033="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/foo.txt:102000=103011="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/bar.txt:102001=103022="* ]] + [[ "$output" == *"=/tmp/mount-multi1-alt-sym/baz.txt:102002=103033="* ]] } -@test "idmap mount with different mapping than userns fails" { - # Let's modify the containerID to be 1, instead of 0 as it is in the - # userns config. - update_config ' .mounts |= map((select(.source == "source-1/") | .uidMappings[0]["containerID"] = 1 ) // .)' +@test "multiple idmap mounts (different mappings) [userns]" { + setup_idmap_userns + + # Make sure we use different mappings for uids and gids. + setup_idmap_single_mount 100:101100:3 200:101900:50 multi1 + setup_idmap_single_mount 200:102200:3 200:102900:100 multi2 + setup_idmap_single_mount 5000000:103000:1000 6000000:103000:500 multi3 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]' runc run test_debian - [ "$status" -eq 1 ] - [[ "${output}" == *"invalid mount"* ]] + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]] + [[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]] + [[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]] + [[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]] + [[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]] + [[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]] + [[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]] +} + +@test "multiple idmap mounts (different mappings) [no userns]" { + # Make sure we use different mappings for uids and gids. + setup_idmap_single_mount 100:1100:3 200:1900:50 multi1 + setup_idmap_single_mount 200:2200:3 200:2900:100 multi2 + setup_idmap_single_mount 5000000:3000:1000 6000000:3000:500 multi3 + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]' + + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]] + [[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]] + [[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]] + [[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]] + [[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]] + [[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]] + [[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]] +} + +@test "idmap mount (complicated mapping) [userns]" { + setup_idmap_userns + + update_config '.mounts += [ + { + "source": "source-multi1/", + "destination": "/tmp/mount-multi1", + "options": ["bind"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 1}, + {"containerID": 101, "hostID": 102000, "size": 1}, + {"containerID": 102, "hostID": 103000, "size": 1} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]] +} + +@test "idmap mount (complicated mapping) [no userns]" { + update_config '.mounts += [ + { + "source": "source-multi1/", + "destination": "/tmp/mount-multi1", + "options": ["bind"], + "uidMappings": [ + {"containerID": 100, "hostID": 1000, "size": 1}, + {"containerID": 101, "hostID": 2000, "size": 1}, + {"containerID": 102, "hostID": 3000, "size": 1} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 1100, "size": 10}, + {"containerID": 220, "hostID": 2200, "size": 10}, + {"containerID": 230, "hostID": 3300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]] +} + +@test "idmap mount (non-recursive idmap) [userns]" { + setup_idmap_userns + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] +} + +@test "idmap mount (non-recursive idmap) [no userns]" { + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]] +} + +@test "idmap mount (idmap flag) [userns]" { + setup_idmap_userns + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "idmap"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] +} + +@test "idmap mount (idmap flag) [no userns]" { + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "idmap"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]] +} + +@test "idmap mount (ridmap flag) [userns]" { + setup_idmap_userns + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "ridmap"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]] + # The child mounts have the same mapping applied. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:1000=1101="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:1001=2202="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:1002=3303="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:2000=1101="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:2001=2202="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:2002=3303="* ]] +} + +@test "idmap mount (ridmap flag) [no userns]" { + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "ridmap"], + "uidMappings": [ + {"containerID": 100, "hostID": 101000, "size": 3}, + {"containerID": 200, "hostID": 102000, "size": 3}, + {"containerID": 300, "hostID": 103000, "size": 3} + ], + "gidMappings": [ + {"containerID": 210, "hostID": 101100, "size": 10}, + {"containerID": 220, "hostID": 102200, "size": 10}, + {"containerID": 230, "hostID": 103300, "size": 10} + ] + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]] + # The child mounts have the same mapping applied. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:101000=101101="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101001=102202="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:101002=103303="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:102000=101101="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:102001=102202="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:102002=103303="* ]] +} + +@test "idmap mount (idmap flag, implied mapping) [userns]" { + setup_idmap_userns + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "idmap"], + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] +} + +@test "idmap mount (ridmap flag, implied mapping) [userns]" { + setup_idmap_userns + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "ridmap"], + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]] + # The child mounts have the same mapping applied. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]] +} + +@test "idmap mount (idmap flag, implied mapping, userns join) [userns]" { + # Create a detached container with the id-mapping we want. + cp config.json config.json.bak + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]' + update_config '.process.args = ["sleep", "infinity"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Configure our container to attach to the first container's userns. + target_pid="$(__runc state target_userns | jq .pid)" + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "/proc/'"$target_pid"'/ns/" + .type) else . end)' + update_config 'del(.linux.uidMappings) | del(.linux.gidMappings)' + + setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1" + setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2" + + update_config '.mounts += [ + { + "source": "mnt-subtree/", + "destination": "/tmp/mount-tree", + "options": ["rbind", "idmap"], + } + ]' + + update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]' + runc run test_debian + [ "$status" -eq 0 ] + [[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]] + [[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]] + [[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]] + # Because we used "idmap", the child mounts were not remapped recursively. + [[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] + [[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]] } diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats index a4d997ca765..5dafb655c92 100644 --- a/tests/integration/mounts.bats +++ b/tests/integration/mounts.bats @@ -24,6 +24,109 @@ function test_ro_cgroup_mount() { for line in "${lines[@]}"; do [[ "${line}" == *'ro,'* ]]; done } +# Parse an "optstring" of the form foo,bar into $is_foo and $is_bar variables. +# Usage: parse_optstring foo,bar foo bar baz +function parse_optstring() { + optstring="$1" + shift + + for flag in "$@"; do + is_set= + if grep -wq "$flag" <<<"$optstring"; then + is_set=1 + fi + eval "is_$flag=$is_set" + done +} + +function config_add_bind_mount() { + src="$1" + dst="$2" + parse_optstring "${3:-}" rbind idmap + + bindtype=bind + if [ -n "$is_rbind" ]; then + bindtype=rbind + fi + + mappings="" + if [ -n "$is_idmap" ]; then + mappings=' + "uidMappings": [{"containerID": 0, "hostID": 100000, "size": 65536}], + "gidMappings": [{"containerID": 0, "hostID": 100000, "size": 65536}], + ' + fi + + update_config '.mounts += [{ + "source": "'"$src"'", + "destination": "'"$dst"'", + "type": "bind", + '"$mappings"' + "options": [ "'"$bindtype"'", "rprivate" ] + }]' +} + +# This needs to be placed at the top of the bats file to work around +# a shellcheck bug. See . +function test_mount_order() { + parse_optstring "${1:-}" userns idmap + + if [ -n "$is_userns" ]; then + requires root + + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]' + remap_rootfs + fi + + ctr_src_opts="rbind" + if [ -n "$is_idmap" ]; then + requires root + requires_kernel 5.12 + requires_idmap_fs . + + ctr_src_opts+=",idmap" + fi + + mkdir -p rootfs/{mnt,final} + # Create a set of directories we can create a mount tree with. + for subdir in a/x b/y c/z; do + dir="bind-src/$subdir" + mkdir -p "$dir" + echo "$subdir" >"$dir/file" + # Add a symlink to make sure + topdir="$(dirname "$subdir")" + ln -s "$topdir" "bind-src/sym-$topdir" + done + # In userns tests, make sure that the source directory cannot be accessed, + # to make sure we're exercising the bind-mount source fd logic. + chmod o-rwx bind-src + + rootfs="$(pwd)/rootfs" + rm -rf rootfs/mnt + mkdir rootfs/mnt + + # Create a bind-mount tree. + config_add_bind_mount "$PWD/bind-src/a" "/mnt" + config_add_bind_mount "$PWD/bind-src/sym-b" "/mnt/x" + config_add_bind_mount "$PWD/bind-src/c" "/mnt/x/y" + config_add_bind_mount "$PWD/bind-src/sym-a" "/mnt/x/y/z" + # Create a recursive bind-mount that uses part of the current tree in the + # container. + config_add_bind_mount "$rootfs/mnt/x" "$rootfs/mnt/x/y/z/x" "$ctr_src_opts" + config_add_bind_mount "$rootfs/mnt/x/y" "$rootfs/mnt/x/y/z" "$ctr_src_opts" + # Finally, bind-mount the whole thing on top of /final. + config_add_bind_mount "$rootfs/mnt" "$rootfs/final" "$ctr_src_opts" + + # Check that the entire tree was copied and the mounts were done in the + # expected order. + update_config '.process.args = ["cat", "/final/x/y/z/z/x/y/z/x/file"]' + runc run test_busybox + [ "$status" -eq 0 ] + [[ "$output" == *"a/x"* ]] # the final "file" was from a/x. +} + # https://github.com/opencontainers/runc/issues/3991 @test "runc run [tmpcopyup]" { mkdir -p rootfs/dir1/dir2 @@ -108,3 +211,19 @@ function test_ro_cgroup_mount() { update_config '.linux.namespaces |= if index({"type": "cgroup"}) then . else . + [{"type": "cgroup"}] end' test_ro_cgroup_mount } + +@test "runc run [mount order, container bind-mount source]" { + test_mount_order +} + +@test "runc run [mount order, container bind-mount source] (userns)" { + test_mount_order userns +} + +@test "runc run [mount order, container idmap source]" { + test_mount_order idmap +} + +@test "runc run [mount order, container idmap source] (userns)" { + test_mount_order userns,idmap +} diff --git a/utils_linux.go b/utils_linux.go index 5de00628493..e7b362cdb2e 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -224,8 +224,10 @@ func (r *runner) run(config *specs.Process) (int, error) { process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) } baseFd := 3 + len(process.ExtraFiles) + procSelfFd, closer := utils.ProcThreadSelf("fd/") + defer closer() for i := baseFd; i < baseFd+r.preserveFDs; i++ { - _, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i)) + _, err = os.Stat(filepath.Join(procSelfFd, strconv.Itoa(i))) if err != nil { return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err) } diff --git a/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go b/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go index 59332b07bf4..b32b5c9b150 100644 --- a/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go +++ b/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go @@ -5,15 +5,19 @@ import ( "fmt" "io" "os" + "runtime" "strconv" "strings" + "sync" + + "golang.org/x/sys/unix" ) // GetMountsFromReader retrieves a list of mounts from the // reader provided, with an optional filter applied (use nil // for no filter). This can be useful in tests or benchmarks // that provide fake mountinfo data, or when a source other -// than /proc/self/mountinfo needs to be read from. +// than /proc/thread-self/mountinfo needs to be read from. // // This function is Linux-specific. func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) { @@ -127,8 +131,40 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) { return out, nil } -func parseMountTable(filter FilterFunc) ([]*Info, error) { - f, err := os.Open("/proc/self/mountinfo") +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +func parseMountTable(filter FilterFunc) (_ []*Info, err error) { + haveProcThreadSelfOnce.Do(func() { + _, err := os.Stat("/proc/thread-self/mountinfo") + haveProcThreadSelf = err == nil + }) + + // We need to lock ourselves to the current OS thread in order to make sure + // that the thread referenced by /proc/thread-self stays alive until we + // finish parsing the file. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var f *os.File + if haveProcThreadSelf { + f, err = os.Open("/proc/thread-self/mountinfo") + } else { + // On pre-3.17 kernels (such as CentOS 7), we don't have + // /proc/thread-self/ so we need to manually construct + // /proc/self/task// as a fallback. + f, err = os.Open("/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/mountinfo") + if os.IsNotExist(err) { + // If /proc/self/task/... failed, it means that our active pid + // namespace doesn't match the pid namespace of the /proc mount. In + // this case we just have to make do with /proc/self, since there + // is no other way of figuring out our tid in a parent pid + // namespace on pre-3.17 kernels. + f, err = os.Open("/proc/self/mountinfo") + } + } if err != nil { return nil, err } @@ -158,10 +194,10 @@ func PidMountInfo(pid int) ([]*Info, error) { // A few specific characters in mountinfo path entries (root and mountpoint) // are escaped using a backslash followed by a character's ascii code in octal. // -// space -- as \040 -// tab (aka \t) -- as \011 -// newline (aka \n) -- as \012 -// backslash (aka \\) -- as \134 +// space -- as \040 +// tab (aka \t) -- as \011 +// newline (aka \n) -- as \012 +// backslash (aka \\) -- as \134 // // This function converts path from mountinfo back, i.e. it unescapes the above sequences. func unescape(path string) (string, error) { diff --git a/vendor/modules.txt b/vendor/modules.txt index 1863d3de21f..23438cb8215 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -33,7 +33,7 @@ github.com/docker/go-units # github.com/godbus/dbus/v5 v5.1.0 ## explicit; go 1.12 github.com/godbus/dbus/v5 -# github.com/moby/sys/mountinfo v0.6.2 +# github.com/moby/sys/mountinfo v0.7.1 ## explicit; go 1.16 github.com/moby/sys/mountinfo # github.com/moby/sys/user v0.1.0