From 4ecff8d9d886e31d8a1d4fbb7836782327b380db Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 15 Mar 2021 15:28:59 -0700 Subject: [PATCH] start: don't kill runc init too early The stars can be aligned in a way that results in runc to leave a stale bind mount in container's state directory, which manifests itself later, while trying to remove the container, in an error like this: > remove /run/runc/test2: unlinkat /run/runc/test2/runc.W24K2t: device or resource busy The stale mount happens because runc start/run/exec kills runc init while it is inside ensure_cloned_binary(). One such scenario is when a unified cgroup resource is specified for cgroup v1, a cgroup manager's Apply returns an error (as of commit b006f4a18047), and when (*initProcess).start() kills runc init just after it was started. One solution is NOT to kill runc init too early. To achieve that, amend the libcontainer/nsenter code to send a \0 byte to signal that it is past the initial setup, and make start() (for both run/start and exec) wait for this byte before proceeding with kill on an error path. While at it, improve some error messages. Signed-off-by: Kir Kolyshkin --- libcontainer/nsenter/nsenter_test.go | 24 +++++++++++++ libcontainer/nsenter/nsexec.c | 7 ++++ libcontainer/process_linux.go | 54 ++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index 86aeb042e04..83340913b89 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -3,6 +3,7 @@ package nsenter import ( "bytes" "encoding/json" + "errors" "fmt" "io" "io/ioutil" @@ -48,6 +49,7 @@ func TestNsenterValidPaths(t *testing.T) { if err := cmd.Start(); err != nil { t.Fatalf("nsenter failed to start %v", err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ @@ -62,6 +64,8 @@ func TestNsenterValidPaths(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) + decoder := json.NewDecoder(parent) var pid *pid @@ -118,6 +122,7 @@ func TestNsenterInvalidPaths(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) if err := cmd.Wait(); err == nil { t.Fatalf("nsenter exits with a zero exit status") } @@ -158,6 +163,7 @@ func TestNsenterIncorrectPathType(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) if err := cmd.Wait(); err == nil { t.Fatalf("nsenter exits with a zero exit status") } @@ -206,6 +212,8 @@ func TestNsenterChildLogging(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) + logsDecoder := json.NewDecoder(logread) var logentry *logentry @@ -235,3 +243,19 @@ func newPipe() (parent *os.File, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil } + +// initWaiter reads back the initial \0 from runc init +func initWaiter(t *testing.T, r io.Reader) { + inited := make([]byte, 1) + n, err := r.Read(inited) + if err == nil { + if n < 1 { + err = errors.New("short read") + } else if inited[0] != 0 { + err = fmt.Errorf("unexpected %d != 0", inited[0]) + } else { + return + } + } + t.Fatalf("waiting for init preliminary setup: %v", err) +} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index e8660e7c27f..e33a60edd43 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -598,6 +598,13 @@ void nsexec(void) if (ensure_cloned_binary() < 0) bail("could not ensure we are a cloned binary"); + /* + * Inform the parent we're past initial setup. + * For the other side of this, see initWaiter. + */ + if (write(pipenum, "", 1) != 1) + bail("could not inform the parent we are past initial setup"); + write_log(DEBUG, "nsexec started"); /* Parse all of the netlink configuration. */ diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 8315d709aaa..1046594910d 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -98,23 +98,34 @@ func (p *setnsProcess) start() (retErr error) { if err != nil { return newSystemErrorWithCause(err, "starting setns process") } + + waitInit := initWaiter(p.messageSockPair.parent) defer func() { if retErr != nil { if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { // Someone in this cgroup was killed, this _might_ be us. retErr = newSystemErrorWithCause(retErr, "possibly OOM-killed") } + werr := <-waitInit + if werr != nil { + logrus.WithError(werr).Warn() + } err := ignoreTerminateErrors(p.terminate()) if err != nil { logrus.WithError(err).Warn("unable to terminate setnsProcess") } } }() + if p.bootstrapData != nil { if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } } + err = <-waitInit + if err != nil { + return err + } if err := p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } @@ -326,9 +337,13 @@ func (p *initProcess) start() (retErr error) { p.process.ops = nil return newSystemErrorWithCause(err, "starting init process command") } + + waitInit := initWaiter(p.messageSockPair.parent) defer func() { if retErr != nil { - // init might be killed by the kernel's OOM killer. + // Find out if init is killed by the kernel's OOM killer. + // Get the count before killing init as otherwise cgroup + // might be removed by systemd. oom, err := p.manager.OOMKillCount() if err != nil { logrus.WithError(err).Warn("unable to get oom kill count") @@ -346,7 +361,12 @@ func (p *initProcess) start() (retErr error) { } } - // terminate the process to ensure we can remove cgroups + werr := <-waitInit + if werr != nil { + logrus.WithError(werr).Warn() + } + + // Terminate the process to ensure we can remove cgroups. if err := ignoreTerminateErrors(p.terminate()); err != nil { logrus.WithError(err).Warn("unable to terminate initProcess") } @@ -372,6 +392,11 @@ func (p *initProcess) start() (retErr error) { if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } + err = <-waitInit + if err != nil { + return err + } + childPid, err := p.getChildPid() if err != nil { return newSystemErrorWithCause(err, "getting the final child's pid from pipe") @@ -674,3 +699,28 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { } return i, nil } + +// initWaiter returns a channel to wait on for making sure +// runc init has finished the initial setup. +func initWaiter(r io.Reader) chan error { + ch := make(chan error, 1) + go func() { + defer close(ch) + + inited := make([]byte, 1) + n, err := r.Read(inited) + if err == nil { + if n < 1 { + err = errors.New("short read") + } else if inited[0] != 0 { + err = fmt.Errorf("unexpected %d != 0", inited[0]) + } else { + ch <- nil + return + } + } + ch <- newSystemErrorWithCause(err, "waiting for init preliminary setup") + }() + + return ch +}