diff --git a/chart/templates/ws-daemon-daemonset.yaml b/chart/templates/ws-daemon-daemonset.yaml index 5be4114cf7b012..59a3dd0bd5d9ac 100644 --- a/chart/templates/ws-daemon-daemonset.yaml +++ b/chart/templates/ws-daemon-daemonset.yaml @@ -258,6 +258,12 @@ spec: path: "/" initialDelaySeconds: 5 periodSeconds: 10 + livenessProbe: + httpGet: + port: 9999 + path: "/" + initialDelaySeconds: 5 + periodSeconds: 10 securityContext: privileged: true procMount: Default diff --git a/components/ws-daemon/pkg/container/container.go b/components/ws-daemon/pkg/container/container.go index e60df1fbad8bba..ca4401a5437fa0 100644 --- a/components/ws-daemon/pkg/container/container.go +++ b/components/ws-daemon/pkg/container/container.go @@ -44,8 +44,8 @@ type Runtime interface { // ContainerPID returns the PID of the container's namespace root process, e.g. the container shim. ContainerPID(ctx context.Context, id ID) (pid uint64, err error) - // Error listens for errors in the interaction with the container runtime - Error() <-chan error + // IsContainerdReady reports whether containerd is ready. 
+ IsContainerdReady(ctx context.Context) (bool, error) } var ( diff --git a/components/ws-daemon/pkg/container/containerd.go b/components/ws-daemon/pkg/container/containerd.go index 4a83689f2ec064..5ba9a87c99848d 100644 --- a/components/ws-daemon/pkg/container/containerd.go +++ b/components/ws-daemon/pkg/container/containerd.go @@ -52,11 +52,10 @@ func NewContainerd(cfg *ContainerdConfig, mounts *NodeMountsLookup, pathMapping Mounts: mounts, Mapping: pathMapping, - cond: sync.NewCond(&sync.Mutex{}), - cntIdx: make(map[string]*containerInfo), - podIdx: make(map[string]*containerInfo), - wsiIdx: make(map[string]*containerInfo), - errchan: make(chan error), + cond: sync.NewCond(&sync.Mutex{}), + cntIdx: make(map[string]*containerInfo), + podIdx: make(map[string]*containerInfo), + wsiIdx: make(map[string]*containerInfo), } go res.start() @@ -69,11 +68,10 @@ type Containerd struct { Mounts *NodeMountsLookup Mapping PathMapping - cond *sync.Cond - podIdx map[string]*containerInfo - wsiIdx map[string]*containerInfo - cntIdx map[string]*containerInfo - errchan chan error + cond *sync.Cond + podIdx map[string]*containerInfo + wsiIdx map[string]*containerInfo + cntIdx map[string]*containerInfo } type containerInfo struct { @@ -104,7 +102,6 @@ func (s *Containerd) start() { cs, err := s.Client.ContainerService().List(ctx) if err != nil { log.WithError(err).Error("cannot list container") - s.errchan <- xerrors.Errorf("cannot list container: %w", err) time.Sleep(reconnectionInterval) continue } @@ -123,7 +120,6 @@ func (s *Containerd) start() { tsks, err := s.Client.TaskService().List(ctx, &tasks.ListTasksRequest{}) if err != nil { log.WithError(err).Error("cannot list tasks") - s.errchan <- xerrors.Errorf("cannot list tasks: %w", err) time.Sleep(reconnectionInterval) continue } @@ -144,7 +140,6 @@ func (s *Containerd) start() { s.handleContainerdEvent(ev) case err := <-errchan: log.WithError(err).Error("lost connection to containerd - will attempt to reconnect") - s.errchan <- 
err time.Sleep(reconnectionInterval) break } @@ -304,11 +299,6 @@ func (s *Containerd) handleNewTask(cid string, rootfs []*types.Mount, pid uint32 s.cond.Broadcast() } -// Error listens for errors in the interaction with the container runtime -func (s *Containerd) Error() <-chan error { - return s.errchan -} - // WaitForContainer waits for workspace container to come into existence. func (s *Containerd) WaitForContainer(ctx context.Context, workspaceInstanceID string) (cid ID, err error) { //nolint:ineffassign @@ -460,6 +450,11 @@ func (s *Containerd) ContainerPID(ctx context.Context, id ID) (pid uint64, err e return uint64(info.PID), nil } +// IsContainerdReady reports whether containerd is serving, as returned by the client's IsServing check. +func (s *Containerd) IsContainerdReady(ctx context.Context) (bool, error) { + return s.Client.IsServing(ctx) +} + // ExtractCGroupPathFromContainer retrieves the CGroupPath from the linux section // in a container's OCI spec. func ExtractCGroupPathFromContainer(container containers.Container) (cgroupPath string, err error) { diff --git a/components/ws-daemon/pkg/daemon/daemon.go b/components/ws-daemon/pkg/daemon/daemon.go index ac424363974869..8b4fa816f3f58b 100644 --- a/components/ws-daemon/pkg/daemon/daemon.go +++ b/components/ws-daemon/pkg/daemon/daemon.go @@ -6,6 +6,7 @@ package daemon import ( "context" + "fmt" "net/http" "os" @@ -40,11 +41,6 @@ func NewDaemon(config Config, reg prometheus.Registerer) (*Daemon, error) { if containerRuntime == nil { return nil, xerrors.Errorf("no container runtime configured") } - go func() { - // TODO(cw): handle this case more gracefully - err := <-containerRuntime.Error() - log.WithError(err).Fatal("container runtime interface error") - }() nodename := os.Getenv("NODENAME") if nodename == "" { @@ -162,6 +158,17 @@ func (d *Daemon) startReadinessSignal() { return } + isContainerdReady, err := d.dispatch.Runtime.IsContainerdReady(context.Background()) + if err != nil { + http.Error(w, 
fmt.Sprintf("containerd error: %v", err), http.StatusTooEarly) + return + } + + if !isContainerdReady { + http.Error(w, "containerd is not ready", http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) })) log.WithField("addr", d.Config.ReadinessSignal.Addr).Info("started readiness signal")