From e86c81ff3bb64b71da4f78f5a6f918b327cb268f Mon Sep 17 00:00:00 2001
From: jiangpengfei
Date: Sat, 8 Aug 2020 16:00:45 -0400
Subject: [PATCH 1/3] virtcontainers: fix sandbox delete failure

Fixes: #2882

If an error occurs in `CreateContainer()` after the container has been
created but before the sandbox resources are updated, forcefully
deleting the sandbox returns an error, because the change to
s.config.Containers was never flushed to the persist store.

Signed-off-by: jiangpengfei
---
 virtcontainers/sandbox.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
index 7760a9085b..e6cfbffc56 100644
--- a/virtcontainers/sandbox.go
+++ b/virtcontainers/sandbox.go
@@ -1163,6 +1163,10 @@ func (s *Sandbox) CreateContainer(contConfig ContainerConfig) (VCContainer, erro
 	if len(s.config.Containers) > 0 {
 		// delete container config
 		s.config.Containers = s.config.Containers[:len(s.config.Containers)-1]
+		// Flush the change to the persist store.
+		if newErr := s.storeSandbox(); newErr != nil {
+			s.Logger().WithError(newErr).Error("Failed to flush s.config.Containers change into sandbox store")
+		}
 	}
 	if !storeAlreadyExists {
 		if delerr := c.store.Delete(); delerr != nil {

From 42bda650349c3633457bf63a509529a65a8abb0a Mon Sep 17 00:00:00 2001
From: Evan Foster
Date: Mon, 31 Aug 2020 10:12:56 -0600
Subject: [PATCH 2/3] sandbox: Disconnect from agent after VM shutdown

When a one-shot pod dies in CRI-O, the shimv2 process isn't killed until
the pod is actually deleted, even though the VM is already shut down. In
this case, the shim busy-loops trying to talk to the (now dead) agent
over VSOCK. The busy-looping is especially catastrophic for one-shot
pods, which may persist for hours or days, but it affects any shimv2 pod
where Kata is configured to use VSOCK for communication.

To address this, disconnect from the agent after the VM is shut down.

Backport of https://github.com/kata-containers/kata-containers/pull/556
to the kata-containers/runtime master branch. See
github.com/kata-containers/runtime#2719 for details.

Fixes: #2719

Signed-off-by: Evan Foster
---
 virtcontainers/sandbox.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
index e6cfbffc56..0960f58845 100644
--- a/virtcontainers/sandbox.go
+++ b/virtcontainers/sandbox.go
@@ -1598,6 +1598,11 @@ func (s *Sandbox) Stop(force bool) error {
 		return err
 	}
 
+	// Stop communicating with the agent.
+	if err := s.agent.disconnect(); err != nil && !force {
+		return err
+	}
+
 	return nil
 }
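The Stop() change above follows the teardown convention used by both patches so far: every step runs, and with force=true an individual failure is tolerated instead of aborting the whole teardown. The following is a minimal, self-contained Go sketch of that pattern; the stopper type and its methods are illustrative stand-ins, not the real virtcontainers API.

```go
package main

import (
	"errors"
	"fmt"
)

// stopper is an illustrative stand-in for the parts of sandbox teardown
// touched by the patches above (VM shutdown, agent disconnect); it is not
// the real virtcontainers Sandbox type.
type stopper struct {
	vmRunning      bool
	agentConnected bool
}

// stopVM pretends to shut the VM down and always succeeds.
func (s *stopper) stopVM() error {
	s.vmRunning = false
	return nil
}

// disconnectAgent fails when there is no live agent connection, which is the
// kind of error a forced teardown wants to tolerate.
func (s *stopper) disconnectAgent() error {
	if !s.agentConnected {
		return errors.New("agent already disconnected")
	}
	s.agentConnected = false
	return nil
}

// stop mirrors the control flow of Sandbox.Stop() after the second patch:
// shut the VM down first, then drop the agent connection so nothing keeps
// polling a dead VSOCK endpoint; with force=true individual errors do not
// abort the teardown.
func (s *stopper) stop(force bool) error {
	if err := s.stopVM(); err != nil && !force {
		return err
	}
	if err := s.disconnectAgent(); err != nil && !force {
		return err
	}
	return nil
}

func main() {
	s := &stopper{vmRunning: true, agentConnected: false}
	fmt.Println("forced stop:", s.stop(true))  // <nil>: disconnect error is tolerated
	fmt.Println("normal stop:", s.stop(false)) // surfaces "agent already disconnected"
}
```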
From 6f33f4e2f10b035f3050f6aa9c95a12bc37e063a Mon Sep 17 00:00:00 2001
From: Yves Chan
Date: Mon, 28 Sep 2020 22:22:34 +0800
Subject: [PATCH 3/3] shimv2: handle ctx passed by containerd

Sometimes the shim process cannot be shut down because its container
list is not empty. That list is populated by the shim service while a
container is being created. If containerd cancels its Create request
because of a timeout but the runtime ignores the cancellation and keeps
creating the container, the resulting container can never be deleted.
Make sure the ctx passed to the Create service RPC call is honored.

Fixes: #2992

Signed-off-by: Yves Chan
---
 containerd-shim-v2/service.go | 64 +++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/containerd-shim-v2/service.go b/containerd-shim-v2/service.go
index af26d94ef5..65052c1895 100644
--- a/containerd-shim-v2/service.go
+++ b/containerd-shim-v2/service.go
@@ -342,34 +342,46 @@ func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	var c *container
-
-	c, err = create(ctx, s, r)
-	if err != nil {
-		return nil, err
-	}
-
-	c.status = task.StatusCreated
-
-	s.containers[r.ID] = c
+	type Result struct {
+		container *container
+		err       error
+	}
+	ch := make(chan Result, 1)
+	go func() {
+		container, err := create(ctx, s, r)
+		ch <- Result{container, err}
+	}()
 
-	s.send(&eventstypes.TaskCreate{
-		ContainerID: r.ID,
-		Bundle:      r.Bundle,
-		Rootfs:      r.Rootfs,
-		IO: &eventstypes.TaskIO{
-			Stdin:    r.Stdin,
-			Stdout:   r.Stdout,
-			Stderr:   r.Stderr,
-			Terminal: r.Terminal,
-		},
-		Checkpoint: r.Checkpoint,
-		Pid:        s.pid,
-	})
+	select {
+	case <-ctx.Done():
+		return nil, errors.Errorf("create container timeout: %v", r.ID)
+	case res := <-ch:
+		if res.err != nil {
+			return nil, res.err
+		}
+		container := res.container
+		container.status = task.StatusCreated
+
+		s.containers[r.ID] = container
+
+		s.send(&eventstypes.TaskCreate{
+			ContainerID: r.ID,
+			Bundle:      r.Bundle,
+			Rootfs:      r.Rootfs,
+			IO: &eventstypes.TaskIO{
+				Stdin:    r.Stdin,
+				Stdout:   r.Stdout,
+				Stderr:   r.Stderr,
+				Terminal: r.Terminal,
+			},
+			Checkpoint: r.Checkpoint,
+			Pid:        s.pid,
+		})
 
-	return &taskAPI.CreateTaskResponse{
-		Pid: s.pid,
-	}, nil
+		return &taskAPI.CreateTaskResponse{
+			Pid: s.pid,
+		}, nil
+	}
 }
 
 // Start a process
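The Create() rewrite in this last patch is the standard Go shape for making a blocking call cancellable: run it in a goroutine, send its outcome over a buffered channel, and select on ctx.Done(). Below is a minimal sketch of that shape, with hypothetical names (result, createWithContext, slowCreate) standing in for the shim's actual types and its create(ctx, s, r) call.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// result pairs a value with an error, mirroring the Result struct the patch
// introduces inside Create().
type result struct {
	value string
	err   error
}

// createWithContext runs a possibly slow create function in a goroutine and
// honors ctx cancellation, as the patched Create() does.
func createWithContext(ctx context.Context, slowCreate func() (string, error)) (string, error) {
	// Buffer of 1 so the goroutine can always deliver its result and exit,
	// even when the caller has already given up.
	ch := make(chan result, 1)
	go func() {
		v, err := slowCreate()
		ch <- result{v, err}
	}()

	select {
	case <-ctx.Done():
		// containerd cancelled the request (e.g. a timeout): report that
		// instead of registering a container that can never be deleted.
		return "", fmt.Errorf("create cancelled: %w", ctx.Err())
	case res := <-ch:
		return res.value, res.err
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	_, err := createWithContext(ctx, func() (string, error) {
		time.Sleep(200 * time.Millisecond) // simulate a slow container create
		return "container-id", nil
	})
	fmt.Println(err) // create cancelled: context deadline exceeded
}
```

Note that on the cancellation path the slow create still runs to completion in the background; its result simply lands in the buffered channel and is dropped, so, as in the patch, the container is never recorded in the service's container map and the shim can later shut down cleanly.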