From da4c432e7ed7a33057fd7907bd51cabfcd5d8dd3 Mon Sep 17 00:00:00 2001 From: bin liu Date: Wed, 11 Nov 2020 22:47:37 +0800 Subject: [PATCH] runtime: sleep 1 second after GetOOMEvent failed In some cases, for example when the agent has crashed but has not been marked dead yet, GetOOMEvent will return errors such as `connection reset by peer` or `ttrpc: closed`. Sleep for 1 second (the agent check interval) before retrying, and let the agent health check detect the dead agent. Fixes: #3064 Signed-off-by: bin liu --- containerd-shim-v2/wait.go | 4 +++- virtcontainers/monitor.go | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/containerd-shim-v2/wait.go b/containerd-shim-v2/wait.go index 3d38d5976c..62ed49925c 100644 --- a/containerd-shim-v2/wait.go +++ b/containerd-shim-v2/wait.go @@ -14,6 +14,7 @@ import ( "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" "github.com/containerd/containerd/mount" + vc "github.com/kata-containers/runtime/virtcontainers" "github.com/kata-containers/runtime/virtcontainers/pkg/oci" "github.com/sirupsen/logrus" "google.golang.org/grpc/codes" @@ -147,9 +148,10 @@ func watchOOMEvents(ctx context.Context, s *service) { logrus.WithField("sandbox", s.sandbox.ID()).WithError(err).Warn("failed to get OOM event from sandbox") // If the GetOOMEvent call is not implemented, then the agent is most likely an older version, // stop attempting to get OOM events. 
- if isGRPCErrorCode(codes.Unimplemented, err) { + if isGRPCErrorCode(codes.NotFound, err) || err.Error() == "Dead agent" { return } + time.Sleep(vc.DefaultMonitorCheckInterval) continue } diff --git a/virtcontainers/monitor.go b/virtcontainers/monitor.go index 5b8f5f517b..9aabb258aa 100644 --- a/virtcontainers/monitor.go +++ b/virtcontainers/monitor.go @@ -13,8 +13,8 @@ import ( ) const ( - defaultCheckInterval = 1 * time.Second - watcherChannelSize = 128 + DefaultMonitorCheckInterval = 1 * time.Second + watcherChannelSize = 128 ) type monitor struct { @@ -31,7 +31,7 @@ type monitor struct { func newMonitor(s *Sandbox) *monitor { return &monitor{ sandbox: s, - checkInterval: defaultCheckInterval, + checkInterval: DefaultMonitorCheckInterval, stopCh: make(chan bool, 1), } }