diff --git a/agent.go b/agent.go
index 769f593bd3..6e6eff908a 100644
--- a/agent.go
+++ b/agent.go
@@ -7,6 +7,7 @@
 package main
 
 import (
+	"errors"
 	"flag"
 	"fmt"
 	"io"
@@ -17,7 +18,6 @@ import (
 	"os/signal"
 	"path/filepath"
 	"runtime"
-	"runtime/debug"
 	"strings"
 	"sync"
 	"syscall"
@@ -97,6 +97,9 @@ var agentLog = logrus.WithFields(agentFields)
 // version is the agent version. This variable is populated at build time.
 var version = "unknown"
 
+// if true, coredump when an internal error occurs or a fatal signal is received
+var crashOnError = false
+
 // This is the list of file descriptors we can properly close after the process
 // has been started. When the new process is exec(), those file descriptors are
 // duplicated and it is our responsibility to close them since we have opened
@@ -363,15 +366,26 @@ func (s *sandbox) signalHandlerLoop(sigCh chan os.Signal) {
 	for sig := range sigCh {
 		logger := agentLog.WithField("signal", sig)
 
-		switch sig {
-		case unix.SIGCHLD:
+		if sig == unix.SIGCHLD {
 			if err := s.subreaper.reap(); err != nil {
 				logger.WithError(err).Error("failed to reap")
-				return
 			}
-		default:
-			logger.Info("ignoring unexpected signal")
+			continue // SIGCHLD needs no further handling
 		}
+
+		nativeSignal, ok := sig.(syscall.Signal)
+		if !ok {
+			err := errors.New("unknown signal")
+			logger.WithError(err).Error("failed to handle signal")
+			continue
+		}
+
+		if fatalSignal(nativeSignal) {
+			logger.Error("received fatal signal")
+			die()
+		}
+
+		logger.Info("ignoring unexpected signal")
 	}
 }
 
@@ -385,6 +399,10 @@ func (s *sandbox) setupSignalHandler() error {
 	sigCh := make(chan os.Signal, 512)
 	signal.Notify(sigCh, unix.SIGCHLD)
 
+	for _, sig := range handledSignals() {
+		signal.Notify(sigCh, sig)
+	}
+
 	go s.signalHandlerLoop(sigCh)
 
 	return nil
@@ -648,9 +666,6 @@ func initAgentAsInit() error {
 }
 
 func init() {
-	// Force full stacktrace on internal error
-	debug.SetTraceback("system")
-
 	if len(os.Args) > 1 && os.Args[1] == "init" {
 		runtime.GOMAXPROCS(1)
 		runtime.LockOSThread()
@@ -662,7 +677,7 @@ func init() {
 	}
 }
 
-func main() {
+func realMain() {
 	var err error
 	var showVersion bool
 
@@ -730,3 +745,8 @@ func main() {
 
 	s.wg.Wait()
 }
+
+func main() {
+	defer handlePanic()
+	realMain()
+}
diff --git a/config.go b/config.go
index 226efc2f43..765582b1d0 100644
--- a/config.go
+++ b/config.go
@@ -18,6 +18,7 @@ import (
 const (
 	optionPrefix      = "agent."
 	logLevelFlag      = optionPrefix + "log"
+	devModeFlag       = optionPrefix + "devmode"
 	kernelCmdlineFile = "/proc/cmdline"
 )
 
@@ -83,6 +84,8 @@ func (c *agentConfig) parseCmdlineOption(option string) error {
 			return err
 		}
 		c.logLevel = level
+	case devModeFlag:
+		crashOnError = true
 	default:
 		if strings.HasPrefix(split[optionPosition], optionPrefix) {
 			return grpcStatus.Errorf(codes.NotFound, "Unknown option %s", split[optionPosition])
diff --git a/signals.go b/signals.go
new file mode 100644
index 0000000000..46bfc11653
--- /dev/null
+++ b/signals.go
@@ -0,0 +1,102 @@
+// Copyright 2018 Intel Corporation.
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"os/signal"
+	"runtime/pprof"
+	"strings"
+	"syscall"
+)
+
+// List of handled signals.
+//
+// The value is true if receiving the signal should be fatal.
+var handledSignalsMap = map[syscall.Signal]bool{
+	syscall.SIGABRT:   true,
+	syscall.SIGBUS:    true,
+	syscall.SIGILL:    true,
+	syscall.SIGQUIT:   true,
+	syscall.SIGSEGV:   true,
+	syscall.SIGSTKFLT: true,
+	syscall.SIGSYS:    true,
+	syscall.SIGTRAP:   true,
+}
+
+// handlePanic recovers from a panic in the calling goroutine, logs the panic
+// value and terminates the agent through die(). It only has an effect when
+// invoked directly by a defer statement, as main() does.
+func handlePanic() {
+	r := recover()
+
+	if r != nil {
+		// %v renders any panic value; %s would garble non-string values.
+		msg := fmt.Sprintf("%v", r)
+		agentLog.WithField("panic", msg).Error("fatal error")
+
+		die()
+	}
+}
+
+// backtrace writes every registered pprof profile to the agent log at error
+// level, one log entry per line of profile output.
+func backtrace() {
+	profiles := pprof.Profiles()
+
+	buf := &bytes.Buffer{}
+
+	for _, p := range profiles {
+		// The magic number requests a full stacktrace. See
+		// https://golang.org/pkg/runtime/pprof/#Profile.WriteTo.
+		if err := p.WriteTo(buf, 2); err != nil {
+			agentLog.WithError(err).WithField("profile", p.Name()).Error("failed to write profile")
+		}
+	}
+
+	for _, line := range strings.Split(buf.String(), "\n") {
+		agentLog.Error(line)
+	}
+}
+
+// fatalSignal reports whether sig is a handled signal marked as fatal in
+// handledSignalsMap. Signals missing from the map are never fatal.
+func fatalSignal(sig syscall.Signal) bool {
+	s, exists := handledSignalsMap[sig]
+	if !exists {
+		return false
+	}
+
+	return s
+}
+
+// handledSignals returns the keys of handledSignalsMap. The order of the
+// returned signals is unspecified because it follows map iteration order.
+func handledSignals() []syscall.Signal {
+	var signals []syscall.Signal
+
+	for sig := range handledSignalsMap {
+		signals = append(signals, sig)
+	}
+
+	return signals
+}
+
+// die logs a backtrace and terminates the agent. When crashOnError is set it
+// first restores the default SIGABRT handling and sends SIGABRT to the
+// agent's process group (pid 0); os.Exit(1) ends the process otherwise.
+func die() {
+	backtrace()
+
+	if crashOnError {
+		signal.Reset(syscall.SIGABRT)
+		syscall.Kill(0, syscall.SIGABRT)
+	}
+
+	os.Exit(1)
+}
diff --git a/signals_test.go b/signals_test.go
new file mode 100644
index 0000000000..e4a9aaabac
--- /dev/null
+++ b/signals_test.go
@@ -0,0 +1,99 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+package main
+
+import (
+	"bytes"
+	"reflect"
+	goruntime "runtime"
+	"sort"
+	"strings"
+	"syscall"
+	"testing"
+
+	"github.com/sirupsen/logrus"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSignalHandledSignalsMap(t *testing.T) {
+	assert := assert.New(t)
+
+	for sig, fatal := range handledSignalsMap {
+		result := fatalSignal(sig)
+		if fatal {
+			assert.True(result)
+		} else {
+			assert.False(result)
+		}
+	}
+}
+
+func TestSignalHandledSignals(t *testing.T) {
+	assert := assert.New(t)
+
+	var expected []syscall.Signal
+
+	for sig := range handledSignalsMap {
+		expected = append(expected, sig)
+	}
+
+	got := handledSignals()
+
+	sort.Slice(expected, func(i, j int) bool {
+		return int(expected[i]) < int(expected[j])
+	})
+
+	sort.Slice(got, func(i, j int) bool {
+		return int(got[i]) < int(got[j])
+	})
+
+	assert.True(reflect.DeepEqual(expected, got))
+}
+
+func TestSignalFatalSignalInvalidSignal(t *testing.T) {
+	assert := assert.New(t)
+
+	sig := syscall.SIGXCPU
+
+	result := fatalSignal(sig)
+	assert.False(result)
+}
+
+func TestSignalBacktrace(t *testing.T) {
+	assert := assert.New(t)
+
+	// create buffer to save logger output
+	buf := &bytes.Buffer{}
+
+	savedLog := agentLog
+	defer func() {
+		agentLog = savedLog
+	}()
+
+	agentLog = logrus.WithField("test-agent-logger", true)
+
+	agentLog.Logger.Formatter = &logrus.TextFormatter{
+		DisableColors: true,
+	}
+
+	// capture output to buffer
+	agentLog.Logger.Out = buf
+
+	// determine name of *this* function
+	pc := make([]uintptr, 1)
+	goruntime.Callers(1, pc)
+	fn := goruntime.FuncForPC(pc[0])
+	name := fn.Name()
+
+	backtrace()
+
+	b := buf.String()
+
+	// very basic tests to check if a backtrace was produced
+	assert.True(strings.Contains(b, "contention:"))
+	assert.True(strings.Contains(b, `level=error`))
+	assert.True(strings.Contains(b, name))
+}