summaryrefslogtreecommitdiff
path: root/daemon
diff options
authorSamuele Pedroni <pedronis@lucediurna.net>2019-07-19 09:34:23 +0200
committerGitHub <noreply@github.com>2019-07-19 09:34:23 +0200
commit357ba8a35a97c599b52a4aaba2e65c97fa8a19f6 (patch)
tree4c698beae18b2aaad87d4303e88bcc2350da25a2 /daemon
parenta4729395c85ba58808dc9c8f9da5a6d58d98e1ce (diff)
parent3c8e40b4de141390715bb458cd642d14bf82c02c (diff)
many: retry to reboot if snapd gets restarted before expected reboot
At the low level this is done by remembering the boot id in the state when requesting RestartSystem and adding a VerifyBoot method to state itself. Overlord then calls VerifyBoot with the current boot id early in the inititialization process. The outcome of the check - positive or negative - gets passed for acting to hooks RebootAsExpected, respectively RebootDidNotHappen (part of a new overlord.RestartBehavior interface). daemon.go implements overlord.RestartBehavor. In the negative case the code proceeds without a Overlord through special cases through Start and Stop and snapd main code (to keep systemd notify/watchdog happy) and then in Stop a reboot is scheduled and waited again. If these retries fail for 3 times snapd gives up which likely will result in some rollback.
Diffstat (limited to 'daemon')
-rw-r--r--daemon/daemon.go226
-rw-r--r--daemon/daemon_test.go130
2 files changed, 285 insertions, 71 deletions
diff --git a/daemon/daemon.go b/daemon/daemon.go
index 6c6dbd596b..a21eaf4432 100644
--- a/daemon/daemon.go
+++ b/daemon/daemon.go
@@ -59,6 +59,7 @@ var systemdSdNotify = systemd.SdNotify
type Daemon struct {
Version string
overlord *overlord.Overlord
+ state *state.State
snapdListener net.Listener
snapListener net.Listener
connTracker *connTracker
@@ -75,6 +76,8 @@ type Daemon struct {
// degradedErr is set when the daemon is in degraded mode
degradedErr error
+ expectedRebootDidNotHappen bool
+
mu sync.Mutex
}
@@ -210,7 +213,7 @@ func (c *Command) canAccess(r *http.Request, user *auth.UserState) accessResult
}
func (c *Command) ServeHTTP(w http.ResponseWriter, r *http.Request) {
- st := c.d.overlord.State()
+ st := c.d.state
st.Lock()
// TODO Look at the error and fail if there's an attempt to authenticate with invalid data.
user, _ := UserFromRequest(st, r)
@@ -405,7 +408,7 @@ func (ct *connTracker) trackConn(conn net.Conn, state http.ConnState) {
}
func (d *Daemon) initStandbyHandling() {
- d.standbyOpinions = standby.New(d.overlord.State())
+ d.standbyOpinions = standby.New(d.state)
d.standbyOpinions.AddOpinion(d.connTracker)
d.standbyOpinions.AddOpinion(d.overlord)
d.standbyOpinions.AddOpinion(d.overlord.SnapManager())
@@ -415,33 +418,16 @@ func (d *Daemon) initStandbyHandling() {
// Start the Daemon
func (d *Daemon) Start() {
- // die when asked to restart (systemd should get us back up!)
- d.overlord.SetRestartHandler(func(t state.RestartType) {
- switch t {
- case state.RestartDaemon:
- d.tomb.Kill(nil)
- case state.RestartSystem:
- // try to schedule a fallback slow reboot already here
- // in case we get stuck shutting down
- if err := reboot(rebootWaitTimeout); err != nil {
- logger.Noticef("%s", err)
- }
-
- d.mu.Lock()
- defer d.mu.Unlock()
- // remember we need to restart the system
- d.restartSystem = true
- d.tomb.Kill(nil)
- case state.RestartSocket:
- d.mu.Lock()
- defer d.mu.Unlock()
- d.restartSocket = true
- d.tomb.Kill(nil)
- default:
- logger.Noticef("internal error: restart handler called with unknown restart type: %v", t)
- d.tomb.Kill(nil)
- }
- })
+ if d.expectedRebootDidNotHappen {
+ // we need to schedule and wait for a system restart
+ d.tomb.Kill(nil)
+ // avoid systemd killing us again while we wait
+ systemdSdNotify("READY=1")
+ return
+ }
+ if d.overlord == nil {
+ panic("internal error: no Overlord")
+ }
d.connTracker = &connTracker{conns: make(map[net.Conn]struct{})}
d.serve = &http.Server{
@@ -477,29 +463,49 @@ func (d *Daemon) Start() {
systemdSdNotify("READY=1")
}
-var shutdownMsg = i18n.G("reboot scheduled to update the system")
+// HandleRestart implements overlord.RestartBehavior.
+func (d *Daemon) HandleRestart(t state.RestartType) {
+ // die when asked to restart (systemd should get us back up!) etc
+ switch t {
+ case state.RestartDaemon:
+ case state.RestartSystem:
+ // try to schedule a fallback slow reboot already here
+ // in case we get stuck shutting down
+ if err := reboot(rebootWaitTimeout); err != nil {
+ logger.Noticef("%s", err)
+ }
-func rebootImpl(rebootDelay time.Duration) error {
- if rebootDelay < 0 {
- rebootDelay = 0
- }
- mins := int64((rebootDelay + time.Minute - 1) / time.Minute)
- cmd := exec.Command("shutdown", "-r", fmt.Sprintf("+%d", mins), shutdownMsg)
- if out, err := cmd.CombinedOutput(); err != nil {
- return osutil.OutputErr(out, err)
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ // remember we need to restart the system
+ d.restartSystem = true
+ case state.RestartSocket:
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ d.restartSocket = true
+ default:
+ logger.Noticef("internal error: restart handler called with unknown restart type: %v", t)
}
- return nil
+ d.tomb.Kill(nil)
}
-var reboot = rebootImpl
-
var (
- rebootNoticeWait = 3 * time.Second
- rebootWaitTimeout = 10 * time.Minute
+ rebootNoticeWait = 3 * time.Second
+ rebootWaitTimeout = 10 * time.Minute
+ rebootRetryWaitTimeout = 5 * time.Minute
+ rebootMaxTentatives = 3
)
// Stop shuts down the Daemon
func (d *Daemon) Stop(sigCh chan<- os.Signal) error {
+ // we need to schedule/wait for a system restart again
+ if d.expectedRebootDidNotHappen {
+ return d.doReboot(sigCh, rebootRetryWaitTimeout)
+ }
+ if d.overlord == nil {
+ return fmt.Errorf("internal error: no Overlord")
+ }
+
d.tomb.Kill(nil)
d.mu.Lock()
@@ -514,7 +520,7 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error {
// stop running hooks first
// and do it more gracefully if we are restarting
hookMgr := d.overlord.HookManager()
- if ok, _ := d.overlord.State().Restarting(); ok {
+ if ok, _ := d.state.Restarting(); ok {
logger.Noticef("gracefully waiting for running hooks")
hookMgr.GracefullyWaitRunningHooks()
logger.Noticef("done waiting for running hooks")
@@ -570,9 +576,30 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error {
}
if restartSystem {
- // ask for shutdown and wait for it to happen.
- // if we exit snapd will be restared by systemd
- rebootDelay := 1 * time.Minute
+ return d.doReboot(sigCh, rebootWaitTimeout)
+ }
+
+ if d.restartSocket {
+ return ErrRestartSocket
+ }
+
+ return nil
+}
+
+func (d *Daemon) rebootDelay() (time.Duration, error) {
+ d.state.Lock()
+ defer d.state.Unlock()
+ now := time.Now()
+ // see whether a reboot had already been scheduled
+ var rebootAt time.Time
+ err := d.state.Get("daemon-system-restart-at", &rebootAt)
+ if err != nil && err != state.ErrNoState {
+ return 0, err
+ }
+ rebootDelay := 1 * time.Minute
+ if err == nil {
+ rebootDelay = rebootAt.Sub(now)
+ } else {
ovr := os.Getenv("SNAPD_REBOOT_DELAY") // for tests
if ovr != "" {
d, err := time.ParseDuration(ovr)
@@ -580,39 +607,106 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error {
rebootDelay = d
}
}
- if err := reboot(rebootDelay); err != nil {
- return err
- }
- // wait for reboot to happen
- logger.Noticef("Waiting for system reboot")
- if sigCh != nil {
- signal.Stop(sigCh)
- if len(sigCh) > 0 {
- // a signal arrived in between
- return nil
- }
- close(sigCh)
- }
- time.Sleep(rebootWaitTimeout)
- return fmt.Errorf("expected reboot did not happen")
+ rebootAt = now.Add(rebootDelay)
+ d.state.Set("daemon-system-restart-at", rebootAt)
}
- if d.restartSocket {
- return ErrRestartSocket
+ return rebootDelay, nil
+}
+
+func (d *Daemon) doReboot(sigCh chan<- os.Signal, waitTimeout time.Duration) error {
+ rebootDelay, err := d.rebootDelay()
+ if err != nil {
+ return err
}
+ // ask for shutdown and wait for it to happen.
+ // if we exit snapd will be restared by systemd
+ if err := reboot(rebootDelay); err != nil {
+ return err
+ }
+ // wait for reboot to happen
+ logger.Noticef("Waiting for system reboot")
+ if sigCh != nil {
+ signal.Stop(sigCh)
+ if len(sigCh) > 0 {
+ // a signal arrived in between
+ return nil
+ }
+ close(sigCh)
+ }
+ time.Sleep(waitTimeout)
+ return fmt.Errorf("expected reboot did not happen")
+}
+var shutdownMsg = i18n.G("reboot scheduled to update the system")
+
+func rebootImpl(rebootDelay time.Duration) error {
+ if rebootDelay < 0 {
+ rebootDelay = 0
+ }
+ mins := int64(rebootDelay / time.Minute)
+ cmd := exec.Command("shutdown", "-r", fmt.Sprintf("+%d", mins), shutdownMsg)
+ if out, err := cmd.CombinedOutput(); err != nil {
+ return osutil.OutputErr(out, err)
+ }
return nil
}
+var reboot = rebootImpl
+
// Dying is a tomb-ish thing
func (d *Daemon) Dying() <-chan struct{} {
return d.tomb.Dying()
}
+func clearReboot(st *state.State) {
+ st.Set("daemon-system-restart-at", nil)
+ st.Set("daemon-system-restart-tentative", nil)
+}
+
+// RebootAsExpected implements part of overlord.RestartBehavior.
+func (d *Daemon) RebootAsExpected(st *state.State) error {
+ clearReboot(st)
+ return nil
+}
+
+// RebootDidNotHappen implements part of overlord.RestartBehavior.
+func (d *Daemon) RebootDidNotHappen(st *state.State) error {
+ var nTentative int
+ err := st.Get("daemon-system-restart-tentative", &nTentative)
+ if err != nil && err != state.ErrNoState {
+ return err
+ }
+ nTentative++
+ if nTentative > rebootMaxTentatives {
+ // giving up, proceed normally, some in-progress refresh
+ // might get rolled back!!
+ st.ClearReboot()
+ clearReboot(st)
+ logger.Noticef("snapd was restarted while a system restart was expected, snapd retried to schedule and waited again for a system restart %d times and is giving up", rebootMaxTentatives)
+ return nil
+ }
+ st.Set("daemon-system-restart-tentative", nTentative)
+ d.state = st
+ logger.Noticef("snapd was restarted while a system restart was expected, snapd will try to schedule and wait for a system restart again (tenative %d/%d)", nTentative, rebootMaxTentatives)
+ return state.ErrExpectedReboot
+}
+
// New Daemon
func New() (*Daemon, error) {
- ovld, err := overlord.New()
+ d := &Daemon{}
+ ovld, err := overlord.New(d)
+ if err == state.ErrExpectedReboot {
+ // we proceed without overlord until we reach Stop
+ // where we will schedule and wait again for a system restart.
+ // ATM we cannot do that in New because we need to satisfy
+ // systemd notify mechanisms.
+ d.expectedRebootDidNotHappen = true
+ return d, nil
+ }
if err != nil {
return nil, err
}
- return &Daemon{overlord: ovld}, nil
+ d.overlord = ovld
+ d.state = ovld.State()
+ return d, nil
}
diff --git a/daemon/daemon_test.go b/daemon/daemon_test.go
index cb9b6dec47..3e00db612a 100644
--- a/daemon/daemon_test.go
+++ b/daemon/daemon_test.go
@@ -42,9 +42,11 @@ import (
"github.com/snapcore/snapd/client"
"github.com/snapcore/snapd/dirs"
"github.com/snapcore/snapd/logger"
+ "github.com/snapcore/snapd/osutil"
"github.com/snapcore/snapd/overlord/auth"
"github.com/snapcore/snapd/overlord/devicestate/devicestatetest"
"github.com/snapcore/snapd/overlord/ifacestate"
+ "github.com/snapcore/snapd/overlord/patch"
"github.com/snapcore/snapd/overlord/snapstate"
"github.com/snapcore/snapd/overlord/standby"
"github.com/snapcore/snapd/overlord/state"
@@ -736,6 +738,8 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) {
d.Start()
defer d.Stop(nil)
+ st := d.overlord.State()
+
snapdDone := make(chan struct{})
go func() {
select {
@@ -775,7 +779,9 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) {
return nil
}
- d.overlord.State().RequestRestart(state.RestartSystem)
+ st.Lock()
+ st.RequestRestart(state.RestartSystem)
+ st.Unlock()
defer func() {
d.mu.Lock()
@@ -798,14 +804,25 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) {
c.Check(delays, check.HasLen, 1)
c.Check(delays[0], check.DeepEquals, rebootWaitTimeout)
+ now := time.Now()
+
err = d.Stop(nil)
c.Check(err, check.ErrorMatches, "expected reboot did not happen")
+
c.Check(delays, check.HasLen, 2)
c.Check(delays[1], check.DeepEquals, 1*time.Minute)
// we are not stopping, we wait for the reboot instead
c.Check(s.notified, check.DeepEquals, []string{"READY=1"})
+
+ st.Lock()
+ defer st.Unlock()
+ var rebootAt time.Time
+ err = st.Get("daemon-system-restart-at", &rebootAt)
+ c.Assert(err, check.IsNil)
+ approxAt := now.Add(time.Minute)
+ c.Check(rebootAt.After(approxAt) || rebootAt.Equal(approxAt), check.Equals, true)
}
func (s *daemonSuite) TestRebootHelper(c *check.C) {
@@ -820,7 +837,7 @@ func (s *daemonSuite) TestRebootHelper(c *check.C) {
{0, "+0"},
{time.Minute, "+1"},
{10 * time.Minute, "+10"},
- {30 * time.Second, "+1"},
+ {30 * time.Second, "+0"},
}
for _, t := range tests {
@@ -867,7 +884,11 @@ func (s *daemonSuite) TestRestartShutdownWithSigtermInBetween(c *check.C) {
s.markSeeded(d)
d.Start()
- d.overlord.State().RequestRestart(state.RestartSystem)
+ st := d.overlord.State()
+
+ st.Lock()
+ st.RequestRestart(state.RestartSystem)
+ st.Unlock()
ch := make(chan os.Signal, 2)
ch <- syscall.SIGTERM
@@ -882,7 +903,6 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) {
oldRebootNoticeWait := rebootNoticeWait
oldRebootWaitTimeout := rebootWaitTimeout
defer func() {
- reboot = rebootImpl
rebootNoticeWait = oldRebootNoticeWait
rebootWaitTimeout = oldRebootWaitTimeout
}()
@@ -897,7 +917,11 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) {
s.markSeeded(d)
d.Start()
- d.overlord.State().RequestRestart(state.RestartSystem)
+ st := d.overlord.State()
+
+ st.Lock()
+ st.RequestRestart(state.RestartSystem)
+ st.Unlock()
sigCh := make(chan os.Signal, 2)
// stop (this will timeout but thats not relevant for this test)
@@ -908,6 +932,102 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) {
c.Assert(chOpen, check.Equals, false)
}
+func (s *daemonSuite) TestRestartExpectedRebootDidNotHappen(c *check.C) {
+ curBootID, err := osutil.BootID()
+ c.Assert(err, check.IsNil)
+
+ fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s"},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, curBootID, time.Now().UTC().Format(time.RFC3339)))
+ err = ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600)
+ c.Assert(err, check.IsNil)
+
+ oldRebootNoticeWait := rebootNoticeWait
+ oldRebootRetryWaitTimeout := rebootRetryWaitTimeout
+ defer func() {
+ rebootNoticeWait = oldRebootNoticeWait
+ rebootRetryWaitTimeout = oldRebootRetryWaitTimeout
+ }()
+ rebootRetryWaitTimeout = 100 * time.Millisecond
+ rebootNoticeWait = 150 * time.Millisecond
+
+ cmd := testutil.MockCommand(c, "shutdown", "")
+ defer cmd.Restore()
+
+ d := newTestDaemon(c)
+ c.Check(d.overlord, check.IsNil)
+ c.Check(d.expectedRebootDidNotHappen, check.Equals, true)
+
+ var n int
+ d.state.Lock()
+ err = d.state.Get("daemon-system-restart-tentative", &n)
+ d.state.Unlock()
+ c.Check(err, check.IsNil)
+ c.Check(n, check.Equals, 1)
+
+ d.Start()
+
+ c.Check(s.notified, check.DeepEquals, []string{"READY=1"})
+
+ select {
+ case <-d.Dying():
+ case <-time.After(2 * time.Second):
+ c.Fatal("expected reboot not happening should proceed to try to shutdown again")
+ }
+
+ sigCh := make(chan os.Signal, 2)
+ // stop (this will timeout but thats not relevant for this test)
+ d.Stop(sigCh)
+
+ // an immediate shutdown was scheduled again
+ c.Check(cmd.Calls(), check.DeepEquals, [][]string{
+ {"shutdown", "-r", "+0", "reboot scheduled to update the system"},
+ })
+}
+
+func (s *daemonSuite) TestRestartExpectedRebootOK(c *check.C) {
+ fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s"},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, "boot-id-0", time.Now().UTC().Format(time.RFC3339)))
+ err := ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600)
+ c.Assert(err, check.IsNil)
+
+ cmd := testutil.MockCommand(c, "shutdown", "")
+ defer cmd.Restore()
+
+ d := newTestDaemon(c)
+ c.Assert(d.overlord, check.NotNil)
+
+ st := d.overlord.State()
+ st.Lock()
+ defer st.Unlock()
+ var v interface{}
+ // these were cleared
+ c.Check(st.Get("daemon-system-restart-at", &v), check.Equals, state.ErrNoState)
+ c.Check(st.Get("system-restart-from-boot-id", &v), check.Equals, state.ErrNoState)
+}
+
+func (s *daemonSuite) TestRestartExpectedRebootGiveUp(c *check.C) {
+ // we give up trying to restart the system after 3 retry tentatives
+ curBootID, err := osutil.BootID()
+ c.Assert(err, check.IsNil)
+
+ fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s","daemon-system-restart-tentative":3},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, curBootID, time.Now().UTC().Format(time.RFC3339)))
+ err = ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600)
+ c.Assert(err, check.IsNil)
+
+ cmd := testutil.MockCommand(c, "shutdown", "")
+ defer cmd.Restore()
+
+ d := newTestDaemon(c)
+ c.Assert(d.overlord, check.NotNil)
+
+ st := d.overlord.State()
+ st.Lock()
+ defer st.Unlock()
+ var v interface{}
+ // these were cleared
+ c.Check(st.Get("daemon-system-restart-at", &v), check.Equals, state.ErrNoState)
+ c.Check(st.Get("system-restart-from-boot-id", &v), check.Equals, state.ErrNoState)
+ c.Check(st.Get("daemon-system-restart-tentative", &v), check.Equals, state.ErrNoState)
+}
+
func (s *daemonSuite) TestRestartIntoSocketModeNoNewChanges(c *check.C) {
restore := standby.MockStandbyWait(5 * time.Millisecond)
defer restore()