diff options
| author | Samuele Pedroni <pedronis@lucediurna.net> | 2019-07-19 09:34:23 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2019-07-19 09:34:23 +0200 |
| commit | 357ba8a35a97c599b52a4aaba2e65c97fa8a19f6 (patch) | |
| tree | 4c698beae18b2aaad87d4303e88bcc2350da25a2 /daemon | |
| parent | a4729395c85ba58808dc9c8f9da5a6d58d98e1ce (diff) | |
| parent | 3c8e40b4de141390715bb458cd642d14bf82c02c (diff) | |
many: retry to reboot if snapd gets restarted before expected reboot
At the low level this is done by remembering the boot id in the state when requesting RestartSystem and adding a VerifyBoot method to state itself. Overlord then calls VerifyBoot with the current boot id early in the inititialization process. The outcome of the check - positive or negative - gets passed for acting to hooks RebootAsExpected, respectively RebootDidNotHappen (part of a new overlord.RestartBehavior interface). daemon.go implements overlord.RestartBehavor. In the negative case the code proceeds without a Overlord through special cases through Start and Stop and snapd main code (to keep systemd notify/watchdog happy) and then in Stop a reboot is scheduled and waited again. If these retries fail for 3 times snapd gives up which likely will result in some rollback.
Diffstat (limited to 'daemon')
| -rw-r--r-- | daemon/daemon.go | 226 | ||||
| -rw-r--r-- | daemon/daemon_test.go | 130 |
2 files changed, 285 insertions, 71 deletions
diff --git a/daemon/daemon.go b/daemon/daemon.go index 6c6dbd596b..a21eaf4432 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -59,6 +59,7 @@ var systemdSdNotify = systemd.SdNotify type Daemon struct { Version string overlord *overlord.Overlord + state *state.State snapdListener net.Listener snapListener net.Listener connTracker *connTracker @@ -75,6 +76,8 @@ type Daemon struct { // degradedErr is set when the daemon is in degraded mode degradedErr error + expectedRebootDidNotHappen bool + mu sync.Mutex } @@ -210,7 +213,7 @@ func (c *Command) canAccess(r *http.Request, user *auth.UserState) accessResult } func (c *Command) ServeHTTP(w http.ResponseWriter, r *http.Request) { - st := c.d.overlord.State() + st := c.d.state st.Lock() // TODO Look at the error and fail if there's an attempt to authenticate with invalid data. user, _ := UserFromRequest(st, r) @@ -405,7 +408,7 @@ func (ct *connTracker) trackConn(conn net.Conn, state http.ConnState) { } func (d *Daemon) initStandbyHandling() { - d.standbyOpinions = standby.New(d.overlord.State()) + d.standbyOpinions = standby.New(d.state) d.standbyOpinions.AddOpinion(d.connTracker) d.standbyOpinions.AddOpinion(d.overlord) d.standbyOpinions.AddOpinion(d.overlord.SnapManager()) @@ -415,33 +418,16 @@ func (d *Daemon) initStandbyHandling() { // Start the Daemon func (d *Daemon) Start() { - // die when asked to restart (systemd should get us back up!) - d.overlord.SetRestartHandler(func(t state.RestartType) { - switch t { - case state.RestartDaemon: - d.tomb.Kill(nil) - case state.RestartSystem: - // try to schedule a fallback slow reboot already here - // in case we get stuck shutting down - if err := reboot(rebootWaitTimeout); err != nil { - logger.Noticef("%s", err) - } - - d.mu.Lock() - defer d.mu.Unlock() - // remember we need to restart the system - d.restartSystem = true - d.tomb.Kill(nil) - case state.RestartSocket: - d.mu.Lock() - defer d.mu.Unlock() - d.restartSocket = true - d.tomb.Kill(nil) - default: - logger.Noticef("internal error: restart handler called with unknown restart type: %v", t) - d.tomb.Kill(nil) - } - }) + if d.expectedRebootDidNotHappen { + // we need to schedule and wait for a system restart + d.tomb.Kill(nil) + // avoid systemd killing us again while we wait + systemdSdNotify("READY=1") + return + } + if d.overlord == nil { + panic("internal error: no Overlord") + } d.connTracker = &connTracker{conns: make(map[net.Conn]struct{})} d.serve = &http.Server{ @@ -477,29 +463,49 @@ func (d *Daemon) Start() { systemdSdNotify("READY=1") } -var shutdownMsg = i18n.G("reboot scheduled to update the system") +// HandleRestart implements overlord.RestartBehavior. +func (d *Daemon) HandleRestart(t state.RestartType) { + // die when asked to restart (systemd should get us back up!) etc + switch t { + case state.RestartDaemon: + case state.RestartSystem: + // try to schedule a fallback slow reboot already here + // in case we get stuck shutting down + if err := reboot(rebootWaitTimeout); err != nil { + logger.Noticef("%s", err) + } -func rebootImpl(rebootDelay time.Duration) error { - if rebootDelay < 0 { - rebootDelay = 0 - } - mins := int64((rebootDelay + time.Minute - 1) / time.Minute) - cmd := exec.Command("shutdown", "-r", fmt.Sprintf("+%d", mins), shutdownMsg) - if out, err := cmd.CombinedOutput(); err != nil { - return osutil.OutputErr(out, err) + d.mu.Lock() + defer d.mu.Unlock() + // remember we need to restart the system + d.restartSystem = true + case state.RestartSocket: + d.mu.Lock() + defer d.mu.Unlock() + d.restartSocket = true + default: + logger.Noticef("internal error: restart handler called with unknown restart type: %v", t) } - return nil + d.tomb.Kill(nil) } -var reboot = rebootImpl - var ( - rebootNoticeWait = 3 * time.Second - rebootWaitTimeout = 10 * time.Minute + rebootNoticeWait = 3 * time.Second + rebootWaitTimeout = 10 * time.Minute + rebootRetryWaitTimeout = 5 * time.Minute + rebootMaxTentatives = 3 ) // Stop shuts down the Daemon func (d *Daemon) Stop(sigCh chan<- os.Signal) error { + // we need to schedule/wait for a system restart again + if d.expectedRebootDidNotHappen { + return d.doReboot(sigCh, rebootRetryWaitTimeout) + } + if d.overlord == nil { + return fmt.Errorf("internal error: no Overlord") + } + d.tomb.Kill(nil) d.mu.Lock() @@ -514,7 +520,7 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error { // stop running hooks first // and do it more gracefully if we are restarting hookMgr := d.overlord.HookManager() - if ok, _ := d.overlord.State().Restarting(); ok { + if ok, _ := d.state.Restarting(); ok { logger.Noticef("gracefully waiting for running hooks") hookMgr.GracefullyWaitRunningHooks() logger.Noticef("done waiting for running hooks") @@ -570,9 +576,30 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error { } if restartSystem { - // ask for shutdown and wait for it to happen. - // if we exit snapd will be restared by systemd - rebootDelay := 1 * time.Minute + return d.doReboot(sigCh, rebootWaitTimeout) + } + + if d.restartSocket { + return ErrRestartSocket + } + + return nil +} + +func (d *Daemon) rebootDelay() (time.Duration, error) { + d.state.Lock() + defer d.state.Unlock() + now := time.Now() + // see whether a reboot had already been scheduled + var rebootAt time.Time + err := d.state.Get("daemon-system-restart-at", &rebootAt) + if err != nil && err != state.ErrNoState { + return 0, err + } + rebootDelay := 1 * time.Minute + if err == nil { + rebootDelay = rebootAt.Sub(now) + } else { ovr := os.Getenv("SNAPD_REBOOT_DELAY") // for tests if ovr != "" { d, err := time.ParseDuration(ovr) @@ -580,39 +607,106 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error { rebootDelay = d } } - if err := reboot(rebootDelay); err != nil { - return err - } - // wait for reboot to happen - logger.Noticef("Waiting for system reboot") - if sigCh != nil { - signal.Stop(sigCh) - if len(sigCh) > 0 { - // a signal arrived in between - return nil - } - close(sigCh) - } - time.Sleep(rebootWaitTimeout) - return fmt.Errorf("expected reboot did not happen") + rebootAt = now.Add(rebootDelay) + d.state.Set("daemon-system-restart-at", rebootAt) } - if d.restartSocket { - return ErrRestartSocket + return rebootDelay, nil +} + +func (d *Daemon) doReboot(sigCh chan<- os.Signal, waitTimeout time.Duration) error { + rebootDelay, err := d.rebootDelay() + if err != nil { + return err } + // ask for shutdown and wait for it to happen. + // if we exit snapd will be restared by systemd + if err := reboot(rebootDelay); err != nil { + return err + } + // wait for reboot to happen + logger.Noticef("Waiting for system reboot") + if sigCh != nil { + signal.Stop(sigCh) + if len(sigCh) > 0 { + // a signal arrived in between + return nil + } + close(sigCh) + } + time.Sleep(waitTimeout) + return fmt.Errorf("expected reboot did not happen") +} +var shutdownMsg = i18n.G("reboot scheduled to update the system") + +func rebootImpl(rebootDelay time.Duration) error { + if rebootDelay < 0 { + rebootDelay = 0 + } + mins := int64(rebootDelay / time.Minute) + cmd := exec.Command("shutdown", "-r", fmt.Sprintf("+%d", mins), shutdownMsg) + if out, err := cmd.CombinedOutput(); err != nil { + return osutil.OutputErr(out, err) + } return nil } +var reboot = rebootImpl + // Dying is a tomb-ish thing func (d *Daemon) Dying() <-chan struct{} { return d.tomb.Dying() } +func clearReboot(st *state.State) { + st.Set("daemon-system-restart-at", nil) + st.Set("daemon-system-restart-tentative", nil) +} + +// RebootAsExpected implements part of overlord.RestartBehavior. +func (d *Daemon) RebootAsExpected(st *state.State) error { + clearReboot(st) + return nil +} + +// RebootDidNotHappen implements part of overlord.RestartBehavior. +func (d *Daemon) RebootDidNotHappen(st *state.State) error { + var nTentative int + err := st.Get("daemon-system-restart-tentative", &nTentative) + if err != nil && err != state.ErrNoState { + return err + } + nTentative++ + if nTentative > rebootMaxTentatives { + // giving up, proceed normally, some in-progress refresh + // might get rolled back!! + st.ClearReboot() + clearReboot(st) + logger.Noticef("snapd was restarted while a system restart was expected, snapd retried to schedule and waited again for a system restart %d times and is giving up", rebootMaxTentatives) + return nil + } + st.Set("daemon-system-restart-tentative", nTentative) + d.state = st + logger.Noticef("snapd was restarted while a system restart was expected, snapd will try to schedule and wait for a system restart again (tenative %d/%d)", nTentative, rebootMaxTentatives) + return state.ErrExpectedReboot +} + // New Daemon func New() (*Daemon, error) { - ovld, err := overlord.New() + d := &Daemon{} + ovld, err := overlord.New(d) + if err == state.ErrExpectedReboot { + // we proceed without overlord until we reach Stop + // where we will schedule and wait again for a system restart. + // ATM we cannot do that in New because we need to satisfy + // systemd notify mechanisms. + d.expectedRebootDidNotHappen = true + return d, nil + } if err != nil { return nil, err } - return &Daemon{overlord: ovld}, nil + d.overlord = ovld + d.state = ovld.State() + return d, nil } diff --git a/daemon/daemon_test.go b/daemon/daemon_test.go index cb9b6dec47..3e00db612a 100644 --- a/daemon/daemon_test.go +++ b/daemon/daemon_test.go @@ -42,9 +42,11 @@ import ( "github.com/snapcore/snapd/client" "github.com/snapcore/snapd/dirs" "github.com/snapcore/snapd/logger" + "github.com/snapcore/snapd/osutil" "github.com/snapcore/snapd/overlord/auth" "github.com/snapcore/snapd/overlord/devicestate/devicestatetest" "github.com/snapcore/snapd/overlord/ifacestate" + "github.com/snapcore/snapd/overlord/patch" "github.com/snapcore/snapd/overlord/snapstate" "github.com/snapcore/snapd/overlord/standby" "github.com/snapcore/snapd/overlord/state" @@ -736,6 +738,8 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) { d.Start() defer d.Stop(nil) + st := d.overlord.State() + snapdDone := make(chan struct{}) go func() { select { @@ -775,7 +779,9 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) { return nil } - d.overlord.State().RequestRestart(state.RestartSystem) + st.Lock() + st.RequestRestart(state.RestartSystem) + st.Unlock() defer func() { d.mu.Lock() @@ -798,14 +804,25 @@ func (s *daemonSuite) TestRestartSystemWiring(c *check.C) { c.Check(delays, check.HasLen, 1) c.Check(delays[0], check.DeepEquals, rebootWaitTimeout) + now := time.Now() + err = d.Stop(nil) c.Check(err, check.ErrorMatches, "expected reboot did not happen") + c.Check(delays, check.HasLen, 2) c.Check(delays[1], check.DeepEquals, 1*time.Minute) // we are not stopping, we wait for the reboot instead c.Check(s.notified, check.DeepEquals, []string{"READY=1"}) + + st.Lock() + defer st.Unlock() + var rebootAt time.Time + err = st.Get("daemon-system-restart-at", &rebootAt) + c.Assert(err, check.IsNil) + approxAt := now.Add(time.Minute) + c.Check(rebootAt.After(approxAt) || rebootAt.Equal(approxAt), check.Equals, true) } func (s *daemonSuite) TestRebootHelper(c *check.C) { @@ -820,7 +837,7 @@ func (s *daemonSuite) TestRebootHelper(c *check.C) { {0, "+0"}, {time.Minute, "+1"}, {10 * time.Minute, "+10"}, - {30 * time.Second, "+1"}, + {30 * time.Second, "+0"}, } for _, t := range tests { @@ -867,7 +884,11 @@ func (s *daemonSuite) TestRestartShutdownWithSigtermInBetween(c *check.C) { s.markSeeded(d) d.Start() - d.overlord.State().RequestRestart(state.RestartSystem) + st := d.overlord.State() + + st.Lock() + st.RequestRestart(state.RestartSystem) + st.Unlock() ch := make(chan os.Signal, 2) ch <- syscall.SIGTERM @@ -882,7 +903,6 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) { oldRebootNoticeWait := rebootNoticeWait oldRebootWaitTimeout := rebootWaitTimeout defer func() { - reboot = rebootImpl rebootNoticeWait = oldRebootNoticeWait rebootWaitTimeout = oldRebootWaitTimeout }() @@ -897,7 +917,11 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) { s.markSeeded(d) d.Start() - d.overlord.State().RequestRestart(state.RestartSystem) + st := d.overlord.State() + + st.Lock() + st.RequestRestart(state.RestartSystem) + st.Unlock() sigCh := make(chan os.Signal, 2) // stop (this will timeout but thats not relevant for this test) @@ -908,6 +932,102 @@ func (s *daemonSuite) TestRestartShutdown(c *check.C) { c.Assert(chOpen, check.Equals, false) } +func (s *daemonSuite) TestRestartExpectedRebootDidNotHappen(c *check.C) { + curBootID, err := osutil.BootID() + c.Assert(err, check.IsNil) + + fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s"},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, curBootID, time.Now().UTC().Format(time.RFC3339))) + err = ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600) + c.Assert(err, check.IsNil) + + oldRebootNoticeWait := rebootNoticeWait + oldRebootRetryWaitTimeout := rebootRetryWaitTimeout + defer func() { + rebootNoticeWait = oldRebootNoticeWait + rebootRetryWaitTimeout = oldRebootRetryWaitTimeout + }() + rebootRetryWaitTimeout = 100 * time.Millisecond + rebootNoticeWait = 150 * time.Millisecond + + cmd := testutil.MockCommand(c, "shutdown", "") + defer cmd.Restore() + + d := newTestDaemon(c) + c.Check(d.overlord, check.IsNil) + c.Check(d.expectedRebootDidNotHappen, check.Equals, true) + + var n int + d.state.Lock() + err = d.state.Get("daemon-system-restart-tentative", &n) + d.state.Unlock() + c.Check(err, check.IsNil) + c.Check(n, check.Equals, 1) + + d.Start() + + c.Check(s.notified, check.DeepEquals, []string{"READY=1"}) + + select { + case <-d.Dying(): + case <-time.After(2 * time.Second): + c.Fatal("expected reboot not happening should proceed to try to shutdown again") + } + + sigCh := make(chan os.Signal, 2) + // stop (this will timeout but thats not relevant for this test) + d.Stop(sigCh) + + // an immediate shutdown was scheduled again + c.Check(cmd.Calls(), check.DeepEquals, [][]string{ + {"shutdown", "-r", "+0", "reboot scheduled to update the system"}, + }) +} + +func (s *daemonSuite) TestRestartExpectedRebootOK(c *check.C) { + fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s"},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, "boot-id-0", time.Now().UTC().Format(time.RFC3339))) + err := ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600) + c.Assert(err, check.IsNil) + + cmd := testutil.MockCommand(c, "shutdown", "") + defer cmd.Restore() + + d := newTestDaemon(c) + c.Assert(d.overlord, check.NotNil) + + st := d.overlord.State() + st.Lock() + defer st.Unlock() + var v interface{} + // these were cleared + c.Check(st.Get("daemon-system-restart-at", &v), check.Equals, state.ErrNoState) + c.Check(st.Get("system-restart-from-boot-id", &v), check.Equals, state.ErrNoState) +} + +func (s *daemonSuite) TestRestartExpectedRebootGiveUp(c *check.C) { + // we give up trying to restart the system after 3 retry tentatives + curBootID, err := osutil.BootID() + c.Assert(err, check.IsNil) + + fakeState := []byte(fmt.Sprintf(`{"data":{"patch-level":%d,"patch-sublevel":%d,"some":"data","refresh-privacy-key":"0123456789ABCDEF","system-restart-from-boot-id":%q,"daemon-system-restart-at":"%s","daemon-system-restart-tentative":3},"changes":null,"tasks":null,"last-change-id":0,"last-task-id":0,"last-lane-id":0}`, patch.Level, patch.Sublevel, curBootID, time.Now().UTC().Format(time.RFC3339))) + err = ioutil.WriteFile(dirs.SnapStateFile, fakeState, 0600) + c.Assert(err, check.IsNil) + + cmd := testutil.MockCommand(c, "shutdown", "") + defer cmd.Restore() + + d := newTestDaemon(c) + c.Assert(d.overlord, check.NotNil) + + st := d.overlord.State() + st.Lock() + defer st.Unlock() + var v interface{} + // these were cleared + c.Check(st.Get("daemon-system-restart-at", &v), check.Equals, state.ErrNoState) + c.Check(st.Get("system-restart-from-boot-id", &v), check.Equals, state.ErrNoState) + c.Check(st.Get("daemon-system-restart-tentative", &v), check.Equals, state.ErrNoState) +} + func (s *daemonSuite) TestRestartIntoSocketModeNoNewChanges(c *check.C) { restore := standby.MockStandbyWait(5 * time.Millisecond) defer restore() |
