From ccb09f627a0556227228429b8744b5e2e84a5992 Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Thu, 16 Jun 2022 08:22:42 -0700 Subject: [PATCH] Delay service readiness until after startuphooks have finished (#5736) * Move startup hooks wg into a runtime pointer, check before notifying systemd * Switch default systemd notification to server * Add 1 sec delay to allow etcd to write to disk Signed-off-by: Derek Nola --- pkg/agent/run.go | 10 ++++++++-- pkg/cli/server/server.go | 11 ++++++----- pkg/daemons/config/types.go | 2 ++ pkg/server/secrets-encrypt.go | 4 ++++ pkg/server/server.go | 12 +++++++----- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pkg/agent/run.go b/pkg/agent/run.go index 1bfc65141d..ee452fc208 100644 --- a/pkg/agent/run.go +++ b/pkg/agent/run.go @@ -29,6 +29,7 @@ import ( "github.com/rancher/k3s/pkg/nodeconfig" "github.com/rancher/k3s/pkg/rootless" "github.com/rancher/k3s/pkg/util" + "github.com/rancher/k3s/pkg/version" "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -128,8 +129,13 @@ func run(ctx context.Context, cfg cmds.Agent, proxy proxy.Proxy) error { } } - os.Setenv("NOTIFY_SOCKET", notifySocket) - systemd.SdNotify(true, "READY=1\n") + // By default, the server is responsible for notifying systemd + // On agent-only nodes, the agent will notify systemd + if notifySocket != "" { + logrus.Info(version.Program + " agent is up and running") + os.Setenv("NOTIFY_SOCKET", notifySocket) + systemd.SdNotify(true, "READY=1\n") + } <-ctx.Done() return ctx.Err() diff --git a/pkg/cli/server/server.go b/pkg/cli/server/server.go index afa0fa9a81..4dd7a8bc5a 100644 --- a/pkg/cli/server/server.go +++ b/pkg/cli/server/server.go @@ -419,6 +419,7 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont logrus.Info("Starting " + version.Program + " " + app.App.Version) notifySocket := os.Getenv("NOTIFY_SOCKET") + os.Unsetenv("NOTIFY_SOCKET") ctx := signals.SetupSignalContext() @@ -430,16 +431,16 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont if !serverConfig.ControlConfig.DisableAPIServer { <-serverConfig.ControlConfig.Runtime.APIServerReady logrus.Info("Kube API server is now running") - } else { + serverConfig.ControlConfig.Runtime.StartupHooksWg.Wait() + } + if !serverConfig.ControlConfig.DisableETCD { <-serverConfig.ControlConfig.Runtime.ETCDReady logrus.Info("ETCD server is now running") } logrus.Info(version.Program + " is up and running") - if (cfg.DisableAgent || cfg.DisableAPIServer) && notifySocket != "" { - os.Setenv("NOTIFY_SOCKET", notifySocket) - systemd.SdNotify(true, "READY=1\n") - } + os.Setenv("NOTIFY_SOCKET", notifySocket) + systemd.SdNotify(true, "READY=1\n") }() ip := serverConfig.ControlConfig.BindAddress diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index da74db109a..fb41bb720a 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -8,6 +8,7 @@ import ( "net/http" "sort" "strings" + "sync" "time" "github.com/k3s-io/kine/pkg/endpoint" @@ -216,6 +217,7 @@ type ControlRuntime struct { APIServerReady <-chan struct{} AgentReady <-chan struct{} ETCDReady <-chan struct{} + StartupHooksWg *sync.WaitGroup ClusterControllerStart func(ctx context.Context) error LeaderElectedClusterControllerStart func(ctx context.Context) error diff --git a/pkg/server/secrets-encrypt.go b/pkg/server/secrets-encrypt.go index f1d153f5e5..a4a8c4587c 100644 --- a/pkg/server/secrets-encrypt.go +++ b/pkg/server/secrets-encrypt.go @@ -186,6 +186,10 @@ func encryptionConfigHandler(ctx context.Context, server *config.Control) http.H genErrorMessage(resp, http.StatusBadRequest, err) return } + // If a user kills the k3s server immediately after this call, we run into issues where the files + // have not yet been written. This sleep ensures that things have time to sync to disk before + // the request completes. + time.Sleep(1 * time.Second) resp.WriteHeader(http.StatusOK) }) } diff --git a/pkg/server/server.go b/pkg/server/server.go index 3ae2426044..06b85056ca 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -66,6 +66,8 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { wg.Add(len(config.StartupHooks)) config.ControlConfig.Runtime.Handler = router(ctx, config, cfg) + config.ControlConfig.Runtime.StartupHooksWg = wg + shArgs := cmds.StartupHookArgs{ APIServerReady: config.ControlConfig.Runtime.APIServerReady, KubeConfigAdmin: config.ControlConfig.Runtime.KubeConfigAdmin, @@ -81,7 +83,7 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { if config.ControlConfig.DisableAPIServer { go setETCDLabelsAndAnnotations(ctx, config) } else { - go startOnAPIServerReady(ctx, wg, config) + go startOnAPIServerReady(ctx, config) } ip := net2.ParseIP(config.ControlConfig.BindAddress) @@ -101,18 +103,18 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { return writeKubeConfig(config.ControlConfig.Runtime.ServerCA, config) } -func startOnAPIServerReady(ctx context.Context, wg *sync.WaitGroup, config *Config) { +func startOnAPIServerReady(ctx context.Context, config *Config) { select { case <-ctx.Done(): return case <-config.ControlConfig.Runtime.APIServerReady: - if err := runControllers(ctx, wg, config); err != nil { + if err := runControllers(ctx, config); err != nil { logrus.Fatalf("failed to start controllers: %v", err) } } } -func runControllers(ctx context.Context, wg *sync.WaitGroup, config *Config) error { +func runControllers(ctx context.Context, config *Config) error { controlConfig := &config.ControlConfig sc, err := NewContext(ctx, controlConfig.Runtime.KubeConfigAdmin) @@ -120,7 +122,7 @@ func runControllers(ctx context.Context, wg *sync.WaitGroup, config *Config) err return err } - wg.Wait() + controlConfig.Runtime.StartupHooksWg.Wait() if err := stageFiles(ctx, sc, controlConfig); err != nil { return err }