From 13229019f8d4ff24e0c0be7d4f10fa200fadbbb9 Mon Sep 17 00:00:00 2001 From: Brian Downs Date: Thu, 21 Jan 2021 14:09:15 -0700 Subject: [PATCH] Add ability to perform an etcd on-demand snapshot via cli (#2819) * add ability to perform an etcd on-demand snapshot via cli --- cmd/etcdsnapshot/main.go | 22 +++++++++ cmd/k3s/main.go | 8 ++-- cmd/server/main.go | 2 + main.go | 2 + pkg/cli/cmds/etcd_snapshot.go | 39 +++++++++++++++ pkg/cli/cmds/server.go | 7 +++ pkg/cli/etcdsnapshot/etcd_snapshot.go | 62 ++++++++++++++++++++++++ pkg/cli/server/server.go | 1 + pkg/cluster/bootstrap.go | 10 ++++ pkg/cluster/managed/drivers.go | 1 + pkg/daemons/config/types.go | 1 + pkg/etcd/etcd.go | 68 +++++++++++++++++++-------- pkg/server/server.go | 4 +- scripts/build | 2 + scripts/package-cli | 2 +- 15 files changed, 205 insertions(+), 26 deletions(-) create mode 100644 cmd/etcdsnapshot/main.go create mode 100644 pkg/cli/cmds/etcd_snapshot.go create mode 100644 pkg/cli/etcdsnapshot/etcd_snapshot.go diff --git a/cmd/etcdsnapshot/main.go b/cmd/etcdsnapshot/main.go new file mode 100644 index 0000000000..5d4520ed7c --- /dev/null +++ b/cmd/etcdsnapshot/main.go @@ -0,0 +1,22 @@ +package main + +import ( + "os" + + "github.com/rancher/k3s/pkg/cli/cmds" + "github.com/rancher/k3s/pkg/cli/etcdsnapshot" + "github.com/rancher/k3s/pkg/configfilearg" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +func main() { + app := cmds.NewApp() + app.Commands = []cli.Command{ + cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run), + } + + if err := app.Run(configfilearg.MustParse(os.Args)); err != nil { + logrus.Fatal(err) + } +} diff --git a/cmd/k3s/main.go b/cmd/k3s/main.go index aee99fe017..061c1c47c2 100644 --- a/cmd/k3s/main.go +++ b/cmd/k3s/main.go @@ -35,10 +35,10 @@ func main() { cmds.NewCRICTL(externalCLIAction("crictl", dataDir)), cmds.NewCtrCommand(externalCLIAction("ctr", dataDir)), cmds.NewCheckConfigCommand(externalCLIAction("check-config", dataDir)), + cmds.NewEtcdSnapshotCommand(wrap(version.Program+"-"+cmds.EtcdSnapshotCommand, dataDir, os.Args)), } - err := app.Run(os.Args) - if err != nil { + if err := app.Run(os.Args); err != nil { logrus.Fatal(err) } } @@ -96,7 +96,7 @@ func externalCLI(cli, dataDir string, args []string) error { return stageAndRun(dataDir, cli, append([]string{cli}, args...)) } -func wrap(cmd string, dataDir string, args []string) func(ctx *cli.Context) error { +func wrap(cmd, dataDir string, args []string) func(ctx *cli.Context) error { return func(ctx *cli.Context) error { return stageAndRunCLI(ctx, cmd, dataDir, args) } @@ -111,7 +111,7 @@ func stageAndRunCLI(cli *cli.Context, cmd string, dataDir string, args []string) return stageAndRun(dataDir, cmd, args) } -func stageAndRun(dataDir string, cmd string, args []string) error { +func stageAndRun(dataDir, cmd string, args []string) error { dir, err := extract(dataDir) if err != nil { return errors.Wrap(err, "extracting data") diff --git a/cmd/server/main.go b/cmd/server/main.go index fffafb7248..528d5369b4 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -10,6 +10,7 @@ import ( "github.com/rancher/k3s/pkg/cli/cmds" "github.com/rancher/k3s/pkg/cli/crictl" "github.com/rancher/k3s/pkg/cli/ctr" + "github.com/rancher/k3s/pkg/cli/etcdsnapshot" "github.com/rancher/k3s/pkg/cli/kubectl" "github.com/rancher/k3s/pkg/cli/server" "github.com/rancher/k3s/pkg/configfilearg" @@ -42,6 +43,7 @@ func main() { cmds.NewKubectlCommand(kubectl.Run), cmds.NewCRICTL(crictl.Run), cmds.NewCtrCommand(ctr.Run), + cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run), } err := app.Run(configfilearg.MustParse(os.Args)) diff --git a/main.go b/main.go index c7e02278f1..06e239400d 100644 --- a/main.go +++ b/main.go @@ -12,6 +12,7 @@ import ( "github.com/rancher/k3s/pkg/cli/agent" "github.com/rancher/k3s/pkg/cli/cmds" "github.com/rancher/k3s/pkg/cli/crictl" + "github.com/rancher/k3s/pkg/cli/etcdsnapshot" "github.com/rancher/k3s/pkg/cli/kubectl" "github.com/rancher/k3s/pkg/cli/server" "github.com/rancher/k3s/pkg/configfilearg" @@ -26,6 +27,7 @@ func main() { cmds.NewAgentCommand(agent.Run), cmds.NewKubectlCommand(kubectl.Run), cmds.NewCRICTL(crictl.Run), + cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run), } if err := app.Run(configfilearg.MustParse(os.Args)); err != nil { diff --git a/pkg/cli/cmds/etcd_snapshot.go b/pkg/cli/cmds/etcd_snapshot.go new file mode 100644 index 0000000000..1438746ce6 --- /dev/null +++ b/pkg/cli/cmds/etcd_snapshot.go @@ -0,0 +1,39 @@ +package cmds + +import ( + "github.com/rancher/k3s/pkg/version" + "github.com/urfave/cli" +) + +const EtcdSnapshotCommand = "etcd-snapshot" + +func NewEtcdSnapshotCommand(action func(*cli.Context) error) cli.Command { + return cli.Command{ + Name: EtcdSnapshotCommand, + Usage: "Trigger an immediate etcd snapshot", + SkipFlagParsing: false, + SkipArgReorder: true, + Action: action, + Flags: []cli.Flag{ + DebugFlag, + LogFile, + AlsoLogToStderr, + cli.StringFlag{ + Name: "data-dir,d", + Usage: "(data) Folder to hold state default /var/lib/rancher/" + version.Program + " or ${HOME}/.rancher/" + version.Program + " if not root", + Destination: &ServerConfig.DataDir, + }, + &cli.StringFlag{ + Name: "name", + Usage: "(db) Set the base name of the etcd on-demand snapshot (appended with UNIX timestamp).", + Destination: &ServerConfig.EtcdSnapshotName, + Value: "on-demand", + }, + &cli.StringFlag{ + Name: "dir", + Usage: "(db) Directory to save etcd on-demand snapshot. (default: ${data-dir}/db/snapshots)", + Destination: &ServerConfig.EtcdSnapshotDir, + }, + }, + } +} diff --git a/pkg/cli/cmds/server.go b/pkg/cli/cmds/server.go index 09cbffc8c4..d4429c4f1f 100644 --- a/pkg/cli/cmds/server.go +++ b/pkg/cli/cmds/server.go @@ -58,6 +58,7 @@ type Server struct { ClusterResetRestorePath string EncryptSecrets bool StartupHooks []func(context.Context, <-chan struct{}, string) error + EtcdSnapshotName string EtcdDisableSnapshots bool EtcdSnapshotDir string EtcdSnapshotCron string @@ -214,6 +215,12 @@ func NewServerCommand(action func(*cli.Context) error) cli.Command { Usage: "(db) Disable automatic etcd snapshots", Destination: &ServerConfig.EtcdDisableSnapshots, }, + &cli.StringFlag{ + Name: "etcd-snapshot-name", + Usage: "(db) Set the base name of etcd snapshots. Default: etcd-snapshot-", + Destination: &ServerConfig.EtcdSnapshotName, + Value: "etcd-snapshot", + }, &cli.StringFlag{ Name: "etcd-snapshot-schedule-cron", Usage: "(db) Snapshot interval time in cron spec. eg. every 5 hours '* */5 * * *'", diff --git a/pkg/cli/etcdsnapshot/etcd_snapshot.go b/pkg/cli/etcdsnapshot/etcd_snapshot.go new file mode 100644 index 0000000000..5e2b305e75 --- /dev/null +++ b/pkg/cli/etcdsnapshot/etcd_snapshot.go @@ -0,0 +1,62 @@ +package etcdsnapshot + +import ( + "context" + "errors" + "os" + "path/filepath" + + "github.com/erikdubbelboer/gspt" + "github.com/rancher/k3s/pkg/cli/cmds" + "github.com/rancher/k3s/pkg/cluster" + "github.com/rancher/k3s/pkg/daemons/config" + "github.com/rancher/k3s/pkg/etcd" + "github.com/rancher/k3s/pkg/server" + "github.com/rancher/wrangler/pkg/signals" + "github.com/urfave/cli" +) + +func Run(app *cli.Context) error { + if err := cmds.InitLogging(); err != nil { + return err + } + return run(app, &cmds.ServerConfig) +} + +func run(app *cli.Context, cfg *cmds.Server) error { + gspt.SetProcTitle(os.Args[0]) + + dataDir, err := server.ResolveDataDir(cfg.DataDir) + if err != nil { + return err + } + + var serverConfig server.Config + serverConfig.DisableAgent = true + serverConfig.ControlConfig.DataDir = dataDir + serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName + serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir + serverConfig.ControlConfig.EtcdSnapshotRetention = 0 // disable retention check + serverConfig.ControlConfig.Runtime = &config.ControlRuntime{} + serverConfig.ControlConfig.Runtime.ETCDServerCA = filepath.Join(dataDir, "tls", "etcd", "server-ca.crt") + serverConfig.ControlConfig.Runtime.ClientETCDCert = filepath.Join(dataDir, "tls", "etcd", "client.crt") + serverConfig.ControlConfig.Runtime.ClientETCDKey = filepath.Join(dataDir, "tls", "etcd", "client.key") + + ctx := signals.SetupSignalHandler(context.Background()) + + initialized, err := etcd.NewETCD().IsInitialized(ctx, &serverConfig.ControlConfig) + if err != nil { + return err + } + if !initialized { + return errors.New("managed etcd database has not been initialized") + } + + cluster := cluster.New(&serverConfig.ControlConfig) + + if err := cluster.Bootstrap(ctx); err != nil { + return err + } + + return cluster.Snapshot(ctx, &serverConfig.ControlConfig) +} diff --git a/pkg/cli/server/server.go b/pkg/cli/server/server.go index 59107f1252..15c9abea5d 100644 --- a/pkg/cli/server/server.go +++ b/pkg/cli/server/server.go @@ -111,6 +111,7 @@ func run(app *cli.Context, cfg *cmds.Server) error { serverConfig.ControlConfig.DisableKubeProxy = cfg.DisableKubeProxy serverConfig.ControlConfig.ClusterInit = cfg.ClusterInit serverConfig.ControlConfig.EncryptSecrets = cfg.EncryptSecrets + serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go index 13198f430b..24f02a22cc 100644 --- a/pkg/cluster/bootstrap.go +++ b/pkg/cluster/bootstrap.go @@ -9,6 +9,7 @@ import ( "github.com/rancher/k3s/pkg/bootstrap" "github.com/rancher/k3s/pkg/clientaccess" + "github.com/rancher/k3s/pkg/daemons/config" "github.com/rancher/k3s/pkg/version" "github.com/sirupsen/logrus" ) @@ -147,3 +148,12 @@ func (c *Cluster) bootstrap(ctx context.Context) error { func (c *Cluster) bootstrapStamp() string { return filepath.Join(c.config.DataDir, "db/joined-"+keyHash(c.config.Token)) } + +// Snapshot is a proxy method to call the snapshot method on the managedb +// interface for etcd clusters. +func (c *Cluster) Snapshot(ctx context.Context, config *config.Control) error { + if c.managedDB == nil { + return errors.New("unable to perform etcd snapshot on non-etcd system") + } + return c.managedDB.Snapshot(ctx, config) +} diff --git a/pkg/cluster/managed/drivers.go b/pkg/cluster/managed/drivers.go index 097c04b8ee..a92c60c3e5 100644 --- a/pkg/cluster/managed/drivers.go +++ b/pkg/cluster/managed/drivers.go @@ -21,6 +21,7 @@ type Driver interface { Test(ctx context.Context) error Restore(ctx context.Context) error EndpointName() string + Snapshot(ctx context.Context, config *config.Control) error } func RegisterDriver(d Driver) { diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index 52accb3f89..23db3274bc 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -130,6 +130,7 @@ type Control struct { EncryptSecrets bool TLSMinVersion uint16 TLSCipherSuites []uint16 + EtcdSnapshotName string EtcdDisableSnapshots bool EtcdSnapshotDir string EtcdSnapshotCron string diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index 85a98b5fae..220986d400 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -419,7 +419,6 @@ func getClientConfig(ctx context.Context, runtime *config.ControlRuntime, endpoi if err != nil { return nil, err } - cfg := &etcd.Config{ Endpoints: endpoints, TLS: tlsConfig, @@ -428,7 +427,6 @@ func getClientConfig(ctx context.Context, runtime *config.ControlRuntime, endpoi DialKeepAliveTime: defaultKeepAliveTime, DialKeepAliveTimeout: defaultKeepAliveTimeout, } - return cfg, nil } @@ -723,48 +721,80 @@ func snapshotDir(config *config.Control) (string, error) { return config.EtcdSnapshotDir, nil } -// snapshot attempts to save a new snapshot to the configured directory, and then clean up any old -// snapshots in excess of the retention limits. -func (e *ETCD) snapshot(ctx context.Context) { +// preSnapshotSetup checks to see if the necessary components are in place +// to perform an Etcd snapshot. This is necessary primarily for on-demand +// snapshots since they're performed before normal Etcd setup is completed. +func (e *ETCD) preSnapshotSetup(ctx context.Context, config *config.Control) error { + if e.client == nil { + if e.config == nil { + e.config = config + } + client, err := getClient(ctx, e.config.Runtime, endpoint) + if err != nil { + return err + } + e.client = client + } + if e.runtime == nil { + e.runtime = config.Runtime + } + return nil +} + +// Snapshot attempts to save a new snapshot to the configured directory, and then clean up any old +// snapshots in excess of the retention limits. This method is used in the internal cron snapshot +// system as well as used to do on-demand snapshots. +func (e *ETCD) Snapshot(ctx context.Context, config *config.Control) error { + if err := e.preSnapshotSetup(ctx, config); err != nil { + return err + } + status, err := e.client.Status(ctx, endpoint) if err != nil { - logrus.Errorf("Failed to check etcd status for snapshot: %v", err) - return + return errors.Wrap(err, "failed to check etcd status for snapshot") } if status.IsLearner { logrus.Warnf("Skipping snapshot: not supported for learner") - return + return nil } snapshotDir, err := snapshotDir(e.config) if err != nil { - logrus.Errorf("Failed to get the snapshot dir: %v", err) - return + return errors.Wrap(err, "failed to get the snapshot dir") } cfg, err := getClientConfig(ctx, e.runtime, endpoint) if err != nil { - logrus.Errorf("Failed to get config for etcd snapshot: %v", err) - return + return errors.Wrap(err, "failed to get config for etcd snapshot") } - snapshotPath := filepath.Join(snapshotDir, snapshotPrefix+strconv.Itoa(int(time.Now().Unix()))) + snapshotName := fmt.Sprintf("%s-%d", e.config.EtcdSnapshotName, time.Now().Unix()) + snapshotPath := filepath.Join(snapshotDir, snapshotName) + logrus.Infof("Saving etcd snapshot to %s", snapshotPath) if err := snapshot.NewV3(nil).Save(ctx, *cfg, snapshotPath); err != nil { - logrus.Errorf("Failed to save snapshot: %v", err) - return + return errors.Wrap(err, "failed to save snapshot") } - if err := snapshotRetention(e.config.EtcdSnapshotRetention, snapshotDir); err != nil { - logrus.Errorf("Failed to apply snapshot retention: %v", err) - return + + // check if we need to perform a retention check + if e.config.EtcdSnapshotRetention >= 1 { + if err := snapshotRetention(e.config.EtcdSnapshotRetention, snapshotDir); err != nil { + return errors.Wrap(err, "failed to apply snapshot retention") + } } + + return nil } // setSnapshotFunction schedules snapshots at the configured interval func (e *ETCD) setSnapshotFunction(ctx context.Context) { - e.cron.AddFunc(e.config.EtcdSnapshotCron, func() { e.snapshot(ctx) }) + e.cron.AddFunc(e.config.EtcdSnapshotCron, func() { + if err := e.Snapshot(ctx, e.config); err != nil { + logrus.Error(err) + } + }) } // Restore performs a restore of the ETCD datastore from diff --git a/pkg/server/server.go b/pkg/server/server.go index 26adfd58a0..c13ac98612 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -42,7 +42,7 @@ const ( ControlPlaneRoleLabelKey = "node-role.kubernetes.io/control-plane" ) -func resolveDataDir(dataDir string) (string, error) { +func ResolveDataDir(dataDir string) (string, error) { dataDir, err := datadir.Resolve(dataDir) return filepath.Join(dataDir, "server"), err } @@ -322,7 +322,7 @@ func setupDataDirAndChdir(config *config.Control) error { err error ) - config.DataDir, err = resolveDataDir(config.DataDir) + config.DataDir, err = ResolveDataDir(config.DataDir) if err != nil { return err } diff --git a/scripts/build b/scripts/build index bf9b388bd8..fedc2e83a8 100755 --- a/scripts/build +++ b/scripts/build @@ -77,6 +77,7 @@ rm -f \ bin/containerd-shim-runc-v1 \ bin/containerd-shim-runc-v2 \ bin/k3s-server \ + bin/k3s-etcd-snapshot \ bin/kubectl \ bin/crictl \ bin/ctr @@ -105,6 +106,7 @@ echo Building server CGO_ENABLED=1 "${GO}" build -tags "$TAGS" -ldflags "$VERSIONFLAGS $LDFLAGS $STATIC_SQLITE" -o bin/containerd ./cmd/server/main.go ln -s containerd ./bin/k3s-agent ln -s containerd ./bin/k3s-server +ln -s containerd ./bin/k3s-etcd-snapshot ln -s containerd ./bin/kubectl ln -s containerd ./bin/crictl ln -s containerd ./bin/ctr diff --git a/scripts/package-cli b/scripts/package-cli index d1aad638d9..3a37400f7e 100755 --- a/scripts/package-cli +++ b/scripts/package-cli @@ -7,7 +7,7 @@ cd $(dirname $0)/.. GO=${GO-go} -for i in crictl kubectl k3s-agent k3s-server k3s; do +for i in crictl kubectl k3s-agent k3s-server k3s-etcd-snapshot k3s; do rm -f bin/$i ln -s containerd bin/$i done