Add ability to perform an etcd on-demand snapshot via cli (#2819)

* add ability to perform an etcd on-demand snapshot via cli
This commit is contained in:
Brian Downs 2021-01-21 14:09:15 -07:00 committed by GitHub
parent 84f6655342
commit 13229019f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 205 additions and 26 deletions

22
cmd/etcdsnapshot/main.go Normal file
View File

@ -0,0 +1,22 @@
package main
import (
"os"
"github.com/rancher/k3s/pkg/cli/cmds"
"github.com/rancher/k3s/pkg/cli/etcdsnapshot"
"github.com/rancher/k3s/pkg/configfilearg"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
)
// main builds a CLI application whose only command is the etcd snapshot
// command, then runs it with the process arguments after they have been
// passed through configfilearg.MustParse. Any error returned by the
// application is reported through logrus.Fatal.
func main() {
	app := cmds.NewApp()

	snapshotCmd := cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run)
	app.Commands = []cli.Command{snapshotCmd}

	args := configfilearg.MustParse(os.Args)
	err := app.Run(args)
	if err != nil {
		logrus.Fatal(err)
	}
}

View File

@ -35,10 +35,10 @@ func main() {
cmds.NewCRICTL(externalCLIAction("crictl", dataDir)), cmds.NewCRICTL(externalCLIAction("crictl", dataDir)),
cmds.NewCtrCommand(externalCLIAction("ctr", dataDir)), cmds.NewCtrCommand(externalCLIAction("ctr", dataDir)),
cmds.NewCheckConfigCommand(externalCLIAction("check-config", dataDir)), cmds.NewCheckConfigCommand(externalCLIAction("check-config", dataDir)),
cmds.NewEtcdSnapshotCommand(wrap(version.Program+"-"+cmds.EtcdSnapshotCommand, dataDir, os.Args)),
} }
err := app.Run(os.Args) if err := app.Run(os.Args); err != nil {
if err != nil {
logrus.Fatal(err) logrus.Fatal(err)
} }
} }
@ -96,7 +96,7 @@ func externalCLI(cli, dataDir string, args []string) error {
return stageAndRun(dataDir, cli, append([]string{cli}, args...)) return stageAndRun(dataDir, cli, append([]string{cli}, args...))
} }
func wrap(cmd string, dataDir string, args []string) func(ctx *cli.Context) error { func wrap(cmd, dataDir string, args []string) func(ctx *cli.Context) error {
return func(ctx *cli.Context) error { return func(ctx *cli.Context) error {
return stageAndRunCLI(ctx, cmd, dataDir, args) return stageAndRunCLI(ctx, cmd, dataDir, args)
} }
@ -111,7 +111,7 @@ func stageAndRunCLI(cli *cli.Context, cmd string, dataDir string, args []string)
return stageAndRun(dataDir, cmd, args) return stageAndRun(dataDir, cmd, args)
} }
func stageAndRun(dataDir string, cmd string, args []string) error { func stageAndRun(dataDir, cmd string, args []string) error {
dir, err := extract(dataDir) dir, err := extract(dataDir)
if err != nil { if err != nil {
return errors.Wrap(err, "extracting data") return errors.Wrap(err, "extracting data")

View File

@ -10,6 +10,7 @@ import (
"github.com/rancher/k3s/pkg/cli/cmds" "github.com/rancher/k3s/pkg/cli/cmds"
"github.com/rancher/k3s/pkg/cli/crictl" "github.com/rancher/k3s/pkg/cli/crictl"
"github.com/rancher/k3s/pkg/cli/ctr" "github.com/rancher/k3s/pkg/cli/ctr"
"github.com/rancher/k3s/pkg/cli/etcdsnapshot"
"github.com/rancher/k3s/pkg/cli/kubectl" "github.com/rancher/k3s/pkg/cli/kubectl"
"github.com/rancher/k3s/pkg/cli/server" "github.com/rancher/k3s/pkg/cli/server"
"github.com/rancher/k3s/pkg/configfilearg" "github.com/rancher/k3s/pkg/configfilearg"
@ -42,6 +43,7 @@ func main() {
cmds.NewKubectlCommand(kubectl.Run), cmds.NewKubectlCommand(kubectl.Run),
cmds.NewCRICTL(crictl.Run), cmds.NewCRICTL(crictl.Run),
cmds.NewCtrCommand(ctr.Run), cmds.NewCtrCommand(ctr.Run),
cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run),
} }
err := app.Run(configfilearg.MustParse(os.Args)) err := app.Run(configfilearg.MustParse(os.Args))

View File

@ -12,6 +12,7 @@ import (
"github.com/rancher/k3s/pkg/cli/agent" "github.com/rancher/k3s/pkg/cli/agent"
"github.com/rancher/k3s/pkg/cli/cmds" "github.com/rancher/k3s/pkg/cli/cmds"
"github.com/rancher/k3s/pkg/cli/crictl" "github.com/rancher/k3s/pkg/cli/crictl"
"github.com/rancher/k3s/pkg/cli/etcdsnapshot"
"github.com/rancher/k3s/pkg/cli/kubectl" "github.com/rancher/k3s/pkg/cli/kubectl"
"github.com/rancher/k3s/pkg/cli/server" "github.com/rancher/k3s/pkg/cli/server"
"github.com/rancher/k3s/pkg/configfilearg" "github.com/rancher/k3s/pkg/configfilearg"
@ -26,6 +27,7 @@ func main() {
cmds.NewAgentCommand(agent.Run), cmds.NewAgentCommand(agent.Run),
cmds.NewKubectlCommand(kubectl.Run), cmds.NewKubectlCommand(kubectl.Run),
cmds.NewCRICTL(crictl.Run), cmds.NewCRICTL(crictl.Run),
cmds.NewEtcdSnapshotCommand(etcdsnapshot.Run),
} }
if err := app.Run(configfilearg.MustParse(os.Args)); err != nil { if err := app.Run(configfilearg.MustParse(os.Args)); err != nil {

View File

@ -0,0 +1,39 @@
package cmds
import (
"github.com/rancher/k3s/pkg/version"
"github.com/urfave/cli"
)
const EtcdSnapshotCommand = "etcd-snapshot"
// NewEtcdSnapshotCommand returns the cli.Command definition for the
// etcd-snapshot subcommand, wired to the supplied action.
//
// Besides the shared debug and logging flags, the command accepts:
//   - data-dir (alias d): stored in ServerConfig.DataDir
//   - name: stored in ServerConfig.EtcdSnapshotName, defaulting to "on-demand"
//   - dir: stored in ServerConfig.EtcdSnapshotDir
func NewEtcdSnapshotCommand(action func(*cli.Context) error) cli.Command {
	dataDirFlag := cli.StringFlag{
		Name:        "data-dir,d",
		Usage:       "(data) Folder to hold state default /var/lib/rancher/" + version.Program + " or ${HOME}/.rancher/" + version.Program + " if not root",
		Destination: &ServerConfig.DataDir,
	}
	snapshotNameFlag := &cli.StringFlag{
		Name:        "name",
		Usage:       "(db) Set the base name of the etcd on-demand snapshot (appended with UNIX timestamp).",
		Destination: &ServerConfig.EtcdSnapshotName,
		Value:       "on-demand",
	}
	snapshotDirFlag := &cli.StringFlag{
		Name:        "dir",
		Usage:       "(db) Directory to save etcd on-demand snapshot. (default: ${data-dir}/db/snapshots)",
		Destination: &ServerConfig.EtcdSnapshotDir,
	}

	flags := []cli.Flag{
		DebugFlag,
		LogFile,
		AlsoLogToStderr,
		dataDirFlag,
		snapshotNameFlag,
		snapshotDirFlag,
	}

	return cli.Command{
		Name:            EtcdSnapshotCommand,
		Usage:           "Trigger an immediate etcd snapshot",
		SkipFlagParsing: false,
		SkipArgReorder:  true,
		Action:          action,
		Flags:           flags,
	}
}

View File

@ -58,6 +58,7 @@ type Server struct {
ClusterResetRestorePath string ClusterResetRestorePath string
EncryptSecrets bool EncryptSecrets bool
StartupHooks []func(context.Context, <-chan struct{}, string) error StartupHooks []func(context.Context, <-chan struct{}, string) error
EtcdSnapshotName string
EtcdDisableSnapshots bool EtcdDisableSnapshots bool
EtcdSnapshotDir string EtcdSnapshotDir string
EtcdSnapshotCron string EtcdSnapshotCron string
@ -214,6 +215,12 @@ func NewServerCommand(action func(*cli.Context) error) cli.Command {
Usage: "(db) Disable automatic etcd snapshots", Usage: "(db) Disable automatic etcd snapshots",
Destination: &ServerConfig.EtcdDisableSnapshots, Destination: &ServerConfig.EtcdDisableSnapshots,
}, },
&cli.StringFlag{
Name: "etcd-snapshot-name",
Usage: "(db) Set the base name of etcd snapshots. Default: etcd-snapshot-<unix-timestamp>",
Destination: &ServerConfig.EtcdSnapshotName,
Value: "etcd-snapshot",
},
&cli.StringFlag{ &cli.StringFlag{
Name: "etcd-snapshot-schedule-cron", Name: "etcd-snapshot-schedule-cron",
Usage: "(db) Snapshot interval time in cron spec. eg. every 5 hours '* */5 * * *'", Usage: "(db) Snapshot interval time in cron spec. eg. every 5 hours '* */5 * * *'",

View File

@ -0,0 +1,62 @@
package etcdsnapshot
import (
"context"
"errors"
"os"
"path/filepath"
"github.com/erikdubbelboer/gspt"
"github.com/rancher/k3s/pkg/cli/cmds"
"github.com/rancher/k3s/pkg/cluster"
"github.com/rancher/k3s/pkg/daemons/config"
"github.com/rancher/k3s/pkg/etcd"
"github.com/rancher/k3s/pkg/server"
"github.com/rancher/wrangler/pkg/signals"
"github.com/urfave/cli"
)
// Run is the action for the etcd-snapshot command. It initializes logging
// and, if that succeeds, hands off to run with the package-level server
// configuration populated by the command's flags.
func Run(app *cli.Context) error {
	err := cmds.InitLogging()
	if err != nil {
		return err
	}
	return run(app, &cmds.ServerConfig)
}
// run performs a single on-demand etcd snapshot.
//
// It resolves the server data directory from cfg, builds a minimal server
// configuration (agent disabled, snapshot name and directory taken from cfg,
// retention set to zero, etcd TLS file paths under <dataDir>/tls/etcd), and
// checks that the managed etcd database has been initialized. It then
// bootstraps a cluster from that configuration and asks it for a snapshot.
//
// An error is returned if the data directory cannot be resolved, if etcd is
// not initialized, or if any of the initialization check, bootstrap, or
// snapshot steps fail.
func run(app *cli.Context, cfg *cmds.Server) error {
	gspt.SetProcTitle(os.Args[0])

	dataDir, err := server.ResolveDataDir(cfg.DataDir)
	if err != nil {
		return err
	}

	var serverConfig server.Config
	serverConfig.DisableAgent = true

	control := &serverConfig.ControlConfig
	control.DataDir = dataDir
	control.EtcdSnapshotName = cfg.EtcdSnapshotName
	control.EtcdSnapshotDir = cfg.EtcdSnapshotDir
	control.EtcdSnapshotRetention = 0 // disable retention check

	etcdTLSDir := filepath.Join(dataDir, "tls", "etcd")
	control.Runtime = &config.ControlRuntime{}
	control.Runtime.ETCDServerCA = filepath.Join(etcdTLSDir, "server-ca.crt")
	control.Runtime.ClientETCDCert = filepath.Join(etcdTLSDir, "client.crt")
	control.Runtime.ClientETCDKey = filepath.Join(etcdTLSDir, "client.key")

	ctx := signals.SetupSignalHandler(context.Background())

	ready, err := etcd.NewETCD().IsInitialized(ctx, control)
	switch {
	case err != nil:
		return err
	case !ready:
		return errors.New("managed etcd database has not been initialized")
	}

	// Use a name that does not shadow the imported cluster package.
	c := cluster.New(control)
	if bootstrapErr := c.Bootstrap(ctx); bootstrapErr != nil {
		return bootstrapErr
	}
	return c.Snapshot(ctx, control)
}

View File

@ -111,6 +111,7 @@ func run(app *cli.Context, cfg *cmds.Server) error {
serverConfig.ControlConfig.DisableKubeProxy = cfg.DisableKubeProxy serverConfig.ControlConfig.DisableKubeProxy = cfg.DisableKubeProxy
serverConfig.ControlConfig.ClusterInit = cfg.ClusterInit serverConfig.ControlConfig.ClusterInit = cfg.ClusterInit
serverConfig.ControlConfig.EncryptSecrets = cfg.EncryptSecrets serverConfig.ControlConfig.EncryptSecrets = cfg.EncryptSecrets
serverConfig.ControlConfig.EtcdSnapshotName = cfg.EtcdSnapshotName
serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron serverConfig.ControlConfig.EtcdSnapshotCron = cfg.EtcdSnapshotCron
serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir serverConfig.ControlConfig.EtcdSnapshotDir = cfg.EtcdSnapshotDir
serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention serverConfig.ControlConfig.EtcdSnapshotRetention = cfg.EtcdSnapshotRetention

View File

@ -9,6 +9,7 @@ import (
"github.com/rancher/k3s/pkg/bootstrap" "github.com/rancher/k3s/pkg/bootstrap"
"github.com/rancher/k3s/pkg/clientaccess" "github.com/rancher/k3s/pkg/clientaccess"
"github.com/rancher/k3s/pkg/daemons/config"
"github.com/rancher/k3s/pkg/version" "github.com/rancher/k3s/pkg/version"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
) )
@ -147,3 +148,12 @@ func (c *Cluster) bootstrap(ctx context.Context) error {
func (c *Cluster) bootstrapStamp() string { func (c *Cluster) bootstrapStamp() string {
return filepath.Join(c.config.DataDir, "db/joined-"+keyHash(c.config.Token)) return filepath.Join(c.config.DataDir, "db/joined-"+keyHash(c.config.Token))
} }
// Snapshot forwards a snapshot request to the cluster's managed database
// driver, passing along the given context and control configuration. It
// returns an error when the cluster has no managed database driver set.
func (c *Cluster) Snapshot(ctx context.Context, config *config.Control) error {
	if db := c.managedDB; db != nil {
		return db.Snapshot(ctx, config)
	}
	return errors.New("unable to perform etcd snapshot on non-etcd system")
}

View File

@ -21,6 +21,7 @@ type Driver interface {
Test(ctx context.Context) error Test(ctx context.Context) error
Restore(ctx context.Context) error Restore(ctx context.Context) error
EndpointName() string EndpointName() string
Snapshot(ctx context.Context, config *config.Control) error
} }
func RegisterDriver(d Driver) { func RegisterDriver(d Driver) {

View File

@ -130,6 +130,7 @@ type Control struct {
EncryptSecrets bool EncryptSecrets bool
TLSMinVersion uint16 TLSMinVersion uint16
TLSCipherSuites []uint16 TLSCipherSuites []uint16
EtcdSnapshotName string
EtcdDisableSnapshots bool EtcdDisableSnapshots bool
EtcdSnapshotDir string EtcdSnapshotDir string
EtcdSnapshotCron string EtcdSnapshotCron string

View File

@ -419,7 +419,6 @@ func getClientConfig(ctx context.Context, runtime *config.ControlRuntime, endpoi
if err != nil { if err != nil {
return nil, err return nil, err
} }
cfg := &etcd.Config{ cfg := &etcd.Config{
Endpoints: endpoints, Endpoints: endpoints,
TLS: tlsConfig, TLS: tlsConfig,
@ -428,7 +427,6 @@ func getClientConfig(ctx context.Context, runtime *config.ControlRuntime, endpoi
DialKeepAliveTime: defaultKeepAliveTime, DialKeepAliveTime: defaultKeepAliveTime,
DialKeepAliveTimeout: defaultKeepAliveTimeout, DialKeepAliveTimeout: defaultKeepAliveTimeout,
} }
return cfg, nil return cfg, nil
} }
@ -723,48 +721,80 @@ func snapshotDir(config *config.Control) (string, error) {
return config.EtcdSnapshotDir, nil return config.EtcdSnapshotDir, nil
} }
// snapshot attempts to save a new snapshot to the configured directory, and then clean up any old // preSnapshotSetup checks to see if the necessary components are in place
// snapshots in excess of the retention limits. // to perform an Etcd snapshot. This is necessary primarily for on-demand
func (e *ETCD) snapshot(ctx context.Context) { // snapshots since they're performed before normal Etcd setup is completed.
// preSnapshotSetup fills in whichever of the receiver's client, config, and
// runtime fields are still unset so that a snapshot can be attempted. It
// returns the error from getClient if a client has to be created and that
// call fails; otherwise it returns nil.
func (e *ETCD) preSnapshotSetup(ctx context.Context, config *config.Control) error {
// Only build a client when one has not been set on the receiver.
if e.client == nil {
// Fall back to the supplied config only when the receiver has none;
// note this fallback is applied solely on the no-client path.
if e.config == nil {
e.config = config
}
client, err := getClient(ctx, e.config.Runtime, endpoint)
if err != nil {
// On failure e.client stays nil and e.runtime is left untouched.
return err
}
e.client = client
}
// The runtime always comes from the supplied config when unset.
if e.runtime == nil {
e.runtime = config.Runtime
}
return nil
}
// Snapshot attempts to save a new snapshot to the configured directory, and then clean up any old
// snapshots in excess of the retention limits. This method is used in the internal cron snapshot
// system as well as used to do on-demand snapshots.
func (e *ETCD) Snapshot(ctx context.Context, config *config.Control) error {
if err := e.preSnapshotSetup(ctx, config); err != nil {
return err
}
status, err := e.client.Status(ctx, endpoint) status, err := e.client.Status(ctx, endpoint)
if err != nil { if err != nil {
logrus.Errorf("Failed to check etcd status for snapshot: %v", err) return errors.Wrap(err, "failed to check etcd status for snapshot")
return
} }
if status.IsLearner { if status.IsLearner {
logrus.Warnf("Skipping snapshot: not supported for learner") logrus.Warnf("Skipping snapshot: not supported for learner")
return return nil
} }
snapshotDir, err := snapshotDir(e.config) snapshotDir, err := snapshotDir(e.config)
if err != nil { if err != nil {
logrus.Errorf("Failed to get the snapshot dir: %v", err) return errors.Wrap(err, "failed to get the snapshot dir")
return
} }
cfg, err := getClientConfig(ctx, e.runtime, endpoint) cfg, err := getClientConfig(ctx, e.runtime, endpoint)
if err != nil { if err != nil {
logrus.Errorf("Failed to get config for etcd snapshot: %v", err) return errors.Wrap(err, "failed to get config for etcd snapshot")
return
} }
snapshotPath := filepath.Join(snapshotDir, snapshotPrefix+strconv.Itoa(int(time.Now().Unix()))) snapshotName := fmt.Sprintf("%s-%d", e.config.EtcdSnapshotName, time.Now().Unix())
snapshotPath := filepath.Join(snapshotDir, snapshotName)
logrus.Infof("Saving etcd snapshot to %s", snapshotPath) logrus.Infof("Saving etcd snapshot to %s", snapshotPath)
if err := snapshot.NewV3(nil).Save(ctx, *cfg, snapshotPath); err != nil { if err := snapshot.NewV3(nil).Save(ctx, *cfg, snapshotPath); err != nil {
logrus.Errorf("Failed to save snapshot: %v", err) return errors.Wrap(err, "failed to save snapshot")
return
} }
if err := snapshotRetention(e.config.EtcdSnapshotRetention, snapshotDir); err != nil {
logrus.Errorf("Failed to apply snapshot retention: %v", err) // check if we need to perform a retention check
return if e.config.EtcdSnapshotRetention >= 1 {
if err := snapshotRetention(e.config.EtcdSnapshotRetention, snapshotDir); err != nil {
return errors.Wrap(err, "failed to apply snapshot retention")
}
} }
return nil
} }
// setSnapshotFunction schedules snapshots at the configured interval // setSnapshotFunction schedules snapshots at the configured interval
func (e *ETCD) setSnapshotFunction(ctx context.Context) { func (e *ETCD) setSnapshotFunction(ctx context.Context) {
e.cron.AddFunc(e.config.EtcdSnapshotCron, func() { e.snapshot(ctx) }) e.cron.AddFunc(e.config.EtcdSnapshotCron, func() {
if err := e.Snapshot(ctx, e.config); err != nil {
logrus.Error(err)
}
})
} }
// Restore performs a restore of the ETCD datastore from // Restore performs a restore of the ETCD datastore from

View File

@ -42,7 +42,7 @@ const (
ControlPlaneRoleLabelKey = "node-role.kubernetes.io/control-plane" ControlPlaneRoleLabelKey = "node-role.kubernetes.io/control-plane"
) )
func resolveDataDir(dataDir string) (string, error) { func ResolveDataDir(dataDir string) (string, error) {
dataDir, err := datadir.Resolve(dataDir) dataDir, err := datadir.Resolve(dataDir)
return filepath.Join(dataDir, "server"), err return filepath.Join(dataDir, "server"), err
} }
@ -322,7 +322,7 @@ func setupDataDirAndChdir(config *config.Control) error {
err error err error
) )
config.DataDir, err = resolveDataDir(config.DataDir) config.DataDir, err = ResolveDataDir(config.DataDir)
if err != nil { if err != nil {
return err return err
} }

View File

@ -77,6 +77,7 @@ rm -f \
bin/containerd-shim-runc-v1 \ bin/containerd-shim-runc-v1 \
bin/containerd-shim-runc-v2 \ bin/containerd-shim-runc-v2 \
bin/k3s-server \ bin/k3s-server \
bin/k3s-etcd-snapshot \
bin/kubectl \ bin/kubectl \
bin/crictl \ bin/crictl \
bin/ctr bin/ctr
@ -105,6 +106,7 @@ echo Building server
CGO_ENABLED=1 "${GO}" build -tags "$TAGS" -ldflags "$VERSIONFLAGS $LDFLAGS $STATIC_SQLITE" -o bin/containerd ./cmd/server/main.go CGO_ENABLED=1 "${GO}" build -tags "$TAGS" -ldflags "$VERSIONFLAGS $LDFLAGS $STATIC_SQLITE" -o bin/containerd ./cmd/server/main.go
ln -s containerd ./bin/k3s-agent ln -s containerd ./bin/k3s-agent
ln -s containerd ./bin/k3s-server ln -s containerd ./bin/k3s-server
ln -s containerd ./bin/k3s-etcd-snapshot
ln -s containerd ./bin/kubectl ln -s containerd ./bin/kubectl
ln -s containerd ./bin/crictl ln -s containerd ./bin/crictl
ln -s containerd ./bin/ctr ln -s containerd ./bin/ctr

View File

@ -7,7 +7,7 @@ cd $(dirname $0)/..
GO=${GO-go} GO=${GO-go}
for i in crictl kubectl k3s-agent k3s-server k3s; do for i in crictl kubectl k3s-agent k3s-server k3s-etcd-snapshot k3s; do
rm -f bin/$i rm -f bin/$i
ln -s containerd bin/$i ln -s containerd bin/$i
done done