Add jitter to scheduled snapshots and retry harder on conflicts

Also ensure that the snapshot job does not attempt to trigger multiple concurrent runs, as this is not supported.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
Brad Davidson authored on 2023-01-10 18:51:39 +00:00; committed by Brad Davidson
parent f0ec6a4c12
commit a298bfdb18
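
The "retry harder on conflicts" half of the change swaps retry.DefaultBackoff for a more persistent backoff in addSnapshotData (see the hunks below). As a rough, self-contained sketch of the retry.OnError pattern involved, where updateSnapshotConfigMap is a hypothetical stand-in for the real ConfigMap update and not a k3s function:

package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/retry"
)

// updateSnapshotConfigMap is a hypothetical stand-in for the ConfigMap update
// that addSnapshotData performs; it is not part of k3s.
func updateSnapshotConfigMap() error { return nil }

func main() {
	// retry.OnError re-runs the function while the predicate returns true,
	// sleeping per the supplied backoff between attempts. Conflict and
	// already-exists errors are what concurrent writers to the same ConfigMap
	// produce, so only those are worth retrying.
	err := retry.OnError(retry.DefaultBackoff, func(err error) bool {
		return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
	}, updateSnapshotConfigMap)
	if err != nil {
		fmt.Println("giving up:", err)
	}
}

The diff below keeps this error predicate but replaces retry.DefaultBackoff, which gives up after only a few hundred milliseconds of sleeping, with the much longer snapshotDataBackoff.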

@@ -10,6 +10,7 @@ import (
"fmt"
"io"
"io/fs"
"math/rand"
"net"
"net/http"
"net/url"
@@ -49,6 +50,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
)
@@ -58,6 +60,10 @@ const (
learnerMaxStallTime = time.Minute * 5
memberRemovalTimeout = time.Minute * 1
// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
// will be a random Duration somewhere between 0 and snapshotJitterMax.
snapshotJitterMax = time.Second * 5
// defaultDialTimeout is intentionally short so that connections timeout within the testTimeout defined above
defaultDialTimeout = 2 * time.Second
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
@@ -81,6 +87,19 @@ var (
snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata"
snapshotConfigMapName = version.Program + "-etcd-snapshots"
// snapshotDataBackoff will retry at increasing steps for up to ~30 seconds.
// If the ConfigMap update fails, the list won't be reconciled again until next time
// the server starts, so we should be fairly persistent in retrying.
snapshotDataBackoff = wait.Backoff{
Steps: 9,
Duration: 10 * time.Millisecond,
Factor: 3.0,
Jitter: 0.1,
}
// cronLogger wraps logrus's Printf output as cron-compatible logger
cronLogger = cron.VerbosePrintfLogger(logrus.StandardLogger())
NodeNameAnnotation = "etcd." + version.Program + ".cattle.io/node-name"
NodeAddressAnnotation = "etcd." + version.Program + ".cattle.io/node-address"
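
For a sense of how persistent snapshotDataBackoff is: client-go's retry.OnError sleeps between attempts, so 9 steps allow up to 8 sleeps that grow by a factor of 3 from 10ms, which adds up to roughly 33 seconds before jitter. A minimal sketch that prints the progression, using the same values as above with the jitter left at zero so the output is deterministic:

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Same shape as snapshotDataBackoff, minus the 10% jitter.
	backoff := wait.Backoff{
		Steps:    9,
		Duration: 10 * time.Millisecond,
		Factor:   3.0,
	}

	var total time.Duration
	// 9 attempts mean at most 8 sleeps between them.
	for i := 0; i < 8; i++ {
		step := backoff.Step() // 10ms, 30ms, 90ms, ..., 21.87s
		total += step
		fmt.Printf("sleep %d: %v (cumulative %v)\n", i+1, step, total)
	}
	// The cumulative delay ends up around 32.8s, matching the "~30 seconds"
	// in the comment above.
}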
@@ -139,7 +158,7 @@ func errNotMember() error { return &MembershipError{} }
// ETCD with an initialized cron value.
func NewETCD() *ETCD {
return &ETCD{
cron: cron.New(),
cron: cron.New(cron.WithLogger(cronLogger)),
}
}
@@ -1712,7 +1731,7 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error {
// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata
// available at the time.
func (e *ETCD) addSnapshotData(sf snapshotFile) error {
return retry.OnError(retry.DefaultBackoff, func(err error) bool {
return retry.OnError(snapshotDataBackoff, func(err error) bool {
return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
}, func() error {
// make sure the core.Factory is initialized. There can
@@ -1920,11 +1939,16 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
// setSnapshotFunction schedules snapshots at the configured interval.
func (e *ETCD) setSnapshotFunction(ctx context.Context) {
e.cron.AddFunc(e.config.EtcdSnapshotCron, func() {
skipJob := cron.SkipIfStillRunning(cronLogger)
e.cron.AddJob(e.config.EtcdSnapshotCron, skipJob(cron.FuncJob(func() {
// Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers,
// having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing
// when updating the snapshot list configmap.
time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax)))
if err := e.Snapshot(ctx, e.config); err != nil {
logrus.Error(err)
}
})
})))
}
// Restore performs a restore of the ETCD datastore from
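
Two pieces of the last hunk are worth spelling out: cron.SkipIfStillRunning is a robfig/cron v3 JobWrapper that logs and drops a tick while the previous run of the same job is still in flight, and rand.Float64() is uniform on [0, 1), so the sleep lands somewhere in [0, snapshotJitterMax). A stripped-down sketch of the same wiring, assuming a hypothetical five-second jitter cap and a log line in place of the real Snapshot call:

package main

import (
	"math/rand"
	"time"

	"github.com/robfig/cron/v3"
	"github.com/sirupsen/logrus"
)

func main() {
	logger := cron.VerbosePrintfLogger(logrus.StandardLogger())
	c := cron.New(cron.WithLogger(logger))

	// SkipIfStillRunning skips (and logs) a scheduled run if the previous
	// invocation of this job has not finished yet, so snapshots never overlap.
	skipJob := cron.SkipIfStillRunning(logger)

	const jitterMax = 5 * time.Second // stands in for snapshotJitterMax
	c.AddJob("* * * * *", skipJob(cron.FuncJob(func() {
		// Stagger servers that share the same cron spec by sleeping a random
		// duration in [0, jitterMax) before doing any work.
		time.Sleep(time.Duration(rand.Float64() * float64(jitterMax)))
		logrus.Info("taking snapshot") // stand-in for e.Snapshot(ctx, e.config)
	})))

	c.Start()
	time.Sleep(time.Hour) // keep the demo alive; the real code ties the cron lifecycle to the server context
}

Staggering the servers this way reduces the conflict retries that the larger snapshotDataBackoff above exists to absorb.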