mirror of
https://github.com/k3s-io/k3s.git
synced 2024-06-07 19:41:36 +00:00
Add jitter to scheduled snapshots and retry harder on conflicts
Also ensure that the snapshot job does not attempt to trigger multiple concurrent runs, as this is not supported. Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
This commit is contained in:
parent
f0ec6a4c12
commit
a298bfdb18
@ -10,6 +10,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
@ -49,6 +50,7 @@ import (
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
utilnet "k8s.io/apimachinery/pkg/util/net"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/client-go/util/retry"
|
||||
)
|
||||
|
||||
@ -58,6 +60,10 @@ const (
|
||||
learnerMaxStallTime = time.Minute * 5
|
||||
memberRemovalTimeout = time.Minute * 1
|
||||
|
||||
// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
|
||||
// will be a random Duration somewhere between 0 and snapshotJitterMax.
|
||||
snapshotJitterMax = time.Second * 5
|
||||
|
||||
// defaultDialTimeout is intentionally short so that connections timeout within the testTimeout defined above
|
||||
defaultDialTimeout = 2 * time.Second
|
||||
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
|
||||
@ -81,6 +87,19 @@ var (
|
||||
snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata"
|
||||
snapshotConfigMapName = version.Program + "-etcd-snapshots"
|
||||
|
||||
// snapshotDataBackoff will retry at increasing steps for up to ~30 seconds.
|
||||
// If the ConfigMap update fails, the list won't be reconciled again until next time
|
||||
// the server starts, so we should be fairly persistent in retrying.
|
||||
snapshotDataBackoff = wait.Backoff{
|
||||
Steps: 9,
|
||||
Duration: 10 * time.Millisecond,
|
||||
Factor: 3.0,
|
||||
Jitter: 0.1,
|
||||
}
|
||||
|
||||
// cronLogger wraps logrus's Printf output as cron-compatible logger
|
||||
cronLogger = cron.VerbosePrintfLogger(logrus.StandardLogger())
|
||||
|
||||
NodeNameAnnotation = "etcd." + version.Program + ".cattle.io/node-name"
|
||||
NodeAddressAnnotation = "etcd." + version.Program + ".cattle.io/node-address"
|
||||
|
||||
@ -139,7 +158,7 @@ func errNotMember() error { return &MembershipError{} }
|
||||
// ETCD with an initialized cron value.
|
||||
func NewETCD() *ETCD {
|
||||
return &ETCD{
|
||||
cron: cron.New(),
|
||||
cron: cron.New(cron.WithLogger(cronLogger)),
|
||||
}
|
||||
}
|
||||
|
||||
@ -1712,7 +1731,7 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error {
|
||||
// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata
|
||||
// available at the time.
|
||||
func (e *ETCD) addSnapshotData(sf snapshotFile) error {
|
||||
return retry.OnError(retry.DefaultBackoff, func(err error) bool {
|
||||
return retry.OnError(snapshotDataBackoff, func(err error) bool {
|
||||
return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
|
||||
}, func() error {
|
||||
// make sure the core.Factory is initialized. There can
|
||||
@ -1920,11 +1939,16 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {
|
||||
|
||||
// setSnapshotFunction schedules snapshots at the configured interval.
|
||||
func (e *ETCD) setSnapshotFunction(ctx context.Context) {
|
||||
e.cron.AddFunc(e.config.EtcdSnapshotCron, func() {
|
||||
skipJob := cron.SkipIfStillRunning(cronLogger)
|
||||
e.cron.AddJob(e.config.EtcdSnapshotCron, skipJob(cron.FuncJob(func() {
|
||||
// Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers,
|
||||
// having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing
|
||||
// when updating the snapshot list configmap.
|
||||
time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax)))
|
||||
if err := e.Snapshot(ctx, e.config); err != nil {
|
||||
logrus.Error(err)
|
||||
}
|
||||
})
|
||||
})))
|
||||
}
|
||||
|
||||
// Restore performs a restore of the ETCD datastore from
|
||||
|
Loading…
Reference in New Issue
Block a user