Add containerd ready channel to delay etcd node join

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
This commit is contained in:
Brad Davidson 2021-10-11 23:13:10 -07:00 committed by Brad Davidson
parent 623f579869
commit 5a923ab8dc
14 changed files with 150 additions and 62 deletions

View File

@ -84,6 +84,14 @@ func run(ctx context.Context, cfg cmds.Agent, proxy proxy.Proxy) error {
}
}
// the agent runtime is ready to host workloads when containerd is up and the airgap
// images have finished loading, as that portion of startup may block for an arbitrary
// amount of time depending on how long it takes to import whatever the user has placed
// in the images directory.
if cfg.AgentReady != nil {
close(cfg.AgentReady)
}
notifySocket := os.Getenv("NOTIFY_SOCKET")
os.Unsetenv("NOTIFY_SOCKET")

View File

@ -53,6 +53,9 @@ func Setup(ctx context.Context, config *config.Node, proxy proxy.Proxy) error {
return err
}
// Do an immediate fill of proxy addresses from the server endpoint list, before going into the
// watch loop. This will fail on the first server, as the apiserver won't be started yet - but
// that's fine because the local server is already seeded into the proxy address list.
endpoint, _ := client.CoreV1().Endpoints("default").Get(ctx, "kubernetes", metav1.GetOptions{})
if endpoint != nil {
addresses := util.GetAddresses(endpoint)
@ -61,8 +64,9 @@ func Setup(ctx context.Context, config *config.Node, proxy proxy.Proxy) error {
}
}
// Attempt to connect to supervisors, storing their cancellation function for later when we
// need to disconnect.
disconnect := map[string]context.CancelFunc{}
wg := &sync.WaitGroup{}
for _, address := range proxy.SupervisorAddresses() {
if _, ok := disconnect[address]; !ok {
@ -70,7 +74,11 @@ func Setup(ctx context.Context, config *config.Node, proxy proxy.Proxy) error {
}
}
// Once the apiserver is up, go into a watch loop, adding and removing tunnels as endpoints come
// and go from the cluster. We go into a faster but noisier connect loop if the watch fails
// following a successful connection.
go func() {
util.WaitForAPIServerReady(client, 30*time.Second)
connect:
for {
time.Sleep(5 * time.Second)

View File

@ -46,6 +46,7 @@ type Agent struct {
Taints cli.StringSlice
ImageCredProvBinDir string
ImageCredProvConfig string
AgentReady chan<- struct{}
AgentShared
}

View File

@ -16,6 +16,7 @@ import (
"github.com/rancher/k3s/pkg/agent/loadbalancer"
"github.com/rancher/k3s/pkg/cli/cmds"
"github.com/rancher/k3s/pkg/clientaccess"
"github.com/rancher/k3s/pkg/daemons/config"
"github.com/rancher/k3s/pkg/datadir"
"github.com/rancher/k3s/pkg/etcd"
"github.com/rancher/k3s/pkg/netutil"
@ -84,8 +85,11 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont
cfg.Token = cfg.ClusterSecret
}
agentReady := make(chan struct{})
serverConfig := server.Config{}
serverConfig.DisableAgent = cfg.DisableAgent
serverConfig.ControlConfig.Runtime = &config.ControlRuntime{AgentReady: agentReady}
serverConfig.ControlConfig.Token = cfg.Token
serverConfig.ControlConfig.AgentToken = cfg.AgentToken
serverConfig.ControlConfig.JoinURL = cfg.ServerURL
@ -426,6 +430,7 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont
}()
if cfg.DisableAgent {
close(agentReady)
<-ctx.Done()
return nil
}
@ -442,6 +447,7 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont
}
agentConfig := cmds.AgentConfig
agentConfig.AgentReady = agentReady
agentConfig.Debug = app.GlobalBool("debug")
agentConfig.DataDir = filepath.Dir(serverConfig.ControlConfig.DataDir)
agentConfig.ServerURL = url

View File

@ -198,6 +198,7 @@ type ControlRuntime struct {
HTTPBootstrap bool
APIServerReady <-chan struct{}
AgentReady <-chan struct{}
ETCDReady <-chan struct{}
ClusterControllerStart func(ctx context.Context) error
LeaderElectedClusterControllerStart func(ctx context.Context) error
@ -217,6 +218,7 @@ type ControlRuntime struct {
ServingKubeletKey string
ServerToken string
AgentToken string
APIServer http.Handler
Handler http.Handler
Tunnel http.Handler
Authenticator authenticator.Request

View File

@ -4,7 +4,6 @@ import (
"context"
"math/rand"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
@ -22,7 +21,6 @@ import (
authorizationv1 "k8s.io/api/authorization/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/authentication/authenticator"
"k8s.io/client-go/kubernetes"
authorizationv1client "k8s.io/client-go/kubernetes/typed/authorization/v1"
"k8s.io/client-go/tools/clientcmd"
@ -37,9 +35,7 @@ var localhostIP = net.ParseIP("127.0.0.1")
func Server(ctx context.Context, cfg *config.Control) error {
rand.Seed(time.Now().UTC().UnixNano())
runtime := &config.ControlRuntime{}
cfg.Runtime = runtime
runtime := cfg.Runtime
if err := prepare(ctx, cfg, runtime); err != nil {
return errors.Wrap(err, "preparing server")
@ -48,13 +44,16 @@ func Server(ctx context.Context, cfg *config.Control) error {
cfg.Runtime.Tunnel = setupTunnel()
proxyutil.DisableProxyHostnameCheck = true
var auth authenticator.Request
var handler http.Handler
var err error
basicAuth, err := basicAuthenticator(runtime.PasswdFile)
if err != nil {
return err
}
runtime.Authenticator = basicAuth
if !cfg.DisableAPIServer {
auth, handler, err = apiServer(ctx, cfg, runtime)
if err != nil {
go waitForAPIServerHandlers(ctx, runtime)
if err := apiServer(ctx, cfg, runtime); err != nil {
return err
}
@ -62,13 +61,6 @@ func Server(ctx context.Context, cfg *config.Control) error {
return err
}
}
basicAuth, err := basicAuthenticator(runtime.PasswdFile)
if err != nil {
return err
}
runtime.Authenticator = combineAuthenticators(basicAuth, auth)
runtime.Handler = handler
if !cfg.DisableScheduler {
if err := scheduler(ctx, cfg, runtime); err != nil {
@ -145,7 +137,7 @@ func scheduler(ctx context.Context, cfg *config.Control, runtime *config.Control
return executor.Scheduler(ctx, runtime.APIServerReady, args)
}
func apiServer(ctx context.Context, cfg *config.Control, runtime *config.ControlRuntime) (authenticator.Request, http.Handler, error) {
func apiServer(ctx context.Context, cfg *config.Control, runtime *config.ControlRuntime) error {
argsMap := map[string]string{
"feature-gates": "JobTrackingWithFinalizers=true",
}
@ -381,6 +373,15 @@ func checkForCloudControllerPrivileges(ctx context.Context, runtime *config.Cont
return err
}
func waitForAPIServerHandlers(ctx context.Context, runtime *config.ControlRuntime) {
auth, handler, err := executor.APIServerHandlers(ctx)
if err != nil {
logrus.Fatalf("Failed to get request handlers from apiserver: %v", err)
}
runtime.Authenticator = combineAuthenticators(runtime.Authenticator, auth)
runtime.APIServer = handler
}
func waitForAPIServerInBackground(ctx context.Context, runtime *config.ControlRuntime) error {
restConfig, err := clientcmd.BuildConfigFromFlags("", runtime.KubeConfigAdmin)
if err != nil {

View File

@ -70,12 +70,17 @@ func (Embedded) KubeProxy(ctx context.Context, args []string) error {
return nil
}
func (Embedded) APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) (authenticator.Request, http.Handler, error) {
<-etcdReady
func (Embedded) APIServerHandlers(ctx context.Context) (authenticator.Request, http.Handler, error) {
startupConfig := <-app.StartupConfig
return startupConfig.Authenticator, startupConfig.Handler, nil
}
func (Embedded) APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) error {
command := app.NewAPIServerCommand(ctx.Done())
command.SetArgs(args)
go func() {
<-etcdReady
defer func() {
if err := recover(); err != nil {
logrus.Fatalf("apiserver panic: %v", err)
@ -84,8 +89,7 @@ func (Embedded) APIServer(ctx context.Context, etcdReady <-chan struct{}, args [
logrus.Fatalf("apiserver exited: %v", command.ExecuteContext(ctx))
}()
startupConfig := <-app.StartupConfig
return startupConfig.Authenticator, startupConfig.Handler, nil
return nil
}
func (Embedded) Scheduler(ctx context.Context, apiReady <-chan struct{}, args []string) error {

View File

@ -29,7 +29,7 @@ func (e Embedded) ETCD(ctx context.Context, args ETCDConfig) error {
}
etcd, err := embed.StartEtcd(cfg)
if err != nil {
return nil
return err
}
go func() {

View File

@ -22,7 +22,8 @@ type Executor interface {
Bootstrap(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent) error
Kubelet(ctx context.Context, args []string) error
KubeProxy(ctx context.Context, args []string) error
APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) (authenticator.Request, http.Handler, error)
APIServerHandlers(ctx context.Context) (authenticator.Request, http.Handler, error)
APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) error
Scheduler(ctx context.Context, apiReady <-chan struct{}, args []string) error
ControllerManager(ctx context.Context, apiReady <-chan struct{}, args []string) error
CurrentETCDOptions() (InitialOptions, error)
@ -97,7 +98,11 @@ func KubeProxy(ctx context.Context, args []string) error {
return executor.KubeProxy(ctx, args)
}
func APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) (authenticator.Request, http.Handler, error) {
func APIServerHandlers(ctx context.Context) (authenticator.Request, http.Handler, error) {
return executor.APIServerHandlers(ctx)
}
func APIServer(ctx context.Context, etcdReady <-chan struct{}, args []string) error {
return executor.APIServer(ctx, etcdReady, args)
}

View File

@ -192,6 +192,7 @@ func (e *ETCD) IsInitialized(ctx context.Context, config *config.Control) (bool,
func (e *ETCD) Reset(ctx context.Context, rebootstrap func() error) error {
// Wait for etcd to come up as a new single-node cluster, then exit
go func() {
<-e.runtime.AgentReady
t := time.NewTicker(5 * time.Second)
defer t.Stop()
for range t.C {
@ -291,8 +292,14 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
return e.newCluster(ctx, false)
}
err = e.join(ctx, clientAccessInfo)
return errors.Wrap(err, "joining etcd cluster")
go func() {
<-e.runtime.AgentReady
if err := e.join(ctx, clientAccessInfo); err != nil {
logrus.Fatalf("ETCD join failed: %v", err)
}
}()
return nil
}
// join attempts to add a member to an existing cluster
@ -335,9 +342,9 @@ func (e *ETCD) join(ctx context.Context, clientAccessInfo *clientaccess.Info) er
// make sure to remove the name file if a duplicate node name is used
nameFile := nameFile(e.config)
if err := os.Remove(nameFile); err != nil {
return err
logrus.Errorf("Failed to remove etcd name file %s: %v", nameFile, err)
}
return errors.New("Failed to join etcd cluster due to duplicate node names, please use unique node name for the server")
return errors.New("duplicate node name found, please use a unique name for this node")
}
for _, peer := range member.PeerURLs {
u, err := url.Parse(peer)
@ -358,7 +365,7 @@ func (e *ETCD) join(ctx context.Context, clientAccessInfo *clientaccess.Info) er
}
if add {
logrus.Infof("Adding %s to etcd cluster %v", e.peerURL(), cluster)
logrus.Infof("Adding member %s=%s to etcd cluster %v", e.name, e.peerURL(), cluster)
if _, err = client.MemberAddAsLearner(clientCtx, []string{e.peerURL()}); err != nil {
return err
}
@ -444,7 +451,7 @@ func (e *ETCD) handler(next http.Handler) http.Handler {
return mux
}
// infoHandler returns etcd cluster information. This is used by new members when joining the custer.
// infoHandler returns etcd cluster information. This is used by new members when joining the cluster.
func (e *ETCD) infoHandler() http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
ctx, cancel := context.WithTimeout(req.Context(), 2*time.Second)
@ -500,6 +507,10 @@ func getClientConfig(ctx context.Context, runtime *config.ControlRuntime, endpoi
// toTLSConfig converts the ControlRuntime configuration to TLS configuration suitable
// for use by etcd.
func toTLSConfig(runtime *config.ControlRuntime) (*tls.Config, error) {
if runtime.ClientETCDCert == "" || runtime.ClientETCDKey == "" || runtime.ETCDServerCA == "" {
return nil, errors.New("runtime is not ready yet")
}
clientCert, err := tls.LoadX509KeyPair(runtime.ClientETCDCert, runtime.ClientETCDKey)
if err != nil {
return nil, err
@ -533,8 +544,8 @@ func GetAdvertiseAddress(advertiseIP string) (string, error) {
// newCluster returns options to set up etcd for a new cluster
func (e *ETCD) newCluster(ctx context.Context, reset bool) error {
err := e.cluster(ctx, reset, executor.InitialOptions{
AdvertisePeerURL: fmt.Sprintf("https://%s:2380", e.address),
Cluster: fmt.Sprintf("%s=https://%s:2380", e.name, e.address),
AdvertisePeerURL: e.peerURL(),
Cluster: fmt.Sprintf("%s=%s", e.name, e.peerURL()),
State: "new",
})
if err != nil {
@ -684,6 +695,7 @@ func (e *ETCD) RemovePeer(ctx context.Context, name, address string, allowSelfRe
// being promoted to full voting member. The checks only run on the cluster member that is
// the etcd leader.
func (e *ETCD) manageLearners(ctx context.Context) error {
<-e.runtime.AgentReady
t := time.NewTicker(manageTickerTime)
defer t.Stop()
@ -1377,9 +1389,6 @@ func backupDirWithRetention(dir string, maxBackupRetention int) (string, error)
// GetAPIServerURLFromETCD will try to fetch the version.Program/apiaddresses key from etcd
// when it succeed it will parse the first address in the list and return back an address
func GetAPIServerURLFromETCD(ctx context.Context, cfg *config.Control) (string, error) {
if cfg.Runtime == nil {
return "", fmt.Errorf("runtime is not ready yet")
}
cl, err := GetClient(ctx, cfg.Runtime, endpoint)
if err != nil {
return "", err

View File

@ -7,16 +7,29 @@ import (
"os"
"path/filepath"
"testing"
"time"
"github.com/rancher/k3s/pkg/clientaccess"
"github.com/rancher/k3s/pkg/daemons/config"
testutil "github.com/rancher/k3s/tests/util"
"github.com/robfig/cron/v3"
clientv3 "go.etcd.io/etcd/client/v3"
utilnet "k8s.io/apimachinery/pkg/util/net"
)
func mustGetAddress() string {
ipAddr, err := utilnet.ChooseHostInterface()
if err != nil {
panic(err)
}
return ipAddr.String()
}
func generateTestConfig() *config.Control {
agentReady := make(chan struct{})
close(agentReady)
return &config.Control{
Runtime: &config.ControlRuntime{AgentReady: agentReady},
HTTPSPort: 6443,
SupervisorPort: 6443,
AdvertisePort: 6443,
@ -189,42 +202,48 @@ func Test_UnitETCD_Register(t *testing.T) {
}
func Test_UnitETCD_Start(t *testing.T) {
type contextInfo struct {
ctx context.Context
cancel context.CancelFunc
}
type fields struct {
context contextInfo
client *clientv3.Client
config *config.Control
name string
runtime *config.ControlRuntime
address string
cron *cron.Cron
s3 *S3
}
type args struct {
ctx context.Context
clientAccessInfo *clientaccess.Info
}
tests := []struct {
name string
fields fields
args args
setup func(cnf *config.Control) error
teardown func(cnf *config.Control) error
setup func(cnf *config.Control, ctxInfo *contextInfo) error
teardown func(cnf *config.Control, ctxInfo *contextInfo) error
wantErr bool
}{
{
name: "Start etcd without clientAccessInfo and without snapshots",
fields: fields{
config: generateTestConfig(),
address: "192.168.1.123", // Local IP address
address: mustGetAddress(),
name: "default",
},
args: args{
ctx: context.TODO(),
clientAccessInfo: nil,
},
setup: func(cnf *config.Control) error {
setup: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.ctx, ctxInfo.cancel = context.WithCancel(context.Background())
cnf.EtcdDisableSnapshots = true
return testutil.GenerateRuntime(cnf)
},
teardown: func(cnf *config.Control) error {
teardown: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.cancel()
time.Sleep(5 * time.Second)
testutil.CleanupDataDir(cnf)
return nil
},
@ -233,17 +252,20 @@ func Test_UnitETCD_Start(t *testing.T) {
name: "Start etcd without clientAccessInfo on",
fields: fields{
config: generateTestConfig(),
address: "192.168.1.123", // Local IP address
address: mustGetAddress(),
name: "default",
cron: cron.New(),
},
args: args{
ctx: context.TODO(),
clientAccessInfo: nil,
},
setup: func(cnf *config.Control) error {
setup: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.ctx, ctxInfo.cancel = context.WithCancel(context.Background())
return testutil.GenerateRuntime(cnf)
},
teardown: func(cnf *config.Control) error {
teardown: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.cancel()
time.Sleep(5 * time.Second)
testutil.CleanupDataDir(cnf)
return nil
},
@ -252,20 +274,23 @@ func Test_UnitETCD_Start(t *testing.T) {
name: "Start etcd with an existing cluster",
fields: fields{
config: generateTestConfig(),
address: "192.168.1.123", // Local IP address
address: mustGetAddress(),
name: "default",
cron: cron.New(),
},
args: args{
ctx: context.TODO(),
clientAccessInfo: nil,
},
setup: func(cnf *config.Control) error {
setup: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.ctx, ctxInfo.cancel = context.WithCancel(context.Background())
if err := testutil.GenerateRuntime(cnf); err != nil {
return err
}
return os.MkdirAll(walDir(cnf), 0700)
},
teardown: func(cnf *config.Control) error {
teardown: func(cnf *config.Control, ctxInfo *contextInfo) error {
ctxInfo.cancel()
time.Sleep(5 * time.Second)
testutil.CleanupDataDir(cnf)
os.Remove(walDir(cnf))
return nil
@ -278,17 +303,17 @@ func Test_UnitETCD_Start(t *testing.T) {
client: tt.fields.client,
config: tt.fields.config,
name: tt.fields.name,
runtime: tt.fields.runtime,
runtime: tt.fields.config.Runtime,
address: tt.fields.address,
cron: tt.fields.cron,
s3: tt.fields.s3,
}
defer tt.teardown(e.config)
if err := tt.setup(e.config); err != nil {
defer tt.teardown(e.config, &tt.fields.context)
if err := tt.setup(e.config, &tt.fields.context); err != nil {
t.Errorf("Setup for ETCD.Start() failed = %v", err)
return
}
if err := e.Start(tt.args.ctx, tt.args.clientAccessInfo); (err != nil) != tt.wantErr {
if err := e.Start(tt.fields.context.ctx, tt.args.clientAccessInfo); (err != nil) != tt.wantErr {
t.Errorf("ETCD.Start() error = %v, wantErr %v", err, tt.wantErr)
}
})

View File

@ -39,7 +39,7 @@ func router(ctx context.Context, config *Config, cfg *cmds.Server) http.Handler
prefix := "/v1-" + version.Program
authed := mux.NewRouter()
authed.Use(authMiddleware(serverConfig, version.Program+":agent"))
authed.NotFoundHandler = serverConfig.Runtime.Handler
authed.NotFoundHandler = apiserver(serverConfig.Runtime)
authed.Path(prefix + "/serving-kubelet.crt").Handler(servingKubeletCert(serverConfig, serverConfig.Runtime.ServingKubeletKey, nodeAuth))
authed.Path(prefix + "/client-kubelet.crt").Handler(clientKubeletCert(serverConfig, serverConfig.Runtime.ClientKubeletKey, nodeAuth))
authed.Path(prefix + "/client-kube-proxy.crt").Handler(fileHandler(serverConfig.Runtime.ClientKubeProxyCert, serverConfig.Runtime.ClientKubeProxyKey))
@ -72,6 +72,20 @@ func router(ctx context.Context, config *Config, cfg *cmds.Server) http.Handler
return router
}
func apiserver(runtime *config.ControlRuntime) http.Handler {
return http.HandlerFunc(func(resp http.ResponseWriter, req *http.Request) {
if runtime != nil && runtime.APIServer != nil {
runtime.APIServer.ServeHTTP(resp, req)
} else {
data := []byte("apiserver not ready")
resp.WriteHeader(http.StatusInternalServerError)
resp.Header().Set("Content-Type", "text/plain")
resp.Header().Set("Content-length", strconv.Itoa(len(data)))
resp.Write(data)
}
})
}
func cacerts(serverCA string) http.Handler {
var ca []byte
return http.HandlerFunc(func(resp http.ResponseWriter, req *http.Request) {
@ -269,7 +283,10 @@ func configHandler(server *config.Control, cfg *cmds.Server) http.Handler {
// At this time we don't sync all the fields, just those known to be touched by startup hooks.
server.DisableKubeProxy = cfg.DisableKubeProxy
resp.Header().Set("content-type", "application/json")
json.NewEncoder(resp).Encode(server)
if err := json.NewEncoder(resp).Encode(server); err != nil {
logrus.Errorf("Failed to encode agent config: %v", err)
resp.WriteHeader(http.StatusInternalServerError)
}
})
}

View File

@ -15,6 +15,9 @@ docker ps
# ---
. ./scripts/test-unit
echo "Did test-unit $?"
. ./scripts/test-run-basics
echo "Did test-run-basics $?"
@ -41,6 +44,4 @@ echo "Did test-run-sonobuoy $?"
test-run-sonobuoy mysql
test-run-sonobuoy postgres
. ./scripts/test-unit
exit 0

View File

@ -34,8 +34,9 @@ func GenerateDataDir(cnf *config.Control) error {
}
// CleanupDataDir removes the associated "/tmp/k3s/<RANDOM_STRING>"
// directory.
// directory along with the 'latest' symlink that points at it.
func CleanupDataDir(cnf *config.Control) {
os.Remove(filepath.Join(cnf.DataDir, "..", "latest"))
os.RemoveAll(cnf.DataDir)
}