From 2165464af6296080a0c29232b9a6d8df5812899b Mon Sep 17 00:00:00 2001
From: galal-hussein
Date: Mon, 4 Nov 2019 18:09:54 +0200
Subject: [PATCH] more changes and Add readme

---
 tests/perf/Makefile                          |   8 +-
 tests/perf/README.md                         | 116 ++++++++++
 tests/perf/agents/main.tf                    |  12 +-
 tests/perf/agents/variables.tf               |   4 +-
 tests/perf/scripts/config                    |  32 +--
 tests/perf/scripts/perf                      |  57 ++++-
 tests/perf/scripts/test                      |  14 +-
 tests/perf/server/data.tf                    |  16 --
 tests/perf/server/files/etcd.tmpl            |  31 +++
 tests/perf/server/files/etcd_build.sh        |  22 ++
 tests/perf/server/files/metrics.yaml         | 227 -------------------
 tests/perf/server/files/prom.yaml            |  86 -------
 tests/perf/server/files/server_userdata.tmpl |  42 ++--
 tests/perf/server/files/worker_userdata.tmpl |   2 +-
 tests/perf/server/main.tf                    |  99 +++++---
 tests/perf/server/outputs.tf                 |   4 +-
 tests/perf/server/variables.tf               |  34 ++-
 tests/perf/tests/load/config.yaml            |   2 +-
 18 files changed, 395 insertions(+), 413 deletions(-)
 create mode 100644 tests/perf/README.md
 create mode 100644 tests/perf/server/files/etcd.tmpl
 create mode 100755 tests/perf/server/files/etcd_build.sh
 delete mode 100644 tests/perf/server/files/metrics.yaml
 delete mode 100644 tests/perf/server/files/prom.yaml

diff --git a/tests/perf/Makefile b/tests/perf/Makefile
index a1d63a52ad..9b216482d2 100644
--- a/tests/perf/Makefile
+++ b/tests/perf/Makefile
@@ -1,6 +1,6 @@
 MODULE := $(shell basename $$PWD)
 
-.PHONY: init config apply destroy clean test
+.PHONY: init config apply destroy clean test info
 
 init:
 	@scripts/perf init
@@ -8,6 +8,9 @@ init:
 config:
 	@scripts/perf config
 
+plan:
+	@scripts/perf plan
+
 apply:
 	@scripts/perf apply
 
@@ -19,3 +22,6 @@ clean:
 
 test:
 	@scripts/test test_load
+
+info:
+	@scripts/perf info
diff --git a/tests/perf/README.md b/tests/perf/README.md
new file mode 100644
index 0000000000..60ae23a2c5
--- /dev/null
+++ b/tests/perf/README.md
@@ -0,0 +1,116 @@
+## K3S Performance Tests
+---
+
+These scripts use Terraform to automate building and testing k3s clusters on AWS. They support building regular and HA clusters with N master nodes, N worker nodes, and multiple storage backends, including:
+
+- MySQL RDS
+- Postgres RDS
+- Etcd
+- SQLite
+
+The scripts are divided into three sections:
+
+- server
+- agents
+- tests
+
+### Server
+
+The server section deploys the storage backend and then deploys N master nodes. The scripts can be customized to use HA mode, or to use a single-node cluster with the sqlite backend; they also support running 1 master node with an external DB. The instance type and k3s version can be customized as well; all available options are described in the variables section below.
+
+The server section also creates one or more agent nodes dedicated to the Prometheus deployment; clusterloader2 deploys Prometheus and Grafana on them.
+
+### Agents
+
+The agents section deploys the k3s agents. It can be customized with options that control the agent node count and the instance types.
+
+### Tests
+
+The tests section uses a fork of the [clusterloader2](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2) tool; the fork only modifies the logging and removes the etcd metrics probes.
+
+This section uses a dockerized version of the tool, which runs the tests and saves the report in `tests/-`.
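+
+For reference, `make test` (see the Usage section below) boils down to a `docker run` invocation like the one in `scripts/test`; `$EXPANDED_PRIV_KEY_PATH` and `$masterips` are resolved by that script from the config file and the Terraform state:
+
+```
+docker run -v $EXPANDED_PRIV_KEY_PATH:/opt/priv_key \
+  -e KUBE_SSH_USER=ubuntu \
+  -e LOCAL_SSH_KEY=/opt/priv_key \
+  -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \
+  clusterloader --testconfig /opt/k3s/perf-tests/load/config.yaml \
+  --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \
+  --masterip $masterips \
+  --provider=local \
+  --report-dir /opt/k3s/perf-tests/load_tests_results-$RANDOM \
+  --enable-prometheus-server \
+  --tear-down-prometheus-server=0
+```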
+
+The currently available tests are:
+
+- load test
+- density test
+
+## Variables
+
+The scripts can be customized through the variables in `scripts/config`, which include:
+
+**Main Vars**
+
+| Name | Description |
+|:----------------:|:------------------------------------------------------------------------------:|
+| CLUSTER_NAME | The cluster name on AWS; this will prefix each component in the cluster |
+| DOMAIN_NAME | DNS name of the load balancer for the k3s master(s) |
+| ZONE_ID | AWS Route 53 zone id used to register the DNS name |
+| K3S_VERSION | k3s version that will be used with the cluster |
+| EXTRA_SSH_KEYS | Public ssh keys that will be added to the servers |
+| PRIVATE_KEY_PATH | Private ssh key that will be used by clusterloader2 to ssh and collect metrics |
+| DEBUG | Debug mode for k3s servers |
+
+**Database Variables**
+
+| Name | Description |
+|:----------------:|:---------------------------------------------------------------------------------------------------:|
+| DB_ENGINE | The database type; this can be "mysql", "postgres", or "etcd" |
+| DB_INSTANCE_TYPE | The RDS instance type for mysql and postgres; etcd also uses the db.* class, since the prefix is parsed out internally |
+| DB_NAME | Database name, created only for postgres and mysql |
+| DB_USERNAME | Database username, created only for postgres and mysql |
+| DB_PASSWORD | Database password for the user, created only for postgres and mysql |
+| DB_VERSION | Database version |
+
+**K3S Server Variables**
+
+| Name | Description |
+|:--------------------:|:---------------------------------------------------------------------------------:|
+| SERVER_HA | Whether to use HA mode; if not, sqlite will be used as the storage backend |
+| SERVER_COUNT | k3s master node count |
+| SERVER_INSTANCE_TYPE | EC2 instance type created for the k3s server(s) |
+
+**K3S Agent Variables**
+
+| Name | Description |
+|:-------------------:|:-----------------------------------------:|
+| AGENT_NODE_COUNT | Number of k3s agents that will be created |
+| AGENT_INSTANCE_TYPE | EC2 instance type created for k3s agents |
+
+**Prometheus Server Variables**
+
+| Name | Description |
+|:-------------------------:|:-------------------------------------------------------------------:|
+| PROM_WORKER_NODE_COUNT | Number of k3s agents that will be created for the Prometheus deployment |
+| PROM_WORKER_INSTANCE_TYPE | EC2 instance type created for the k3s Prometheus agents |
+
+
+## Usage
+
+### build
+
+The directory includes a Makefile that runs the different sections. To build the master(s) and workers, adjust the config file in `tests/perf/scripts/config` and then run:
+
+```
+cd tests/perf
+make apply
+```
+
+This builds the db, server, and agent layers, and also writes a kubeconfig file to `tests/kubeconfig.yaml`.
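+
+As a quick sanity check after `make apply` completes (assuming `kubectl` is available on the workstation), the generated kubeconfig can be used directly:
+
+```
+export KUBECONFIG=$PWD/tests/kubeconfig.yaml
+kubectl get nodes -o wide
+kubectl cluster-info
+```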
+
+### test
+
+To start the clusterloader2 load test, optionally adjust `tests/perf/tests/load/config.yaml` and then run:
+
+```
+cd tests/perf
+make test
+```
+
+### destroy
+
+To destroy the cluster, run:
+```
+make destroy
+make clean
+```
diff --git a/tests/perf/agents/main.tf b/tests/perf/agents/main.tf
index 975117cddd..f62c432fd3 100644
--- a/tests/perf/agents/main.tf
+++ b/tests/perf/agents/main.tf
@@ -52,15 +52,19 @@ module "k3s-pool-worker-asg" {
   version = "3.0.0"
   name = "${local.name}-pool"
   asg_name = "${local.name}-pool"
-  instance_type = var.worker_instance_type
+  instance_type = var.agent_instance_type
   image_id = data.aws_ami.ubuntu.id
   user_data = base64encode(templatefile("${path.module}/files/pool_worker_userdata.tmpl", { k3s_url = data.terraform_remote_state.server.outputs.public_ip, k3s_cluster_secret = local.k3s_cluster_secret, extra_ssh_keys = var.extra_ssh_keys, install_k3s_version = var.k3s_version }))
   ebs_optimized = true
-  desired_capacity = var.node_count
+  default_cooldown          = 10
+  health_check_grace_period = 30
+  wait_for_capacity_timeout = "60m"
+
+  desired_capacity = var.agent_node_count
   health_check_type = "EC2"
-  max_size = var.node_count
-  min_size = var.node_count
+  max_size = var.agent_node_count
+  min_size = var.agent_node_count
   vpc_zone_identifier = [data.aws_subnet.selected.id]
   spot_price = "0.680"
diff --git a/tests/perf/agents/variables.tf b/tests/perf/agents/variables.tf
index 37a587d413..f0924930fe 100644
--- a/tests/perf/agents/variables.tf
+++ b/tests/perf/agents/variables.tf
@@ -1,10 +1,10 @@
-variable "node_count" {
+variable "agent_node_count" {
   description = "Number of nodes to run k3s agents on."
   type        = number
   # default = 10
 }
 
-variable "worker_instance_type" {
+variable "agent_instance_type" {
   type    = string
   default = "t3.2xlarge"
 }
diff --git a/tests/perf/scripts/config b/tests/perf/scripts/config
index 8e5f09a3fd..3505846732 100755
--- a/tests/perf/scripts/config
+++ b/tests/perf/scripts/config
@@ -1,28 +1,34 @@
 ## MAIN VARIABLES ##
 ####################
-CLUSTER_NAME="hgalal-k3s"
-K3S_VERSION="v0.10.0"
-EXTRA_SSH_KEYS="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDZBAE6I9J733HJfCBVu7iWSUuJ7th0U4P4IFfpFDca52n/Hk4yFFr8SPR8JJc1n42c3vEVCbExp/MD4ihqEBy9+pLewxA+fkb7UAT4cT2eLfvZdTTVe8KSiw6lVN6tWSoNXmNqY+wH7zWQ04lfjXPa/c01L1n2XwV/O+5xii9vEuSxN9YhfQ/s61SdLFqQ5yS8gPsM0qQW+bFt5KGGbapqztDO+h9lxGbZRcRAKbCzZ5kF1mhjI/+VubTWKtoVLCumjzjYqILYyx9g/mLSo26qjDEZvtwBQB9KLugDAtnalLVp0HgivC5YfLHr8PxViVSHfIIKS2DhUpn07jr8eKi9"
-PRIVATE_KEY_PATH="/home/hussein/.ssh/id_rsa" #this has to be a full path
+CLUSTER_NAME="loadtest-k3s"
+DOMAIN_NAME=""
+ZONE_ID=""
+K3S_VERSION="v0.11.0-alpha2"
+EXTRA_SSH_KEYS="" # comma separated public keys
+PRIVATE_KEY_PATH="~/.ssh/id_rsa"
+DEBUG=1
 
+## K3S DB VARIABLES ##
+##########################
+DB_ENGINE="postgres"
+DB_INSTANCE_TYPE="db.m4.4xlarge"
+DB_NAME="k3s"
+DB_USERNAME="k3suser"
+DB_PASSWORD="024d9442b3add64b7ef90655bc302cd8"
+DB_VERSION=11.5
 
 ## K3S SERVER VARIABLES ##
 ##########################
-K3S_HA=1
-MASTER_COUNT=3
-DB_INSTANCE_TYPE="db.m4.4xlarge"
+SERVER_HA=1
+SERVER_COUNT=3
 SERVER_INSTANCE_TYPE="m5.2xlarge"
-DEBUG=1
-
 
 ## PROMETHEUS SERVER VARIABLES ##
 #################################
 PROM_WORKER_NODE_COUNT=1
-PROM_HOST="prometheus-load.eng.rancher.space"
-GRAF_HOST="prometheus-load.eng.rancher.space"
-
+PROM_WORKER_INSTANCE_TYPE="m5.large"
 
 ## K3S AGENTS VARIABLES ##
 ##########################
 AGENT_NODE_COUNT=100
-WORKER_INSTANCE_TYPE="m5.xlarge"
+AGENT_INSTANCE_TYPE="m5.large"
diff --git a/tests/perf/scripts/perf b/tests/perf/scripts/perf
index bf60a715e0..9dbae96166 100755
--- a/tests/perf/scripts/perf
+++ b/tests/perf/scripts/perf
@@ -18,6 +18,8 @@ init() {
 
 apply() {
   # init terraform
+  init
+  # configure variables
   config
   # Run apply for server and agents
   for i in server agents; do
@@ -32,32 +34,52 @@ apply() {
   done
 }
 
+plan() {
+  # configure variables
+  config
+  # Run plan for server and agents
+  for i in server agents; do
+    pushd $i
+    $TERRAFORM_PLAN_CMD
+    popd
+  done
+}
+
+
 config() {
   source scripts/config
   pushd ./server
+  eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH
+  EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH`
   cat <<MAIN >variables.tfvars
 name = "${CLUSTER_NAME}"
 db_instance_type = "${DB_INSTANCE_TYPE}"
+db_name = "${DB_NAME}"
+db_username = "${DB_USERNAME}"
+db_password = "${DB_PASSWORD}"
+db_engine = "${DB_ENGINE}"
+db_version = "${DB_VERSION}"
 server_instance_type = "${SERVER_INSTANCE_TYPE}"
 extra_ssh_keys = ["${EXTRA_SSH_KEYS}"]
-master_count = ${MASTER_COUNT}
-k3s_ha = ${K3S_HA}
+server_count = ${SERVER_COUNT}
+server_ha = ${SERVER_HA}
 k3s_version = "${K3S_VERSION}"
 prom_worker_node_count = ${PROM_WORKER_NODE_COUNT}
-prom_host = "${PROM_HOST}"
-graf_host = "${GRAF_HOST}"
-ssh_key_path = "${PRIVATE_KEY_PATH}"
+prom_worker_instance_type = "${PROM_WORKER_INSTANCE_TYPE}"
+ssh_key_path = "${EXPANDED_PRIV_KEY_PATH}"
 debug = ${DEBUG}
+domain_name = "${DOMAIN_NAME}"
+zone_id = "${ZONE_ID}"
 MAIN
   popd
 
   pushd ./agents
   cat <<MAIN >variables.tfvars
 name = "${CLUSTER_NAME}"
-node_count = ${AGENT_NODE_COUNT}
 extra_ssh_keys = ["${EXTRA_SSH_KEYS}"]
 k3s_version = "${K3S_VERSION}"
-worker_instance_type = "${WORKER_INSTANCE_TYPE}"
+agent_node_count = ${AGENT_NODE_COUNT}
+agent_instance_type = "${AGENT_INSTANCE_TYPE}"
 MAIN
   popd
 }
@@ -71,6 +93,16 @@ clean() {
   done
 }
 
+cleanall() {
+  clean
+  # clean kubeconfig
+  pushd tests/
+  rm -f kubeconfig
+  rm -rf load_tests_results*
+  rm -rf density_tests_results*
+  popd
+}
+
 destroy() {
   for i in agents server; do
     pushd $i
@@ -80,4 +112,15 @@ destroy() {
   clean
 }
 
+info() {
+  set +x
+  for i in agents server; do
+    pushd $i
+    if [ -f $i.tfstate ]; then
+      terraform output --state=$i.tfstate
+    fi
+    popd
+  done
+}
+
 $@
diff --git a/tests/perf/scripts/test b/tests/perf/scripts/test
index 150bd9eff9..5866907849 100755
--- a/tests/perf/scripts/test
+++ b/tests/perf/scripts/test
@@ -2,9 +2,11 @@
 
 test_load() {
   source scripts/config
+  eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH
+  EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH`
   masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2`
   pushd tests/
-  docker run -v $PRIVATE_KEY_PATH:/opt/priv_key \
+  docker run -v $EXPANDED_PRIV_KEY_PATH:/opt/priv_key \
     -e KUBE_SSH_USER=ubuntu \
     -e LOCAL_SSH_KEY=/opt/priv_key \
     -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \
     clusterloader --testconfig /opt/k3s/perf-tests/load/config.yaml \
     --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \
     --masterip $masterips \
     --provider=local \
-    --report-dir /opt/k3s/perf-tests/load_tests_results \
+    --report-dir /opt/k3s/perf-tests/load_tests_results-$RANDOM \
     --enable-prometheus-server \
     --tear-down-prometheus-server=0
   popd
@@ -20,17 +22,19 @@
 
 test_density() {
   source scripts/config
+  eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH
+  EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH`
   masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2`
   pushd tests/
   docker run -e KUBE_SSH_USER=ubuntu \
-    -v $PRIVATE_KEY_PATH:/opt/priv_key \
+    -v $EXPANDED_PRIV_KEY_PATH:/opt/priv_key \
     -e LOCAL_SSH_KEY=/opt/priv_key \
     -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \
     clusterloader --testconfig /opt/k3s/perf-tests/density/config.yaml \
     --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \
     --masterip $masterips \
     --provider=local \
-    --report-dir /opt/k3s/perf-tests/density_tests_results \
+    --report-dir /opt/k3s/perf-tests/density_tests_results-$RANDOM \
     --enable-prometheus-server \
     --tear-down-prometheus-server=0
   popd
@@ -40,7 +44,7 @@ clean() {
   # clean kubeconfig
   pushd tests/
   rm -f kubeconfig
-  rm -rf load_tests_results/
+  rm -rf load_tests_results*
   rm -rf density_tests_results/
   popd
 }
diff --git a/tests/perf/server/data.tf b/tests/perf/server/data.tf
index 9a269d4e1e..240c9f225e 100644
--- a/tests/perf/server/data.tf
+++ b/tests/perf/server/data.tf
@@ -34,19 +34,3 @@ data "aws_ami" "ubuntu" {
     values = ["x86_64"]
   }
 }
-
-data "template_file" "metrics" {
-  template = file("${path.module}/files/metrics.yaml")
-  vars = {
-    prom_worker_node_count = local.prom_worker_node_count
-
-  }
-}
-data "template_file" "k3s-prom-yaml" {
-  template = file("${path.module}/files/prom.yaml")
-  vars = {
-    prom_host              = var.prom_host
-    graf_host              = var.graf_host
-    prom_worker_node_count = local.prom_worker_node_count
-  }
-}
diff --git a/tests/perf/server/files/etcd.tmpl b/tests/perf/server/files/etcd.tmpl
new file mode 100644
index 0000000000..41727d6708
--- /dev/null
+++ b/tests/perf/server/files/etcd.tmpl
@@ -0,0 +1,31 @@
+#cloud-config
+%{ if length(extra_ssh_keys) > 0 }
+ssh_authorized_keys:
+%{ for ssh_key in extra_ssh_keys }
+- ${ssh_key}
+%{ endfor }
+%{ endif }
+runcmd:
+- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf
+- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf
+- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf
+- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf
+- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf
+- echo "fs.file-max = 12000500" >> /etc/sysctl.conf
+- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf
+- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf
+- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf
+- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf
+- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf
+- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf
+- ulimit -n 20000000
+- echo "# <domain> <type> <item> <value>" >> /etc/security/limits.d/limits.conf
+- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf
+- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf
+- sysctl -p
+- apt-get update
+- apt-get install -y git vim software-properties-common resolvconf linux-headers-$(uname -r)
+- echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail
+- echo "RateLimitIntervalSec=0" >> /etc/systemd/journald.conf
+- echo "RateLimitBurst=0" >> /etc/systemd/journald.conf
+- curl -sSL https://releases.rancher.com/install-docker/19.03.sh | sh
diff --git a/tests/perf/server/files/etcd_build.sh b/tests/perf/server/files/etcd_build.sh
new file mode 100755
index 0000000000..51d3074668
--- /dev/null
+++ b/tests/perf/server/files/etcd_build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -x
+
+IFS=',' read -r -a public_ips <<< "$PUBLIC_IPS"
+IFS=',' read -r -a private_ips <<< "$PRIVATE_IPS"
+
+conn_string=""
+for i in "${!private_ips[@]}"; do
+  conn_string=$conn_string"etcd-$i=http://${private_ips[i]}:2380,"
+done
+conn_string=${conn_string%?}
+for i in "${!public_ips[@]}"; do
+  while true; do
+    ssh -i $SSH_KEY_PATH -l ubuntu ${public_ips[i]} "sudo docker run -v /etcd-data:/etcd-data -d -p ${private_ips[i]}:2379:2379 -p ${private_ips[i]}:2380:2380 quay.io/coreos/etcd:$DB_VERSION etcd --initial-advertise-peer-urls http://${private_ips[i]}:2380 --name=etcd-$i --data-dir=/etcd-data --advertise-client-urls=http://0.0.0.0:2379 --listen-peer-urls=http://0.0.0.0:2380 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-token=etcd-cluster-1 --initial-cluster-state new --initial-cluster $conn_string"
+    if [ $? == 0 ]; then
+      break
+    fi
+    sleep 10
+  done
+done
+
+#
diff --git a/tests/perf/server/files/metrics.yaml b/tests/perf/server/files/metrics.yaml
deleted file mode 100644
index d3cfb79659..0000000000
--- a/tests/perf/server/files/metrics.yaml
+++ /dev/null
@@ -1,227 +0,0 @@
-%{ if prom_worker_node_count != 0 }
----
-apiVersion: rbac.authorization.k8s.io/v1
-# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
-kind: ClusterRoleBinding
-metadata:
-  name: kube-state-metrics
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: kube-state-metrics
-subjects:
-- kind: ServiceAccount
-  name: kube-state-metrics
-  namespace: kube-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
-kind: ClusterRole
-metadata:
-  name: kube-state-metrics
-rules:
-- apiGroups: [""]
-  resources:
-  - configmaps
-  - secrets
-  - nodes
-  - pods
-  - services
-  - resourcequotas
-  - replicationcontrollers
-  - limitranges
-  - persistentvolumeclaims
-  - persistentvolumes
-  - namespaces
-  - endpoints
-  verbs: ["list", "watch"]
-- apiGroups: ["extensions"]
-  resources:
-  - daemonsets
-  - deployments
-  - replicasets
-  - ingresses
-  verbs: ["list", "watch"]
-- apiGroups: ["apps"]
-  resources:
-  - daemonsets
-  - deployments
-  - replicasets
-  - statefulsets
-  verbs: ["list", "watch"]
-- apiGroups: ["batch"]
-  resources:
-  - cronjobs
-  - jobs
-  verbs: ["list", "watch"]
-- apiGroups: ["autoscaling"]
-  resources:
-  - horizontalpodautoscalers
-  verbs: ["list", "watch"]
-- apiGroups: ["policy"]
-  resources:
-  - poddisruptionbudgets
-  verbs: ["list", "watch"]
-- apiGroups: ["certificates.k8s.io"]
-  resources:
-  - certificatesigningrequests
-  verbs: ["list", "watch"]
-- apiGroups: ["storage.k8s.io"]
-  resources:
-  - storageclasses
-  verbs: ["list", "watch"]
-- apiGroups: ["autoscaling.k8s.io"]
-  resources:
-  - verticalpodautoscalers
-  verbs: ["list", "watch"]
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  labels:
-    k8s-app: kube-state-metrics
-  name: kube-state-metrics
-  namespace: kube-system
-spec:
-  selector:
-    matchLabels:
-      k8s-app: kube-state-metrics
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        k8s-app: kube-state-metrics
-    spec:
-      serviceAccountName: kube-state-metrics
-      containers:
-      - name: kube-state-metrics
-        image: quay.io/coreos/kube-state-metrics:v1.7.2
-        ports:
-        - name: http-metrics
-          containerPort: 8080
-        - name: telemetry
-          containerPort: 8081
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 8080
-          initialDelaySeconds: 5
-          timeoutSeconds: 5
-        readinessProbe:
-          httpGet:
-            path: /
-            port: 8080
-          initialDelaySeconds: 5
-          timeoutSeconds: 5
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
-  labels:
-    k8s-app: kube-state-metrics
-  annotations:
-    prometheus.io/scrape: 'true'
-spec:
-  ports:
-  - name: http-metrics
-    port: 8080
-    targetPort: http-metrics
-    protocol: TCP
-  - name: telemetry
-    port: 8081
-    targetPort: telemetry
-    protocol: TCP
-  selector:
-    k8s-app: kube-state-metrics
----
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: slo-monitor
-subjects:
-- kind: ServiceAccount
-  name: slo-monitor
-  namespace: kube-system
-roleRef:
-  kind: ClusterRole
-  name: slo-monitor
-  apiGroup: rbac.authorization.k8s.io
----
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: slo-monitor
-  namespace: kube-system
-rules:
-- apiGroups: [""]
-  resources: ["pods", "events"]
-  verbs: ["get", "watch", "list"]
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: slo-monitor
-  namespace: kube-system
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: slo-monitor
-  namespace: kube-system
-  labels:
-    app: slo-monitor
-spec:
-  selector:
-    matchLabels:
-      app: slo-monitor
-  template:
-    metadata:
-      labels:
-        app: slo-monitor
-      annotations:
-        prometheus.io/scrape: "true"
-    spec:
-      containers:
-      - name: slo-monitor
-        image: gcr.io/google-containers/slo-monitor:0.12.0
-        command:
-        - /slo-monitor
-        - --alsologtostderr=true
-        imagePullPolicy: Always
-        ports:
-        - name: metrics
-          containerPort: 8080
-        resources:
-          requests:
-            cpu: 300m
-            memory: 100Mi
-          limits:
-            cpu: 300m
-            memory: 100Mi
-      restartPolicy: Always
-      serviceAccountName: slo-monitor
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: slo-monitor
-  namespace: kube-system
-  labels:
-    app: slo-monitor
-spec:
-  selector:
-    app: slo-monitor
-  ports:
-  - name: metrics
-    port: 80
-    targetPort: metrics
-  type: ClusterIP
-%{ endif }
diff --git a/tests/perf/server/files/prom.yaml b/tests/perf/server/files/prom.yaml
deleted file mode 100644
index 369a922548..0000000000
--- a/tests/perf/server/files/prom.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-%{ if prom_worker_node_count != 0 }
----
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: monitoring
-
----
-apiVersion: helm.cattle.io/v1
-kind: HelmChart
-metadata:
-  name: prometheus
-  namespace: kube-system
-spec:
-  chart: https://raw.githubusercontent.com/galal-hussein/charts/master/prometheus-9.2.0.tgz
-  targetNamespace: monitoring
-  valuesContent: |-
-    alertmanager:
-      nodeSelector:
-        prom: "true"
-      persistentVolume:
-        enabled: false
-    kubeStateMetrics:
-      nodeSelector:
-        prom: "true"
-    nodeExporter:
-      nodeSelector:
-        prom: "true"
-    server:
-      nodeSelector:
-        prom: "true"
-      ingress:
-        enabled: true
-        hosts:
-        - ${prom_host}
-      persistentVolume:
-        enabled: false
-    pushgateway:
-      nodeSelector:
-        prom: "true"
-      persistentVolume:
-        enabled: false
-    serverFiles:
-      prometheus.yml:
-        scrape_configs:
-        - job_name: prometheus
-          static_configs:
-          - targets:
-            - localhost:9090
-        - job_name: kubernetes-apiservers
-          scrape_interval: 10s
-          scrape_timeout: 10s
-          metrics_path: /metrics
-          scheme: https
-          kubernetes_sd_configs:
-          - api_server: null
-            role: endpoints
-            namespaces:
-              names: []
-          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-          tls_config:
-            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-            insecure_skip_verify: true
-          relabel_configs:
-          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
-            separator: ;
-            regex: default;kubernetes;https
-            replacement: $1
-            action: keep
----
-apiVersion: helm.cattle.io/v1
-kind: HelmChart
-metadata:
-  name: grafana
-  namespace: kube-system
-spec:
-  chart: stable/grafana
-  targetNamespace: monitoring
-  valuesContent: |-
-    ingress:
-      enabled: true
-      hosts:
-      - ${graf_host}
-    nodeSelector:
-      prom: "true"
-%{ endif }
diff --git a/tests/perf/server/files/server_userdata.tmpl b/tests/perf/server/files/server_userdata.tmpl
index 17cad50b88..65145c7c68 100644
--- a/tests/perf/server/files/server_userdata.tmpl
+++ b/tests/perf/server/files/server_userdata.tmpl
@@ -6,16 +6,33 @@ ssh_authorized_keys:
 %{ endfor }
 %{ endif }
 write_files:
-- path: /var/lib/rancher/k3s/server/manifests/metrics.yaml
+- path: /opt/k3s/run_k3s.sh
   permissions: "0755"
"0755" owner: root:root - encoding: b64 - content: ${metrics_yaml} -- path: /var/lib/rancher/k3s/server/manifests/prom.yaml - permissions: "0755" - owner: root:root - encoding: b64 - content: ${prom_yaml} + content: | + #!/bin/bash + set -x + if [ ${db_engine} == "postgres" ]; then + STORAGE_ENDPOINT="postgres://${db_username}:${db_password}@${db_address}:5432/${db_name}" + elif [ ${db_engine} == "mysql" ]; then + STORAGE_ENDPOINT="mysql://${db_username}:${db_password}@(${db_address})/${db_name}" + else + IFS=',' read -r -a private_ips <<< "${db_address}" + for i in "$${!private_ips[@]}"; do + STORAGE_ENDPOINT=$STORAGE_ENDPOINT"http://$${private_ips[i]}:2379", + done + STORAGE_ENDPOINT=$${STORAGE_ENDPOINT%?} + echo hello + fi + while true; do + curl -sfL https://get.k3s.io | K3S_CLUSTER_SECRET="${k3s_cluster_secret}" \ + INSTALL_K3S_VERSION="${install_k3s_version}" \ + INSTALL_K3S_EXEC="${k3s_server_args} --cluster-cidr=10.0.0.0/8 --no-deploy traefik --no-deploy servicelb --tls-san ${lb_address} %{ if use_ha == "true" } --storage-endpoint=$STORAGE_ENDPOINT %{ endif }" sh - + if [ $? -eq 0 ]; then + break + fi + sleep 1 + done runcmd: - echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf - echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf @@ -41,14 +58,11 @@ runcmd: - echo "RateLimitBurst=0" >> /etc/systemd/journald.conf - systemctl restart systemd-journald.service - systemctl start resolvconf -- wget https://raw.githubusercontent.com/galal-hussein/k3s/k3s_with_kine_fix/k3s -- cp k3s /usr/local/bin/k3s -- chmod +x /usr/local/bin/k3s -%{if master_index != 0 } +%{ if master_index != 0 } - sleep 20 %{ endif } -- until (curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_DOWNLOAD=true K3S_CLUSTER_SECRET="${k3s_cluster_secret}" INSTALL_K3S_VERSION="${install_k3s_version}" INSTALL_K3S_EXEC="${k3s_server_args} --cluster-cidr=10.0.0.0/8 --no-deploy traefik --no-deploy servicelb --tls-san ${lb_address} %{ if use_ha == "true" } --storage-endpoint="postgres://${db_username}:${db_password}@${db_address}:5432/${db_name}" %{ if master_index == 0 }--bootstrap-save%{ endif } %{ endif }" sh -); do echo 'Error installing k3s'; sleep 1; done -%{if debug != 0 } +- /opt/k3s/run_k3s.sh +%{ if debug != 0 } - sed -i 's/bin\/k3s/bin\/k3s --debug/g' /etc/systemd/system/k3s.service - systemctl daemon-reload - systemctl restart k3s diff --git a/tests/perf/server/files/worker_userdata.tmpl b/tests/perf/server/files/worker_userdata.tmpl index 90712c0bdc..ae2aaa9609 100644 --- a/tests/perf/server/files/worker_userdata.tmpl +++ b/tests/perf/server/files/worker_userdata.tmpl @@ -23,7 +23,7 @@ runcmd: - echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf - echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf - sysctl -p -- wget https://raw.githubusercontent.com/galal-hussein/k3s/k3s_with_kine_fix/k3s +- wget https://raw.githubusercontent.com/galal-hussein/k3s/scale_test/k3s - cp k3s /usr/local/bin/k3s - chmod +x /usr/local/bin/k3s - until (curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${install_k3s_version} INSTALL_K3S_EXEC="${k3s_exec}" K3S_URL=https://${k3s_url}:6443 K3S_CLUSTER_SECRET="${k3s_cluster_secret}" sh -); do echo 'k3s did not install correctly'; sleep 1; done diff --git a/tests/perf/server/main.tf b/tests/perf/server/main.tf index bffc863b2a..9c5bdcceda 100644 --- a/tests/perf/server/main.tf +++ b/tests/perf/server/main.tf @@ -5,10 +5,11 @@ terraform { } locals { - name = var.name - k3s_cluster_secret = var.k3s_cluster_secret - 
-  install_k3s_version    = var.k3s_version
-  prom_worker_node_count = var.prom_worker_node_count
+  name                      = var.name
+  k3s_cluster_secret        = var.k3s_cluster_secret
+  install_k3s_version       = var.k3s_version
+  prom_worker_node_count    = var.prom_worker_node_count
+  prom_worker_instance_type = var.prom_worker_instance_type
 }
 
 provider "aws" {
@@ -57,12 +58,12 @@ resource "aws_security_group" "k3s" {
 }
 
 resource "aws_db_instance" "k3s_db" {
-  count             = "${var.k3s_ha}"
+  count             = "${var.db_engine == "etcd" ? 0 : var.server_ha}"
   allocated_storage = 100 #baseline iops is 300 with gp2
   storage_type      = "io1"
   iops              = "3000"
-  engine            = "postgres"
-  engine_version    = "11.5"
+  engine            = "${var.db_engine}"
+  engine_version    = "${var.db_version}"
   instance_class    = "${var.db_instance_type}"
   name              = "${var.db_name}"
   username          = "${var.db_username}"
@@ -71,13 +72,48 @@ resource "aws_db_instance" "k3s_db" {
   multi_az          = false
 }
 
+resource "aws_instance" "k3s_etcd" {
+  count         = "${var.etcd_count * (var.db_engine == "etcd" ? 1 * var.server_ha : 0)}"
+  instance_type = replace(var.db_instance_type, "/db./", "")
+  ami           = data.aws_ami.ubuntu.id
+  user_data = base64encode(templatefile("${path.module}/files/etcd.tmpl",
+    {
+      extra_ssh_keys = var.extra_ssh_keys,
+      db_version     = var.db_version
+      etcd_count     = var.etcd_count
+  }))
+  security_groups = [
+    aws_security_group.k3s.name,
+  ]
+
+  root_block_device {
+    volume_size = "100"
+    volume_type = "gp2"
+  }
+
+  tags = {
+    Name = "${local.name}-etcd-${count.index}"
+  }
+}
+
 resource "aws_lb" "k3s-master-nlb" {
   name               = "${local.name}-nlb"
   internal           = false
   load_balancer_type = "network"
-  subnets            = [data.aws_subnet.selected.id]
+  subnets            = data.aws_subnet_ids.available.ids
 }
 
+resource "aws_route53_record" "www" {
+  # currently this is the only way to use the nlb dns name in k3s,
+  # because the real dns name is too long and causes an issue
+  zone_id = "${var.zone_id}"
+  name    = "${var.domain_name}"
+  type    = "CNAME"
+  ttl     = "30"
+  records = ["${aws_lb.k3s-master-nlb.dns_name}"]
+}
+
+
 resource "aws_lb_target_group" "k3s-master-nlb-tg" {
   name     = "${local.name}-nlb-tg"
   port     = "6443"
@@ -104,35 +140,33 @@ resource "aws_lb_listener" "k3s-master-nlb-tg" {
 }
 
 resource "aws_lb_target_group_attachment" "test" {
-  count            = "${var.master_count}"
+  count            = "${var.server_count}"
   target_group_arn = "${aws_lb_target_group.k3s-master-nlb-tg.arn}"
-  target_id        = "${aws_spot_instance_request.k3s-server[count.index].spot_instance_id}"
+  target_id        = "${aws_instance.k3s-server[count.index].id}"
   port             = 6443
 }
 
-resource "aws_spot_instance_request" "k3s-server" {
-  count         = "${var.master_count}"
+resource "aws_instance" "k3s-server" {
+  count         = "${var.server_count}"
   instance_type = var.server_instance_type
   ami           = data.aws_ami.ubuntu.id
   user_data = base64encode(templatefile("${path.module}/files/server_userdata.tmpl",
     {
      extra_ssh_keys = var.extra_ssh_keys,
-     metrics_yaml = base64encode(data.template_file.metrics.rendered),
-     prom_yaml = base64encode(data.template_file.k3s-prom-yaml.rendered),
      k3s_cluster_secret = local.k3s_cluster_secret,
     install_k3s_version = local.install_k3s_version,
     k3s_server_args = var.k3s_server_args,
-    db_address = aws_db_instance.k3s_db[0].address,
-    db_name = aws_db_instance.k3s_db[0].name,
-    db_username = aws_db_instance.k3s_db[0].username,
-    db_password = aws_db_instance.k3s_db[0].password,
-    use_ha = "${var.k3s_ha == 1 ? "true": "false"}",
+    db_engine = var.db_engine
+    db_address = "${var.db_engine == "etcd" ? join(",",aws_instance.k3s_etcd.*.private_ip) : aws_db_instance.k3s_db[0].address}",
+    db_name = var.db_name,
+    db_username = var.db_username,
+    db_password = var.db_password,
+    use_ha = "${var.server_ha == 1 ? "true": "false"}",
     master_index = count.index,
-    lb_address = aws_lb.k3s-master-nlb.dns_name,
+    lb_address = var.domain_name,
     prom_worker_node_count = local.prom_worker_node_count,
     debug = var.debug,}))
-
-  wait_for_fulfillment = true
   security_groups = [
     aws_security_group.k3s.name,
   ]
@@ -155,9 +189,9 @@ module "k3s-prom-worker-asg" {
   version  = "3.0.0"
   name     = "${local.name}-prom-worker"
   asg_name = "${local.name}-prom-worker"
-  instance_type = "m5.large"
+  instance_type = local.prom_worker_instance_type
   image_id      = data.aws_ami.ubuntu.id
-  user_data = base64encode(templatefile("${path.module}/files/worker_userdata.tmpl", { extra_ssh_keys = var.extra_ssh_keys, k3s_url = aws_lb.k3s-master-nlb.dns_name, k3s_cluster_secret = local.k3s_cluster_secret, install_k3s_version = local.install_k3s_version, k3s_exec = "--node-label prom=true" }))
+  user_data = base64encode(templatefile("${path.module}/files/worker_userdata.tmpl", { extra_ssh_keys = var.extra_ssh_keys, k3s_url = var.domain_name, k3s_cluster_secret = local.k3s_cluster_secret, install_k3s_version = local.install_k3s_version, k3s_exec = "--node-label prom=true" }))
 
   desired_capacity  = local.prom_worker_node_count
   health_check_type = "EC2"
@@ -180,9 +214,22 @@ module "k3s-prom-worker-asg" {
   ]
 }
 
+resource "null_resource" "run_etcd" {
+  count = "${var.db_engine == "etcd" ? 1 : 0}"
+
+  triggers = {
+    etcd_instance_ids = "${join(",", aws_instance.k3s_etcd.*.id)}"
+  }
+
+  provisioner "local-exec" {
+    interpreter = ["bash", "-c"]
+    command     = "DB_VERSION=${var.db_version} SSH_KEY_PATH=${var.ssh_key_path} PUBLIC_IPS=${join(",",aws_instance.k3s_etcd.*.public_ip)} PRIVATE_IPS=${join(",",aws_instance.k3s_etcd.*.private_ip)} files/etcd_build.sh"
+  }
+}
+
 resource "null_resource" "get-kubeconfig" {
   provisioner "local-exec" {
     interpreter = ["bash", "-c"]
-    command     = "until ssh -i ${var.ssh_key_path} ubuntu@${aws_spot_instance_request.k3s-server[0].public_ip} 'sudo sed \"s/localhost/${aws_lb.k3s-master-nlb.dns_name}/g;s/127.0.0.1/${aws_lb.k3s-master-nlb.dns_name}/g\" /etc/rancher/k3s/k3s.yaml' >| ../tests/kubeconfig.yaml; do sleep 5; done"
+    command     = "until ssh -i ${var.ssh_key_path} ubuntu@${aws_instance.k3s-server[0].public_ip} 'sudo sed \"s/localhost/${var.domain_name}/g;s/127.0.0.1/${var.domain_name}/g\" /etc/rancher/k3s/k3s.yaml' >| ../tests/kubeconfig.yaml; do sleep 5; done"
   }
 }
diff --git a/tests/perf/server/outputs.tf b/tests/perf/server/outputs.tf
index 6e2ffd61ea..7c5c84fd2a 100644
--- a/tests/perf/server/outputs.tf
+++ b/tests/perf/server/outputs.tf
@@ -1,5 +1,5 @@
 output "public_ip" {
-  value = aws_lb.k3s-master-nlb.dns_name
+  value = var.domain_name
 }
 
 output "install_k3s_version" {
@@ -11,5 +11,5 @@ output "k3s_cluster_secret" {
 }
 
 output "k3s_server_ips" {
-  value = join(",", aws_spot_instance_request.k3s-server.*.public_ip)
+  value = join(",", aws_instance.k3s-server.*.public_ip)
 }
diff --git a/tests/perf/server/variables.tf b/tests/perf/server/variables.tf
index 0a7209ed42..cbe680d24a 100644
--- a/tests/perf/server/variables.tf
+++ b/tests/perf/server/variables.tf
@@ -23,12 +23,7 @@ variable "k3s_cluster_secret" {
   type        = string
   description = "Cluster secret for k3s cluster registration"
 }
-variable "prom_host" {
-  default = ""
-}
-variable "graf_host" {
"" -} + variable "name" { default = "k3s-loadtest" type = string @@ -47,11 +42,19 @@ variable "extra_ssh_keys" { description = "Extra ssh keys to inject into Rancher instances" } -variable "k3s_ha" { +variable "server_ha" { default = 0 description = "Enable k3s in HA mode" } +variable "etcd_count" { + default = 3 +} + +variable "db_engine" { + default = "postgres" +} + variable "db_instance_type" { } @@ -67,7 +70,9 @@ variable "db_password" { default = "b58bf234c4bd0133fc7a92b782e498a6" } -variable "master_count" { +variable "db_version" {} + +variable "server_count" { default = 1 description = "Count of k3s master servers" } @@ -76,3 +81,16 @@ variable "debug" { default = 0 description = "Enable Debug log" } + +variable "prom_worker_instance_type" { + default = "m5.large" + description = "Prometheus instance type" +} + +variable "domain_name" { + description = "FQDN of the cluster" +} + +variable "zone_id" { + description = "route53 zone id to register the domain name" +} diff --git a/tests/perf/tests/load/config.yaml b/tests/perf/tests/load/config.yaml index 413fd81eec..a0612bc371 100644 --- a/tests/perf/tests/load/config.yaml +++ b/tests/perf/tests/load/config.yaml @@ -7,7 +7,7 @@ #Constants {{$NODE_MODE := DefaultParam .NODE_MODE "allnodes"}} {{$NODES_PER_NAMESPACE := DefaultParam .NODES_PER_NAMESPACE 100}} -{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 100}} {{$LOAD_TEST_THROUGHPUT := DefaultParam .LOAD_TEST_THROUGHPUT 10}} {{$BIG_GROUP_SIZE := 1000}} {{$MEDIUM_GROUP_SIZE := 500}}