diff --git a/tests/perf/.gitignore b/tests/perf/.gitignore new file mode 100644 index 0000000000..99829f7ce8 --- /dev/null +++ b/tests/perf/.gitignore @@ -0,0 +1,7 @@ +.terraform* +*.tfstate* +*.tfvars* +*.plan* +*tests_results* +*junit.xml +*kubeconfig.yaml diff --git a/tests/perf/Makefile b/tests/perf/Makefile new file mode 100644 index 0000000000..9b216482d2 --- /dev/null +++ b/tests/perf/Makefile @@ -0,0 +1,27 @@ +MODULE := $(shell basename $$PWD) + +.PHONY: init config apply destroy clean test info + +init: + @scripts/perf init + +config: + @scripts/perf config + +plan: + @scripts/perf plan + +apply: + @scripts/perf apply + +destroy: + @scripts/perf destroy + +clean: + @scripts/perf clean + +test: + @scripts/test test_load + +info: + @scripts/perf info diff --git a/tests/perf/README.md b/tests/perf/README.md new file mode 100644 index 0000000000..60ae23a2c5 --- /dev/null +++ b/tests/perf/README.md @@ -0,0 +1,116 @@
+## K3S Performance Tests
+---
+
+These scripts use Terraform to automate building and testing k3s clusters on AWS. They support building normal and HA clusters with N master nodes, N worker nodes, and multiple storage backends, including:
+
+- MySQL RDS
+- Postgres RDS
+- Etcd
+- SQLite
+
+The scripts are divided into three sections:
+
+- server
+- agents
+- tests
+
+### Server
+
+The server section deploys the storage backend and then N master nodes. It can be customized to run in HA mode or as a single-node cluster with the SQLite backend, and it also supports a single master node with an external DB. The instance type and k3s version can be customized as well; all available options are described in the variables section below.
+
+The server section also creates one or more agent nodes dedicated to the Prometheus deployment; clusterloader2 will deploy Prometheus and Grafana on them.
+
+### Agents
+
+The agents section deploys the k3s agents. It can be customized with options that control the agent node count and the instance types.
+
+### Tests
+
+The tests section uses a fork of the [clusterloader2](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2) tool; the fork only modifies the logging and removes the etcd metrics probes.
+
+This section uses a dockerized version of the tool, which runs the tests and saves the report under `tests/` (`load_tests_results-*` or `density_tests_results-*`).
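+For reference, the `make test` target described under Usage below wraps a dockerized clusterloader2 invocation roughly like the sketch that follows. The image, mounts, and flags are the ones used by `scripts/test`; the private key path is whatever `PRIVATE_KEY_PATH` points to in `scripts/config`, and `<master-public-ip>` is a placeholder that the script fills in from the Terraform server output:
+
+```
+cd tests/perf/tests
+docker run -it \
+  -v ~/.ssh/id_rsa:/opt/priv_key \
+  -e KUBE_SSH_USER=ubuntu \
+  -e LOCAL_SSH_KEY=/opt/priv_key \
+  -v $PWD/:/opt/k3s/perf-tests \
+  husseingalal/clusterloader:dev \
+  clusterloader --testconfig /opt/k3s/perf-tests/load/config.yaml \
+    --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \
+    --masterip <master-public-ip> \
+    --provider=local \
+    --report-dir /opt/k3s/perf-tests/load_tests_results-$RANDOM \
+    --enable-prometheus-server \
+    --tear-down-prometheus-server=0
+```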
+
+The currently available tests are:
+
+- load test
+- density test
+
+## Variables
+
+The scripts can be customized through the variables in `scripts/config`. The variables include:
+
+**Main Variables**
+
+| Name | Description |
+|:----------------:|:--------------------------------------------------------------------------------:|
+| CLUSTER_NAME | The cluster name on AWS; this prefixes each component in the cluster |
+| DOMAIN_NAME | DNS name of the load balancer for the k3s master(s) |
+| ZONE_ID | AWS Route53 zone ID used to manage the DNS name |
+| K3S_VERSION | K3S version that will be used with the cluster |
+| EXTRA_SSH_KEYS | Public SSH keys that will be added to the servers |
+| PRIVATE_KEY_PATH | Private SSH key used by clusterloader2 to ssh to the nodes and collect metrics |
+| DEBUG | Debug mode for k3s servers |
+
+**Database Variables**
+
+| Name | Description |
+|:----------------:|:---------------------------------------------------------------------------------------------------------------:|
+| DB_ENGINE | The database type; one of "mysql", "postgres", "etcd", or "dqlite" |
+| DB_INSTANCE_TYPE | The RDS instance class for mysql and postgres; etcd also takes a db.* class, and the prefix is stripped internally to pick the EC2 instance type |
+| DB_NAME | Database name, created only for postgres and mysql |
+| DB_USERNAME | Database username, created only for postgres and mysql |
+| DB_PASSWORD | Database password for the user, created only for postgres and mysql |
+| DB_VERSION | Database version |
+
+**K3S Server Variables**
+
+| Name | Description |
+|:--------------------:|:------------------------------------------------------------------------------------:|
+| SERVER_HA | Whether to use HA mode; if not, SQLite is used as the storage backend |
+| SERVER_COUNT | k3s master node count |
+| SERVER_INSTANCE_TYPE | EC2 instance type created for the k3s server(s) |
+
+**K3S Agent Variables**
+
+| Name | Description |
+|:-------------------:|:------------------------------------------:|
+| AGENT_NODE_COUNT | Number of k3s agents that will be created |
+| AGENT_INSTANCE_TYPE | EC2 instance type created for k3s agents |
+
+**Prometheus Server Variables**
+
+| Name | Description |
+|:-------------------------:|:-------------------------------------------------------------------------:|
+| PROM_WORKER_NODE_COUNT | Number of k3s agents that will be created for the Prometheus deployment |
+| PROM_WORKER_INSTANCE_TYPE | EC2 instance type created for the k3s Prometheus agents |
+
+
+## Usage
+
+### build
+
+The repo includes a Makefile that drives the different sections. To build the master(s) and workers, adjust the config file in `tests/perf/scripts/config` and then run:
+
+```
+cd tests/perf
+make apply
+```
+
+This builds the db, server, and agent layers, and writes a kubeconfig file to `tests/kubeconfig.yaml`.
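+Once `make apply` finishes, you can point kubectl at the generated kubeconfig to sanity-check the cluster before running any tests (a minimal check, assuming the kubeconfig was written as described above):
+
+```
+cd tests/perf
+export KUBECONFIG=$PWD/tests/kubeconfig.yaml
+kubectl get nodes -o wide
+kubectl get pods -A
+```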
+ +### test + +To start the clusterloader2 load test you can modify the tests/perf/tests/load/config.yaml and then run the following: + +``` +cd tests/perf +make test +``` + +### destroy + +To destroy the cluster just run the following: +``` +make destroy +make clean +``` diff --git a/tests/perf/agents/data.tf b/tests/perf/agents/data.tf new file mode 100644 index 0000000000..bff5eb3ea3 --- /dev/null +++ b/tests/perf/agents/data.tf @@ -0,0 +1,44 @@ +data "terraform_remote_state" "server" { + backend = "local" + + config = { + path = "${path.module}/../server/server.tfstate" + } +} + +data "aws_vpc" "default" { + default = true +} + +data "aws_subnet_ids" "available" { + vpc_id = data.aws_vpc.default.id +} + +data "aws_subnet" "selected" { + id = "${tolist(data.aws_subnet_ids.available.ids)[1]}" +} + +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] + + filter { + name = "name" + values = ["ubuntu-minimal/images/*/ubuntu-bionic-18.04-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "root-device-type" + values = ["ebs"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } +} diff --git a/tests/perf/agents/files/pool_worker_userdata.tmpl b/tests/perf/agents/files/pool_worker_userdata.tmpl new file mode 100644 index 0000000000..b117a5635b --- /dev/null +++ b/tests/perf/agents/files/pool_worker_userdata.tmpl @@ -0,0 +1,30 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- apt-get update +- apt-get install -y software-properties-common +- apt-get install -y resolvconf linux-headers-$(uname -r) && echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail && systemctl start resolvconf +- DEBIAN_FRONTEND=noninteractive apt-get upgrade -y +- until (curl -sfL https://get.k3s.io | K3S_URL=https://${k3s_url}:6443 K3S_CLUSTER_SECRET="${k3s_cluster_secret}" K3S_CLUSTER_SECRET="${k3s_cluster_secret}" INSTALL_K3S_VERSION="${install_k3s_version}" sh -); do echo 'Error installing k3s agent'; sleep 1; done diff --git a/tests/perf/agents/main.tf b/tests/perf/agents/main.tf new file mode 100644 index 0000000000..ece3c1a8e9 --- /dev/null +++ b/tests/perf/agents/main.tf @@ -0,0 +1,83 @@ +terraform { + backend "local" { + path = "pool.tfstate" + } +} + +locals { + name = var.name + k3s_cluster_secret = var.k3s_cluster_secret +} + +provider "aws" { + region = "us-east-2" + profile = "rancher-eng" +} + +resource 
"aws_security_group" "k3s" { + name = "${local.name}-pool" + vpc_id = data.aws_vpc.default.id + + ingress { + from_port = 22 + to_port = 22 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + self = true + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +module "k3s-pool-worker-asg" { + source = "terraform-aws-modules/autoscaling/aws" + version = "3.0.0" + name = "${local.name}-pool" + asg_name = "${local.name}-pool" + instance_type = var.agent_instance_type + image_id = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/pool_worker_userdata.tmpl", { k3s_url = data.terraform_remote_state.server.outputs.public_ip, k3s_cluster_secret = local.k3s_cluster_secret, extra_ssh_keys = var.extra_ssh_keys, install_k3s_version = var.k3s_version })) + ebs_optimized = true + + default_cooldown = 10 + health_check_grace_period = 30 + wait_for_capacity_timeout = "60m" + + desired_capacity = var.agent_node_count + health_check_type = "EC2" + max_size = var.agent_node_count + min_size = var.agent_node_count + vpc_zone_identifier = [data.aws_subnet.selected.id] + spot_price = "0.680" + + security_groups = [ + aws_security_group.k3s.id, + ] + + lc_name = "${local.name}-pool" + + root_block_device = [ + { + volume_size = "30" + volume_type = "gp2" + }, + ] +} diff --git a/tests/perf/agents/outputs.tf b/tests/perf/agents/outputs.tf new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/perf/agents/variables.tf b/tests/perf/agents/variables.tf new file mode 100644 index 0000000000..8cf3b28a9f --- /dev/null +++ b/tests/perf/agents/variables.tf @@ -0,0 +1,33 @@ +variable "agent_node_count" { + description = "Number of nodes to run k3s agents on." 
+ type = number + # default = 10 +} + +variable "agent_instance_type" { + type = string + default = "t3.2xlarge" +} + +variable "extra_ssh_keys" { + type = list + default = [] + description = "Extra ssh keys to inject into Rancher instances" +} + +variable "k3s_version" { + default = "v0.9.1" + type = string + description = "Version of K3S to install" +} + +variable "name" { + default = "k3s-loadtest" + type = string + description = "Name to identify this cluster" +} + +variable "k3s_cluster_secret" { + type = string + description = "Cluster secret for k3s cluster registration" +} \ No newline at end of file diff --git a/tests/perf/agents/versions.tf b/tests/perf/agents/versions.tf new file mode 100644 index 0000000000..ac97c6ac8e --- /dev/null +++ b/tests/perf/agents/versions.tf @@ -0,0 +1,4 @@ + +terraform { + required_version = ">= 0.12" +} diff --git a/tests/perf/scripts/config b/tests/perf/scripts/config new file mode 100755 index 0000000000..5467439d05 --- /dev/null +++ b/tests/perf/scripts/config @@ -0,0 +1,35 @@ +## MAIN VARIABLES ## +#################### +CLUSTER_NAME="loadtest-k3s" +CLUSTER_SECRET="" +DOMAIN_NAME="loadtest.eng.rancher.space" +ZONE_ID="" +K3S_VERSION="v1.0.0" +EXTRA_SSH_KEYS="" # comma separated public keys +PRIVATE_KEY_PATH="~/.ssh/id_rsa" +DEBUG=1 + +## K3S DB VARIABLES ## +########################## +DB_ENGINE="dqlite" +DB_INSTANCE_TYPE="db.m4.4xlarge" +DB_NAME="k3s" +DB_USERNAME="k3suser" +DB_PASSWORD="" +DB_VERSION=5.7 + +## K3S SERVER VARIABLES ## +########################## +SERVER_HA=1 +SERVER_COUNT=3 +SERVER_INSTANCE_TYPE="m5.2xlarge" + +## PROMETHEUS SERVER VARIABLES ## +################################# +PROM_WORKER_NODE_COUNT=1 +PROM_WORKER_INSTANCE_TYPE="m5.large" + +## K3S AGENTS VARIABLES ## +########################## +AGENT_NODE_COUNT=10 +AGENT_INSTANCE_TYPE="m5.large" diff --git a/tests/perf/scripts/perf b/tests/perf/scripts/perf new file mode 100755 index 0000000000..e5e2f4f6a1 --- /dev/null +++ b/tests/perf/scripts/perf @@ -0,0 +1,136 @@ +#!/bin/bash -ex + +TERRAFORM_PLAN_CMD="terraform plan --var-file variables.tfvars --out k3s.plan" +TERRAFORM_APPLY_CMD="terraform apply k3s.plan" +TERRAFORM_DESTROY_CMD="terraform destroy --var-file variables.tfvars --force" + +for bin in docker kubectl terraform; do + type $bin >/dev/null 2>&1 || (echo "$bin is not in the path. Please make sure it is installed and in PATH."; exit 1) +done + +init() { + for i in server agents; do + pushd $i + terraform init + popd + done +} + +apply() { + # init terraform + init + # configure variables + config + # Run apply for server and agents + for i in server agents; do + if [ $i == "agents" ]; then + echo "Sleeping 1 minute until server(s) is initialized" + sleep 60 + fi + pushd $i + $TERRAFORM_PLAN_CMD + $TERRAFORM_APPLY_CMD + popd + done +} + +plan() { + # configure variables + config + # Run plan for server and agents + for i in server agents; do + pushd $i + $TERRAFORM_PLAN_CMD + popd + done +} + + +config() { + source scripts/config + pushd ./server + eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH + EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH` + if [ -z "$DB_PASSWORD" ]; then + # randomize database password + DB_PASSWORD=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1) + fi + if [ -z "$CLUSTER_SECRET" ]; then + # randomize cluster secret + CLUSTER_SECRET=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1) + fi +cat <<MAIN >
variables.tfvars +name = "${CLUSTER_NAME}" +k3s_cluster_secret = "${CLUSTER_SECRET}" +db_instance_type = "${DB_INSTANCE_TYPE}" +db_name = "${DB_NAME}" +db_username = "${DB_USERNAME}" +db_password = "${DB_PASSWORD}" +db_engine = "${DB_ENGINE}" +db_version = "${DB_VERSION}" +server_instance_type = "${SERVER_INSTANCE_TYPE}" +extra_ssh_keys = ["${EXTRA_SSH_KEYS}"] +server_count = ${SERVER_COUNT} +server_ha = ${SERVER_HA} +k3s_version = "${K3S_VERSION}" +prom_worker_node_count = ${PROM_WORKER_NODE_COUNT} +prom_worker_instance_type = "${PROM_WORKER_INSTANCE_TYPE}" +ssh_key_path = "${EXPANDED_PRIV_KEY_PATH}" +debug = ${DEBUG} +domain_name = "${DOMAIN_NAME}" +zone_id = "${ZONE_ID}" +MAIN +popd + +pushd ./agents +cat <<MAIN >
variables.tfvars +name = "${CLUSTER_NAME}" +extra_ssh_keys = ["${EXTRA_SSH_KEYS}"] +k3s_version = "${K3S_VERSION}" +agent_node_count = ${AGENT_NODE_COUNT} +agent_instance_type = "${AGENT_INSTANCE_TYPE}" +k3s_cluster_secret = "${CLUSTER_SECRET}" +MAIN +popd +} + +clean() { + # clean server and agents + for i in server agents; do + pushd $i + rm -f *.plan *.tfvars *.tfstate* + popd + done +} + +cleanall() { + clean + # clean kubeconfig + pushd tests/ + rm -f kubeconfig + rm -rf load_tests_results* + rm -rf density_tests_results* + popd +} + +destroy() { + for i in agents server; do + pushd $i + terraform destroy --var-file variables.tfvars --force + popd + done + clean +} + +info() { + set +x + for i in agents server; do + pushd $i + if [ -f $i.tfstate ]; then + terraform output --state=$i.tfstate + fi + popd + done +} + +$@ diff --git a/tests/perf/scripts/test b/tests/perf/scripts/test new file mode 100755 index 0000000000..5866907849 --- /dev/null +++ b/tests/perf/scripts/test @@ -0,0 +1,52 @@ +#!/bin/bash -ex + +test_load() { + source scripts/config + eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH + EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH` + masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2` + pushd tests/ + docker run -v $EXPANDED_PRIV_KEY_PATH:/opt/priv_key \ + -e KUBE_SSH_USER=ubuntu \ + -e LOCAL_SSH_KEY=/opt/priv_key \ + -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \ + clusterloader --testconfig /opt/k3s/perf-tests/load/config.yaml \ + --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \ + --masterip $masterips \ + --provider=local \ + --report-dir /opt/k3s/perf-tests/load_tests_results-$RANDOM \ + --enable-prometheus-server \ + --tear-down-prometheus-server=0 + popd +} + +test_density() { + source scripts/config + eval PRIVATE_KEY_PATH=$PRIVATE_KEY_PATH + EXPANDED_PRIV_KEY_PATH=`readlink -f $PRIVATE_KEY_PATH` + masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2` + pushd tests/ + docker run -e KUBE_SSH_USER=ubuntu \ + -v $EXPANDED_PRIV_KEY_PATH:/opt/priv_key \ + -e LOCAL_SSH_KEY=/opt/priv_key \ + -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \ + clusterloader --testconfig /opt/k3s/perf-tests/density/config.yaml \ + --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \ + --masterip $masterips \ + --provider=local \ + --report-dir /opt/k3s/perf-tests/density_tests_results-$RANDOM \ + --enable-prometheus-server \ + --tear-down-prometheus-server=0 + popd +} + +clean() { + # clean kubeconfig + pushd tests/ + rm -f kubeconfig + rm -rf load_tests_results* + rm -rf density_tests_results/ + popd +} + +$@ diff --git a/tests/perf/server/data.tf b/tests/perf/server/data.tf new file mode 100644 index 0000000000..240c9f225e --- /dev/null +++ b/tests/perf/server/data.tf @@ -0,0 +1,36 @@ +data "aws_vpc" "default" { + default = true +} + +data "aws_subnet_ids" "available" { + vpc_id = data.aws_vpc.default.id +} + +data "aws_subnet" "selected" { + id = "${tolist(data.aws_subnet_ids.available.ids)[1]}" +} + +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] + + filter { + name = "name" + values = ["ubuntu-minimal/images/*/ubuntu-bionic-18.04-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "root-device-type" + values = ["ebs"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } +} diff --git a/tests/perf/server/files/etcd.tmpl b/tests/perf/server/files/etcd.tmpl new file mode 100644 
index 0000000000..41727d6708 --- /dev/null +++ b/tests/perf/server/files/etcd.tmpl @@ -0,0 +1,31 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- apt-get update +- apt-get install -y git vim software-properties-common resolvconf linux-headers-$(uname -r) +- echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail +- echo "RateLimitIntervalSec=0" >> /etc/systemd/journald.conf +- echo "RateLimitBurst=0" >> /etc/systemd/journald.conf +- curl -sSL https://releases.rancher.com/install-docker/19.03.sh | sh diff --git a/tests/perf/server/files/etcd_build.sh b/tests/perf/server/files/etcd_build.sh new file mode 100755 index 0000000000..51d3074668 --- /dev/null +++ b/tests/perf/server/files/etcd_build.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -x + +IFS=',' read -r -a public_ips <<< "$PUBLIC_IPS" +IFS=',' read -r -a private_ips <<< "$PRIVATE_IPS" + +conn_string="" +for i in "${!private_ips[@]}"; do + conn_string=$conn_string"etcd-$i=http://${private_ips[i]}:2380," +done +conn_string=${conn_string%?} +for i in "${!public_ips[@]}"; do + while true; do + ssh -i $SSH_KEY_PATH -l ubuntu ${public_ips[i]} "sudo docker run -v /etcd-data:/etcd-data -d -p ${private_ips[i]}:2379:2379 -p ${private_ips[i]}:2380:2380 quay.io/coreos/etcd:$DB_VERSION etcd --initial-advertise-peer-urls http://${private_ips[i]}:2380 --name=etcd-$i --data-dir=/etcd-data --advertise-client-urls=http://0.0.0.0:2379 --listen-peer-urls=http://0.0.0.0:2380 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-token=etcd-cluster-1 --initial-cluster-state new --initial-cluster $conn_string" + if [ $? 
== 0 ]; then + break + fi + sleep 10 + done +done + +# diff --git a/tests/perf/server/files/server_userdata.tmpl b/tests/perf/server/files/server_userdata.tmpl new file mode 100644 index 0000000000..e831429736 --- /dev/null +++ b/tests/perf/server/files/server_userdata.tmpl @@ -0,0 +1,75 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +write_files: +- path: /opt/k3s/run_k3s.sh + permissions: "0755" + owner: root:root + content: | + #!/bin/bash + set -x + if [ ${db_engine} == "postgres" ]; then + STORAGE_ENDPOINT="postgres://${db_username}:${db_password}@${db_address}:5432/${db_name}" + elif [ ${db_engine} == "mysql" ]; then + STORAGE_ENDPOINT="mysql://${db_username}:${db_password}@(${db_address})/${db_name}" + elif [ ${db_engine} == "etcd" ]; then + IFS=',' read -r -a private_ips <<< "${db_address}" + for i in "$${!private_ips[@]}"; do + STORAGE_ENDPOINT=$STORAGE_ENDPOINT"http://$${private_ips[i]}:2379", + done + STORAGE_ENDPOINT=$${STORAGE_ENDPOINT%?} + fi + + while true; do + if [ ${db_engine} == "dqlite" ]; then + curl -sfL https://get.k3s.io | K3S_CLUSTER_SECRET="${k3s_cluster_secret}" \ + INSTALL_K3S_VERSION="${install_k3s_version}" \ + INSTALL_K3S_EXEC="${k3s_server_args} --cluster-cidr=10.0.0.0/8 --no-deploy traefik --no-deploy servicelb --tls-san ${lb_address} %{ if master_index != 0 } --server https://${lb_address}:6443 %{ else } --cluster-init %{ endif }" sh - + else + curl -sfL https://get.k3s.io | K3S_CLUSTER_SECRET="${k3s_cluster_secret}" \ + INSTALL_K3S_VERSION="${install_k3s_version}" \ + INSTALL_K3S_EXEC="${k3s_server_args} --cluster-cidr=10.0.0.0/8 --no-deploy traefik --no-deploy servicelb --tls-san ${lb_address} %{ if use_ha == "true" } --datastore-endpoint=$STORAGE_ENDPOINT %{ endif }" sh - + fi + if [ $? 
-eq 0 ]; then + break + fi + sleep 1 + done +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- apt-get update +- apt-get install -y git vim software-properties-common resolvconf linux-headers-$(uname -r) +- echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail +- echo "RateLimitIntervalSec=0" >> /etc/systemd/journald.conf +- echo "RateLimitBurst=0" >> /etc/systemd/journald.conf +- systemctl restart systemd-journald.service +- systemctl start resolvconf +%{ if master_index != 0 } +- sleep 20 +%{ endif } +- /opt/k3s/run_k3s.sh +%{ if debug != 0 } +- sed -i 's/bin\/k3s/bin\/k3s --debug/g' /etc/systemd/system/k3s.service +- systemctl daemon-reload +- systemctl restart k3s +%{ endif } diff --git a/tests/perf/server/files/worker_userdata.tmpl b/tests/perf/server/files/worker_userdata.tmpl new file mode 100644 index 0000000000..e451a6d0ce --- /dev/null +++ b/tests/perf/server/files/worker_userdata.tmpl @@ -0,0 +1,26 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- until (curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${install_k3s_version} INSTALL_K3S_EXEC="${k3s_exec}" K3S_URL=https://${k3s_url}:6443 K3S_CLUSTER_SECRET="${k3s_cluster_secret}" sh -); do echo 'k3s did not install correctly'; sleep 1; done diff --git a/tests/perf/server/main.tf b/tests/perf/server/main.tf new file mode 100644 index 0000000000..0e5e1895c8 --- /dev/null +++ b/tests/perf/server/main.tf @@ -0,0 +1,236 @@ +terraform { + backend "local" { + path = 
"server.tfstate" + } +} + +locals { + name = var.name + k3s_cluster_secret = var.k3s_cluster_secret + install_k3s_version = var.k3s_version + prom_worker_node_count = var.prom_worker_node_count + prom_worker_instance_type = var.prom_worker_instance_type +} + +provider "aws" { + region = "us-east-2" + profile = "rancher-eng" +} + +resource "aws_security_group" "k3s" { + name = "${local.name}-sg" + vpc_id = data.aws_vpc.default.id + + ingress { + from_port = 22 + to_port = 22 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 6443 + to_port = 6443 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + self = true + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_db_instance" "k3s_db" { + count = "${var.db_engine == "postgres" || var.db_engine == "mysql" ? 1 : 0 }" + allocated_storage = 100 #baseline iops is 300 with gp2 + storage_type = "gp2" + engine = "${var.db_engine}" + engine_version = "${var.db_version}" + instance_class = "${var.db_instance_type}" + name = "${var.db_name}" + username = "${var.db_username}" + password = "${var.db_password}" + skip_final_snapshot = true + multi_az = false +} + +resource "aws_instance" "k3s_etcd" { + count = "${var.etcd_count * (var.db_engine == "etcd" ? 1 * var.server_ha : 0)}" + instance_type = replace(var.db_instance_type, "/db./", "") + ami = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/etcd.tmpl", + { + extra_ssh_keys = var.extra_ssh_keys, + db_version = var.db_version + etcd_count = var.etcd_count + })) + security_groups = [ + aws_security_group.k3s.name, + ] + + root_block_device { + volume_size = "30" + volume_type = "gp2" + } + + tags = { + Name = "${local.name}-etcd-${count.index}" + } +} + +resource "aws_lb" "k3s-master-nlb" { + name = "${local.name}-nlb" + internal = false + load_balancer_type = "network" + subnets = data.aws_subnet_ids.available.ids +} + +resource "aws_route53_record" "www" { + # currently there is the only way to use nlb dns name in k3s + # because the real dns name is too long and cause an issue + zone_id = "${var.zone_id}" + name = "${var.domain_name}" + type = "CNAME" + ttl = "30" + records = ["${aws_lb.k3s-master-nlb.dns_name}"] +} + + +resource "aws_lb_target_group" "k3s-master-nlb-tg" { + name = "${local.name}-nlb-tg" + port = "6443" + protocol = "TCP" + vpc_id = data.aws_vpc.default.id + deregistration_delay = "300" + health_check { + interval = "30" + port = "6443" + protocol = "TCP" + healthy_threshold = "10" + unhealthy_threshold= "10" + } +} + +resource "aws_lb_listener" "k3s-master-nlb-tg" { + load_balancer_arn = "${aws_lb.k3s-master-nlb.arn}" + port = "6443" + protocol = "TCP" + default_action { + target_group_arn = "${aws_lb_target_group.k3s-master-nlb-tg.arn}" + type = "forward" + } +} + +resource "aws_lb_target_group_attachment" "test" { + count = "${var.server_count}" + target_group_arn = "${aws_lb_target_group.k3s-master-nlb-tg.arn}" + target_id = "${aws_instance.k3s-server[count.index].id}" + port = 6443 +} + +resource "aws_instance" "k3s-server" { + count = "${var.server_count}" + instance_type = var.server_instance_type + ami = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/server_userdata.tmpl", + { + extra_ssh_keys = var.extra_ssh_keys, + k3s_cluster_secret = 
local.k3s_cluster_secret, + install_k3s_version = local.install_k3s_version, + k3s_server_args = var.k3s_server_args, + db_engine = var.db_engine, + db_address = "${var.db_engine == "etcd" ? join(",",aws_instance.k3s_etcd.*.private_ip) : var.db_engine == "dqlite" ? "null" : aws_db_instance.k3s_db[0].address}", + db_name = var.db_name, + db_username = var.db_username, + db_password = var.db_password, + use_ha = "${var.server_ha == 1 ? "true": "false"}", + master_index = count.index, + lb_address = var.domain_name, + prom_worker_node_count = local.prom_worker_node_count, + debug = var.debug, + k3s_cluster_secret = local.k3s_cluster_secret,})) + security_groups = [ + aws_security_group.k3s.name, + ] + + root_block_device { + volume_size = "30" + volume_type = "gp2" + } + + tags = { + Name = "${local.name}-server-${count.index}" + Role = "master" + Leader = "${count.index == 0 ? "true" : "false"}" + } + provisioner "local-exec" { + command = "sleep 10" + } +} + +module "k3s-prom-worker-asg" { + source = "terraform-aws-modules/autoscaling/aws" + version = "3.0.0" + name = "${local.name}-prom-worker" + asg_name = "${local.name}-prom-worker" + instance_type = local.prom_worker_instance_type + image_id = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/worker_userdata.tmpl", { extra_ssh_keys = var.extra_ssh_keys, k3s_url = var.domain_name, k3s_cluster_secret = local.k3s_cluster_secret, install_k3s_version = local.install_k3s_version, k3s_exec = "--node-label prom=true" })) + + desired_capacity = local.prom_worker_node_count + health_check_type = "EC2" + max_size = local.prom_worker_node_count + min_size = local.prom_worker_node_count + vpc_zone_identifier = [data.aws_subnet.selected.id] + spot_price = "0.340" + + security_groups = [ + aws_security_group.k3s.id, + ] + + lc_name = "${local.name}-prom-worker" + + root_block_device = [ + { + volume_size = "30" + volume_type = "gp2" + }, + ] +} + +resource "null_resource" "run_etcd" { + count = "${var.db_engine == "etcd" ? 
1 : 0}" + + triggers = { + etcd_instance_ids = "${join(",", aws_instance.k3s_etcd.*.id)}" + } + + provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = "DB_VERSION=${var.db_version} SSH_KEY_PATH=${var.ssh_key_path} PUBLIC_IPS=${join(",",aws_instance.k3s_etcd.*.public_ip)} PRIVATE_IPS=${join(",",aws_instance.k3s_etcd.*.private_ip)} files/etcd_build.sh" + } +} + +resource "null_resource" "get-kubeconfig" { + provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = "until ssh -i ${var.ssh_key_path} ubuntu@${aws_instance.k3s-server[0].public_ip} 'sudo sed \"s/localhost/$var.domain_name}/g;s/127.0.0.1/${var.domain_name}/g\" /etc/rancher/k3s/k3s.yaml' >| ../tests/kubeconfig.yaml; do sleep 5; done" + } +} diff --git a/tests/perf/server/outputs.tf b/tests/perf/server/outputs.tf new file mode 100644 index 0000000000..7c5c84fd2a --- /dev/null +++ b/tests/perf/server/outputs.tf @@ -0,0 +1,15 @@ +output "public_ip" { + value = var.domain_name +} + +output "install_k3s_version" { + value = local.install_k3s_version +} + +output "k3s_cluster_secret" { + value = local.k3s_cluster_secret +} + +output "k3s_server_ips" { + value = join(",", aws_instance.k3s-server.*.public_ip) +} diff --git a/tests/perf/server/variables.tf b/tests/perf/server/variables.tf new file mode 100644 index 0000000000..3ae6f96fea --- /dev/null +++ b/tests/perf/server/variables.tf @@ -0,0 +1,93 @@ +variable "server_instance_type" { + # default = "c4.8xlarge" +} + +variable "k3s_version" { + default = "v0.9.1" + type = string + description = "Version of K3S to install" +} + +variable "k3s_server_args" { + default = "" +} + +variable "prom_worker_node_count" { + default = 0 + type = number + description = "The number of workers to create labeled for prometheus" +} + +variable "k3s_cluster_secret" { + type = string + description = "Cluster secret for k3s cluster registration" +} + +variable "name" { + default = "k3s-loadtest" + type = string + description = "Name to identify this cluster" +} + +variable "ssh_key_path" { + default = "~/.ssh/id_rsa" + type = string + description = "Path of the private key to ssh to the nodes" +} + +variable "extra_ssh_keys" { + type = list + default = [] + description = "Extra ssh keys to inject into Rancher instances" +} + +variable "server_ha" { + default = 0 + description = "Enable k3s in HA mode" +} + +variable "etcd_count" { + default = 3 +} + +variable "db_engine" { + default = "postgres" +} + +variable "db_instance_type" { +} + +variable "db_name" { + default = "k3s" +} + +variable "db_username" { + default = "postgres" +} + +variable "db_password" {} + +variable "db_version" {} + +variable "server_count" { + default = 1 + description = "Count of k3s master servers" +} + +variable "debug" { + default = 0 + description = "Enable Debug log" +} + +variable "prom_worker_instance_type" { + default = "m5.large" + description = "Prometheus instance type" +} + +variable "domain_name" { + description = "FQDN of the cluster" +} + +variable "zone_id" { + description = "route53 zone id to register the domain name" +} diff --git a/tests/perf/server/versions.tf b/tests/perf/server/versions.tf new file mode 100644 index 0000000000..ac97c6ac8e --- /dev/null +++ b/tests/perf/server/versions.tf @@ -0,0 +1,4 @@ + +terraform { + required_version = ">= 0.12" +} diff --git a/tests/perf/tests/density/2000_nodes/override.yaml b/tests/perf/tests/density/2000_nodes/override.yaml new file mode 100644 index 0000000000..8d38cbac56 --- /dev/null +++ 
b/tests/perf/tests/density/2000_nodes/override.yaml @@ -0,0 +1 @@ +NODE_MODE: masteranddns diff --git a/tests/perf/tests/density/5000_nodes/override.yaml b/tests/perf/tests/density/5000_nodes/override.yaml new file mode 100644 index 0000000000..8d38cbac56 --- /dev/null +++ b/tests/perf/tests/density/5000_nodes/override.yaml @@ -0,0 +1 @@ +NODE_MODE: masteranddns diff --git a/tests/perf/tests/density/600_nodes/high_density_override.yaml b/tests/perf/tests/density/600_nodes/high_density_override.yaml new file mode 100644 index 0000000000..56d78a0775 --- /dev/null +++ b/tests/perf/tests/density/600_nodes/high_density_override.yaml @@ -0,0 +1 @@ +PODS_PER_NODE: 95 diff --git a/tests/perf/tests/density/config.yaml b/tests/perf/tests/density/config.yaml new file mode 100644 index 0000000000..802d47acde --- /dev/null +++ b/tests/perf/tests/density/config.yaml @@ -0,0 +1,248 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). + +#Constants +{{$DENSITY_RESOURCE_CONSTRAINTS_FILE := DefaultParam .DENSITY_RESOURCE_CONSTRAINTS_FILE ""}} +{{$NODE_MODE := DefaultParam .NODE_MODE "allnodes"}} +{{$NODES_PER_NAMESPACE := DefaultParam .NODES_PER_NAMESPACE 100}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$DENSITY_TEST_THROUGHPUT := DefaultParam .DENSITY_TEST_THROUGHPUT 20}} +# LATENCY_POD_MEMORY and LATENCY_POD_CPU are calculated for 1-core 4GB node. +# Increasing allocation of both memory and cpu by 10% +# decreases the value of priority function in scheduler by one point. +# This results in decreased probability of choosing the same node again. +{{$LATENCY_POD_CPU := DefaultParam .LATENCY_POD_CPU 100}} +{{$LATENCY_POD_MEMORY := DefaultParam .LATENCY_POD_MEMORY 350}} +{{$MIN_LATENCY_PODS := 500}} +{{$MIN_SATURATION_PODS_TIMEOUT := 180}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$ENABLE_PROMETHEUS_API_RESPONSIVENESS := DefaultParam .ENABLE_PROMETHEUS_API_RESPONSIVENESS false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$podsPerNamespace := MultiplyInt $PODS_PER_NODE $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $podsPerNamespace $namespaces}} +{{$latencyReplicas := DivideInt (MaxInt $MIN_LATENCY_PODS .Nodes) $namespaces}} +{{$totalLatencyPods := MultiplyInt $namespaces $latencyReplicas}} +{{$saturationDeploymentTimeout := DivideFloat $totalPods $DENSITY_TEST_THROUGHPUT | AddInt $MIN_SATURATION_PODS_TIMEOUT}} +# saturationDeploymentHardTimeout must be at least 20m to make sure that ~10m node +# failure won't fail the test. 
See https://github.com/kubernetes/kubernetes/issues/73461#issuecomment-467338711 +{{$saturationDeploymentHardTimeout := MaxInt $saturationDeploymentTimeout 1200}} + +name: density +automanagedNamespaces: {{$namespaces}} +tuningSets: +- name: Uniform5qps + qpsLoad: + qps: 5 +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: reset + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + # TODO(oxddr): figure out how many probers to run in function of cluster + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + nodeMode: {{$NODE_MODE}} + resourceConstraints: {{$DENSITY_RESOURCE_CONSTRAINTS_FILE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + +- name: Starting saturation pod measurements + measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = saturation + threshold: {{$saturationDeploymentTimeout}}s + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = saturation + operationTimeout: {{$saturationDeploymentHardTimeout}}s + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = saturation + +- name: Creating saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: {{$podsPerNamespace}} + Group: saturation + CpuRequest: 1m + MemoryRequest: 10M + +- name: Collecting saturation pod measurements + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: gather +- measurements: + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + +- name: Starting latency pod measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = latency + operationTimeout: 15m + +- name: Creating latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$latencyReplicas}} + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: 1 + Group: latency + CpuRequest: {{$LATENCY_POD_CPU}}m + MemoryRequest: {{$LATENCY_POD_MEMORY}}M + +- name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + 
Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Deleting latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: deployment.yaml + +- name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + +- name: Deleting saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: deployment.yaml + +- name: Waiting for saturation pods to be deleted + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + {{if $ENABLE_PROMETHEUS_API_RESPONSIVENESS}} + enableViolations: true + {{end}} + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + {{if not $USE_SIMPLE_LATENCY_QUERY}} + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + {{end}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: gather + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: gather + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} diff --git a/tests/perf/tests/density/deployment.yaml b/tests/perf/tests/density/deployment.yaml new file mode 100644 index 0000000000..1903dbaf89 --- /dev/null +++ b/tests/perf/tests/density/deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + containers: + - image: k8s.gcr.io/pause:3.1 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/tests/perf/tests/load/config.yaml b/tests/perf/tests/load/config.yaml new file mode 100644 index 0000000000..a70765330a --- /dev/null +++ b/tests/perf/tests/load/config.yaml @@ -0,0 +1,765 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). +# - The number of created SVCs is half the number of created Deployments. +# - Only half of Deployments will be assigned 1-1 to existing SVCs. 
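+#
+# Worked example of the variable math below (added for illustration, assuming a
+# hypothetical 100-node cluster and the defaults declared in this file:
+# NODES_PER_NAMESPACE=10, PODS_PER_NODE=30, LOAD_TEST_THROUGHPUT=10):
+#   namespaces       = 100 / 10      = 10
+#   totalPods        = 10 * 10 * 30  = 3000
+#   podsPerNamespace = 3000 / 10     = 300
+#   saturationTime   = 3000 / 10     = 300s  (time budget for creating all load pods)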
+ +#Constants +{{$NODE_MODE := DefaultParam .NODE_MODE "allnodes"}} +{{$NODES_PER_NAMESPACE := DefaultParam .NODES_PER_NAMESPACE 10}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$LOAD_TEST_THROUGHPUT := DefaultParam .LOAD_TEST_THROUGHPUT 10}} +{{$BIG_GROUP_SIZE := 300}} +{{$MEDIUM_GROUP_SIZE := 150}} +{{$SMALL_GROUP_SIZE := 50}} +{{$SMALL_STATEFUL_SETS_PER_NAMESPACE := 1}} +{{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE := 1}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$ENABLE_PROMETHEUS_API_RESPONSIVENESS := DefaultParam .ENABLE_PROMETHEUS_API_RESPONSIVENESS false}} +{{$ENABLE_CONFIGMAPS := DefaultParam .ENABLE_CONFIGMAPS false}} +{{$ENABLE_DAEMONSETS := DefaultParam .ENABLE_DAEMONSETS false}} +{{$ENABLE_JOBS := DefaultParam .ENABLE_JOBS false}} +{{$ENABLE_PVS := DefaultParam .ENABLE_PVS false}} +{{$ENABLE_SECRETS := DefaultParam .ENABLE_SECRETS false}} +{{$ENABLE_STATEFULSETS := DefaultParam .ENABLE_STATEFULSETS false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .ENABLE_NETWORKPOLICIES false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $namespaces $NODES_PER_NAMESPACE $PODS_PER_NODE}} +{{$podsPerNamespace := DivideInt $totalPods $namespaces}} +{{$saturationTime := DivideInt $totalPods $LOAD_TEST_THROUGHPUT}} +# bigDeployments - 1/4 of namespace pods should be in big Deployments. +{{$bigDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $BIG_GROUP_SIZE)}} +# mediumDeployments - 1/4 of namespace pods should be in medium Deployments. +{{$mediumDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $MEDIUM_GROUP_SIZE)}} +# smallDeployments - 1/2 of namespace pods should be in small Deployments. +{{$smallDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 2 $SMALL_GROUP_SIZE)}} +# If StatefulSets are enabled reduce the number of small and medium deployments per namespace +{{$smallDeploymentsPerNamespace := SubtractInt $smallDeploymentsPerNamespace (IfThenElse $ENABLE_STATEFULSETS $SMALL_STATEFUL_SETS_PER_NAMESPACE 0)}} +{{$mediumDeploymentsPerNamespace := SubtractInt $mediumDeploymentsPerNamespace (IfThenElse $ENABLE_STATEFULSETS $MEDIUM_STATEFUL_SETS_PER_NAMESPACE 0)}} + +# If Jobs are enabled reduce the number of small, medium, big deployments per namespace. +{{$smallDeploymentsPerNamespace := SubtractInt $smallDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} +{{$mediumDeploymentsPerNamespace := SubtractInt $mediumDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} +{{$bigDeploymentsPerNamespace := SubtractInt $bigDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} + +name: load +automanagedNamespaces: {{$namespaces}} +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 +- name: RandomizedSaturationTimeLimited + RandomizedTimeLimitedLoad: + timeLimit: {{$saturationTime}}s +- name: RandomizedScalingTimeLimited + RandomizedTimeLimitedLoad: + # The expected number of created/deleted pods is totalPods/4 when scaling, + # as each RS changes its size from X to a uniform random value in [X/2, 3X/2]. + # To match 10 [pods/s] requirement, we need to divide saturationTime by 4. 
+ timeLimit: {{DivideInt $saturationTime 4}}s +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: reset + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = load + threshold: 1h + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}} + - Identifier: NetworkProgrammingLatency + Method: NetworkProgrammingLatency + Params: + action: start + {{end}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + nodeMode: {{$NODE_MODE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + +- name: Creating SVCs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $bigDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: big-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $mediumDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: medium-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $smallDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: small-service + objectTemplatePath: service.yaml + +{{if $ENABLE_DAEMONSETS}} +- name: Creating PriorityClass for DaemonSets + phases: + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: daemonset-priorityclass + objectTemplatePath: daemonset-priorityclass.yaml +{{end}} + +- name: Starting measurement for waiting for pods + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = load + operationTimeout: 15m + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: StatefulSet + labelSelector: group = load + operationTimeout: 15m + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: DaemonSet + labelSelector: group = load + operationTimeout: 15m + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: batch/v1 + kind: Job + labelSelector: group = load + operationTimeout: 15m + {{end}} + +- name: Creating objects + phases: + {{if $ENABLE_DAEMONSETS}} + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: daemonset + objectTemplatePath: daemonset.yaml + templateFillMap: + Image: k8s.gcr.io/pause:3.0 + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 
{{$bigDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: big-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: big-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: big-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: big-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$BIG_GROUP_SIZE}} + ReplicasMax: {{$BIG_GROUP_SIZE}} + SvcName: big-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: medium-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: medium-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: medium-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: medium-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + SvcName: medium-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: small-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: small-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: small-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: small-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + SvcName: small-service + {{if $ENABLE_STATEFULSETS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: small-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: medium-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + {{end}} + {{if $ENABLE_JOBS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: 
RandomizedSaturationTimeLimited + objectBundle: + - basename: big-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$BIG_GROUP_SIZE}} + ReplicasMax: {{$BIG_GROUP_SIZE}} + {{end}} + +- name: Waiting for pods to be running + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + +- name: Scaling and updating objects + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: big-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $BIG_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $BIG_GROUP_SIZE 1.5}} + SvcName: big-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: medium-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}} + SvcName: medium-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: small-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}} + SvcName: small-service + {{if $ENABLE_STATEFULSETS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: small-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: medium-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}} + {{end}} + {{if $ENABLE_DAEMONSETS}} + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: daemonset + objectTemplatePath: daemonset.yaml + templateFillMap: + Image: k8s.gcr.io/pause:3.1 + {{end}} + {{if $ENABLE_JOBS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: small-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: medium-job + 
objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: big-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $BIG_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $BIG_GROUP_SIZE 1.5}} + {{end}} + +- name: Waiting for objects to become scaled + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + +- name: Deleting objects + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: big-deployment + objectTemplatePath: deployment.yaml + {{if $ENABLE_CONFIGMAPS}} + - basename: big-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: big-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: big-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-deployment + objectTemplatePath: deployment.yaml + {{if $ENABLE_CONFIGMAPS}} + - basename: medium-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: medium-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: medium-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-deployment + objectTemplatePath: deployment.yaml + {{if $ENABLE_CONFIGMAPS}} + - basename: small-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: small-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: small-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + {{if $ENABLE_STATEFULSETS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-statefulset + objectTemplatePath: statefulset.yaml + - basename: small-statefulset + objectTemplatePath: statefulset_service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-statefulset + objectTemplatePath: statefulset.yaml + - basename: medium-statefulset + objectTemplatePath: statefulset_service.yaml + {{end}} + {{if $ENABLE_DAEMONSETS}} + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: daemonset + objectTemplatePath: daemonset.yaml + {{end}} + {{if 
$ENABLE_JOBS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-job + objectTemplatePath: job.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-job + objectTemplatePath: job.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: big-job + objectTemplatePath: job.yaml + {{end}} + # If both StatefulSets and PVs were enabled we need to delete PVs manually. + {{if and $ENABLE_STATEFULSETS $ENABLE_PVS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{range $ssIndex := Seq $SMALL_STATEFUL_SETS_PER_NAMESPACE}} + - basename: pv-small-statefulset-{{$ssIndex}} + objectTemplatePath: pvc.yaml + listUnknownObjectOptions: + labelSelector: + matchLabels: + name: small-statefulset-{{$ssIndex}} + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{range $ssIndex := Seq $MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + - basename: pv-medium-statefulset-{{$ssIndex}} + objectTemplatePath: pvc.yaml + listUnknownObjectOptions: + labelSelector: + matchLabels: + name: medium-statefulset-{{$ssIndex}} + {{end}} + {{end}} + +- name: Waiting for pods to be deleted + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if and $ENABLE_STATEFULSETS $ENABLE_PVS}} + - Identifier: WaitForPVCsToBeDeleted + Method: WaitForBoundPVCs + Params: + desiredPVCCount: 0 + labelSelector: group = load + timeout: 15m + {{end}} + +{{if $ENABLE_DAEMONSETS}} +- name: Deleting PriorityClass for DaemonSets + phases: + - replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: daemonset-priorityclass + objectTemplatePath: daemonset-priorityclass.yaml +{{end}} + +- name: Deleting SVCs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: big-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: medium-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: small-service + objectTemplatePath: service.yaml + +- name: Collecting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + {{if $ENABLE_PROMETHEUS_API_RESPONSIVENESS}} + enableViolations: true + {{end}} + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + 
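# NOTE: when USE_SIMPLE_LATENCY_QUERY is false, the full APIResponsivenessPrometheus summary is gathered below in addition to the simple summary above. +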
{{if not $USE_SIMPLE_LATENCY_QUERY}} + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + {{end}} + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: gather + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: gather + {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}} + - Identifier: NetworkProgrammingLatency + Method: NetworkProgrammingLatency + Params: + action: gather + {{end}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} diff --git a/tests/perf/tests/load/configmap.yaml b/tests/perf/tests/load/configmap.yaml new file mode 100644 index 0000000000..b249a39143 --- /dev/null +++ b/tests/perf/tests/load/configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{.Name}} +data: + data.yaml: |- + a: 1 + b: 2 + c: 3 diff --git a/tests/perf/tests/load/daemonset-priorityclass.yaml b/tests/perf/tests/load/daemonset-priorityclass.yaml new file mode 100644 index 0000000000..e264a740d5 --- /dev/null +++ b/tests/perf/tests/load/daemonset-priorityclass.yaml @@ -0,0 +1,9 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: {{.Name}} +value: 1000000 +globalDefault: false +description: "Designated priority class to be used for DaemonSet pods. This is + to make sure they have higher priority than other test pods and there is always + a place for them on each node, see kubernetes/kubernetes#82818." diff --git a/tests/perf/tests/load/daemonset.yaml b/tests/perf/tests/load/daemonset.yaml new file mode 100644 index 0000000000..68acfefaec --- /dev/null +++ b/tests/perf/tests/load/daemonset.yaml @@ -0,0 +1,41 @@ +{{$Image := DefaultParam .Image "k8s.gcr.io/pause:3.1"}} + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{.Name}} + labels: + group: load +spec: + updateStrategy: + rollingUpdate: + maxUnavailable: {{MaxInt 10 (DivideInt .Nodes 20)}} # 5% of nodes, but not less than 10 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: {{$Image}} + resources: + requests: + cpu: 10m + memory: "10M" + priorityClassName: daemonset-priorityclass-0 # Name is autogenerated, hence the -0 suffix. + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion.
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/tests/perf/tests/load/deployment.yaml b/tests/perf/tests/load/deployment.yaml new file mode 100644 index 0000000000..8a2f3a798b --- /dev/null +++ b/tests/perf/tests/load/deployment.yaml @@ -0,0 +1,63 @@ +{{$EnableConfigMaps := DefaultParam .ENABLE_CONFIGMAPS false}} +{{$EnableSecrets := DefaultParam .ENABLE_SECRETS false}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: load + svc: {{.SvcName}}-{{.Index}} +spec: + replicas: {{RandIntRange .ReplicasMin .ReplicasMax}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + svc: {{.SvcName}}-{{.Index}} + spec: + containers: + - image: k8s.gcr.io/pause:3.1 + name: {{.Name}} + resources: + requests: + cpu: 10m + memory: "10M" + volumeMounts: + {{if and $EnableConfigMaps (eq (Mod .Index 20) 0 19) }} # .Index % 20 in {0,19} - 10% deployments will have ConfigMap + - name: configmap + mountPath: /var/configmap + {{end}} + {{if and $EnableSecrets (eq (Mod .Index 20) 10 19) }} # .Index % 20 in {10,19} - 10% deployments will have Secret + - name: secret + mountPath: /var/secret + {{end}} + dnsPolicy: Default + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + volumes: + {{if and $EnableConfigMaps (eq (Mod .Index 20) 0 19) }} # .Index % 20 in {0,19} - 10% deployments will have ConfigMap + - name: configmap + configMap: + name: {{.BaseName}}-{{.Index}} + {{end}} + {{if and $EnableSecrets (eq (Mod .Index 20) 10 19) }} # .Index % 20 in {10,19} - 10% deployments will have Secret + - name: secret + secret: + secretName: {{.BaseName}}-{{.Index}} + {{end}} + diff --git a/tests/perf/tests/load/job.yaml b/tests/perf/tests/load/job.yaml new file mode 100644 index 0000000000..f28e1b3ee2 --- /dev/null +++ b/tests/perf/tests/load/job.yaml @@ -0,0 +1,39 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: load +spec: + manualSelector: true + parallelism: {{RandIntRange .ReplicasMin .ReplicasMax}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + # TODO(#799): We should test the "run-to-completion" workflow and hence don't use pause pods. + image: k8s.gcr.io/pause:3.1 + resources: + requests: + cpu: 10m + memory: "10M" + restartPolicy: Never + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/tests/perf/tests/load/networkpolicy.yaml b/tests/perf/tests/load/networkpolicy.yaml new file mode 100644 index 0000000000..1aae9b23c0 --- /dev/null +++ b/tests/perf/tests/load/networkpolicy.yaml @@ -0,0 +1,19 @@ +{{if eq (Mod .Index 10) 0}} # Create for only 10% of deployments +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{.Name}} +spec: + podSelector: + matchLabels: + name: {{.BaseName}}-{{.Index}} + policyTypes: + - Egress + egress: + - to: + - ipBlock: + cidr: 10.0.0.0/24 + ports: + - protocol: TCP + port: 8080 +{{end}} diff --git a/tests/perf/tests/load/pvc.yaml b/tests/perf/tests/load/pvc.yaml new file mode 100644 index 0000000000..d19d23053e --- /dev/null +++ b/tests/perf/tests/load/pvc.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{.Name}} diff --git a/tests/perf/tests/load/secret.yaml b/tests/perf/tests/load/secret.yaml new file mode 100644 index 0000000000..67134b355f --- /dev/null +++ b/tests/perf/tests/load/secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{.Name}} +type: Opaque +data: + password: c2NhbGFiaWxpdHkK diff --git a/tests/perf/tests/load/service.yaml b/tests/perf/tests/load/service.yaml new file mode 100644 index 0000000000..ed6a22c8cf --- /dev/null +++ b/tests/perf/tests/load/service.yaml @@ -0,0 +1,16 @@ +{{$SetServiceProxyLabel := DefaultParam .SetServiceProxyLabel false}} + +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} +{{if and $SetServiceProxyLabel (eq (Mod .Index 2) 0)}} + labels: + service.kubernetes.io/service-proxy-name: foo +{{end}} +spec: + selector: + svc: {{.Name}} + ports: + - port: 80 + targetPort: 80 diff --git a/tests/perf/tests/load/statefulset.yaml b/tests/perf/tests/load/statefulset.yaml new file mode 100644 index 0000000000..43157b7928 --- /dev/null +++ b/tests/perf/tests/load/statefulset.yaml @@ -0,0 +1,61 @@ +{{$EnablePVs := DefaultParam .ENABLE_PVS false}} + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{.Name}} + labels: + group: load +spec: + podManagementPolicy: Parallel + selector: + matchLabels: + group: load + name: {{.Name}} + serviceName: {{.Name}} + replicas: {{RandIntRange .ReplicasMin .ReplicasMax}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: k8s.gcr.io/pause:3.1 + ports: + - containerPort: 80 + name: web + resources: + requests: + cpu: 10m + memory: "10M" + {{if $EnablePVs}} + volumeMounts: + - name: pv + mountPath: /var/pv + {{end}} + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $EnablePVs}} + # NOTE: PVs created this way should be cleaned-up manually, as deleting the StatefulSet doesn't automatically delete PVs. + # To avoid deleting all the PVs at once during namespace deletion, they should be deleted explicitly via Phase. 
+ volumeClaimTemplates: + - metadata: + name: pv + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 100Mi + {{end}} diff --git a/tests/perf/tests/load/statefulset_service.yaml b/tests/perf/tests/load/statefulset_service.yaml new file mode 100644 index 0000000000..5e16a47a19 --- /dev/null +++ b/tests/perf/tests/load/statefulset_service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + name: {{.Name}} +spec: + clusterIP: None + selector: + name: {{.Name}}
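+ # NOTE: clusterIP: None makes this a headless Service; it acts as the governing service for the matching StatefulSet, giving each pod a stable network identity.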