Add ansible

Tyler Perkins 2024-11-02 19:11:50 -04:00
parent 40957db6aa
commit 2917781c4b
Signed by: tyler
GPG Key ID: 03B27509E17EFDC8
18 changed files with 659 additions and 0 deletions

9
ansible/README.md Normal file

@@ -0,0 +1,9 @@
# Gluttony Cluster Ansible
Ansible playbooks for my cluster.
## Onboarding a server
1) Add the new host to `inventory.yaml`.
2) Run the `debian-systemd-bootstrap` playbook against it, as in the example below.
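
For example, assuming the playbooks sit in a `playbooks/` directory next to `inventory.yaml` (adjust the path and filename to the actual repo layout):

```bash
# Bootstrap only the new host, prompting for the vault password
ansible-playbook -i inventory.yaml playbooks/debian-systemd-bootstrap.yaml \
  --limit new-host --ask-vault-pass
```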

9
ansible/artifacts/ntfy-ssh-login.sh Normal file

@@ -0,0 +1,9 @@
#!/bin/bash
# Called by pam_exec.so from /etc/pam.d/sshd; notifies the ntfy topic on each SSH login.
if [ "${PAM_TYPE}" = "open_session" ]; then
    curl \
        -H prio:high \
        -H tags:warning \
        -u ssh:AXxccvd9um \
        -d "SSH Login ($(cat /etc/hostname)): ${PAM_USER} from ${PAM_RHOST}" \
        https://ntfy.clortox.com/ssh
fi

63
ansible/inventory.yaml Normal file

@@ -0,0 +1,63 @@
###############################################################################
# Hosts #######################################################################
###############################################################################
network:
  hosts:
    bastion:
      ansible_host: 10.0.3.64

gluttony:
  hosts:
    gluttony:
      ansible_host: 10.0.3.3
      ansible_become_password: "{{ vault_gluttony_become_password }}"
      longhorn_drive_uuid: "77703127-E9E2-EB44-A9B8-C61C3CAD6174"

gluttony_gpu:
  hosts:
    gluttony_gpu:
      ansible_host: 10.0.3.2
      ansible_become_password: "{{ vault_gluttony_gpu_become_password }}"
      longhorn_drive_uuid: "622C7470-1A12-3A46-B506-9DBFDCCA07A0"

gluttony_m3:
  hosts:
    gluttony_m3:
      ansible_host: 10.0.3.4
      ansible_become_password: "{{ vault_gluttony_m3_become_password }}"
      longhorn_drive_uuid: "F5122704-F411-F046-AE8F-F583B3A2D1A3"

gluttony_nas:
  hosts:
    gluttony_nas:
      ansible_host: 10.0.3.5

###############################################################################
# Groups ######################################################################
###############################################################################
k3s_masters:
  children:
    gluttony:
    gluttony_gpu:
    gluttony_m3:

graphics_devices:
  children:
    gluttony:
    gluttony_gpu:

storage_devices:
  children:
    gluttony:

datacenter:
  children:
    k3s_masters:
    network:

debian_hosts:
  children:
    k3s_masters:
    network:
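
A quick sanity check that the inventory parses and the group nesting comes out as intended:

```bash
ansible-inventory -i inventory.yaml --graph
```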

@@ -0,0 +1,100 @@
- name: Bootstrap debian-nvidia-cuda
  hosts: graphics_devices
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Update APT package index
      apt:
        update_cache: yes
    - name: Install prerequisites
      apt:
        name: "{{ packages }}"
      vars:
        packages:
          - build-essential
          - dkms
          - curl
          - gnupg2
          - ca-certificates
          - software-properties-common
    - name: Add NVIDIA PPA repository
      apt_repository:
        repo: "ppa:graphics-drivers/ppa"
        state: present
    - name: Add NVIDIA container runtime GPG key
      apt_key:
        url: https://nvidia.github.io/nvidia-docker/gpgkey
        state: present
    - name: Add NVIDIA container runtime repository
      apt_repository:
        repo: "deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/$(ARCH) /"
        filename: "nvidia-container-runtime"
        state: present
    - name: Update APT package index after adding PPA
      apt:
        update_cache: yes
    - name: Install the latest NVIDIA driver
      apt:
        name: "{{ nvidia_driver }}"
        state: present
      vars:
        nvidia_driver: "nvidia-driver-535"  # Replace with the latest driver version if needed
    - name: Install CUDA toolkit (optional)
      apt:
        name: "{{ cuda_packages }}"
        state: present
      vars:
        cuda_packages:
          - nvidia-cuda-toolkit
    - name: Install NVIDIA container runtime
      apt:
        name: "{{ nvidia_container_packages }}"
        state: present
      vars:
        nvidia_container_packages:
          - nvidia-container-toolkit
          - nvidia-container-runtime
    - name: Configure Docker to use the NVIDIA runtime
      lineinfile:
        path: /etc/docker/daemon.json
        create: yes
        line: '"default-runtime": "nvidia"'
        insertafter: '"runtimes": {'
        state: present
      notify: Restart Docker
    - name: Reboot the server to apply changes
      reboot:
        msg: "Rebooting to apply NVIDIA driver installation"
    - name: Verify NVIDIA driver installation
      shell: nvidia-smi
      register: nvidia_smi_output
      ignore_errors: yes
    - name: Display NVIDIA driver installation result
      debug:
        var: nvidia_smi_output.stdout
    - name: Fail if NVIDIA driver is not installed correctly
      fail:
        msg: "NVIDIA driver installation failed. Please check the output."
      when: "'NVIDIA-SMI' not in nvidia_smi_output.stdout"
  handlers:
    - name: Restart Docker
      service:
        name: docker
        state: restarted
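
After the reboot, one way to smoke-test the container runtime, assuming Docker is installed on the host (the CUDA image tag is only an example; pick one compatible with the installed driver):

```bash
docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
```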

@@ -0,0 +1,63 @@
- name: Bootstrap k3s cluster
  hosts: k3s_masters
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
    - ../secrets/k3s-token.yaml
    - ../secrets/gitea-token.yaml
    # Generic configurations
    - ../vars/gitea-gluttony-cluster.yaml
    - ../vars/k3s.yaml
  tasks:
    - name: Install dependencies
      apt:
        name:
          - curl
          - apt-transport-https
        state: present
        update_cache: yes
    - name: Check if k3s is already installed
      command: which k3s
      register: k3s_installed
      ignore_errors: yes
      changed_when: false
    - name: Install k3s master on the bootstrap node
      shell: |
        curl -sfL https://get.k3s.io | K3S_TOKEN={{ vault_k3s_token | quote }} INSTALL_K3S_VERSION={{ k3s_version }} sh -s - server --cluster-init --disable traefik --disable servicelb
      when: inventory_hostname == primary_node_name and k3s_installed.rc != 0
      register: k3s_install_bootstrap
      ignore_errors: yes
    - name: Install k3s on subsequent masters
      shell: |
        curl -sfL https://get.k3s.io | K3S_TOKEN={{ vault_k3s_token | quote }} INSTALL_K3S_VERSION={{ k3s_version }} sh -s - server --disable servicelb --server https://{{ hostvars[primary_node_name].ansible_host }}:6443 --token {{ vault_k3s_token | quote }}
      when: inventory_hostname != primary_node_name and k3s_installed.rc != 0
      ignore_errors: yes
    - name: Ensure k3s service is running and enabled
      service:
        name: k3s
        state: started
        enabled: yes
      when: k3s_installed.rc == 0
    - name: Check if Flux is installed
      command: which flux
      register: flux_installed
      ignore_errors: yes
      changed_when: false
    - name: Install Flux CLI
      shell: |
        curl -s https://fluxcd.io/install.sh | sudo bash
      when: flux_installed.rc != 0
      ignore_errors: yes
    - name: Bootstrap flux
      shell: |
        GITEA_TOKEN={{ vault_gitea_token | quote }} flux bootstrap gitea --owner={{ gitea_owner }} --repository={{ gitea_repository }} --hostname={{ gitea_hostname }} --personal=false --private=false --kubeconfig=/etc/rancher/k3s/k3s.yaml
      when: inventory_hostname == primary_node_name
      ignore_errors: no
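
Once the play finishes, the cluster and the Flux installation can be verified from the primary node, for example:

```bash
sudo k3s kubectl get nodes -o wide
flux check --kubeconfig /etc/rancher/k3s/k3s.yaml
```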

@@ -0,0 +1,68 @@
- name: Install kubeseal
  hosts: k3s_masters
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Check if kubeseal is installed
      command: kubeseal --version
      register: kubeseal_installed
      ignore_errors: yes
    - name: Fetch the latest sealed-secrets version using GitHub API
      shell: >
        curl -s https://api.github.com/repos/bitnami-labs/sealed-secrets/tags | jq -r '.[0].name' | cut -c 2-
      register: kubeseal_version
      when: kubeseal_installed.failed
    - name: Check if the version was fetched successfully
      fail:
        msg: "Failed to fetch the latest KUBESEAL_VERSION"
      when:
        - kubeseal_installed.failed
        - kubeseal_version.stdout == ""
    - name: Set kubeseal version fact
      set_fact:
        kubeseal_version: "{{ kubeseal_version.stdout }}"
      when: kubeseal_installed.failed
    - name: Download kubeseal tarball
      get_url:
        url: "https://github.com/bitnami-labs/sealed-secrets/releases/download/v{{ kubeseal_version }}/kubeseal-{{ kubeseal_version }}-linux-amd64.tar.gz"
        dest: "/root/kubeseal-{{ kubeseal_version }}-linux-amd64.tar.gz"
        mode: '0644'
      when: kubeseal_installed.failed
    - name: Extract kubeseal binary
      unarchive:
        src: "/root/kubeseal-{{ kubeseal_version }}-linux-amd64.tar.gz"
        dest: "/root/"
        remote_src: yes
        creates: "/root/kubeseal"
      when: kubeseal_installed.failed
    - name: Chown kubeseal binary to root
      file:
        path: "/root/kubeseal"
        owner: root
        group: root
        mode: '0755'
      when: kubeseal_installed.failed
    - name: Install kubeseal
      copy:
        src: "/root/kubeseal"
        dest: "/usr/local/bin/kubeseal"
        remote_src: yes
        mode: '0755'
      become: true
      when: kubeseal_installed.failed
      notify: cleanup
  handlers:
    - name: cleanup
      file:
        path: "/root/kubeseal-{{ kubeseal_version }}-linux-amd64.tar.gz"
        state: absent
      when: kubeseal_installed.failed
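
With `kubeseal` on the masters, a Secret manifest can be sealed for the in-cluster controller (file names are placeholders; assumes a kubeconfig that can reach the sealed-secrets controller):

```bash
kubeseal --format yaml < my-secret.yaml > my-sealed-secret.yaml
```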

@@ -0,0 +1,172 @@
- name: Bootstrap longhorn drive
  hosts: k3s_masters
  become: yes
  vars_files:
    - ../secrets/gluttonycluster-credentials.yaml
    - ../vars/k3s.yaml
  vars:
    longhorn_mount_point: "/media/longhorn"
  tasks:
    - name: Install requirements
      apt:
        name:
          - nfs-common
          - open-iscsi
        state: present
        update_cache: yes
    - name: Ensure iscsi_tcp kernel module is loaded
      ansible.builtin.modprobe:
        name: iscsi_tcp
        state: present
    - name: Ensure iscsi_tcp module is loaded on boot
      ansible.builtin.copy:
        dest: /etc/modules-load.d/iscsi_tcp.conf
        content: |
          iscsi_tcp
        owner: root
        group: root
        mode: '0644'
    - name: Get the device corresponding to the drive UUID (case-insensitive)
      shell: |
        lsblk -dn -o NAME,PTUUID | awk '{if (NF == 2 && tolower($2) == tolower("{{ longhorn_drive_uuid }}")) print "/dev/" $1}'
      register: drive_device
      failed_when: drive_device.stdout == ""
      changed_when: false
    - name: Debug drive_device
      debug:
        msg: "Drive device is {{ drive_device.stdout }}"
    - name: Set drive device fact
      set_fact:
        drive_device: "{{ drive_device.stdout }}"
    - name: Get partition device name
      set_fact:
        partition_device: "{{ drive_device }}1"
    - name: Check if partition exists
      stat:
        path: "{{ partition_device }}"
      register: partition_stat
    - name: Create partition on the drive
      parted:
        device: "{{ drive_device }}"
        number: 1
        state: present
        align: optimal
        label: gpt
        unit: MiB
        part_start: 0%
        part_end: 100%
        name: longhorn
      when: partition_stat.stat.exists == false
    - name: Ensure partition table is re-read
      command: "partprobe {{ drive_device }}"
      when: partition_stat.stat.exists == false
    - name: Wait for partition to be available
      wait_for:
        path: "{{ partition_device }}"
        timeout: 10
      when: partition_stat.stat.exists == false
    - name: Check if filesystem exists on the partition
      command: "blkid -o value -s TYPE {{ partition_device }}"
      register: fs_type
      failed_when: false
      changed_when: false
    - name: Format the partition with ext4
      filesystem:
        fstype: ext4
        dev: "{{ partition_device }}"
      when: fs_type.stdout == ""
    - name: Get the UUID of the partition
      command: "blkid -o value -s UUID {{ partition_device }}"
      register: partition_uuid
    - name: Ensure the mount point directory exists
      file:
        path: "{{ longhorn_mount_point }}"
        state: directory
        mode: '0755'
    - name: Ensure the partition is mounted via fstab
      mount:
        path: "{{ longhorn_mount_point }}"
        src: "/dev/disk/by-uuid/{{ partition_uuid.stdout }}"
        fstype: ext4
        opts: defaults
        state: mounted
    - name: Check if the flag file exists
      stat:
        path: /etc/.longhorn_initialized
      register: flag_stat
    - name: Remove all contents from the mount point
      shell: "rm -rf {{ longhorn_mount_point }}/*"
      when: flag_stat.stat.exists == false
    - name: Create the flag file
      file:
        path: /etc/.longhorn_initialized
        state: touch
      when: flag_stat.stat.exists == false
    - name: Check if /etc/multipath.conf exists
      stat:
        path: /etc/multipath.conf
      register: multipath_conf_stat
    - name: Create /etc/multipath.conf with content if it does not exist
      copy:
        dest: /etc/multipath.conf
        content: |
          blacklist {
              devnode "^sd[a-z0-9]+"
          }
        owner: root
        group: root
        mode: '0644'
      when: not multipath_conf_stat.stat.exists
    - name: Check if blacklist block exists in /etc/multipath.conf
      shell: grep -q '^[[:space:]]*blacklist[[:space:]]*{' /etc/multipath.conf
      register: blacklist_block_check
      failed_when: false
      changed_when: false
      when: multipath_conf_stat.stat.exists
    - name: Append blacklist block to /etc/multipath.conf if it does not exist
      blockinfile:
        path: /etc/multipath.conf
        block: |
          blacklist {
              devnode "^sd[a-z0-9]+"
          }
        insertafter: EOF
        state: present
      when:
        - multipath_conf_stat.stat.exists
        - blacklist_block_check.rc != 0
    - name: Debug warning if blacklist block already exists
      debug:
        msg: "Warning: 'blacklist' block already exists in /etc/multipath.conf on this host."
      when:
        - multipath_conf_stat.stat.exists
        - blacklist_block_check.rc == 0
    - name: Restart the multipathd service
      service:
        name: multipathd
        state: restarted
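
After a run, the partition, filesystem, and mount can be confirmed with:

```bash
lsblk -o NAME,PTUUID,FSTYPE,MOUNTPOINT
findmnt /media/longhorn
```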

@@ -0,0 +1,39 @@
- name: Update PAM configuration and deploy script for SSH login notifications
  hosts: debian_hosts
  become: yes
  vars_files:
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Install preferred tooling
      apt:
        name:
          - vim
          - ncdu
          - smartmontools
          - jq
        state: present
        update_cache: yes
    - name: Check if /etc/pam.d/sshd exists
      ansible.builtin.stat:
        path: /etc/pam.d/sshd
      register: sshd_config
    - name: Append pam_exec.so configuration to /etc/pam.d/sshd
      ansible.builtin.lineinfile:
        path: /etc/pam.d/sshd
        line: 'session optional pam_exec.so /usr/bin/ntfy-ssh-login.sh'
        create: no
        state: present
      when: sshd_config.stat.exists
    - name: Check if ntfy-ssh-login.sh exists and has correct permissions
      ansible.builtin.stat:
        path: /usr/bin/ntfy-ssh-login.sh
      register: ntfy_script
    - name: Copy ntfy-ssh-login.sh script to /usr/bin/
      ansible.builtin.copy:
        src: ../artifacts/ntfy-ssh-login.sh
        dest: /usr/bin/ntfy-ssh-login.sh
        mode: '0755'
      when: not (ntfy_script.stat.exists and ntfy_script.stat.mode == '0755')
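
The notification path can be exercised without an SSH login by posting to the topic directly with the same credentials the script uses:

```bash
curl -u ssh:AXxccvd9um -H prio:high -d "test message" https://ntfy.clortox.com/ssh
```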

@@ -0,0 +1,32 @@
- name: Bootstrap zfs
  hosts: storage_devices
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Update apt package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600
    - name: Install ZFS utilities
      apt:
        name: zfsutils-linux
        state: present
        update_cache: yes
    - name: Ensure ZFS module is loaded
      modprobe:
        name: zfs
        state: present
    - name: Verify ZFS installation
      command: zfs version
      register: zfs_version
      ignore_errors: yes
    - name: Show ZFS version if installed
      debug:
        msg: "ZFS installed successfully. Version info: {{ zfs_version.stdout }}"
      when: zfs_version.rc == 0
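
The playbook only installs the tooling; pools are created by hand afterwards. A minimal sketch with a hypothetical disk:

```bash
# Hypothetical device name; prefer stable /dev/disk/by-id/ paths for real pools
zpool create tank /dev/sdb
zfs create tank/data
```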

@@ -0,0 +1,10 @@
- name: Alive ping test
  hosts: datacenter
  vars_files:
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Ping test
      ansible.builtin.ping:
    - name: Print message test
      ansible.builtin.debug:
        msg: Hello World!

@@ -0,0 +1,11 @@
- name: Reboot machines
  hosts: k3s_masters
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
    # Generic configurations
    - ../vars/k3s.yaml
  tasks:
    - name: Reboot the server
      ansible.builtin.reboot:

@@ -0,0 +1,29 @@
- name: Uninstall k3s cluster
  hosts: k3s_masters
  become: yes
  vars_files:
    # Secrets
    - ../secrets/gluttonycluster-credentials.yaml
    # Generic configurations
    - ../vars/gitea-gluttony-cluster.yaml
    - ../vars/k3s.yaml
  tasks:
    - name: Check if k3s is already installed
      command: which k3s
      register: k3s_installed
      ignore_errors: yes
      changed_when: false
    - name: Ensure k3s service is stopped
      service:
        name: k3s
        state: stopped
      when: k3s_installed.rc == 0
    - name: Uninstall k3s
      shell: |
        k3s-uninstall.sh
      when: k3s_installed.rc == 0
      register: k3s_uninstall
      ignore_errors: yes

@@ -0,0 +1,13 @@
- name: Update all machines in cluster
  hosts: debian_hosts
  become: yes
  become_method: sudo
  vars_files:
    - ../secrets/gluttonycluster-credentials.yaml
  tasks:
    - name: Update system
      become: yes
      become_user: root
      apt:
        update_cache: yes
        upgrade: dist

8
ansible/secrets/gitea-token.yaml Normal file

@@ -0,0 +1,8 @@
vault_gitea_token: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  37646661646437356434623633306633653937393234663462303433626233643230326334313830
  6232316233303836363263313962356465326430653561320a623964386566393832366331313537
  65666263376439643565383439623465616437353166353762386661316531633137353165363539
  3437633564376634650a613963323535653536303362386262643237356239396263363466643533
  32313361633365373734323137643862313134313864306632373730633930623837356636303862
  6233323463306436363763616466626161316234613034396336

21
ansible/secrets/gluttonycluster-credentials.yaml Normal file

@@ -0,0 +1,21 @@
vault_gluttony_become_password: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  33663333393361613039363734306165343466636336303264653939633339316433376233616434
  6539643732613639616666383666656132363061653761330a663336383732386466343033393037
  39303337373739623139376662316361633864663635366237643639333931336238383363383164
  6433376338313964630a373134373331633065393632313433373830643062613833663331396139
  39336538323135313164643133316131376431383131663434313437383637663535
vault_gluttony_gpu_become_password: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  34383832326531386339393133386339623561663839323163326562313066336632656137616330
  3565353735616136366232373837303564343237383531320a646239343735633565356530386163
  39663464333339613739633466613466376535303837656130613233643664653165616539363439
  6164643533633166350a366132386634326533376437656562373666383139323136363064636163
  6362
vault_gluttony_m3_become_password: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  39393239626130653636643538353939393666303435313966666138366264653033386631326161
  3338333931306237306334613262343832393565376264370a663137323665336466653730323734
  64353362636539373065663466393632396632663364353139363733303331633735623865626134
  3535633832363166610a333930376263366133303230316239353432363762373130653164333066
  61386538373930376564373865316264306639363936333038663533323161336436

7
ansible/secrets/k3s-token.yaml Normal file

@@ -0,0 +1,7 @@
vault_k3s_token: !vault |
  $ANSIBLE_VAULT;1.1;AES256
  31373039326161636337333466393566643261343930313761346162663962336561633736366366
  3930353939306633386636633639666262383063316535630a373866343739623731386462323936
  31373464313932653661333438363064666332366565333633303339336235636334616564306434
  3033396336623765370a663863323638303733356134303062633363313034613463383666653830
  38343430653333353438633031373533303866666236653134353137363934373464

3
ansible/vars/gitea-gluttony-cluster.yaml Normal file

@@ -0,0 +1,3 @@
gitea_owner: "Infrastructure"
gitea_repository: "Gluttony-Cluster"
gitea_hostname: "git.clortox.com"

2
ansible/vars/k3s.yaml Normal file

@@ -0,0 +1,2 @@
primary_node_name: "gluttony_gpu"
k3s_version: "v1.31.1+k3s1"