Skip to content
This repository has been archived by the owner on Jun 29, 2022. It is now read-only.

Commit

Permalink
Merge pull request #1502 from kinvolk/kai/bare-metal-reprovisioning
Browse files Browse the repository at this point in the history
baremetal: integrate automated (re-)provisioning logic
  • Loading branch information
pothos authored Jun 22, 2021
2 parents 0876c88 + bbb13a6 commit b0da2a1
Show file tree
Hide file tree
Showing 18 changed files with 324 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,22 @@ module "controller" {
set_standard_hostname = false
clc_snippets = concat(lookup(var.clc_snippets, var.controller_names[count.index], []), [
<<EOF
filesystems:
- name: root
mount:
device: /dev/disk/by-label/ROOT
format: ext4
wipe_filesystem: true
label: ROOT
storage:
files:
- path: /ignition_ran
filesystem: root
mode: 0644
contents:
inline: |
Flag file indicating that Ignition ran.
Should be deleted by the SSH step that checks it.
- path: /etc/hostname
filesystem: root
mode: 0644
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
# One matchbox-flatcar profile per controller node. The name, MAC and domain
# lists are indexed in lockstep via count.index.
module "controller_profile" {
  source = "../../../matchbox-flatcar"
  count  = length(var.controller_names)

  # Node identity
  node_name   = var.controller_names[count.index]
  node_mac    = var.controller_macs[count.index]
  node_domain = var.controller_domains[count.index]

  # OS image selection and installation
  os_channel               = var.os_channel
  os_version               = var.os_version
  download_protocol        = var.download_protocol
  http_endpoint            = var.matchbox_http_endpoint
  cached_install           = var.cached_install
  install_disk             = var.install_disk
  install_to_smallest_disk = var.install_to_smallest_disk
  wipe_additional_disks    = var.wipe_additional_disks
  container_linux_oem      = var.container_linux_oem
  install_pre_reboot_cmds  = var.install_pre_reboot_cmds

  # Kernel command line
  kernel_args    = var.kernel_args
  kernel_console = var.kernel_console

  # Node configuration; installer snippets are looked up by node name and
  # default to an empty list when the node has no entry.
  ssh_keys               = var.ssh_keys
  ignition_clc_config    = module.controller[count.index].clc_config
  installer_clc_snippets = lookup(var.installer_clc_snippets, var.controller_names[count.index], [])

  # (Re-)provisioning. With ignore_changes set, an already provisioned
  # controller keeps its old config; removing its MAC address flag file from
  # the asset directory still forces a PXE install.
  asset_dir      = var.asset_dir
  pxe_commands   = var.pxe_commands
  ignore_changes = true
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,12 @@ resource "null_resource" "copy-controller-secrets" {
]
}


# Triggered when the Ignition Config changes (used to recreate a controller)
triggers = {
clc_config = module.controller[count.index].clc_config
kernel_console = join(" ", var.kernel_console)
kernel_args = join(" ", var.kernel_args)
etcd_ca_cert = module.bootkube.etcd_ca_cert
etcd_server_cert = module.bootkube.etcd_server_cert
etcd_peer_cert = module.bootkube.etcd_peer_cert
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ variable "clc_snippets" {
default = {}
}

# Per-machine CLC snippets for the installer OS; machines without an entry in
# the map get no extra snippets.
variable "installer_clc_snippets" {
  description = "Map from machine names to lists of Container Linux Config snippets, applied for the PXE-booted installer OS"
  type        = map(list(string))
  default     = {}
}

variable "labels" {
type = map(string)
description = "Map of labels for worker nodes."
Expand Down Expand Up @@ -221,3 +227,15 @@ variable "wipe_additional_disks" {
description = "Wipes any additional disks attached, if set to true"
default = false
}

# Shell snippet passed through to the node profile modules for forcing a PXE
# boot. The default only prints a hint and exits with an error, so automated
# (re)provisioning must be configured explicitly.
variable "pxe_commands" {
  description = "shell commands to execute for PXE (re)provisioning, with access to the variables $mac (the MAC address), $name (the node name), and $domain (the domain name), e.g., 'bmc=bmc-$domain; ipmitool -H $bmc power off; ipmitool -H $bmc chassis bootdev pxe; ipmitool -H $bmc power on'"
  type        = string
  default     = "echo 'you must (re)provision the node by booting via iPXE from http://MATCHBOX/boot.ipxe'; exit 1"
}

# Shell snippet passed through to the node profile modules; the default
# "true" is a no-op command.
variable "install_pre_reboot_cmds" {
  description = "shell commands to execute on the provisioned host after installation finished and before reboot, e.g., docker run --privileged --net host --rm debian sh -c 'apt update && apt install -y ipmitool && ipmitool chassis bootdev disk options=persistent'"
  type        = string
  default     = "true"
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,22 @@ module "worker" {
set_standard_hostname = false
clc_snippets = concat(lookup(var.clc_snippets, var.worker_names[count.index], []), [
<<EOF
filesystems:
- name: root
mount:
device: /dev/disk/by-label/ROOT
format: ext4
wipe_filesystem: true
label: ROOT
storage:
files:
- path: /ignition_ran
filesystem: root
mode: 0644
contents:
inline: |
Flag file indicating that Ignition ran.
Should be deleted by the SSH step that checks it.
- path: /etc/hostname
filesystem: root
mode: 0644
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
# One matchbox-flatcar profile per worker node. The name, MAC and domain
# lists are indexed in lockstep via count.index.
module "worker_profile" {
  source = "../../../matchbox-flatcar"
  count  = length(var.worker_names)

  # Node identity
  node_name   = var.worker_names[count.index]
  node_mac    = var.worker_macs[count.index]
  node_domain = var.worker_domains[count.index]

  # OS image selection and installation
  os_channel               = var.os_channel
  os_version               = var.os_version
  download_protocol        = var.download_protocol
  http_endpoint            = var.matchbox_http_endpoint
  cached_install           = var.cached_install
  install_disk             = var.install_disk
  install_to_smallest_disk = var.install_to_smallest_disk
  wipe_additional_disks    = var.wipe_additional_disks
  container_linux_oem      = var.container_linux_oem
  install_pre_reboot_cmds  = var.install_pre_reboot_cmds

  # Kernel command line
  kernel_args    = var.kernel_args
  kernel_console = var.kernel_console

  # Node configuration; installer snippets are looked up by node name and
  # default to an empty list when the node has no entry.
  ssh_keys               = var.ssh_keys
  ignition_clc_config    = module.worker[count.index].clc_config
  installer_clc_snippets = lookup(var.installer_clc_snippets, var.worker_names[count.index], [])

  # (Re-)provisioning
  asset_dir    = var.asset_dir
  pxe_commands = var.pxe_commands
}
24 changes: 22 additions & 2 deletions assets/terraform-modules/matchbox-flatcar/profiles.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ resource "matchbox_profile" "flatcar-install" {
var.kernel_args,
])

container_linux_config = templatefile("${path.module}/templates/install.yaml.tmpl", {
raw_ignition = data.ct_config.install-ignitions.rendered
}

data "ct_config" "install-ignitions" {
content = templatefile("${path.module}/templates/install.yaml.tmpl", {
os_channel = var.os_channel
os_version = var.os_version
ignition_endpoint = format("%s/ignition", var.http_endpoint)
Expand All @@ -29,9 +33,15 @@ resource "matchbox_profile" "flatcar-install" {
kernel_console = join(" ", var.kernel_console)
kernel_args = join(" ", var.kernel_args)
wipe_additional_disks = var.wipe_additional_disks
install_pre_reboot_cmds = var.install_pre_reboot_cmds
# only cached-container-linux profile adds -b baseurl
baseurl_flag = ""
mac_address = var.node_mac
})

pretty_print = false

snippets = var.installer_clc_snippets
}

// Flatcar Container Linux Install profile (from matchbox /assets cache)
Expand All @@ -56,7 +66,11 @@ resource "matchbox_profile" "cached-flatcar-linux-install" {
var.kernel_args,
])

container_linux_config = templatefile("${path.module}/templates/install.yaml.tmpl", {
raw_ignition = data.ct_config.cached-install-ignitions.rendered
}

data "ct_config" "cached-install-ignitions" {
content = templatefile("${path.module}/templates/install.yaml.tmpl", {
os_channel = var.os_channel
os_version = var.os_version
ignition_endpoint = format("%s/ignition", var.http_endpoint)
Expand All @@ -67,9 +81,15 @@ resource "matchbox_profile" "cached-flatcar-linux-install" {
kernel_console = join(" ", var.kernel_console)
kernel_args = join(" ", var.kernel_args)
wipe_additional_disks = var.wipe_additional_disks
install_pre_reboot_cmds = var.install_pre_reboot_cmds
# profile uses -b baseurl to install from matchbox cache
baseurl_flag = "-b ${var.http_endpoint}/assets/flatcar"
mac_address = var.node_mac
})

pretty_print = false

snippets = var.installer_clc_snippets
}

resource "matchbox_profile" "node" {
Expand Down
87 changes: 87 additions & 0 deletions assets/terraform-modules/matchbox-flatcar/pxe-helper.sh.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# (executed in-line, #!/... would be ignored)
#
# (Re-)provisions a single bare-metal node:
#  * the state file "$asset_dir/$mac" containing the node's domain marks a node
#    as already provisioned;
#  * a provisioned node is reprovisioned in place over SSH (first_boot flag
#    file, OEM grub.cfg rewrite, reboot) unless ignore_changes is set;
#  * any other node is PXE-installed through the user-supplied pxe_commands;
#  * finally the script waits for the /ignition_ran flag file on the node,
#    removes it, and records the node in the state file.
#
# Terraform template variable substitution. The values are double-quoted so
# that whitespace in them (e.g., in the asset directory path) does not split
# the assignment into several shell words:
name="${name}"
domain="${domain}"
mac="${mac}"
asset_dir="${asset_dir}"
ignore_changes=${ignore_changes}
kernel_args="${kernel_args}"
kernel_console="${kernel_console}"
ignition_endpoint="${ignition_endpoint}"
# From now on use $var for dynamic shell substitution

# ssh_retry ATTEMPTS COMMAND...
# Runs COMMAND as user core on $domain via SSH, sleeping one second after each
# failed attempt. Returns 0 as soon as one attempt succeeds and 1 once ATTEMPTS
# attempts have failed. Only pass commands that can be repeated without a side
# effect, because a disrupted connection may report a failure for a command
# that actually ran.
ssh_retry() {
  attempts=$1
  shift
  while [ "$attempts" -gt 0 ]; do
    if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o NumberOfPasswordPrompts=0 "core@$domain" "$@"; then
      return 0
    fi
    sleep 1
    attempts=$((attempts - 1))
  done
  return 1
}

if test -f "$asset_dir/$mac" && [ "$(cat "$asset_dir/$mac")" = "$domain" ]; then
  echo "found $asset_dir/$mac containing $domain, skipping PXE install"
  node_exists=yes
else
  echo "$asset_dir/$mac does not contain $domain, forcing PXE install"
  node_exists=no
fi

if [ "$node_exists" = yes ]; then
  # ignore_changes is rendered as the literal command "true" or "false"
  if $ignore_changes ; then
    echo "Keeping old config because 'ignore_changes' is set."
    exit 0
  fi
  # run single commands that can be retried without a side effect in case the connection got disrupted
  if ! ssh_retry 30 sudo touch /boot/flatcar/first_boot; then
    echo "error reaching $domain via SSH, please remove the $asset_dir/$mac file to force a PXE install"
    exit 1
  fi
  echo "created the first_boot flag file to reprovision $domain"
  # overwrite the OEM grub.cfg so that the next boot uses the current kernel arguments and Ignition URL
  if ! ssh_retry 5 "printf 'set linux_append=\"$kernel_args ignition.config.url=$ignition_endpoint?mac=$mac&os=installed\"\\nset linux_console=\"$kernel_console\"\\n' | sudo tee /usr/share/oem/grub.cfg"; then
    echo "error reaching $domain via SSH, please retry"
    exit 1
  fi
  if ! ssh_retry 5 sudo systemctl reboot; then
    echo "error reaching $domain via SSH, please reboot manually"
    exit 1
  fi
  echo "rebooted the $domain"
else
  # the user may provide ipmitool commands or any other logic for forcing a PXE boot
  ${pxe_commands}
fi

echo "checking that $domain comes up"
# check that we can reach the node and that it has the flag file which we remove here, indicating a reboot happened which prevents a race when issuing the reboot takes longer (both the systemctl reboot and PXE case)
# Just in case the connection breaks and SSH may report an error code but still execute successfully, we will first check file existence and then delete with "rm -f" to be able to rerun both commands.
# This sequence gives us the same error reporting as just running "rm" once.
if ! ssh_retry 600 test -f /ignition_ran; then
  echo "error: failed verifying with SSH if $domain came up by checking the /ignition_ran flag file"
  exit 1
fi
if ! ssh_retry 5 sudo rm -f /ignition_ran; then
  echo "error: failed to remove the /ignition_ran flag file on $domain"
  exit 1
fi
echo "$domain came up again"
# only write the state file once the system is up, this allows to rerun lokoctl if the first PXE boot did not work and it will try again
echo "$domain" > "$asset_dir/$mac"
14 changes: 14 additions & 0 deletions assets/terraform-modules/matchbox-flatcar/ssh.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Runs the PXE helper script for this node: on first provisioning, and again
# whenever one of the triggers below changes.
resource "null_resource" "reprovision-node-when-ignition-changes" {
  # Wait for the new Ignition config object to be ready before rebooting
  depends_on = [matchbox_group.node]

  # Re-run when the Ignition config or the kernel command line changes
  triggers = {
    ignition_config = matchbox_profile.node.raw_ignition
    kernel_args     = join(" ", var.kernel_args)
    kernel_console  = join(" ", var.kernel_console)
  }

  # Trigger running Ignition on the next reboot (first_boot flag file) and
  # reboot the instance, or, if the instance needs to be (re)provisioned, run
  # external commands for PXE booting (also runs on the first provisioning)
  provisioner "local-exec" {
    command = templatefile("${path.module}/pxe-helper.sh.tmpl", {
      name              = var.node_name
      domain            = var.node_domain
      mac               = var.node_mac
      asset_dir         = var.asset_dir
      ignore_changes    = var.ignore_changes
      pxe_commands      = var.pxe_commands
      kernel_args       = join(" ", var.kernel_args)
      kernel_console    = join(" ", var.kernel_console)
      ignition_endpoint = format("%s/ignition", var.http_endpoint)
    })
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ storage:
wipefs -f -a "$${disk}" || echo "error: failed to wipe $${disk}"
done
%{~ endif ~}
curl --retry 10 "${ignition_endpoint}?{{.request.raw_query}}&os=installed" -o ignition.json
flatcar-install \
%{~ if install_to_smallest_disk ~}
-s \
Expand All @@ -59,16 +58,16 @@ storage:
-C ${os_channel} \
-V ${os_version} \
-o "${container_linux_oem}" \
${baseurl_flag} \
-i ignition.json
${baseurl_flag}
udevadm settle
OEM_DEV="$(blkid -t "LABEL=OEM" -o device)"
mkdir -p /tmp/oemfs
mount "$${OEM_DEV}" /tmp/oemfs
# append to file on newly created partition, do not remove the defaults
echo 'set linux_append="${kernel_args}"' >> /tmp/oemfs/grub.cfg
echo 'set linux_append="${kernel_args} ignition.config.url=${ignition_endpoint}?mac=${mac_address}&os=installed"' >> /tmp/oemfs/grub.cfg
echo 'set linux_console="${kernel_console}"' >> /tmp/oemfs/grub.cfg
umount /tmp/oemfs
${install_pre_reboot_cmds}
systemctl reboot
passwd:
users:
Expand Down
34 changes: 34 additions & 0 deletions assets/terraform-modules/matchbox-flatcar/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ variable "ignition_clc_config" {
description = "Ignition CLC snippets to include in the configuration."
}

# Extra CLC snippets for the installer profiles. These are separate from
# ignition_clc_config, which configures the installed node.
variable "installer_clc_snippets" {
  type        = list(string)
  description = "List of Container Linux Config snippets applied for the PXE-booted installer OS."
  default     = []
}

variable "node_name" {
type = string
description = "Name of the node/machine."
Expand All @@ -82,3 +88,31 @@ variable "wipe_additional_disks" {
description = "Wipes any additional disks attached, if set to true"
default = false
}

# Rendered into the PXE helper script, where it short-circuits the in-place
# reprovisioning of a node that is already recorded as provisioned.
variable "ignore_changes" {
  type        = bool
  description = "When set to true, ignores the reprovisioning of the node (unless the MAC address flag file is removed to force a PXE install)."
  default     = false
}

# Required (no default). The PXE helper script keeps its per-node state file,
# named after the node's MAC address, in this directory.
variable "asset_dir" {
  type        = string
  description = "Path to a directory where generated assets should be placed (contains secrets)"
}

# Required (no default). The PXE helper script uses this as the SSH target
# host and as the content of the node's state file.
variable "node_domain" {
  description = "Node FQDN (e.g node1.example.com)."
  type        = string
}

# Inserted verbatim into the PXE helper script and executed when the node is
# not yet recorded as provisioned. The default only prints a hint and exits
# with an error.
variable "pxe_commands" {
  description = "shell commands to execute for PXE (re)provisioning, with access to the variables $mac (the MAC address), $name (the node name), and $domain (the domain name), e.g., 'bmc=bmc-$domain; ipmitool -H $bmc power off; ipmitool -H $bmc chassis bootdev pxe; ipmitool -H $bmc power on'."
  type        = string
  default     = "echo 'you must (re)provision the node by booting via iPXE from http://MATCHBOX/boot.ipxe'; exit 1"
}

# Passed to the install template of both installer profiles; the default
# "true" is a no-op command.
variable "install_pre_reboot_cmds" {
  description = "shell commands to execute on the provisioned host after installation finished and before reboot, e.g., docker run --privileged --net host --rm debian sh -c 'apt update && apt install -y ipmitool && ipmitool chassis bootdev disk options=persistent'."
  type        = string
  default     = "true"
}
4 changes: 4 additions & 0 deletions assets/terraform-modules/matchbox-flatcar/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ terraform {
required_version = ">= 0.13"

required_providers {
ct = {
source = "poseidon/ct"
version = "0.8.0"
}
template = {
source = "hashicorp/template"
version = "2.2.0"
Expand Down
1 change: 1 addition & 0 deletions ci/baremetal/baremetal-cluster.lokocfg.envsubst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ cluster "bare-metal" {
"node2",
"node3",
]
pxe_commands = "true" # The VMs are booted up outside of the CI Docker image at the right time already and we will not reprovision nor could do so because the VMs are managed at another level
# Adds oidc flags to API server with default values.
# Acts as a smoke test to check if API server is functional after addition
# of extra flags.
Expand Down
Loading

0 comments on commit b0da2a1

Please sign in to comment.