diff --git a/CHANGES.md b/CHANGES.md index 1bf511f6..05b15bbb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -32,11 +32,23 @@ version: 1.0.0 ### AWS -* Rename worker autoscaling group `${cluster_name}-worker` -* Rename launch configuration `${cluster_name}-worker` +* Rename worker autoscaling group `${cluster_name}-worker` ([#1202](https://github.com/poseidon/typhoon/pull/1202)) + * Name the launch configuration `${cluster_name}-worker` instead of a random id ### Google +* [Roll](https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups) instance template changes to worker managed instance groups ([#1207](https://github.com/poseidon/typhoon/pull/1207)) (**important**) + * Changes to worker instance templates roll out by gradually replacing instances + * Automatic rollouts create surge instances, wait for Kubelet health checks, then delete old instances (0 unavailable instances) + * Changing `worker_type`, `disk_size`, `preemptible`, or Butane `worker_snippets` on existing worker nodes will replace instances + * New OS images or changing `os_stream` will be ignored, to allow Fedora CoreOS or Flatcar Linux to keep themselves updated + * Previously, new instance templates were made in the same way, but not applied to instances unless manually replaced +* Add health checks to worker managed instance groups (i.e. 
"autohealing") ([#1207](https://github.com/poseidon/typhoon/pull/1207)) + * Use SSL health checks to probe the Kubelet every 30s + * Replace worker nodes that fail the health check 6 times (3min) +* Name `kube-apiserver` and `kubelet` health checks consistently ([#1207](https://github.com/poseidon/typhoon/pull/1207)) + * Use name `${cluster_name}-apiserver-health` and `${cluster_name}-kubelet-health` +* Rename managed instance group from `${cluster_name}-worker-group` to `${cluster_name}-worker` ([#1207](https://github.com/poseidon/typhoon/pull/1207)) * Fix bug provisioning clusters with multiple controller nodes ([#1195](https://github.com/poseidon/typhoon/pull/1195)) ### Addons diff --git a/google-cloud/fedora-coreos/kubernetes/apiserver.tf b/google-cloud/fedora-coreos/kubernetes/apiserver.tf index 318d9c1f..7a98b7a8 100644 --- a/google-cloud/fedora-coreos/kubernetes/apiserver.tf +++ b/google-cloud/fedora-coreos/kubernetes/apiserver.tf @@ -75,10 +75,10 @@ resource "google_compute_instance_group" "controllers" { ) } -# TCP health check for apiserver +# Health check for kube-apiserver resource "google_compute_health_check" "apiserver" { - name = "${var.cluster_name}-apiserver-tcp-health" - description = "TCP health check for kube-apiserver" + name = "${var.cluster_name}-apiserver-health" + description = "Health check for kube-apiserver" timeout_sec = 5 check_interval_sec = 5 @@ -86,7 +86,7 @@ resource "google_compute_health_check" "apiserver" { healthy_threshold = 1 unhealthy_threshold = 3 - tcp_health_check { + ssl_health_check { port = "6443" } } diff --git a/google-cloud/fedora-coreos/kubernetes/network.tf b/google-cloud/fedora-coreos/kubernetes/network.tf index e7c3090c..6fcbcb85 100644 --- a/google-cloud/fedora-coreos/kubernetes/network.tf +++ b/google-cloud/fedora-coreos/kubernetes/network.tf @@ -196,6 +196,24 @@ resource "google_compute_firewall" "allow-ingress" { target_tags = ["${var.cluster_name}-worker"] } +resource "google_compute_firewall" 
"google-kubelet-health-checks" { + name = "${var.cluster_name}-kubelet-health" + network = google_compute_network.network.name + + allow { + protocol = "tcp" + ports = [10250] + } + + # https://cloud.google.com/compute/docs/instance-groups/autohealing-instances-in-migs + source_ranges = [ + "35.191.0.0/16", + "130.211.0.0/22", + ] + + target_tags = ["${var.cluster_name}-worker"] +} + resource "google_compute_firewall" "google-ingress-health-checks" { name = "${var.cluster_name}-ingress-health" network = google_compute_network.network.name diff --git a/google-cloud/fedora-coreos/kubernetes/workers/workers.tf b/google-cloud/fedora-coreos/kubernetes/workers/workers.tf index 14a92b36..fe7ce43b 100644 --- a/google-cloud/fedora-coreos/kubernetes/workers/workers.tf +++ b/google-cloud/fedora-coreos/kubernetes/workers/workers.tf @@ -1,6 +1,6 @@ # Managed instance group of workers resource "google_compute_region_instance_group_manager" "workers" { - name = "${var.name}-worker-group" + name = "${var.name}-worker" description = "Compute instance group of ${var.name} workers" # instance name prefix for instances in the group @@ -11,6 +11,16 @@ resource "google_compute_region_instance_group_manager" "workers" { instance_template = google_compute_instance_template.worker.self_link } + # Roll out MIG instance template changes by replacing instances. + # - Surge to create new instances, then delete old instances. 
+ # - Replace ensures new Ignition is picked up + update_policy { + type = "PROACTIVE" + max_surge_fixed = 3 + max_unavailable_fixed = 0 + minimal_action = "REPLACE" + } + target_size = var.worker_count target_pools = [google_compute_target_pool.workers.self_link] @@ -23,12 +33,33 @@ resource "google_compute_region_instance_group_manager" "workers" { name = "https" port = "443" } + + auto_healing_policies { + health_check = google_compute_health_check.worker.id + initial_delay_sec = 120 + } +} + +# Health check for worker Kubelet +resource "google_compute_health_check" "worker" { + name = "${var.name}-kubelet-health" + description = "Health check for worker Kubelet" + + timeout_sec = 20 + check_interval_sec = 30 + + healthy_threshold = 1 + unhealthy_threshold = 6 + + ssl_health_check { + port = "10250" + } } # Worker instance template resource "google_compute_instance_template" "worker" { name_prefix = "${var.name}-worker-" - description = "Worker Instance template" + description = "${var.name} worker instance template" machine_type = var.machine_type metadata = { @@ -36,8 +67,11 @@ resource "google_compute_instance_template" "worker" { } scheduling { - automatic_restart = var.preemptible ? false : true - preemptible = var.preemptible + provisioning_model = var.preemptible ? "SPOT" : "STANDARD" + preemptible = var.preemptible + automatic_restart = var.preemptible ? false : true + # Spot instances with termination action DELETE cannot be used with MIGs + instance_termination_action = var.preemptible ? 
"STOP" : null } disk { @@ -49,10 +83,8 @@ resource "google_compute_instance_template" "worker" { network_interface { network = var.network - # Ephemeral external IP - access_config { - } + access_config {} } can_ip_forward = true diff --git a/google-cloud/flatcar-linux/kubernetes/apiserver.tf b/google-cloud/flatcar-linux/kubernetes/apiserver.tf index 318d9c1f..7a98b7a8 100644 --- a/google-cloud/flatcar-linux/kubernetes/apiserver.tf +++ b/google-cloud/flatcar-linux/kubernetes/apiserver.tf @@ -75,10 +75,10 @@ resource "google_compute_instance_group" "controllers" { ) } -# TCP health check for apiserver +# Health check for kube-apiserver resource "google_compute_health_check" "apiserver" { - name = "${var.cluster_name}-apiserver-tcp-health" - description = "TCP health check for kube-apiserver" + name = "${var.cluster_name}-apiserver-health" + description = "Health check for kube-apiserver" timeout_sec = 5 check_interval_sec = 5 @@ -86,7 +86,7 @@ resource "google_compute_health_check" "apiserver" { healthy_threshold = 1 unhealthy_threshold = 3 - tcp_health_check { + ssl_health_check { port = "6443" } } diff --git a/google-cloud/flatcar-linux/kubernetes/network.tf b/google-cloud/flatcar-linux/kubernetes/network.tf index e7c3090c..6fcbcb85 100644 --- a/google-cloud/flatcar-linux/kubernetes/network.tf +++ b/google-cloud/flatcar-linux/kubernetes/network.tf @@ -196,6 +196,24 @@ resource "google_compute_firewall" "allow-ingress" { target_tags = ["${var.cluster_name}-worker"] } +resource "google_compute_firewall" "google-kubelet-health-checks" { + name = "${var.cluster_name}-kubelet-health" + network = google_compute_network.network.name + + allow { + protocol = "tcp" + ports = [10250] + } + + # https://cloud.google.com/compute/docs/instance-groups/autohealing-instances-in-migs + source_ranges = [ + "35.191.0.0/16", + "130.211.0.0/22", + ] + + target_tags = ["${var.cluster_name}-worker"] +} + resource "google_compute_firewall" "google-ingress-health-checks" { name = 
"${var.cluster_name}-ingress-health" network = google_compute_network.network.name diff --git a/google-cloud/flatcar-linux/kubernetes/workers/workers.tf b/google-cloud/flatcar-linux/kubernetes/workers/workers.tf index 881b6dea..460ebc9b 100644 --- a/google-cloud/flatcar-linux/kubernetes/workers/workers.tf +++ b/google-cloud/flatcar-linux/kubernetes/workers/workers.tf @@ -1,6 +1,6 @@ # Managed instance group of workers resource "google_compute_region_instance_group_manager" "workers" { - name = "${var.name}-worker-group" + name = "${var.name}-worker" description = "Compute instance group of ${var.name} workers" # instance name prefix for instances in the group @@ -11,6 +11,16 @@ resource "google_compute_region_instance_group_manager" "workers" { instance_template = google_compute_instance_template.worker.self_link } + # Roll out MIG instance template changes by replacing instances. + # - Surge to create new instances, then delete old instances. + # - Replace ensures new Ignition is picked up + update_policy { + type = "PROACTIVE" + max_surge_fixed = 3 + max_unavailable_fixed = 0 + minimal_action = "REPLACE" + } + target_size = var.worker_count target_pools = [google_compute_target_pool.workers.self_link] @@ -23,6 +33,27 @@ resource "google_compute_region_instance_group_manager" "workers" { name = "https" port = "443" } + + auto_healing_policies { + health_check = google_compute_health_check.worker.id + initial_delay_sec = 120 + } +} + +# Health check for worker Kubelet +resource "google_compute_health_check" "worker" { + name = "${var.name}-kubelet-health" + description = "Health check for worker Kubelet" + + timeout_sec = 20 + check_interval_sec = 30 + + healthy_threshold = 1 + unhealthy_threshold = 6 + + ssl_health_check { + port = "10250" + } } # Worker instance template @@ -36,8 +67,11 @@ resource "google_compute_instance_template" "worker" { } scheduling { - automatic_restart = var.preemptible ? 
false : true - preemptible = var.preemptible + provisioning_model = var.preemptible ? "SPOT" : "STANDARD" + preemptible = var.preemptible + automatic_restart = var.preemptible ? false : true + # Spot instances with termination action DELETE cannot be used with MIGs + instance_termination_action = var.preemptible ? "STOP" : null } disk { @@ -49,10 +83,8 @@ resource "google_compute_instance_template" "worker" { network_interface { network = var.network - # Ephemeral external IP - access_config { - } + access_config {} } can_ip_forward = true @@ -64,6 +96,9 @@ resource "google_compute_instance_template" "worker" { } lifecycle { + ignore_changes = [ + disk[0].source_image + ] # To update an Instance Template, Terraform should replace the existing resource create_before_destroy = true }