From 084e8bea49b8f2c9afe7f2c24bc8b3d70957c1fb Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sun, 11 Apr 2021 12:08:56 -0700 Subject: [PATCH] Allow custom initial node taints on worker pool nodes * Add `node_taints` variable to worker modules to set custom initial node taints on cloud platforms that support auto-scaling worker pools of heterogeneous nodes (i.e. AWS, Azure, GCP) * Worker pools could use custom `node_labels` to allow workloads to select among differentiated nodes, while custom `node_taints` allows a worker pool's nodes to be tainted as special to prevent scheduling, except by workloads that explicitly tolerate the taint * Expose `daemonset_tolerations` in AWS, Azure, and GCP kubernetes cluster modules, to determine whether `kube-system` components should tolerate the custom taint (advanced use covered in docs) Rel: #550, #663 Closes #429 --- CHANGES.md | 18 +++ aws/fedora-coreos/kubernetes/variables.tf | 1 - aws/flatcar-linux/kubernetes/bootstrap.tf | 1 + aws/flatcar-linux/kubernetes/variables.tf | 5 + .../kubernetes/workers/cl/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 6 + .../kubernetes/workers/workers.tf | 1 + azure/fedora-coreos/kubernetes/bootstrap.tf | 1 + azure/fedora-coreos/kubernetes/variables.tf | 5 + .../kubernetes/workers/fcc/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 6 + .../kubernetes/workers/workers.tf | 1 + azure/flatcar-linux/kubernetes/bootstrap.tf | 1 + azure/flatcar-linux/kubernetes/variables.tf | 5 + .../kubernetes/workers/cl/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 6 + .../kubernetes/workers/workers.tf | 1 + docs/advanced/arm64.md | 20 +-- docs/advanced/nodes.md | 134 ++++++++++++++++++ docs/advanced/worker-pools.md | 3 + .../fedora-coreos/kubernetes/bootstrap.tf | 1 + .../fedora-coreos/kubernetes/variables.tf | 5 + .../kubernetes/workers/fcc/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 6 + .../kubernetes/workers/workers.tf | 1 + .../flatcar-linux/kubernetes/bootstrap.tf 
| 1 + .../flatcar-linux/kubernetes/variables.tf | 5 + .../kubernetes/workers/cl/worker.yaml | 3 + .../kubernetes/workers/variables.tf | 6 + .../kubernetes/workers/workers.tf | 1 + mkdocs.yml | 1 + 31 files changed, 246 insertions(+), 11 deletions(-) create mode 100644 docs/advanced/nodes.md diff --git a/CHANGES.md b/CHANGES.md index fd125dd1..8db6e0d4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,24 @@ Notable changes between versions. * Kubernetes [v1.21.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.21.md#v1210) +### AWS + +* Allow setting custom initial node taints on worker pools ([#968](https://github.com/poseidon/typhoon/pull/968)) + * Add `node_taints` variable to internal `workers` pool module to set initial node taints + * Add `daemonset_tolerations` so `kube-system` DaemonSets can tolerate custom taints + +### Azure + +* Allow setting custom initial node taints on worker pools ([#968](https://github.com/poseidon/typhoon/pull/968)) + * Add `node_taints` variable to internal `workers` pool module to set initial node taints + * Add `daemonset_tolerations` so `kube-system` DaemonSets can tolerate custom taints + +### Google Cloud + +* Allow setting custom initial node taints on worker pools ([#968](https://github.com/poseidon/typhoon/pull/968)) + * Add `node_taints` variable to internal `workers` pool module to set initial node taints + * Add `daemonset_tolerations` so `kube-system` DaemonSets can tolerate custom taints + ### Addons * Update kube-state-metrics from v2.0.0-rc.0 to [v2.0.0-rc.1](https://github.com/kubernetes/kube-state-metrics/releases/tag/v2.0.0-rc.1) diff --git a/aws/fedora-coreos/kubernetes/variables.tf b/aws/fedora-coreos/kubernetes/variables.tf index 53e3f59b..95a09e26 100644 --- a/aws/fedora-coreos/kubernetes/variables.tf +++ b/aws/fedora-coreos/kubernetes/variables.tf @@ -176,4 +176,3 @@ variable "daemonset_tolerations" { description = "List of additional taint keys kube-system DaemonSets should 
tolerate (e.g. ['custom-role', 'gpu-role'])" default = [] } - diff --git a/aws/flatcar-linux/kubernetes/bootstrap.tf b/aws/flatcar-linux/kubernetes/bootstrap.tf index 6eb740b6..ae16e7be 100644 --- a/aws/flatcar-linux/kubernetes/bootstrap.tf +++ b/aws/flatcar-linux/kubernetes/bootstrap.tf @@ -12,5 +12,6 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations } diff --git a/aws/flatcar-linux/kubernetes/variables.tf b/aws/flatcar-linux/kubernetes/variables.tf index 0678bbbc..d38f8aa9 100644 --- a/aws/flatcar-linux/kubernetes/variables.tf +++ b/aws/flatcar-linux/kubernetes/variables.tf @@ -160,3 +160,8 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. ['custom-role', 'gpu-role'])" + default = [] +} diff --git a/aws/flatcar-linux/kubernetes/workers/cl/worker.yaml b/aws/flatcar-linux/kubernetes/workers/cl/worker.yaml index 01bf0905..9e7848e3 100644 --- a/aws/flatcar-linux/kubernetes/workers/cl/worker.yaml +++ b/aws/flatcar-linux/kubernetes/workers/cl/worker.yaml @@ -73,6 +73,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --provider-id=aws:///$${COREOS_EC2_AVAILABILITY_ZONE}/$${COREOS_EC2_INSTANCE_ID} \ --read-only-port=0 \ diff --git a/aws/flatcar-linux/kubernetes/workers/variables.tf b/aws/flatcar-linux/kubernetes/workers/variables.tf index 8ed3a8a7..a4dbab75 100644 --- a/aws/flatcar-linux/kubernetes/workers/variables.tf +++ b/aws/flatcar-linux/kubernetes/workers/variables.tf @@ -113,3 +113,9 @@ variable "node_labels" { description = "List of initial node labels" 
default = [] } + +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} diff --git a/aws/flatcar-linux/kubernetes/workers/workers.tf b/aws/flatcar-linux/kubernetes/workers/workers.tf index 2f052f4c..621763fb 100644 --- a/aws/flatcar-linux/kubernetes/workers/workers.tf +++ b/aws/flatcar-linux/kubernetes/workers/workers.tf @@ -86,6 +86,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/azure/fedora-coreos/kubernetes/bootstrap.tf b/azure/fedora-coreos/kubernetes/bootstrap.tf index ed297982..710c2cf8 100644 --- a/azure/fedora-coreos/kubernetes/bootstrap.tf +++ b/azure/fedora-coreos/kubernetes/bootstrap.tf @@ -18,6 +18,7 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations # Fedora CoreOS trusted_certs_dir = "/etc/pki/tls/certs" diff --git a/azure/fedora-coreos/kubernetes/variables.tf b/azure/fedora-coreos/kubernetes/variables.tf index 79164034..d5191047 100644 --- a/azure/fedora-coreos/kubernetes/variables.tf +++ b/azure/fedora-coreos/kubernetes/variables.tf @@ -135,3 +135,8 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. 
['custom-role', 'gpu-role'])" + default = [] +} diff --git a/azure/fedora-coreos/kubernetes/workers/fcc/worker.yaml b/azure/fedora-coreos/kubernetes/workers/fcc/worker.yaml index ba6622e8..c2c7a2f4 100644 --- a/azure/fedora-coreos/kubernetes/workers/fcc/worker.yaml +++ b/azure/fedora-coreos/kubernetes/workers/fcc/worker.yaml @@ -67,6 +67,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/azure/fedora-coreos/kubernetes/workers/variables.tf b/azure/fedora-coreos/kubernetes/workers/variables.tf index 0e12fd08..b7084636 100644 --- a/azure/fedora-coreos/kubernetes/workers/variables.tf +++ b/azure/fedora-coreos/kubernetes/workers/variables.tf @@ -88,6 +88,12 @@ variable "node_labels" { default = [] } +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} + # unofficial, undocumented, unsupported variable "cluster_domain_suffix" { diff --git a/azure/fedora-coreos/kubernetes/workers/workers.tf b/azure/fedora-coreos/kubernetes/workers/workers.tf index 6ecb81a5..defd462e 100644 --- a/azure/fedora-coreos/kubernetes/workers/workers.tf +++ b/azure/fedora-coreos/kubernetes/workers/workers.tf @@ -87,6 +87,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/azure/flatcar-linux/kubernetes/bootstrap.tf b/azure/flatcar-linux/kubernetes/bootstrap.tf index 827352d3..abbe58d7 100644 --- a/azure/flatcar-linux/kubernetes/bootstrap.tf +++ b/azure/flatcar-linux/kubernetes/bootstrap.tf @@ -18,5 +18,6 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = 
var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations } diff --git a/azure/flatcar-linux/kubernetes/variables.tf b/azure/flatcar-linux/kubernetes/variables.tf index 429cdeb8..07694d0f 100644 --- a/azure/flatcar-linux/kubernetes/variables.tf +++ b/azure/flatcar-linux/kubernetes/variables.tf @@ -141,3 +141,8 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. ['custom-role', 'gpu-role'])" + default = [] +} diff --git a/azure/flatcar-linux/kubernetes/workers/cl/worker.yaml b/azure/flatcar-linux/kubernetes/workers/cl/worker.yaml index 014f735b..932e7f45 100644 --- a/azure/flatcar-linux/kubernetes/workers/cl/worker.yaml +++ b/azure/flatcar-linux/kubernetes/workers/cl/worker.yaml @@ -70,6 +70,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/azure/flatcar-linux/kubernetes/workers/variables.tf b/azure/flatcar-linux/kubernetes/workers/variables.tf index 973fc1fb..9d168394 100644 --- a/azure/flatcar-linux/kubernetes/workers/variables.tf +++ b/azure/flatcar-linux/kubernetes/workers/variables.tf @@ -94,6 +94,12 @@ variable "node_labels" { default = [] } +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} + # unofficial, undocumented, unsupported variable "cluster_domain_suffix" { diff --git a/azure/flatcar-linux/kubernetes/workers/workers.tf b/azure/flatcar-linux/kubernetes/workers/workers.tf index ce9ba08d..a9a3891c 100644 --- a/azure/flatcar-linux/kubernetes/workers/workers.tf +++ b/azure/flatcar-linux/kubernetes/workers/workers.tf @@ -105,6 
+105,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/docs/advanced/arm64.md b/docs/advanced/arm64.md index 49962071..28b68bf6 100644 --- a/docs/advanced/arm64.md +++ b/docs/advanced/arm64.md @@ -21,7 +21,7 @@ Create a cluster with ARM64 controller and worker nodes. Container workloads mus ```tf module "gravitas" { - source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.21.0" # AWS cluster_name = "gravitas" @@ -47,9 +47,9 @@ Verify the cluster has only arm64 (`aarch64`) nodes. ``` $ kubectl get nodes -o wide NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME -ip-10-0-12-178 Ready 101s v1.19.4 10.0.12.178 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 -ip-10-0-18-93 Ready 102s v1.19.4 10.0.18.93 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 -ip-10-0-90-10 Ready 104s v1.19.4 10.0.90.10 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-12-178 Ready 101s v1.21.0 10.0.12.178 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-18-93 Ready 102s v1.21.0 10.0.18.93 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-90-10 Ready 104s v1.21.0 10.0.90.10 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 ``` ## Hybrid @@ -60,7 +60,7 @@ Create a hybrid/mixed arch cluster by defining an AWS cluster. 
Then define a [wo ```tf module "gravitas" { - source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.21.0" # AWS cluster_name = "gravitas" @@ -83,7 +83,7 @@ Create a hybrid/mixed arch cluster by defining an AWS cluster. Then define a [wo ```tf module "gravitas-arm64" { - source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes/workers?ref=v1.19.4" + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes/workers?ref=v1.21.0" # AWS vpc_id = module.gravitas.vpc_id @@ -108,9 +108,9 @@ Verify amd64 (x86_64) and arm64 (aarch64) nodes are present. ``` $ kubectl get nodes -o wide NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME -ip-10-0-14-73 Ready 116s v1.19.4 10.0.14.73 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 -ip-10-0-17-167 Ready 104s v1.19.4 10.0.17.167 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 -ip-10-0-47-166 Ready 110s v1.19.4 10.0.47.166 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 -ip-10-0-7-237 Ready 111s v1.19.4 10.0.7.237 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-14-73 Ready 116s v1.21.0 10.0.14.73 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-17-167 Ready 104s v1.21.0 10.0.17.167 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-47-166 Ready 110s v1.21.0 10.0.47.166 Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-7-237 Ready 111s v1.21.0 10.0.7.237 Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 ``` diff --git a/docs/advanced/nodes.md b/docs/advanced/nodes.md new file mode 100644 index 00000000..929e6306 --- /dev/null +++ b/docs/advanced/nodes.md @@ -0,0 +1,134 @@ +# Nodes + +Typhoon clusters consist of 
controller node(s) and a (default) set of worker nodes. + +## Overview + +Typhoon nodes use the standard set of Kubernetes node labels. + +```yaml +Labels: kubernetes.io/arch=amd64 + kubernetes.io/hostname=node-name + kubernetes.io/os=linux +``` + +Controller node(s) are labeled to allow node selection (for rare components that run on controllers) and tainted to prevent ordinary workloads running on controllers. + +```yaml +Labels: node.kubernetes.io/controller=true +Taints: node-role.kubernetes.io/controller:NoSchedule +``` + +Worker nodes are labeled to allow node selection and untainted. Workloads will schedule on worker nodes by default, barring any contraindications. + +```yaml +Labels: node.kubernetes.io/node= +Taints: +``` + +On auto-scaling cloud platforms, you may add [worker pools](/advanced/worker-pools) with different groups of nodes with their own labels and taints. On platforms like bare-metal, with heterogeneous machines, you may manage node labels and taints per node. + +## Node Labels + +Add custom initial worker node labels to default workers or worker pool nodes to allow workloads to select among nodes that differ. 
+ +=== "Cluster" + + ```tf + module "yavin" { + source = "git::https://github.com/poseidon/typhoon//google-cloud/fedora-coreos/kubernetes?ref=v1.21.0" + + # Google Cloud + cluster_name = "yavin" + region = "us-central1" + dns_zone = "example.com" + dns_zone_name = "example-zone" + + # configuration + ssh_authorized_key = local.ssh_key + + # optional + worker_count = 2 + worker_node_labels = ["pool=default"] + } + ``` + +=== "Worker Pool" + + ```tf + module "yavin-pool" { + source = "git::https://github.com/poseidon/typhoon//google-cloud/fedora-coreos/kubernetes/workers?ref=v1.21.0" + + # Google Cloud + cluster_name = "yavin" + region = "europe-west2" + network = module.yavin.network_name + + # configuration + name = "yavin-16x" + kubeconfig = module.yavin.kubeconfig + ssh_authorized_key = local.ssh_key + + # optional + worker_count = 1 + machine_type = "n1-standard-16" + node_labels = ["pool=big"] + } + ``` + +In the example above, the two default workers would be labeled `pool: default` and the additional worker would be labeled `pool: big`. + +## Node Taints + +Add custom initial taints on worker pool nodes to indicate a node is unique and should only schedule workloads that explicitly tolerate a given taint key. + +!!! warning + Since taints prevent workloads scheduling onto a node, you must decide whether `kube-system` DaemonSets (e.g. flannel, Calico, Cilium) should tolerate your custom taint by setting `daemonset_tolerations`. If you don't list your custom taint(s), important components won't run on these nodes. 
+ +=== "Cluster" + + ```tf + module "yavin" { + source = "git::https://github.com/poseidon/typhoon//google-cloud/fedora-coreos/kubernetes?ref=v1.21.0" + + # Google Cloud + cluster_name = "yavin" + region = "us-central1" + dns_zone = "example.com" + dns_zone_name = "example-zone" + + # configuration + ssh_authorized_key = local.ssh_key + + # optional + worker_count = 2 + daemonset_tolerations = ["role"] + } + ``` + +=== "Worker Pool" + + ```tf + module "yavin-pool" { + source = "git::https://github.com/poseidon/typhoon//google-cloud/fedora-coreos/kubernetes/workers?ref=v1.21.0" + + # Google Cloud + cluster_name = "yavin" + region = "europe-west2" + network = module.yavin.network_name + + # configuration + name = "yavin-16x" + kubeconfig = module.yavin.kubeconfig + ssh_authorized_key = local.ssh_key + + # optional + worker_count = 1 + accelerator_type = "nvidia-tesla-p100" + accelerator_count = 1 + node_taints = ["role=gpu:NoSchedule"] + } + ``` + +In the example above, the additional worker would be tainted with `role=gpu:NoSchedule` to prevent workloads scheduling, but `kube-system` components like flannel, Calico, or Cilium would tolerate that custom taint to run there. + diff --git a/docs/advanced/worker-pools.md b/docs/advanced/worker-pools.md index 339f2379..d58d5863 100644 --- a/docs/advanced/worker-pools.md +++ b/docs/advanced/worker-pools.md @@ -99,6 +99,7 @@ The AWS internal `workers` module supports a number of [variables](https://githu | snippets | Fedora CoreOS or Container Linux Config snippets | [] | [examples](/advanced/customization/) | | service_cidr | Must match `service_cidr` of cluster | "10.3.0.0/16" | "10.3.0.0/24" | | node_labels | List of initial node labels | [] | ["worker-pool=foo"] | +| node_taints | List of initial node taints | [] | ["role=gpu:NoSchedule"] | Check the list of valid [instance types](https://aws.amazon.com/ec2/instance-types/) or per-region and per-type [spot prices](https://aws.amazon.com/ec2/spot/pricing/). 
@@ -194,6 +195,7 @@ The Azure internal `workers` module supports a number of [variables](https://git | snippets | Container Linux Config snippets | [] | [examples](/advanced/customization/) | | service_cidr | CIDR IPv4 range to assign to Kubernetes services | "10.3.0.0/16" | "10.3.0.0/24" | | node_labels | List of initial node labels | [] | ["worker-pool=foo"] | +| node_taints | List of initial node taints | [] | ["role=gpu:NoSchedule"] | Check the list of valid [machine types](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/) and their [specs](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general). Use `az vm list-skus` to get the identifier. @@ -297,6 +299,7 @@ Check the list of regions [docs](https://cloud.google.com/compute/docs/regions-z | snippets | Container Linux Config snippets | [] | [examples](/advanced/customization/) | | service_cidr | Must match `service_cidr` of cluster | "10.3.0.0/16" | "10.3.0.0/24" | | node_labels | List of initial node labels | [] | ["worker-pool=foo"] | +| node_taints | List of initial node taints | [] | ["role=gpu:NoSchedule"] | Check the list of valid [machine types](https://cloud.google.com/compute/docs/machine-types). 
diff --git a/google-cloud/fedora-coreos/kubernetes/bootstrap.tf b/google-cloud/fedora-coreos/kubernetes/bootstrap.tf index 28772f8f..657d055d 100644 --- a/google-cloud/fedora-coreos/kubernetes/bootstrap.tf +++ b/google-cloud/fedora-coreos/kubernetes/bootstrap.tf @@ -12,6 +12,7 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations trusted_certs_dir = "/etc/pki/tls/certs" diff --git a/google-cloud/fedora-coreos/kubernetes/variables.tf b/google-cloud/fedora-coreos/kubernetes/variables.tf index dce03d6c..c3d2d700 100644 --- a/google-cloud/fedora-coreos/kubernetes/variables.tf +++ b/google-cloud/fedora-coreos/kubernetes/variables.tf @@ -136,3 +136,8 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. 
['custom-role', 'gpu-role'])" + default = [] +} diff --git a/google-cloud/fedora-coreos/kubernetes/workers/fcc/worker.yaml b/google-cloud/fedora-coreos/kubernetes/workers/fcc/worker.yaml index 525fc84f..1c65b0ea 100644 --- a/google-cloud/fedora-coreos/kubernetes/workers/fcc/worker.yaml +++ b/google-cloud/fedora-coreos/kubernetes/workers/fcc/worker.yaml @@ -67,6 +67,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/google-cloud/fedora-coreos/kubernetes/workers/variables.tf b/google-cloud/fedora-coreos/kubernetes/workers/variables.tf index 576886dc..7d19f3a1 100644 --- a/google-cloud/fedora-coreos/kubernetes/workers/variables.tf +++ b/google-cloud/fedora-coreos/kubernetes/workers/variables.tf @@ -90,6 +90,12 @@ variable "node_labels" { default = [] } +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} + # unofficial, undocumented, unsupported, temporary variable "cluster_domain_suffix" { diff --git a/google-cloud/fedora-coreos/kubernetes/workers/workers.tf b/google-cloud/fedora-coreos/kubernetes/workers/workers.tf index 3c36b1aa..21ebd456 100644 --- a/google-cloud/fedora-coreos/kubernetes/workers/workers.tf +++ b/google-cloud/fedora-coreos/kubernetes/workers/workers.tf @@ -89,6 +89,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/google-cloud/flatcar-linux/kubernetes/bootstrap.tf b/google-cloud/flatcar-linux/kubernetes/bootstrap.tf index cb9e30fe..def4f0bb 100644 --- a/google-cloud/flatcar-linux/kubernetes/bootstrap.tf +++ 
b/google-cloud/flatcar-linux/kubernetes/bootstrap.tf @@ -12,6 +12,7 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations // temporary external_apiserver_port = 443 diff --git a/google-cloud/flatcar-linux/kubernetes/variables.tf b/google-cloud/flatcar-linux/kubernetes/variables.tf index f09bccab..e23a304b 100644 --- a/google-cloud/flatcar-linux/kubernetes/variables.tf +++ b/google-cloud/flatcar-linux/kubernetes/variables.tf @@ -130,3 +130,8 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. ['custom-role', 'gpu-role'])" + default = [] +} diff --git a/google-cloud/flatcar-linux/kubernetes/workers/cl/worker.yaml b/google-cloud/flatcar-linux/kubernetes/workers/cl/worker.yaml index 014f735b..932e7f45 100644 --- a/google-cloud/flatcar-linux/kubernetes/workers/cl/worker.yaml +++ b/google-cloud/flatcar-linux/kubernetes/workers/cl/worker.yaml @@ -70,6 +70,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/google-cloud/flatcar-linux/kubernetes/workers/variables.tf b/google-cloud/flatcar-linux/kubernetes/workers/variables.tf index 56a92303..b430c6c8 100644 --- a/google-cloud/flatcar-linux/kubernetes/workers/variables.tf +++ b/google-cloud/flatcar-linux/kubernetes/workers/variables.tf @@ -84,6 +84,12 @@ variable "node_labels" { default = [] } +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} + # unofficial, undocumented, unsupported, temporary variable 
"cluster_domain_suffix" { diff --git a/google-cloud/flatcar-linux/kubernetes/workers/workers.tf b/google-cloud/flatcar-linux/kubernetes/workers/workers.tf index e635592f..5d56b3d4 100644 --- a/google-cloud/flatcar-linux/kubernetes/workers/workers.tf +++ b/google-cloud/flatcar-linux/kubernetes/workers/workers.tf @@ -86,6 +86,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/mkdocs.yml b/mkdocs.yml index 5d599f54..cca4920d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -80,6 +80,7 @@ nav: - 'Overview': 'advanced/overview.md' - 'ARM64': 'advanced/arm64.md' - 'Customization': 'advanced/customization.md' + - 'Nodes': 'advanced/nodes.md' - 'Worker Pools': 'advanced/worker-pools.md' - 'Addons': - 'Overview': 'addons/overview.md'