# etc/cke-template.yml

# Cluster name.
name: neco
# Node entries: one control plane entry followed by two non-control-plane
# entries, told apart by the cke.cybozu.com/role label.
nodes:
  # Control plane nodes (role "cs").
  - user: cybozu
    control_plane: true
    labels:
      cke.cybozu.com/role: 'cs'
  # Non-control-plane nodes with role "cs".
  - user: cybozu
    control_plane: false
    labels:
      cke.cybozu.com/role: 'cs'
      cke.cybozu.com/weight: '18'
  # Non-control-plane nodes with role "ss". The NoSchedule taint keeps pods
  # off these nodes unless they tolerate cke.cybozu.com/role=storage.
  - user: cybozu
    control_plane: false
    labels:
      cke.cybozu.com/role: 'ss'
      cke.cybozu.com/weight: '10'
    taints:
      - key: cke.cybozu.com/role
        value: storage
        effect: NoSchedule
# Cluster-wide network and DNS settings.
service_subnet: 10.68.0.0/16
# Written as <namespace>/<service name> (presumably the upstream DNS
# Service; confirm against the CKE cluster configuration docs).
dns_service: internet-egress/unbound
# Same key as the taint listed under options.kubelet.boot_taints.
control_plane_tolerations:
  - 'node.cilium.io/agent-not-ready'
# Node reboot settings.
reboot:
  # Commands are written one argument per line.
  reboot_command:
    - '/usr/bin/neco'
    - 'power'
    - 'restart'
    - '--wait-for-stop'
  boot_check_command:
    - '/usr/bin/neco'
    - 'reboot-check'
  max_concurrent_reboots: 28
  # Eviction settings; the same three values are used in the repair section.
  eviction_timeout_seconds: 1800
  evict_retries: 30
  evict_interval: 10
  # Timeout / retry settings for the commands above.
  command_timeout_seconds: 30
  command_retries: 5
  command_interval: 60
# Machine repair settings.
repair:
  repair_procedures:
  # Procedure for machines whose type is "iDRAC".
  - machine_types: ['iDRAC']
    repair_operations:
    # Operation "unhealthy": four steps listed in order, followed by the
    # health check command used for this operation.
    - operation: unhealthy
      repair_steps:
      # Step 1: soft-reset the iDRAC by running racadm inside the setup-hw
      # container over "ckecli ssh". With "sh -c", the element following the
      # script ("reset-idrac") becomes $0, so $1 is the next argument;
      # presumably CKE appends the target address there (TODO confirm).
      - repair_command: ['sh', '-c', 'exec ckecli ssh cybozu@$1 -- docker exec setup-hw racadm racreset soft', 'reset-idrac']
        command_timeout_seconds: 30
        command_retries: 2
        command_interval: 30
        watch_seconds: 600
      # Step 2: discharge, after draining the node.
      # - The timeout is 60 because "discharge" is a compound command and
      #   requires a little more time.
      # - watch_seconds is too short a watch for discharging; it exists to
      #   power on the machine 6 minutes after.
      - repair_command: ['neco', 'bmc', 'repair', 'dell', 'discharge']
        need_drain: true
        command_timeout_seconds: 60
        command_retries: 2
        command_interval: 30
        watch_seconds: 360
      # Step 3: power the machine on ("|| true" ignores a failure of
      # "neco power start"); the next step powers on again 4 minutes after.
      - repair_command: ['sh', '-c', 'neco power start $1 || true', 'power-start']
        command_timeout_seconds: 30
        watch_seconds: 240
      # Step 4: second power-on attempt, with a longer watch.
      - repair_command: ['sh', '-c', 'neco power start $1 || true', 'power-start']
        command_timeout_seconds: 30
        watch_seconds: 1500
      # Health check: "check-machine-state <$1> healthy retired".
      health_check_command: ['sh', '-c', 'check-machine-state $1 healthy retired', 'check-machine-state']
      command_timeout_seconds: 10
    # Operation "unreachable": five steps listed in order (iDRAC reset,
    # discharge, two power-on attempts, retire-machine), followed by the
    # health check command used for this operation.
    - operation: unreachable
      repair_steps:
      # Step 1: reset the iDRAC through "neco bmc repair".
      - repair_command: ["neco", "bmc", "repair", "dell", "reset-idrac"]
        command_timeout_seconds: 30
        command_retries: 2
        command_interval: 30
        watch_seconds: 600
      # Step 2: discharge. The machine is powered on again by the following
      # steps, so this step takes the node down; drain it first, exactly as
      # the "unhealthy" operation does for the same command. A node reported
      # as unreachable may still be running pods.
      - repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
        command_timeout_seconds: 60 # "discharge" is a compound command and requires a little more time
        command_retries: 2
        command_interval: 30
        need_drain: true
        watch_seconds: 360 # too short watch for discharging; this is to power on machine 6 minutes after
      # Steps 3 and 4: power-on attempts; "|| true" ignores a failure of
      # "neco power start".
      - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
        command_timeout_seconds: 30
        watch_seconds: 240 # power on again 4 minutes after
      - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
        command_timeout_seconds: 30
        watch_seconds: 1500
      # Step 5: final step, run "retire-machine".
      - repair_command: ["retire-machine"]
        command_timeout_seconds: 30
        command_retries: 2
        command_interval: 30
        watch_seconds: 600
      # Health check: "check-machine-state <$1> healthy retired".
      health_check_command: ["sh", "-c", "check-machine-state $1 healthy retired", "check-machine-state"]
      command_timeout_seconds: 10
  # Repair one machine at a time.
  max_concurrent_repairs: 1
  # Eviction settings; same values as in the reboot section.
  eviction_timeout_seconds: 1800
  evict_retries: 30
  evict_interval: 10
# Per-component options.
options:
  # kube-apiserver: audit logging, extra flags and extra bind mounts.
  kube-api:
    audit_log_enabled: true
    audit_log_path: /var/log/audit/audit.log
    # The audit policy is a literal block scalar: every line indented below
    # the "|" (including the lines starting with "#") is part of the string
    # value, not a comment of this file. Editing that text changes the policy
    # document itself.
    # NOTE(review): the embedded text contains typos ("kuberenetes",
    # "operations. and"); they are left untouched to keep the value identical.
    audit_log_policy: |
      apiVersion: audit.k8s.io/v1
      kind: Policy
      omitStages:
        - RequestReceived
      omitManagedFields: true
      # Rules are evaluated in order, and the first matching rule is applied. Our rules and purpose are as follows:
      # - Logging all requests to secrets and sealedsecrets at Metadata level. Read operation is also targeted in order to record invalid access to secret. The reason why Metadata level is used is that we want to avoid logging the value of secrets.
      # - Do not log get, watch, list requests to any resources. This aims to reduce log volume from read-only operations that are less critical to audit.
      # - Log create requests to any resources at RequestResponse level for fault investigation.
      # - Log delete, deletecollection requests to any resources at Metadata level for fault investigation.
      # - Log requests to events resources at RequestResponse level in order to store timestamp and messages in events to Loki for fault investigation.
      # - Log requests to any resources from teleport service accounts at RequestResponse level for recording user's manual operations. system:serviceaccounts:teleport is used when users access the cluster via teleport-node.
      # - Do not log requests from nodes and service accounts, and kuberenetes components.
      # - Log all requests that do not match any rules above at RequestResponse level for recording user's manual operations. and unexpected requests. This includes requests from boot servers.
      rules:
        - level: Metadata
          resources:
            - group: ""
              resources: ["secrets"]
            - group: "bitnami.com"
              resources: ["sealedsecrets"]
        - level: None
          verbs: ["get", "watch", "list"]
        - level: RequestResponse
          verbs: ["create"]
        - level: Metadata
          verbs: ["delete", "deletecollection"]
        - level: RequestResponse
          resources:
            - group: ""
              resources: ["events"]
        - level: RequestResponse
          userGroups:
            - system:serviceaccounts:teleport
        - level: None
          userGroups:
            - system:nodes
            - system:serviceaccounts
        - level: None
          users:
            - system:apiserver
            - system:kube-controller-manager
            - system:kube-proxy
            - system:kube-scheduler
        - level: RequestResponse
    # Extra kube-apiserver flags:
    # - the ValidatingAdmissionPolicy feature gate, together with the
    #   admissionregistration.k8s.io/v1beta1 API enabled via --runtime-config;
    # - audit log rotation (per the kube-apiserver reference: maxage is in
    #   days, maxsize in megabytes, maxbackup is a number of files).
    extra_args:
      - "--feature-gates=ValidatingAdmissionPolicy=true"
      - "--audit-log-maxage=1"
      - "--audit-log-maxsize=10"
      - "--audit-log-maxbackup=10"
      - "--runtime-config=admissionregistration.k8s.io/v1beta1=true"
    # Bind /var/log/audit (the directory of audit_log_path) writable, i.e.
    # with read_only: false.
    extra_binds:
      - source: /var/log/audit
        destination: /var/log/audit
        read_only: false
  # kube-controller-manager: leader election timing (identical to the
  # values given to kube-scheduler) and the API client QPS.
  kube-controller-manager:
    extra_args:
      - '--leader-elect-retry-period=5s'
      - '--leader-elect-renew-deadline=15s'
      - '--leader-elect-lease-duration=20s'
      - '--kube-api-qps=50'
  # kube-proxy is disabled here; the configuration is still provided
  # for local-proxy running on boot servers.
  kube-proxy:
    disable: true
    config:
      apiVersion: kubeproxy.config.k8s.io/v1alpha1
      kind: KubeProxyConfiguration
      mode: ipvs
  # kube-scheduler: scheduler extender, PodTopologySpread tuning and
  # leader election timing.
  kube-scheduler:
    config:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: KubeSchedulerConfiguration
      # Extender listening on localhost:9251 that handles the
      # topolvm.cybozu.com/capacity resource (presumably topolvm-scheduler;
      # confirm against the deployment).
      extenders:
        - urlPrefix: 'http://127.0.0.1:9251'
          filterVerb: predicate
          prioritizeVerb: prioritize
          weight: 1
          nodeCacheCapable: false
          managedResources:
            - name: topolvm.cybozu.com/capacity
              ignoredByScheduler: true
      profiles:
        - schedulerName: default-scheduler
          plugins:
            score:
              # The PodTopologySpread score plugin is disabled and then
              # enabled again so that it runs with weight 500.
              disabled:
                - name: PodTopologySpread
              enabled:
                - name: PodTopologySpread
                  weight: 500
          pluginConfig:
            - name: PodTopologySpread
              args:
                # Field reference:
                # https://pkg.go.dev/k8s.io/kube-scheduler/config/v1#PodTopologySpreadArgs
                defaultingType: List
                defaultConstraints:
                  - topologyKey: topology.kubernetes.io/zone
                    maxSkew: 1
                    whenUnsatisfiable: ScheduleAnyway
                  - topologyKey: kubernetes.io/hostname
                    maxSkew: 1
                    whenUnsatisfiable: ScheduleAnyway
    # Same leader election timing as kube-controller-manager.
    extra_args:
      - '--leader-elect-retry-period=5s'
      - '--leader-elect-renew-deadline=15s'
      - '--leader-elect-lease-duration=20s'
  # kubelet: boot taints, KubeletConfiguration, CRI endpoint, extra flags
  # and extra bind mounts.
  kubelet:
    # The taint key below is the one listed in control_plane_tolerations.
    boot_taints:
      - key: node.cilium.io/agent-not-ready
        value: 'true'
        effect: NoExecute
    config:
      apiVersion: kubelet.config.k8s.io/v1beta1
      kind: KubeletConfiguration
      cgroupDriver: systemd
      maxPods: 150
      # Container log rotation.
      containerLogMaxSize: 10Mi
      containerLogMaxFiles: 10
      # CPU / topology management.
      cpuManagerPolicy: static
      topologyManagerPolicy: best-effort
      systemReserved:
        cpu: '1'
      # Image garbage collection thresholds (percent).
      imageGCHighThresholdPercent: 65
      imageGCLowThresholdPercent: 60
    # The socket path here and in --containerd below is the same.
    cri_endpoint: unix:///var/run/k8s-containerd.sock
    extra_args:
      - '--containerd=/var/run/k8s-containerd.sock'
      - '--seccomp-default=true'
    # Both directories are bound with read_only: false.
    extra_binds:
      - source: /var/lib/k8s-containerd
        destination: /var/lib/k8s-containerd
        read_only: false
      - source: /var/lib/rook
        destination: /var/lib/rook
        read_only: false
  # rivers: dial and check timing; etcd-rivers uses the same three values.
  rivers:
    extra_args:
      - '--dial-timeout=4s'
      - '--dial-keep-alive=6s'
      - '--check-interval=5s'
  # etcd-rivers: dial and check timing; rivers uses the same three values.
  etcd-rivers:
    extra_args:
      - '--dial-timeout=4s'
      - '--dial-keep-alive=6s'
      - '--check-interval=5s'
  # etcd server flags.
  etcd:
    extra_args:
      # Serve etcd metrics over plain HTTP on port 2381, on all addresses.
      - "--listen-metrics-urls=http://0.0.0.0:2381"
      # Backend quota: 8 * 1024 * 1024 * 1024 = 8589934592 bytes (8 GiB)
      - "--quota-backend-bytes=8589934592"
      # NOTE(review): the GCPONLY-prefixed lines below look like markers for
      # a text preprocessing step that turns them into real list items in GCP
      # environments. Keep their prefix and indentation byte-for-byte.
      # TODO confirm with the tooling that consumes this template.
      #GCPONLY - "--heartbeat-interval=500"
      #GCPONLY - "--election-timeout=2500"