-
Notifications
You must be signed in to change notification settings - Fork 11
/
cke-template.yml
245 lines (245 loc) · 9.09 KB
/
cke-template.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Cluster name.
name: neco

# Node entries. Each entry is a self-contained mapping with its own user,
# control-plane flag, labels and (optionally) taints.
nodes:
  # Control plane entry, labelled with role "cs".
  - user: cybozu
    control_plane: true
    labels:
      cke.cybozu.com/role: "cs"
  # Non-control-plane entry, role "cs". Label values are quoted so the
  # all-digit weight stays a string.
  - user: cybozu
    control_plane: false
    labels:
      cke.cybozu.com/role: "cs"
      cke.cybozu.com/weight: "18"
  # Non-control-plane entry, role "ss". It carries a NoSchedule taint with
  # value "storage" on the role key.
  - user: cybozu
    control_plane: false
    labels:
      cke.cybozu.com/role: "ss"
      cke.cybozu.com/weight: "10"
    taints:
      - key: cke.cybozu.com/role
        value: storage
        effect: NoSchedule
# CIDR for the service subnet.
service_subnet: 10.68.0.0/16
# DNS service reference; presumably "<namespace>/<service name>" — confirm
# against the consumer's schema.
dns_service: internet-egress/unbound
# Taint keys listed for control plane tolerations. The single entry is the
# same key used by the kubelet boot taint under options.kubelet.boot_taints.
control_plane_tolerations:
  - "node.cilium.io/agent-not-ready"
# Node reboot settings: the commands to run, plus concurrency, timeout,
# retry and eviction limits. All keys below belong to the `reboot` mapping.
reboot:
  # Command that restarts a machine.
  reboot_command: ["/usr/bin/neco", "power", "restart", "--wait-for-stop"]
  # Command used to check the boot state after a reboot.
  boot_check_command: ["/usr/bin/neco", "reboot-check"]
  max_concurrent_reboots: 28
  eviction_timeout_seconds: 1800
  command_timeout_seconds: 30
  command_retries: 5
  # NOTE(review): units for the *_interval values are not stated in this
  # file; presumably seconds — confirm against the consumer's schema.
  command_interval: 60
  evict_retries: 30
  evict_interval: 10
# Machine repair settings.
#
# Structure: repair_procedures -> repair_operations -> repair_steps.
# Each step has its own command, timeout/retry settings and watch period.
# Each operation additionally has a health_check_command with its own
# command_timeout_seconds. Limits that apply to repair as a whole
# (concurrency and eviction) sit directly under `repair`.
#
# In the `sh -c "<script>" <name>` commands below, the word after the script
# becomes $0 of the shell. $1 is presumably an argument appended by the
# caller (looks like the target machine address) — confirm.
repair:
  repair_procedures:
    - machine_types: ["iDRAC"]
      repair_operations:
        - operation: unhealthy
          repair_steps:
            # Step 1: soft-reset the iDRAC by running racadm on the machine
            # itself over ssh.
            - repair_command: ["sh", "-c", "exec ckecli ssh cybozu@$1 -- docker exec setup-hw racadm racreset soft", "reset-idrac"]
              command_timeout_seconds: 30
              command_retries: 2
              command_interval: 30
              watch_seconds: 600
            # Step 2: discharge; this is the only step in this operation
            # that requests a drain.
            - repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
              command_timeout_seconds: 60  # "discharge" is a compound command and requires a little more time
              command_retries: 2
              command_interval: 30
              need_drain: true
              watch_seconds: 360  # too short watch for discharging; this is to power on machine 6 minutes after
            # Steps 3 and 4: power on; `|| true` makes the step succeed even
            # if the power-start command fails.
            - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
              command_timeout_seconds: 30
              watch_seconds: 240  # power on again 4 minutes after
            - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
              command_timeout_seconds: 30
              watch_seconds: 1500
          # Operation-level health check and its timeout.
          health_check_command: ["sh", "-c", "check-machine-state $1 healthy retired", "check-machine-state"]
          command_timeout_seconds: 10
        - operation: unreachable
          repair_steps:
            # Step 1: reset the iDRAC via the neco bmc command (no ssh to the
            # machine, unlike the "unhealthy" operation).
            - repair_command: ["neco", "bmc", "repair", "dell", "reset-idrac"]
              command_timeout_seconds: 30
              command_retries: 2
              command_interval: 30
              watch_seconds: 600
            # Step 2: discharge, without requesting a drain.
            - repair_command: ["neco", "bmc", "repair", "dell", "discharge"]
              command_timeout_seconds: 60  # "discharge" is a compound command and requires a little more time
              command_retries: 2
              command_interval: 30
              need_drain: false
              watch_seconds: 360  # too short watch for discharging; this is to power on machine 6 minutes after
            # Steps 3 and 4: power on; failures of the command are ignored.
            - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
              command_timeout_seconds: 30
              watch_seconds: 240  # power on again 4 minutes after
            - repair_command: ["sh", "-c", "neco power start $1 || true", "power-start"]
              command_timeout_seconds: 30
              watch_seconds: 1500
            # Step 5: last step of this operation; runs retire-machine.
            - repair_command: ["retire-machine"]
              command_timeout_seconds: 30
              command_retries: 2
              command_interval: 30
              watch_seconds: 600
          # Operation-level health check and its timeout.
          health_check_command: ["sh", "-c", "check-machine-state $1 healthy retired", "check-machine-state"]
          command_timeout_seconds: 10
  # Limits applying to repair as a whole.
  max_concurrent_repairs: 1
  evict_retries: 30
  evict_interval: 10
  eviction_timeout_seconds: 1800
# Per-component options. Each child key names one component and holds that
# component's settings (extra command-line arguments, bind mounts, and an
# embedded configuration document where applicable).
options:
  kube-api:
    audit_log_enabled: true
    audit_log_path: /var/log/audit/audit.log
    # The audit policy is passed as a literal block scalar; everything
    # indented below (including the "#" lines) is part of the policy text.
    audit_log_policy: |
      apiVersion: audit.k8s.io/v1
      kind: Policy
      omitStages:
        - RequestReceived
      omitManagedFields: true
      # Rules are evaluated in order, and the first matching rule is applied. Our rules and purpose are as follows:
      # - Logging all requests to secrets and sealedsecrets at Metadata level. Read operation is also targeted in order to record invalid access to secret. The reason why Metadata level is used is that we want to avoid logging the value of secrets.
      # - Do not log get, watch, list requests to any resources. This aims to reduce log volume from read-only operations that are less critical to audit.
      # - Log create requests to any resources at RequestResponse level for fault investigation.
      # - Log delete, deletecollection requests to any resources at Metadata level for fault investigation.
      # - Log requests to events resources at RequestResponse level in order to store timestamp and messages in events to Loki for fault investigation.
      # - Log requests to any resources from teleport service accounts at RequestResponse level for recording user's manual operations. system:serviceaccounts:teleport is used when users access the cluster via teleport-node.
      # - Do not log requests from nodes and service accounts, and Kubernetes components.
      # - Log all requests that do not match any rules above at RequestResponse level for recording user's manual operations and unexpected requests. This includes requests from boot servers.
      rules:
        - level: Metadata
          resources:
            - group: ""
              resources: ["secrets"]
            - group: "bitnami.com"
              resources: ["sealedsecrets"]
        - level: None
          verbs: ["get", "watch", "list"]
        - level: RequestResponse
          verbs: ["create"]
        - level: Metadata
          verbs: ["delete", "deletecollection"]
        - level: RequestResponse
          resources:
            - group: ""
              resources: ["events"]
        - level: RequestResponse
          userGroups:
            - system:serviceaccounts:teleport
        - level: None
          userGroups:
            - system:nodes
            - system:serviceaccounts
        - level: None
          users:
            - system:apiserver
            - system:kube-controller-manager
            - system:kube-proxy
            - system:kube-scheduler
        - level: RequestResponse
    extra_args:
      - "--feature-gates=ValidatingAdmissionPolicy=true"
      - "--audit-log-maxage=1"
      - "--audit-log-maxsize=10"
      - "--audit-log-maxbackup=10"
      - "--runtime-config=admissionregistration.k8s.io/v1beta1=true"
    # Bind mount for the directory that contains audit_log_path; writable.
    extra_binds:
      - source: /var/log/audit
        destination: /var/log/audit
        read_only: false

  kube-controller-manager:
    extra_args:
      - "--leader-elect-retry-period=5s"
      - "--leader-elect-renew-deadline=15s"
      - "--leader-elect-lease-duration=20s"
      - "--kube-api-qps=50"

  kube-proxy:
    disable: true
    # For local-proxy running on boot servers
    config:
      apiVersion: kubeproxy.config.k8s.io/v1alpha1
      kind: KubeProxyConfiguration
      mode: ipvs

  kube-scheduler:
    config:
      apiVersion: kubescheduler.config.k8s.io/v1
      kind: KubeSchedulerConfiguration
      # Scheduler extender reached over HTTP on localhost; it manages the
      # topolvm.cybozu.com/capacity resource, which the scheduler itself
      # is told to ignore.
      extenders:
        - urlPrefix: "http://127.0.0.1:9251"
          filterVerb: predicate
          prioritizeVerb: prioritize
          weight: 1
          nodeCacheCapable: false
          managedResources:
            - name: topolvm.cybozu.com/capacity
              ignoredByScheduler: true
      profiles:
        - schedulerName: default-scheduler
          plugins:
            score:
              # PodTopologySpread is listed under both `disabled` and
              # `enabled` so that it is re-registered with weight 500.
              disabled:
                - name: PodTopologySpread
              enabled:
                - name: PodTopologySpread
                  weight: 500
          pluginConfig:
            - name: PodTopologySpread
              args:
                # https://pkg.go.dev/k8s.io/kube-scheduler/config/v1#PodTopologySpreadArgs
                defaultConstraints:
                  - maxSkew: 1
                    topologyKey: topology.kubernetes.io/zone
                    whenUnsatisfiable: ScheduleAnyway
                  - maxSkew: 1
                    topologyKey: kubernetes.io/hostname
                    whenUnsatisfiable: ScheduleAnyway
                defaultingType: List
    extra_args:
      - "--leader-elect-retry-period=5s"
      - "--leader-elect-renew-deadline=15s"
      - "--leader-elect-lease-duration=20s"

  kubelet:
    # Taint key matches the entry in the top-level control_plane_tolerations.
    boot_taints:
      - key: node.cilium.io/agent-not-ready
        value: "true"
        effect: NoExecute
    config:
      apiVersion: kubelet.config.k8s.io/v1beta1
      kind: KubeletConfiguration
      cgroupDriver: systemd
      containerLogMaxSize: 10Mi
      containerLogMaxFiles: 10
      cpuManagerPolicy: static
      maxPods: 150
      topologyManagerPolicy: best-effort
      systemReserved:
        cpu: "1"
      imageGCHighThresholdPercent: 65
      imageGCLowThresholdPercent: 60
    # Same socket path as the --containerd argument below.
    cri_endpoint: unix:///var/run/k8s-containerd.sock
    extra_args:
      - "--containerd=/var/run/k8s-containerd.sock"
      - "--seccomp-default=true"
    extra_binds:
      - source: /var/lib/k8s-containerd
        destination: /var/lib/k8s-containerd
        read_only: false
      - source: /var/lib/rook
        destination: /var/lib/rook
        read_only: false

  # rivers and etcd-rivers use identical dial/check timing arguments.
  rivers:
    extra_args:
      - "--dial-timeout=4s"
      - "--dial-keep-alive=6s"
      - "--check-interval=5s"

  etcd-rivers:
    extra_args:
      - "--dial-timeout=4s"
      - "--dial-keep-alive=6s"
      - "--check-interval=5s"

  etcd:
    extra_args:
      - "--listen-metrics-urls=http://0.0.0.0:2381"
      # 8 * 1024 * 1024 * 1024 = 8589934592 = 8 GiB
      - "--quota-backend-bytes=8589934592"
      # NOTE(review): the "#GCPONLY" lines below look like markers that
      # tooling un-comments for GCP environments. Their original spacing is
      # not recoverable here, so they are kept as-is — confirm that, once
      # the marker is stripped, each line is indented like the extra_args
      # items above.
#GCPONLY - "--heartbeat-interval=500"
#GCPONLY - "--election-timeout=2500"