forked from bloomberg/chef-bach
-
Notifications
You must be signed in to change notification settings - Fork 0
/
repxe_host.rb
executable file
·531 lines (467 loc) · 14.8 KB
/
repxe_host.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
#!/usr/bin/env ruby
#
# repxe_host.rb
#
# This script coordinates the re-installation and re-chefing of an
# existing host.
#
# Provisos:
#
# * It still calls out to c-a-r for the chef bootstrap and re-chefing.
# * Hosts still have to be manually rebooted.
#
# To use from the command line:
#
# 1. If necessary, configure your local rubygems mirror.
# Replace 'http://mirror.example.com' with your actual mirror.
# ```
# bundle config mirror.https://rubygems.org http://mirror.example.com
# ```
#
# 2. Run 'bundle install --deployment' on a bootstrap node with
# access to a rubygems mirror.
#
# 3. If not already using the target bootstrap, sync the updated
# repository, including 'vendor' directory, to the target
# bootstrap.
#
# 4. Run 'bundle exec ./repxe_host.rb -m <hostname>' to begin the process.
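# To run against a node that doesn't need to be shut down first, add '-s':
# 'bundle exec ./repxe_host.rb -m <hostname> -s'
# For a brand new machine, use '-n' instead; it skips the shutdown and
# also skips removing the old Chef objects and cobbler enrollment:
# 'bundle exec ./repxe_host.rb -m <hostname> -n'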
#
# 5. When prompted, manually reboot the host, then press enter.
#
# It is also possible to use methods from this script at a ruby REPL
# instead of running the script from a UNIX shell. To load methods
# into `irb`:
#
# 1. Change to the repo directory.
#
# 2. Verify that dependencies are installed:
# `bundle list`
#
# 3. Run irb inside the repo directory.
# `bundle exec irb`
#
# 4. Load this file.
# `irb(main):001:0> load './repxe_host.rb'`
#
require 'chef/provisioning/transport/ssh'
require 'English'
require 'mixlib/shellout'
require 'pry'
require 'timeout'
require 'optparse'
require_relative 'lib/cluster_data'
include BACH::ClusterData
def cluster_assign_roles(environment, type, entry=nil)
types = %w[basic hadoop kafka]
unless types.include?(type.to_s.downcase)
raise "#{type} is not one of #{types.join(',')} !"
end
#
# We use system() instead of Mixlib::ShellOut specifically so that the
# child process re-uses our STDOUT/STDERR.
#
# TODO: replace with IO::popen3
#
if entry.nil?
system('sudo', './cluster-assign-roles.sh',
environment, type.to_s.downcase.capitalize)
else
system('sudo', './cluster-assign-roles.sh',
environment, type.to_s.downcase.capitalize, entry[:hostname])
end
# Why doesn't this raise an error?
puts 'cluster-assign-roles.sh failed!' unless $CHILD_STATUS.success?
end
def restart_chef_server
c = Mixlib::ShellOut.new('sudo', 'chef-server-ctl', 'restart')
c.run_command
raise 'Failed to restart chef-server' unless c.status.success?
puts 'restarted chef-server'
end
def cobbler_unenroll(entry)
c = Mixlib::ShellOut.new('sudo', 'cobbler', 'system', 'remove',
'--name', entry[:hostname])
c.run_command
raise "Failed to un-enroll #{entry[:hostname]}!" unless c.status.success?
puts "Un-enrolled #{entry[:hostname]} from cobbler"
end
def cobbler_enroll(entry)
c = Mixlib::ShellOut.new('sudo', 'cobbler', 'system', 'add',
'--name', entry[:hostname],
'--hostname', fqdn(entry),
'--profile', entry[:cobbler_profile],
'--ip-address', entry[:ip_address],
'--interface=eth0',
'--mac', corrected_mac(entry))
c.run_command
unless c.status.success?
raise "Failed to enroll #{entry[:hostname]}!"
end
puts "Enrolled #{entry[:hostname]} in cobbler"
end
# This is mostly copy/pasted out of BACH-next chef helpers.
def cobbler_root_password
require 'json'
require 'mixlib/shellout'
vault_command =
Mixlib::ShellOut.new('sudo',
'knife', 'vault', 'show',
'os', 'cobbler',
'-F', 'json',
'-p', 'all',
'-m', 'client')
vault_command.run_command
unless vault_command.status.success?
raise 'Could not retrieve cobbler password!\n' +
vault_command.stdout + '\n' +
vault_command.stderr
end
JSON.parse(vault_command.stdout)['root-password']
end
def cobbler_sync
c = Mixlib::ShellOut.new('sudo', 'cobbler', 'sync')
c.run_command
raise 'Failed to sync cobbler' unless c.status.success?
end
# Removes the Chef server objects and SSH known_hosts entries for a host.
# Takes a hash from cluster.txt as sole argument.
def delete_node_data(entry)
['client',
'node'].each do |object|
Mixlib::ShellOut.new('sudo', 'knife',
object, 'delete',
entry[:fqdn], '--yes').run_command
end
# Running knife with sudo can set the permissions to root:root.
# We need to correct the permissions before running ssh-keygen.
Mixlib::ShellOut.new('sudo', 'chown',
`whoami`.chomp,
"#{ENV['HOME']}/.ssh/known_hosts").run_command
[entry[:fqdn],
entry[:ip_address],
entry[:hostname]].each do |ssh_name|
del = Mixlib::ShellOut.new('ssh-keygen', '-R', ssh_name)
del.run_command
unless del.status.success?
raise "Failed to delete SSH key for #{ssh_name}: #{del.stderr}"
end
end
puts "Deleted SSH fingerprints and Chef objects for #{entry[:hostname]}"
end
def restart_host(entry)
# if is_virtualbox_vm?(entry)
# # If it's a virtualbox VM, prompt the user to do it for us.
# puts 'Please reboot ' + entry[:hostname] + ', then hit enter'
# STDIN.gets
# else
# # Otherwise, reach out via IPMI
# raise "IPMI is unimplemented!"
# end
puts 'Please reboot ' + entry[:hostname] + ' ' \
'into pxe-boot mode, then hit enter'
STDIN.gets
end
def rotate_vault_keys
#
# There's no error checking here, because it will fail to rotate
# keys on data bags where the node is an admin.
#
# The correct solution would be to rescue from error, scrape a vault
# name from stderr, then check whether the dead node is an admin on
# that particular data bag / vault.
#
c = Mixlib::ShellOut.new('sudo', 'knife',
'vault', 'rotate', 'all', 'keys',
'-m', 'client')
c.run_command
end
# This is mostly copy/pasted out of the BACH-next 'setup_pxe_demo' recipe.
def wait_for_host(entry)
ssh_options = { auth_methods: ['password'],
config: false,
password: cobbler_root_password,
user_known_hosts_file: '/dev/null' }
prompts = { number_of_password_prompts: 0 }
options = {}
config = { log_level: :warn }
ssh_transport =
Chef::Provisioning::Transport::SSH.new(entry[:ip_address],
'ubuntu',
ssh_options.merge(prompts),
options,
config)
#
# If it takes more than half an hour for the node to respond,
# something is really broken.
#
# This will make 60 attempts with a 1 minute sleep between attempts,
# or timeout after 61 minutes.
#
Timeout.timeout(3720) do
max = 60
1.upto(max) do |idx|
break if ssh_transport.available?
puts "Waiting for #{entry[:hostname]} to respond to SSH " \
"on #{entry[:ip_address]} (attempt #{idx}/#{max})"
sleep 60
end
end
if ssh_transport.available?
puts "Reached #{entry[:hostname]} via SSH, continuing"
true
else
raise "Failed to reach #{entry[:hostname]} via SSH!"
end
end
# Find the Chef environment
def find_chef_env
require 'json'
require 'rubygems'
require 'ohai'
require 'mixlib/shellout'
o = Ohai::System.new
o.all_plugins
env_command =
Mixlib::ShellOut.new('sudo', 'knife',
'node', 'show',
o[:fqdn] || o[:hostname], '-E',
'-F', 'json')
env_command.run_command
unless env_command.status.success?
raise 'Could not retrieve Chef environment!\n' +
env_command.stdout + '\n' +
env_command.stderr
end
JSON.parse(env_command.stdout)['chef_environment']
end
def get_mounted_disks(chef_env, vm_entry)
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
'df -h')
c.run_command
disks = c.stdout.split("\n")
disks = disks[1..disks.length]
# return all disks mapped to /disk/#
disks.map do |disk|
disk.split(' ')[-1]
end.map do |disk|
%r{\/disk\/\d+}.match(disk).nil? ? nil : disk
end.compact
end
def unmount_disks(chef_env, vm_entry)
puts 'Unmounting disks.'
get_mounted_disks(chef_env, vm_entry).each do |disk|
puts 'unmounting ' + disk
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
'umount ' + disk,
'sudo')
c.run_command
if c.status.success?
puts 'Unmounted ' + disk
else
raise 'Could not unmount ' + disk + ' ' + c.stdout + '\n' + c.stderr
end
end
end
def confirm_chef_client_down(chef_env, vm_entry)
#
# If it takes more than 2 minutes
# something is really broken.
#
# This will make 30 attempts with a 1 minute sleep between attempts,
# or timeout after 31 minutes.
#
command = 'ps -ef | grep chef-client | grep -v grep'
Timeout.timeout(120) do
max = 5
1.upto(max) do |idx|
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
command)
c.run_command
if c.exitstatus == 1 && c.stdout == ''
puts 'chef client is down'
break
else
puts "Waiting for chef to go down (attempt #{idx}/#{max})"
sleep 30
end
end
end
end
def kill_chef_client(chef_env, vm_entry)
puts 'Stopping chef-client'
[
'service chef-client stop ',
'pkill -f chef-client'
].each do |command|
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
command,
'sudo')
c.run_command
end
confirm_chef_client_down(chef_env, vm_entry)
puts 'Chef client is down'
end
def start_chef_client(chef_env, vm_entry)
puts 'Starting chef-client'
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
'service chef-client start',
'sudo')
c.run_command
if c.status.success?
puts 'Chef client started.'
else
puts 'Chef client did not start successfully: ' +
c.stdout + '\n' + c.stderr
end
end
def run_chef_client(chef_env, vm_entry, params = ' ')
puts 'Running chef-client'
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
"chef-client #{params}",
'sudo')
c.run_command
if c.status.success?
puts 'Chef client ran.'
else
puts 'Chef client did not run successfully: ' +
c.stdout + '\n' + c.stderr
end
end
def stop_all_services(chef_env, vm_entry)
puts 'Stopping services.'
[
'carbon-relay',
'carbon-aggregator',
'carbon-cache',
'apache2', # graphite-web
'jmxtrans',
'hbase-regionserver',
'hbase-master',
'hadoop-hdfs-datanode',
'hadoop-httpfs',
'hadoop-yarn-nodemanager',
'hadoop-hdfs-journalnode',
'hadoop-hdfs-namenode',
'hadoop-hdfs-zkfc',
'haproxy'
].each do |service|
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
'service ' + service + ' stop',
'sudo')
c.run_command
if c.status.success?
puts 'Stopped ' + service
else
puts 'Could not stop service ' +
service + ' ' + c.stdout + '\n' + c.stderr
end
end
end
def shutdown_box(chef_env, vm_entry)
c = Mixlib::ShellOut.new('./nodessh.sh',
chef_env,
vm_entry[:hostname],
'shutdown -h now',
'sudo')
c.run_command
if !c.status.success?
raise 'Could not shut down host ' +
vm_entry[:hostname] + '\n' + c.stdout + '\n' + c.stderr
else
puts 'Host has been shut down.'
end
end
# Graceful shutdown - bring down all services, unmount disks, shutdown
def graceful_shutdown(chef_env, vm_entry)
puts 'Running graceful shutdown of ' + vm_entry[:hostname]
kill_chef_client(chef_env, vm_entry)
stop_all_services(chef_env, vm_entry)
unmount_disks(chef_env, vm_entry)
shutdown_box(chef_env, vm_entry)
end
#
# This conditional allows us to use the methods into irb instead of
# invoking the script from a UNIX shell.
#
if __FILE__ == $PROGRAM_NAME
options = { shutdown: true, newmachine: false, down: false }
parser = OptionParser.new do |opts|
opts.banner = 'Usage: repxe_host.rb [options]'
opts.on('-s', '--skipShutdown', 'Skip Shutdown') do
options[:shutdown] = false
end
opts.on('-n', '--newmachine', 'PXE boot a new machine') do
options[:newmachine] = true
options[:shutdown] = false
end
opts.on('-m', '--machine machine', 'Machine') do |machine|
options[:machine] = machine
puts 'found an option for machine with value ' + machine
end
opts.on('-d', '--down', 'Just bring the machine down') do
options[:down] = true
options[:shutdown] = true
end
opts.on('-h', '--help', 'Displays Help') do
puts opts
exit
end
end
parser.parse!
if options[:machine].nil?
puts parser
exit(-1)
end
vm_entry = get_entry(options[:machine])
if vm_entry.nil?
puts "'#{options[:machine]}' was not found in cluster.txt!"
exit(-1)
end
puts 'Repxe script started for node ' + options[:machine]
chef_env = find_chef_env
graceful_shutdown(chef_env, vm_entry) if options[:shutdown]
if options[:down]
puts 'Machine has been shut down. Exiting.'
exit
end
unless options[:newmachine]
delete_node_data(vm_entry)
rotate_vault_keys
cobbler_unenroll(vm_entry)
end
# HACK: restart to free up memory. Chef server 11 has a memory leak.
restart_chef_server
cobbler_enroll(vm_entry)
cobbler_sync
restart_host(vm_entry)
wait_for_host(vm_entry)
cluster_assign_roles(chef_env, :basic, vm_entry)
# HACK: vas cookbook has issues, so sleep and try again
sleep(360)
cluster_assign_roles(chef_env, :basic, vm_entry)
rotate_vault_keys
refresh_vault_keys
# HACK: Forces convergence of certain ohai attributes
run_chef_client(chef_env, vm_entry, '-r \'bach_hadoop_wrapper,bcpc::chef_vault_install\'')
cluster_assign_roles(chef_env, :hadoop, vm_entry)
# HACK: sometimes rechefing seems to do the trick...
run_chef_client(chef_env, vm_entry)
start_chef_client(chef_env, vm_entry)
end