forked from codalab/codalab-worksheets
-
Notifications
You must be signed in to change notification settings - Fork 1
/
monitor.py
354 lines (287 loc) · 10.8 KB
/
monitor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/env python3
import os, sys
import datetime
from collections import defaultdict
from smtplib import SMTP
from email.mime.text import MIMEText
import subprocess
import time
import argparse
BASE_DIR = os.path.dirname(__file__)
# This script runs in a loop monitoring the health of the CodaLab instance.
# It reads config.json in your CODALAB_HOME (~/.codalab).
# Here are some of the things the script does:
# - Make sure we don't run out of disk space.
# - Backup the database.
# - Make sure runs finish in a reasonable amount of time.
# - Email if anything goes wrong (but bound the number of emails so as not to spam).
# - Email a daily report.
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '--codalab-home',
    help='where the CodaLab instance lives',
    # expanduser('~') resolves the home directory even when $HOME is unset.
    # os.path.join(os.getenv('HOME'), ...) raised TypeError in that case, and
    # did so even when CODALAB_HOME was provided, because the fallback value
    # is computed before os.getenv() looks at the environment.
    default=os.getenv('CODALAB_HOME', os.path.join(os.path.expanduser('~'), '.codalab')),
)

# Where to write out information
parser.add_argument('--log-path', help='file to write the log', default='monitor.log')
parser.add_argument('--backup-path', help='directory to backup database', default='monitor.backup')

# How often to do things
parser.add_argument(
    '--ping-interval', help='ping the server every this many seconds', type=int, default=30
)
parser.add_argument(
    '--run-interval', help='run a job every this many seconds', type=int, default=5 * 60
)
parser.add_argument(
    '--email-interval',
    help='email a report every this many seconds',
    type=int,
    default=24 * 60 * 60,
)
args = parser.parse_args()
# Get MySQL username and password for bundles
bundles_host = os.environ['CODALAB_MYSQL_HOST']
bundles_port = os.environ['CODALAB_MYSQL_PORT']
bundles_database = os.environ['CODALAB_MYSQL_DATABASE']
bundles_username = os.environ['CODALAB_MYSQL_USERNAME']
bundles_password = os.environ['CODALAB_MYSQL_PASSWORD']
# The password itself is never printed, only a mask of the same length.
print(
    'user = {}, password = {}, db = {}, host = {}, port = {}'.format(
        bundles_username, '*' * len(bundles_password), bundles_database, bundles_host, bundles_port
    )
)
hostname = os.environ['HOSTNAME']

# Email
admin_email = os.environ['CODALAB_ADMIN_EMAIL']
sender_host = os.environ['CODALAB_EMAIL_HOST']
sender_username = os.environ['CODALAB_EMAIL_USERNAME']
sender_password = os.environ['CODALAB_EMAIL_PASSWORD']

# Create backup directory.
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory appears between an existence check and the creation.
os.makedirs(args.backup_path, exist_ok=True)

# Comma-separated list of worker ids to monitor. Example: vm-clws-prod-worker-0,vm-clws-prod-worker-1
# Blank entries are dropped and ids are stripped: ''.split(',') yields [''], so
# an empty variable would otherwise be treated as one worker with an empty id.
public_workers = {
    worker_id.strip()
    for worker_id in os.environ['CODALAB_PUBLIC_WORKERS'].split(',')
    if worker_id.strip()
}

report = []  # Build up the current report to send in an email
def send_email(subject, message):
    """Email ``message`` to the configured administrator over SMTP with STARTTLS.

    Args:
        subject: Short subject; the sent subject is prefixed with the host name.
        message: List of text lines. They are joined with newlines and wrapped
            in a ``<pre>`` element to form the HTML body.

    A summary line is always printed to stdout. Nothing is sent when
    ``admin_email`` is empty. Errors raised by ``smtplib`` propagate to the
    caller.
    """
    print(
        'send_email to %s from %s@%s; subject: %s; message contains %d lines'
        % (admin_email, sender_username, sender_host, subject, len(message))
    )
    sys.stdout.flush()

    if not admin_email:
        return

    msg = MIMEText('<pre style="font: monospace">' + '\n'.join(message) + '</pre>', 'html')
    msg['Subject'] = 'CodaLab on %s: %s' % (hostname, subject)
    msg['To'] = admin_email
    # The previous literal here ('[email protected]') was not a valid address.
    # Reuse the transport sender, which already has to be a valid RFC 822
    # from-address for sendmail() below.
    # Ref: https://docs.python.org/3/library/smtplib.html#smtplib.SMTP.sendmail
    msg['From'] = sender_username

    # The context manager issues QUIT and closes the connection even when one
    # of the steps below raises.
    with SMTP(sender_host, 587) as s:
        s.ehlo()
        s.starttls()
        s.ehlo()
        # Default to authless SMTP (supported by some servers) if user/password
        # is unspecified. The values come from os.environ and are therefore
        # never None; "unspecified" means an empty string.
        if sender_username and sender_password:
            s.login(sender_username, sender_password)
        s.sendmail(sender_username, admin_email, msg.as_string())
def get_date():
    """Return the current UTC year and month formatted as ``YYYY-MM``.

    The month granularity is deliberate: only one backup is saved per month to
    save space.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to exactly the same string.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m')
def log(line, newline=True):
    """Record one line on stdout, in the pending report, and in the log file.

    Args:
        line: Text to record. It is prefixed with ``[<get_date()>]``.
        newline: When false, the line is printed to stdout without a trailing
            newline. The report entry and the log-file entry are unaffected.
    """
    line = '[%s] %s' % (get_date(), line)
    # Both branches of the old if/else called print(line), so ``newline`` had
    # no effect at all.
    print(line, end='\n' if newline else '')
    sys.stdout.flush()
    report.append(line)
    # The context manager closes the log file even if the write fails.
    with open(args.log_path, 'a') as out:
        print(line, file=out)
def logs(s):
    """Pass every newline-separated piece of ``s`` to ``log``, in order."""
    pieces = s.split('\n')
    for piece in pieces:
        log(piece)
num_errors = defaultdict(int)  # Error type => number of occurrences so far
last_sent = defaultdict(int)  # Error type => time.time() of the last email sent for it


def error_logs(error_type, s):
    """Log an error and email it, at most once every 4 hours per error type.

    Args:
        error_type: Key used to count occurrences and to rate-limit emails; it
            also becomes the email subject together with the occurrence count.
        s: Error text; it is logged line by line and used as the email body.

    A failure to deliver the email is logged instead of raised, so reporting an
    error cannot take the caller down. The delivery is then retried the next
    time the same error type occurs.
    """
    logs(s)
    num_errors[error_type] += 1
    n = num_errors[error_type]
    t = time.time()
    # Send email only every 4 hours
    if t <= last_sent[error_type] + 60 * 60 * 4:
        return
    try:
        send_email('%s [%d times]' % (error_type, n), s.split('\n'))
    except OSError as e:
        # smtplib.SMTPException derives from OSError, so this covers both
        # connection problems and SMTP protocol errors.
        log('Failed to send error email for "%s": %s' % (error_type, e))
        return
    last_sent[error_type] = t
durations = defaultdict(list)  # Command (uuids abstracted) => durations for that command


def run_command(args, soft_time_limit=15, hard_time_limit=60, include_output=True):
    """Run a command under ``timeout`` and log its outcome and timing.

    Args:
        args: Command and arguments as a list of strings.
        soft_time_limit: Seconds after which the run is reported as too slow.
        hard_time_limit: Seconds after which ``timeout`` stops the command.
        include_output: Whether the command's output is included in the logged
            message.

    Returns:
        The command's stdout with trailing whitespace removed.

    A non-zero exit code and a duration above ``soft_time_limit`` are each
    reported through ``error_logs``; a zero exit code is logged via ``logs``.
    """
    # We cap the running time to hard_time_limit, but print out an error if we exceed soft_time_limit.
    start_time = time.time()
    args = ['timeout', '%ss' % hard_time_limit] + args
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8")
    output, err_output = proc.communicate()
    exitcode = proc.returncode
    duration = time.time() - start_time

    # Abstract away the concrete uuids
    simple_args = ['0x*' if arg.startswith('0x') else arg for arg in args]

    # Key the history on the abstracted command. Keying on the concrete
    # arguments gave every uuid its own one-element list, which made avg/max
    # meaningless for those commands and let the dict grow on every run.
    history = durations[str(simple_args)]
    history.append(duration)
    del history[:-1000]  # Keep the list bounded
    # True division: floor division truncated the average to whole seconds.
    average_duration = sum(history) / len(history)
    max_duration = max(history)

    # Include stderr in output if returncode != 0
    full_output = output
    if exitcode != 0:
        full_output += err_output

    message = '>> %s (exit code %s, time %.2fs [limit: %ds,%ds]; avg %.2fs; max %.2fs)\n%s' % (
        ' '.join(args),
        exitcode,
        duration,
        soft_time_limit,
        hard_time_limit,
        average_duration,
        max_duration,
        full_output if include_output else '',
    )
    if exitcode == 0:
        logs(message)
    else:
        error_logs('command failed: ' + ' '.join(simple_args), message)
    if duration > soft_time_limit:
        error_logs('command too slow: ' + ' '.join(simple_args), message)
    return output.rstrip()
timer = 0  # Counter that the monitoring loop increments once per iteration


def ping_time():
    """Return True when ``timer`` is an exact multiple of ``args.ping_interval``."""
    return not timer % args.ping_interval
def run_time():
    """Return True when ``timer`` is an exact multiple of ``args.run_interval``."""
    return not timer % args.run_interval
def email_time():
    """Return True when ``timer`` is an exact multiple of ``args.email_interval``."""
    return not timer % args.email_interval
def backup_db():
    """Dump the bundles database into a gzipped file under ``args.backup_path``.

    The file name contains ``get_date()``, so a later backup with the same date
    string overwrites the earlier one. The MySQL credentials are passed through
    a temporary option file in ``args.codalab_home`` that is removed afterwards.
    The size of the resulting file is logged, with a warning if it is below 100
    bytes.
    """
    import shlex  # Local import: only needed to quote values for the shell command.

    log('Backup DB')
    date = get_date()
    mysql_conf_path = os.path.join(args.codalab_home, 'monitor-mysql.cnf')
    path = '%s/%s-%s.mysqldump.gz' % (args.backup_path, bundles_database, date)

    # The option file holds the database password: create it readable and
    # writable by the owner only instead of with the default permissions.
    fd = os.open(mysql_conf_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    try:
        with os.fdopen(fd, 'w') as f:
            print('[client]', file=f)
            print('host="%s"' % bundles_host, file=f)
            print('port="%s"' % bundles_port, file=f)
            print('user="%s"' % bundles_username, file=f)
            print('password="%s"' % bundles_password, file=f)
        run_command(
            [
                'bash',
                '-c',
                # pipefail makes a mysqldump failure visible in the exit code
                # of the pipeline, so run_command reports it; without it only
                # gzip's exit code was seen. Values are quoted because they
                # are interpolated into a shell string.
                'set -o pipefail; '
                'mysqldump --defaults-file=%s --single-transaction --quick %s | gzip > %s'
                % (shlex.quote(mysql_conf_path), shlex.quote(bundles_database), shlex.quote(path)),
            ],
            600,
            600,
        )  # Backup might take a while.
    finally:
        # Never leave the credentials on disk, even if the dump raised.
        os.unlink(mysql_conf_path)

    size = os.path.getsize(path)
    log('Size of backup {} is {}'.format(path, size))
    if size < 100:
        log('Size is suspiciously small!')
def check_disk_space(paths):
    """Report an error when the space available for ``paths`` is below 1000 MB.

    Args:
        paths: List of paths handed to ``df``. The available space of all
            reported lines is summed.
    """
    # -P forces one output line per file system (long device names are not
    # wrapped onto a second line) and -k fixes the unit to 1024-byte blocks,
    # which the threshold below assumes.
    lines = run_command(['df', '-P', '-k'] + paths).split('\n')[1:]
    results = []
    for line in lines:
        fields = line.split()
        # Skip blank or malformed lines instead of failing on them; the fourth
        # column is the available space.
        if len(fields) > 3 and fields[3].isdigit():
            results.append(int(fields[3]))
    # Flag an error if disk space running low
    total = sum(results)
    if total < 1000 * 1024:
        error_logs(
            'low disk space',
            'Only %s MB of disk space left on %s!' % (total // 1024, ' '.join(paths)),
        )
def poll_online_workers():
    """Report public workers that are missing from the output of ``cl workers``.

    An error is also reported when no public worker id is configured at all.
    """
    # Ignore blank ids: ''.split(',') yields [''], so an empty
    # CODALAB_PUBLIC_WORKERS would otherwise look like one configured worker.
    expected_workers = {worker.strip() for worker in public_workers if worker.strip()}
    if not expected_workers:
        error_logs(
            'worker check failed', 'Missing value for environment variable CODALAB_PUBLIC_WORKERS.'
        )

    lines = run_command(['cl', 'workers']).split('\n')
    # The first two lines are skipped (presumably the table header and its
    # separator - confirm against the `cl workers` output format).
    online_workers = set()
    for line in lines[2:]:
        fields = line.split()
        if fields:  # A blank line would otherwise raise IndexError
            online_workers.add(fields[0])

    # Sorted so that the message is stable from one run to the next.
    offline_public_workers = sorted(expected_workers - online_workers)
    if offline_public_workers:
        error_logs(
            'worker offline',
            'The following public workers are offline:\n{}.'.format(
                '\n'.join(offline_public_workers)
            ),
        )
# Make sure we can connect (might prompt for username/password)
if subprocess.call(['cl', 'work']) != 0:
    sys.exit(1)

# Begin monitoring loop. Every iteration sleeps one second and increments
# `timer`; ping_time(), run_time() and email_time() decide which checks run in
# the current iteration.
while True:
    del report[:]
    if ping_time():
        log('=== BEGIN REPORT')

    try:
        # Backup DB
        if email_time():
            backup_db()

        if ping_time():
            # Check remaining disk space
            check_disk_space(['/'])  # Always bad if root partition is low
            check_disk_space(['/var/lib/docker'])  # Docker images
            base_path = os.path.join(args.codalab_home, 'partitions')
            paths = [os.path.join(base_path, fname) for fname in os.listdir(base_path)]
            check_disk_space(paths)

            # Get statistics on bundles (simple things)
            run_command(['cl', 'workers'])
            run_command(['cl', 'work'])
            run_command(['cl', 'search', '.count'])

            # Get online workers and contact administrators when there are public workers offline.
            poll_online_workers()

        if run_time():
            # More intense
            run_command(['cl', 'search', 'size=.sum'], 20)
            run_command(['cl', 'search', 'size=.sort-', '.limit=5'], 20)
            run_command(['cl', 'search', '.last', '.limit=5'])

            # Try uploading, downloading and running a job with a dependency.
            upload_uuid = run_command(
                ['cl', 'upload', os.path.join(BASE_DIR, 'scripts', 'stress-test.pl')]
            )
            cat_result = run_command(['cl', 'cat', upload_uuid], include_output=False)
            if 'BYTES_IN_MB' not in cat_result:
                error_logs(
                    'download failed',
                    'Uploaded file should contain the string BYTES_IN_MB, contents:\n' + cat_result,
                )
            uuid = run_command(
                ['cl', 'run', 'stress-test.pl:' + upload_uuid, 'perl stress-test.pl 5 10 10']
            )
            run_command(
                ['cl', 'wait', uuid], 600, 3600
            )  # Running might take a while (includes staged time)
            run_command(['cl', 'rm', upload_uuid, uuid])
    except Exception as e:
        # Reporting goes through email, which can itself fail (for example when
        # the SMTP server is unreachable). Such a failure must not escape from
        # this handler, or the monitor would stop.
        try:
            error_logs('exception', 'Exception: %s' % e)
        except Exception as report_error:
            log('Failed to report exception: %s' % report_error)

    if ping_time():
        log('=== END REPORT')

    # Email the report
    if email_time():
        try:
            send_email('report', report)
        except OSError as e:
            # smtplib errors derive from OSError; keep monitoring even when the
            # report cannot be delivered.
            log('Failed to send report email: %s' % e)

    if ping_time():
        print()

    # Update timer
    time.sleep(1)
    timer += 1