-
Notifications
You must be signed in to change notification settings - Fork 11
/
upgrade-takeover.sh
executable file
·494 lines (439 loc) · 16.1 KB
/
upgrade-takeover.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
#!/bin/bash
#
# Upgrades balenaOS version by running takeover to flash storage, potentially
# including repartitioning. See https://github.com/balena-os/takeover
#
# Arguments:
# * --hostos-version is required to specify target OS version
# * See help() below for details
#
# Outputs:
# * Writes log file to /mnt/data/balenahup
# * Notifies balenaAPI by PATCHing the device via /usr/bin/resin-device-progress script
#
# Results:
# * Flashes new OS and reboots into it on success
# * Returns 9 if another upgrade script may already be running
# * Returns 1 on other errors in this script
# * Reboots into current OS if fails later in the attempt
#
# Notes:
# This script was derived from upgrade-2.x.sh for traditional balenahup upgrades.
# Name used as the journal identifier and as the prefix of every log line
SCRIPTNAME="upgrade-takeover.sh"
# Cautious error handling: exit on any failure (including failures inside a
# pipeline) and let functions and subshells inherit the ERR trap
set -eE -o pipefail
# Define variables about running balenaOS, including VERSION and SLUG (device type)
# shellcheck disable=SC1091
. /etc/os-release
# Set PATH for binary lookups
# shellcheck disable=SC1091
. /etc/profile
# Lock file and descriptor used to prevent running multiple upgrade instances
LOCKFD=99
LOCKFILE="/var/lock/resinhup.lock"
# Private functions

# Run flock against the lock file descriptor.
# $1 -- flock option letters without the leading dash, like "xn" or "u"
# Returns the exit status of flock.
_lock() {
    flock "-${1}" "${LOCKFD}"
}
# EXIT trap handler. When the script exits with a non-zero status, logs the
# status and tries to report the failed upgrade to the device API. In every
# case it then calls _no_more_locking and logs a final line.
_exit_handler() {
    # Must be the first command so that $? is still the script's exit status
    _exit_status=$?
    local report_status=0
    if [ "${_exit_status}" -ne 0 ]; then
        log "Exit on error ${_exit_status}"
        # Capture the reporter's own status here; inside an 'if ! cmd' branch
        # $? is always 0, which would make the message below meaningless.
        report_upgrade_failed > /dev/null 2>&1 || report_status=$?
        if [ "${report_status}" -ne 0 ]; then
            log "Failed to report progress on exit with status ${report_status}"
        fi
        # Status 9 is used when the exclusive lock could not be obtained
        if [ "${_exit_status}" -eq 9 ]; then
            log "No concurrent updates allowed - lock file in place."
        fi
    fi
    _no_more_locking
    log "Lock removed - end."
}
# Release the lock and tidy up: unlock, delete the lock file only if the
# exclusive lock can be re-acquired without blocking, then remove both log
# FIFOs.
_no_more_locking() {
    _lock u
    if _lock xn; then
        rm -f "${LOCKFILE}"
    fi
    rm -f "${outfifo}"
    rm -f "${errfifo}"
}
# Open the lock file for writing on descriptor $LOCKFD and install
# _exit_handler as the EXIT trap.
_prepare_locking() {
    # eval is needed because the descriptor number comes from a variable
    eval "exec ${LOCKFD}>\"${LOCKFILE}\""
    trap _exit_handler EXIT
}
# Public functions

# Obtain an exclusive lock immediately or fail; the status is that of
# '_lock xn'.
exlock_now() {
    _lock xn
}
# Report upgrade progress to the device API via resin-device-progress.
# Best effort: output is discarded and a failed report is ignored.
# $1 -- integer: completion percentage
# $2 -- string: detail message
function progress {
    local percent="$1"
    local detail="$2"
    resin-device-progress --percentage "${percent}" --state "${detail}" > /dev/null \
        || true
}
# Print the usage text to stdout. The here-document delimiter is quoted, so
# the text is emitted literally without any shell expansion.
function help {
    cat << 'EOF'
Upgrade balenaOS via takeover tool
Options:
-h, --help
Display this help and exit.
--force-slug <SLUG>
Override slug detection and force this slug to be used for the script.
--hostos-version <HOSTOS_VERSION>
Run the updater for this specific HostOS version as semver or ESR, where
semver is in the format major.minor.patch, like 2.5.1; and ESR is in the
format year.month.patch, like 2024.4.0. The version must begin with a
digit, not a 'v'. This is a mandatory argument.
--balenaos-registry
Unused but accepted for compatibility with upgrade-2.x.sh when called
by balena-proxy.
EOF
}
# Notify backend device API that upgrade has failed. This report is essential
# to allow a user to retry the upgrade. Sends the report, and then
# independently verifies it with compare_device_state. When sending fails,
# waits a minute before the next try; gives up after 60 verification attempts.
#
# Returns 0 once the API shows the failure state; 1 when all attempts are
# exhausted, so the caller can tell the report did not get through.
function report_upgrade_failed() {
    local pct=100
    local state="OS update failed"
    local max_tries=60
    # Local counter so a second call starts from zero again
    local try=0
    while ! compare_device_state "${pct}" "${state}"; do
        try=$((try + 1))
        if [ "${try}" -ge "${max_tries}" ]; then
            return 1
        fi
        if resin-device-progress --percentage "${pct}" --state "${state}"; then
            # Report sent; verify it straight away
            continue
        fi
        log WARN "Retrying failure report - try ${try}"
        sleep 60
    done
    return 0
}
# Log operational message: sends the text to the journal through systemd-cat
# (best effort) and prints it to stdout, prefixed with the script name, the
# seconds elapsed since $starttime and the level.
# At ERROR level, also cleans up the work directory and exits the script with
# code 1.
#
# $1 -- optional log level, must be ERROR or WARN; otherwise defaults to INFO
# $2 -- log message
function log {
    # Consume the level argument when one is given; syslog priority 3 is
    # "err", 4 is "warning" and 6 is "info"
    if [ "$1" = "ERROR" ]; then
        loglevel=ERROR
        priority=3
        shift
    elif [ "$1" = "WARN" ]; then
        loglevel=WARNING
        priority=4
        shift
    else
        loglevel=INFO
        priority=6
    fi
    echo "${1}" \
        | systemd-cat --level-prefix=0 --identifier="${SCRIPTNAME}" --priority="${priority}" 2> /dev/null \
        || true
    endtime=$(date +%s)
    printf "[%s][%09d][%s]%s\n" "$SCRIPTNAME" "$((endtime - starttime))" "$loglevel" "$1"
    [ "$loglevel" = "ERROR" ] || return 0
    cleanup_work_dir
    exit 1
}
# Succeed when the first version is strictly greater than the second under
# version ordering (sort -V): the first argument is greater exactly when it
# is not the smallest of the arguments.
# For example, version_gt "1.2.10" "1.2.3" returns 0; equal versions return 1.
#
# $1 -- expected greater version
# $2 -- version to compare
function version_gt() {
    [[ "$(
        echo "$@" \
            | tr " " "\n" \
            | sort -V \
            | head -n 1
    )" != "$1" ]]
}
# Fetch provisioning_progress and provisioning_state for this device from the
# backend device API and compare them with the provided values.
#
# $1 -- local percentage
# $2 -- local state
#
# Return 0 if both values are equal; otherwise return non-zero
function compare_device_state() {
    local want_pct="$1"
    local want_state="$2"
    local device remote_pct remote_state
    device=$(CURL_CA_BUNDLE="${TMPCRT}" ${CURL} --header "Authorization: Bearer ${APIKEY}" \
        "${API_ENDPOINT}/v6/device(uuid='${UUID}')?\$select=provisioning_state,provisioning_progress" | jq '.d[]')
    remote_pct=$(echo "${device}" | jq -r '.provisioning_progress')
    remote_state=$(echo "${device}" | jq -r '.provisioning_state')
    # Nothing to compare against when either remote value is missing
    if [ -z "${remote_pct}" ] || [ -z "${remote_state}" ]; then
        return 1
    fi
    test "${want_pct}" -eq "${remote_pct}" && test "${want_state}" = "${remote_state}"
}
# Stop the supervisor update timer and the supervisor service, trying the
# balena-named unit first and the resin-named unit when that fails, then
# force-remove the supervisor containers (best effort).
function stop_services() {
    log "Stopping supervisor and related services..."
    if ! systemctl stop update-balena-supervisor.timer > /dev/null 2>&1; then
        systemctl stop update-resin-supervisor.timer > /dev/null 2>&1
    fi
    if ! systemctl stop balena-supervisor > /dev/null 2>&1; then
        systemctl stop resin-supervisor > /dev/null 2>&1
    fi
    ${DOCKER_CMD} rm -f balena_supervisor resin_supervisor > /dev/null 2>&1 || true
}
# Remove contents of work directory ($work_dir); the directory itself is kept.
# Entries whose names start with a dot are not matched by the glob and are
# left in place. Does nothing when $work_dir is unset, too short, or not a
# directory.
function cleanup_work_dir() {
    # Safety guard: the deletion below is recursive, so refuse an empty or
    # very short path such as '/'.
    if [[ -z "${work_dir}" || "${#work_dir}" -le 3 ]]; then
        return 0
    fi
    if [[ -d "${work_dir}" ]]; then
        # The glob must stay outside the quotes; a quoted "/*" is taken
        # literally and deletes nothing.
        rm -rf -- "${work_dir:?}"/*
    fi
}
# Download the takeover tool release tarball for this machine's architecture
# into $work_dir, unpack it there, give the binary to root, and delete the
# tarball. Only aarch64 and x86_64 are accepted.
# Requires $work_dir and $CURL
# $1 -- takeover release version, like "v0.9.0" OR "latest"
# Exits script (through log ERROR) on unsupported architecture or download
# failure
function download_takeover_binary() {
    local version="$1"
    local arch
    arch=$(uname -m)
    if [ "${arch}" = "aarch64" ] || [ "${arch}" = "x86_64" ]; then
        log "Using takeover arch ${arch}"
    else
        log ERROR "Takeover binary for arch: ${arch} not found"
    fi
    # "latest" uses a differently shaped release URL than a tagged version
    local release_path="download/${version}"
    if [ "${version}" = "latest" ]; then
        release_path="latest/download"
    fi
    local url="https://github.com/balena-os/takeover/releases/${release_path}/takeover-${arch}-unknown-linux-musl.tar.gz"
    local tarball="${work_dir}/takeover.tar.gz"
    log "Downloading takeover binary ${url}"
    if ! ${CURL} -o "${tarball}" "${url}"; then
        log ERROR "Could not download takeover binary, aborting."
    fi
    # Extract and prepare for use
    tar -C "${work_dir}" -zxvf "${tarball}"
    chown root:root "${work_dir}/takeover"
    rm "${tarball}"
    log "Download takeover binary success"
}
# Download target balenaOS image into $work_dir/balenaos.img.gz, sending the
# API key as a bearer token. Before downloading, compares the advertised
# image size with the free space on the data partition; that pre-check is
# best effort and only fatal when the image is known to be too big.
# Requires:
# $target_version
# $SLUG
# $CURL_NO_FAIL
# $API_ENDPOINT, $APIKEY, $TMPCRT
# $work_dir -- must be in data partition
# Exits script (through log ERROR) when the image is too big or the download
# fails
function download_target_image() {
    local image_url="${API_ENDPOINT}/download?deviceType=${SLUG}&version=${target_version}&fileType=.gz"
    local headers_file="${work_dir}/headers.txt"
    local image_file="${work_dir}/balenaos.img.gz"
    local status_code image_bytes data_part_kb data_part_bytes

    log "Verifying target image size"
    # Download headers and verify content-length less than free space on data
    # partition. Still on curl 7.82 so can't use "-w '%header{content-length}'".
    # '|| true' keeps a curl failure from aborting this non-critical check.
    status_code=$(
        CURL_CA_BUNDLE="${TMPCRT}" ${CURL_NO_FAIL} -H "Authorization: Bearer ${APIKEY}" \
            --head -w "%{http_code}" \
            -o "${headers_file}" \
            "${image_url}" \
            2>/dev/null
    ) || true
    if [[ "${status_code:0:1}" == "2" && -f "${headers_file}" ]]; then
        # Header names are case-insensitive, and with --location the file
        # holds one header block per redirect hop: match any capitalisation
        # and keep only the last value. Adding 0 drops the trailing CR;
        # printf avoids a newline in the variable.
        image_bytes=$(
            awk 'BEGIN { FS = ":[ \t]*" }
                tolower($1) == "content-length" { bytes = $2 + 0; found = 1 }
                END { if (found) printf "%d", bytes }' "${headers_file}"
        ) || image_bytes=""
        data_part_kb=$(df --output=avail /mnt/data/ | tail -n 1) || data_part_kb=""
        if [[ -n "${image_bytes}" && -n "${data_part_kb}" ]]; then
            data_part_bytes=$(( data_part_kb * 1024 ))
            log "Target image size: ${image_bytes}"
            if [ "${image_bytes}" -ge "${data_part_bytes}" ]; then
                log ERROR "Image too big for data partition: ${data_part_bytes}"
            fi
        fi
    else
        # Header retrieval ought to work, but not critical if it doesn't.
        log WARN "Verify target image; code: ${status_code}"
    fi

    log "Downloading target image"
    # '|| true' so that a curl failure is reported below with its status code
    # instead of tripping the generic ERR trap.
    status_code=$(
        CURL_CA_BUNDLE="${TMPCRT}" ${CURL_NO_FAIL} -H "Authorization: Bearer ${APIKEY}" \
            -H "Content-Type: application/json" -w "%{http_code}" \
            --output "${image_file}" \
            "${image_url}" \
            2>/dev/null
    ) || true
    # Any 2xx response counts as success
    if [ "${status_code:0:1}" == "2" ]; then
        log "Download image success; code: ${status_code}"
    else
        log ERROR "Download image failed; code: ${status_code}"
    fi
    # sanity check
    if [ ! -f "${image_file}" ]; then
        log ERROR "Target image not found"
    fi
}
###
# Script start
###
# Without any arguments there is nothing to do: show the help and stop
if (( $# == 0 )); then
    help
    exit 0
fi
# Reference point for the elapsed seconds printed by log
starttime=$(date +%s)
# For compatibility purposes: expose an existing resinhup directory under the
# balenahup name
if [[ -d "/mnt/data/resinhup" && ! -e "/mnt/data/balenahup" ]]; then
    ln -s "/mnt/data/resinhup" "/mnt/data/balenahup"
fi
# Create a timestamped LOGFILE and start it with a header line and the date
logtime=$(date +"%Y%m%d_%H%M%S")
LOGFILE="/mnt/data/balenahup/$SCRIPTNAME.${logtime}.log"
mkdir -p "${LOGFILE%/*}"
log "================$SCRIPTNAME HEADER START====================" > "$LOGFILE"
date >> "$LOGFILE"
log "Loading info from config.json"
if [ -f /mnt/boot/config.json ]; then
    CONFIGJSON=/mnt/boot/config.json
else
    log ERROR "Can't find config.json"
fi
# Use the user's api key if it exists rather than deviceApiKey; it means we haven't
# done the key exchange yet.
# The trailing '// empty' makes jq print nothing for a missing or null key;
# without it 'jq -r' prints the literal string "null", which would slip past
# the emptiness checks below.
APIKEY=$(jq -r '.apiKey // .deviceApiKey // empty' "${CONFIGJSON}")
UUID=$(jq -r '.uuid // empty' "${CONFIGJSON}")
API_ENDPOINT=$(jq -r '.apiEndpoint // empty' "${CONFIGJSON}")
if [ -z "${APIKEY}" ] || [ -z "${UUID}" ] || [ -z "${API_ENDPOINT}" ]; then
    log ERROR "Error parsing config.json"
fi
# Create a certificate bundle file incorporating any CA provided in config.json.
# We create this primarily for use by curl. The OS started integrating this CA
# with v2.58, but use this variable in case we have an older version.
TMPCRT=$(mktemp)
# When no balenaRootCA is configured, nothing is decoded and the bundle holds
# only the system certificates.
jq -r '.balenaRootCA // empty' < "${CONFIGJSON}" | base64 -d > "${TMPCRT}"
cat /etc/ssl/certs/ca-certificates.crt >> "${TMPCRT}"
# Set up curl for use within a script, retry many times for reliability, follow
# redirects, and compress responses.
CURL="curl --silent --retry 10 --fail --location --compressed"
# Variant without --fail, so the response status code can be collected
CURL_NO_FAIL="curl --silent --retry 10 --location --compressed"
# Handlers for a failed command (ERR) and for the INT and TERM signals. Each
# logs at ERROR level; log then cleans up the work directory and exits the
# script with status 1.
_err_handler() {
    log ERROR "Interrupted on error"
}

_int_handler() {
    log ERROR "Interrupted"
}

_term_handler() {
    log ERROR "Terminated"
}

trap _err_handler ERR
trap _int_handler INT
trap _term_handler TERM
# Container engine CLI: default to docker, but use balena when an executable
# of that name is found on PATH
DOCKER_CMD="docker"
if [ -x "$(command -v balena)" ]; then
    DOCKER_CMD="balena"
fi
# Redirect all logs to the logfile: from here on, everything the script
# writes to stdout or stderr goes through a named pipe (FIFO) and is appended
# to LOGFILE by a background cat. Nothing is shown on the terminal any more.
# mktemp -u only generates unused names; mkfifo creates the pipes themselves.
outfifo=$(mktemp -u)
errfifo=$(mktemp -u)
mkfifo "${outfifo}" "${errfifo}"
# Read from the stdout FIFO and append to LOGFILE
cat "${outfifo}" >> "${LOGFILE}" &
# Read from the stderr FIFO and append to LOGFILE
cat "${errfifo}" >> "${LOGFILE}" &
# Redirect script's stdout and stderr to the respective FIFOs. The readers
# above are started first so these opens have a reader on the other end.
exec >"${outfifo}" 2>"${errfifo}"
# Parse arguments. Options that take a value consume two positional
# parameters; a missing value is fatal (log ERROR exits the script).
# Unrecognized options are logged and skipped.
while (( $# > 0 )); do
    case "$1" in
        -h | --help)
            help
            exit 0
            ;;
        --force-slug)
            [ -n "$2" ] || log ERROR "\"$1\" argument needs a value."
            FORCED_SLUG=$2
            shift
            ;;
        --hostos-version)
            [ -n "$2" ] || log ERROR "\"$1\" argument needs a value."
            target_version=$2
            log "Raw target version: ${target_version}"
            # Drop a trailing ".prod" variant suffix from the version
            if [[ "${target_version}" == *.prod ]]; then
                target_version="${target_version%.prod}"
                log "Normalized target version: ${target_version}"
            fi
            shift
            ;;
        --balenaos-registry)
            # Value is required but ignored; accepted for compatibility only
            [ -n "$2" ] || log ERROR "\"$1\" argument needs a value."
            shift
            ;;
        *)
            log WARN "Unrecognized option $1."
            ;;
    esac
    shift
done
# Run on start: open the lock file and install the EXIT handler
_prepare_locking
# Try to get the lock. Exit with status 9 if unable; another instance is
# running already.
if ! exlock_now; then
    exit 9
fi
# The target version is mandatory
[ -n "$target_version" ] || log ERROR "--hostos-version is required."
progress 25 "Preparing OS update"
# Retrieve slug (device type) from API, and use this value if not provided to
# the script with --force-slug.
# '// empty' makes jq print nothing instead of the literal "null" when the
# API response has no slug. '|| FETCHED_SLUG=""' keeps a failed request from
# aborting the script, so that a forced slug still works without the API.
FETCHED_SLUG=$(CURL_CA_BUNDLE="${TMPCRT}" ${CURL} -H "Authorization: Bearer ${APIKEY}" \
    "${API_ENDPOINT}/v6/device?\$select=is_of__device_type&\$expand=is_of__device_type(\$select=slug)&\$filter=uuid%20eq%20%27${UUID}%27" 2>/dev/null \
    | jq -r '.d[0].is_of__device_type[0].slug // empty'
) || FETCHED_SLUG=""
SLUG=${FORCED_SLUG:-$FETCHED_SLUG}
# The image download builds its URL from SLUG, so refuse to continue without one
if [ -z "${SLUG}" ]; then
    log ERROR "Could not determine device type slug."
fi
# Validate target version: accepted forms start with a single digit 2-9, a
# two-digit number, or a four-digit year 2xxx followed by two more dotted
# parts (semver with major > 1, or year.month.patch). log ERROR exits.
if [ -z "$target_version" ]; then
    log ERROR "No target OS version specified."
fi
case "$target_version" in
    2[0-9][0-9][0-9].*.* | [1-9][0-9].* | [2-9].*)
        log "Target OS version \"$target_version\" OK."
        ;;
    *)
        log ERROR "Target OS version \"$target_version\" not supported."
        ;;
esac
# Validate host OS version (VERSION from /etc/os-release) against the same
# patterns as target_version above
case "$VERSION" in
    2[0-9][0-9][0-9].*.* | [1-9][0-9].* | [2-9].*)
        log "Host OS version \"$VERSION\" OK."
        ;;
    *)
        log ERROR "Host OS version \"$VERSION\" not supported."
        ;;
esac
# Takeover working directory. We'll add files here. MUST be in data partition
# since stores sizable downloaded OS image. Conserves memory for takeover
# binary. Assigned before the cleanup call below, which relies on it.
work_dir="/mnt/data/takeover"
# Ensure working directory is empty; takeover binary does not cleanup on failure.
cleanup_work_dir
# Verify data partition filesystem/storage is clean. Flashing over bad sectors
# may brick device.
# A single awk replaces 'grep | awk | head' so that no stage of the pipeline
# fails under pipefail when nothing matches; it prints the device of the
# first mount line mentioning /mnt/data, or nothing at all.
data_device=$(mount | awk '/\/mnt\/data/ && !found { print $1; found = 1 }')
if [ -n "${data_device}" ]; then
    # Collect the exit status explicitly; with errexit a bare non-zero status
    # would end the script before the check below.
    fsck_status=0
    e2fsck -n "${data_device}" > /dev/null || fsck_status=$?
    # e2fsck reports uncorrected filesystem errors as 4; larger values mean
    # operational or usage errors
    if [ "${fsck_status}" -ge 4 ]; then
        log ERROR "Filesystem check failed on data partition ${data_device}"
    fi
fi
# Ensure takeover environment is prepared.
mkdir -p "${work_dir}"
# Retrieve target hostOS image and takeover binary
download_target_image
download_takeover_binary "v0.9.0-dev.1"
progress 50 "Running OS update"
# Run takeover
# Must run from a writable directory; takeover creates temp files there
cd "${work_dir}"
# No need to specify config.json path; defaults to /mnt/boot.
# API check fails on BoB, so disabled
# We do *not* expect takeover to return on success; it flashes the device with
# the new balenaOS.
# Capture the exit status with '||' so that a failing takeover reaches the
# messages below instead of ending in the generic ERR trap.
takeover_status=0
res=$(./takeover -i balenaos.img.gz \
    --no-ack --no-nwmgr-check --no-os-check \
    --log-level debug --fallback-log --fallback-log-file "advanced.${logtime}.log" \
    --s2-log-level debug --report-hup-progress) || takeover_status=$?
# Getting here at all means the device was not flashed
log WARN "Takeover exit status ${takeover_status}"
log ERROR "Takeover result ${res}; OS not updated"