diff --git a/nix/workbench/backend/nomad-job.nix b/nix/workbench/backend/nomad-job.nix
index 64fd73b0aa8..940e5cd8519 100644
--- a/nix/workbench/backend/nomad-job.nix
+++ b/nix/workbench/backend/nomad-job.nix
@@ -262,6 +262,74 @@ let
         unlimited = false;
       };

+      # Prevent allocations from being restarted:
+      ###########################################
+      # Nomad Clients periodically heartbeat to Nomad Servers to confirm they
+      # are operating as expected. By default, Nomad Clients which do not
+      # heartbeat in the specified amount of time are considered down and
+      # their allocations are marked as lost (or disconnected if
+      # "max_client_disconnect" is set) and rescheduled.
+      # This means that, if not properly configured, allocations running on a
+      # client that fails to heartbeat will be marked "lost" and, when the
+      # client reconnects, its allocations, which may still be healthy, will
+      # be restarted because they have been marked "lost"!
+      # See:
+      # - https://developer.hashicorp.com/nomad/docs/configuration/server#client-heartbeats
+      # - https://developer.hashicorp.com/nomad/docs/job-specification/group#stop-after-client-disconnect
+      # - https://developer.hashicorp.com/nomad/docs/job-specification/group#max-client-disconnect
+      # We want these allocations to reconnect without a restart.
+      ### Nomad 1.6.X solution:
+      ### Specifies a duration during which a Nomad client will attempt to
+      ### reconnect allocations after it fails to heartbeat in the
+      ### "heartbeat_grace" window. See the example code below for more
+      ### details. This setting cannot be used with
+      ### "stop_after_client_disconnect".
+      ### When "max_client_disconnect" is specified, the Nomad server will
+      ### mark clients that fail to heartbeat as "disconnected" rather than
+      ### "down", and will mark allocations on a disconnected client as
+      ### "unknown" rather than "lost". These allocations may continue to run
+      ### on the disconnected client. Replacement allocations will be
+      ### scheduled according to the allocations' reschedule policy until the
+      ### disconnected client reconnects. Once a disconnected client
+      ### reconnects, Nomad will compare the "unknown" allocations with their
+      ### replacements and keep the one with the best node score. If the
+      ### "max_client_disconnect" duration expires before the client
+      ### reconnects, the allocations will be marked "lost". Clients that
+      ### contain "unknown" allocations will transition to "disconnected"
+      ### rather than "down" until the last "max_client_disconnect" duration
+      ### has expired.
+      ### https://developer.hashicorp.com/nomad/docs/v1.6.x/job-specification/group#max-client-disconnect
+      max_client_disconnect = "999h";
+      ### Nomad 1.7.X solution:
+      ### (TODO blocker issue https://github.com/hashicorp/nomad/issues/19506)
+      ### Defines the reschedule behaviour of an allocation when the node it
+      ### is running on misses heartbeats. When enabled, if the node it is
+      ### running on becomes disconnected or goes down, these allocations
+      ### won't be rescheduled and will show up as "unknown" until the node
+      ### comes back up or they are manually restarted.
+      ### This behaviour will only modify the reschedule process on the
+      ### server. To modify the allocation behaviour on the client, see
+      ### "stop_after_client_disconnect" below.
+      ### The unknown allocation has to be manually stopped to run it again.
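+      ### (Illustrative only, not part of this job file: the "unknown"
+      ### allocation's ID can be listed with "nomad job status JOB-NAME", and
+      ### the allocation can then be stopped and rescheduled with
+      ### "nomad alloc stop ALLOC-ID".)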
+      ### Setting `max_client_disconnect` and
+      ### `prevent_reschedule_on_lost = true` at the same time requires that
+      ### rescheduling is disabled entirely (which is done above in the
+      ### reschedule stanza).
+      # prevent_reschedule_on_lost = true;
+      ### Specifies a duration after which a Nomad client will stop
+      ### allocations, if it cannot communicate with the servers. By default,
+      ### a client will not stop an allocation until explicitly told to by a
+      ### server. A client that fails to heartbeat to a server within the
+      ### "heartbeat_grace" window, and any allocations running on it, will
+      ### be marked "lost" and Nomad will schedule replacement allocations.
+      ### The replaced allocations will normally continue to run on the
+      ### non-responsive client. But you may want them to stop instead; for
+      ### example, allocations requiring exclusive access to an external
+      ### resource. When specified, the Nomad client will stop them after
+      ### this duration. The Nomad client process must be running for this
+      ### to occur. This setting cannot be used with "max_client_disconnect".
+      # stop_after_client_disconnect = "999h";

       # Specifies the restart policy for all tasks in this group. If omitted,
       # a default policy exists for each job type, which can be found in the
       # restart stanza documentation.
diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh
index 43cf4a1e54a..f6dde0f1528 100644
--- a/nix/workbench/backend/nomad.sh
+++ b/nix/workbench/backend/nomad.sh
@@ -1415,17 +1415,17 @@ backend_nomad() {
       then
         if ! wait_kill_em_all "${jobs_array[@]}"
         then
-          # Don't use fatal here, let `start` decide!
           msg "$(red "Failed to start tracer(s)")"
-          return 1
+          backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
+          fatal "scenario.sh start-tracers failed!"
         else
           for node in ${nodes[*]}
           do
             if ! test -f "${dir}"/tracer/"${node}"/started
             then
-              # Don't use fatal here, let `start` decide!
               msg "$(red "Tracer for \"${node}\" failed to start!")"
-              return 1
+              backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
+              fatal "scenario.sh start-tracers failed!"
             fi
           done
         fi
@@ -3487,7 +3487,9 @@ client {
   servers = [ ${servers_addresses} ]
   # Sets the search path that is used for CNI plugin discovery. Multiple paths can
   # be searched using colon delimited paths
-  cni_path = "${cni_plugins_path}"
+# TODO: needed to allow having more than one Nomad profile running locally
+# Nomad 1.7.X fails somewhat silently when reading this configuration option.
+# cni_path = "${cni_plugins_path}"
   # Specifies the maximum amount of time a job is allowed to wait to exit.
   # Individual jobs may customize their own kill timeout, but it may not exceed
   # this value.
diff --git a/nix/workbench/backend/nomad/exec.nix b/nix/workbench/backend/nomad/exec.nix
index e1fee1b673d..3b658b492db 100644
--- a/nix/workbench/backend/nomad/exec.nix
+++ b/nix/workbench/backend/nomad/exec.nix
@@ -20,22 +20,8 @@ let
   extraShellPkgs = let
     # If we are going to use the `exec` driver we use the SRE patched version of
     # Nomad that allows to use `nix_installables` as artifacts.
-    nomad-sre = (pkgs.buildGo119Module rec {
-      pname = "nomad";
-      version = "1.4.3";
-      subPackages = [ "." ];
-      doCheck = true;
-      src = pkgs.fetchFromGitHub { # "github:input-output-hk/nomad/release/1.4.3"
-        owner = "input-output-hk";
-        repo = pname;
-        rev = "2b8a93390"; # Use to be "release/${version}" but it changes.
-        # nix-prefetch-url --unpack https://github.com/input-output-hk/nomad/archive/2b8a93390/1.4.3.tar.gz
-        sha256 = "0l2sfhpg0p5mjdbipib7q63wlsrczr2fkq9xi641vhgxsjmprvwm";
-      };
-      # error: either `vendorHash` or `vendorSha256` is required
-      # https://discourse.nixos.org/t/buildgomodule-how-to-get-vendorsha256/9317
-      vendorSha256 = "sha256-JQRpsQhq5r/QcgFwtnptmvnjBEhdCFrXFrTKkJioL3A=";
-    });
+    commit = "8f3b74796a8f56f38a812813c64dba995956a66e"; # Patched 1.6.3
+    nomad-sre = (__getFlake "github:input-output-hk/cardano-perf/${commit}").packages.x86_64-linux.nomad;
   in [
     nomad-sre
     # The HTTP server to upload/download the genesis tar file in a local env.
diff --git a/nix/workbench/backend/nomad/exec.sh b/nix/workbench/backend/nomad/exec.sh
index 26966077694..42b6f9da724 100644
--- a/nix/workbench/backend/nomad/exec.sh
+++ b/nix/workbench/backend/nomad/exec.sh
@@ -186,8 +186,6 @@ deploy-genesis-nomadexec() {
   local usage="USAGE: wb backend $op RUN-DIR"
   local dir=${1:?$usage}; shift
   local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
-  local server_name=$(envjqr 'nomad_server_name')
-  local client_name=$(envjqr 'nomad_client_name')
   # Add genesis to HTTP cache server
   local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running')
@@ -199,8 +197,8 @@ deploy-genesis-nomadexec() {
     if test "${nomad_agents_were_already_running}" = "false"
     then
       msg "$(red "Startup of webfs failed, cleaning up ...")"
+      # `stop-nomad-job` takes care of stopping the Nomad agents.
       backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-      wb_nomad agents stop "${server_name}" "${client_name}" "exec"
     fi
     fatal "Failed to start a local HTTP server"
   fi
@@ -213,8 +211,8 @@ deploy-genesis-nomadexec() {
     if test "${nomad_agents_were_already_running}" = "false"
     then
       msg "$(red "Startup of webfs failed, cleaning up ...")"
+      # `stop-nomad-job` takes care of stopping the Nomad agents.
       backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-      wb_nomad agents stop "${server_name}" "${client_name}" "exec"
     fi
     fatal "Failed to add genesis file to local HTTP server"
   else
@@ -225,8 +223,8 @@ deploy-genesis-nomadexec() {
   if ! backend_nomad deploy-genesis-wget "${dir}" "${uri}"
   then
     msg "$(red "Deploy of genesis failed, cleaning up ...")"
+    # `stop-nomad-job` takes care of stopping the Nomad agents.
     backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-    wb_nomad agents stop "${server_name}" "${client_name}" "exec"
     fatal "Deploy of genesis \"${uri}\" failed"
   else
     msg "$(green "Genesis \"${uri}\" deployed successfully")"
diff --git a/nix/workbench/nomad.sh b/nix/workbench/nomad.sh
index d31f45031ab..b55c92e6607 100644
--- a/nix/workbench/nomad.sh
+++ b/nix/workbench/nomad.sh
@@ -83,6 +83,7 @@ usage_nomad() {
     $(helpcmd job monitor-alloc-id)
     $(helpcmd job monitor-alloc-id-task-name)
     $(helpcmd job task-name-allocation-id)
+    $(helpcmd job task-name-node-name)
 EOF
 }
@@ -633,7 +634,7 @@
       # can represent an abnormal exit / uknown state!
       if wb_nomad server is-running "${name}"
       then
-        msg "$(red "FATAL: Nomad server \"${name}\" is already running or in an uknown state, call 'wb nomad server stop ${name}' or 'wb nomad nuke' first")"
+        msg "$(red "FATAL: Nomad server \"${name}\" is already running or in an unknown state, call 'wb nomad server stop ${name}' or 'wb nomad nuke' first")"
         return 1
       else
         local state_dir=$(wb_nomad server state-dir-path "${name}")
@@ -709,7 +710,7 @@ EOL
       # Checks
       if wb_nomad server is-running "${name}"
       then
-        msg "$(red "FATAL: Nomad server \"${name}\" is already running or in an uknown state, call 'wb nomad server stop ${name}' or 'wb nomad nuke' first")"
+        msg "$(red "FATAL: Nomad server \"${name}\" is already running or in an unknown state, call 'wb nomad server stop ${name}' or 'wb nomad nuke' first")"
         return 1
       fi
       # Start `nomad` server".
@@ -741,7 +742,7 @@ EOL
        msg "$(yellow "port \"127.0.0.1:${http_port}\" not ready")"
        msg "$(yellow "Check logs (${state_dir})")"
        # Let the "stop" subcommand clean everything!
-       wb_nomad server stop "${name}"
+       wb_nomad server stop "${name}" || true
        return 1
      fi
      echo -ne "\b\b\b"
@@ -755,30 +756,36 @@ EOL
      local name=${1:?$usage}; shift
      # Stop Nomad server by name
      local pids=$(wb_nomad server pids-array "${name}")
-     for pid_number in ${pids[@]}
-     do
-       msg "$(blue Stopping) Nomad $(yellow "server \"${name}\"") process PID ${pid_number} ..."
-       if ! kill -SIGINT "${pid_number}" >/dev/null 2>&1
-       then
-         msg "$(red "Killing PID ${pid_number} failed")"
-       else
-         # Wait 15 seconds for the process to fully exit or kill it.
-         msg "$(blue Wait) up to 15 seconds for PID ${pid_number} to exit"
-         timeout 15 tail --pid="${pid_number}" -f /dev/null || true
-         if kill -0 "${pid_number}" >/dev/null 2>&1
+     if test -z "${pids}"
+     then
+       msg "$(red "No running Nomad server process found to stop; manually clean up any remaining processes")"
+       return 1
+     else
+       for pid_number in ${pids[@]}
+       do
+         msg "$(blue Stopping) Nomad $(yellow "server \"${name}\"") process PID ${pid_number} ..."
+         if ! kill -SIGINT "${pid_number}" >/dev/null 2>&1
          then
-           msg "$(yellow "Timeout killing PID ${pid_number}, trying SIGKILL")"
-           kill -SIGKILL "${pid_number}" >/dev/null 2>&1 || true
+           msg "$(red "Killing PID ${pid_number} failed")"
+         else
+           # Wait 15 seconds for the process to fully exit or kill it.
+           msg "$(blue Wait) up to 15 seconds for PID ${pid_number} to exit"
+           timeout 15 tail --pid="${pid_number}" -f /dev/null || true
+           if kill -0 "${pid_number}" >/dev/null 2>&1
+           then
+             msg "$(yellow "Timeout killing PID ${pid_number}, trying SIGKILL")"
+             kill -SIGKILL "${pid_number}" >/dev/null 2>&1 || true
+           fi
          fi
-       fi
-     done
-     # Remove PID file if process was really killed (or wasn't running)!
-     if test -z "$(wb_nomad server pids-array "${name}")"
-     then
-       local pid_file=$(wb_nomad server pid-filepath "${name}")
-       if test -f "${pid_file}"
+       done
+       # Remove PID file if all processes were killed!
+       if test -z "$(wb_nomad server pids-array "${name}")"
        then
-         rm "${pid_file}"
+         local pid_file=$(wb_nomad server pid-filepath "${name}")
+         if test -f "${pid_file}"
+         then
+           rm "${pid_file}"
+         fi
        fi
      fi
      ;;
@@ -997,7 +1004,7 @@ EOL
        msg "$(yellow "port \"127.0.0.1:${http_port}\" not ready")"
        msg "$(yellow "Check logs (${state_dir})")"
        # Let the "stop" subcommand clean everything!
-       wb_nomad client stop "${name}"
+       wb_nomad client stop "${name}" || true
        return 1
      fi
      echo -ne "\b\b\b"
@@ -1017,7 +1024,7 @@ EOL
        msg "$(yellow "Nomad client not connected to Nomad server")"
        msg "$(yellow "Check logs (${state_dir})")"
        # Let the "stop" subcommand clean everything!
-       wb_nomad client stop "${name}"
+       wb_nomad client stop "${name}" || true
        return 1
      fi
      echo -ne "\b\b\b"
@@ -1079,42 +1086,45 @@ EOL
      fi
      # Stop Nomad client by name
      local pids=$(wb_nomad client pids-array "${name}")
-     for pid_number in ${pids[@]}
-     do
-       msg "$(blue Stopping) Nomad $(yellow "client \"${name}\"") process PID ${pid_number} ..."
-       local cmd_array=("${root_prefix}" "bash" "-c")
-       if ! ${cmd_array[@]} "kill -SIGINT ${pid_number}" >/dev/null 2>&1
-       then
-         msg "Killing PID ${pid_number} failed"
-       else
-         # Wait 15 seconds for the process to fully exit or kill it.
-         msg "$(blue Wait) up to 30 seconds for PID ${pid_number} to exit"
-         timeout 30 tail --pid="${pid_number}" -f /dev/null || true
+     if test -z "${pids}"
+     then
+       msg "$(red "No running Nomad client process found to stop; manually clean up any remaining processes")"
+       return 1
+     else
+       for pid_number in ${pids[@]}
+       do
+         msg "$(blue Stopping) Nomad $(yellow "client \"${name}\"") process PID ${pid_number} ..."
          local cmd_array=("${root_prefix}" "bash" "-c")
-       if ${cmd_array[@]} "kill -0 ${pid_number}" >/dev/null 2>&1
+         if ! ${cmd_array[@]} "kill -SIGINT ${pid_number}" >/dev/null 2>&1
         then
-          msg "$(yellow "Timeout killing PID ${pid_number}, trying SIGKILL")"
+           msg "Killing PID ${pid_number} failed"
+         else
+           # Wait 30 seconds for the process to fully exit or kill it.
+           msg "$(blue Wait) up to 30 seconds for PID ${pid_number} to exit"
+           timeout 30 tail --pid="${pid_number}" -f /dev/null || true
           local cmd_array=("${root_prefix}" "bash" "-c")
-         ${cmd_array[@]} "kill -SIGKILL ${pid_number}" >/dev/null 2>&1 || true
+           if ${cmd_array[@]} "kill -0 ${pid_number}" >/dev/null 2>&1
+           then
+             msg "$(yellow "Timeout killing PID ${pid_number}, trying SIGKILL")"
+             local cmd_array=("${root_prefix}" "bash" "-c")
+             ${cmd_array[@]} "kill -SIGKILL ${pid_number}" >/dev/null 2>&1 || true
+           fi
          fi
-       fi
-     done
-     # Remove PID file if process was really killed (or wasn't running)!
-     if test -z "$(wb_nomad client pids-array "${name}")"
-     then
-       # WHY? The client is keeping some directories mounted!
-       # Maybe because of the 2 processes it creates (testes running
-       # only one client instance), I may be killing a child first?
-       # Or the timeout needs more time?
-       msg "Unmount any folders left by the client"
-       local cmd_array=("${root_prefix}" "bash" "-c")
-       # Command fails when there's nothing to umount!
-       grep "${state_dir}" /proc/mounts | cut -f2 -d" " | sort -r | ${cmd_array[@]} 'xargs -I "{}" umount -n "{}"' || true
-       # Now mark as "not running"
-       local pid_file=$(wb_nomad client pid-filepath "${name}")
-       if test -f "${pid_file}"
+       done
+       # Remove PID file if all processes were killed!
+       if test -z "$(wb_nomad client pids-array "${name}")"
        then
-         rm "${pid_file}"
+         # WHY? The client is keeping some directories mounted!
+         msg "Unmount any folders left by the client"
+         local cmd_array=("${root_prefix}" "bash" "-c")
+         # Command fails when there's nothing to umount!
+         grep "${state_dir}" /proc/mounts | cut -f2 -d" " | sort -r | ${cmd_array[@]} 'xargs -I "{}" umount -n "{}"' || true
+         # Now mark as "not running"
+         local pid_file=$(wb_nomad client pid-filepath "${name}")
+         if test -f "${pid_file}"
+         then
+           rm "${pid_file}"
+         fi
        fi
      fi
      ;;
@@ -1405,10 +1415,34 @@
      local nomad_clients_dir="$(wb_nomad dir-path client)"
      # Nuke all Nomad clients
      for client_name in $(ls "${nomad_clients_dir}"); do
+       msg "Config folder of Nomad client \"${client_name}\" found"
        if wb_nomad client is-running "${client_name}"
        then
-         wb_nomad client stop "${client_name}"
-         wb_nomad client cleanup "${client_name}"
+         msg "Nomad client \"${client_name}\" is running"
+         if wb_nomad client stop "${client_name}"
+         then
+           # Only call cleanup if stop did not fail
+           wb_nomad client cleanup "${client_name}"
+         else
+           msg "Failed to stop Nomad client \"${client_name}\", now in an unknown state; manual cleanup of processes needed"
+         fi
+       else
+         msg "Nomad client \"${client_name}\" is not running"
+       fi
+       # Nuke the client's dir
+       local state_dir=$(wb_nomad client state-dir-path "${client_name}")
+       msg "Removing \"${state_dir}\" ..."
+       local root_prefix
+       if test -e "${state_dir}"/root
+       then
+         root_prefix=$(cat "${state_dir}"/root)
+       else
+         root_prefix=""
+       fi
+       local cmd_array=("${root_prefix}" "bash" "-c")
+       if ! ${cmd_array[@]} "rm -rf ${state_dir}" >/dev/null 2>&1
+       then
+         msg "Failed to remove config folder of Nomad client \"${client_name}\", now in an unknown state; manual cleanup needed"
        fi
      done
      # Nuke the nomad-driver-podman plugin
@@ -1423,16 +1457,32 @@
      fi
      # Nuke all Nomad servers
      for server_name in $(ls "${nomad_servers_dir}"); do
+       msg "Config folder of Nomad server \"${server_name}\" found"
        if wb_nomad server is-running "${server_name}"
        then
-         wb_nomad server stop "${server_name}"
-         wb_nomad server cleanup "${server_name}"
+         msg "Nomad server \"${server_name}\" is running"
+         if wb_nomad server stop "${server_name}"
+         then
+           # Only call cleanup if stop did not fail
+           wb_nomad server cleanup "${server_name}"
+         else
+           msg "Failed to stop Nomad server \"${server_name}\", now in an unknown state; manual cleanup of processes needed"
+         fi
+       else
+         msg "Nomad server \"${server_name}\" is not running"
+       fi
+       # Nuke the server's dir
+       local state_dir=$(wb_nomad server state-dir-path "${server_name}")
+       msg "Removing \"${state_dir}\" ..."
+       if ! rm -rf "${state_dir}" >/dev/null 2>&1
+       then
+         msg "Failed to remove config folder of Nomad server \"${server_name}\", now in an unknown state; manual cleanup needed"
        fi
      done
      # Nuke the Nomad Agents' .cache dir
      # Keep top level Nomad cache dir because it includes Vault's dirs.
-     rm -rf "${nomad_servers_dir}" >/dev/null 2>&1
      rm -rf "${nomad_clients_dir}" >/dev/null 2>&1
+     rm -rf "${nomad_servers_dir}" >/dev/null 2>&1
      # Bye HTTP server
      if wb_nomad webfs is-running
      then
@@ -2105,6 +2155,13 @@ EOF
        local task_name=${1:?$usage}; shift
        jq -r '.ID' "${job_file}".run/task.${task_name}.final.json
      ;;
+####### job -> task-name-node-name )############################################
+     task-name-node-name )
+       local usage="USAGE: wb nomad ${op} ${subop} JOB-FILE TASK-NAME"
+       local job_file=${1:?$usage}; shift
+       local task_name=${1:?$usage}; shift
+       jq -r '.NodeName' "${job_file}".run/task.${task_name}.final.json
+     ;;
####### job -> stop )###########################################################
      stop )
        local usage="USAGE: wb nomad ${op} ${subop} JOB-FILE TASK-NAME"
diff --git a/nix/workbench/scenario.sh b/nix/workbench/scenario.sh
index 90744743511..5bb68f4b965 100644
--- a/nix/workbench/scenario.sh
+++ b/nix/workbench/scenario.sh
@@ -31,7 +31,16 @@
 fi
 case "$op" in
     idle )
        backend start-tracers "$dir"
+
+       scenario_setup_exit_trap "$dir"
+       # Trap start
+       ############
        backend start-nodes "$dir"
+       # Trap end
+       ##########
+       scenario_cleanup_termination
+
+       backend stop-all "$dir"
        ;;
    tracer-only )
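
A minimal usage sketch for the new `task-name-node-name` subcommand (illustrative only: the RUN-DIR layout and the "node-0" task name are assumptions, mirroring how the existing `task-name-allocation-id` subcommand is called):

    # Hypothetical call: map a Nomad task name to the name of the Nomad
    # node (client) it was placed on, as recorded in the allocation's
    # final JSON file.
    job_file="${dir}"/nomad/nomad-job.json
    node_name="$(wb_nomad job task-name-node-name "${job_file}" "node-0")"
    msg "Task \"node-0\" is running on Nomad node \"${node_name}\""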