Skip to content

Commit

Permalink
Merge pull request #5660 from IntersectMBO/bench-nomad-todo
Browse files Browse the repository at this point in the history
bench | improvements Nomad backend
  • Loading branch information
mgmeier authored Feb 7, 2024
2 parents 1375419 + ae22a4b commit 32a7687
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 88 deletions.
68 changes: 68 additions & 0 deletions nix/workbench/backend/nomad-job.nix
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,74 @@ let
unlimited = false;
};

# Prevent allocations from being restarted:
###########################################
# Nomad Clients periodically heartbeat to Nomad Servers to confirm they
# are operating as expected. By default, Nomad Clients which do not
# heartbeat in the specified amount of time are considered down and
# their allocations are marked as lost (or disconnected if
# "max_client_disconnect" is set) and rescheduled.
# This means that, if not properly configured, allocations running on a
# client that fails to heartbeat will be marked "lost" and, when the
# client reconnects, its allocations, which may still be healthy, will be
# restarted because they have been marked "lost"!!!
# See:
# - https://developer.hashicorp.com/nomad/docs/configuration/server#client-heartbeats
# - https://developer.hashicorp.com/nomad/docs/job-specification/group#stop-after-client-disconnect
# - https://developer.hashicorp.com/nomad/docs/job-specification/group#max-client-disconnect
# We want these allocations to reconnect without a restart.
### Nomad 1.6.X solution:
### Specifies a duration during which a Nomad client will attempt to
### reconnect allocations after it fails to heartbeat in the
### "heartbeat_grace" window. See the example code below for more
### details. This setting cannot be used with
### "stop_after_client_disconnect".
### When "max_client_disconnect" is specified, the Nomad server will
### mark clients that fail to heartbeat as "disconnected" rather than
### "down", and will mark allocations on a disconnected client as
### "unknown" rather than "lost". These allocations may continue to run
### on the disconnected client. Replacement allocations will be
### scheduled according to the allocations' reschedule policy until the
### disconnected client reconnects. Once a disconnected client
### reconnects, Nomad will compare the "unknown" allocations with their
### replacements and keep the one with the best node score. If the
### "max_client_disconnect" duration expires before the client
### reconnects, the allocations will be marked "lost". Clients that
### contain "unknown" allocations will transition to "disconnected"
### rather than "down" until the last "max_client_disconnect" duration
### has expired.
### https://developer.hashicorp.com/nomad/docs/v1.6.x/job-specification/group#max-client-disconnect
max_client_disconnect = "999h";
### Nomad 1.7.X solution:
### (TODO blocker issue https://github.com/hashicorp/nomad/issues/19506)
### Defines the reschedule behaviour of an allocation when the node it
### is running on misses heartbeats. When enabled, if the node it is
### running on becomes disconnected or goes down, this allocation won't
### be rescheduled and will show up as unknown until the node comes back
### up or it is manually restarted.
### This behaviour will only modify the reschedule process on the
### server. To modify the allocation behaviour on the client, see
### "stop_after_client_disconnect" below.
### The unknown allocation has to be manually stopped to run it again.
### Setting `max_client_disconnect` and
### `prevent_reschedule_on_lost = true` at the same time requires that
### rescheduling is disabled entirely (which is done above in the
### reschedule stanza).
# prevent_reschedule_on_lost = true;
### Specifies a duration after which a Nomad client will stop
### allocations, if it cannot communicate with the servers. By default,
### a client will not stop an allocation until explicitly told to by a
### server. A client that fails to heartbeat to a server within the
### "heartbeat_grace" window and any allocations running on it will be
### marked "lost" and Nomad will schedule replacement allocations. The
### replaced allocations will normally continue to run on the
### non-responsive client. But you may want them to stop instead — for
### example, allocations requiring exclusive access to an external
### resource. When specified, the Nomad client will stop them after this
### duration. The Nomad client process must be running for this to
### occur. This setting cannot be used with "max_client_disconnect".
# stop_after_client_disconnect = "999h";

# Specifies the restart policy for all tasks in this group. If omitted,
# a default policy exists for each job type, which can be found in the
# restart stanza documentation.
Expand Down
12 changes: 7 additions & 5 deletions nix/workbench/backend/nomad.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1415,17 +1415,17 @@ backend_nomad() {
then
if ! wait_kill_em_all "${jobs_array[@]}"
then
# Don't use fatal here, let `start` decide!
msg "$(red "Failed to start tracer(s)")"
return 1
backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
fatal "scenario.sh start-tracers failed!"
else
for node in ${nodes[*]}
do
if ! test -f "${dir}"/tracer/"${node}"/started
then
# Don't use fatal here, let `start` decide!
msg "$(red "Tracer for \"${node}\" failed to start!")"
return 1
backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
fatal "scenario.sh start-tracers failed!"
fi
done
fi
Expand Down Expand Up @@ -3487,7 +3487,9 @@ client {
servers = [ ${servers_addresses} ]
# Sets the search path that is used for CNI plugin discovery. Multiple paths can
# be searched using colon delimited paths
cni_path = "${cni_plugins_path}"
# TODO: needed to allow having more than one Nomad profile running locally
# Nomad 1.7.X fails somewhat silently when reading this configuration option.
# cni_path = "${cni_plugins_path}"
# Specifies the maximum amount of time a job is allowed to wait to exit.
# Individual jobs may customize their own kill timeout, but it may not exceed
# this value.
Expand Down
18 changes: 2 additions & 16 deletions nix/workbench/backend/nomad/exec.nix
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,8 @@ let
extraShellPkgs = let
# If we are going to use the `exec` driver we use the SRE patched version of
# Nomad that allows using `nix_installables` as artifacts.
nomad-sre = (pkgs.buildGo119Module rec {
pname = "nomad";
version = "1.4.3";
subPackages = [ "." ];
doCheck = true;
src = pkgs.fetchFromGitHub { # "github:input-output-hk/nomad/release/1.4.3"
owner = "input-output-hk";
repo = pname;
rev = "2b8a93390"; # Use to be "release/${version}" but it changes.
# nix-prefetch-url --unpack https://github.com/input-output-hk/nomad/archive/2b8a93390/1.4.3.tar.gz
sha256 = "0l2sfhpg0p5mjdbipib7q63wlsrczr2fkq9xi641vhgxsjmprvwm";
};
# error: either `vendorHash` or `vendorSha256` is required
# https://discourse.nixos.org/t/buildgomodule-how-to-get-vendorsha256/9317
vendorSha256 = "sha256-JQRpsQhq5r/QcgFwtnptmvnjBEhdCFrXFrTKkJioL3A=";
});
commit = "8f3b74796a8f56f38a812813c64dba995956a66e"; # Patched 1.6.3
nomad-sre = (__getFlake "github:input-output-hk/cardano-perf/${commit}").packages.x86_64-linux.nomad;
in
[ nomad-sre
# The HTTP server to upload/download the genesis tar file in a local env.
Expand Down
8 changes: 3 additions & 5 deletions nix/workbench/backend/nomad/exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,6 @@ deploy-genesis-nomadexec() {
local usage="USAGE: wb backend $op RUN-DIR"
local dir=${1:?$usage}; shift
local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
local server_name=$(envjqr 'nomad_server_name')
local client_name=$(envjqr 'nomad_client_name')

# Add genesis to HTTP cache server
local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running')
Expand All @@ -199,8 +197,8 @@ deploy-genesis-nomadexec() {
if test "${nomad_agents_were_already_running}" = "false"
then
msg "$(red "Startup of webfs failed, cleaning up ...")"
# `stop-nomad-job` takes care of stopping the Nomad agents.
backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
wb_nomad agents stop "${server_name}" "${client_name}" "exec"
fi
fatal "Failed to start a local HTTP server"
fi
Expand All @@ -213,8 +211,8 @@ deploy-genesis-nomadexec() {
if test "${nomad_agents_were_already_running}" = "false"
then
msg "$(red "Startup of webfs failed, cleaning up ...")"
# `stop-nomad-job` takes care of stopping the Nomad agents.
backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
wb_nomad agents stop "${server_name}" "${client_name}" "exec"
fi
fatal "Failed to add genesis file to local HTTP server"
else
Expand All @@ -225,8 +223,8 @@ deploy-genesis-nomadexec() {
if ! backend_nomad deploy-genesis-wget "${dir}" "${uri}"
then
msg "$(red "Deploy of genesis failed, cleaning up ...")"
# `stop-nomad-job` takes care of stopping the Nomad agents.
backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
wb_nomad agents stop "${server_name}" "${client_name}" "exec"
fatal "Deploy of genesis \"${uri}\" failed"
else
msg "$(green "Genesis \"${uri}\" deployed successfully")"
Expand Down
Loading

0 comments on commit 32a7687

Please sign in to comment.