Merge pull request #5660 from IntersectMBO/bench-nomad-todo

bench | improvements Nomad backend
IntersectMBO · Feb 7, 2024 · 32a7687 · 32a7687
2 parents 1375419 + ae22a4b
commit 32a7687
Show file tree

Hide file tree

Showing 6 changed files with 208 additions and 88 deletions.
diff --git a/nix/workbench/backend/nomad-job.nix b/nix/workbench/backend/nomad-job.nix
@@ -262,6 +262,74 @@ let
           unlimited = false;
         };
 
+        # Prevent allocations from being restarted:
+        ###########################################
+        # Nomad Clients periodically heartbeat to Nomad Servers to confirm they
+        # are operating as expected. By default, Nomad Clients which do not
+        # heartbeat in the specified amount of time are considered down and
+        # their allocations are marked as lost (or disconnected if
+        # "max_client_disconnect" is set) and rescheduled.
+        # This means that, if not properly configured, allocations running on a
+        # client that fails to heartbeat will be marked "lost" and, when the
+        # client reconnects, its allocations, which may still be healthy, will
+        # be restarted because they have been marked "lost"!!!
+        # See:
+        # - https://developer.hashicorp.com/nomad/docs/configuration/server#client-heartbeats
+        # - https://developer.hashicorp.com/nomad/docs/job-specification/group#stop-after-client-disconnect
+        # - https://developer.hashicorp.com/nomad/docs/job-specification/group#max-client-disconnect
+        # We want these allocations to reconnect without a restart.
+        ### Nomad 1.6.X solution:
+        ### Specifies a duration during which a Nomad client will attempt to
+        ### reconnect allocations after it fails to heartbeat in the
+        ### "heartbeat_grace" window. See the example code below for more
+        ### details. This setting cannot be used with
+        ### "stop_after_client_disconnect".
+        ### When "max_client_disconnect" is specified, the Nomad server will
+        ### mark clients that fail to heartbeat as "disconnected" rather than
+        ### "down", and will mark allocations on a disconnected client as
+        ### "unknown" rather than "lost". These allocations may continue to run
+        ### on the disconnected client. Replacement allocations will be
+        ### scheduled according to the allocations' reschedule policy until the
+        ### disconnected client reconnects. Once a disconnected client
+        ### reconnects, Nomad will compare the "unknown" allocations with their
+        ### replacements and keep the one with the best node score. If the
+        ### "max_client_disconnect" duration expires before the client
+        ### reconnects, the allocations will be marked "lost". Clients that
+        ### contain "unknown" allocations will transition to "disconnected"
+        ### rather than "down" until the last "max_client_disconnect" duration
+        ### has expired.
+        ### https://developer.hashicorp.com/nomad/docs/v1.6.x/job-specification/group#max-client-disconnect
+        max_client_disconnect = "999h";
+        ### Nomad 1.7.X solution:
+        ### (TODO blocker issue https://github.com/hashicorp/nomad/issues/19506)
+        ### Defines the reschedule behaviour of an allocation when the node it
+        ### is running on misses heartbeats. When enabled, if the node it is
+        ### running on becomes disconnected or goes down, this allocation won't
+        ### be rescheduled and will show up as unknown until the node comes back
+        ### up or it is manually restarted.
+        ### This behaviour will only modify the reschedule process on the
+        ### server. To modify the allocation behaviour on the client, see
+        ### "stop_after_client_disconnect" below.
+        ### The unknown allocation has to be manually stopped to run it again.
+        ### Setting `max_client_disconnect` and
+        ### `prevent_reschedule_on_lost = true` at the same time requires that
+        ### rescheduling is disabled entirely (which is done above in the
+        ### reschedule stanza).
+        # prevent_reschedule_on_lost = true;
+        ### Specifies a duration after which a Nomad client will stop
+        ### allocations, if it cannot communicate with the servers. By default,
+        ### a client will not stop an allocation until explicitly told to by a
+        ### server. A client that fails to heartbeat to a server within the
+        ### "heartbeat_grace" window and any allocations running on it will be
+        ### marked "lost" and Nomad will schedule replacement allocations. The
+        ### replaced allocations will normally continue to run on the
+        ### non-responsive client. But you may want them to stop instead — for
+        ### example, allocations requiring exclusive access to an external
+        ### resource. When specified, the Nomad client will stop them after this
+        ### duration. The Nomad client process must be running for this to
+        ### occur. This setting cannot be used with "max_client_disconnect".
+        # stop_after_client_disconnect = "999h";
+
         # Specifies the restart policy for all tasks in this group. If omitted,
         # a default policy exists for each job type, which can be found in the
         # restart stanza documentation.

diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh
@@ -1415,17 +1415,17 @@ backend_nomad() {
         then
           if ! wait_kill_em_all "${jobs_array[@]}"
           then
-            # Don't use fatal here, let `start` decide!
             msg "$(red "Failed to start tracer(s)")"
-            return 1
+            backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
+            fatal "scenario.sh start-tracers failed!"
           else
             for node in ${nodes[*]}
             do
               if ! test -f "${dir}"/tracer/"${node}"/started
               then
-                # Don't use fatal here, let `start` decide!
                 msg "$(red "Tracer for \"${node}\" failed to start!")"
-                return 1
+                backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
+                fatal "scenario.sh start-tracers failed!"
               fi
             done
           fi
@@ -3487,7 +3487,9 @@ client {
   servers = [ ${servers_addresses} ]
   # Sets the search path that is used for CNI plugin discovery. Multiple paths can
   # be searched using colon delimited paths
-  cni_path = "${cni_plugins_path}"
+# TODO: needed to allow having more than one Nomad profile running locally
+# Nomad 1.7.X fails somewhat silently when reading this configuration option.
+#  cni_path = "${cni_plugins_path}"
   # Specifies the maximum amount of time a job is allowed to wait to exit.
   # Individual jobs may customize their own kill timeout, but it may not exceed
   # this value.

diff --git a/nix/workbench/backend/nomad/exec.nix b/nix/workbench/backend/nomad/exec.nix
@@ -20,22 +20,8 @@ let
   extraShellPkgs = let
     # If we are going to use the `exec` driver we use the SRE patched version of
     # Nomad that allows using `nix_installables` as artifacts.
-    nomad-sre = (pkgs.buildGo119Module rec {
-      pname = "nomad";
-      version = "1.4.3";
-      subPackages = [ "." ];
-      doCheck = true;
-      src = pkgs.fetchFromGitHub { # "github:input-output-hk/nomad/release/1.4.3"
-        owner = "input-output-hk";
-        repo = pname;
-        rev = "2b8a93390"; # Use to be "release/${version}" but it changes.
-        # nix-prefetch-url --unpack https://github.com/input-output-hk/nomad/archive/2b8a93390/1.4.3.tar.gz
-        sha256 = "0l2sfhpg0p5mjdbipib7q63wlsrczr2fkq9xi641vhgxsjmprvwm";
-      };
-      # error: either `vendorHash` or `vendorSha256` is required
-      # https://discourse.nixos.org/t/buildgomodule-how-to-get-vendorsha256/9317
-      vendorSha256 = "sha256-JQRpsQhq5r/QcgFwtnptmvnjBEhdCFrXFrTKkJioL3A=";
-    });
+    commit = "8f3b74796a8f56f38a812813c64dba995956a66e"; # Patched 1.6.3
+    nomad-sre = (__getFlake "github:input-output-hk/cardano-perf/${commit}").packages.x86_64-linux.nomad;
   in
     [ nomad-sre
       # The HTTP server to upload/download the genesis tar file in a local env.

diff --git a/nix/workbench/backend/nomad/exec.sh b/nix/workbench/backend/nomad/exec.sh
@@ -186,8 +186,6 @@ deploy-genesis-nomadexec() {
   local usage="USAGE: wb backend $op RUN-DIR"
   local dir=${1:?$usage}; shift
   local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
-  local server_name=$(envjqr 'nomad_server_name')
-  local client_name=$(envjqr 'nomad_client_name')
 
   # Add genesis to HTTP cache server
   local nomad_agents_were_already_running=$(envjqr 'nomad_agents_were_already_running')
@@ -199,8 +197,8 @@ deploy-genesis-nomadexec() {
       if test "${nomad_agents_were_already_running}" = "false"
       then
         msg "$(red "Startup of webfs failed, cleaning up ...")"
+        # `stop-nomad-job` takes care of stopping the Nomad agents.
         backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-        wb_nomad agents stop "${server_name}" "${client_name}" "exec"
       fi
       fatal "Failed to start a local HTTP server"
     fi
@@ -213,8 +211,8 @@ deploy-genesis-nomadexec() {
     if test "${nomad_agents_were_already_running}" = "false"
     then
       msg "$(red "Startup of webfs failed, cleaning up ...")"
+      # `stop-nomad-job` takes care of stopping the Nomad agents.
       backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-      wb_nomad agents stop "${server_name}" "${client_name}" "exec"
     fi
     fatal "Failed to add genesis file to local HTTP server"
   else
@@ -225,8 +223,8 @@ deploy-genesis-nomadexec() {
   if ! backend_nomad deploy-genesis-wget "${dir}" "${uri}"
   then
     msg "$(red "Deploy of genesis failed, cleaning up ...")"
+    # `stop-nomad-job` takes care of stopping the Nomad agents.
     backend_nomad stop-nomad-job "${dir}" || msg "$(red "Failed to stop Nomad Job")"
-    wb_nomad agents stop "${server_name}" "${client_name}" "exec"
     fatal "Deploy of genesis \"${uri}\" failed"
   else
     msg "$(green "Genesis \"${uri}\" deployed successfully")"