diff --git a/README.md b/README.md
index c36437c..abd1334 100644
--- a/README.md
+++ b/README.md
@@ -20,12 +20,12 @@ Validation capacities for physics compares OACC and original Fortran on.
 
 Automatic benchmarking are as follow (legends after table)
 
-| Experimentation               | Resolutions |  Layout   | Setup                                    |
-| ----------------------------- | ------------|-----------| ---------------------------------------- |
-| Held-Suarez                   | C180-L72    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-|                               | C180-L91    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-|                               | C180-L137   | 1x1       | Discover @ node-to-node (exclusive GPU)  |
-| Aquaplanet                    | C180-L72    | 1x1       | Discover @ node-to-node (exclusive GPU)  |
+| Experimentation               | Resolutions | Layout | CPU/GPU                           |
+| ----------------------------- | ----------- | ------ | --------------------------------- |
+| Held-Suarez                   | C180-L72    | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+|                               | C180-L137   | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+|                               | C360-L72    | 4x4    | 96/8 Node-to-node (sharing GPU)   |
+| Aquaplanet                    | C180-L72    | 1x1    | 6/6  Node-to-node (exclusive GPU) |
 
 Legend:
 
diff --git a/geosongpu_ci/actions/slurm.py b/geosongpu_ci/actions/slurm.py
index b6b0c6a..75794c0 100644
--- a/geosongpu_ci/actions/slurm.py
+++ b/geosongpu_ci/actions/slurm.py
@@ -79,3 +79,14 @@ def slurm_96CPUs_8GPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration
             mem_per_gpu="40G",
             output=output or cls.output,
         )
+
+    @classmethod
+    def slurm_96CPUs(cls, output: Optional[str] = None) -> "SlurmConfiguration":
+        """Discover (Rome Epyc) setup: 96 tasks over 2 nodes, 48 per node; sets no GPU option"""
+        return cls(
+            nodes=2,
+            ntasks=96,
+            ntasks_per_node=48,
+            sockets_per_node=2,
+            output=output or cls.output,
+        )
diff --git a/geosongpu_ci/pipeline/gtfv3_config.py b/geosongpu_ci/pipeline/gtfv3_config.py
index 3b7c092..020319e 100644
--- a/geosongpu_ci/pipeline/gtfv3_config.py
+++ b/geosongpu_ci/pipeline/gtfv3_config.py
@@ -19,6 +19,8 @@ def sh(self) -> str:
             f"export PACE_FLOAT_PRECISION={self.PACE_FLOAT_PRECISION}\n"
             f"export PACE_LOGLEVEL={self.PACE_LOGLEVEL}\n"
             f"export GTFV3_BACKEND={self.GTFV3_BACKEND}\n"
+            f"export PER_DEVICE_PROCESS=12\n"  # default for Discover
+            f"export PYTHONOPTIMIZE=1\n"
         )
 
     @classmethod
diff --git a/geosongpu_ci/pipeline/held_suarez.py b/geosongpu_ci/pipeline/held_suarez.py
index 96a307c..e08d6c0 100644
--- a/geosongpu_ci/pipeline/held_suarez.py
+++ b/geosongpu_ci/pipeline/held_suarez.py
@@ -46,48 +46,11 @@ def __init__(
     def _make_gpu_wrapper_script(
         self,
         experiment_directory: str,
-        hardware_sampling: bool = False,
     ) -> None:
-        script_name = "gpu-wrapper-slurm"
-        pre_execution = []
-        post_execution = []
-        if hardware_sampling:
-            script_name += "-hws"
-            pre_execution.append("if [ $SLURM_LOCALID -eq 0 ]; then")
-            pre_execution.append("    geosongpu_hws server &")
-            pre_execution.append("    sleep 20")
-            pre_execution.append("    geosongpu_hws client start")
-            pre_execution.append("fi")
-
-            post_execution.append(
-                "if [ $SLURM_LOCALID -eq 0 ]; then",
-            )
-            post_execution.append(
-                "    geosongpu_hws client dump",
-            )
-            post_execution.append(
-                "    geosongpu_hws client stop",
-            )
-            post_execution.append(
-                "fi",
-            )
-
-        cuda_device_setup = [
-            "export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID",
-            'echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID,'
-            ' pinned to GPU: $CUDA_VISIBLE_DEVICES"',
-        ]
-        execution = ["$*"]
         self.gpu_wrapper = ShellScript(
-            script_name,
+            "gpu-wrapper-slurm-mps",
             working_directory=experiment_directory,
-        ).write(
-            modules=[],
-            shell_commands=cuda_device_setup
-            + pre_execution
-            + execution
-            + post_execution,
-        )
+        ).from_template(template_name="gpu-wrapper-slurm-mps.sh")
 
     def _copy_executable_script(
         self,
@@ -114,13 +77,29 @@ def _make_srun_script(
     slurm_config: SlurmConfiguration,
     gtfv3_config: GTFV3Config,
     prolog_scripts: PrologScripts,
+    hardware_sampler_on: bool = False,
+    mps_on: bool = False,
+    local_redirect_log: bool = False,
 ) -> ShellScript:
+    # Executing command with the SLURM setup
     srun_cmd = slurm_config.srun_bash(
         wrapper=prolog_scripts.gpu_wrapper.path,
         executable_name=executable_name,
     )
-    srun_script_script = ShellScript(
-        f"srun_{slurm_config.ntasks}tasks_{gtfv3_config.backend_sanitized()}",
+    # Options
+    options = f"""{'export HARDWARE_SAMPLING=1' if hardware_sampler_on else 'unset HARDWARE_SAMPLING' }
+{'export MPS_ON=1' if mps_on else 'unset MPS_ON' }
+{f'export LOCAL_REDIRECT_LOG=1' if local_redirect_log else 'unset LOCAL_REDIRECT_LOG' }
+    """
+
+    if "dace" in gtfv3_config.GTFV3_BACKEND:
+        backend = f"{gtfv3_config.backend_sanitized()}.{gtfv3_config.FV3_DACEMODE}"
+    else:
+        backend = f"{gtfv3_config.backend_sanitized()}"
+    srun_script_name = f"srun_{slurm_config.ntasks}tasks_{backend}"
+
+    srun_script = ShellScript(
+        srun_script_name,
         working_directory=experiment_directory,
     ).write(
         env_to_source=[
@@ -132,13 +111,14 @@ def _make_srun_script(
             f"source {prolog_scripts.copy_executable.path}",
             "",
             f"{gtfv3_config.sh()}",
-            "export PYTHONOPTIMIZE=1",
             f"export CUPY_CACHE_DIR={experiment_directory}/.cupy",
             "",
+            f"{options}",
+            "",
             f"{srun_cmd}",
         ],
     )
-    return srun_script_script
+    return srun_script
 
 
 VALIDATION_RESOLUTION = "C180-L72"
@@ -169,7 +149,7 @@ def _setup_1ts_1node_gtfv3(self, experiment_directory: str) -> ShellScript:
 
     def _setup_1day_1node_gtfv3(self, experiment_directory: str) -> ShellScript:
         return ShellScript(
-            name="_setup_config_1day_1node_gtfv3",
+            name="setup_config_1day_1node_gtfv3",
             working_directory=experiment_directory,
         ).write(
             shell_commands=[
@@ -193,6 +173,45 @@ def _setup_1day_1node_fortran(self, experiment_directory: str) -> ShellScript:
             ],
         )
 
+    def _setup_1ts_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
+        return ShellScript(
+            name="setup_config_1ts_2nodes_gtfv3",
+            working_directory=experiment_directory,
+        ).write(
+            shell_commands=[
+                f"cd {experiment_directory}",
+                "cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc",
+                "cp -f input.nml.4x4 input.nml",
+                "cp -f CAP.rc.1ts CAP.rc",
+            ],
+        )
+
+    def _setup_1day_2nodes_gtfv3(self, experiment_directory: str) -> ShellScript:
+        return ShellScript(
+            name="setup_config_1day_2nodes_gtfv3",
+            working_directory=experiment_directory,
+        ).write(
+            shell_commands=[
+                f"cd {experiment_directory}",
+                "cp -f AgcmSimple.rc.4x24.gtfv3 AgcmSimple.rc",
+                "cp -f input.nml.4x4 input.nml",
+                "cp -f CAP.rc.1day CAP.rc",
+            ],
+        )
+
+    def _setup_1day_2nodes_fortran(self, experiment_directory: str) -> ShellScript:
+        return ShellScript(
+            name="setup_config_1day_2nodes_fortran",
+            working_directory=experiment_directory,
+        ).write(
+            shell_commands=[
+                f"cd {experiment_directory}",
+                "cp -f AgcmSimple.rc.4x24.fortran AgcmSimple.rc",
+                "cp -f input.nml.4x4 input.nml",
+                "cp -f CAP.rc.1day CAP.rc",
+            ],
+        )
+
     def prepare_experiment(
         self,
         input_directory: str,
@@ -224,6 +243,9 @@ def simulate(
         gtfv3_config: GTFV3Config,
         setup_script: ShellScript,
         setup_only: bool = False,
+        hardware_sampler_on: bool = False,
+        mps_on: bool = False,
+        local_redirect_log: bool = False,
     ):
         srun_script = _make_srun_script(
             executable_name=executable_name,
@@ -231,6 +253,9 @@ def simulate(
             slurm_config=slurm_config,
             gtfv3_config=gtfv3_config,
             prolog_scripts=prolog_scripts,
+            hardware_sampler_on=hardware_sampler_on,
+            mps_on=mps_on,
+            local_redirect_log=local_redirect_log,
         )
 
         setup_script.execute()
@@ -310,11 +335,11 @@ def run_action(
                         experiment_directory=experiment_directory,
                         executable_name=self.executable_name,
                         prolog_scripts=prolog_scripts,
-                        slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs(
+                        slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs(
                             output="benchmark.cache.dacegpu.%t.out"
                         ),
                         gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(),
-                        setup_script=self._setup_1ts_1node_gtfv3(experiment_directory),
+                        setup_script=self._setup_1ts_2nodes_gtfv3(experiment_directory),
                         setup_only=env.setup_only,
                     )
 
@@ -323,12 +348,13 @@ def run_action(
                     experiment_directory=experiment_directory,  # type: ignore
                     executable_name=self.executable_name,
                     prolog_scripts=prolog_scripts,  # type: ignore
-                    slurm_config=SlurmConfiguration.slurm_6CPUs_6GPUs(
-                        output="benchmark.1day.dacegpu.%t.out"
+                    slurm_config=SlurmConfiguration.slurm_96CPUs_8GPUs(
+                        output="benchmark.1day.MPS.44.dacegpu.%t.out"
                     ),
                     gtfv3_config=GTFV3Config.dace_gpu_32_bit_BAR(dacemode="Run"),
-                    setup_script=self._setup_1day_1node_gtfv3(experiment_directory),  # type: ignore
+                    setup_script=self._setup_1day_2nodes_gtfv3(experiment_directory),  # type: ignore
                     setup_only=env.setup_only,
+                    mps_on=True,
                 )
 
                 # Run 1 day Fortran
@@ -336,12 +362,13 @@ def run_action(
                     experiment_directory=experiment_directory,  # type: ignore
                     executable_name=self.executable_name,
                     prolog_scripts=prolog_scripts,  # type: ignore
-                    slurm_config=SlurmConfiguration.slurm_72CPUs(
-                        output="benchmark.1day.fortran.%t.out"
+                    slurm_config=SlurmConfiguration.slurm_96CPUs(
+                        output="benchmark.1day.MPS.44.fortran.%t.out"
                     ),
                     gtfv3_config=GTFV3Config.fortran(),
-                    setup_script=self._setup_1day_1node_fortran(experiment_directory),  # type: ignore
+                    setup_script=self._setup_1day_2nodes_fortran(experiment_directory),  # type: ignore
                     setup_only=env.setup_only,
+                    mps_on=True,
                 )
 
     def check(
diff --git a/geosongpu_ci/pipeline/templates/__init__.py b/geosongpu_ci/pipeline/templates/__init__.py
new file mode 100644
index 0000000..57a20ff
--- /dev/null
+++ b/geosongpu_ci/pipeline/templates/__init__.py
@@ -0,0 +1,22 @@
+import os
+import sys
+import site
+
+
+def find_template(name: str) -> str:
+    """Return the path of the packaged template `name`, stored as `<name>.tpl`.
+
+    Candidate locations are probed in order (regular, --user, then editable
+    install) and the first existing file wins.
+    Raises FileNotFoundError when no candidate location holds the file.
+    """
+    filename = f"{name}.tpl"
+    candidates = (
+        f"{sys.prefix}/geosongpu/templates/{filename}",  # pip install geosongpu-ci
+        f"{site.USER_BASE}/geosongpu/templates/{filename}",  # pip install --user geosongpu-ci
+        os.path.join(os.path.dirname(__file__), filename),  # pip install -e geosongpu-ci
+    )
+    for candidate in candidates:
+        if os.path.isfile(candidate):
+            return candidate
+    raise FileNotFoundError(f"Template: could not locate {name}")
diff --git a/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl b/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl
new file mode 100644
index 0000000..e2af09f
--- /dev/null
+++ b/geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl
@@ -0,0 +1,76 @@
+#!/bin/sh
+
+# Per-rank SLURM wrapper: optionally starts the hardware sampler and NVIDIA MPS,
+# pins the rank to a single GPU, then runs the command passed as arguments.
+# We open GPU visibility to full node at first
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Hardware sampling is a python tool that reads at intervals
+# various hardware sensors (power, usage, memory load...)
+if [ -z "${HARDWARE_SAMPLING}" ]; then
+    echo "Hardware sampling is OFF"
+else
+    echo "Hardware sampling is ON"
+    # We restrict usage to (world) rank 0
+    if [ "$SLURM_PROCID" -eq 0 ]; then
+        geosongpu_hws server &
+        sleep 10
+        geosongpu_hws client start
+    fi
+fi
+
+if [ -z "${MPS_ON}" ]; then
+    echo "MPS is OFF"
+    # No MPS, we assume rank==GPU
+    GPU=$SLURM_LOCALID
+    export CUDA_VISIBLE_DEVICES=$GPU
+else
+    echo "MPS is ON"
+    if [ -z "${PER_DEVICE_PROCESS}" ]; then
+        echo "PER_DEVICE_PROCESS needs to be setup on MPS. Exiting."
+        exit 1
+    fi
+    # All ranks need to know where to look
+    export CUDA_MPS_PIPE_DIRECTORY=./nvidia-mps/$SLURM_NODEID
+    export CUDA_MPS_LOG_DIRECTORY=./nvidia-log/$SLURM_NODEID
+    # Only 1 rank per node (local rank 0) handles the server chatter
+    if [ "$SLURM_LOCALID" -eq 0 ]; then
+        echo "Turn nvidia-cuda-mps-control on for node $SLURM_NODEID"
+        mkdir -p nvidia-mps
+        mkdir -p "nvidia-log/$SLURM_NODEID"
+        # sudo nvidia-smi -i 0 -c 3 # Per docs, we should ensure GPU is in EXCLUSIVE mode but we might be curtailed by HPC settings
+        nvidia-cuda-mps-control -d
+    fi
+    # MPS server is socket based, leave time for the filesystem
+    sleep 10
+    # Server should be spun up, we restrict this rank to a single GPU
+    GPU=$((SLURM_LOCALID/PER_DEVICE_PROCESS))
+    export CUDA_VISIBLE_DEVICES=$GPU
+fi
+
+echo "Node: $SLURM_NODEID | Rank: $SLURM_PROCID, pinned to GPU: $CUDA_VISIBLE_DEVICES"
+
+# Run program with or without log dump in file; "$@" keeps each argument intact (unlike $*)
+if [ -z "${LOCAL_REDIRECT_LOG}" ]; then
+    "$@"
+else
+    "$@" > "log.redirect_local.$SLURM_PROCID.out" 2>&1
+fi
+
+# Clean up of all tools
+if [ -z "${HARDWARE_SAMPLING}" ]; then
+    echo ""
+else
+    if [ "$SLURM_PROCID" -eq 0 ]; then
+        geosongpu_hws client dump
+        geosongpu_hws client stop
+    fi
+fi
+if [ -z "${MPS_ON}" ]; then
+    echo ""
+else
+    if [ "$SLURM_LOCALID" -eq 0 ]; then
+        echo quit | nvidia-cuda-mps-control
+        # sudo nvidia-smi -i 0 -c 0 # Per docs, we should ensure GPU is flipped back to DEFAULT mode but we might be curtailed by HPC settings
+    fi
+fi
diff --git a/geosongpu_ci/tools/benchmark/geos_log_parser.py b/geosongpu_ci/tools/benchmark/geos_log_parser.py
index 33d84c6..20a2bf2 100644
--- a/geosongpu_ci/tools/benchmark/geos_log_parser.py
+++ b/geosongpu_ci/tools/benchmark/geos_log_parser.py
@@ -68,7 +68,9 @@ def parse_geos_log(filename: str) -> BenchmarkRawData:
         benchmark.fv_dyncore_timings = _extract_numerics(interface_timings)
 
         if "dace" in benchmark.backend:
-            dycore_timings = _grep(filename, "] Run...", exclude_pattern=True)
+            dycore_timings = _grep(
+                filename, "] Run...", exclude_pattern=True, expected=False
+            )
             benchmark.inner_dycore_timings = _extract_numerics(dycore_timings)
     else:
         dycore_timings = _grep(filename, "0: fv_dynamics", exclude_pattern=True)
@@ -131,8 +133,16 @@ def parse_geos_log(filename: str) -> BenchmarkRawData:
 
     # Model throughput
     gloabl_profiler_entry = "Model Throughput"
+    global_init_time = _grep(
+        filename, "--Initialize", start_pattern=gloabl_profiler_entry
+    )
+    benchmark.global_init_time = _extract_numerics(global_init_time)[1]
     global_run_time = _grep(filename, "--Run", start_pattern=gloabl_profiler_entry)
     benchmark.global_run_time = _extract_numerics(global_run_time)[1]
+    global_finalize_time = _grep(
+        filename, "--Finalize", start_pattern=gloabl_profiler_entry
+    )
+    benchmark.global_finalize_time = _extract_numerics(global_finalize_time)[1]
 
     return benchmark
 
diff --git a/geosongpu_ci/tools/benchmark/raw_data.py b/geosongpu_ci/tools/benchmark/raw_data.py
index 27096ec..70d6796 100644
--- a/geosongpu_ci/tools/benchmark/raw_data.py
+++ b/geosongpu_ci/tools/benchmark/raw_data.py
@@ -7,7 +7,9 @@ class BenchmarkRawData:
     backend: str = ""
     grid_resolution: Tuple[int, int, int] = (0, 0, 0)  # nx / ny / nz
     node_setup: Tuple[int, int, int] = (0, 0, 0)  # NX / NY / Total ranks used
+    global_init_time: float = 0  # seconds fort the global INITIALIZE
     global_run_time: float = 0  # seconds fort the global RUN
+    global_finalize_time: float = 0  # seconds fort the global FINALIZE
     fv_dyncore_timings: List[float] = field(default_factory=list)  # seconds
     inner_dycore_timings: List[float] = field(default_factory=list)  # seconds
     fv_gridcomp_detailed_profiling: List[Tuple[str, float, str]] = field(
diff --git a/geosongpu_ci/tools/hws/analysis.py b/geosongpu_ci/tools/hws/analysis.py
new file mode 100644
index 0000000..c202bcc
--- /dev/null
+++ b/geosongpu_ci/tools/hws/analysis.py
@@ -0,0 +1,66 @@
+import numpy as np
+from typing import Dict, Any
+import geosongpu_ci.tools.hws.constants as cst
+import dataclasses
+
+
+@dataclasses.dataclass
+class EnergyReport:
+    CPU_envelop_integrated: float = 0  # CPU power integral, kW * sample_count
+    CPU_envelop_kWh: float = 0  # kWh figure derived from CPU_envelop_integrated
+    GPU_envelop_integrated: float = 0  # GPU power integral, kW * sample_count
+    GPU_envelop_kWh: float = 0  # kWh figure derived from GPU_envelop_integrated
+    overall_envelop_integrated: float = 0  # CPU + GPU integrals, kW * sample_count
+    overall_envelop_kWh: float = 0  # kWh figure for CPU + GPU combined
+
+
+def energy_envelop_calculation(
+    cpu_psu_data: np.ndarray, gpu_psu_data: np.ndarray, verbose: bool = True
+) -> EnergyReport:
+    """Integrate CPU & GPU power samples into an energy report.
+
+    Samples are assumed to be spaced by `cst.DEFAULT_SAMPLERATE_IN_S` seconds.
+    Energy is the time integral of power: kW.sample * dt (s) / 3600 = kWh.
+    The report is printed when `verbose` is set, and always returned.
+    Both arrays are expected in W (they are scaled by 1/1000 to kW below).
+    """
+    # TODO we need the sample rate here too (it should travel with the data)
+    report = EnergyReport()
+    sample_count = len(cpu_psu_data)
+    hours_per_sample = cst.DEFAULT_SAMPLERATE_IN_S / (60 * 60)
+
+    # Grab integrated power using trapezoidal integration on all samples
+    report.GPU_envelop_integrated = np.trapz(gpu_psu_data / 1000)
+    report.CPU_envelop_integrated = np.trapz(cpu_psu_data / 1000)
+    report.overall_envelop_integrated = (
+        report.GPU_envelop_integrated + report.CPU_envelop_integrated
+    )
+
+    # Energy in kWh: integrated power times the sample spacing in hours.
+    # (Dividing a mean power by the run duration yields kW/h, which is not an
+    # energy, and divides by zero when only one sample is available.)
+    report.CPU_envelop_kWh = report.CPU_envelop_integrated * hours_per_sample
+    report.GPU_envelop_kWh = report.GPU_envelop_integrated * hours_per_sample
+    report.overall_envelop_kWh = report.overall_envelop_integrated * hours_per_sample
+
+    if verbose:
+        print(
+            f"Number of samples: {sample_count}\n"
+            f"CPU envelop:{report.CPU_envelop_integrated:.0f} kW.sample_count\n"
+            f"CPU envelop (@ default sample rate): {report.CPU_envelop_kWh:.2f} kWh\n"
+            f"GPU envelop:{report.GPU_envelop_integrated:.0f} kW.sample_count\n"
+            f"GPU envelop (@ default sample rate): {report.GPU_envelop_kWh:.2f} kWh\n"
+            f"Overall envelop: {report.overall_envelop_integrated:.0f} kW.sample_count\n"
+            f"Overall envelop (@ default sample rate): {report.overall_envelop_kWh:.2f} kWh"
+        )
+
+    return report
+
+
+def load_data(data_filepath: str, data_format: str = "npz") -> Dict[str, Any]:
+    """Load sampler data with `np.load`; only the "npz" format is supported,
+    any other `data_format` raises NotImplementedError.
+    """
+    if data_format == "npz":
+        return np.load(data_filepath)
+    raise NotImplementedError(f"Format {data_format} not implemented for graphing")
diff --git a/geosongpu_ci/tools/hws/cli.py b/geosongpu_ci/tools/hws/cli.py
index a1b951f..49f7f8c 100644
--- a/geosongpu_ci/tools/hws/cli.py
+++ b/geosongpu_ci/tools/hws/cli.py
@@ -2,6 +2,8 @@
 import geosongpu_ci.tools.hws.server as hws_server
 import geosongpu_ci.tools.hws.client as hws_client
 import geosongpu_ci.tools.hws.graph as hws_graph
+import geosongpu_ci.tools.hws.constants as cst
+from typing import Tuple, Optional
 
 
 @click.group()
@@ -28,5 +30,21 @@ def graph(data_filepath: str):
     hws_graph.cli(data_filepath)
 
 
+@cli.command()
+@click.argument("data_filepath")
+@click.option("--data_range", nargs=2, type=float)
+def envelop(data_filepath: str, data_range: Optional[Tuple[float, float]]):
+    """Graph DATA_FILEPATH and print its energy envelop, optionally restricted
+    to the start/stop `--data_range` window given in seconds."""
+    # TODO: the sample rate should be saved in a header in the data file
+    dt = cst.CLIENT_CMDS[cst.CLIENT_CMD_START]["dt"]
+    if data_range:
+        # round(), not int(): 0.3 / 0.1 is 2.999... and would truncate to sample 2
+        range_start, range_stop = (round(t / dt) for t in data_range)
+    else:
+        range_start, range_stop = None, None
+    hws_graph.cli(data_filepath, data_range=slice(range_start, range_stop))
+
+
 if __name__ == "__main__":
     cli()
diff --git a/geosongpu_ci/tools/hws/client.py b/geosongpu_ci/tools/hws/client.py
index 24abec0..74f3f98 100755
--- a/geosongpu_ci/tools/hws/client.py
+++ b/geosongpu_ci/tools/hws/client.py
@@ -4,9 +4,9 @@
 
 
 def client_main(order: str, dump_name: str):
-    order = CLIENT_CMDS[order]
-    order["dump_name"] = dump_name
-    data = json.dumps(order)
+    filtered_order = CLIENT_CMDS[order]
+    filtered_order["dump_name"] = dump_name
+    data = json.dumps(filtered_order)
     server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
     server.connect(SOCKET_FILENAME)
     server.send(data.encode("utf8"))
diff --git a/geosongpu_ci/tools/hws/constants.py b/geosongpu_ci/tools/hws/constants.py
index 598dca2..66f3b08 100755
--- a/geosongpu_ci/tools/hws/constants.py
+++ b/geosongpu_ci/tools/hws/constants.py
@@ -26,10 +26,12 @@
 CLIENT_CMD_DUMP = "dump"
 CLIENT_CMD_TICK = "tick"
 
+DEFAULT_SAMPLERATE_IN_S = 0.1
+
 CLIENT_CMDS = {
     CLIENT_CMD_START: {
         "action": SERV_ORDER_START,
-        "dt": 0.01,
+        "dt": DEFAULT_SAMPLERATE_IN_S,
     },
     CLIENT_CMD_STOP: {"action": SERV_ORDER_STOP},
     CLIENT_CMD_DUMP: {
diff --git a/geosongpu_ci/tools/hws/graph.py b/geosongpu_ci/tools/hws/graph.py
index 0add21b..ee12d01 100644
--- a/geosongpu_ci/tools/hws/graph.py
+++ b/geosongpu_ci/tools/hws/graph.py
@@ -1,74 +1,72 @@
-from matplotlib import pyplot
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 import numpy as np
 from geosongpu_ci.tools.hws.constants import (
     HWS_HW_GPU,
     HWS_HARDWARE_SPECS,
 )
-from typing import Dict, Any
+from geosongpu_ci.tools.hws.analysis import load_data, energy_envelop_calculation
 
 COLOR_VRAM = "C4"
 
 
-def energy_envelop_calculation(cpu_psu_data: np.ndarray, gpu_psu_data: np.ndarray):
-    gpu_kW_envelop = np.trapz(gpu_psu_data / 1000)
-    cpu_kW_envelop = np.trapz(cpu_psu_data / 1000)
-    return gpu_kW_envelop, cpu_kW_envelop
-
-def load_data(
-    data_filepath: str,
-    data_format: str = "npz",
-) -> Dict[str, Any]:
-    if data_format != "npz":
-        raise NotImplementedError(f"Format {data_format} not implemented for graphing")
-    return np.load(data_filepath)
-
 def cli(
     data_filepath: str,
     data_format: str = "npz",
-    dynamic_gpu_load: bool = True,
+    data_range: slice = slice(None),
 ):
     d = load_data(data_filepath, data_format)
-    n = len(d["cpu_psu"])
-    s = slice(0, n)
-    print(n, s)
-    yd = np.arange(len(d["cpu_psu"][s]))
+    sample_count = len(d["cpu_psu"][data_range])
+    yd = np.arange(sample_count)
 
-    fig, ax1 = pyplot.subplots(figsize=(8, 8))
-    ax2 = pyplot.twinx()
-
-    ax1.plot(yd, d["gpu_psu"][s], label="GPU PSU(W)", linewidth=0.5)
-    ax1.plot(yd, d["gpu_exe_utl"][s], label="GPU Utilization(%)", linewidth=0.5)
-    ax2.plot(yd, d["gpu_mem"][s], label="GPU VRAM(Mb)", color=COLOR_VRAM, linewidth=0.5)
-    ax1.plot(yd, d["cpu_psu"][s], label="CPU PSU(W - extrapolated)", linewidth=0.5)
-    ax1.plot(yd, d["cpu_exe_utl"][s], label="CPU Utilization(%)", linewidth=0.5)
-
-    ax1.legend(
-        loc="upper center",
-        bbox_to_anchor=(0.5, 1.15),
-        ncol=2,
-        fancybox=True,
-        shadow=True,
+    fig = make_subplots(specs=[[{"secondary_y": True}]])
+    # Add traces
+    fig.add_trace(
+        go.Scatter(y=d["gpu_psu"][data_range], x=yd, name="GPU PSU(W)"),
+        secondary_y=False,
     )
-    ax1.set_ylabel("W/%", fontsize=10)
-    ax1.set_ylim(0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["PSU_TDP"])
-
-    ax2.legend(
-        loc="upper center",
-        bbox_to_anchor=(0.5, 1.05),
-        ncol=3,
-        fancybox=True,
-        shadow=True,
+    fig.add_trace(
+        go.Scatter(y=d["gpu_exe_utl"][data_range], x=yd, name="GPU Utilization(%)"),
+        secondary_y=False,
+    )
+    fig.add_trace(
+        go.Scatter(y=d["cpu_psu"][data_range], x=yd, name="CPU PSU(W - extrapolated)"),
+        secondary_y=False,
+    )
+    fig.add_trace(
+        go.Scatter(y=d["cpu_exe_utl"][data_range], x=yd, name="CPU Utilization(%)"),
+        secondary_y=False,
     )
-    ax2.set_ylabel("Mb", color=COLOR_VRAM, fontsize=10)
-    ax2.tick_params(axis="y", labelcolor="C4")
-    ax2.set_ylim(0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["MAX_VRAM"])
 
-    fig.savefig(data_filepath.replace(f".{data_format}", ""))
+    fig.add_trace(
+        go.Scatter(y=d["gpu_mem"][data_range], x=yd, name="GPU VRAM (Mb)"),
+        secondary_y=True,
+    )
 
-    gpu_kW_envelop, cpu_kW_envelop = energy_envelop_calculation(
-        d["cpu_psu"][s], d["gpu_psu"][s]
+    # Labels
+    fig.update_layout(
+        title_text="Hardware sensors",
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
     )
-    print(
-        f"Overall CPU W usage:{cpu_kW_envelop:.0f} kW\n",
-        f"Overall GPU W usage:{gpu_kW_envelop:.0f} kW",
+    fig.update_xaxes(title_text="Sample #")
+    fig.update_yaxes(
+        title_text="W or %",
+        secondary_y=False,
+        range=[0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["PSU_TDP"]],
     )
+    fig.update_yaxes(
+        title_text="Mb",
+        secondary_y=True,
+        range=[0, HWS_HARDWARE_SPECS[HWS_HW_GPU]["MAX_VRAM"]],
+    )
+
+    fig.write_image(data_filepath.replace(f".{data_format}", ".png"))
+
+    energy_envelop_calculation(d["cpu_psu"][data_range], d["gpu_psu"][data_range])
+
+
+# Useful for debug
+if __name__ == "__main__":
+    import sys
+
+    cli(sys.argv[1])
diff --git a/geosongpu_ci/utils/shell.py b/geosongpu_ci/utils/shell.py
index 2f9cfb3..21a8285 100644
--- a/geosongpu_ci/utils/shell.py
+++ b/geosongpu_ci/utils/shell.py
@@ -4,6 +4,7 @@
 import stat
 from geosongpu_ci.utils.progress import Progress
 from time import sleep
+from geosongpu_ci.pipeline.templates import find_template
 
 
 class ShellScript:
@@ -21,6 +22,19 @@ def path(self) -> str:
     def name(self) -> str:
         return self._name
 
+    def from_template(self, template_name: str):
+        """Materialize this script as a verbatim copy of a packaged template.
+
+        The template is located with `find_template`, written to `self.path`,
+        then `_make_executable()` is called. Returns `self` to allow chaining.
+        """
+        with open(find_template(template_name), "r") as source:
+            content = source.read()
+        with open(self.path, "w") as target:
+            target.write(content)
+        self._make_executable()
+        return self
+
     def write(
         self,
         shell_commands: List[str],
diff --git a/setup.cfg b/setup.cfg
index a06f8cd..db9514c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,7 +3,7 @@
 [flake8]
 # Recommend matching the black line length (default 88),
 # rather than using the flake8 default of 79:
-max-line-length = 88
+max-line-length = 110
 extend-ignore =
     # See https://github.com/PyCQA/pycodestyle/issues/373
     E203,
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b772948..881e448 100644
--- a/setup.py
+++ b/setup.py
@@ -23,12 +23,15 @@
         "pdoc",
         "pynvml",
         "psutil",
-        "matplotlib",
         "plotly",
-        "kaleido"
+        "kaleido",
     ],
     data_files=[
         ("./geosongpu/experiments", ["./experiments/experiments.yaml"]),
+        (
+            "./geosongpu/templates",
+            ["./geosongpu_ci/pipeline/templates/gpu-wrapper-slurm-mps.sh.tpl"],
+        ),
     ],
     entry_points={
         "console_scripts": [