From 0cdc9bbc967b14a62dba26b1d80a3eeff7d8dce2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Barb=C3=A1chano?= <pablob@amazon.com>
Date: Mon, 6 May 2024 15:57:54 +0200
Subject: [PATCH] test: stop all monitors before killing microvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a test fails, the FCmetricsMonitor keeps running in some tests. After
we kill the microvm and remove the resources, the monitor complains it
cannot find the metrics file.

This is tricky to fix simply since we don't have a handle to the monitor
either at fixture teardown or in the Microvm class.

Add a hook in the Microvm class to register monitors, so we can stop all
of them before killing the microvm.

Also adapt existing monitors to fix this pattern.

Signed-off-by: Pablo Barbáchano <pablob@amazon.com>
---
 tests/framework/microvm.py    |  9 ++++++---
 tests/host_tools/fcmetrics.py | 16 +++++++++-------
 tests/host_tools/memory.py    |  6 ++++++
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py
index 538ae1704a52..fccf032be893 100644
--- a/tests/framework/microvm.py
+++ b/tests/framework/microvm.py
@@ -204,9 +204,11 @@ def __init__(
         if int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", 1)) > 1:
             self.time_api_requests = False
 
+        self.monitors = []
         self.memory_monitor = None
         if monitor_memory:
             self.memory_monitor = MemoryMonitor(self)
+            self.monitors.append(self.memory_monitor)
 
         self.api = None
         self.log_file = None
@@ -237,6 +239,10 @@ def kill(self):
         """All clean up associated with this microVM should go here."""
         # pylint: disable=subprocess-run-check
 
+        # Stop any registered monitors
+        for monitor in self.monitors:
+            monitor.stop()
+
         # We start with vhost-user backends,
         # because if we stop Firecracker first, the backend will want
         # to exit as well and this will cause a race condition.
@@ -286,9 +292,6 @@ def kill(self):
             self._validate_api_response_times()
 
         if self.memory_monitor:
-            if self.memory_monitor.is_alive():
-                self.memory_monitor.signal_stop()
-                self.memory_monitor.join(timeout=1)
             self.memory_monitor.check_samples()
 
     def _validate_api_response_times(self):
diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py
index 1306c22bf59c..53caf4d4adcd 100644
--- a/tests/host_tools/fcmetrics.py
+++ b/tests/host_tools/fcmetrics.py
@@ -500,6 +500,7 @@ class FCMetricsMonitor(Thread):
     def __init__(self, vm, timer=60):
         Thread.__init__(self, daemon=True)
         self.vm = vm
+        vm.monitors.append(self)
         self.timer = timer
 
         self.metrics_index = 0
@@ -538,13 +539,14 @@ def stop(self):
         in sleep when stop is called and once it wakes out of sleep
         the "vm" might not be avaiable to provide the metrics.
         """
-        self.running = False
-        # wait for the running thread to finish
-        # this should also avoid any race condition leading to
-        # uploading the same metrics twice
-        self.join()
-        self.vm.api.actions.put(action_type="FlushMetrics")
-        self._flush_metrics()
+        if self.is_alive():
+            self.running = False
+            # wait for the running thread to finish
+            # this should also avoid any race condition leading to
+            # uploading the same metrics twice
+            self.join()
+            self.vm.api.actions.put(action_type="FlushMetrics")
+            self._flush_metrics()
 
     def run(self):
         self.running = True
diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py
index fe9e7af931ea..690cda387016 100644
--- a/tests/host_tools/memory.py
+++ b/tests/host_tools/memory.py
@@ -46,6 +46,12 @@ def signal_stop(self):
         """Signal that the thread should stop."""
         self._should_stop = True
 
+    def stop(self):
+        """Stop the thread"""
+        if self.is_alive():
+            self.signal_stop()
+            self.join(timeout=1)
+
     def run(self):
         """Thread for monitoring the RSS memory usage of a Firecracker process.