Merge pull request #104 from stackhpc/upstream/2023.1-2024-09-02
Synchronise 2023.1 with upstream
priteau authored Sep 5, 2024
2 parents 7f8fa87 + 6b78420 commit 1456f6d
Showing 10 changed files with 325 additions and 132 deletions.
4 changes: 3 additions & 1 deletion doc/notification_samples/instance-create-error.json
@@ -18,7 +18,9 @@
"ip_addresses": [],
"launched_at": null,
"power_state": "pending",
"state": "building"
"state": "building",
"host": null,
"node": null
}
},
"priority":"ERROR",
4 changes: 3 additions & 1 deletion doc/source/cli/nova-manage.rst
@@ -1531,7 +1531,9 @@ command.
* - 5
- Instance state invalid (must be stopped and unlocked)
* - 6
- Instance is not attached to volume
- Volume is not attached to the instance
* - 7
- Connector host is not correct
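A minimal sketch (not part of the commit) of how an operator script might branch on these exit codes. The exact CLI form and wording below are assumptions based on the refresh() signature shown later in this diff:

# Illustrative wrapper around the documented exit codes of
# "nova-manage volume_attachment refresh".
import subprocess

def refresh_attachment(instance_uuid, volume_id, connector_path):
    result = subprocess.run(
        ['nova-manage', 'volume_attachment', 'refresh',
         instance_uuid, volume_id, connector_path])
    if result.returncode == 7:
        print('Connector host does not match the instance host; '
              're-run get_connector on the correct compute host.')
    elif result.returncode == 6:
        print('Volume is not attached to the instance.')
    elif result.returncode != 0:
        print(f'Refresh failed with exit code {result.returncode}.')
    return result.returncode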


Libvirt Commands
239 changes: 134 additions & 105 deletions nova/cmd/manage.py
@@ -22,6 +22,7 @@
"""

import collections
from contextlib import contextmanager
import functools
import os
import re
@@ -144,6 +145,33 @@ def format_dict(dct, dict_property="Property", dict_value='Value',
return encodeutils.safe_encode(pt.get_string()).decode()


@contextmanager
def locked_instance(cell_mapping, instance, reason):
"""Context manager to lock and unlock instance,
lock state will be restored regardless of the success or failure
of target functionality.
:param cell_mapping: instance-cell-mapping
:param instance: instance to be lock and unlock
:param reason: reason, why lock is required
"""

compute_api = api.API()

initial_state = 'locked' if instance.locked else 'unlocked'
if not instance.locked:
with context.target_cell(
context.get_admin_context(), cell_mapping) as cctxt:
compute_api.lock(cctxt, instance, reason=reason)
try:
yield
finally:
if initial_state == 'unlocked':
with context.target_cell(
context.get_admin_context(), cell_mapping) as cctxt:
compute_api.unlock(cctxt, instance)


class DbCommands(object):
"""Class for managing the main database."""

@@ -2998,10 +3026,8 @@ def _refresh(self, instance_uuid, volume_id, connector):
:param instance_uuid: UUID of instance
:param volume_id: ID of volume attached to the instance
:param connector: Connector with which to create the new attachment
:return status_code: volume-refresh status_code 0 on success
"""
volume_api = cinder.API()
compute_rpcapi = rpcapi.ComputeAPI()
compute_api = api.API()

ctxt = context.get_admin_context()
im = objects.InstanceMapping.get_by_instance_uuid(ctxt, instance_uuid)
@@ -3017,111 +3043,104 @@ def _refresh(self, instance_uuid, volume_id, connector):
state=instance.vm_state,
method='refresh connection_info (must be stopped)')

if instance.locked:
raise exception.InstanceInvalidState(
instance_uuid=instance_uuid, attr='locked', state='True',
method='refresh connection_info (must be unlocked)')

compute_api.lock(
cctxt, instance,
reason=(
f'Refreshing connection_info for BDM {bdm.uuid} '
f'associated with instance {instance_uuid} and volume '
f'{volume_id}.'))

# NOTE(lyarwood): Yes this is weird but we need to recreate the admin
# context here to ensure the lock above uses a unique request-id
# versus the following refresh and eventual unlock.
ctxt = context.get_admin_context()
with context.target_cell(ctxt, im.cell_mapping) as cctxt:
instance_action = None
new_attachment_id = None
try:
# Log this as an instance action so operators and users are
# aware that this has happened.
instance_action = objects.InstanceAction.action_start(
cctxt, instance_uuid,
instance_actions.NOVA_MANAGE_REFRESH_VOLUME_ATTACHMENT)

# Create a blank attachment to keep the volume reserved
new_attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance_uuid)['id']

# RPC call to the compute to cleanup the connections, which
# will in turn unmap the volume from the compute host
# TODO(lyarwood): Add delete_attachment as a kwarg to
# remove_volume_connection as is available in the private
# method within the manager.
locking_reason = (
f'Refreshing connection_info for BDM {bdm.uuid} '
f'associated with instance {instance_uuid} and volume '
f'{volume_id}.')

with locked_instance(im.cell_mapping, instance, locking_reason):
return self._do_refresh(
cctxt, instance, volume_id, bdm, connector)

def _do_refresh(self, cctxt, instance,
volume_id, bdm, connector):
volume_api = cinder.API()
compute_rpcapi = rpcapi.ComputeAPI()

new_attachment_id = None
try:
# Log this as an instance action so operators and users are
# aware that this has happened.
instance_action = objects.InstanceAction.action_start(
cctxt, instance.uuid,
instance_actions.NOVA_MANAGE_REFRESH_VOLUME_ATTACHMENT)

# Create a blank attachment to keep the volume reserved
new_attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance.uuid)['id']

# RPC call to the compute to cleanup the connections, which
# will in turn unmap the volume from the compute host
# TODO(lyarwood): Add delete_attachment as a kwarg to
# remove_volume_connection as is available in the private
# method within the manager.
if instance.host == connector['host']:
compute_rpcapi.remove_volume_connection(
cctxt, instance, volume_id, instance.host)
else:
msg = (
f"The compute host '{connector['host']}' in the "
f"connector does not match the instance host "
f"'{instance.host}'.")
raise exception.HostConflict(_(msg))

# Delete the existing volume attachment if present in the bdm.
# This isn't present when the original attachment was made
# using the legacy cinderv2 APIs before the cinderv3 attachment
# based APIs were present.
if bdm.attachment_id:
volume_api.attachment_delete(cctxt, bdm.attachment_id)

# Update the attachment with host connector, this regenerates
# the connection_info that we can now stash in the bdm.
new_connection_info = volume_api.attachment_update(
cctxt, new_attachment_id, connector,
bdm.device_name)['connection_info']

# Before we save it to the BDM ensure the serial is stashed as
# is done in various other codepaths when attaching volumes.
if 'serial' not in new_connection_info:
new_connection_info['serial'] = bdm.volume_id

# Save the new attachment id and connection_info to the DB
bdm.attachment_id = new_attachment_id
bdm.connection_info = jsonutils.dumps(new_connection_info)
bdm.save()

# Finally mark the attachment as complete, moving the volume
# status from attaching to in-use ahead of the instance
# restarting
volume_api.attachment_complete(cctxt, new_attachment_id)
return 0

# Delete the existing volume attachment if present in the bdm.
# This isn't present when the original attachment was made
# using the legacy cinderv2 APIs before the cinderv3 attachment
# based APIs were present.
if bdm.attachment_id:
volume_api.attachment_delete(cctxt, bdm.attachment_id)

# Update the attachment with host connector, this regenerates
# the connection_info that we can now stash in the bdm.
new_connection_info = volume_api.attachment_update(
cctxt, new_attachment_id, connector,
bdm.device_name)['connection_info']

# Before we save it to the BDM ensure the serial is stashed as
# is done in various other codepaths when attaching volumes.
if 'serial' not in new_connection_info:
new_connection_info['serial'] = bdm.volume_id

# Save the new attachment id and connection_info to the DB
bdm.attachment_id = new_attachment_id
bdm.connection_info = jsonutils.dumps(new_connection_info)
finally:
# If the bdm.attachment_id wasn't updated make sure we clean
# up any attachments created during the run.
bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
cctxt, volume_id, instance.uuid)
if (
new_attachment_id and
bdm.attachment_id != new_attachment_id
):
volume_api.attachment_delete(cctxt, new_attachment_id)

# If we failed during attachment_update the bdm.attachment_id
# has already been deleted so recreate it now to ensure the
# volume is still associated with the instance and clear the
# now stale connection_info.
try:
volume_api.attachment_get(cctxt, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
bdm.attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance.uuid)['id']
bdm.connection_info = None
bdm.save()

# Finally mark the attachment as complete, moving the volume
# status from attaching to in-use ahead of the instance
# restarting
volume_api.attachment_complete(cctxt, new_attachment_id)
return 0

finally:
# If the bdm.attachment_id wasn't updated make sure we clean
# up any attachments created during the run.
bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
cctxt, volume_id, instance_uuid)
if (
new_attachment_id and
bdm.attachment_id != new_attachment_id
):
volume_api.attachment_delete(cctxt, new_attachment_id)

# If we failed during attachment_update the bdm.attachment_id
# has already been deleted so recreate it now to ensure the
# volume is still associated with the instance and clear the
# now stale connection_info.
try:
volume_api.attachment_get(cctxt, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
bdm.attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance_uuid)['id']
bdm.connection_info = None
bdm.save()

# Finish the instance action if it was created and started
# TODO(lyarwood): While not really required we should store
# the exec and traceback in here on failure.
if instance_action:
instance_action.finish()

# NOTE(lyarwood): As above we need to unlock the instance with
# a fresh context and request-id to keep it unique. It's safe
# to assume that the instance is locked as this point as the
# earlier call to lock isn't part of this block.
with context.target_cell(
context.get_admin_context(),
im.cell_mapping
) as u_cctxt:
compute_api.unlock(u_cctxt, instance)
# Finish the instance action if it was created and started
# TODO(lyarwood): While not really required we should store
# the exec and traceback in here on failure.
if instance_action:
instance_action.finish()

@action_description(
_("Refresh the connection info for a given volume attachment"))
@@ -3145,6 +3164,7 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
* 4: Instance does not exist.
* 5: Instance state invalid.
* 6: Volume is not attached to instance.
* 7: Connector host is not correct.
"""
try:
# TODO(lyarwood): Make this optional and provide a rpcapi capable
@@ -3160,6 +3180,12 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
# Refresh the volume attachment
return self._refresh(instance_uuid, volume_id, connector)

except exception.HostConflict as e:
print(
f"The command 'nova-manage volume_attachment get_connector' "
f"may have been run on the wrong compute host. Or the "
f"instance host may be wrong and in need of repair.\n{e}")
return 7
except exception.VolumeBDMNotFound as e:
print(str(e))
return 6
@@ -3172,12 +3198,15 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
) as e:
print(str(e))
return 4
except (ValueError, OSError):
except ValueError as e:
print(
f'Failed to open {connector_path}. Does it contain valid '
f'connector_info data?'
f'connector_info data?\nError: {str(e)}'
)
return 3
except OSError as e:
print(str(e))
return 3
except exception.InvalidInput as e:
print(str(e))
return 2
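The refactor above moves the lock/unlock bookkeeping out of _refresh() and into the locked_instance context manager, which restores the instance's original lock state whether the wrapped refresh succeeds or raises. A self-contained sketch of the same restore-on-exit pattern, using a dummy lock API rather than Nova's (all names below are illustrative):

from contextlib import contextmanager

@contextmanager
def restored_lock_state(instance, lock, unlock, reason):
    """Lock the instance if needed and restore the original state on exit."""
    was_locked = instance['locked']
    if not was_locked:
        lock(instance, reason)   # only acquire when not already locked
    try:
        yield
    finally:
        if not was_locked:
            unlock(instance)     # never drop a pre-existing lock

# Usage sketch with in-memory stand-ins for the compute API.
instance = {'locked': False}
lock = lambda inst, reason: inst.update(locked=True)
unlock = lambda inst: inst.update(locked=False)

with restored_lock_state(instance, lock, unlock, 'refreshing connection_info'):
    pass  # the refresh work would happen here
assert instance['locked'] is False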
3 changes: 2 additions & 1 deletion nova/compute/claims.py
@@ -64,6 +64,7 @@ def __init__(
super().__init__(migration=migration)
# Stash a copy of the instance at the current point of time
self.instance = instance.obj_clone()
self.instance_ref = instance
self.nodename = nodename
self.tracker = tracker
self._pci_requests = pci_requests
@@ -82,7 +83,7 @@ def abort(self):
been aborted.
"""
LOG.debug("Aborting claim: %s", self, instance=self.instance)
self.tracker.abort_instance_claim(self.context, self.instance,
self.tracker.abort_instance_claim(self.context, self.instance_ref,
self.nodename)

def _claim_test(self, compute_node, limits=None):
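The claims.py change keeps a live reference to the instance alongside the point-in-time clone, so abort() hands the resource tracker the object it actually mutates rather than a stale copy. A toy sketch of why the distinction matters, with plain dicts standing in for Nova objects (illustrative only):

import copy

instance = {'uuid': 'abc', 'flavor': 'm1.small'}

snapshot = copy.deepcopy(instance)   # like instance.obj_clone(): frozen copy
instance_ref = instance              # like self.instance_ref: live reference

instance['flavor'] = 'm1.large'      # later update by other code

assert instance_ref['flavor'] == 'm1.large'   # reference sees the change
assert snapshot['flavor'] == 'm1.small'       # clone does not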
4 changes: 4 additions & 0 deletions nova/exception.py
@@ -2530,3 +2530,7 @@ class NotSupportedComputeForEvacuateV295(NotSupported):
"instance on destination. To evacuate before upgrades are "
"complete please use an older microversion. Required version "
"for compute %(expected), current version %(currently)s")


class HostConflict(Exception):
pass
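HostConflict is a plain Exception subclass; its only consumer added in this commit is the nova-manage handler shown earlier, which turns it into exit code 7. A condensed, self-contained sketch of that raise/catch flow (everything except the HostConflict name is a placeholder):

class HostConflict(Exception):
    pass

def check_connector(connector, instance_host):
    if connector['host'] != instance_host:
        raise HostConflict(
            f"The compute host '{connector['host']}' in the connector "
            f"does not match the instance host '{instance_host}'.")

def refresh(connector, instance_host):
    try:
        check_connector(connector, instance_host)
        return 0
    except HostConflict as e:
        print(e)
        return 7

assert refresh({'host': 'compute-2'}, 'compute-1') == 7
assert refresh({'host': 'compute-1'}, 'compute-1') == 0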
4 changes: 2 additions & 2 deletions nova/pci/manager.py
@@ -407,7 +407,7 @@ def free_instance_allocations(
for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.ALLOCATED and
dev.instance_uuid == instance['uuid']):
self._free_device(dev)
self._free_device(dev, instance)

def free_instance_claims(
self, context: ctx.RequestContext, instance: 'objects.Instance',
@@ -423,7 +423,7 @@ def free_instance_claims(
for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.CLAIMED and
dev.instance_uuid == instance['uuid']):
self._free_device(dev)
self._free_device(dev, instance)

def free_instance(
self, context: ctx.RequestContext, instance: 'objects.Instance',
(Diff for the remaining 4 changed files is not shown.)
