Merge pull request #104 from stackhpc/upstream/2023.1-2024-09-02
Synchronise 2023.1 with upstream
priteau authored Sep 5, 2024
2 parents 7f8fa87 + 6b78420 commit 1456f6d
Showing 10 changed files with 325 additions and 132 deletions.
4 changes: 3 additions & 1 deletion doc/notification_samples/instance-create-error.json
@@ -18,7 +18,9 @@
"ip_addresses": [],
"launched_at": null,
"power_state": "pending",
"state": "building"
"state": "building",
"host": null,
"node": null
}
},
"priority":"ERROR",
4 changes: 3 additions & 1 deletion doc/source/cli/nova-manage.rst
@@ -1531,7 +1531,9 @@ command.
* - 5
- Instance state invalid (must be stopped and unlocked)
* - 6
- Instance is not attached to volume
- Volume is not attached to the instance
* - 7
- Connector host is not correct
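A minimal sketch (not part of the commit) of how an operator script might branch on these exit codes. The exact CLI form and wording below are assumptions based on the refresh() signature shown later in this diff:

# Illustrative wrapper around the documented exit codes of
# "nova-manage volume_attachment refresh".
import subprocess

def refresh_attachment(instance_uuid, volume_id, connector_path):
    result = subprocess.run(
        ['nova-manage', 'volume_attachment', 'refresh',
         instance_uuid, volume_id, connector_path])
    if result.returncode == 7:
        print('Connector host does not match the instance host; '
              're-run get_connector on the correct compute host.')
    elif result.returncode == 6:
        print('Volume is not attached to the instance.')
    elif result.returncode != 0:
        print(f'Refresh failed with exit code {result.returncode}.')
    return result.returncode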


Libvirt Commands
239 changes: 134 additions & 105 deletions nova/cmd/manage.py
@@ -22,6 +22,7 @@
"""

import collections
from contextlib import contextmanager
import functools
import os
import re
@@ -144,6 +145,33 @@ def format_dict(dct, dict_property="Property", dict_value='Value',
return encodeutils.safe_encode(pt.get_string()).decode()


@contextmanager
def locked_instance(cell_mapping, instance, reason):
"""Context manager to lock and unlock instance,
lock state will be restored regardless of the success or failure
of target functionality.
:param cell_mapping: instance-cell-mapping
:param instance: instance to be lock and unlock
:param reason: reason, why lock is required
"""

compute_api = api.API()

initial_state = 'locked' if instance.locked else 'unlocked'
if not instance.locked:
with context.target_cell(
context.get_admin_context(), cell_mapping) as cctxt:
compute_api.lock(cctxt, instance, reason=reason)
try:
yield
finally:
if initial_state == 'unlocked':
with context.target_cell(
context.get_admin_context(), cell_mapping) as cctxt:
compute_api.unlock(cctxt, instance)


class DbCommands(object):
"""Class for managing the main database."""

@@ -2998,10 +3026,8 @@ def _refresh(self, instance_uuid, volume_id, connector):
:param instance_uuid: UUID of instance
:param volume_id: ID of volume attached to the instance
:param connector: Connector with which to create the new attachment
:return status_code: volume-refresh status_code 0 on success
"""
volume_api = cinder.API()
compute_rpcapi = rpcapi.ComputeAPI()
compute_api = api.API()

ctxt = context.get_admin_context()
im = objects.InstanceMapping.get_by_instance_uuid(ctxt, instance_uuid)
@@ -3017,111 +3043,104 @@ def _refresh(self, instance_uuid, volume_id, connector):
state=instance.vm_state,
method='refresh connection_info (must be stopped)')

if instance.locked:
raise exception.InstanceInvalidState(
instance_uuid=instance_uuid, attr='locked', state='True',
method='refresh connection_info (must be unlocked)')

compute_api.lock(
cctxt, instance,
reason=(
f'Refreshing connection_info for BDM {bdm.uuid} '
f'associated with instance {instance_uuid} and volume '
f'{volume_id}.'))

# NOTE(lyarwood): Yes this is weird but we need to recreate the admin
# context here to ensure the lock above uses a unique request-id
# versus the following refresh and eventual unlock.
ctxt = context.get_admin_context()
with context.target_cell(ctxt, im.cell_mapping) as cctxt:
instance_action = None
new_attachment_id = None
try:
# Log this as an instance action so operators and users are
# aware that this has happened.
instance_action = objects.InstanceAction.action_start(
cctxt, instance_uuid,
instance_actions.NOVA_MANAGE_REFRESH_VOLUME_ATTACHMENT)

# Create a blank attachment to keep the volume reserved
new_attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance_uuid)['id']

# RPC call to the compute to cleanup the connections, which
# will in turn unmap the volume from the compute host
# TODO(lyarwood): Add delete_attachment as a kwarg to
# remove_volume_connection as is available in the private
# method within the manager.
locking_reason = (
f'Refreshing connection_info for BDM {bdm.uuid} '
f'associated with instance {instance_uuid} and volume '
f'{volume_id}.')

with locked_instance(im.cell_mapping, instance, locking_reason):
return self._do_refresh(
cctxt, instance, volume_id, bdm, connector)

def _do_refresh(self, cctxt, instance,
volume_id, bdm, connector):
volume_api = cinder.API()
compute_rpcapi = rpcapi.ComputeAPI()

new_attachment_id = None
try:
# Log this as an instance action so operators and users are
# aware that this has happened.
instance_action = objects.InstanceAction.action_start(
cctxt, instance.uuid,
instance_actions.NOVA_MANAGE_REFRESH_VOLUME_ATTACHMENT)

# Create a blank attachment to keep the volume reserved
new_attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance.uuid)['id']

# RPC call to the compute to cleanup the connections, which
# will in turn unmap the volume from the compute host
# TODO(lyarwood): Add delete_attachment as a kwarg to
# remove_volume_connection as is available in the private
# method within the manager.
if instance.host == connector['host']:
compute_rpcapi.remove_volume_connection(
cctxt, instance, volume_id, instance.host)
else:
msg = (
f"The compute host '{connector['host']}' in the "
f"connector does not match the instance host "
f"'{instance.host}'.")
raise exception.HostConflict(_(msg))

# Delete the existing volume attachment if present in the bdm.
# This isn't present when the original attachment was made
# using the legacy cinderv2 APIs before the cinderv3 attachment
# based APIs were present.
if bdm.attachment_id:
volume_api.attachment_delete(cctxt, bdm.attachment_id)

# Update the attachment with host connector, this regenerates
# the connection_info that we can now stash in the bdm.
new_connection_info = volume_api.attachment_update(
cctxt, new_attachment_id, connector,
bdm.device_name)['connection_info']

# Before we save it to the BDM ensure the serial is stashed as
# is done in various other codepaths when attaching volumes.
if 'serial' not in new_connection_info:
new_connection_info['serial'] = bdm.volume_id

# Save the new attachment id and connection_info to the DB
bdm.attachment_id = new_attachment_id
bdm.connection_info = jsonutils.dumps(new_connection_info)
bdm.save()

# Finally mark the attachment as complete, moving the volume
# status from attaching to in-use ahead of the instance
# restarting
volume_api.attachment_complete(cctxt, new_attachment_id)
return 0

# Delete the existing volume attachment if present in the bdm.
# This isn't present when the original attachment was made
# using the legacy cinderv2 APIs before the cinderv3 attachment
# based APIs were present.
if bdm.attachment_id:
volume_api.attachment_delete(cctxt, bdm.attachment_id)

# Update the attachment with host connector, this regenerates
# the connection_info that we can now stash in the bdm.
new_connection_info = volume_api.attachment_update(
cctxt, new_attachment_id, connector,
bdm.device_name)['connection_info']

# Before we save it to the BDM ensure the serial is stashed as
# is done in various other codepaths when attaching volumes.
if 'serial' not in new_connection_info:
new_connection_info['serial'] = bdm.volume_id

# Save the new attachment id and connection_info to the DB
bdm.attachment_id = new_attachment_id
bdm.connection_info = jsonutils.dumps(new_connection_info)
finally:
# If the bdm.attachment_id wasn't updated make sure we clean
# up any attachments created during the run.
bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
cctxt, volume_id, instance.uuid)
if (
new_attachment_id and
bdm.attachment_id != new_attachment_id
):
volume_api.attachment_delete(cctxt, new_attachment_id)

# If we failed during attachment_update the bdm.attachment_id
# has already been deleted so recreate it now to ensure the
# volume is still associated with the instance and clear the
# now stale connection_info.
try:
volume_api.attachment_get(cctxt, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
bdm.attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance.uuid)['id']
bdm.connection_info = None
bdm.save()

# Finally mark the attachment as complete, moving the volume
# status from attaching to in-use ahead of the instance
# restarting
volume_api.attachment_complete(cctxt, new_attachment_id)
return 0

finally:
# If the bdm.attachment_id wasn't updated make sure we clean
# up any attachments created during the run.
bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
cctxt, volume_id, instance_uuid)
if (
new_attachment_id and
bdm.attachment_id != new_attachment_id
):
volume_api.attachment_delete(cctxt, new_attachment_id)

# If we failed during attachment_update the bdm.attachment_id
# has already been deleted so recreate it now to ensure the
# volume is still associated with the instance and clear the
# now stale connection_info.
try:
volume_api.attachment_get(cctxt, bdm.attachment_id)
except exception.VolumeAttachmentNotFound:
bdm.attachment_id = volume_api.attachment_create(
cctxt, volume_id, instance_uuid)['id']
bdm.connection_info = None
bdm.save()

# Finish the instance action if it was created and started
# TODO(lyarwood): While not really required we should store
# the exec and traceback in here on failure.
if instance_action:
instance_action.finish()

# NOTE(lyarwood): As above we need to unlock the instance with
# a fresh context and request-id to keep it unique. It's safe
# to assume that the instance is locked as this point as the
# earlier call to lock isn't part of this block.
with context.target_cell(
context.get_admin_context(),
im.cell_mapping
) as u_cctxt:
compute_api.unlock(u_cctxt, instance)
# Finish the instance action if it was created and started
# TODO(lyarwood): While not really required we should store
# the exec and traceback in here on failure.
if instance_action:
instance_action.finish()

@action_description(
_("Refresh the connection info for a given volume attachment"))
@@ -3145,6 +3164,7 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
* 4: Instance does not exist.
* 5: Instance state invalid.
* 6: Volume is not attached to instance.
* 7: Connector host is not correct.
"""
try:
# TODO(lyarwood): Make this optional and provide a rpcapi capable
@@ -3160,6 +3180,12 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
# Refresh the volume attachment
return self._refresh(instance_uuid, volume_id, connector)

except exception.HostConflict as e:
print(
f"The command 'nova-manage volume_attachment get_connector' "
f"may have been run on the wrong compute host. Or the "
f"instance host may be wrong and in need of repair.\n{e}")
return 7
except exception.VolumeBDMNotFound as e:
print(str(e))
return 6
@@ -3172,12 +3198,15 @@ def refresh(self, instance_uuid=None, volume_id=None, connector_path=None):
) as e:
print(str(e))
return 4
except (ValueError, OSError):
except ValueError as e:
print(
f'Failed to open {connector_path}. Does it contain valid '
f'connector_info data?'
f'connector_info data?\nError: {str(e)}'
)
return 3
except OSError as e:
print(str(e))
return 3
except exception.InvalidInput as e:
print(str(e))
return 2
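The refactor above moves the lock/unlock bookkeeping out of _refresh() and into the locked_instance context manager, which restores the instance's original lock state whether the wrapped refresh succeeds or raises. A self-contained sketch of the same restore-on-exit pattern, using a dummy lock API rather than Nova's (all names below are illustrative):

from contextlib import contextmanager

@contextmanager
def restored_lock_state(instance, lock, unlock, reason):
    """Lock the instance if needed and restore the original state on exit."""
    was_locked = instance['locked']
    if not was_locked:
        lock(instance, reason)   # only acquire when not already locked
    try:
        yield
    finally:
        if not was_locked:
            unlock(instance)     # never drop a pre-existing lock

# Usage sketch with in-memory stand-ins for the compute API.
instance = {'locked': False}
lock = lambda inst, reason: inst.update(locked=True)
unlock = lambda inst: inst.update(locked=False)

with restored_lock_state(instance, lock, unlock, 'refreshing connection_info'):
    pass  # the refresh work would happen here
assert instance['locked'] is False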
3 changes: 2 additions & 1 deletion nova/compute/claims.py
@@ -64,6 +64,7 @@ def __init__(
super().__init__(migration=migration)
# Stash a copy of the instance at the current point of time
self.instance = instance.obj_clone()
self.instance_ref = instance
self.nodename = nodename
self.tracker = tracker
self._pci_requests = pci_requests
@@ -82,7 +83,7 @@ def abort(self):
been aborted.
"""
LOG.debug("Aborting claim: %s", self, instance=self.instance)
self.tracker.abort_instance_claim(self.context, self.instance,
self.tracker.abort_instance_claim(self.context, self.instance_ref,
self.nodename)

def _claim_test(self, compute_node, limits=None):
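The claims.py change keeps a live reference to the instance alongside the point-in-time clone, so abort() hands the resource tracker the object it actually mutates rather than a stale copy. A toy sketch of why the distinction matters, with plain dicts standing in for Nova objects (illustrative only):

import copy

instance = {'uuid': 'abc', 'flavor': 'm1.small'}

snapshot = copy.deepcopy(instance)   # like instance.obj_clone(): frozen copy
instance_ref = instance              # like self.instance_ref: live reference

instance['flavor'] = 'm1.large'      # later update by other code

assert instance_ref['flavor'] == 'm1.large'   # reference sees the change
assert snapshot['flavor'] == 'm1.small'       # clone does not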
4 changes: 4 additions & 0 deletions nova/exception.py
@@ -2530,3 +2530,7 @@ class NotSupportedComputeForEvacuateV295(NotSupported):
"instance on destination. To evacuate before upgrades are "
"complete please use an older microversion. Required version "
"for compute %(expected), current version %(currently)s")


class HostConflict(Exception):
pass
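HostConflict is a plain Exception subclass; its only consumer added in this commit is the nova-manage handler shown earlier, which turns it into exit code 7. A condensed, self-contained sketch of that raise/catch flow (everything except the HostConflict name is a placeholder):

class HostConflict(Exception):
    pass

def check_connector(connector, instance_host):
    if connector['host'] != instance_host:
        raise HostConflict(
            f"The compute host '{connector['host']}' in the connector "
            f"does not match the instance host '{instance_host}'.")

def refresh(connector, instance_host):
    try:
        check_connector(connector, instance_host)
        return 0
    except HostConflict as e:
        print(e)
        return 7

assert refresh({'host': 'compute-2'}, 'compute-1') == 7
assert refresh({'host': 'compute-1'}, 'compute-1') == 0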
4 changes: 2 additions & 2 deletions nova/pci/manager.py
@@ -407,7 +407,7 @@ def free_instance_allocations(
for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.ALLOCATED and
dev.instance_uuid == instance['uuid']):
self._free_device(dev)
self._free_device(dev, instance)

def free_instance_claims(
self, context: ctx.RequestContext, instance: 'objects.Instance',
@@ -423,7 +423,7 @@ def free_instance_claims(
for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.CLAIMED and
dev.instance_uuid == instance['uuid']):
self._free_device(dev)
self._free_device(dev, instance)

def free_instance(
self, context: ctx.RequestContext, instance: 'objects.Instance',
(Diff for the remaining 4 changed files is not shown.)
