Jg ironic rebalance #5

Open
wants to merge 5 commits into base: stackhpc/train
33 changes: 27 additions & 6 deletions nova/compute/manager.py
@@ -8823,6 +8823,8 @@ def update_available_resource(self, context, startup=False):
use_slave=True,
startup=startup)

self.rt.clean_compute_node_cache(compute_nodes_in_db)

# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
@@ -8831,12 +8833,31 @@ def update_available_resource(self, context, startup=False):
"nodes are %(nodes)s",
{'id': cn.id, 'hh': cn.hypervisor_hostname,
'nodes': nodenames})
cn.destroy()
self.rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations and inventory.
self.reportclient.delete_resource_provider(context, cn,
cascade=True)
try:
cn.destroy()
except exception.ComputeHostNotFound:
# NOTE(mgoddard): it's possible that another compute
# service took ownership of this compute node since we
# queried it due to a rebalance, and this will cause the
# deletion to fail. Ignore the error in that case.
LOG.info("Ignoring failure to delete orphan compute node "
"%(id)s on hypervisor host %(hh)s due to "
"possible node rebalance",
{'id': cn.id, 'hh': cn.hypervisor_hostname})
self.rt.remove_node(cn.hypervisor_hostname)
self.reportclient.invalidate_resource_provider(cn.uuid)
else:
self.rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations.
try:
self.reportclient.delete_resource_provider(
context, cn, cascade=True)
except keystone_exception.ClientException as e:
LOG.error(
"Failed to delete compute node resource provider "
"for compute node %s: %s",
cn.uuid, six.text_type(e))

for nodename in nodenames:
self._update_available_resource_for_node(context, nodename,
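The net effect of this hunk: the resource tracker cache is reconciled against the DB before orphan handling, and a destroy that fails because of a rebalance is downgraded to a local cache invalidation instead of a placement deletion. A condensed, self-contained sketch of that flow (rt, report and destroy are toy stand-ins, not the real ResourceTracker or SchedulerReportClient):

# Condensed sketch of the new orphan-handling flow; stand-in objects only.
class ComputeHostNotFound(Exception):
    """Raised when the compute node is no longer owned by this host."""


def handle_orphans(db_nodes, driver_nodenames, rt, report, destroy):
    # Reconcile the local cache with the DB before touching orphans, so a
    # node that moved to another host does not linger in the tracker.
    rt.clean_compute_node_cache(db_nodes)
    for cn in db_nodes:
        if cn.hypervisor_hostname in driver_nodenames:
            continue  # still reported by the driver, not an orphan
        try:
            destroy(cn)
        except ComputeHostNotFound:
            # Another compute service took ownership since the DB query
            # (a rebalance): clean local caches only, leave placement alone.
            rt.remove_node(cn.hypervisor_hostname)
            report.invalidate_resource_provider(cn.uuid)
        else:
            rt.remove_node(cn.hypervisor_hostname)
            report.delete_resource_provider(cn, cascade=True)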
18 changes: 18 additions & 0 deletions nova/compute/resource_tracker.py
@@ -1801,3 +1801,21 @@ def free_pci_device_claims_for_instance(self, context, instance):
"""
self.pci_tracker.free_instance_claims(context, instance)
self.pci_tracker.save(context)

@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def clean_compute_node_cache(self, compute_nodes_in_db):
"""Clean the compute node cache of any nodes that no longer exist.

:param compute_nodes_in_db: list of ComputeNode objects from the DB.
"""
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
for cn in compute_nodes_in_db}
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames

for stale_cn in stale_cns:
# NOTE(mgoddard): we have found a node in the cache that has no
# compute node in the DB. This could be due to a node rebalance
# where another compute service took ownership of the node. Clean
# up the cache.
self.remove_node(stale_cn)
self.reportclient.invalidate_resource_provider(stale_cn)
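clean_compute_node_cache reduces to a set difference between the locally cached nodenames and the hypervisor hostnames returned from the DB; anything cached but absent from the DB is assumed to have been rebalanced away. A self-contained illustration of that semantics, with a plain dict standing in for self.compute_nodes:

# Plain-Python illustration of the stale-node calculation.
cached_nodes = {'node1': object(), 'node2': object()}  # self.compute_nodes stand-in
db_hostnames = {'node1'}                                # hostnames from compute_nodes_in_db

stale = set(cached_nodes) - db_hostnames
for nodename in stale:
    cached_nodes.pop(nodename, None)  # remove_node(nodename)
    # invalidate_resource_provider(nodename) would follow in the real method.

assert stale == {'node2'} and 'node2' not in cached_nodes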
5 changes: 3 additions & 2 deletions nova/db/api.py
@@ -344,15 +344,16 @@ def compute_node_update(context, compute_id, values):
return IMPL.compute_node_update(context, compute_id, values)


def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a compute node from the database.

:param context: The security context
:param compute_id: ID of the compute node
:param compute_host: Hostname of the compute service

Raises ComputeHostNotFound if the compute node with the given ID doesn't exist or is not owned by the given host.
"""
return IMPL.compute_node_delete(context, compute_id)
return IMPL.compute_node_delete(context, compute_id, compute_host)


def compute_node_statistics(context):
21 changes: 17 additions & 4 deletions nova/db/sqlalchemy/api.py
@@ -690,16 +690,29 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):


@pick_context_manager_writer
def compute_node_create(context, values):
def _compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data.
"""
convert_objects_related_datetimes(values)

compute_node_ref = models.ComputeNode()
compute_node_ref.update(values)
compute_node_ref.save(context.session)
return compute_node_ref


# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
# so that we get a separate transaction in the exception handler. This avoids
# an error message about inactive DB sessions during a transaction rollback.
# See https://bugs.launchpad.net/nova/+bug/1853159.
def compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data. Will restore a soft deleted compute node if a
UUID has been explicitly requested.
"""
try:
compute_node_ref.save(context.session)
compute_node_ref = _compute_node_create(context, values)
except db_exc.DBDuplicateEntry:
with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
# Check to see if we have a (soft) deleted ComputeNode with the
@@ -763,10 +776,10 @@ def compute_node_update(context, compute_id, values):


@pick_context_manager_writer
def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, compute_host):
"""Delete a ComputeNode record."""
result = model_query(context, models.ComputeNode).\
filter_by(id=compute_id).\
filter_by(id=compute_id, host=compute_host).\
soft_delete(synchronize_session=False)

if not result:
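The extra host filter is what turns a racing delete into a detectable no-op: the soft delete matches zero rows once the recorded host has changed, and compute_node_delete raises ComputeHostNotFound on a zero result. A self-contained sqlite3 illustration of that mechanism (toy schema, not Nova's real compute_nodes table):

# sqlite3 illustration: a host-scoped soft delete matches nothing once the
# node has been rebalanced to a different host.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE compute_nodes (id INTEGER, host TEXT, deleted INTEGER)')
conn.execute("INSERT INTO compute_nodes VALUES (1, 'host1', 0)")  # now owned by host1

# host2 still believes it owns node 1 and attempts the soft delete.
cur = conn.execute(
    "UPDATE compute_nodes SET deleted = id "
    "WHERE id = ? AND host = ? AND deleted = 0",
    (1, 'host2'))
print(cur.rowcount)  # 0 -> the real code raises ComputeHostNotFound here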
2 changes: 1 addition & 1 deletion nova/objects/compute_node.py
@@ -354,7 +354,7 @@ def save(self, prune_stats=False):

@base.remotable
def destroy(self):
db.compute_node_delete(self._context, self.id)
db.compute_node_delete(self._context, self.id, self.host)

def update_from_virt_driver(self, resources):
# NOTE(pmurray): the virt driver provides a dict of values that
17 changes: 12 additions & 5 deletions nova/scheduler/client/report.py
@@ -677,11 +677,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None):
if resp:
LOG.info("Deleted resource provider %s", rp_uuid)
# clean the caches
try:
self._provider_tree.remove(rp_uuid)
except ValueError:
pass
self._association_refresh_time.pop(rp_uuid, None)
self.invalidate_resource_provider(rp_uuid)
return

msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2155,6 +2151,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
# for backward compatibility.
pass

def invalidate_resource_provider(self, name_or_uuid):
"""Invalidate the cache for a resource provider.

:param name_or_uuid: Name or UUID of the resource provider to look up.
"""
try:
self._provider_tree.remove(name_or_uuid)
except ValueError:
pass
self._association_refresh_time.pop(name_or_uuid, None)

def get_provider_by_name(self, context, name):
"""Queries the placement API for resource provider information matching
a supplied name.
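invalidate_resource_provider factors the cache eviction that _delete_provider already performed into a method the compute manager can call on a rebalance, without deleting anything in placement. A self-contained sketch of the eviction behaviour, with plain dicts standing in for the provider tree and the association refresh-time cache:

# Dict stand-ins for the report client's local caches.
provider_tree = {'rp-uuid-1': {'name': 'fake-node'}}
association_refresh_time = {'rp-uuid-1': 1234567890.0}


def invalidate_resource_provider(name_or_uuid):
    # Mirrors the real method: evicting an unknown provider is not an error,
    # and dropping the timestamp forces a refresh on the next periodic.
    try:
        del provider_tree[name_or_uuid]
    except KeyError:
        pass
    association_refresh_time.pop(name_or_uuid, None)


invalidate_resource_provider('rp-uuid-1')
invalidate_resource_provider('unknown-rp')  # harmless no-op
assert not provider_tree and not association_refresh_time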
157 changes: 157 additions & 0 deletions nova/tests/functional/regressions/test_bug_1853009.py
@@ -0,0 +1,157 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import mock

from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers


class NodeRebalanceDeletedComputeNodeRaceTestCase(
integrated_helpers.ProviderUsageBaseTestCase):
"""Regression test for bug 1853009 observed in Rocky & later.

When an ironic node re-balances from one host to another, there can be a
race where the old host deletes the orphan compute node after the new host
has taken ownership of it which results in the new host failing to create
the compute node and resource provider because the ResourceTracker does not
detect a change.
"""
# Make sure we're using the fake driver that has predictable uuids
# for each node.
compute_driver = 'fake.PredictableNodeUUIDDriver'

def _assert_hypervisor_api(self, nodename, expected_host):
# We should have one compute node shown by the API.
hypervisors = self.api.api_get(
'/os-hypervisors/detail').body['hypervisors']
self.assertEqual(1, len(hypervisors), hypervisors)
hypervisor = hypervisors[0]
self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
self.assertEqual(expected_host, hypervisor['service']['host'])

def _start_compute(self, host):
host = self.start_service('compute', host)
# Ironic compute driver has rebalances_nodes = True.
host.manager.driver.rebalances_nodes = True
return host

def setUp(self):
super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
nodename = 'fake-node'
ctxt = context.get_admin_context()

# Simulate a service running and then stopping.
# host2 runs, creates fake-node, then is stopped. The fake-node compute
# node is destroyed. This leaves a soft-deleted node in the DB.
host2 = self._start_compute('host2')
host2.manager.driver._set_nodes([nodename])
host2.manager.update_available_resource(ctxt)
host2.stop()
cn = objects.ComputeNode.get_by_host_and_nodename(
ctxt, 'host2', nodename)
cn.destroy()

def test_node_rebalance_deleted_compute_node_race(self):
nodename = 'fake-node'
ctxt = context.get_admin_context()

# First we create a compute service to manage our node.
# When start_service runs, it will create a host1 ComputeNode. We want
# to delete that and inject our fake node into the driver which will
# be re-balanced to another host later.
host1 = self._start_compute('host1')
host1.manager.driver._set_nodes([nodename])

# Run the update_available_resource periodic to register fake-node and
# have it managed by host1. This will also detect the "host1" node as
# orphaned and delete it along with its resource provider.

# host1[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
host1.manager.update_available_resource(ctxt)
self._assert_hypervisor_api(nodename, 'host1')
# There should only be one resource provider (fake-node).
original_rps = self._get_all_providers()
self.assertEqual(1, len(original_rps), original_rps)
self.assertEqual(nodename, original_rps[0]['name'])

# Simulate a re-balance by starting host2 and make it manage fake-node.
# At this point both host1 and host2 think they own fake-node.
host2 = self._start_compute('host2')
host2.manager.driver._set_nodes([nodename])

# host2[1]: Finds no compute record in RT, 'moves' existing node from
# host1
host2.manager.update_available_resource(ctxt)
# Assert that fake-node was re-balanced from host1 to host2.
self.assertIn('ComputeNode fake-node moving from host1 to host2',
self.stdlog.logger.output)
self._assert_hypervisor_api(nodename, 'host2')

# host2[2]: Begins periodic update, queries compute nodes for this
# host, finds the fake-node.
cn = objects.ComputeNode.get_by_host_and_nodename(
ctxt, 'host2', nodename)

# host1[2]: Finds no compute record in RT, 'moves' existing node from
# host2
host1.manager.update_available_resource(ctxt)
# Assert that fake-node was re-balanced from host2 to host1.
self.assertIn('ComputeNode fake-node moving from host2 to host1',
self.stdlog.logger.output)
self._assert_hypervisor_api(nodename, 'host1')

# Complete rebalance, as host2 realises it does not own fake-node.
host2.manager.driver._set_nodes([])

# host2[2]: Deletes orphan compute node.
# Mock out the compute node query to simulate a race condition where
# the list includes an orphan compute node that is taken ownership of
# by host1 by the time host2 deletes it.
with mock.patch('nova.compute.manager.ComputeManager.'
'_get_compute_nodes_in_db') as mock_get:
mock_get.return_value = [cn]
host2.manager.update_available_resource(ctxt)

# Verify that the node was almost deleted, but saved by the host check.
self.assertIn("Deleting orphan compute node %s hypervisor host "
"is fake-node, nodes are" % cn.id,
self.stdlog.logger.output)
self.assertIn("Ignoring failure to delete orphan compute node %s on "
"hypervisor host fake-node" % cn.id,
self.stdlog.logger.output)
self._assert_hypervisor_api(nodename, 'host1')
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(nodename, rps[0]['name'])

# Simulate deletion of an orphan by host2. It shouldn't happen anymore,
# but let's assume it already did.
cn = objects.ComputeNode.get_by_host_and_nodename(
ctxt, 'host1', nodename)
cn.destroy()
host2.manager.rt.remove_node(cn.hypervisor_hostname)
host2.manager.reportclient.delete_resource_provider(
ctxt, cn, cascade=True)

# host1[3]: Should recreate compute node and resource provider.
host1.manager.update_available_resource(ctxt)

# Verify that the node was recreated.
self._assert_hypervisor_api(nodename, 'host1')

# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(nodename, rps[0]['name'])
4 changes: 3 additions & 1 deletion nova/tests/unit/compute/test_compute.py
@@ -202,8 +202,10 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs):
context, objects.ComputeNode(), cn)
for cn in fake_compute_nodes]

def fake_compute_node_delete(context, compute_node_id):
def fake_compute_node_delete(context, compute_node_id,
compute_node_host):
self.assertEqual(2, compute_node_id)
self.assertEqual('fake_phyp1', compute_node_host)

self.stub_out(
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
30 changes: 30 additions & 0 deletions nova/tests/unit/compute/test_compute_mgr.py
@@ -341,6 +341,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
)

# First node in set should have been removed from DB
# Last node in set should have been added to DB.
for db_node in db_nodes:
if db_node.hypervisor_hostname == 'node1':
db_node.destroy.assert_called_once_with()
@@ -350,6 +351,8 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
'node1')
else:
self.assertFalse(db_node.destroy.called)
self.assertEqual(1, mock_rt.remove_node.call_count)
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)

@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete_resource_provider')
@@ -371,6 +374,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
update_mock.assert_not_called()
del_rp_mock.assert_not_called()

@mock.patch.object(manager.ComputeManager,
'_update_available_resource_for_node')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
def test_update_available_resource_destroy_rebalance(
self, get_db_nodes, get_avail_nodes, update_mock):
mock_rt = self._mock_rt()
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
self.compute, 'reportclient')).mock
db_nodes = [self._make_compute_node('node1', 1)]
get_db_nodes.return_value = db_nodes
# Destroy can fail if nodes were rebalanced between getting the node
# list and calling destroy.
db_nodes[0].destroy.side_effect = exception.ComputeHostNotFound(
'node1')
get_avail_nodes.return_value = set()
self.compute.update_available_resource(self.context)
get_db_nodes.assert_called_once_with(self.context, set(),
use_slave=True, startup=False)
self.assertEqual(0, update_mock.call_count)

db_nodes[0].destroy.assert_called_once_with()
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
mock_rt.remove_node.assert_called_once_with('node1')
rc_mock.invalidate_resource_provider.assert_called_once_with(
db_nodes[0].uuid)

@mock.patch('nova.context.get_admin_context')
def test_pre_start_hook(self, get_admin_context):
"""Very simple test just to make sure update_available_resource is