diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 6f47428adfc..0dfd1592768 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -8823,6 +8823,8 @@ def update_available_resource(self, context, startup=False):
                                                             use_slave=True,
                                                             startup=startup)
 
+        self.rt.clean_compute_node_cache(compute_nodes_in_db)
+
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
@@ -8831,12 +8833,31 @@ def update_available_resource(self, context, startup=False):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations and inventory.
-                self.reportclient.delete_resource_provider(context, cn,
-                                                           cascade=True)
+                try:
+                    cn.destroy()
+                except exception.ComputeHostNotFound:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
+                            "for compute node %s: %s",
+                            cn.uuid, six.text_type(e))
 
         for nodename in nodenames:
             self._update_available_resource_for_node(context, nodename,
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index b2ba0145cf4..4cc370a5329 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1801,3 +1801,21 @@ def free_pci_device_claims_for_instance(self, context, instance):
         """
         self.pci_tracker.free_instance_claims(context, instance)
         self.pci_tracker.save(context)
+
+    @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
+    def clean_compute_node_cache(self, compute_nodes_in_db):
+        """Clean the compute node cache of any nodes that no longer exist.
+
+        :param compute_nodes_in_db: list of ComputeNode objects from the DB.
+        """
+        compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
+                                         for cn in compute_nodes_in_db}
+        stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
+
+        for stale_cn in stale_cns:
+            # NOTE(mgoddard): we have found a node in the cache that has no
+            # compute node in the DB. This could be due to a node rebalance
+            # where another compute service took ownership of the node. Clean
+            # up the cache.
+            self.remove_node(stale_cn)
+            self.reportclient.invalidate_resource_provider(stale_cn)
diff --git a/nova/db/api.py b/nova/db/api.py
index c65803bed43..a5d63c327b7 100644
--- a/nova/db/api.py
+++ b/nova/db/api.py
@@ -344,15 +344,16 @@ def compute_node_update(context, compute_id, values):
     return IMPL.compute_node_update(context, compute_id, values)
 
 
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a compute node from the database.
 
     :param context: The security context
     :param compute_id: ID of the compute node
+    :param compute_host: Hostname of the compute service
 
     Raises ComputeHostNotFound if compute node with the given ID doesn't
     exist.
     """
-    return IMPL.compute_node_delete(context, compute_id)
+    return IMPL.compute_node_delete(context, compute_id, compute_host)
 
 
 def compute_node_statistics(context):
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 02311f240d5..cd47f18771a 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -690,7 +690,7 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):
 
 
 @pick_context_manager_writer
-def compute_node_create(context, values):
+def _compute_node_create(context, values):
     """Creates a new ComputeNode and populates the capacity fields
     with the most recent data.
     """
@@ -698,8 +698,21 @@ def compute_node_create(context, values):
 
     compute_node_ref = models.ComputeNode()
     compute_node_ref.update(values)
+    compute_node_ref.save(context.session)
+    return compute_node_ref
+
+
+# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
+# so that we get a separate transaction in the exception handler. This avoids
+# an error message about inactive DB sessions during a transaction rollback.
+# See https://bugs.launchpad.net/nova/+bug/1853159.
+def compute_node_create(context, values):
+    """Creates a new ComputeNode and populates the capacity fields
+    with the most recent data. Will restore a soft deleted compute node if a
+    UUID has been explicitly requested.
+    """
     try:
-        compute_node_ref.save(context.session)
+        compute_node_ref = _compute_node_create(context, values)
     except db_exc.DBDuplicateEntry:
         with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
             # Check to see if we have a (soft) deleted ComputeNode with the
@@ -763,10 +776,10 @@ def compute_node_update(context, compute_id, values):
 
 
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, compute_host):
     """Delete a ComputeNode record."""
     result = model_query(context, models.ComputeNode).\
-             filter_by(id=compute_id).\
+             filter_by(id=compute_id, host=compute_host).\
              soft_delete(synchronize_session=False)
 
     if not result:
diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py
index b7f7f966b9d..d5e5853f6d6 100644
--- a/nova/objects/compute_node.py
+++ b/nova/objects/compute_node.py
@@ -354,7 +354,7 @@ def save(self, prune_stats=False):
 
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        db.compute_node_delete(self._context, self.id, self.host)
 
     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 98eb8164dba..37dd069ef33 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -677,11 +677,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None):
         if resp:
             LOG.info("Deleted resource provider %s", rp_uuid)
             # clean the caches
-            try:
-                self._provider_tree.remove(rp_uuid)
-            except ValueError:
-                pass
-            self._association_refresh_time.pop(rp_uuid, None)
+            self.invalidate_resource_provider(rp_uuid)
             return
 
         msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2155,6 +2151,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
             # for backward compatibility.
             pass
 
+    def invalidate_resource_provider(self, name_or_uuid):
+        """Invalidate the cache for a resource provider.
+
+        :param name_or_uuid: Name or UUID of the resource provider to look up.
+        """
+        try:
+            self._provider_tree.remove(name_or_uuid)
+        except ValueError:
+            pass
+        self._association_refresh_time.pop(name_or_uuid, None)
+
     def get_provider_by_name(self, context, name):
         """Queries the placement API for resource provider information
         matching a supplied name.
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
new file mode 100644
index 00000000000..303867718a5
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -0,0 +1,157 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import mock
+
+from nova import context
+from nova import objects
+from nova.tests.functional import integrated_helpers
+
+
+class NodeRebalanceDeletedComputeNodeRaceTestCase(
+        integrated_helpers.ProviderUsageBaseTestCase):
+    """Regression test for bug 1853009 observed in Rocky & later.
+
+    When an ironic node re-balances from one host to another, there can be a
+    race where the old host deletes the orphan compute node after the new host
+    has taken ownership of it which results in the new host failing to create
+    the compute node and resource provider because the ResourceTracker does
+    not detect a change.
+    """
+    # Make sure we're using the fake driver that has predictable uuids
+    # for each node.
+    compute_driver = 'fake.PredictableNodeUUIDDriver'
+
+    def _assert_hypervisor_api(self, nodename, expected_host):
+        # We should have one compute node shown by the API.
+        hypervisors = self.api.api_get(
+            '/os-hypervisors/detail').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        hypervisor = hypervisors[0]
+        self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
+        self.assertEqual(expected_host, hypervisor['service']['host'])
+
+    def _start_compute(self, host):
+        host = self.start_service('compute', host)
+        # Ironic compute driver has rebalances_nodes = True.
+        host.manager.driver.rebalances_nodes = True
+        return host
+
+    def setUp(self):
+        super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
+        nodename = 'fake-node'
+        ctxt = context.get_admin_context()
+
+        # Simulate a service running and then stopping.
+        # host2 runs, creates fake-node, then is stopped. The fake-node
+        # compute node is destroyed. This leaves a soft-deleted node in the
+        # DB.
+        host2 = self._start_compute('host2')
+        host2.manager.driver._set_nodes([nodename])
+        host2.manager.update_available_resource(ctxt)
+        host2.stop()
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'host2', nodename)
+        cn.destroy()
+
+    def test_node_rebalance_deleted_compute_node_race(self):
+        nodename = 'fake-node'
+        ctxt = context.get_admin_context()
+
+        # First we create a compute service to manage our node.
+        # When start_service runs, it will create a host1 ComputeNode. We want
+        # to delete that and inject our fake node into the driver which will
+        # be re-balanced to another host later.
+        host1 = self._start_compute('host1')
+        host1.manager.driver._set_nodes([nodename])
+
+        # Run the update_available_resource periodic to register fake-node and
+        # have it managed by host1. This will also detect the "host1" node as
+        # orphaned and delete it along with its resource provider.
+
+        # host1[1]: Finds no compute record in RT. Tries to create one
+        # (_init_compute_node).
+        host1.manager.update_available_resource(ctxt)
+        self._assert_hypervisor_api(nodename, 'host1')
+        # There should only be one resource provider (fake-node).
+        original_rps = self._get_all_providers()
+        self.assertEqual(1, len(original_rps), original_rps)
+        self.assertEqual(nodename, original_rps[0]['name'])
+
+        # Simulate a re-balance by starting host2 and make it manage
+        # fake-node. At this point both host1 and host2 think they own
+        # fake-node.
+        host2 = self._start_compute('host2')
+        host2.manager.driver._set_nodes([nodename])
+
+        # host2[1]: Finds no compute record in RT, 'moves' existing node from
+        # host1
+        host2.manager.update_available_resource(ctxt)
+        # Assert that fake-node was re-balanced from host1 to host2.
+        self.assertIn('ComputeNode fake-node moving from host1 to host2',
+                      self.stdlog.logger.output)
+        self._assert_hypervisor_api(nodename, 'host2')
+
+        # host2[2]: Begins periodic update, queries compute nodes for this
+        # host, finds the fake-node.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'host2', nodename)
+
+        # host1[2]: Finds no compute record in RT, 'moves' existing node from
+        # host2
+        host1.manager.update_available_resource(ctxt)
+        # Assert that fake-node was re-balanced from host2 to host1.
+        self.assertIn('ComputeNode fake-node moving from host2 to host1',
+                      self.stdlog.logger.output)
+        self._assert_hypervisor_api(nodename, 'host1')
+
+        # Complete rebalance, as host2 realises it does not own fake-node.
+        host2.manager.driver._set_nodes([])
+
+        # host2[2]: Deletes orphan compute node.
+        # Mock out the compute node query to simulate a race condition where
+        # the list includes an orphan compute node that is taken ownership of
+        # by host1 by the time host2 deletes it.
+        with mock.patch('nova.compute.manager.ComputeManager.'
+                        '_get_compute_nodes_in_db') as mock_get:
+            mock_get.return_value = [cn]
+            host2.manager.update_available_resource(ctxt)
+
+        # Verify that the node was almost deleted, but saved by the host
+        # check.
+        self.assertIn("Deleting orphan compute node %s hypervisor host "
+                      "is fake-node, nodes are" % cn.id,
+                      self.stdlog.logger.output)
+        self.assertIn("Ignoring failure to delete orphan compute node %s on "
+                      "hypervisor host fake-node" % cn.id,
+                      self.stdlog.logger.output)
+        self._assert_hypervisor_api(nodename, 'host1')
+        rps = self._get_all_providers()
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host2. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'host1', nodename)
+        cn.destroy()
+        host2.manager.rt.remove_node(cn.hypervisor_hostname)
+        host2.manager.reportclient.delete_resource_provider(
+            ctxt, cn, cascade=True)
+
+        # host1[3]: Should recreate compute node and resource provider.
+        host1.manager.update_available_resource(ctxt)
+
+        # Verify that the node was recreated.
+        self._assert_hypervisor_api(nodename, 'host1')
+
+        # The resource provider has now been created.
+        rps = self._get_all_providers()
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(nodename, rps[0]['name'])
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index d84b5be3e5c..6a833d4073f 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -202,8 +202,10 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs):
                         context, objects.ComputeNode(), cn)
                     for cn in fake_compute_nodes]
 
-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     compute_node_host):
             self.assertEqual(2, compute_node_id)
+            self.assertEqual('fake_phyp1', compute_node_host)
 
         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 5f01113525d..d608fcd00bc 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -341,6 +341,7 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
             )
 
         # First node in set should have been removed from DB
+        # Last node in set should have been added to DB.
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
@@ -350,6 +351,8 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
                     'node1')
             else:
                 self.assertFalse(db_node.destroy.called)
+        self.assertEqual(1, mock_rt.remove_node.call_count)
+        mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
 
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete_resource_provider')
@@ -371,6 +374,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ComputeHostNotFound(
+            'node1')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 9bae648efe9..c39fd1b03b7 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -4006,3 +4006,24 @@ def test_init_ensure_provided_reportclient_is_used(self):
         rt = resource_tracker.ResourceTracker(
             _HOSTNAME, mock.sentinel.driver, mock.sentinel.reportclient)
         self.assertIs(rt.reportclient, mock.sentinel.reportclient)
+
+
+class TestCleanComputeNodeCache(BaseTestCase):
+
+    def setUp(self):
+        super(TestCleanComputeNodeCache, self).setUp()
+        self._setup_rt()
+        self.context = context.RequestContext(mock.sentinel.user_id,
+                                              mock.sentinel.project_id)
+
+    @mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
+    def test_clean_compute_node_cache(self, mock_remove):
+        invalid_nodename = "invalid-node"
+        self.rt.compute_nodes[_NODENAME] = self.compute
+        self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
+        with mock.patch.object(
+                self.rt.reportclient,
+                "invalidate_resource_provider") as mock_invalidate:
+            self.rt.clean_compute_node_cache([self.compute])
+        mock_remove.assert_called_once_with(invalid_nodename)
+        mock_invalidate.assert_called_once_with(invalid_nodename)
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index fb42d1af524..7f22102cc48 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -7317,6 +7317,20 @@ def test_compute_node_create_duplicate_host_hypervisor_hostname(self):
         self.assertRaises(db_exc.DBDuplicateEntry,
                           db.compute_node_create, self.ctxt, other_node)
 
+    def test_compute_node_create_duplicate_uuid(self):
+        """Tests to make sure that no exception is raised when trying to create
+        a compute node with the same host, hypervisor_hostname and uuid values
+        as another compute node that was previously soft-deleted.
+        """
+        # Prior to fixing https://bugs.launchpad.net/nova/+bug/1853159, this
+        # raised the following error:
+        # sqlalchemy.exc.InvalidRequestError: This session is in 'inactive'
+        # state, due to the SQL transaction being rolled back; no further SQL
+        # can be emitted within this transaction.
+        db.compute_node_delete(self.ctxt, self.item['id'], self.item['host'])
+        new_node = db.compute_node_create(self.ctxt, self.compute_node_dict)
+        self.assertEqual(self.item['uuid'], new_node['uuid'])
+
     def test_compute_node_get_all(self):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(1, len(nodes))
@@ -7427,7 +7441,7 @@ def test_compute_node_get_all_deleted_compute_node(self):
 
         # Now delete the newly-created compute node to ensure the related
         # compute node stats are wiped in a cascaded fashion
-        db.compute_node_delete(self.ctxt, node['id'])
+        db.compute_node_delete(self.ctxt, node['id'], node['host'])
 
         # Clean up the service
         db.service_destroy(self.ctxt, service['id'])
@@ -7601,10 +7615,18 @@ def test_compute_node_update(self):
 
     def test_compute_node_delete(self):
         compute_node_id = self.item['id']
-        db.compute_node_delete(self.ctxt, compute_node_id)
+        compute_node_host = self.item['host']
+        db.compute_node_delete(self.ctxt, compute_node_id, compute_node_host)
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)
 
+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        compute_node_host = 'invalid-host'
+        self.assertRaises(exception.ComputeHostNotFound,
+                          db.compute_node_delete,
+                          self.ctxt, compute_node_id, compute_node_host)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)
diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py
index aeaf4b957fe..237c0c7cb27 100644
--- a/nova/tests/unit/objects/test_compute_node.py
+++ b/nova/tests/unit/objects/test_compute_node.py
@@ -414,8 +414,9 @@ def test_set_id_failure(self, mock_get, db_mock):
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
+        compute.host = 'fake'
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, 'fake')
 
     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):
diff --git a/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml
new file mode 100644
index 00000000000..d6e71c42224
--- /dev/null
+++ b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.