diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000000..a914011ae8b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @stackhpc/openstack diff --git a/.github/workflows/tag-and-release.yml b/.github/workflows/tag-and-release.yml new file mode 100644 index 00000000000..4ef4d2618d7 --- /dev/null +++ b/.github/workflows/tag-and-release.yml @@ -0,0 +1,12 @@ +--- +name: Tag & Release +'on': + push: + branches: + - stackhpc/yoga +permissions: + actions: read + contents: write +jobs: + tag-and-release: + uses: stackhpc/.github/.github/workflows/tag-and-release.yml@main diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml new file mode 100644 index 00000000000..8713f0e02dc --- /dev/null +++ b/.github/workflows/tox.yml @@ -0,0 +1,7 @@ +--- +name: Tox Continuous Integration +'on': + pull_request: +jobs: + tox: + uses: stackhpc/.github/.github/workflows/tox.yml@main diff --git a/bindep.txt b/bindep.txt index 3a4d7bef806..f774113bc26 100644 --- a/bindep.txt +++ b/bindep.txt @@ -55,3 +55,5 @@ pcre-devel [platform:rpm test] # runtime and in unit tests. Net result is the same that lsscsi will be # installed for any nova installation. lsscsi +# NOTE(priteau): Install qemu-img for format inspector tests +qemu-utils [platform:dpkg test] diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index 55d810cce89..c8eedcb0e5c 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -426,6 +426,21 @@ known-features detection *before* passing the image to qemu-img. Generally, this inspection should be enabled for maximum safety, but this workaround option allows disabling it if there is a compatibility concern. +"""), + cfg.BoolOpt( + 'skip_reserve_in_use_ironic_nodes', + default=False, + help=""" +This may be useful if you use the Ironic driver, but don't have +automatic cleaning enabled in Ironic. Nova, by default, will mark +Ironic nodes as reserved as soon as they are in use. When you free +the Ironic node (by deleting the nova instance) it takes a while +for Nova to un-reserve that Ironic node in placement. Usually this +is a good idea, because it avoids placement providing an Ironic +as a valid candidate when it is still being cleaned. +Howerver, if you don't use automatic cleaning, it can cause an +extra delay before and Ironic node is available for building a +new Nova instance. """), ] diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index ea5c0dbc05a..b3fb2fbd20a 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -931,6 +931,48 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr, self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename) + expected = { + 'CUSTOM_IRON_NFV': { + 'total': 1, + 'reserved': 1, + 'min_unit': 1, + 'max_unit': 1, + 'step_size': 1, + 'allocation_ratio': 1.0, + }, + } + mock_nfc.assert_called_once_with(mock.sentinel.nodename) + mock_nr.assert_called_once_with(mock_nfc.return_value) + mock_res_used.assert_called_once_with(mock_nfc.return_value) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) + result = self.ptree.data(mock.sentinel.nodename).inventory + self.assertEqual(expected, result) + + @mock.patch.object(ironic_driver.IronicDriver, + '_node_resources_used', return_value=True) + @mock.patch.object(ironic_driver.IronicDriver, + '_node_resources_unavailable', return_value=False) + @mock.patch.object(ironic_driver.IronicDriver, '_node_resource') + @mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache') + def test_update_provider_tree_with_rc_occupied_workaround(self, + mock_nfc, mock_nr, mock_res_unavail, mock_res_used): + """Ensure that when a node is used, we report the inventory matching + the consumed resources. + """ + self.flags(skip_reserve_in_use_ironic_nodes=True, + group="workarounds") + mock_nr.return_value = { + 'vcpus': 24, + 'vcpus_used': 24, + 'memory_mb': 1024, + 'memory_mb_used': 1024, + 'local_gb': 100, + 'local_gb_used': 100, + 'resource_class': 'iron-nfv', + } + + self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename) + expected = { 'CUSTOM_IRON_NFV': { 'total': 1, @@ -944,7 +986,7 @@ def test_update_provider_tree_with_rc_occupied(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).inventory self.assertEqual(expected, result) @@ -1015,7 +1057,7 @@ def test_update_provider_tree_no_traits(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).traits self.assertEqual(set(), result) @@ -1047,7 +1089,7 @@ def test_update_provider_tree_with_traits(self, mock_nfc, mock_nr, mock_nfc.assert_called_once_with(mock.sentinel.nodename) mock_nr.assert_called_once_with(mock_nfc.return_value) mock_res_used.assert_called_once_with(mock_nfc.return_value) - self.assertFalse(mock_res_unavail.called) + mock_res_unavail.assert_called_once_with(mock_nfc.return_value) result = self.ptree.data(mock.sentinel.nodename).traits self.assertEqual(set(traits), result) diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index f21694da478..ca0bb978e69 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -886,15 +886,25 @@ def update_provider_tree(self, provider_tree, nodename, allocations=None): """ # nodename is the ironic node's UUID. node = self._node_from_cache(nodename) + reserved = False - if (not self._node_resources_used(node) and - self._node_resources_unavailable(node)): - LOG.debug('Node %(node)s is not ready for a deployment, ' - 'reporting resources as reserved for it. Node\'s ' - 'provision state is %(prov)s, power state is ' - '%(power)s and maintenance is %(maint)s.', - {'node': node.uuid, 'prov': node.provision_state, - 'power': node.power_state, 'maint': node.maintenance}) + if self._node_resources_unavailable(node): + # Operators might mark a node as in maintainance, + # even when an instance is on the node, + # either way lets mark this as reserved + reserved = True + + if (self._node_resources_used(node) and + not CONF.workarounds.skip_reserve_in_use_ironic_nodes): + # Make resources as reserved once we have + # and instance here. + # When the allocation is deleted, most likely + # automatic clean will start, so we keep the node + # reserved until it becomes available again. + # In the case without automatic clean, once + # the allocation is removed in placement it + # also stays as reserved until we notice on + # the next periodic its actually available. reserved = True info = self._node_resource(node) diff --git a/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml b/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml new file mode 100644 index 00000000000..4fd2cc1ca9f --- /dev/null +++ b/releasenotes/notes/fix-ironic-scheduler-race-08cf8aba0365f512.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixed when placement returns ironic nodes that have just started automatic + cleaning as possible valid candidates. This is done by marking all ironic + nodes with an instance on them as reserved, such that nova only makes them + available once we have double checked Ironic reports the node as available. + If you don't have automatic cleaning on, this might mean it takes longer + than normal for Ironic nodes to become available for new instances. + If you want the old behaviour use the following workaround config: + `[workarounds]skip_reserve_in_use_ironic_nodes=true`