diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index c2678d73686..5485e21bbea 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -257,10 +257,10 @@ def version(self):
         print(migration.db_version())
 
     @args('--max_rows', type=int, metavar='<number>', dest='max_rows',
-          help='Maximum number of deleted rows to archive. Defaults to 1000. '
-               'Note that this number does not include the corresponding '
-               'rows, if any, that are removed from the API database for '
-               'deleted instances.')
+          help='Maximum number of deleted rows to archive per table. Defaults '
+               'to 1000. Note that this number is a soft limit and does not '
+               'include the corresponding rows, if any, that are removed '
+               'from the API database for deleted instances.')
     @args('--before', metavar='<date>',
           help=('Archive rows that have been deleted before this date. '
                 'Accepts date strings in the default format output by the '
@@ -432,7 +432,10 @@ def _do_archive(
             'cell1.instances': 5}
         :param cctxt: Cell-targeted nova.context.RequestContext if archiving
             across all cells
-        :param max_rows: Maximum number of deleted rows to archive
+        :param max_rows: Maximum number of deleted rows to archive per table.
+            Note that this number is a soft limit and does not include the
+            corresponding rows, if any, that are removed from the API database
+            for deleted instances.
        :param until_complete: Whether to run continuously until all deleted
            rows are archived
        :param verbose: Whether to print how many rows were archived per table
@@ -445,15 +448,26 @@ def _do_archive(
        """
        ctxt = context.get_admin_context()
        while True:
-            run, deleted_instance_uuids, total_rows_archived = \
+            # table_to_rows = {table_name: number_of_rows_archived}
+            # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+            table_to_rows, deleted_instance_uuids, total_rows_archived = \
                db.archive_deleted_rows(
                    cctxt, max_rows, before=before_date, task_log=task_log)
-            for table_name, rows_archived in run.items():
+
+            for table_name, rows_archived in table_to_rows.items():
                if cell_name:
                    table_name = cell_name + '.' + table_name
                table_to_rows_archived.setdefault(table_name, 0)
                table_to_rows_archived[table_name] += rows_archived
-            if deleted_instance_uuids:
+
+            # deleted_instance_uuids does not necessarily mean that any
+            # instances rows were archived because it is obtained by a query
+            # separate from the archive queries. For example, if a
+            # DBReferenceError was raised while processing the instances table,
+            # we would have skipped the table and had 0 rows archived even
+            # though deleted instances rows were found.
+            instances_archived = table_to_rows.get('instances', 0)
+            if deleted_instance_uuids and instances_archived:
                table_to_rows_archived.setdefault(
                    'API_DB.instance_mappings', 0)
                table_to_rows_archived.setdefault(
@@ -476,8 +490,9 @@ def _do_archive(
 
            # If we're not archiving until there is nothing more to archive, we
            # have reached max_rows in this cell DB or there was nothing to
-            # archive.
-            if not until_complete or not run:
+            # archive. We check the values() in case we get something like
+            # table_to_rows = {'instances': 0} back somehow.
+            if not until_complete or not any(table_to_rows.values()):
                break
            if verbose:
                sys.stdout.write('.')
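For readers skimming the `_do_archive` change above: here is a minimal, self-contained sketch (plain Python with hypothetical counts, not Nova code) of how the per-cell results returned by `db.archive_deleted_rows` accumulate into `table_to_rows_archived`:

```python
# Sketch of the accumulation loop in _do_archive, with hypothetical data.
# Each call to db.archive_deleted_rows() returns a {table: rows} dict;
# results are keyed by "<cell>.<table>" when archiving across cells.
table_to_rows_archived = {}

def accumulate(table_to_rows, cell_name=None):
    for table_name, rows_archived in table_to_rows.items():
        if cell_name:
            table_name = cell_name + '.' + table_name
        table_to_rows_archived.setdefault(table_name, 0)
        table_to_rows_archived[table_name] += rows_archived

# Two iterations of the while loop for one cell:
accumulate({'instances': 2, 'instance_extra': 2}, cell_name='cell1')
accumulate({'instances': 1, 'instance_extra': 1}, cell_name='cell1')
print(table_to_rows_archived)
# {'cell1.instances': 3, 'cell1.instance_extra': 3}
```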
diff --git a/nova/conf/pci.py b/nova/conf/pci.py
index 533bf52eadd..64cb89a787b 100644
--- a/nova/conf/pci.py
+++ b/nova/conf/pci.py
@@ -175,8 +175,9 @@
   its corresponding PF), otherwise they will be ignored and not available for
   allocation.
 - ``resource_class`` - optional Placement resource class name to be used
-  to track the matching PCI devices in Placement when [pci]device_spec is
-  True. It can be a standard resource class from the
+  to track the matching PCI devices in Placement when
+  [pci]report_in_placement is True.
+  It can be a standard resource class from the
   ``os-resource-classes`` lib. Or can be any string. In that case Nova will
   normalize it to a proper Placement resource class by making it upper case,
   replacing any consecutive character outside of ``[A-Z0-9_]`` with a
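The normalization described in the help text above (the hunk context cuts off mid-sentence) can be sketched roughly as follows. This is an illustrative guess: the single-underscore replacement and the ``CUSTOM_`` prefix are assumptions based on the usual Placement custom resource class convention, not taken from this hunk:

```python
import re

def normalize_resource_class(name: str) -> str:
    # Assumed rule: upper-case, collapse each run of characters outside
    # [A-Z0-9_] into one '_', and prefix with CUSTOM_ (the prefix is an
    # assumption; the hunk context above is truncated).
    norm = re.sub(r'[^A-Z0-9_]+', '_', name.upper())
    if not norm.startswith('CUSTOM_'):
        norm = 'CUSTOM_' + norm
    return norm

print(normalize_resource_class('my-dev.type1'))  # CUSTOM_MY_DEV_TYPE1
```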
diff --git a/nova/db/main/api.py b/nova/db/main/api.py
index 7d24f974f91..ae6533f49cd 100644
--- a/nova/db/main/api.py
+++ b/nova/db/main/api.py
@@ -4407,6 +4407,9 @@ def _archive_deleted_rows_for_table(
     select = select.order_by(column).limit(max_rows)
     with conn.begin():
         rows = conn.execute(select).fetchall()
+
+    # This is a list of IDs of rows that should be archived from this table,
+    # limited to a length of max_rows.
     records = [r[0] for r in rows]
 
     # We will archive deleted rows for this table and also generate insert and
@@ -4419,51 +4422,103 @@ def _archive_deleted_rows_for_table(
     # Keep track of any extra tablenames to number of rows that we archive by
     # following FK relationships.
-    # {tablename: extra_rows_archived}
+    #
+    # extras = {tablename: number_of_extra_rows_archived}
     extras = collections.defaultdict(int)
-    if records:
-        insert = shadow_table.insert().from_select(
-            columns, sql.select(table).where(column.in_(records))
-        ).inline()
-        delete = table.delete().where(column.in_(records))
+
+    if not records:
+        # Nothing to archive, so return.
+        return rows_archived, deleted_instance_uuids, extras
+
+    # Keep track of how many rows we accumulate for the insert+delete database
+    # transaction and cap it as soon as it is >= max_rows. Because we will
+    # archive all child rows of a parent row along with the parent at the same
+    # time, we end up with extra rows to archive in addition to len(records).
+    num_rows_in_batch = 0
+    # The sequence of query statements we will execute in a batch. These are
+    # ordered: [child1, child1, parent1, child2, child2, child2, parent2, ...]
+    # Parent + child "trees" are kept together to avoid FK constraint
+    # violations.
+    statements_in_batch = []
+    # The list of records in the batch. This is used for collecting deleted
+    # instance UUIDs in the case of the 'instances' table.
+    records_in_batch = []
+
+    # NOTE(melwitt): We will gather rows related by foreign key relationship
+    # for each deleted row, one at a time. We do it this way to keep track of
+    # and limit the total number of rows that will be archived in a single
+    # database transaction. In a large scale database with potentially
+    # hundreds of thousands of deleted rows, if we don't limit the size of the
+    # transaction based on max_rows, we can get into a situation where we get
+    # stuck not able to make much progress. The value of max_rows has to be
+    # 1) small enough to not exceed the database's max packet size limit or
+    # timeout with a deadlock but 2) large enough to make progress in an
+    # environment with a constant high volume of create and delete traffic.
+    # By archiving each parent + child rows tree one at a time, we can ensure
+    # meaningful progress can be made while allowing the caller to predictably
+    # control the size of the database transaction with max_rows.
+    for record in records:
         # Walk FK relationships and add insert/delete statements for rows that
         # refer to this table via FK constraints. fk_inserts and fk_deletes
         # will be prepended to by _get_fk_stmts if referring rows are found by
         # FK constraints.
         fk_inserts, fk_deletes = _get_fk_stmts(
-            metadata, conn, table, column, records)
-
-        # NOTE(tssurya): In order to facilitate the deletion of records from
-        # instance_mappings, request_specs and instance_group_member tables in
-        # the nova_api DB, the rows of deleted instances from the instances
-        # table are stored prior to their deletion. Basically the uuids of the
-        # archived instances are queried and returned.
-        if tablename == "instances":
-            query_select = sql.select(table.c.uuid).where(
-                table.c.id.in_(records)
-            )
-            with conn.begin():
-                rows = conn.execute(query_select).fetchall()
-            deleted_instance_uuids = [r[0] for r in rows]
+            metadata, conn, table, column, [record])
+        statements_in_batch.extend(fk_inserts + fk_deletes)
+        # statement to add parent row to shadow table
+        insert = shadow_table.insert().from_select(
+            columns, sql.select(table).where(column.in_([record]))).inline()
+        statements_in_batch.append(insert)
+        # statement to remove parent row from main table
+        delete = table.delete().where(column.in_([record]))
+        statements_in_batch.append(delete)
 
-        try:
-            # Group the insert and delete in a transaction.
-            with conn.begin():
-                for fk_insert in fk_inserts:
-                    conn.execute(fk_insert)
-                for fk_delete in fk_deletes:
-                    result_fk_delete = conn.execute(fk_delete)
-                    extras[fk_delete.table.name] += result_fk_delete.rowcount
-                conn.execute(insert)
-                result_delete = conn.execute(delete)
-                rows_archived += result_delete.rowcount
-        except db_exc.DBReferenceError as ex:
-            # A foreign key constraint keeps us from deleting some of
-            # these rows until we clean up a dependent table. Just
-            # skip this table for now; we'll come back to it later.
-            LOG.warning("IntegrityError detected when archiving table "
-                        "%(tablename)s: %(error)s",
-                        {'tablename': tablename, 'error': str(ex)})
+        records_in_batch.append(record)
+
+        # Check whether we have a full batch >= max_rows. Rows are counted as
+        # the number of rows that will be moved in the database transaction.
+        # So each insert+delete pair represents one row that will be moved.
+        # 1 parent + its fks
+        num_rows_in_batch += 1 + len(fk_inserts)
+
+        if max_rows is not None and num_rows_in_batch >= max_rows:
+            break
+
+    # NOTE(tssurya): In order to facilitate the deletion of records from
+    # instance_mappings, request_specs and instance_group_member tables in the
+    # nova_api DB, the rows of deleted instances from the instances table are
+    # stored prior to their deletion. Basically the uuids of the archived
+    # instances are queried and returned.
+    if tablename == "instances":
+        query_select = sql.select(table.c.uuid).where(
+            table.c.id.in_(records_in_batch))
+        with conn.begin():
+            rows = conn.execute(query_select).fetchall()
+        # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+        deleted_instance_uuids = [r[0] for r in rows]
+
+    try:
+        # Group the insert and delete in a transaction.
+        with conn.begin():
+            for statement in statements_in_batch:
+                result = conn.execute(statement)
+                result_tablename = statement.table.name
+                # Add to archived row counts if not a shadow table.
+                if not result_tablename.startswith(_SHADOW_TABLE_PREFIX):
+                    if result_tablename == tablename:
+                        # Number of tablename (parent) rows archived.
+                        rows_archived += result.rowcount
+                    else:
+                        # Number(s) of child rows archived.
+                        extras[result_tablename] += result.rowcount
+
+    except db_exc.DBReferenceError as ex:
+        # A foreign key constraint keeps us from deleting some of these rows
+        # until we clean up a dependent table. Just skip this table for now;
+        # we'll come back to it later.
+        LOG.warning("IntegrityError detected when archiving table "
+                    "%(tablename)s: %(error)s",
+                    {'tablename': tablename, 'error': str(ex)})
 
     conn.close()
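The batching strategy introduced in `_archive_deleted_rows_for_table` can be pictured without SQLAlchemy. Below is a simplified stand-in with hypothetical data (`children_of` plays the role of `_get_fk_stmts`), showing that parent + child trees stay whole and that max_rows is a soft limit:

```python
# Hypothetical FK data: maps a parent row ID to the rows referencing it.
children_of = {
    1: ['extra-1', 'metadata-1a', 'metadata-1b'],
    2: ['extra-2'],
    3: [],
}

def batch_records(records, max_rows):
    batch = []
    num_rows_in_batch = 0
    for record in records:
        children = children_of.get(record, [])
        # Keep each parent + child tree whole to avoid FK violations.
        batch.append((record, children))
        # 1 parent + its children, mirroring num_rows_in_batch above
        num_rows_in_batch += 1 + len(children)
        if max_rows is not None and num_rows_in_batch >= max_rows:
            break
    return batch, num_rows_in_batch

batch, count = batch_records([1, 2, 3], max_rows=3)
print(count)  # 4: the soft limit lets the tree in flight finish
print([parent for parent, _ in batch])  # [1]
```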
diff --git a/nova/exception.py b/nova/exception.py
index 85e0a9c85dd..f319cf58a2c 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -1131,6 +1131,10 @@ class FileNotFound(NotFound):
     msg_fmt = _("File %(file_path)s could not be found.")
 
 
+class DeviceBusy(NovaException):
+    msg_fmt = _("device %(file_path)s is busy.")
+
+
 class ClassNotFound(NotFound):
     msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")
diff --git a/nova/filesystem.py b/nova/filesystem.py
index 5394d2d8351..2db3093236e 100644
--- a/nova/filesystem.py
+++ b/nova/filesystem.py
@@ -12,37 +12,79 @@
 """Functions to address filesystem calls, particularly sysfs."""
 
+import functools
 import os
+import time
 
 from oslo_log import log as logging
 
 from nova import exception
 
 LOG = logging.getLogger(__name__)
 
+SYS = '/sys'
+RETRY_LIMIT = 5
 
-SYS = '/sys'
 
+# a retry decorator to handle EBUSY
+def retry_if_busy(func):
+    """Decorator to retry a function if it raises DeviceBusy.
+
+    This decorator will attempt the function up to RETRY_LIMIT=5 times if it
+    raises DeviceBusy. It will sleep for 1 second before the first retry, 2
+    seconds before the second retry, and so on. If the function still raises
+    DeviceBusy on the last attempt, the exception is re-raised.
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        for i in range(RETRY_LIMIT):
+            try:
+                return func(*args, **kwargs)
+            except exception.DeviceBusy as e:
+                # if we have retried RETRY_LIMIT times, raise the exception
+                # otherwise, sleep and retry, i is 0-based so we need
+                # to add 1 to it
+                count = i + 1
+                if count < RETRY_LIMIT:
+                    LOG.debug(
+                        f"File {e.kwargs['file_path']} is busy, "
+                        f"sleeping {count} seconds before retrying")
+                    time.sleep(count)
+                    continue
+                raise
+    return wrapper
 
 
 # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
+@retry_if_busy
 def read_sys(path: str) -> str:
     """Reads the content of a file in the sys filesystem.
 
     :param path: relative or absolute. If relative, will be prefixed by /sys.
     :returns: contents of that file.
     :raises: nova.exception.FileNotFound if we can't read that file.
+    :raises: nova.exception.DeviceBusy if the file is busy.
     """
     try:
         # The path can be absolute with a /sys prefix but that's fine.
         with open(os.path.join(SYS, path), mode='r') as data:
             return data.read()
-    except (OSError, ValueError) as exc:
+    except OSError as exc:
+        # errno 16 is EBUSY, which means the file is busy.
+        if exc.errno == 16:
+            raise exception.DeviceBusy(file_path=path) from exc
+        # convert permission denied to file not found
+        raise exception.FileNotFound(file_path=path) from exc
+    except ValueError as exc:
         raise exception.FileNotFound(file_path=path) from exc
 
 
 # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
 # In order to correctly use it, you need to decorate the caller with a specific
 # privsep entrypoint.
+@retry_if_busy
 def write_sys(path: str, data: str) -> None:
     """Writes the content of a file in the sys filesystem with data.
 
@@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None:
     :param data: the data to write.
     :returns: contents of that file.
     :raises: nova.exception.FileNotFound if we can't write that file.
+    :raises: nova.exception.DeviceBusy if the file is busy.
     """
     try:
         # The path can be absolute with a /sys prefix but that's fine.
         with open(os.path.join(SYS, path), mode='w') as fd:
             fd.write(data)
-    except (OSError, ValueError) as exc:
+    except OSError as exc:
+        # errno 16 is EBUSY, which means the file is busy.
+        if exc.errno == 16:
+            raise exception.DeviceBusy(file_path=path) from exc
+        # convert permission denied to file not found
+        raise exception.FileNotFound(file_path=path) from exc
+    except ValueError as exc:
         raise exception.FileNotFound(file_path=path) from exc
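The retry behavior added to `nova/filesystem.py` is easiest to see in isolation. Here is a self-contained re-implementation of the same pattern (stub `DeviceBusy` standing in for `nova.exception.DeviceBusy`, no Nova imports) so the linear backoff can be exercised directly:

```python
import functools
import time

RETRY_LIMIT = 5

class DeviceBusy(Exception):
    """Stand-in for nova.exception.DeviceBusy."""

def retry_if_busy(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for i in range(RETRY_LIMIT):
            try:
                return func(*args, **kwargs)
            except DeviceBusy:
                count = i + 1
                if count < RETRY_LIMIT:
                    # linear backoff: sleeps 1, 2, 3, 4 seconds between
                    # the five attempts before giving up
                    time.sleep(count)
                    continue
                raise
    return wrapper

attempts = []

@retry_if_busy
def flaky():
    attempts.append(1)
    if len(attempts) < 3:
        raise DeviceBusy()
    return 'ok'

print(flaky(), len(attempts))  # ok 3 (after sleeping 1s, then 2s)
```

In the worst case the decorated call makes five attempts, sleeping 1 + 2 + 3 + 4 = 10 seconds in total before the last failure propagates.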
diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py
index 4a813d10a7c..1bcce9f3ee0 100644
--- a/nova/tests/functional/db/test_archive.py
+++ b/nova/tests/functional/db/test_archive.py
@@ -174,6 +174,50 @@ def test_archive_deleted_rows_incomplete(self):
                 break
         self.assertFalse(exceptions)
 
+    def test_archive_deleted_rows_parent_child_trees_one_at_time(self):
+        """Test that we are archiving parent + child rows "trees" one at a time
+
+        Previously, we archived deleted rows in batches of max_rows parents +
+        their child rows in a single database transaction. Doing it that way
+        limited how high a value of max_rows could be specified by the caller
+        because of the size of the database transaction it could generate.
+
+        For example, in a large scale deployment with hundreds of thousands of
+        deleted rows and constant server creation and deletion activity, a
+        value of max_rows=1000 might exceed the database's configured maximum
+        packet size or timeout due to a database deadlock, forcing the operator
+        to use a much lower max_rows value like 100 or 50.
+
+        And when the operator has e.g. 500,000 deleted instances rows (and
+        millions of deleted rows total) they are trying to archive, being
+        forced to use a max_rows value several orders of magnitude lower than
+        the number of rows they need to archive was a poor user experience.
+
+        This tests that we are archiving each parent plus their child rows one
+        at a time while limiting the total number of rows per table in a batch
+        as soon as the total number of archived rows is >= max_rows.
+        """
+        # Boot two servers and delete them, then try to archive rows.
+        for i in range(2):
+            server = self._create_server()
+            self._delete_server(server)
+
+        # Use max_rows=5 to limit the archive to one complete parent +
+        # child rows tree.
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+
+        # We should have archived only one instance because the parent +
+        # child tree will have >= max_rows of 5.
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # Archive once more to archive the other instance (and its child rows).
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # There should be nothing else to archive.
+        _, _, total_rows_archived = db.archive_deleted_rows(max_rows=100)
+        self.assertEqual(0, total_rows_archived)
+
     def _get_table_counts(self):
         engine = db.get_engine()
         conn = engine.connect()
diff --git a/nova/tests/functional/libvirt/test_power_manage.py b/nova/tests/functional/libvirt/test_power_manage.py
index f6e4b1ba5b4..3839c798317 100644
--- a/nova/tests/functional/libvirt/test_power_manage.py
+++ b/nova/tests/functional/libvirt/test_power_manage.py
@@ -10,10 +10,10 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-from unittest import mock
-
 import fixtures
 
+from unittest import mock
+
 from nova import context as nova_context
 from nova import exception
 from nova import objects
@@ -240,7 +240,6 @@ def setUp(self):
                                           cpu_cores=5, cpu_threads=2)
         self.compute1 = self.start_compute(host_info=self.host_info,
                                            hostname='compute1')
-
         # All cores are shutdown at startup, let's check.
         cpu_dedicated_set = hardware.get_cpu_dedicated_set()
         self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
@@ -266,6 +265,34 @@ def test_create_server(self):
         cpu_dedicated_set = hardware.get_cpu_dedicated_set()
         unused_cpus = cpu_dedicated_set - instance_pcpus
         self._assert_cpu_set_state(unused_cpus, expected='offline')
+        return server
+
+    def test_delete_server(self):
+        server = self.test_create_server()
+        self._delete_server(server)
+        # Let's verify that the pinned CPUs are now offline
+        cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+        self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
+
+    def test_delete_server_device_busy(self):
+        # This test verifies bug 2065927 is resolved.
+        server = self.test_create_server()
+        inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
+        instance_pcpus = inst.numa_topology.cpu_pinning
+        self._assert_cpu_set_state(instance_pcpus, expected='online')
+        with mock.patch(
+            'nova.filesystem.write_sys.__wrapped__',
+            side_effect=[
+                exception.DeviceBusy(file_path='fake'),
+                None]):
+
+            self._delete_server(server)
+            cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+            # Verify that the unused CPUs are still offline
+            unused_cpus = cpu_dedicated_set - instance_pcpus
+            self._assert_cpu_set_state(unused_cpus, expected='offline')
+            # and the pinned CPUs are offline
+            self._assert_cpu_set_state(instance_pcpus, expected='offline')
 
     def test_create_server_with_emulator_threads_isolate(self):
         server = self._create_server(
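`test_delete_server_device_busy` above patches `nova.filesystem.write_sys.__wrapped__`. That patch target exists because `retry_if_busy` uses `functools.wraps`, which stores the undecorated function on the wrapper. A standalone sketch of the mechanism (hypothetical `deco`/`write` names):

```python
import functools
from unittest import mock

def deco(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper

@deco
def write(data):
    return 'wrote %s' % data

# functools.wraps records the original function as an attribute, which is
# what makes a dotted patch target ending in .__wrapped__ resolvable.
print(write.__wrapped__('x'))  # wrote x
with mock.patch(__name__ + '.write.__wrapped__', return_value='mocked'):
    print(write.__wrapped__('x'))  # mocked
```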
diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py
index 888b43cea09..31996df2d72 100644
--- a/nova/tests/functional/test_nova_manage.py
+++ b/nova/tests/functional/test_nova_manage.py
@@ -2238,57 +2238,71 @@ def setUp(self):
 
     def test_archive_deleted_rows(self):
         admin_context = context.get_admin_context(read_deleted='yes')
-        # Boot a server to cell1
-        server_ids = {}
-        server = self._build_server(az='nova:host1')
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ACTIVE')
-        server_ids['cell1'] = created_server['id']
-        # Boot a server to cell2
-        server = self._build_server(az='nova:host2')
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ACTIVE')
-        server_ids['cell2'] = created_server['id']
-        # Boot a server to cell0 (cause ERROR state prior to schedule)
-        server = self._build_server()
-        # Flavor m1.xlarge cannot be fulfilled
-        server['flavorRef'] = 'http://fake.server/5'
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ERROR')
-        server_ids['cell0'] = created_server['id']
+        server_ids_by_cell = collections.defaultdict(list)
+        # Create two servers per cell to make sure the archive loop for each
+        # table iterates at least once.
+        for i in range(2):
+            # Boot a server to cell1
+            server = self._build_server(az='nova:host1')
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ACTIVE')
+            server_ids_by_cell['cell1'].append(created_server['id'])
+            # Boot a server to cell2
+            server = self._build_server(az='nova:host2')
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ACTIVE')
+            server_ids_by_cell['cell2'].append(created_server['id'])
+            # Boot a server to cell0 (cause ERROR state prior to schedule)
+            server = self._build_server()
+            # Flavor m1.xlarge cannot be fulfilled
+            server['flavorRef'] = 'http://fake.server/5'
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ERROR')
+            server_ids_by_cell['cell0'].append(created_server['id'])
+
         # Verify all the servers are in the databases
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                objects.Instance.get_by_uuid(cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    objects.Instance.get_by_uuid(cctxt, server_id)
         # Delete the servers
-        for cell_name in server_ids.keys():
-            self.api.delete_server(server_ids[cell_name])
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                self.api.delete_server(server_id)
         # Verify all the servers are in the databases still (as soft deleted)
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                objects.Instance.get_by_uuid(cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    objects.Instance.get_by_uuid(cctxt, server_id)
         # Archive the deleted rows
         self.cli.archive_deleted_rows(verbose=True, all_cells=True)
-        # Three instances should have been archived (cell0, cell1, cell2)
+        # 6 instances should have been archived (cell0, cell1, cell2)
         self.assertRegex(self.output.getvalue(),
-                         r"| cell0\.instances.*\| 1.*")
+                         r"\| cell0\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| cell1\.instances.*\| 1.*")
+                         r"\| cell1\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| cell2\.instances.*\| 1.*")
+                         r"\| cell2\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| API_DB\.instance_mappings.*\| 3.*")
+                         r"\| API_DB\.instance_mappings\s+\| 6")
         self.assertRegex(self.output.getvalue(),
-                         r"| API_DB\.request_specs.*\| 3.*")
+                         r"\| API_DB\.request_specs\s+\| 6")
         # Verify all the servers are gone from the cell databases
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                self.assertRaises(exception.InstanceNotFound,
-                                  objects.Instance.get_by_uuid,
-                                  cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    self.assertRaises(exception.InstanceNotFound,
+                                      objects.Instance.get_by_uuid,
+                                      cctxt, server_id)
 
 
 class TestDBArchiveDeletedRowsMultiCellTaskLog(
diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py
index fca97180e4d..0756624f6e6 100644
--- a/nova/tests/unit/cmd/test_manage.py
+++ b/nova/tests/unit/cmd/test_manage.py
@@ -501,6 +501,26 @@ def test_archive_deleted_rows_verbose_no_results(self, mock_get_all,
         self.assertIn('Nothing was archived.', output)
         self.assertEqual(0, result)
 
+    @mock.patch.object(db, 'archive_deleted_rows')
+    @mock.patch.object(objects.CellMappingList, 'get_all')
+    def test_archive_deleted_rows_deleted_instances_no_rows(self, mock_get_all,
+                                                            mock_db_archive):
+        # Simulate a scenario where we didn't archive any rows (example:
+        # DBReferenceError was raised while processing the instances table) but
+        # the separate query for deleted_instance_uuids found rows.
+        mock_db_archive.return_value = (
+            {}, [uuidsentinel.instance1, uuidsentinel.instance2], 0)
+
+        result = self.commands.archive_deleted_rows(20, verbose=True)
+
+        mock_db_archive.assert_called_once_with(
+            test.MatchType(context.RequestContext), 20, before=None,
+            task_log=False)
+        output = self.output.getvalue()
+        # If nothing was archived, there should be no purge messages
+        self.assertIn('Nothing was archived.', output)
+        self.assertEqual(0, result)
+
     @mock.patch.object(db, 'archive_deleted_rows')
     @mock.patch.object(objects.RequestSpec, 'destroy_bulk')
     @mock.patch.object(objects.InstanceGroup, 'destroy_members_bulk')
diff --git a/nova/tests/unit/test_filesystem.py b/nova/tests/unit/test_filesystem.py
index 85f16157eeb..5c3107472d8 100644
--- a/nova/tests/unit/test_filesystem.py
+++ b/nova/tests/unit/test_filesystem.py
@@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import io
 import os
 from unittest import mock
 
@@ -35,6 +36,29 @@ def test_read_sys_error(self):
         expected_path = os.path.join(filesystem.SYS, 'foo')
         m_open.assert_called_once_with(expected_path, mode='r')
 
+    def test_read_sys_retry(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = [
+                OSError(16, 'Device or resource busy'),
+                io.StringIO('bar'),
+            ]
+            self.assertEqual('bar', filesystem.read_sys('foo'))
+            expected_path = os.path.join(filesystem.SYS, 'foo')
+            m_open.assert_has_calls([
+                mock.call(expected_path, mode='r'),
+                mock.call(expected_path, mode='r'),
+            ])
+
+    def test_read_sys_retry_limit(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = (
+                [OSError(16, 'Device or resource busy')] *
+                (filesystem.RETRY_LIMIT + 1))
+            self.assertRaises(
+                exception.DeviceBusy, filesystem.read_sys, 'foo')
+
     def test_write_sys(self):
         open_mock = mock.mock_open()
         with mock.patch('builtins.open', open_mock) as m_open:
@@ -50,3 +74,26 @@ def test_write_sys_error(self):
                           filesystem.write_sys, 'foo', 'bar')
         expected_path = os.path.join(filesystem.SYS, 'foo')
         m_open.assert_called_once_with(expected_path, mode='w')
+
+    def test_write_sys_retry(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = [
+                OSError(16, 'Device or resource busy'),
+                io.StringIO(),
+            ]
+            self.assertIsNone(filesystem.write_sys('foo', 'bar'))
+            expected_path = os.path.join(filesystem.SYS, 'foo')
+            m_open.assert_has_calls([
+                mock.call(expected_path, mode='w'),
+                mock.call(expected_path, mode='w'),
+            ])
+
+    def test_write_sys_retry_limit(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = (
+                [OSError(16, 'Device or resource busy')] *
+                (filesystem.RETRY_LIMIT + 1))
+            self.assertRaises(
+                exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar')
diff --git a/nova/virt/libvirt/cpu/core.py b/nova/virt/libvirt/cpu/core.py
index 8274236850c..2d71bd60e40 100644
--- a/nova/virt/libvirt/cpu/core.py
+++ b/nova/virt/libvirt/cpu/core.py
@@ -56,13 +56,23 @@ def get_online(core: int) -> bool:
 
 @nova.privsep.sys_admin_pctxt.entrypoint
 def set_online(core: int) -> bool:
+    # failure to online a core should be considered a failure
+    # so we don't catch any exception here.
     filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1')
     return get_online(core)
 
 
 @nova.privsep.sys_admin_pctxt.entrypoint
 def set_offline(core: int) -> bool:
-    filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0')
+    try:
+        filesystem.write_sys(os.path.join(
+            gen_cpu_path(core), 'online'), data='0')
+    except exception.DeviceBusy:
+        # if nova is not able to offline a core it should not break anything
+        # so we just log a warning and return False to indicate that the core
+        # is not offline.
+        LOG.warning('Unable to offline CPU: %s', core)
+        return False
     return not get_online(core)
diff --git a/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml
new file mode 100644
index 00000000000..537d67b3838
--- /dev/null
+++ b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml
@@ -0,0 +1,21 @@
+---
+fixes:
+  - |
+    `Bug #2024258`_: Fixes an issue with performance degradation archiving
+    databases with large numbers of foreign key related records.
+
+    Previously, deleted rows were archived in batches of max_rows parents +
+    their child rows in a single database transaction. It limited how high
+    a value of max_rows could be specified by the user because of the size of
+    the database transaction it could generate. Symptoms of the behavior were
+    exceeding the maximum configured packet size of the database or timing out
+    due to a deadlock.
+
+    The behavior has been changed to archive batches of complete parent +
+    child rows trees while limiting each batch when it has reached >= max_rows
+    records. This allows the size of the database transaction to be controlled
+    by the user and enables more rows to be archived per invocation of
+    ``nova-manage db archive_deleted_rows`` when there are a large number of
+    foreign key related records.
+
+    .. _Bug #2024258: https://bugs.launchpad.net/nova/+bug/2024258
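To make the release note's "soft limit" phrasing concrete: a batch stops as soon as the accumulated row count reaches max_rows, but the tree that crosses the threshold is kept whole, so slightly more rows than max_rows may move in one transaction. A toy calculation with hypothetical tree sizes:

```python
# Hypothetical tree sizes: each entry is 1 parent row + its child rows.
tree_sizes = [4, 3, 5, 2]
max_rows = 10

archived = 0
for size in tree_sizes:
    archived += size
    if archived >= max_rows:
        break
print(archived)  # 12: the tree that crossed the limit is kept whole
```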