diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index c2678d73686..5485e21bbea 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -257,10 +257,10 @@ def version(self):
         print(migration.db_version())
 
     @args('--max_rows', type=int, metavar='<number>', dest='max_rows',
-          help='Maximum number of deleted rows to archive. Defaults to 1000. '
-               'Note that this number does not include the corresponding '
-               'rows, if any, that are removed from the API database for '
-               'deleted instances.')
+          help='Maximum number of deleted rows to archive per table. Defaults '
+               'to 1000. Note that this number is a soft limit and does not '
+               'include the corresponding rows, if any, that are removed '
+               'from the API database for deleted instances.')
     @args('--before', metavar='<date>',
           help=('Archive rows that have been deleted before this date. '
                 'Accepts date strings in the default format output by the '
@@ -432,7 +432,10 @@ def _do_archive(
             'cell1.instances': 5}
         :param cctxt: Cell-targeted nova.context.RequestContext if archiving
             across all cells
-        :param max_rows: Maximum number of deleted rows to archive
+        :param max_rows: Maximum number of deleted rows to archive per table.
+            Note that this number is a soft limit and does not include the
+            corresponding rows, if any, that are removed from the API database
+            for deleted instances.
        :param until_complete: Whether to run continuously until all deleted
            rows are archived
        :param verbose: Whether to print how many rows were archived per table
@@ -445,15 +448,26 @@ def _do_archive(
        """
        ctxt = context.get_admin_context()
        while True:
-            run, deleted_instance_uuids, total_rows_archived = \
+            # table_to_rows = {table_name: number_of_rows_archived}
+            # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+            table_to_rows, deleted_instance_uuids, total_rows_archived = \
                db.archive_deleted_rows(
                    cctxt, max_rows, before=before_date, task_log=task_log)
-            for table_name, rows_archived in run.items():
+
+            for table_name, rows_archived in table_to_rows.items():
                if cell_name:
                    table_name = cell_name + '.' + table_name
                table_to_rows_archived.setdefault(table_name, 0)
                table_to_rows_archived[table_name] += rows_archived
-            if deleted_instance_uuids:
+
+            # deleted_instance_uuids does not necessarily mean that any
+            # instances rows were archived because it is obtained by a query
+            # separate from the archive queries. For example, if a
+            # DBReferenceError was raised while processing the instances table,
+            # we would have skipped the table and had 0 rows archived even
+            # though deleted instances rows were found.
+            instances_archived = table_to_rows.get('instances', 0)
+            if deleted_instance_uuids and instances_archived:
                table_to_rows_archived.setdefault(
                    'API_DB.instance_mappings', 0)
                table_to_rows_archived.setdefault(
@@ -476,8 +490,9 @@ def _do_archive(
 
            # If we're not archiving until there is nothing more to archive, we
            # have reached max_rows in this cell DB or there was nothing to
-            # archive.
-            if not until_complete or not run:
+            # archive. We check the values() in case we get something like
+            # table_to_rows = {'instances': 0} back somehow.
+            if not until_complete or not any(table_to_rows.values()):
                break
            if verbose:
                sys.stdout.write('.')
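For readers skimming the `_do_archive` change above: here is a minimal, self-contained sketch (plain Python with hypothetical counts, not Nova code) of how the per-cell results returned by `db.archive_deleted_rows` accumulate into `table_to_rows_archived`:

```python
# Sketch of the accumulation loop in _do_archive, with hypothetical data.
# Each call to db.archive_deleted_rows() returns a {table: rows} dict;
# results are keyed by "<cell>.<table>" when archiving across cells.
table_to_rows_archived = {}

def accumulate(table_to_rows, cell_name=None):
    for table_name, rows_archived in table_to_rows.items():
        if cell_name:
            table_name = cell_name + '.' + table_name
        table_to_rows_archived.setdefault(table_name, 0)
        table_to_rows_archived[table_name] += rows_archived

# Two iterations of the while loop for one cell:
accumulate({'instances': 2, 'instance_extra': 2}, cell_name='cell1')
accumulate({'instances': 1, 'instance_extra': 1}, cell_name='cell1')
print(table_to_rows_archived)
# {'cell1.instances': 3, 'cell1.instance_extra': 3}
```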
diff --git a/nova/conf/pci.py b/nova/conf/pci.py
index 533bf52eadd..64cb89a787b 100644
--- a/nova/conf/pci.py
+++ b/nova/conf/pci.py
@@ -175,8 +175,9 @@
   its corresponding PF), otherwise they will be ignored and not available for
   allocation.
 - ``resource_class`` - optional Placement resource class name to be used
-  to track the matching PCI devices in Placement when [pci]device_spec is
-  True. It can be a standard resource class from the
+  to track the matching PCI devices in Placement when
+  [pci]report_in_placement is True.
+  It can be a standard resource class from the
   ``os-resource-classes`` lib. Or can be any string. In that case Nova will
   normalize it to a proper Placement resource class by making it upper case,
   replacing any consecutive character outside of ``[A-Z0-9_]`` with a
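The normalization described in the help text above (the hunk context cuts off mid-sentence) can be sketched roughly as follows. This is an illustrative guess: the single-underscore replacement and the ``CUSTOM_`` prefix are assumptions based on the usual Placement custom resource class convention, not taken from this hunk:

```python
import re

def normalize_resource_class(name: str) -> str:
    # Assumed rule: upper-case, collapse each run of characters outside
    # [A-Z0-9_] into one '_', and prefix with CUSTOM_ (the prefix is an
    # assumption; the hunk context above is truncated).
    norm = re.sub(r'[^A-Z0-9_]+', '_', name.upper())
    if not norm.startswith('CUSTOM_'):
        norm = 'CUSTOM_' + norm
    return norm

print(normalize_resource_class('my-dev.type1'))  # CUSTOM_MY_DEV_TYPE1
```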
diff --git a/nova/db/main/api.py b/nova/db/main/api.py
index 7d24f974f91..ae6533f49cd 100644
--- a/nova/db/main/api.py
+++ b/nova/db/main/api.py
@@ -4407,6 +4407,9 @@ def _archive_deleted_rows_for_table(
     select = select.order_by(column).limit(max_rows)
     with conn.begin():
         rows = conn.execute(select).fetchall()
+
+    # This is a list of IDs of rows that should be archived from this table,
+    # limited to a length of max_rows.
     records = [r[0] for r in rows]
 
     # We will archive deleted rows for this table and also generate insert and
@@ -4419,51 +4422,103 @@ def _archive_deleted_rows_for_table(
     # Keep track of any extra tablenames to number of rows that we archive by
     # following FK relationships.
-    # {tablename: extra_rows_archived}
+    #
+    # extras = {tablename: number_of_extra_rows_archived}
     extras = collections.defaultdict(int)
-    if records:
-        insert = shadow_table.insert().from_select(
-            columns, sql.select(table).where(column.in_(records))
-        ).inline()
-        delete = table.delete().where(column.in_(records))
+
+    if not records:
+        # Nothing to archive, so return.
+        return rows_archived, deleted_instance_uuids, extras
+
+    # Keep track of how many rows we accumulate for the insert+delete database
+    # transaction and cap it as soon as it is >= max_rows. Because we will
+    # archive all child rows of a parent row along with the parent at the same
+    # time, we end up with extra rows to archive in addition to len(records).
+    num_rows_in_batch = 0
+    # The sequence of query statements we will execute in a batch. These are
+    # ordered: [child1, child1, parent1, child2, child2, child2, parent2, ...]
+    # Parent + child "trees" are kept together to avoid FK constraint
+    # violations.
+    statements_in_batch = []
+    # The list of records in the batch. This is used for collecting deleted
+    # instance UUIDs in the case of the 'instances' table.
+    records_in_batch = []
+
+    # NOTE(melwitt): We will gather rows related by foreign key relationship
+    # for each deleted row, one at a time. We do it this way to keep track of
+    # and limit the total number of rows that will be archived in a single
+    # database transaction. In a large scale database with potentially
+    # hundreds of thousands of deleted rows, if we don't limit the size of the
+    # transaction based on max_rows, we can get into a situation where we get
+    # stuck not able to make much progress. The value of max_rows has to be
+    # 1) small enough to not exceed the database's max packet size limit or
+    # timeout with a deadlock but 2) large enough to make progress in an
+    # environment with a constant high volume of create and delete traffic.
+    # By archiving each parent + child rows tree one at a time, we can ensure
+    # meaningful progress can be made while allowing the caller to predictably
+    # control the size of the database transaction with max_rows.
+    for record in records:
         # Walk FK relationships and add insert/delete statements for rows that
         # refer to this table via FK constraints. fk_inserts and fk_deletes
         # will be prepended to by _get_fk_stmts if referring rows are found by
         # FK constraints.
         fk_inserts, fk_deletes = _get_fk_stmts(
-            metadata, conn, table, column, records)
-
-        # NOTE(tssurya): In order to facilitate the deletion of records from
-        # instance_mappings, request_specs and instance_group_member tables in
-        # the nova_api DB, the rows of deleted instances from the instances
-        # table are stored prior to their deletion. Basically the uuids of the
-        # archived instances are queried and returned.
-        if tablename == "instances":
-            query_select = sql.select(table.c.uuid).where(
-                table.c.id.in_(records)
-            )
-            with conn.begin():
-                rows = conn.execute(query_select).fetchall()
-            deleted_instance_uuids = [r[0] for r in rows]
+            metadata, conn, table, column, [record])
+        statements_in_batch.extend(fk_inserts + fk_deletes)
+        # statement to add parent row to shadow table
+        insert = shadow_table.insert().from_select(
+            columns, sql.select(table).where(column.in_([record]))).inline()
+        statements_in_batch.append(insert)
+        # statement to remove parent row from main table
+        delete = table.delete().where(column.in_([record]))
+        statements_in_batch.append(delete)
 
-        try:
-            # Group the insert and delete in a transaction.
-            with conn.begin():
-                for fk_insert in fk_inserts:
-                    conn.execute(fk_insert)
-                for fk_delete in fk_deletes:
-                    result_fk_delete = conn.execute(fk_delete)
-                    extras[fk_delete.table.name] += result_fk_delete.rowcount
-                conn.execute(insert)
-                result_delete = conn.execute(delete)
-                rows_archived += result_delete.rowcount
-        except db_exc.DBReferenceError as ex:
-            # A foreign key constraint keeps us from deleting some of
-            # these rows until we clean up a dependent table. Just
-            # skip this table for now; we'll come back to it later.
-            LOG.warning("IntegrityError detected when archiving table "
-                        "%(tablename)s: %(error)s",
-                        {'tablename': tablename, 'error': str(ex)})
+        records_in_batch.append(record)
+
+        # Check whether we have a full batch >= max_rows. Rows are counted as
+        # the number of rows that will be moved in the database transaction.
+        # So each insert+delete pair represents one row that will be moved.
+        # 1 parent + its fks
+        num_rows_in_batch += 1 + len(fk_inserts)
+
+        if max_rows is not None and num_rows_in_batch >= max_rows:
+            break
+
+    # NOTE(tssurya): In order to facilitate the deletion of records from
+    # instance_mappings, request_specs and instance_group_member tables in the
+    # nova_api DB, the rows of deleted instances from the instances table are
+    # stored prior to their deletion. Basically the uuids of the archived
+    # instances are queried and returned.
+    if tablename == "instances":
+        query_select = sql.select(table.c.uuid).where(
+            table.c.id.in_(records_in_batch))
+        with conn.begin():
+            rows = conn.execute(query_select).fetchall()
+        # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+        deleted_instance_uuids = [r[0] for r in rows]
+
+    try:
+        # Group the insert and delete in a transaction.
+        with conn.begin():
+            for statement in statements_in_batch:
+                result = conn.execute(statement)
+                result_tablename = statement.table.name
+                # Add to archived row counts if not a shadow table.
+                if not result_tablename.startswith(_SHADOW_TABLE_PREFIX):
+                    if result_tablename == tablename:
+                        # Number of tablename (parent) rows archived.
+                        rows_archived += result.rowcount
+                    else:
+                        # Number(s) of child rows archived.
+                        extras[result_tablename] += result.rowcount
+
+    except db_exc.DBReferenceError as ex:
+        # A foreign key constraint keeps us from deleting some of these rows
+        # until we clean up a dependent table. Just skip this table for now;
+        # we'll come back to it later.
+        LOG.warning("IntegrityError detected when archiving table "
+                    "%(tablename)s: %(error)s",
+                    {'tablename': tablename, 'error': str(ex)})
 
     conn.close()
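The batching strategy introduced in `_archive_deleted_rows_for_table` can be pictured without SQLAlchemy. Below is a simplified stand-in with hypothetical data (`children_of` plays the role of `_get_fk_stmts`), showing that parent + child trees stay whole and that max_rows is a soft limit:

```python
# Hypothetical FK data: maps a parent row ID to the rows referencing it.
children_of = {
    1: ['extra-1', 'metadata-1a', 'metadata-1b'],
    2: ['extra-2'],
    3: [],
}

def batch_records(records, max_rows):
    batch = []
    num_rows_in_batch = 0
    for record in records:
        children = children_of.get(record, [])
        # Keep each parent + child tree whole to avoid FK violations.
        batch.append((record, children))
        # 1 parent + its children, mirroring num_rows_in_batch above
        num_rows_in_batch += 1 + len(children)
        if max_rows is not None and num_rows_in_batch >= max_rows:
            break
    return batch, num_rows_in_batch

batch, count = batch_records([1, 2, 3], max_rows=3)
print(count)  # 4: the soft limit lets the tree in flight finish
print([parent for parent, _ in batch])  # [1]
```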
diff --git a/nova/exception.py b/nova/exception.py
index 85e0a9c85dd..f319cf58a2c 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -1131,6 +1131,10 @@ class FileNotFound(NotFound):
     msg_fmt = _("File %(file_path)s could not be found.")
 
 
+class DeviceBusy(NovaException):
+    msg_fmt = _("device %(file_path)s is busy.")
+
+
 class ClassNotFound(NotFound):
     msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")
diff --git a/nova/filesystem.py b/nova/filesystem.py
index 5394d2d8351..2db3093236e 100644
--- a/nova/filesystem.py
+++ b/nova/filesystem.py
@@ -12,37 +12,79 @@
 """Functions to address filesystem calls, particularly sysfs."""
 
+import functools
 import os
+import time
 
 from oslo_log import log as logging
 
 from nova import exception
 
 LOG = logging.getLogger(__name__)
 
+SYS = '/sys'
+RETRY_LIMIT = 5
 
-SYS = '/sys'
 
+# a retry decorator to handle EBUSY
+def retry_if_busy(func):
+    """Decorator to retry a function if it raises DeviceBusy.
+
+    This decorator will attempt the function up to RETRY_LIMIT=5 times if it
+    raises DeviceBusy. It will sleep for 1 second before the first retry, 2
+    seconds before the second retry, and so on. If the function still raises
+    DeviceBusy on the last attempt, the exception is re-raised.
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        for i in range(RETRY_LIMIT):
+            try:
+                return func(*args, **kwargs)
+            except exception.DeviceBusy as e:
+                # if we have retried RETRY_LIMIT times, raise the exception
+                # otherwise, sleep and retry, i is 0-based so we need
+                # to add 1 to it
+                count = i + 1
+                if count < RETRY_LIMIT:
+                    LOG.debug(
+                        f"File {e.kwargs['file_path']} is busy, "
+                        f"sleeping {count} seconds before retrying")
+                    time.sleep(count)
+                    continue
+                raise
+    return wrapper
 
 
 # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
+@retry_if_busy
 def read_sys(path: str) -> str:
     """Reads the content of a file in the sys filesystem.
 
     :param path: relative or absolute. If relative, will be prefixed by /sys.
     :returns: contents of that file.
     :raises: nova.exception.FileNotFound if we can't read that file.
+    :raises: nova.exception.DeviceBusy if the file is busy.
     """
     try:
         # The path can be absolute with a /sys prefix but that's fine.
         with open(os.path.join(SYS, path), mode='r') as data:
             return data.read()
-    except (OSError, ValueError) as exc:
+    except OSError as exc:
+        # errno 16 is EBUSY, which means the file is busy.
+        if exc.errno == 16:
+            raise exception.DeviceBusy(file_path=path) from exc
+        # convert permission denied to file not found
+        raise exception.FileNotFound(file_path=path) from exc
+    except ValueError as exc:
         raise exception.FileNotFound(file_path=path) from exc
 
 
 # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
 # In order to correctly use it, you need to decorate the caller with a specific
 # privsep entrypoint.
+@retry_if_busy
 def write_sys(path: str, data: str) -> None:
     """Writes the content of a file in the sys filesystem with data.
 
@@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None:
     :param data: the data to write.
     :returns: contents of that file.
     :raises: nova.exception.FileNotFound if we can't write that file.
+    :raises: nova.exception.DeviceBusy if the file is busy.
     """
     try:
         # The path can be absolute with a /sys prefix but that's fine.
         with open(os.path.join(SYS, path), mode='w') as fd:
             fd.write(data)
-    except (OSError, ValueError) as exc:
+    except OSError as exc:
+        # errno 16 is EBUSY, which means the file is busy.
+        if exc.errno == 16:
+            raise exception.DeviceBusy(file_path=path) from exc
+        # convert permission denied to file not found
+        raise exception.FileNotFound(file_path=path) from exc
+    except ValueError as exc:
         raise exception.FileNotFound(file_path=path) from exc
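The retry behavior added to `nova/filesystem.py` is easiest to see in isolation. Here is a self-contained re-implementation of the same pattern (stub `DeviceBusy` standing in for `nova.exception.DeviceBusy`, no Nova imports) so the linear backoff can be exercised directly:

```python
import functools
import time

RETRY_LIMIT = 5

class DeviceBusy(Exception):
    """Stand-in for nova.exception.DeviceBusy."""

def retry_if_busy(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for i in range(RETRY_LIMIT):
            try:
                return func(*args, **kwargs)
            except DeviceBusy:
                count = i + 1
                if count < RETRY_LIMIT:
                    # linear backoff: sleeps 1, 2, 3, 4 seconds between
                    # the five attempts before giving up
                    time.sleep(count)
                    continue
                raise
    return wrapper

attempts = []

@retry_if_busy
def flaky():
    attempts.append(1)
    if len(attempts) < 3:
        raise DeviceBusy()
    return 'ok'

print(flaky(), len(attempts))  # ok 3 (after sleeping 1s, then 2s)
```

In the worst case the decorated call makes five attempts, sleeping 1 + 2 + 3 + 4 = 10 seconds in total before the last failure propagates.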
diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py
index 4a813d10a7c..1bcce9f3ee0 100644
--- a/nova/tests/functional/db/test_archive.py
+++ b/nova/tests/functional/db/test_archive.py
@@ -174,6 +174,50 @@ def test_archive_deleted_rows_incomplete(self):
                 break
         self.assertFalse(exceptions)
 
+    def test_archive_deleted_rows_parent_child_trees_one_at_time(self):
+        """Test that we are archiving parent + child rows "trees" one at a time
+
+        Previously, we archived deleted rows in batches of max_rows parents +
+        their child rows in a single database transaction. Doing it that way
+        limited how high a value of max_rows could be specified by the caller
+        because of the size of the database transaction it could generate.
+
+        For example, in a large scale deployment with hundreds of thousands of
+        deleted rows and constant server creation and deletion activity, a
+        value of max_rows=1000 might exceed the database's configured maximum
+        packet size or timeout due to a database deadlock, forcing the operator
+        to use a much lower max_rows value like 100 or 50.
+
+        And when the operator has e.g. 500,000 deleted instances rows (and
+        millions of deleted rows total) they are trying to archive, being
+        forced to use a max_rows value several orders of magnitude lower than
+        the number of rows they need to archive was a poor user experience.
+
+        This tests that we are archiving each parent plus their child rows one
+        at a time while limiting the total number of rows per table in a batch
+        as soon as the total number of archived rows is >= max_rows.
+        """
+        # Boot two servers and delete them, then try to archive rows.
+        for i in range(2):
+            server = self._create_server()
+            self._delete_server(server)
+
+        # Use max_rows=5 to limit the archive to one complete parent +
+        # child rows tree.
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+
+        # We should have archived only one instance because the parent +
+        # child tree will have >= max_rows of 5.
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # Archive once more to archive the other instance (and its child rows).
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # There should be nothing else to archive.
+        _, _, total_rows_archived = db.archive_deleted_rows(max_rows=100)
+        self.assertEqual(0, total_rows_archived)
+
     def _get_table_counts(self):
         engine = db.get_engine()
         conn = engine.connect()
diff --git a/nova/tests/functional/libvirt/test_power_manage.py b/nova/tests/functional/libvirt/test_power_manage.py
index f6e4b1ba5b4..3839c798317 100644
--- a/nova/tests/functional/libvirt/test_power_manage.py
+++ b/nova/tests/functional/libvirt/test_power_manage.py
@@ -10,10 +10,10 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-from unittest import mock
-
 import fixtures
 
+from unittest import mock
+
 from nova import context as nova_context
 from nova import exception
 from nova import objects
@@ -240,7 +240,6 @@ def setUp(self):
                                           cpu_cores=5, cpu_threads=2)
         self.compute1 = self.start_compute(host_info=self.host_info,
                                            hostname='compute1')
-
         # All cores are shutdown at startup, let's check.
         cpu_dedicated_set = hardware.get_cpu_dedicated_set()
         self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
@@ -266,6 +265,34 @@ def test_create_server(self):
         cpu_dedicated_set = hardware.get_cpu_dedicated_set()
         unused_cpus = cpu_dedicated_set - instance_pcpus
         self._assert_cpu_set_state(unused_cpus, expected='offline')
+        return server
+
+    def test_delete_server(self):
+        server = self.test_create_server()
+        self._delete_server(server)
+        # Let's verify that the pinned CPUs are now offline
+        cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+        self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
+
+    def test_delete_server_device_busy(self):
+        # This test verifies bug 2065927 is resolved.
+        server = self.test_create_server()
+        inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
+        instance_pcpus = inst.numa_topology.cpu_pinning
+        self._assert_cpu_set_state(instance_pcpus, expected='online')
+        with mock.patch(
+            'nova.filesystem.write_sys.__wrapped__',
+            side_effect=[
+                exception.DeviceBusy(file_path='fake'),
+                None]):
+
+            self._delete_server(server)
+            cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+            # Verify that the unused CPUs are still offline
+            unused_cpus = cpu_dedicated_set - instance_pcpus
+            self._assert_cpu_set_state(unused_cpus, expected='offline')
+            # and the pinned CPUs are offline
+            self._assert_cpu_set_state(instance_pcpus, expected='offline')
 
     def test_create_server_with_emulator_threads_isolate(self):
         server = self._create_server(
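`test_delete_server_device_busy` above patches `nova.filesystem.write_sys.__wrapped__`. That patch target exists because `retry_if_busy` uses `functools.wraps`, which stores the undecorated function on the wrapper. A standalone sketch of the mechanism (hypothetical `deco`/`write` names):

```python
import functools
from unittest import mock

def deco(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper

@deco
def write(data):
    return 'wrote %s' % data

# functools.wraps records the original function as an attribute, which is
# what makes a dotted patch target ending in .__wrapped__ resolvable.
print(write.__wrapped__('x'))  # wrote x
with mock.patch(__name__ + '.write.__wrapped__', return_value='mocked'):
    print(write.__wrapped__('x'))  # mocked
```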
diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py
index 888b43cea09..31996df2d72 100644
--- a/nova/tests/functional/test_nova_manage.py
+++ b/nova/tests/functional/test_nova_manage.py
@@ -2238,57 +2238,71 @@ def setUp(self):
 
     def test_archive_deleted_rows(self):
         admin_context = context.get_admin_context(read_deleted='yes')
-        # Boot a server to cell1
-        server_ids = {}
-        server = self._build_server(az='nova:host1')
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ACTIVE')
-        server_ids['cell1'] = created_server['id']
-        # Boot a server to cell2
-        server = self._build_server(az='nova:host2')
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ACTIVE')
-        server_ids['cell2'] = created_server['id']
-        # Boot a server to cell0 (cause ERROR state prior to schedule)
-        server = self._build_server()
-        # Flavor m1.xlarge cannot be fulfilled
-        server['flavorRef'] = 'http://fake.server/5'
-        created_server = self.api.post_server({'server': server})
-        self._wait_for_state_change(created_server, 'ERROR')
-        server_ids['cell0'] = created_server['id']
+        server_ids_by_cell = collections.defaultdict(list)
+        # Create two servers per cell to make sure the archive loop for each
+        # table iterates at least once.
+        for i in range(2):
+            # Boot a server to cell1
+            server = self._build_server(az='nova:host1')
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ACTIVE')
+            server_ids_by_cell['cell1'].append(created_server['id'])
+            # Boot a server to cell2
+            server = self._build_server(az='nova:host2')
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ACTIVE')
+            server_ids_by_cell['cell2'].append(created_server['id'])
+            # Boot a server to cell0 (cause ERROR state prior to schedule)
+            server = self._build_server()
+            # Flavor m1.xlarge cannot be fulfilled
+            server['flavorRef'] = 'http://fake.server/5'
+            created_server = self.api.post_server({'server': server})
+            self._wait_for_state_change(created_server, 'ERROR')
+            server_ids_by_cell['cell0'].append(created_server['id'])
+
         # Verify all the servers are in the databases
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                objects.Instance.get_by_uuid(cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    objects.Instance.get_by_uuid(cctxt, server_id)
         # Delete the servers
-        for cell_name in server_ids.keys():
-            self.api.delete_server(server_ids[cell_name])
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                self.api.delete_server(server_id)
         # Verify all the servers are in the databases still (as soft deleted)
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                objects.Instance.get_by_uuid(cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    objects.Instance.get_by_uuid(cctxt, server_id)
         # Archive the deleted rows
         self.cli.archive_deleted_rows(verbose=True, all_cells=True)
-        # Three instances should have been archived (cell0, cell1, cell2)
+        # 6 instances should have been archived (cell0, cell1, cell2)
         self.assertRegex(self.output.getvalue(),
-                         r"| cell0\.instances.*\| 1.*")
+                         r"\| cell0\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| cell1\.instances.*\| 1.*")
+                         r"\| cell1\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| cell2\.instances.*\| 1.*")
+                         r"\| cell2\.instances\s+\| 2")
         self.assertRegex(self.output.getvalue(),
-                         r"| API_DB\.instance_mappings.*\| 3.*")
+                         r"\| API_DB\.instance_mappings\s+\| 6")
         self.assertRegex(self.output.getvalue(),
-                         r"| API_DB\.request_specs.*\| 3.*")
+                         r"\| API_DB\.request_specs\s+\| 6")
         # Verify all the servers are gone from the cell databases
-        for cell_name, server_id in server_ids.items():
-            with context.target_cell(admin_context,
-                                     self.cell_mappings[cell_name]) as cctxt:
-                self.assertRaises(exception.InstanceNotFound,
-                                  objects.Instance.get_by_uuid,
-                                  cctxt, server_id)
+        for cell_name, server_ids in server_ids_by_cell.items():
+            for server_id in server_ids:
+                with context.target_cell(
+                    admin_context,
+                    self.cell_mappings[cell_name]
+                ) as cctxt:
+                    self.assertRaises(exception.InstanceNotFound,
+                                      objects.Instance.get_by_uuid,
+                                      cctxt, server_id)
 
 
 class TestDBArchiveDeletedRowsMultiCellTaskLog(
diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py
index fca97180e4d..0756624f6e6 100644
--- a/nova/tests/unit/cmd/test_manage.py
+++ b/nova/tests/unit/cmd/test_manage.py
@@ -501,6 +501,26 @@ def test_archive_deleted_rows_verbose_no_results(self, mock_get_all,
         self.assertIn('Nothing was archived.', output)
         self.assertEqual(0, result)
 
+    @mock.patch.object(db, 'archive_deleted_rows')
+    @mock.patch.object(objects.CellMappingList, 'get_all')
+    def test_archive_deleted_rows_deleted_instances_no_rows(self, mock_get_all,
+                                                            mock_db_archive):
+        # Simulate a scenario where we didn't archive any rows (example:
+        # DBReferenceError was raised while processing the instances table) but
+        # the separate query for deleted_instance_uuids found rows.
+        mock_db_archive.return_value = (
+            {}, [uuidsentinel.instance1, uuidsentinel.instance2], 0)
+
+        result = self.commands.archive_deleted_rows(20, verbose=True)
+
+        mock_db_archive.assert_called_once_with(
+            test.MatchType(context.RequestContext), 20, before=None,
+            task_log=False)
+        output = self.output.getvalue()
+        # If nothing was archived, there should be no purge messages
+        self.assertIn('Nothing was archived.', output)
+        self.assertEqual(0, result)
+
     @mock.patch.object(db, 'archive_deleted_rows')
     @mock.patch.object(objects.RequestSpec, 'destroy_bulk')
     @mock.patch.object(objects.InstanceGroup, 'destroy_members_bulk')
diff --git a/nova/tests/unit/test_filesystem.py b/nova/tests/unit/test_filesystem.py
index 85f16157eeb..5c3107472d8 100644
--- a/nova/tests/unit/test_filesystem.py
+++ b/nova/tests/unit/test_filesystem.py
@@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import io
 import os
 from unittest import mock
 
@@ -35,6 +36,29 @@ def test_read_sys_error(self):
         expected_path = os.path.join(filesystem.SYS, 'foo')
         m_open.assert_called_once_with(expected_path, mode='r')
 
+    def test_read_sys_retry(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = [
+                OSError(16, 'Device or resource busy'),
+                io.StringIO('bar'),
+            ]
+            self.assertEqual('bar', filesystem.read_sys('foo'))
+            expected_path = os.path.join(filesystem.SYS, 'foo')
+            m_open.assert_has_calls([
+                mock.call(expected_path, mode='r'),
+                mock.call(expected_path, mode='r'),
+            ])
+
+    def test_read_sys_retry_limit(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = (
+                [OSError(16, 'Device or resource busy')] *
+                (filesystem.RETRY_LIMIT + 1))
+            self.assertRaises(
+                exception.DeviceBusy, filesystem.read_sys, 'foo')
+
     def test_write_sys(self):
         open_mock = mock.mock_open()
         with mock.patch('builtins.open', open_mock) as m_open:
@@ -50,3 +74,26 @@ def test_write_sys_error(self):
                           filesystem.write_sys, 'foo', 'bar')
         expected_path = os.path.join(filesystem.SYS, 'foo')
         m_open.assert_called_once_with(expected_path, mode='w')
+
+    def test_write_sys_retry(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = [
+                OSError(16, 'Device or resource busy'),
+                io.StringIO(),
+            ]
+            self.assertIsNone(filesystem.write_sys('foo', 'bar'))
+            expected_path = os.path.join(filesystem.SYS, 'foo')
+            m_open.assert_has_calls([
+                mock.call(expected_path, mode='w'),
+                mock.call(expected_path, mode='w'),
+            ])
+
+    def test_write_sys_retry_limit(self):
+        open_mock = mock.mock_open()
+        with mock.patch('builtins.open', open_mock) as m_open:
+            m_open.side_effect = (
+                [OSError(16, 'Device or resource busy')] *
+                (filesystem.RETRY_LIMIT + 1))
+            self.assertRaises(
+                exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar')
diff --git a/nova/virt/libvirt/cpu/core.py b/nova/virt/libvirt/cpu/core.py
index 8274236850c..2d71bd60e40 100644
--- a/nova/virt/libvirt/cpu/core.py
+++ b/nova/virt/libvirt/cpu/core.py
@@ -56,13 +56,23 @@ def get_online(core: int) -> bool:
 
 @nova.privsep.sys_admin_pctxt.entrypoint
 def set_online(core: int) -> bool:
+    # failure to online a core should be considered a failure
+    # so we don't catch any exception here.
     filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1')
     return get_online(core)
 
 
 @nova.privsep.sys_admin_pctxt.entrypoint
 def set_offline(core: int) -> bool:
-    filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0')
+    try:
+        filesystem.write_sys(os.path.join(
+            gen_cpu_path(core), 'online'), data='0')
+    except exception.DeviceBusy:
+        # if nova is not able to offline a core it should not break anything
+        # so we just log a warning and return False to indicate that the core
+        # is not offline.
+        LOG.warning('Unable to offline CPU: %s', core)
+        return False
     return not get_online(core)
diff --git a/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml
new file mode 100644
index 00000000000..537d67b3838
--- /dev/null
+++ b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml
@@ -0,0 +1,21 @@
+---
+fixes:
+  - |
+    `Bug #2024258`_: Fixes an issue with performance degradation archiving
+    databases with large numbers of foreign key related records.
+
+    Previously, deleted rows were archived in batches of max_rows parents +
+    their child rows in a single database transaction. It limited how high
+    a value of max_rows could be specified by the user because of the size of
+    the database transaction it could generate. Symptoms of the behavior were
+    exceeding the maximum configured packet size of the database or timing out
+    due to a deadlock.
+
+    The behavior has been changed to archive batches of complete parent +
+    child rows trees while limiting each batch when it has reached >= max_rows
+    records. This allows the size of the database transaction to be controlled
+    by the user and enables more rows to be archived per invocation of
+    ``nova-manage db archive_deleted_rows`` when there are a large number of
+    foreign key related records.
+
+    .. _Bug #2024258: https://bugs.launchpad.net/nova/+bug/2024258
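To make the release note's "soft limit" phrasing concrete: a batch stops as soon as the accumulated row count reaches max_rows, but the tree that crosses the threshold is kept whole, so slightly more rows than max_rows may move in one transaction. A toy calculation with hypothetical tree sizes:

```python
# Hypothetical tree sizes: each entry is 1 parent row + its child rows.
tree_sizes = [4, 3, 5, 2]
max_rows = 10

archived = 0
for size in tree_sizes:
    archived += size
    if archived >= max_rows:
        break
print(archived)  # 12: the tree that crossed the limit is kept whole
```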