Skip to content

Commit

Permalink
Merge pull request #121 from stackhpc/upstream/2023.1-2024-10-21
Browse files Browse the repository at this point in the history
Synchronise 2023.1 with upstream
  • Loading branch information
priteau authored Oct 21, 2024
2 parents a2f49a7 + 945aabc commit 1c8031b
Show file tree
Hide file tree
Showing 12 changed files with 404 additions and 97 deletions.
35 changes: 25 additions & 10 deletions nova/cmd/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,10 @@ def version(self):
print(migration.db_version())

@args('--max_rows', type=int, metavar='<number>', dest='max_rows',
help='Maximum number of deleted rows to archive. Defaults to 1000. '
'Note that this number does not include the corresponding '
'rows, if any, that are removed from the API database for '
'deleted instances.')
help='Maximum number of deleted rows to archive per table. Defaults '
'to 1000. Note that this number is a soft limit and does not '
'include the corresponding rows, if any, that are removed '
'from the API database for deleted instances.')
@args('--before', metavar='<date>',
help=('Archive rows that have been deleted before this date. '
'Accepts date strings in the default format output by the '
Expand Down Expand Up @@ -432,7 +432,10 @@ def _do_archive(
'cell1.instances': 5}
:param cctxt: Cell-targeted nova.context.RequestContext if archiving
across all cells
:param max_rows: Maximum number of deleted rows to archive
:param max_rows: Maximum number of deleted rows to archive per table.
Note that this number is a soft limit and does not include the
corresponding rows, if any, that are removed from the API database
for deleted instances.
:param until_complete: Whether to run continuously until all deleted
rows are archived
:param verbose: Whether to print how many rows were archived per table
Expand All @@ -445,15 +448,26 @@ def _do_archive(
"""
ctxt = context.get_admin_context()
while True:
run, deleted_instance_uuids, total_rows_archived = \
# table_to_rows = {table_name: number_of_rows_archived}
# deleted_instance_uuids = ['uuid1', 'uuid2', ...]
table_to_rows, deleted_instance_uuids, total_rows_archived = \
db.archive_deleted_rows(
cctxt, max_rows, before=before_date, task_log=task_log)
for table_name, rows_archived in run.items():

for table_name, rows_archived in table_to_rows.items():
if cell_name:
table_name = cell_name + '.' + table_name
table_to_rows_archived.setdefault(table_name, 0)
table_to_rows_archived[table_name] += rows_archived
if deleted_instance_uuids:

# deleted_instance_uuids does not necessarily mean that any
# instances rows were archived because it is obtained by a query
# separate from the archive queries. For example, if a
# DBReferenceError was raised while processing the instances table,
# we would have skipped the table and had 0 rows archived even
# though deleted instances rows were found.
instances_archived = table_to_rows.get('instances', 0)
if deleted_instance_uuids and instances_archived:
table_to_rows_archived.setdefault(
'API_DB.instance_mappings', 0)
table_to_rows_archived.setdefault(
Expand All @@ -476,8 +490,9 @@ def _do_archive(

# If we're not archiving until there is nothing more to archive, we
# have reached max_rows in this cell DB or there was nothing to
# archive.
if not until_complete or not run:
# archive. We check the values() in case we get something like
# table_to_rows = {'instances': 0} back somehow.
if not until_complete or not any(table_to_rows.values()):
break
if verbose:
sys.stdout.write('.')
Expand Down
5 changes: 3 additions & 2 deletions nova/conf/pci.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,9 @@
its corresponding PF), otherwise they will be ignored and not
available for allocation.
- ``resource_class`` - optional Placement resource class name to be used
to track the matching PCI devices in Placement when [pci]device_spec is
True. It can be a standard resource class from the
to track the matching PCI devices in Placement when
[pci]report_in_placement is True.
It can be a standard resource class from the
``os-resource-classes`` lib. Or can be any string. In that case Nova will
normalize it to a proper Placement resource class by making it upper
case, replacing any consecutive character outside of ``[A-Z0-9_]`` with a
Expand Down
131 changes: 93 additions & 38 deletions nova/db/main/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4407,6 +4407,9 @@ def _archive_deleted_rows_for_table(
select = select.order_by(column).limit(max_rows)
with conn.begin():
rows = conn.execute(select).fetchall()

# This is a list of IDs of rows that should be archived from this table,
# limited to a length of max_rows.
records = [r[0] for r in rows]

# We will archive deleted rows for this table and also generate insert and
Expand All @@ -4419,51 +4422,103 @@ def _archive_deleted_rows_for_table(

# Keep track of any extra tablenames to number of rows that we archive by
# following FK relationships.
# {tablename: extra_rows_archived}
#
# extras = {tablename: number_of_extra_rows_archived}
extras = collections.defaultdict(int)
if records:
insert = shadow_table.insert().from_select(
columns, sql.select(table).where(column.in_(records))
).inline()
delete = table.delete().where(column.in_(records))

if not records:
# Nothing to archive, so return.
return rows_archived, deleted_instance_uuids, extras

# Keep track of how many rows we accumulate for the insert+delete database
# transaction and cap it as soon as it is >= max_rows. Because we will
# archive all child rows of a parent row along with the parent at the same
# time, we end up with extra rows to archive in addition to len(records).
num_rows_in_batch = 0
# The sequence of query statements we will execute in a batch. These are
# ordered: [child1, child1, parent1, child2, child2, child2, parent2, ...]
# Parent + child "trees" are kept together to avoid FK constraint
# violations.
statements_in_batch = []
# The list of records in the batch. This is used for collecting deleted
# instance UUIDs in the case of the 'instances' table.
records_in_batch = []

# (melwitt): We will gather rows related by foreign key relationship for
# each deleted row, one at a time. We do it this way to keep track of and
# limit the total number of rows that will be archived in a single database
# transaction. In a large scale database with potentially hundreds of
# thousands of deleted rows, if we don't limit the size of the transaction
# based on max_rows, we can get into a situation where we get stuck not
# able to make much progress. The value of max_rows has to be 1) small
# enough to not exceed the database's max packet size limit or timeout with
# a deadlock but 2) large enough to make progress in an environment with a
# constant high volume of create and delete traffic. By archiving each
# parent + child rows tree one at a time, we can ensure meaningful progress
# can be made while allowing the caller to predictably control the size of
# the database transaction with max_rows.
for record in records:
# Walk FK relationships and add insert/delete statements for rows that
# refer to this table via FK constraints. fk_inserts and fk_deletes
# will be prepended to by _get_fk_stmts if referring rows are found by
# FK constraints.
fk_inserts, fk_deletes = _get_fk_stmts(
metadata, conn, table, column, records)

# NOTE(tssurya): In order to facilitate the deletion of records from
# instance_mappings, request_specs and instance_group_member tables in
# the nova_api DB, the rows of deleted instances from the instances
# table are stored prior to their deletion. Basically the uuids of the
# archived instances are queried and returned.
if tablename == "instances":
query_select = sql.select(table.c.uuid).where(
table.c.id.in_(records)
)
with conn.begin():
rows = conn.execute(query_select).fetchall()
deleted_instance_uuids = [r[0] for r in rows]
metadata, conn, table, column, [record])
statements_in_batch.extend(fk_inserts + fk_deletes)
# statement to add parent row to shadow table
insert = shadow_table.insert().from_select(
columns, sql.select(table).where(column.in_([record]))).inline()
statements_in_batch.append(insert)
# statement to remove parent row from main table
delete = table.delete().where(column.in_([record]))
statements_in_batch.append(delete)

try:
# Group the insert and delete in a transaction.
with conn.begin():
for fk_insert in fk_inserts:
conn.execute(fk_insert)
for fk_delete in fk_deletes:
result_fk_delete = conn.execute(fk_delete)
extras[fk_delete.table.name] += result_fk_delete.rowcount
conn.execute(insert)
result_delete = conn.execute(delete)
rows_archived += result_delete.rowcount
except db_exc.DBReferenceError as ex:
# A foreign key constraint keeps us from deleting some of
# these rows until we clean up a dependent table. Just
# skip this table for now; we'll come back to it later.
LOG.warning("IntegrityError detected when archiving table "
"%(tablename)s: %(error)s",
{'tablename': tablename, 'error': str(ex)})
records_in_batch.append(record)

# Check whether we have a full batch >= max_rows. Rows are counted as
# the number of rows that will be moved in the database transaction.
# So each insert+delete pair represents one row that will be moved.
# 1 parent + its fks
num_rows_in_batch += 1 + len(fk_inserts)

if max_rows is not None and num_rows_in_batch >= max_rows:
break

# NOTE(tssurya): In order to facilitate the deletion of records from
# instance_mappings, request_specs and instance_group_member tables in the
# nova_api DB, the rows of deleted instances from the instances table are
# stored prior to their deletion. Basically the uuids of the archived
# instances are queried and returned.
if tablename == "instances":
query_select = sql.select(table.c.uuid).where(
table.c.id.in_(records_in_batch))
with conn.begin():
rows = conn.execute(query_select).fetchall()
# deleted_instance_uuids = ['uuid1', 'uuid2', ...]
deleted_instance_uuids = [r[0] for r in rows]

try:
# Group the insert and delete in a transaction.
with conn.begin():
for statement in statements_in_batch:
result = conn.execute(statement)
result_tablename = statement.table.name
# Add to archived row counts if not a shadow table.
if not result_tablename.startswith(_SHADOW_TABLE_PREFIX):
if result_tablename == tablename:
# Number of tablename (parent) rows archived.
rows_archived += result.rowcount
else:
# Number(s) of child rows archived.
extras[result_tablename] += result.rowcount

except db_exc.DBReferenceError as ex:
# A foreign key constraint keeps us from deleting some of these rows
# until we clean up a dependent table. Just skip this table for now;
# we'll come back to it later.
LOG.warning("IntegrityError detected when archiving table "
"%(tablename)s: %(error)s",
{'tablename': tablename, 'error': str(ex)})

conn.close()

Expand Down
4 changes: 4 additions & 0 deletions nova/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,6 +1131,10 @@ class FileNotFound(NotFound):
msg_fmt = _("File %(file_path)s could not be found.")


class DeviceBusy(NovaException):
    # Raised when a sysfs file access fails with EBUSY (errno 16); callers
    # (see nova/filesystem.py retry_if_busy) retry on this exception.
    msg_fmt = _("device %(file_path)s is busy.")


class ClassNotFound(NotFound):
msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")

Expand Down
55 changes: 52 additions & 3 deletions nova/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,48 +12,97 @@

"""Functions to address filesystem calls, particularly sysfs."""

import functools
import os
import time

from oslo_log import log as logging

from nova import exception

LOG = logging.getLogger(__name__)
SYS = '/sys'
RETRY_LIMIT = 5


SYS = '/sys'
# a retry decorator to handle EBUSY
def retry_if_busy(func):
    """Decorator to retry a function if it raises DeviceBusy.

    This decorator will retry the function RETRY_LIMIT=5 times if it raises
    DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on
    the second retry, and so on, up to RETRY_LIMIT seconds. If the function
    still raises DeviceBusy after RETRY_LIMIT retries, the exception will be
    raised.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # attempt is 1-based: it doubles as the backoff duration in seconds.
        for attempt in range(1, RETRY_LIMIT + 1):
            try:
                return func(*args, **kwargs)
            except exception.DeviceBusy as e:
                # Out of retries: let the final DeviceBusy propagate.
                if attempt >= RETRY_LIMIT:
                    raise
                LOG.debug(
                    f"File {e.kwargs['file_path']} is busy, "
                    f"sleeping {attempt} seconds before retrying")
                time.sleep(attempt)
    return wrapper

# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint


@retry_if_busy
def read_sys(path: str) -> str:
    """Reads the content of a file in the sys filesystem.

    :param path: relative or absolute. If relative, will be prefixed by /sys.
    :returns: contents of that file.
    :raises: nova.exception.FileNotFound if we can't read that file.
    :raises: nova.exception.DeviceBusy if the file is busy.
    """
    try:
        # The path can be absolute with a /sys prefix but that's fine.
        with open(os.path.join(SYS, path), mode='r') as data:
            return data.read()
    except OSError as exc:
        # errno 16 is EBUSY, which means the file is busy.
        if exc.errno == 16:
            raise exception.DeviceBusy(file_path=path) from exc
        # convert permission denied to file not found
        raise exception.FileNotFound(file_path=path) from exc
    except ValueError as exc:
        raise exception.FileNotFound(file_path=path) from exc


# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
# In order to correctly use it, you need to decorate the caller with a specific
# privsep entrypoint.
@retry_if_busy
def write_sys(path: str, data: str) -> None:
    """Writes the content of a file in the sys filesystem with data.

    :param path: relative or absolute. If relative, will be prefixed by /sys.
    :param data: the data to write.
    :returns: None.
    :raises: nova.exception.FileNotFound if we can't write that file.
    :raises: nova.exception.DeviceBusy if the file is busy.
    """
    try:
        # The path can be absolute with a /sys prefix but that's fine.
        with open(os.path.join(SYS, path), mode='w') as fd:
            fd.write(data)
    except OSError as exc:
        # errno 16 is EBUSY, which means the file is busy.
        if exc.errno == 16:
            raise exception.DeviceBusy(file_path=path) from exc
        # convert permission denied to file not found
        raise exception.FileNotFound(file_path=path) from exc
    except ValueError as exc:
        raise exception.FileNotFound(file_path=path) from exc
44 changes: 44 additions & 0 deletions nova/tests/functional/db/test_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,50 @@ def test_archive_deleted_rows_incomplete(self):
break
self.assertFalse(exceptions)

def test_archive_deleted_rows_parent_child_trees_one_at_time(self):
    """Test that we are archiving parent + child rows "trees" one at a time

    Previously, we archived deleted rows in batches of max_rows parents +
    their child rows in a single database transaction. Doing it that way
    limited how high a value of max_rows could be specified by the caller
    because of the size of the database transaction it could generate.

    For example, in a large scale deployment with hundreds of thousands of
    deleted rows and constant server creation and deletion activity, a
    value of max_rows=1000 might exceed the database's configured maximum
    packet size or timeout due to a database deadlock, forcing the operator
    to use a much lower max_rows value like 100 or 50.

    And when the operator has e.g. 500,000 deleted instances rows (and
    millions of deleted rows total) they are trying to archive, being
    forced to use a max_rows value several orders of magnitude lower than
    the number of rows they need to archive was a poor user experience.

    This tests that we are archiving each parent plus their child rows one
    at a time while limiting the total number of rows per table in a batch
    as soon as the total number of archived rows is >= max_rows.
    """
    # Boot two servers and delete them, then try to archive rows.
    for _ in range(2):
        server = self._create_server()
        self._delete_server(server)

    # Use max_rows=5 to limit the archive to one complete parent +
    # child rows tree.
    table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)

    # We should have archived only one instance because the parent +
    # child tree will have >= max_rows of 5.
    self.assertEqual(1, table_to_rows['instances'])

    # Archive once more to archive the other instance (and its child rows).
    table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
    self.assertEqual(1, table_to_rows['instances'])

    # There should be nothing else to archive.
    _, _, total_rows_archived = db.archive_deleted_rows(max_rows=100)
    self.assertEqual(0, total_rows_archived)

def _get_table_counts(self):
engine = db.get_engine()
conn = engine.connect()
Expand Down
Loading

0 comments on commit 1c8031b

Please sign in to comment.