From 0cc9990717fc7fa59cfcd635605fb581020e53d1 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 15 Nov 2023 16:57:01 +0000 Subject: [PATCH 1/7] fact gathering: Remove gathering of package facts These were added in I20e65a6771ebeee462a3aaaabaa5f0596bdd0581 but not used in the final version of the changeset. Change-Id: I4fca47e16e87db8b655ff5571fba568ef4edb95c (cherry picked from commit 8709db2a257849a34c745b68282ebc5907d39e00) --- ansible/gather-facts.yml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/ansible/gather-facts.yml b/ansible/gather-facts.yml index b50389660a..f5247a6ab9 100644 --- a/ansible/gather-facts.yml +++ b/ansible/gather-facts.yml @@ -15,13 +15,6 @@ when: - not ansible_facts - - name: Gather package facts - package_facts: - when: - - "'packages' not in ansible_facts" - - kolla_action is defined - - kolla_action == "precheck" - - name: Group hosts to determine when using --limit group_by: key: "all_using_limit_{{ (ansible_play_batch | length) != (groups['all'] | length) }}" @@ -56,14 +49,4 @@ # We gathered facts for all hosts in the batch during the first play. when: - not hostvars[item].ansible_facts - - - name: Gather package facts - package_facts: - delegate_facts: True - delegate_to: "{{ item }}" - with_items: "{{ delegate_hosts }}" - when: - - "'packages' not in hostvars[item].ansible_facts" - - kolla_action is defined - - "kolla_action == 'precheck'" tags: always From 1a8c74a209a46fcc460bb204e3848cc47775ff23 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 30 Oct 2023 14:36:48 +0000 Subject: [PATCH 2/7] fact gathering: Group hosts before gathering facts If not using --limit, each host runs the setup module for itself only. Hosts should be able to fail at this point and drop out of execution without affecting others. However, there is a bug that causes us to hit the --limit code path for an unreachable host and therefore become subject to its limitations. 
These include the use of delegated fact gathering, and failing the delegated host when the delegating host is unreachable. This change fixes the issue by reversing the order of two tasks in the gather-facts.yml playbook, ensuring that we check the ansible_play_batch variable before any hosts have had a chance to fail. Change-Id: I4b1da63e8f0cc2774f1a9ab2e0414746108f7e12 Closes-Bug: #2041859 (cherry picked from commit 7f01f47bdad93383a4e2b884c4216d7db925d67f) --- ansible/gather-facts.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/gather-facts.yml b/ansible/gather-facts.yml index f5247a6ab9..a375197c99 100644 --- a/ansible/gather-facts.yml +++ b/ansible/gather-facts.yml @@ -8,17 +8,17 @@ serial: '{{ kolla_serial|default("0") }}' gather_facts: false tasks: + - name: Group hosts to determine when using --limit + group_by: + key: "all_using_limit_{{ (ansible_play_batch | length) != (groups['all'] | length) }}" + changed_when: false + - name: Gather facts setup: filter: "{{ kolla_ansible_setup_filter }}" gather_subset: "{{ kolla_ansible_setup_gather_subset }}" when: - not ansible_facts - - - name: Group hosts to determine when using --limit - group_by: - key: "all_using_limit_{{ (ansible_play_batch | length) != (groups['all'] | length) }}" - changed_when: false tags: always # NOTE(pbourke): This case covers deploying subsets of hosts using --limit. 
The From 3d6a36fc768a683da6968b122823cdb3147f1a30 Mon Sep 17 00:00:00 2001 From: Michal Nasiadka Date: Mon, 16 Jan 2023 10:19:10 +0000 Subject: [PATCH 3/7] CI: improve get_logs.sh Change-Id: I2546d714d1e6a1b648a4c6242ece18a2108b9183 (cherry picked from commit 68fd0cd2884e8c1f2c00037030f2b796fce018ad) --- tests/get_logs.sh | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/get_logs.sh b/tests/get_logs.sh index 29c6f48650..a883b0dc4f 100644 --- a/tests/get_logs.sh +++ b/tests/get_logs.sh @@ -14,10 +14,10 @@ copy_logs() { exit 1 fi - cp -rnL ${VOLUMES_DIR}/kolla_logs/_data/* ${LOG_DIR}/kolla/ - cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_configs/ + [ -d ${VOLUMES_DIR}/kolla_logs/_data ] && cp -rnL ${VOLUMES_DIR}/kolla_logs/_data/* ${LOG_DIR}/kolla/ + [ -d /etc/kolla ] && cp -rnL /etc/kolla/* ${LOG_DIR}/kolla_configs/ # Don't save the IPA images. - rm ${LOG_DIR}/kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} + rm -f ${LOG_DIR}/kolla_configs/config/ironic/ironic-agent.{kernel,initramfs} mkdir ${LOG_DIR}/system_configs/ cp -rL /etc/{hostname,hosts,host.conf,resolv.conf,nsswitch.conf,systemd} ${LOG_DIR}/system_configs/ # copy docker configs if used @@ -35,10 +35,6 @@ copy_logs() { if [ "$CONTAINER_ENGINE" = "docker" ]; then journalctl --no-pager -u containerd.service > ${LOG_DIR}/system_logs/containerd.log fi - else - if [ "$CONTAINER_ENGINE" = "docker" ]; then - cp /var/log/upstart/docker.log ${LOG_DIR}/system_logs/docker.log - fi fi cp -r /etc/sudoers.d ${LOG_DIR}/system_logs/ @@ -99,21 +95,22 @@ copy_logs() { ps -eo user,pid,ppid,lwp,%cpu,%mem,size,rss,cmd > ${LOG_DIR}/system_logs/ps.txt # container engine related information - (${CONTAINER_ENGINE} info && - ${CONTAINER_ENGINE} images && - ${CONTAINER_ENGINE} ps -a && - ${CONTAINER_ENGINE} network ls && - ${CONTAINER_ENGINE} inspect $(${CONTAINER_ENGINE} ps -aq)) > ${LOG_DIR}/system_logs/${CONTAINER_ENGINE}-info.txt + [ `command -v ${CONTAINER_ENGINE}` ] && + 
( ${CONTAINER_ENGINE} info && + ${CONTAINER_ENGINE} images && + ${CONTAINER_ENGINE} ps -a && + ${CONTAINER_ENGINE} network ls && + ${CONTAINER_ENGINE} inspect $(${CONTAINER_ENGINE} ps -aq)) > ${LOG_DIR}/system_logs/${CONTAINER_ENGINE}-info.txt # save dbus services - dbus-send --system --print-reply --dest=org.freedesktop.DBus /org/freedesktop/DBus org.freedesktop.DBus.ListNames > ${LOG_DIR}/system_logs/dbus-services.txt + [ `command -v dbus-send` ] && dbus-send --system --print-reply --dest=org.freedesktop.DBus /org/freedesktop/DBus org.freedesktop.DBus.ListNames > ${LOG_DIR}/system_logs/dbus-services.txt # cephadm related logs if [ `command -v cephadm` ]; then mkdir -p ${LOG_DIR}/ceph - sudo cp /etc/ceph/ceph.conf ${LOG_DIR}/ceph - sudo cp /var/run/ceph/*/cluster.yml ${LOG_DIR}/ceph/cluster.yml - sudo cp /var/log/ceph/cephadm.log* ${LOG_DIR}/ceph/ + [ -d /etc/ceph ] && sudo cp /etc/ceph/ceph.conf ${LOG_DIR}/ceph + [ -d /var/run/ceph ] && sudo cp /var/run/ceph/*/cluster.yml ${LOG_DIR}/ceph/cluster.yml + [ -d /var/log/ceph ] && sudo cp /var/log/ceph/cephadm.log* ${LOG_DIR}/ceph/ sudo cephadm shell -- ceph --connect-timeout 5 -s > ${LOG_DIR}/ceph/ceph_s.txt sudo cephadm shell -- ceph --connect-timeout 5 osd tree > ${LOG_DIR}/ceph/ceph_osd_tree.txt fi From c6dc57f564b4d098a3704135bd868a5be80762fe Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 29 Feb 2024 10:48:40 +0000 Subject: [PATCH 4/7] Revert "Allow setting any_errors_fatal true for gather-facts" This reverts commit 5b431f0f7f932293362a01703673ee05d0a5bd8d. Reason for revert: the any_errors_fatal play parameter is not templated by Ansible (tested up to ansible-core 2.15.9). This behaviour is demonstrated in [1]. This means that "{{ kolla_ansible_setup_any_errors_fatal }}" is always interpreted as 'true', regardless of the value of kolla_ansible_setup_any_errors_fatal. This is particularly bad because the default value of kolla_ansible_setup_any_errors_fatal is false. 
We now have gather_facts_max_fail_percentage which can be set to 0 to provide the same functionality. [1] https://github.com/markgoddard/ansible-experiments/tree/master/15-fatal-errors Change-Id: I2e0ea49701b5900eae26434bcdb6b1bb44507ee7 (cherry picked from commit 9cebdb5d9eb46b5ae28a52ef86e5bb50342c3648) --- ansible/gather-facts.yml | 2 -- ansible/group_vars/all.yml | 8 -------- etc/kolla/globals.yml | 8 -------- .../remove-any-errors-fatal-664fc2207074f0ef.yaml | 14 ++++++++++++++ tests/templates/globals-default.j2 | 4 ---- 5 files changed, 14 insertions(+), 22 deletions(-) create mode 100644 releasenotes/notes/remove-any-errors-fatal-664fc2207074f0ef.yaml diff --git a/ansible/gather-facts.yml b/ansible/gather-facts.yml index b50389660a..524f312d92 100644 --- a/ansible/gather-facts.yml +++ b/ansible/gather-facts.yml @@ -4,7 +4,6 @@ # building their configurations. - name: Gather facts for all hosts hosts: all - any_errors_fatal: "{{ kolla_ansible_setup_any_errors_fatal | bool }}" serial: '{{ kolla_serial|default("0") }}' gather_facts: false tasks: @@ -36,7 +35,6 @@ # the limit. - name: Gather facts for all hosts (if using --limit) hosts: all_using_limit_True - any_errors_fatal: "{{ kolla_ansible_setup_any_errors_fatal | bool }}" serial: '{{ kolla_serial|default("0") }}' gather_facts: false vars: diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index 025f074649..c6f59d0081 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -36,14 +36,6 @@ kolla_ansible_setup_filter: "{{ omit }}" # By default, we do not provide a gather subset. kolla_ansible_setup_gather_subset: "{{ omit }}" -# This variable is used as "any_errors_fatal" setting for the setup (gather -# facts) plays. -# This is useful for weeding out failing hosts early to avoid late failures -# due to missing facts (especially cross-host). 
-# Do note this still supports host fact caching and it will not affect -# scenarios with all facts cached (as there is no task to fail). -kolla_ansible_setup_any_errors_fatal: false - ################### # Kolla options ################### diff --git a/etc/kolla/globals.yml b/etc/kolla/globals.yml index aa192aee81..66c53db61c 100644 --- a/etc/kolla/globals.yml +++ b/etc/kolla/globals.yml @@ -24,14 +24,6 @@ # Dummy variable to allow Ansible to accept this file. workaround_ansible_issue_8743: yes -# This variable is used as "any_errors_fatal" setting for the setup (gather -# facts) plays. -# This is useful for weeding out failing hosts early to avoid late failures -# due to missing facts (especially cross-host). -# Do note this still supports host fact caching and it will not affect -# scenarios with all facts cached (as there is no task to fail). -#kolla_ansible_setup_any_errors_fatal: false - ############### # Kolla options ############### diff --git a/releasenotes/notes/remove-any-errors-fatal-664fc2207074f0ef.yaml b/releasenotes/notes/remove-any-errors-fatal-664fc2207074f0ef.yaml new file mode 100644 index 0000000000..7060d884ba --- /dev/null +++ b/releasenotes/notes/remove-any-errors-fatal-664fc2207074f0ef.yaml @@ -0,0 +1,14 @@ +--- +upgrade: + - | + Support for failing execution early if fact collection fails on any of the + hosts by setting ``kolla_ansible_setup_any_errors_fatal`` to ``true`` has + been removed. This is due to Ansible's ``any_errors_fatal`` parameter not + being templated, resulting in the value always being interpreted as + ``true``, even though the default value of + ``kolla_ansible_setup_any_errors_fatal`` is ``false``. + + Equivalent behaviour is possible by setting the maximum failure percentage + to 0. This may be done specifically for fact gathering using + ``gather_facts_max_fail_percentage`` or globally using + ``kolla_max_fail_percentage``. 
diff --git a/tests/templates/globals-default.j2 b/tests/templates/globals-default.j2 index d60b8edbce..5a0dd7d3e1 100644 --- a/tests/templates/globals-default.j2 +++ b/tests/templates/globals-default.j2 @@ -5,10 +5,6 @@ ansible_python_interpreter: /usr/bin/python3 {% endif %} -# NOTE(yoctozepto): In CI it makes sense to always try to fail -# as early as possible. -kolla_ansible_setup_any_errors_fatal: true - kolla_base_distro: "{{ base_distro }}" # Zed dropped install_type so we have it only on upgrades network_interface: "{{ api_interface_name }}" From 08534944931be88193bdff1c8410993a443e0ec1 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 18 Sep 2024 13:28:01 +0100 Subject: [PATCH 5/7] Fix typo in RabbitMQ versions pinning We don't use dots in the image name Change-Id: I29172448c14a1ca9a5fa23abe701366f875959e0 (cherry picked from commit 07312e61b2c370360db8f554b477718fe657a2a0) --- doc/source/reference/message-queues/rabbitmq.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reference/message-queues/rabbitmq.rst b/doc/source/reference/message-queues/rabbitmq.rst index dad6bdf419..fed14c811d 100644 --- a/doc/source/reference/message-queues/rabbitmq.rst +++ b/doc/source/reference/message-queues/rabbitmq.rst @@ -200,7 +200,7 @@ you must override the image. if you want to use version 3.12 change .. 
code-block:: yaml - rabbitmq_image: "{{ docker_registry ~ '/' if docker_registry else '' }}{{ docker_namespace }}/rabbitmq-3.12" + rabbitmq_image: "{{ docker_registry ~ '/' if docker_registry else '' }}{{ docker_namespace }}/rabbitmq-3-12" You can then upgrade RabbitMQ with the usual command: From 28f57f4196231f5e37120903c71142592c1c9120 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 11 Sep 2024 13:56:53 +0100 Subject: [PATCH 6/7] Docs: point to RMQ SLURP handing in upgrade guide Change-Id: I7934bda26a134fe38a63716f9878a27b4f3819ce (cherry picked from commit f8807b4af326d5cabf3fa266c726687d1da388ad) --- doc/source/user/operating-kolla.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/user/operating-kolla.rst b/doc/source/user/operating-kolla.rst index 729b80b3b7..022db9eb20 100644 --- a/doc/source/user/operating-kolla.rst +++ b/doc/source/user/operating-kolla.rst @@ -187,6 +187,16 @@ issues: At a convenient time, the upgrade can now be run. +SLURP extra preparations +++++++++++++++++++++++++ + +RabbitMQ has two major version releases per year but does not support jumping +two versions in one upgrade. So if you want to perform a skip-level upgrade, +you must first upgrade RabbitMQ to an intermediary version. Please see the +`RabbitMQ SLURP section +<https://docs.openstack.org/kolla-ansible/latest/reference/message-queues/rabbitmq.html#slurp>`__ +for details.
+ Perform the Upgrade ------------------- From ce108e82832b5ca6c98e7d162db3221641daf397 Mon Sep 17 00:00:00 2001 From: Grzegorz Koper Date: Wed, 18 Sep 2024 12:04:50 +0200 Subject: [PATCH 7/7] Fixing typo in etc/kolla/globals.yml Closes-bug: #2077511 Change-Id: Icd15e8d04771cf50bc704f0c40006a8ac0aeb3ef (cherry picked from commit 28ed5063c0a5ae14195695c0e023e35c57b38d4f) --- etc/kolla/globals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kolla/globals.yml b/etc/kolla/globals.yml index aa192aee81..c3deff412d 100644 --- a/etc/kolla/globals.yml +++ b/etc/kolla/globals.yml @@ -721,7 +721,7 @@ workaround_ansible_issue_8743: yes #enable_prometheus_mysqld_exporter: "{{ enable_mariadb | bool }}" #enable_prometheus_node_exporter: "{{ enable_prometheus | bool }}" #enable_prometheus_cadvisor: "{{ enable_prometheus | bool }}" -#enable_prometheus_fluentd_integration: "{{ enable_prometheus | bool and enable fluentd | bool }}" +#enable_prometheus_fluentd_integration: "{{ enable_prometheus | bool and enable_fluentd | bool }}" #enable_prometheus_memcached: "{{ enable_prometheus | bool }}" #enable_prometheus_alertmanager: "{{ enable_prometheus | bool }}" #enable_prometheus_alertmanager_external: "{{ enable_prometheus_alertmanager | bool }}"