diff --git a/.zuul.yaml b/.zuul.yaml index 785522e964e..b5224367b23 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -56,6 +56,17 @@ bindep_profile: test py38 timeout: 3600 +- job: + name: nova-tox-validate-backport + parent: openstack-tox + description: | + Determine whether a backport is ready to be merged by checking whether it + has already been merged to master or more recent stable branches. + + Uses tox with the ``validate-backport`` environment. + vars: + tox_envlist: validate-backport + - job: name: nova-live-migration parent: tempest-multinode-full-py3 @@ -180,6 +191,12 @@ # reduce the number of placement calls in steady state. Added in # Stein. resource_provider_association_refresh: 0 + workarounds: + # This wa is an improvement on hard reboot that cannot be turned + # on unconditionally. But we know that ml2/ovs sends plug time + # events so we can enable this in this ovs job for vnic_type + # normal + wait_for_vif_plugged_event_during_hard_reboot: normal $NOVA_CONF: quota: # Added in Train. @@ -253,22 +270,24 @@ - job: name: nova-grenade-multinode - parent: nova-dsvm-multinode-base + parent: grenade-multinode description: | - Multi-node grenade job which runs gate/live_migration/hooks tests under - python 3. - In other words, this tests live and cold migration and resize with - mixed-version compute services which is important for things like - rolling upgrade support. + Run a multinode grenade job and run the smoke, cold and live migration + tests with the controller upgraded and the compute on the older release. The former names for this job were "nova-grenade-live-migration" and "legacy-grenade-dsvm-neutron-multinode-live-migration". - run: playbooks/legacy/nova-grenade-multinode/run.yaml - post-run: playbooks/legacy/nova-grenade-multinode/post.yaml - required-projects: - - openstack/grenade - - openstack/devstack-gate - - openstack/nova irrelevant-files: *dsvm-irrelevant-files + vars: + devstack_local_conf: + test-config: + $TEMPEST_CONFIG: + compute-feature-enabled: + live_migration: true + volume_backed_live_migration: true + block_migration_for_live_migration: true + block_migrate_cinder_iscsi: true + tox_envlist: all + tempest_test_regex: ((tempest\.(api\.compute|scenario)\..*smoke.*)|(^tempest\.api\.compute\.admin\.(test_live_migration|test_migration))) - job: name: nova-multi-cell @@ -398,7 +417,6 @@ - check-requirements - integrated-gate-compute - openstack-cover-jobs - - openstack-lower-constraints-jobs - openstack-python3-victoria-jobs - periodic-stable-jobs - publish-openstack-docs-pti @@ -418,11 +436,12 @@ # so that we only run it on changes to networking and libvirt/vif # code; we don't need to run this on all changes. 
- ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - - nova-grenade-multinode - nova-live-migration - nova-lvm - nova-multi-cell - nova-next + - nova-tox-validate-backport: + voting: false - nova-tox-functional-py38 - tempest-integrated-compute: # NOTE(gmann): Policies changes do not need to run all the @@ -443,7 +462,7 @@ - ^setup.cfg$ - ^tools/.*$ - ^tox.ini$ - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files @@ -457,11 +476,11 @@ voting: false gate: jobs: - - nova-grenade-multinode - nova-live-migration - nova-tox-functional-py38 - nova-multi-cell - nova-next + - nova-tox-validate-backport - nova-ceph-multistore: irrelevant-files: *dsvm-irrelevant-files - neutron-tempest-linuxbridge: @@ -472,7 +491,7 @@ - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - tempest-integrated-compute: irrelevant-files: *policies-irrelevant-files - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files diff --git a/doc/source/admin/evacuate.rst b/doc/source/admin/evacuate.rst index ef9eccd9312..18796d9c237 100644 --- a/doc/source/admin/evacuate.rst +++ b/doc/source/admin/evacuate.rst @@ -97,3 +97,17 @@ instances up and running. using a pattern you might want to use the ``--strict`` flag which got introduced in version 10.2.0 to make sure nova matches the ``FAILED_HOST`` exactly. + +.. note:: + .. code-block:: bash + + +------+--------+--------------+ + | Name | Status | Task State | + +------+--------+--------------+ + | vm_1 | ACTIVE | powering-off | + +------------------------------+ + + If the instance task state is not None, evacuation will be possible. However, + depending on the ongoing operation, there may be clean up required in other + services which the instance was using, such as neutron, cinder, glance, or + the storage backend. \ No newline at end of file diff --git a/doc/source/admin/networking.rst b/doc/source/admin/networking.rst index 407a43aafea..626bb895922 100644 --- a/doc/source/admin/networking.rst +++ b/doc/source/admin/networking.rst @@ -24,6 +24,18 @@ A full guide on configuring and using SR-IOV is provided in the :neutron-doc:`OpenStack Networking service documentation ` +.. note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. NUMA Affinity ------------- diff --git a/doc/source/admin/pci-passthrough.rst b/doc/source/admin/pci-passthrough.rst index 227538edb0c..5a4c001e6d6 100644 --- a/doc/source/admin/pci-passthrough.rst +++ b/doc/source/admin/pci-passthrough.rst @@ -37,6 +37,18 @@ devices with potentially different capabilities. supported until the 14.0.0 Newton release, see `bug 1512800 `_ for details. +.. note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. 
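As a quick illustration of the address limits listed above, the following standalone sketch checks a ``DDDD:BB:SS.F`` style address against those maxima. It is not Nova's actual parsing code (the real handling lives in the PCI whitelist/device-spec machinery); the helper name and layout are assumptions for illustration only.

.. code-block:: python

   # Bounds implied by the documentation above; devices outside them are
   # ignored by nova when reported by the hypervisor.
   PCI_FIELD_MAX = {'domain': 0xFFFF, 'bus': 0xFF, 'slot': 0x1F, 'function': 0x7}

   def address_within_supported_range(address):
       """Return True if a DDDD:BB:SS.F PCI address fits the supported ranges."""
       domain_bus_slot, _, function = address.partition('.')
       domain, bus, slot = domain_bus_slot.split(':')
       values = {
           'domain': int(domain, 16),
           'bus': int(bus, 16),
           'slot': int(slot, 16),
           'function': int(function, 16),
       }
       return all(values[field] <= PCI_FIELD_MAX[field] for field in values)

   # A device reported with a 32-bit domain falls outside the supported range.
   assert address_within_supported_range('0000:1a:02.3')
   assert not address_within_supported_range('10000:00:1f.1')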
Configure host (Compute) ------------------------ diff --git a/gate/live_migration/hooks/ceph.sh b/gate/live_migration/hooks/ceph.sh deleted file mode 100755 index 3d596ff0b31..00000000000 --- a/gate/live_migration/hooks/ceph.sh +++ /dev/null @@ -1,208 +0,0 @@ -#!/bin/bash - -function prepare_ceph { - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/settings - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph - configure_ceph - #install ceph-common package and additional python3 ceph libraries on compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash - USE_PYTHON3=${USE_PYTHON3:-True} - source $BASE/new/devstack/functions - source $BASE/new/devstack/functions-common - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph_remote - " - - #copy ceph admin keyring to compute nodes - sudo cp /etc/ceph/ceph.client.admin.keyring /tmp/ceph.client.admin.keyring - sudo chown ${STACK_USER}:${STACK_USER} /tmp/ceph.client.admin.keyring - sudo chmod 644 /tmp/ceph.client.admin.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.admin.keyring dest=/etc/ceph/ceph.client.admin.keyring owner=ceph group=ceph" - sudo rm -f /tmp/ceph.client.admin.keyring - #copy ceph.conf to compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/etc/ceph/ceph.conf dest=/etc/ceph/ceph.conf owner=root group=root" - - start_ceph -} - -function _ceph_configure_glance { - GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${GLANCE_CEPH_POOL} ${GLANCE_CEPH_POOL_PG} ${GLANCE_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${GLANCE_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=DEFAULT option=show_image_direct_url value=True" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=default_store value=rbd" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=stores value='file, http, rbd'" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_user value=$GLANCE_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_pool value=$GLANCE_CEPH_POOL" - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - #copy glance keyring to compute only node - sudo cp /etc/ceph/ceph.client.glance.keyring /tmp/ceph.client.glance.keyring - 
sudo chown $STACK_USER:$STACK_USER /tmp/ceph.client.glance.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.glance.keyring dest=/etc/ceph/ceph.client.glance.keyring" - sudo rm -f /tmp/ceph.client.glance.keyring -} - -function configure_and_start_glance { - _ceph_configure_glance - echo 'check processes before glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" - - # restart glance - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@g-api" - - echo 'check processes after glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" -} - -function _ceph_configure_nova { - #setup ceph for nova, we don't reuse configure_ceph_nova - as we need to emulate case where cinder is not configured for ceph - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${NOVA_CEPH_POOL} ${NOVA_CEPH_POOL_PG} ${NOVA_CEPH_POOL_PGP} - NOVA_CONF=${NOVA_CPU_CONF:-/etc/nova/nova.conf} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_user value=${CINDER_CEPH_USER}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_secret_uuid value=${CINDER_CEPH_UUID}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_key value=false" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_partition value=-2" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=disk_cachemodes value='network=writeback'" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_type value=rbd" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_pool value=${NOVA_CEPH_POOL}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_ceph_conf value=${CEPH_CONF_FILE}" - - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${CINDER_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${CINDER_CEPH_POOL}, allow rwx pool=${NOVA_CEPH_POOL},allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring > /dev/null - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring - - #copy cinder keyring to compute only node - sudo cp /etc/ceph/ceph.client.cinder.keyring /tmp/ceph.client.cinder.keyring - sudo chown stack:stack /tmp/ceph.client.cinder.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.cinder.keyring dest=/etc/ceph/ceph.client.cinder.keyring" - sudo rm -f /tmp/ceph.client.cinder.keyring - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} crush_ruleset ${RULE_ID} - fi -} - -function _wait_for_nova_compute_service_state { - source $BASE/new/devstack/openrc admin admin - local status=$1 - local attempt=1 - local max_attempts=24 - local attempt_sleep=5 - local computes_count=$(openstack compute service list | grep -c 
nova-compute) - local computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - - echo "Waiting for $computes_count computes to report as $status" - while [ "$computes_ready" -ne "$computes_count" ]; do - if [ "$attempt" -eq "$max_attempts" ]; then - echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts" - exit 4 - fi - echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts" - sleep $attempt_sleep - attempt=$((attempt+1)) - computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - done - echo "All computes are now reporting as ${status} after ${attempt} attempts" -} - -function configure_and_start_nova { - - echo "Checking all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # stop nova-compute - echo "Stopping all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu" - - # Wait for the service to be marked as down - _wait_for_nova_compute_service_state "down" - - _ceph_configure_nova - - #import secret to libvirt - _populate_libvirt_secret - - # start nova-compute - echo "Starting all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu" - - echo "Checking all n-cpu services" - # test that they are all running again - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # Wait for the service to be marked as up - _wait_for_nova_compute_service_state "up" -} - -function _ceph_configure_cinder { - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${CINDER_CEPH_POOL} ${CINDER_CEPH_POOL_PG} ${CINDER_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - CINDER_CONF=${CINDER_CONF:-/etc/cinder/cinder.conf} - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_backend_name value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_driver value=cinder.volume.drivers.rbd.RBDDriver" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_pool value=$CINDER_CEPH_POOL" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_user value=$CINDER_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_uuid value=$CINDER_CEPH_UUID" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_flatten_volume_from_snapshot value=False" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_max_clone_depth value=5" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT 
option=default_volume_type value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=enabled_backends value=ceph" - -} - -function configure_and_start_cinder { - _ceph_configure_cinder - - # restart cinder - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@c-vol" - - source $BASE/new/devstack/openrc - - export OS_USERNAME=admin - export OS_PROJECT_NAME=admin - lvm_type=$(cinder type-list | awk -F "|" 'NR==4{ print $2}') - cinder type-delete $lvm_type - openstack volume type create --os-volume-api-version 1 --property volume_backend_name="ceph" ceph -} - -function _populate_libvirt_secret { - cat > /tmp/secret.xml < - ${CINDER_CEPH_UUID} - - client.${CINDER_CEPH_USER} secret - - -EOF - - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/secret.xml dest=/tmp/secret.xml" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-define --file /tmp/secret.xml" - local secret=$(sudo ceph -c ${CEPH_CONF_FILE} auth get-key client.${CINDER_CEPH_USER}) - # TODO(tdurakov): remove this escaping as https://github.com/ansible/ansible/issues/13862 fixed - secret=${secret//=/'\='} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-set-value --secret ${CINDER_CEPH_UUID} --base64 $secret" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/tmp/secret.xml state=absent" - -} diff --git a/gate/live_migration/hooks/nfs.sh b/gate/live_migration/hooks/nfs.sh deleted file mode 100755 index acadb36d6ca..00000000000 --- a/gate/live_migration/hooks/nfs.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -function nfs_setup { - if uses_debs; then - module=apt - elif is_fedora; then - module=yum - fi - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-common state=present" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-kernel-server state=present" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-User value=nova" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-Group value=nova" - - for SUBNODE in $SUBNODES ; do - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m lineinfile -a "dest=/etc/exports line='/opt/stack/data/nova/instances $SUBNODE(rw,fsid=0,insecure,no_subtree_check,async,no_root_squash)'" - done - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "exportfs -a" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=restarted" - GetDistro - if [[ ! 
${DISTRO} =~ (xenial) ]]; then - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=idmapd state=restarted" - fi - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 2049 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 2049 -j ACCEPT" - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "mount -t nfs4 -o proto\=tcp,port\=2049 $primary_node:/ /opt/stack/data/nova/instances/" -} - -function nfs_configure_tempest { - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" -} - -function nfs_verify_setup { - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/opt/stack/data/nova/instances/test_file state=touch" - if [ ! -e '/opt/stack/data/nova/instances/test_file' ]; then - die $LINENO "NFS configuration failure" - fi -} - -function nfs_teardown { - #teardown nfs shared storage - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "umount -t nfs4 /opt/stack/data/nova/instances/" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=stopped" -} \ No newline at end of file diff --git a/gate/live_migration/hooks/run_tests.sh b/gate/live_migration/hooks/run_tests.sh deleted file mode 100755 index 8334df633dd..00000000000 --- a/gate/live_migration/hooks/run_tests.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -# Live migration dedicated ci job will be responsible for testing different -# environments based on underlying storage, used for ephemerals. -# This hook allows to inject logic of environment reconfiguration in ci job. -# Base scenario for this would be: -# -# 1. test with all local storage (use default for volumes) -# 2. test with NFS for root + ephemeral disks -# 3. test with Ceph for root + ephemeral disks -# 4. test with Ceph for volumes and root + ephemeral disk - -set -xe -cd $BASE/new/tempest - -source $BASE/new/devstack/functions -source $BASE/new/devstack/functions-common -source $BASE/new/devstack/lib/nova -source $WORKSPACE/devstack-gate/functions.sh -source $BASE/new/nova/gate/live_migration/hooks/utils.sh -source $BASE/new/nova/gate/live_migration/hooks/nfs.sh -source $BASE/new/nova/gate/live_migration/hooks/ceph.sh -primary_node=$(cat /etc/nodepool/primary_node_private) -SUBNODES=$(cat /etc/nodepool/sub_nodes_private) -SERVICE_HOST=$primary_node -STACK_USER=${STACK_USER:-stack} - -echo '1. 
test with all local storage (use default for volumes)' -echo 'NOTE: test_volume_backed_live_migration is skipped due to https://bugs.launchpad.net/nova/+bug/1524898' -echo 'NOTE: test_live_block_migration_paused is skipped due to https://bugs.launchpad.net/nova/+bug/1901739' -run_tempest "block migration test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration|test_live_block_migration_paused))" - -# TODO(mriedem): Run $BASE/new/nova/gate/test_evacuate.sh for local storage - -#all tests bellow this line use shared storage, need to update tempest.conf -echo 'disabling block_migration in tempest' -$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" - -echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04' -#echo '2. test with NFS for root + ephemeral disks' - -#nfs_setup -#nfs_configure_tempest -#nfs_verify_setup -#run_tempest "NFS shared storage test" "live_migration" -#nfs_teardown - -# The nova-grenade-multinode job also runs resize and cold migration tests -# so we check for a grenade-only variable. -if [[ -n "$GRENADE_NEW_BRANCH" ]]; then - echo '3. test cold migration and resize' - run_tempest "cold migration and resize test" "test_resize_server|test_cold_migration|test_revert_cold_migration" -else - echo '3. cold migration and resize is skipped for non-grenade jobs' -fi - -echo '4. test with Ceph for root + ephemeral disks' -# Discover and set variables for the OS version so the devstack-plugin-ceph -# scripts can find the correct repository to install the ceph packages. -GetOSVersion -USE_PYTHON3=${USE_PYTHON3:-True} -prepare_ceph -GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} -configure_and_start_glance - -configure_and_start_nova -run_tempest "Ceph nova&glance test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration))" - -set +e -#echo '5. test with Ceph for volumes and root + ephemeral disk' - -#configure_and_start_cinder -#run_tempest "Ceph nova&glance&cinder test" "live_migration" diff --git a/gate/live_migration/hooks/utils.sh b/gate/live_migration/hooks/utils.sh deleted file mode 100755 index 9f98ca2e254..00000000000 --- a/gate/live_migration/hooks/utils.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -function run_tempest { - local message=$1 - local tempest_regex=$2 - sudo -H -u tempest tox -eall -- $tempest_regex --concurrency=$TEMPEST_CONCURRENCY - exitcode=$? 
- if [[ $exitcode -ne 0 ]]; then - die $LINENO "$message failure" - fi -} diff --git a/lower-constraints.txt b/lower-constraints.txt deleted file mode 100644 index f41ff689292..00000000000 --- a/lower-constraints.txt +++ /dev/null @@ -1,165 +0,0 @@ -alembic==0.9.8 -amqp==2.5.0 -appdirs==1.4.3 -asn1crypto==0.24.0 -attrs==17.4.0 -automaton==1.14.0 -bandit==1.1.0 -cachetools==2.0.1 -castellan==0.16.0 -cffi==1.14.0 -cliff==2.11.0 -cmd2==0.8.1 -colorama==0.3.9 -coverage==4.0 -cryptography==2.7 -cursive==0.2.1 -dataclasses==0.7 -ddt==1.2.1 -debtcollector==1.19.0 -decorator==4.1.0 -deprecation==2.0 -dogpile.cache==0.6.5 -enum-compat==0.0.2 -eventlet==0.22.0 -extras==1.0.0 -fasteners==0.14.1 -fixtures==3.0.0 -future==0.16.0 -futurist==1.8.0 -gabbi==1.35.0 -gitdb2==2.0.3 -GitPython==2.1.8 -greenlet==0.4.15 -idna==2.6 -iso8601==0.1.11 -Jinja2==2.10 -jmespath==0.9.3 -jsonpatch==1.21 -jsonpath-rw==1.4.0 -jsonpath-rw-ext==1.1.3 -jsonpointer==2.0 -jsonschema==3.2.0 -keystoneauth1==3.16.0 -keystonemiddleware==4.20.0 -kombu==4.6.1 -linecache2==1.0.0 -lxml==4.5.0 -Mako==1.0.7 -MarkupSafe==1.1.1 -microversion-parse==0.2.1 -mock==3.0.0 -msgpack==0.5.6 -msgpack-python==0.5.6 -munch==2.2.0 -mypy==0.761 -netaddr==0.7.18 -netifaces==0.10.4 -networkx==2.1.0 -numpy==1.19.0 -openstacksdk==0.35.0 -os-brick==3.1.0 -os-client-config==1.29.0 -os-resource-classes==0.4.0 -os-service-types==1.7.0 -os-traits==2.4.0 -os-vif==1.14.0 -os-win==4.2.0 -os-xenapi==0.3.4 -osc-lib==1.10.0 -oslo.cache==1.26.0 -oslo.concurrency==3.29.0 -oslo.config==6.8.0 -oslo.context==2.22.0 -oslo.db==4.44.0 -oslo.i18n==3.15.3 -oslo.log==3.36.0 -oslo.messaging==10.3.0 -oslo.middleware==3.31.0 -oslo.policy==3.4.0 -oslo.privsep==1.33.2 -oslo.reports==1.18.0 -oslo.rootwrap==5.8.0 -oslo.serialization==2.21.1 -oslo.service==1.40.1 -oslo.upgradecheck==0.1.1 -oslo.utils==4.5.0 -oslo.versionedobjects==1.35.0 -oslo.vmware==2.17.0 -oslotest==3.8.0 -osprofiler==1.4.0 -ovs==2.10.0 -ovsdbapp==0.15.0 -packaging==20.4 -paramiko==2.7.1 -Paste==2.0.2 -PasteDeploy==1.5.0 -pbr==2.0.0 -pluggy==0.6.0 -ply==3.11 -prettytable==0.7.1 -psutil==3.2.2 -psycopg2==2.8 -py==1.5.2 -pyasn1==0.4.2 -pyasn1-modules==0.2.1 -pycadf==2.7.0 -pycparser==2.18 -pyinotify==0.9.6 -pyroute2==0.5.4 -PyJWT==1.7.0 -PyMySQL==0.8.0 -pyOpenSSL==17.5.0 -pyparsing==2.2.0 -pyperclip==1.6.0 -pypowervm==1.1.15 -pytest==3.4.2 -python-barbicanclient==4.5.2 -python-cinderclient==3.3.0 -python-dateutil==2.5.3 -python-editor==1.0.3 -python-glanceclient==2.8.0 -python-ironicclient==3.0.0 -python-keystoneclient==3.15.0 -python-mimeparse==1.6.0 -python-neutronclient==6.7.0 -python-subunit==1.4.0 -pytz==2018.3 -PyYAML==3.13 -repoze.lru==0.7 -requests==2.23.0 -requests-mock==1.2.0 -requestsexceptions==1.4.0 -retrying==1.3.3 -rfc3986==1.2.0 -Routes==2.3.1 -simplejson==3.13.2 -six==1.11.0 -smmap2==2.0.3 -sortedcontainers==2.1.0 -SQLAlchemy==1.2.19 -sqlalchemy-migrate==0.13.0 -sqlparse==0.2.4 -statsd==3.2.2 -stestr==2.0.0 -stevedore==1.20.0 -suds-jurko==0.6 -taskflow==3.8.0 -Tempita==0.5.2 -tenacity==6.0.0 -testrepository==0.0.20 -testresources==2.0.0 -testscenarios==0.4 -testtools==2.2.0 -tooz==1.58.0 -traceback2==1.4.0 -unittest2==1.1.0 -urllib3==1.22 -vine==1.1.4 -voluptuous==0.11.1 -warlock==1.3.1 -WebOb==1.8.2 -websockify==0.9.0 -wrapt==1.10.11 -wsgi-intercept==1.7.0 -zVMCloudConnector==1.3.0 diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index b462b1a9d94..be35f62ee3f 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py 
@@ -267,10 +267,25 @@ def delete(self, req, id): # service delete since we could orphan resource providers and # break the ability to do things like confirm/revert instances # in VERIFY_RESIZE status. - compute_nodes = objects.ComputeNodeList.get_all_by_host( - context, service.host) - self._assert_no_in_progress_migrations( - context, id, compute_nodes) + compute_nodes = [] + try: + compute_nodes = objects.ComputeNodeList.get_all_by_host( + context, service.host) + self._assert_no_in_progress_migrations( + context, id, compute_nodes) + except exception.ComputeHostNotFound: + # NOTE(artom) Consider the following situation: + # - Using the Ironic virt driver + # - Replacing (so removing and re-adding) all baremetal + # nodes associated with a single nova-compute service + # The update resources periodic will have destroyed the + # compute node records because they're no longer being + # reported by the virt driver. If we then attempt to + # manually delete the compute service record, + # get_all_by_host() above will raise, as there are no longer + # any compute node records for the host. Catch it here and + # continue to allow compute service deletion. + pass aggrs = self.aggregate_api.get_aggregates_by_host(context, service.host) diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index 8e8fbd6eacf..9039b888cb0 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -29,6 +29,8 @@ CONFIG_FILES = ['api-paste.ini', 'nova.conf'] +LOG = logging.getLogger(__name__) + objects.register_all() @@ -77,8 +79,12 @@ def application(environ, start_response): return application -def init_application(name): - conf_files = _get_config_files() +@utils.run_once('Global data already initialized, not re-initializing.', + LOG.info) +def init_global_data(conf_files): + # NOTE(melwitt): parse_args initializes logging and calls global rpc.init() + # and db_api.configure(). The db_api.configure() call does not initiate any + # connection to the database. config.parse_args([], default_config_files=conf_files) logging.setup(CONF, "nova") @@ -93,11 +99,25 @@ def init_application(name): logging.getLogger(__name__), logging.DEBUG) + +def init_application(name): + conf_files = _get_config_files() + + # NOTE(melwitt): The init_application method can be called multiple times + # within a single python interpreter instance if any exception is raised + # during it (example: DBConnectionError while setting up the service) and + # apache/mod_wsgi reloads the init_application script. So, we initialize + # global data separately and decorate the method to run only once in a + # python interpreter instance. + init_global_data(conf_files) + try: _setup_service(CONF.host, name) except exception.ServiceTooOld as exc: return error_application(exc, name) + # This global init is safe because if we got here, we already successfully + # set up the service and setting up the profile cannot fail.
service.setup_profiler(name, CONF.host) conf = conf_files[0] diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 92c48c71e68..e704336d51b 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -2668,7 +2668,7 @@ def _get_resource_providers(self, context, placement, **kwargs): """ url = '/resource_providers' if 'uuid' in kwargs: - url += '&uuid=%s' % kwargs['uuid'] + url += '?uuid=%s' % kwargs['uuid'] resp = placement.get(url, global_request_id=context.global_id, version='1.14') diff --git a/nova/compute/api.py b/nova/compute/api.py index 97868490c34..8b5ba453ce6 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -5185,7 +5185,7 @@ def live_migrate_abort(self, context, instance, migration_id, @reject_vtpm_instances(instance_actions.EVACUATE) @block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD) @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED, - vm_states.ERROR]) + vm_states.ERROR], task_state=None) def evacuate(self, context, instance, host, on_shared_storage, admin_password=None, force=None): """Running evacuate to target host. @@ -5212,7 +5212,7 @@ def evacuate(self, context, instance, host, on_shared_storage, context, instance.uuid) instance.task_state = task_states.REBUILDING - instance.save(expected_task_state=[None]) + instance.save(expected_task_state=None) self._record_action_start(context, instance, instance_actions.EVACUATE) # NOTE(danms): Create this as a tombstone for the source compute diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 65245af1c34..adc206f3b7a 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1646,7 +1646,11 @@ def _decode(f): return [_decode(f) for f in injected_files] def _validate_instance_group_policy(self, context, instance, - scheduler_hints): + scheduler_hints=None): + + if CONF.workarounds.disable_group_policy_check_upcall: + return + # NOTE(russellb) Instance group policy is enforced by the scheduler. # However, there is a race condition with the enforcement of # the policy. Since more than one instance may be scheduled at the @@ -1655,29 +1659,63 @@ def _validate_instance_group_policy(self, context, instance, # multiple instances with an affinity policy could end up on different # hosts. This is a validation step to make sure that starting the # instance here doesn't violate the policy. - group_hint = scheduler_hints.get('group') - if not group_hint: - return - - # The RequestSpec stores scheduler_hints as key=list pairs so we need - # to check the type on the value and pull the single entry out. The - # API request schema validates that the 'group' hint is a single value. - if isinstance(group_hint, list): - group_hint = group_hint[0] + if scheduler_hints is not None: + # only go through here if scheduler_hints is provided, even if it + # is empty. + group_hint = scheduler_hints.get('group') + if not group_hint: + return + else: + # The RequestSpec stores scheduler_hints as key=list pairs so + # we need to check the type on the value and pull the single + # entry out. The API request schema validates that + # the 'group' hint is a single value. 
+ if isinstance(group_hint, list): + group_hint = group_hint[0] + + group = objects.InstanceGroup.get_by_hint(context, group_hint) + else: + # TODO(ganso): a call to DB can be saved by adding request_spec + # to rpcapi payload of live_migration, pre_live_migration and + # check_can_live_migrate_destination + try: + group = objects.InstanceGroup.get_by_instance_uuid( + context, instance.uuid) + except exception.InstanceGroupNotFound: + return - @utils.synchronized(group_hint) - def _do_validation(context, instance, group_hint): - group = objects.InstanceGroup.get_by_hint(context, group_hint) + @utils.synchronized(group['uuid']) + def _do_validation(context, instance, group): if group.policy and 'anti-affinity' == group.policy: + + # instances on host instances_uuids = objects.InstanceList.get_uuids_by_host( context, self.host) ins_on_host = set(instances_uuids) + + # instance param is just for logging, the nodename obtained is + # not actually related to the instance at all + nodename = self._get_nodename(instance) + + # instances being migrated to host + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, self.host, nodename)) + migration_vm_uuids = set([mig['instance_uuid'] + for mig in migrations]) + + total_instances = migration_vm_uuids | ins_on_host + + # refresh group to get updated members within locked block + group = objects.InstanceGroup.get_by_uuid(context, + group['uuid']) members = set(group.members) # Determine the set of instance group members on this host # which are not the instance in question. This is used to # determine how many other members from the same anti-affinity # group can be on this host. - members_on_host = ins_on_host & members - set([instance.uuid]) + members_on_host = (total_instances & members - + set([instance.uuid])) rules = group.rules if rules and 'max_server_per_host' in rules: max_server = rules['max_server_per_host'] @@ -1689,6 +1727,12 @@ def _do_validation(context, instance, group_hint): raise exception.RescheduledException( instance_uuid=instance.uuid, reason=msg) + + # NOTE(ganso): The check for affinity below does not work and it + # can easily be violated because the lock happens in different + # compute hosts. + # The only fix seems to be a DB lock to perform the check whenever + # setting the host field to an instance. elif group.policy and 'affinity' == group.policy: group_hosts = group.get_hosts(exclude=[instance.uuid]) if group_hosts and self.host not in group_hosts: @@ -1697,8 +1741,7 @@ def _do_validation(context, instance, group_hint): instance_uuid=instance.uuid, reason=msg) - if not CONF.workarounds.disable_group_policy_check_upcall: - _do_validation(context, instance, group_hint) + _do_validation(context, instance, group) def _log_original_error(self, exc_info, instance_uuid): LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid, @@ -5217,10 +5260,24 @@ def prep_resize(self, context, image, instance, instance_type, with self._error_out_instance_on_exception( context, instance, instance_state=instance_state),\ errors_out_migration_ctxt(migration): + self._send_prep_resize_notifications( context, instance, fields.NotificationPhase.START, instance_type) try: + scheduler_hints = self._get_scheduler_hints(filter_properties, + request_spec) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already + # in-progress, so this is the definitive moment to abort due to + # the policy violation. 
Also, exploding here is covered by the + # cleanup methods in except block. + try: + self._validate_instance_group_policy(context, instance, + scheduler_hints) + except exception.RescheduledException as e: + raise exception.InstanceFaultRollback(inner_exception=e) + self._prep_resize(context, image, instance, instance_type, filter_properties, node, migration, request_spec, @@ -5597,6 +5654,14 @@ def _resize_instance(self, context, instance, image, instance.host = migration.dest_compute instance.node = migration.dest_node + # NOTE(gibi): as the instance now tracked on the destination we + # have to make sure that the source compute resource track can + # track this instance as a migration. For that the resource tracker + # needs to see the old_flavor set on the instance. The old_flavor + # setting used to be done on the destination host in finish_resize + # but that is racy with a source host update_available_resource + # periodic run + instance.old_flavor = instance.flavor instance.task_state = task_states.RESIZE_MIGRATED instance.save(expected_task_state=task_states.RESIZE_MIGRATING) @@ -5710,6 +5775,10 @@ def _finish_resize(self, context, instance, migration, disk_info, # to ACTIVE for backwards compatibility old_vm_state = instance.system_metadata.get('old_vm_state', vm_states.ACTIVE) + # NOTE(gibi): this is already set by the resize_instance on the source + # node before calling finish_resize on destination but during upgrade + # it can be that the source node is not having the fix for bug 1944759 + # yet. This assignment can be removed in Z release. instance.old_flavor = old_flavor if old_instance_type_id != new_instance_type_id: @@ -7823,6 +7892,20 @@ def check_can_live_migrate_destination(self, ctxt, instance, :param limits: objects.SchedulerLimits object for this live migration. :returns: a LiveMigrateData object (hypervisor-dependent) """ + + # Error out if this host cannot accept the new instance due + # to anti-affinity. This check at this moment is not very accurate, as + # multiple requests may be happening concurrently and miss the lock, + # but when it works it provides a better user experience by failing + # earlier. Also, it should be safe to explode here, error becomes + # NoValidHost and instance status remains ACTIVE. + try: + self._validate_instance_group_policy(ctxt, instance) + except exception.RescheduledException as e: + msg = ("Failed to validate instance group policy " + "due to: {}".format(e)) + raise exception.MigrationPreCheckError(reason=msg) + src_compute_info = obj_base.obj_to_primitive( self._get_compute_info(ctxt, instance.host)) dst_compute_info = obj_base.obj_to_primitive( @@ -7965,6 +8048,13 @@ def pre_live_migration(self, context, instance, block_migration, disk, """ LOG.debug('pre_live_migration data is %s', migrate_data) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already in-progress, + # so this is the definitive moment to abort due to the policy + # violation. Also, it should be safe to explode here. The instance + # status remains ACTIVE, migration status failed. 
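The anti-affinity validation referenced in the comments above amounts to counting group members that either already run on the host or are incoming via an in-progress migration. The following self-contained sketch shows that accounting with plain sets; the function name and arguments are illustrative, not the actual _do_validation signature.

def violates_anti_affinity(group_members, instances_on_host,
                           incoming_migration_uuids, instance_uuid,
                           max_server_per_host=1):
    # Members occupying the host, including instances still migrating in,
    # excluding the instance currently being validated.
    hosted = set(instances_on_host) | set(incoming_migration_uuids)
    members_on_host = (hosted & set(group_members)) - {instance_uuid}
    return len(members_on_host) >= max_server_per_host

# One member already on the host plus one migrating in is enough to reject a
# third member under the default one-server-per-host anti-affinity rule.
assert violates_anti_affinity({'a', 'b', 'c'}, {'a'}, {'b'}, 'c')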
+ self._validate_instance_group_policy(context, instance) + migrate_data.old_vol_attachment_ids = {} bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) @@ -9898,6 +9988,8 @@ def update_available_resource(self, context, startup=False): use_slave=True, startup=startup) + self.rt.clean_compute_node_cache(compute_nodes_in_db) + # Delete orphan compute node not reported by driver but still in db for cn in compute_nodes_in_db: if cn.hypervisor_hostname not in nodenames: @@ -9906,17 +9998,30 @@ "nodes are %(nodes)s", {'id': cn.id, 'hh': cn.hypervisor_hostname, 'nodes': nodenames}) - cn.destroy() - self.rt.remove_node(cn.hypervisor_hostname) - # Delete the corresponding resource provider in placement, - # along with any associated allocations. try: - self.reportclient.delete_resource_provider(context, cn, - cascade=True) - except keystone_exception.ClientException as e: - LOG.error( - "Failed to delete compute node resource provider " - "for compute node %s: %s", cn.uuid, six.text_type(e)) + cn.destroy() + except exception.ComputeHostNotFound: + # NOTE(mgoddard): it's possible that another compute + # service took ownership of this compute node since we + # queried it due to a rebalance, and this will cause the + # deletion to fail. Ignore the error in that case. + LOG.info("Ignoring failure to delete orphan compute node " + "%(id)s on hypervisor host %(hh)s due to " + "possible node rebalance", + {'id': cn.id, 'hh': cn.hypervisor_hostname}) + self.rt.remove_node(cn.hypervisor_hostname) + self.reportclient.invalidate_resource_provider(cn.uuid) + else: + self.rt.remove_node(cn.hypervisor_hostname) + # Delete the corresponding resource provider in placement, + # along with any associated allocations. + try: + self.reportclient.delete_resource_provider( + context, cn, cascade=True) + except keystone_exception.ClientException as e: + LOG.error( + "Failed to delete compute node resource provider " + "for compute node %s: %s", cn.uuid, str(e)) for nodename in nodenames: self._update_available_resource_for_node(context, nodename, diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 5b9e64e2190..cf75d4be651 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -933,14 +933,41 @@ def _update_available_resource(self, context, resources, startup=False): 'flavor', 'migration_context', 'resources']) - # Now calculate usage based on instance utilization: - instance_by_uuid = self._update_usage_from_instances( - context, instances, nodename) - # Grab all in-progress migrations and error migrations: migrations = objects.MigrationList.get_in_progress_and_error( context, self.host, nodename) + # Check for tracked instances with in-progress, incoming, but not + # finished migrations. For those instances the migration context + # is not applied yet (it will be during finish_resize when the + # migration goes to finished state). We need to manually and + # temporarily apply the migration context here when the resource usage is + # updated. See bug 1953359 for more details.
+ instance_by_uuid = {instance.uuid: instance for instance in instances} + for migration in migrations: + if ( + migration.instance_uuid in instance_by_uuid and + migration.dest_compute == self.host and + migration.dest_node == nodename + ): + # we do not check for the 'post-migrating' migration status + # as applying the migration context for an instance already + # in finished migration status is a no-op anyhow. + instance = instance_by_uuid[migration.instance_uuid] + LOG.debug( + 'Applying migration context for instance %s as it has an ' + 'incoming, in-progress migration %s. ' + 'Migration status is %s', + migration.instance_uuid, migration.uuid, migration.status + ) + # It is OK not to revert the migration context at the end of + # the periodic as the instance is not saved during the periodic + instance.apply_migration_context() + + # Now calculate usage based on instance utilization: + instance_by_uuid = self._update_usage_from_instances( + context, instances, nodename) + self._pair_instances_to_migrations(migrations, instance_by_uuid) self._update_usage_from_migrations(context, migrations, nodename) @@ -1988,3 +2015,21 @@ def finish_evacuation(self, instance, node, migration): if migration: migration.status = 'done' migration.save() + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def clean_compute_node_cache(self, compute_nodes_in_db): + """Clean the compute node cache of any nodes that no longer exist. + + :param compute_nodes_in_db: list of ComputeNode objects from the DB. + """ + compute_nodes_in_db_nodenames = {cn.hypervisor_hostname + for cn in compute_nodes_in_db} + stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames + + for stale_cn in stale_cns: + # NOTE(mgoddard): we have found a node in the cache that has no + # compute node in the DB. This could be due to a node rebalance + # where another compute service took ownership of the node. Clean + # up the cache. + self.remove_node(stale_cn) + self.reportclient.invalidate_resource_provider(stale_cn) diff --git a/nova/conf/pci.py b/nova/conf/pci.py index b812b39676a..b383d0a69fc 100644 --- a/nova/conf/pci.py +++ b/nova/conf/pci.py @@ -116,7 +116,13 @@ ``address`` PCI address of the device. Both traditional glob style and regular - expression syntax is supported. + expression syntax is supported. Please note that the address fields are + restricted to the following maximum values: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 ``devname`` Device name of the device (for e.g. interface name). Not all PCI devices diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py index 8eadc0b6ec3..efb5aa4940d 100644 --- a/nova/conf/workarounds.py +++ b/nova/conf/workarounds.py @@ -345,6 +345,67 @@ * :oslo.config:option:`DEFAULT.instances_path` * :oslo.config:option:`image_cache.subdirectory_name` * :oslo.config:option:`update_resources_interval` +"""), + cfg.ListOpt('wait_for_vif_plugged_event_during_hard_reboot', + item_type=cfg.types.String( + choices=[ + "normal", + "direct", + "macvtap", + "baremetal", + "direct-physical", + "virtio-forwarder", + "smart-nic", + ]), + default=[], + help=""" +The libvirt virt driver implements power on and hard reboot by tearing down +every vif of the instance being rebooted and then plugging them again. By default nova +does not wait for the network-vif-plugged event from neutron before it lets the +instance run.
This can cause the instance to request the IP via DHCP before +the neutron backend has a chance to set up the networking backend after the vif +plug. + +This flag defines which vifs nova expects network-vif-plugged events from +during hard reboot. The possible values are neutron port vnic types: + +* normal +* direct +* macvtap +* baremetal +* direct-physical +* virtio-forwarder +* smart-nic + +Adding a ``vnic_type`` to this configuration makes Nova wait for a +network-vif-plugged event for each of the instance's vifs having the specific +``vnic_type`` before unpausing the instance, similarly to how new instance +creation works. + +Please note that not all neutron networking backends send plug time events for +every ``vnic_type``; therefore this config is empty by default. + +The ml2/ovs and the networking-odl backends are known to send plug time events +for ports with ``normal`` ``vnic_type`` so it is safe to add ``normal`` to this +config if you are using only those backends in the compute host. + +The neutron in-tree SRIOV backend does not reliably send network-vif-plugged +event during plug time for ports with ``direct`` vnic_type and never sends +that event for ports with ``direct-physical`` vnic_type during plug time. For +other ``vnic_type`` and backend pairs, please consult the developers of the +backend. + +Related options: + +* :oslo.config:option:`DEFAULT.vif_plugging_timeout` +"""), + cfg.BoolOpt('skip_cpu_compare_on_dest', + default=False, + help=""" +With the libvirt driver, during live migration, skip comparing guest CPU +with the destination host. When using QEMU >= 2.9 and libvirt >= +4.4.0, libvirt will do the correct thing with respect to checking CPU +compatibility on the destination host during live migration. """), ] diff --git a/nova/console/websocketproxy.py b/nova/console/websocketproxy.py index 5f2985dfbbc..8512ac62ad7 100644 --- a/nova/console/websocketproxy.py +++ b/nova/console/websocketproxy.py @@ -19,6 +19,8 @@ ''' import copy +from http import HTTPStatus +import os import socket import sys @@ -281,6 +283,22 @@ def new_websocket_client(self): def socket(self, *args, **kwargs): return websockifyserver.WebSockifyServer.socket(*args, **kwargs) + def send_head(self): + # This code is copied from this example patch: + # https://bugs.python.org/issue32084#msg306545 + path = self.translate_path(self.path) + if os.path.isdir(path): + parts = urlparse.urlsplit(self.path) + if not parts.path.endswith('/'): + # Browsers interpret "Location: //uri" as an absolute URI + # like "http://URI" + if self.path.startswith('//'): + self.send_error(HTTPStatus.BAD_REQUEST, + "URI must not start with //") + return None + + return super(NovaProxyRequestHandler, self).send_head() + class NovaWebSocketProxy(websockify.WebSocketProxy): def __init__(self, *args, **kwargs): diff --git a/nova/db/api.py b/nova/db/api.py index 14578c1efdb..22d263f2a59 100644 --- a/nova/db/api.py +++ b/nova/db/api.py @@ -344,15 +344,16 @@ def compute_node_update(context, compute_id, values): return IMPL.compute_node_update(context, compute_id, values) -def compute_node_delete(context, compute_id): +def compute_node_delete(context, compute_id, compute_host): """Delete a compute node from the database. :param context: The security context :param compute_id: ID of the compute node + :param compute_host: Hostname of the compute service Raises ComputeHostNotFound if compute node with the given ID doesn't exist.
""" - return IMPL.compute_node_delete(context, compute_id) + return IMPL.compute_node_delete(context, compute_id, compute_host) def compute_node_statistics(context): diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py index 9bc673faa82..03240bd2abe 100644 --- a/nova/db/sqlalchemy/api.py +++ b/nova/db/sqlalchemy/api.py @@ -690,7 +690,7 @@ def compute_node_search_by_hypervisor(context, hypervisor_match): @pick_context_manager_writer -def compute_node_create(context, values): +def _compute_node_create(context, values): """Creates a new ComputeNode and populates the capacity fields with the most recent data. """ @@ -698,8 +698,21 @@ def compute_node_create(context, values): compute_node_ref = models.ComputeNode() compute_node_ref.update(values) + compute_node_ref.save(context.session) + return compute_node_ref + + +# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer, +# so that we get a separate transaction in the exception handler. This avoids +# an error message about inactive DB sessions during a transaction rollback. +# See https://bugs.launchpad.net/nova/+bug/1853159. +def compute_node_create(context, values): + """Creates a new ComputeNode and populates the capacity fields + with the most recent data. Will restore a soft deleted compute node if a + UUID has been explicitly requested. + """ try: - compute_node_ref.save(context.session) + compute_node_ref = _compute_node_create(context, values) except db_exc.DBDuplicateEntry: with excutils.save_and_reraise_exception(logger=LOG) as err_ctx: # Check to see if we have a (soft) deleted ComputeNode with the @@ -763,10 +776,10 @@ def compute_node_update(context, compute_id, values): @pick_context_manager_writer -def compute_node_delete(context, compute_id): +def compute_node_delete(context, compute_id, compute_host): """Delete a ComputeNode record.""" result = model_query(context, models.ComputeNode).\ - filter_by(id=compute_id).\ + filter_by(id=compute_id, host=compute_host).\ soft_delete(synchronize_session=False) if not result: diff --git a/nova/middleware.py b/nova/middleware.py index 717fecd4efe..8b0fc595615 100644 --- a/nova/middleware.py +++ b/nova/middleware.py @@ -24,11 +24,15 @@ def set_defaults(): 'X-Roles', 'X-Service-Catalog', 'X-User-Id', - 'X-Tenant-Id'], + 'X-Tenant-Id', + 'X-OpenStack-Nova-API-Version', + 'OpenStack-API-Version'], expose_headers=['X-Auth-Token', 'X-Openstack-Request-Id', 'X-Subject-Token', - 'X-Service-Token'], + 'X-Service-Token', + 'X-OpenStack-Nova-API-Version', + 'OpenStack-API-Version'], allow_methods=['GET', 'PUT', 'POST', diff --git a/nova/network/neutron.py b/nova/network/neutron.py index db4cb44d473..b05ab1a0558 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -271,6 +271,7 @@ def _get_ksa_client(context, admin=False): client = utils.get_ksa_adapter( 'network', ksa_auth=auth_plugin, ksa_session=session) client.additional_headers = {'accept': 'application/json'} + client.connect_retries = CONF.neutron.http_retries return client @@ -802,9 +803,15 @@ def _process_security_groups(self, instance, neutron, security_groups): # TODO(arosen) Should optimize more to do direct query for security # group if len(security_groups) == 1 if len(security_groups): + # NOTE(slaweq): fields other than name and id aren't really needed + # so asking only about those fields will allow Neutron to not + # prepare list of rules for each found security group. 
That may + speed up processing of this request a lot in cases when a tenant has + many security groups + sg_fields = ['id', 'name'] search_opts = {'tenant_id': instance.project_id} user_security_groups = neutron.list_security_groups( - **search_opts).get('security_groups') + fields=sg_fields, **search_opts).get('security_groups') for security_group in security_groups: name_match = None diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py index cf5d3ca8d88..fa327fbefdb 100644 --- a/nova/objects/compute_node.py +++ b/nova/objects/compute_node.py @@ -360,7 +360,7 @@ def save(self, prune_stats=False): @base.remotable def destroy(self): - db.compute_node_delete(self._context, self.id) + db.compute_node_delete(self._context, self.id, self.host) def update_from_virt_driver(self, resources): # NOTE(pmurray): the virt driver provides a dict of values that diff --git a/nova/pci/manager.py b/nova/pci/manager.py index 05930b0bebb..c55e732c01f 100644 --- a/nova/pci/manager.py +++ b/nova/pci/manager.py @@ -117,8 +117,42 @@ def update_devices_from_hypervisor_resources(self, devices_json): devices = [] for dev in jsonutils.loads(devices_json): - if self.dev_filter.device_assignable(dev): - devices.append(dev) + try: + if self.dev_filter.device_assignable(dev): + devices.append(dev) + except exception.PciConfigInvalidWhitelist as e: + # The raised exception is misleading as the problem is not with + # the whitelist config but with the host PCI device reported by + # libvirt. The code that matches the host PCI device to the + # whitelist spec reuses the WhitelistPciAddress object to parse + # the host PCI device address. That parsing can fail if the + # PCI address has a 32-bit domain. But this should not prevent + # processing the rest of the devices. So we simply skip this + # device and continue. + # Please note that this except block does not ignore the + # invalid whitelist configuration. The whitelist config has + # already been parsed or rejected in case it was invalid. At + # this point the self.dev_filter represents the parsed and + # validated whitelist config. + LOG.debug( + 'Skipping PCI device %s reported by the hypervisor: %s', + {k: v for k, v in dev.items() + if k in ['address', 'parent_addr']}, + # NOTE(gibi): this is ugly but the device_assignable() call + # uses the PhysicalPciAddress class to parse the PCI + # addresses and that class reuses the code from + # PciAddressSpec that was originally designed to parse + # whitelist spec. Hence the raised exception talks about + # whitelist config. This is misleading as in our case the + # PCI address that we failed to parse came from the + # hypervisor.
+ # TODO(gibi): refactor this false abstraction so that the + # code reuse does not rely on the false assumption that we only + # parse whitelist config with + # devspec.PciAddressSpec._set_pci_dev_info() + str(e).replace( + 'Invalid PCI devices Whitelist config:', 'The')) + self._set_hvdevs(devices) @staticmethod diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index d09f1334877..f1f60fd67bc 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -678,11 +678,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None): if resp: LOG.info("Deleted resource provider %s", rp_uuid) # clean the caches - try: - self._provider_tree.remove(rp_uuid) - except ValueError: - pass - self._association_refresh_time.pop(rp_uuid, None) + self.invalidate_resource_provider(rp_uuid) return msg = ("[%(placement_req_id)s] Failed to delete resource provider " @@ -2181,6 +2177,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False): # left a no-op for backward compatibility. pass + def invalidate_resource_provider(self, name_or_uuid): + """Invalidate the cache for a resource provider. + + :param name_or_uuid: Name or UUID of the resource provider to look up. + """ + try: + self._provider_tree.remove(name_or_uuid) + except ValueError: + pass + self._association_refresh_time.pop(name_or_uuid, None) + def get_provider_by_name(self, context, name): """Queries the placement API for resource provider information matching a supplied name. diff --git a/nova/storage/rbd_utils.py b/nova/storage/rbd_utils.py index 22bafe50536..431dfc9aec0 100644 --- a/nova/storage/rbd_utils.py +++ b/nova/storage/rbd_utils.py @@ -405,7 +405,14 @@ def get_pool_info(self): # MAX_AVAIL stat will divide by the replication size when doing the # calculation. args = ['ceph', 'df', '--format=json'] + self.ceph_args() - out, _ = processutils.execute(*args) + + try: + out, _ = processutils.execute(*args) + except processutils.ProcessExecutionError: + LOG.exception('Could not determine disk usage') + raise exception.StorageError( + reason='Could not determine disk usage') + stats = jsonutils.loads(out) # Find the pool for which we are configured. diff --git a/nova/test.py b/nova/test.py index 89e4bed0842..9bb6e5e032a 100644 --- a/nova/test.py +++ b/nova/test.py @@ -54,6 +54,7 @@ from sqlalchemy.dialects import sqlite import testtools +from nova.api.openstack import wsgi_app from nova.compute import rpcapi as compute_rpcapi from nova import context from nova.db.sqlalchemy import api as sqlalchemy_api @@ -285,6 +286,10 @@ def setUp(self): self.useFixture(nova_fixtures.GenericPoisonFixture()) + # make sure that the wsgi app is fully initialized for every test case + # instead of being initialized only once per test worker + wsgi_app.init_global_data.reset() + def _setup_cells(self): """Setup a normal cellsv2 environment.
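The nova/test.py hunk above resets wsgi_app.init_global_data before each test case; that attribute comes from nova's run-once guard, whose expected behaviour is pinned down later in this diff by the new RunOnceTests in nova/tests/unit/test_utils.py. As orientation only, a minimal sketch of such a guard (simplified names, not the exact nova.utils implementation) consistent with those tests could look like:

    import functools

    def run_once(message, logger, cleanup=None):
        """Run the decorated function only once per process.

        Later calls only log ``message``; reset() restores the initial
        state and runs ``cleanup`` if the function had already run.
        """
        def outer(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if not wrapper.called:
                    try:
                        return func(*args, **kwargs)
                    finally:
                        # mark as called even if func raised
                        wrapper.called = True
                else:
                    logger(message)

            def reset():
                try:
                    if wrapper.called and cleanup:
                        cleanup()
                finally:
                    # reset the flag even if cleanup raised
                    wrapper.called = False

            wrapper.called = False
            wrapper.reset = reset
            return wrapper
        return outer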
diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index f0e7e148ebd..44afcb0381a 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -94,6 +94,12 @@ def router(self): return rpc.ClientRouter(default_client) +# placeholder used as a default parameter value to distinguish the case +# when the parameter is explicitly passed as None by the caller from the case +# when it was not specified at all +NOT_SPECIFIED = object() + + class InstanceHelperMixin: def _wait_for_server_parameter( @@ -385,7 +391,15 @@ def _create_server(self, name=None, image_uuid=None, flavor_id=None, """ # if forcing the server onto a host, we have to use the admin API if not api: - api = self.api if not az else getattr(self, 'admin_api', self.api) + api = self.api if not az and not host else getattr( + self, 'admin_api', self.api) + + if host and not api.microversion: + api.microversion = '2.74' + # with 2.74 networks param needs to use 'none' instead of None + # if no network is needed + if networks is None: + networks = 'none' + body = self._build_server( name, image_uuid, flavor_id, networks, az, host) @@ -485,12 +499,42 @@ def _unshelve_server(self, server, expected_state='ACTIVE'): self.api.post_server_action(server['id'], {'unshelve': {}}) return self._wait_for_state_change(server, expected_state) - def _evacuate_server(self, server, host, expected_state='ACTIVE'): + def _evacuate_server( + self, server, extra_post_args=None, expected_host=None, + expected_state='ACTIVE', expected_task_state=NOT_SPECIFIED, + expected_migration_status='done'): """Evacuate a server.""" - self.api.post_server_action(server['id'], {'evacuate': {}}) - self._wait_for_server_parameter( - self.server, {'OS-EXT-SRV-ATTR:host': host, - 'status': expected_state}) + api = getattr(self, 'admin_api', self.api) + + post = {'evacuate': {}} + if extra_post_args: + post['evacuate'].update(extra_post_args) + + expected_result = {'status': expected_state} + if expected_host: + expected_result['OS-EXT-SRV-ATTR:host'] = expected_host + if expected_task_state is not NOT_SPECIFIED: + expected_result['OS-EXT-STS:task_state'] = expected_task_state + + api.post_server_action(server['id'], post) + + # NOTE(gibi): The order of waiting for the migration and returning + # a fresh server from _wait_for_server_parameter is important as + # the compute manager sets the status of the instance before it sets + # the host and finally sets the migration status. So waiting for the + # migration first makes the returned server object more consistent.
+ self._wait_for_migration_status(server, [expected_migration_status]) + return self._wait_for_server_parameter(server, expected_result) + + def _start_server(self, server): + self.api.post_server_action(server['id'], {'os-start': None}) + return self._wait_for_state_change(server, 'ACTIVE') + + def _stop_server(self, server, wait_for_stop=True): + self.api.post_server_action(server['id'], {'os-stop': None}) + if wait_for_stop: + return self._wait_for_state_change(server, 'SHUTOFF') + return server class PlacementHelperMixin: diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py index 28f8463aeae..db61b5fd4b3 100644 --- a/nova/tests/functional/libvirt/test_numa_servers.py +++ b/nova/tests/functional/libvirt/test_numa_servers.py @@ -21,6 +21,7 @@ from oslo_log import log as logging import nova +from nova.compute import manager from nova.conf import neutron as neutron_conf from nova import context as nova_context from nova import objects @@ -711,6 +712,211 @@ def fake_confirm_migration(*args, **kwargs): server = self._wait_for_state_change(server, 'ACTIVE') + def _assert_pinned_cpus(self, hostname, expected_number_of_pinned): + numa_topology = objects.NUMATopology.obj_from_db_obj( + objects.ComputeNode.get_by_nodename( + self.ctxt, hostname, + ).numa_topology, + ) + self.assertEqual( + expected_number_of_pinned, len(numa_topology.cells[0].pinned_cpus)) + + def _create_server_and_resize_bug_1944759(self): + self.flags( + cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute') + self.flags(vcpu_pin_set=None) + + # start services + self.start_compute(hostname='test_compute0') + self.start_compute(hostname='test_compute1') + + flavor_a_id = self._create_flavor( + vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'}) + server = self._create_server(flavor_id=flavor_a_id) + + src_host = server['OS-EXT-SRV-ATTR:host'] + self._assert_pinned_cpus(src_host, 2) + + # we don't really care what the new flavor is, so long as the old + # flavor is using pinning. We use a similar flavor for simplicity. + flavor_b_id = self._create_flavor( + vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'}) + + orig_rpc_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize + + # Simulate that the finish_resize call overlaps with an + # update_available_resource periodic job + def inject_periodic_to_finish_resize(*args, **kwargs): + self._run_periodics() + return orig_rpc_finish_resize(*args, **kwargs) + + self.stub_out( + 'nova.compute.rpcapi.ComputeAPI.finish_resize', + inject_periodic_to_finish_resize, + ) + + # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should + # probably be less...dumb + with mock.patch( + 'nova.virt.libvirt.driver.LibvirtDriver' + '.migrate_disk_and_power_off', return_value='{}', + ): + post = {'resize': {'flavorRef': flavor_b_id}} + self.api.post_server_action(server['id'], post) + server = self._wait_for_state_change(server, 'VERIFY_RESIZE') + + dst_host = server['OS-EXT-SRV-ATTR:host'] + + # we have 2 cpus pinned on both computes. 
The source should have it + # due to the outbound migration and the destination due to the + # instance running there + self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 2) + + return server, src_host, dst_host + + def test_resize_confirm_bug_1944759(self): + server, src_host, dst_host = ( + self._create_server_and_resize_bug_1944759()) + + # Now confirm the resize + post = {'confirmResize': None} + + self.api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + + # the resource allocation reflects that the VM is running on the dest + # node + self._assert_pinned_cpus(src_host, 0) + self._assert_pinned_cpus(dst_host, 2) + + # and running periodics does not break it either + self._run_periodics() + + self._assert_pinned_cpus(src_host, 0) + self._assert_pinned_cpus(dst_host, 2) + + def test_resize_revert_bug_1944759(self): + server, src_host, dst_host = ( + self._create_server_and_resize_bug_1944759()) + + # Now revert the resize + post = {'revertResize': None} + + # reverts actually succeeds (not like confirm) but the resource + # allocation is still flaky + self.api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + + # After the revert the source host should have 2 cpus pinned due to + # the instance. + self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 0) + + # running the periodic job will not break it either + self._run_periodics() + + self._assert_pinned_cpus(src_host, 2) + self._assert_pinned_cpus(dst_host, 0) + + def test_resize_dedicated_policy_race_on_dest_bug_1953359(self): + + self.flags(cpu_dedicated_set='0-2', cpu_shared_set=None, + group='compute') + self.flags(vcpu_pin_set=None) + + host_info = fakelibvirt.HostInfo(cpu_nodes=1, cpu_sockets=1, + cpu_cores=2, cpu_threads=1, + kB_mem=15740000) + self.start_compute(host_info=host_info, hostname='compute1') + + extra_spec = { + 'hw:cpu_policy': 'dedicated', + } + flavor_id = self._create_flavor(vcpu=1, extra_spec=extra_spec) + expected_usage = {'DISK_GB': 20, 'MEMORY_MB': 2048, 'PCPU': 1} + + server = self._run_build_test(flavor_id, expected_usage=expected_usage) + + inst = objects.Instance.get_by_uuid(self.ctxt, server['id']) + self.assertEqual(1, len(inst.numa_topology.cells)) + # assert that the pcpu 0 is used on compute1 + self.assertEqual({'0': 0}, inst.numa_topology.cells[0].cpu_pinning_raw) + + # start another compute with the same config + self.start_compute(host_info=host_info, hostname='compute2') + + # boot another instance but now on compute2 so that it occupies the + # pcpu 0 on compute2 + # NOTE(gibi): _run_build_test cannot be used here as it assumes only + # compute1 exists + server2 = self._create_server( + flavor_id=flavor_id, + host='compute2', + ) + inst2 = objects.Instance.get_by_uuid(self.ctxt, server2['id']) + self.assertEqual(1, len(inst2.numa_topology.cells)) + # assert that the pcpu 0 is used + self.assertEqual( + {'0': 0}, inst2.numa_topology.cells[0].cpu_pinning_raw) + + # migrate the first instance from compute1 to compute2 but stop + # migrating at the start of finish_resize. Then start a racing periodic + # update_available_resources. 
+ orig_finish_resize = manager.ComputeManager.finish_resize + + def fake_finish_resize(*args, **kwargs): + # start a racing update_available_resource periodic + self._run_periodics() + # we expect that CPU pinning fails on the destination node + # as the resource_tracker will use the source node numa_topology + # and that does not fit the dest node as pcpu 0 in the dest + # is already occupied. + log = self.stdlog.logger.output + # The resize_claim correctly calculates that the instance should be + # pinned to pcpu id 1 instead of 0 + self.assertIn( + 'Computed NUMA topology CPU pinning: usable pCPUs: [[1]], ' + 'vCPUs mapping: [(0, 1)]', + log, + ) + # We expect that the periodic does not fail as bug 1953359 is fixed. + log = self.stdlog.logger.output + self.assertIn('Running periodic for compute (compute2)', log) + self.assertNotIn('Error updating resources for node compute2', log) + self.assertNotIn( + 'nova.exception.CPUPinningInvalid: CPU set to pin [0] must be ' + 'a subset of free CPU set [1]', + log, + ) + + # now let the resize finish + return orig_finish_resize(*args, **kwargs) + + # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should + # probably be less...dumb + with mock.patch('nova.virt.libvirt.driver.LibvirtDriver' + '.migrate_disk_and_power_off', return_value='{}'): + with mock.patch( + 'nova.compute.manager.ComputeManager.finish_resize', + new=fake_finish_resize, + ): + post = {'migrate': None} + # this is expected to succeed + self.admin_api.post_server_action(server['id'], post) + + server = self._wait_for_state_change(server, 'VERIFY_RESIZE') + + # As bug 1953359 is fixed, the revert should succeed too + post = {'revertResize': {}} + self.admin_api.post_server_action(server['id'], post) + self._wait_for_state_change(server, 'ACTIVE') + self.assertNotIn( + 'nova.exception.CPUUnpinningInvalid: CPU set to unpin [1] must be ' + 'a subset of pinned CPU set [0]', + self.stdlog.logger.output, + ) + class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest): diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py new file mode 100644 index 00000000000..2ec69482a25 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -0,0 +1,171 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock + +from nova import context +from nova import objects +from nova.tests.functional import integrated_helpers + + +class NodeRebalanceDeletedComputeNodeRaceTestCase( + integrated_helpers.ProviderUsageBaseTestCase, +): + """Regression test for bug 1853009 observed in Rocky & later. + + When an ironic node re-balances from one host to another, there can be a + race where the old host deletes the orphan compute node after the new host + has taken ownership of it, which results in the new host failing to create + the compute node and resource provider because the ResourceTracker does not + detect a change.
+ """ + # Make sure we're using the fake driver that has predictable uuids + # for each node. + compute_driver = 'fake.PredictableNodeUUIDDriver' + + def _assert_hypervisor_api(self, nodename, expected_host): + # We should have one compute node shown by the API. + hypervisors = self.api.api_get( + '/os-hypervisors/detail' + ).body['hypervisors'] + self.assertEqual(1, len(hypervisors), hypervisors) + hypervisor = hypervisors[0] + self.assertEqual(nodename, hypervisor['hypervisor_hostname']) + self.assertEqual(expected_host, hypervisor['service']['host']) + + def _start_compute(self, host): + host = self.start_service('compute', host) + # Ironic compute driver has rebalances_nodes = True. + host.manager.driver.rebalances_nodes = True + return host + + def setUp(self): + super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp() + + self.nodename = 'fake-node' + self.ctxt = context.get_admin_context() + + def test_node_rebalance_deleted_compute_node_race(self): + # Simulate a service running and then stopping. host_a runs, creates + # fake-node, then is stopped. The fake-node compute node is destroyed. + # This leaves a soft-deleted node in the DB. + host_a = self._start_compute('host_a') + host_a.manager.driver._set_nodes([self.nodename]) + host_a.manager.update_available_resource(self.ctxt) + host_a.stop() + cn = objects.ComputeNode.get_by_host_and_nodename( + self.ctxt, 'host_a', self.nodename, + ) + cn.destroy() + + self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt))) + + # Now we create a new compute service to manage our node. + host_b = self._start_compute('host_b') + host_b.manager.driver._set_nodes([self.nodename]) + + # When start_service runs, it will create a host_b ComputeNode. We want + # to delete that and inject our fake node into the driver which will + # be re-balanced to another host later. First assert this actually + # exists. + self._assert_hypervisor_api('host_b', expected_host='host_b') + + # Now run the update_available_resource periodic to register fake-node + # and have it managed by host_b. This will also detect the "host_b" + # node as orphaned and delete it along with its resource provider. + + # host_b[1]: Finds no compute record in RT. Tries to create one + # (_init_compute_node). + host_b.manager.update_available_resource(self.ctxt) + self._assert_hypervisor_api(self.nodename, expected_host='host_b') + # There should only be one resource provider (fake-node). + original_rps = self._get_all_providers() + self.assertEqual(1, len(original_rps), original_rps) + self.assertEqual(self.nodename, original_rps[0]['name']) + + # Simulate a re-balance by restarting host_a and make it manage + # fake-node. At this point both host_b and host_a think they own + # fake-node. + host_a = self._start_compute('host_a') + host_a.manager.driver._set_nodes([self.nodename]) + + # host_a[1]: Finds no compute record in RT, 'moves' existing node from + # host_b + host_a.manager.update_available_resource(self.ctxt) + # Assert that fake-node was re-balanced from host_b to host_a. + self.assertIn( + 'ComputeNode fake-node moving from host_b to host_a', + self.stdlog.logger.output) + self._assert_hypervisor_api(self.nodename, expected_host='host_a') + + # host_a[2]: Begins periodic update, queries compute nodes for this + # host, finds the fake-node. 
+ cn = objects.ComputeNode.get_by_host_and_nodename( + self.ctxt, 'host_a', self.nodename, + ) + + # host_b[2]: Finds no compute record in RT, 'moves' existing node from + # host_a + host_b.manager.update_available_resource(self.ctxt) + # Assert that fake-node was re-balanced from host_a to host_b. + self.assertIn( + 'ComputeNode fake-node moving from host_a to host_b', + self.stdlog.logger.output) + self._assert_hypervisor_api(self.nodename, expected_host='host_b') + + # Complete rebalance, as host_a realises it does not own fake-node. + host_a.manager.driver._set_nodes([]) + + # host_a[2]: Deletes orphan compute node. + # Mock out the compute node query to simulate a race condition where + # the list includes an orphan compute node that is newly owned by + # host_b by the time host_a attempts to delete it. + with mock.patch( + 'nova.compute.manager.ComputeManager._get_compute_nodes_in_db' + ) as mock_get: + mock_get.return_value = [cn] + host_a.manager.update_available_resource(self.ctxt) + + # Verify that the node was almost deleted, but was saved by the host + # check + self.assertIn( + 'Deleting orphan compute node %s hypervisor host ' + 'is fake-node, nodes are' % cn.id, + self.stdlog.logger.output) + self.assertIn( + 'Ignoring failure to delete orphan compute node %s on ' + 'hypervisor host fake-node' % cn.id, + self.stdlog.logger.output) + self._assert_hypervisor_api(self.nodename, 'host_b') + rps = self._get_all_providers() + self.assertEqual(1, len(rps), rps) + self.assertEqual(self.nodename, rps[0]['name']) + + # Simulate deletion of an orphan by host_a. It shouldn't happen + # anymore, but let's assume it already did. + cn = objects.ComputeNode.get_by_host_and_nodename( + self.ctxt, 'host_b', self.nodename) + cn.destroy() + host_a.manager.rt.remove_node(cn.hypervisor_hostname) + host_a.manager.reportclient.delete_resource_provider( + self.ctxt, cn, cascade=True) + + # host_b[3]: Should recreate compute node and resource provider. + host_b.manager.update_available_resource(self.ctxt) + + # Verify that the node was recreated. + self._assert_hypervisor_api(self.nodename, 'host_b') + + # The resource provider has now been created. + rps = self._get_all_providers() + self.assertEqual(1, len(rps), rps) + self.assertEqual(self.nodename, rps[0]['name']) diff --git a/nova/tests/functional/regressions/test_bug_1978983.py b/nova/tests/functional/regressions/test_bug_1978983.py new file mode 100644 index 00000000000..51465900da0 --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1978983.py @@ -0,0 +1,71 @@ +# Copyright 2022 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ + from nova import test + from nova.tests import fixtures as nova_fixtures + from nova.tests.functional import fixtures as func_fixtures + from nova.tests.functional import integrated_helpers + + + class EvacuateServerWithTaskState( + test.TestCase, integrated_helpers.InstanceHelperMixin, + ): + """Regression test for bug 1978983 + + If the instance task state is powering-off, or otherwise not None, the + instance should still be allowed to evacuate. + """ + + def setUp(self): + super().setUp() + # Stub out external dependencies. + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.GlanceFixture(self)) + self.useFixture(func_fixtures.PlacementFixture()) + self.useFixture(nova_fixtures.HostNameWeigherFixture()) + + # Start nova controller services. + self.start_service('conductor') + self.start_service('scheduler') + + api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + self.api = api_fixture.admin_api + self.api.microversion = 'latest' + + self.src = self._start_compute(host='host1') + self.dest = self._start_compute(host='host2') + + def test_evacuate_instance(self): + """Evacuate a server whose task state is not None. + """ + server = self._create_server(networks=[]) + + server = self._wait_for_state_change(server, 'ACTIVE') + self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host']) + + # stop the host1 compute service + self.src.stop() + self.api.put_service_force_down(self.src.service_ref.uuid, True) + + # power off the instance + self._stop_server(server, wait_for_stop=False) + server = self._wait_for_server_parameter( + server, {'OS-EXT-STS:task_state': 'powering-off'}) + + # evacuate the instance + server = self._evacuate_server( + server, expected_host=self.dest.host + ) + self.assertEqual(self.dest.host, server['OS-EXT-SRV-ATTR:host']) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 426745cb8ae..90f3f48457d 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -8181,7 +8181,8 @@ def test_evacuate_ok(self): arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) compute_to_stop, compute_to_evacuate = self._test_evacuate( self.server, self.NUM_HOSTS) - self._evacuate_server(self.server, compute_to_evacuate.host) + self._evacuate_server(self.server, + expected_host=compute_to_evacuate.host) compute_to_stop.start() self.server = self.api.get_server(self.server['id']) arqs_new = self.cyborg.fake_get_arqs_for_instance(self.server['id']) diff --git a/nova/tests/unit/api/openstack/compute/test_services.py b/nova/tests/unit/api/openstack/compute/test_services.py index 170b39f2084..94263833c72 100644 --- a/nova/tests/unit/api/openstack/compute/test_services.py +++ b/nova/tests/unit/api/openstack/compute/test_services.py @@ -701,6 +701,21 @@ def test_services_delete(self, mock_get_compute_nodes): mock_get_compute_nodes.assert_called_once_with( self.req.environ['nova.context'], compute.host) + @mock.patch( + 'nova.objects.ComputeNodeList.get_all_by_host', + side_effect=exception.ComputeHostNotFound(host='fake-compute-host')) + def test_services_delete_compute_host_not_found( + self, mock_get_all_by_host): + compute = objects.Service(self.ctxt, + **{'host': 'fake-compute-host', + 'binary': 'nova-compute', + 'topic': 'compute', + 'report_count': 0}) + compute.create() + self.controller.delete(self.req, compute.id) + mock_get_all_by_host.assert_called_with( + self.req.environ['nova.context'], 'fake-compute-host') + def test_services_delete_not_found(self): self.assertRaises(webob.exc.HTTPNotFound,
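For orientation, the reworked _evacuate_server helper shown earlier in this diff replaces the old positional host argument with keyword expectations (expected_host, expected_state, expected_task_state, expected_migration_status) and waits for the migration record before re-reading the server. A purely hypothetical call that forces a target host and asserts the final state could look like the following sketch; the host name and expectations are invented for illustration, not taken from any test in this change:

    server = self._evacuate_server(
        server,
        # forwarded verbatim into the body of the 'evacuate' server action
        extra_post_args={'host': 'target-host'},
        expected_host='target-host',
        # expected_state defaults to 'ACTIVE'
        expected_task_state=None,
        expected_migration_status='done',
    )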
diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py new file mode 100644 index 00000000000..4cb7459c982 --- /dev/null +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -0,0 +1,85 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tempfile + +import fixtures +import mock +from oslo_config import fixture as config_fixture +from oslotest import base + +from nova.api.openstack import wsgi_app +from nova import test +from nova.tests import fixtures as nova_fixtures + + +class WSGIAppTest(base.BaseTestCase): + + _paste_config = """ +[app:nova-api] +use = egg:Paste#static +document_root = /tmp + """ + + def setUp(self): + # Ensure BaseTestCase's ConfigureLogging fixture is disabled since + # we're using our own (StandardLogging). + with fixtures.EnvironmentVariable('OS_LOG_CAPTURE', '0'): + super(WSGIAppTest, self).setUp() + self.stdlog = self.useFixture(nova_fixtures.StandardLogging()) + self.conf = tempfile.NamedTemporaryFile(mode='w+t') + self.conf.write(self._paste_config.lstrip()) + self.conf.seek(0) + self.conf.flush() + self.addCleanup(self.conf.close) + # Use of this fixture takes care of cleaning up config settings for + # subsequent tests. + self.useFixture(config_fixture.Config()) + + @mock.patch('sys.argv', return_value=mock.sentinel.argv) + @mock.patch('nova.db.sqlalchemy.api.configure') + @mock.patch('nova.api.openstack.wsgi_app._setup_service') + @mock.patch('nova.api.openstack.wsgi_app._get_config_files') + def test_init_application_called_twice(self, mock_get_files, mock_setup, + mock_db_configure, mock_argv): + """Test that init_application can tolerate being called twice in a + single python interpreter instance. + + When nova-api is run via mod_wsgi, if any exception is raised during + init_application, mod_wsgi will re-run the WSGI script without + restarting the daemon process even when configured for Daemon Mode. + + We access the database as part of init_application, so if nova-api + starts up before the database is up, we'll get, for example, a + DBConnectionError raised during init_application and our WSGI script + will get reloaded/re-run by mod_wsgi. + """ + mock_get_files.return_value = [self.conf.name] + mock_setup.side_effect = [test.TestingException, None] + # We need to mock the global database configure() method, else we will + # be affected by global database state altered by other tests that ran + # before this test, causing this test to fail with + # oslo_db.sqlalchemy.enginefacade.AlreadyStartedError. We can instead + # mock the method to raise an exception if it's called a second time in + # this test to simulate the fact that the database does not tolerate + # re-init [after a database query has been made]. + mock_db_configure.side_effect = [None, test.TestingException] + # Run init_application the first time, simulating an exception being + # raised during it. 
+ self.assertRaises(test.TestingException, wsgi_app.init_application, + 'nova-api') + # Now run init_application a second time, it should succeed since no + # exception is being raised (the init of global data should not be + # re-attempted). + wsgi_app.init_application('nova-api') + self.assertIn('Global data already initialized, not re-initializing.', + self.stdlog.logger.output) diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index fadf14f7fee..50a06fa9eae 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -202,8 +202,10 @@ def fake_get_compute_nodes_in_db(self, context, *args, **kwargs): context, objects.ComputeNode(), cn) for cn in fake_compute_nodes] - def fake_compute_node_delete(context, compute_node_id): + def fake_compute_node_delete(context, compute_node_id, + compute_node_host): self.assertEqual(2, compute_node_id) + self.assertEqual('fake_phyp1', compute_node_host) self.stub_out( 'nova.compute.manager.ComputeManager._get_compute_nodes_in_db', diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 93c895e8734..87de3ce7bdb 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -376,18 +376,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes, ) # First node in set should have been removed from DB + # Last node in set should have been added to DB. for db_node in db_nodes: if db_node.hypervisor_hostname == 'node1': db_node.destroy.assert_called_once_with() rc_mock.delete_resource_provider.assert_called_once_with( self.context, db_node, cascade=True) - mock_rt.remove_node.assert_called_once_with( - 'node1') + mock_rt.remove_node.assert_called_once_with('node1') mock_log.error.assert_called_once_with( "Failed to delete compute node resource provider for " "compute node %s: %s", db_node.uuid, mock.ANY) else: self.assertFalse(db_node.destroy.called) + self.assertEqual(1, mock_rt.remove_node.call_count) + mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'delete_resource_provider') @@ -409,6 +411,33 @@ def test_update_available_resource_not_ready(self, get_db_nodes, update_mock.assert_not_called() del_rp_mock.assert_not_called() + @mock.patch.object(manager.ComputeManager, + '_update_available_resource_for_node') + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db') + def test_update_available_resource_destroy_rebalance( + self, get_db_nodes, get_avail_nodes, update_mock): + mock_rt = self._mock_rt() + rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject( + self.compute, 'reportclient')).mock + db_nodes = [self._make_compute_node('node1', 1)] + get_db_nodes.return_value = db_nodes + # Destroy can fail if nodes were rebalanced between getting the node + # list and calling destroy. 
+ db_nodes[0].destroy.side_effect = exception.ComputeHostNotFound( + 'node1') + get_avail_nodes.return_value = set() + self.compute.update_available_resource(self.context) + get_db_nodes.assert_called_once_with(self.context, set(), + use_slave=True, startup=False) + self.assertEqual(0, update_mock.call_count) + + db_nodes[0].destroy.assert_called_once_with() + self.assertEqual(0, rc_mock.delete_resource_provider.call_count) + mock_rt.remove_node.assert_called_once_with('node1') + rc_mock.invalidate_resource_provider.assert_called_once_with( + db_nodes[0].uuid) + @mock.patch('nova.context.get_admin_context') def test_pre_start_hook(self, get_admin_context): """Very simple test just to make sure update_available_resource is @@ -3387,12 +3416,16 @@ def _test_check_can_live_migrate_destination(self, do_raise=False, CONF.host, instance.uuid, graceful_exit=False) return result + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_success(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_fail(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3402,7 +3435,9 @@ def test_check_can_live_migrate_destination_fail(self): self._test_check_can_live_migrate_destination, do_raise=True) - def test_check_can_live_migrate_destination_contins_vifs(self): + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) + def test_check_can_live_migrate_destination_contains_vifs(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) @@ -3410,6 +3445,8 @@ def test_check_can_live_migrate_destination_contins_vifs(self): self.assertIn('vifs', migrate_data) self.assertIsNotNone(migrate_data.vifs) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_no_binding_extended(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3417,18 +3454,40 @@ def test_check_can_live_migrate_destination_no_binding_extended(self): migrate_data = self._test_check_can_live_migrate_destination() self.assertNotIn('vifs', migrate_data) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_false(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=False) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_true(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=True) + 
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + def test_check_can_live_migrate_destination_fail_group_policy( + self, mock_fail_db): + + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.ACTIVE, + node='fake-node') + + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + exception.MigrationPreCheckError, + self.compute.check_can_live_migrate_destination, + self.context, instance, None, None, None, None) + def test_dest_can_numa_live_migrate(self): positive_dest_check_data = objects.LibvirtLiveMigrateData( dst_supports_numa_live_migration=True) @@ -7532,7 +7591,8 @@ def test_prep_block_device_maintain_original_error_message(self, def test_validate_policy_honors_workaround_disabled(self, mock_get): instance = objects.Instance(uuid=uuids.instance) hints = {'group': 'foo'} - mock_get.return_value = objects.InstanceGroup(policy=None) + mock_get.return_value = objects.InstanceGroup(policy=None, + uuid=uuids.group) self.compute._validate_instance_group_policy(self.context, instance, hints) mock_get.assert_called_once_with(self.context, 'foo') @@ -7558,10 +7618,14 @@ def test_validate_instance_group_policy_handles_hint_list(self, mock_get): instance, hints) mock_get.assert_called_once_with(self.context, uuids.group_hint) + @mock.patch('nova.objects.InstanceGroup.get_by_uuid') @mock.patch('nova.objects.InstanceList.get_uuids_by_host') @mock.patch('nova.objects.InstanceGroup.get_by_hint') - def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint, - mock_get_by_host): + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node') + def test_validate_instance_group_policy_with_rules( + self, migration_list, nodes, mock_get_by_hint, mock_get_by_host, + mock_get_by_uuid): # Create 2 instance in same host, inst2 created before inst1 instance = objects.Instance(uuid=uuids.inst1) hints = {'group': [uuids.group_hint]} @@ -7570,17 +7634,26 @@ def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint, mock_get_by_host.return_value = existing_insts # if group policy rules limit to 1, raise RescheduledException - mock_get_by_hint.return_value = objects.InstanceGroup( + group = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': '1'}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group + mock_get_by_uuid.return_value = group + nodes.return_value = ['nodename'] + migration_list.return_value = [objects.Migration( + uuid=uuids.migration, instance_uuid=uuids.instance)] self.assertRaises(exception.RescheduledException, self.compute._validate_instance_group_policy, self.context, instance, hints) # if group policy rules limit change to 2, validate OK - mock_get_by_hint.return_value = objects.InstanceGroup( + group2 = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': 2}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group2 + mock_get_by_uuid.return_value = group2 self.compute._validate_instance_group_policy(self.context, instance, hints) @@ -9107,6 +9180,8 @@ def test_max_concurrent_live_semaphore_unlimited(self, mock_executor): 
manager.ComputeManager() mock_executor.assert_called_once_with() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_cinder_v3_api(self): # This tests that pre_live_migration with a bdm with an # attachment_id, will create a new attachment and update @@ -9184,6 +9259,8 @@ def _test(mock_attach, mock_get_bdms, mock_ivbi, _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exception_cinder_v3_api(self): # The instance in this test has 2 attachments. The second attach_create # will throw an exception. This will test that the first attachment @@ -9253,6 +9330,8 @@ def _test(mock_attach_create, mock_attach_delete, mock_get_bdms, self.assertGreater(len(m.mock_calls), 0) _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exceptions_delete_attachments(self): # The instance in this test has 2 attachments. The call to # driver.pre_live_migration will raise an exception. This will test @@ -10631,6 +10710,54 @@ def fake_reschedule_resize_or_reraise(*args, **kwargs): # (_error_out_instance_on_exception will set to ACTIVE by default). self.assertEqual(vm_states.STOPPED, instance.vm_state) + @mock.patch('nova.compute.utils.notify_usage_exists') + @mock.patch('nova.compute.manager.ComputeManager.' + '_notify_about_instance_usage') + @mock.patch('nova.compute.utils.notify_about_resize_prep_instance') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation') + @mock.patch('nova.compute.manager.ComputeManager.' + '_reschedule_resize_or_reraise') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + # this is almost copy-paste from test_prep_resize_fails_rollback + def test_prep_resize_fails_group_validation( + self, add_instance_fault_from_exc, _reschedule_resize_or_reraise, + _revert_allocation, mock_instance_save, + notify_about_resize_prep_instance, _notify_about_instance_usage, + notify_usage_exists): + """Tests that if _validate_instance_group_policy raises + InstanceFaultRollback, the instance.vm_state is reset properly in + _error_out_instance_on_exception + """ + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.STOPPED, + node='fake-node', expected_attrs=['system_metadata', 'flavor']) + migration = mock.MagicMock(spec='nova.objects.Migration') + request_spec = mock.MagicMock(spec='nova.objects.RequestSpec') + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + ex2 = exception.InstanceFaultRollback( + inner_exception=ex) + + def fake_reschedule_resize_or_reraise(*args, **kwargs): + raise ex2 + + _reschedule_resize_or_reraise.side_effect = ( + fake_reschedule_resize_or_reraise) + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + # _error_out_instance_on_exception should reraise the + # RescheduledException inside InstanceFaultRollback. 
+ exception.RescheduledException, self.compute.prep_resize, + self.context, instance.image_meta, instance, instance.flavor, + request_spec, filter_properties={}, node=instance.node, + clean_shutdown=True, migration=migration, host_list=[]) + # The instance.vm_state should remain unchanged + # (_error_out_instance_on_exception will set to ACTIVE by default). + self.assertEqual(vm_states.STOPPED, instance.vm_state) + @mock.patch('nova.compute.rpcapi.ComputeAPI.resize_instance') @mock.patch('nova.compute.resource_tracker.ResourceTracker.resize_claim') @mock.patch('nova.objects.Instance.save') diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index ecac5ffb179..bb846dba3dd 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -4227,3 +4227,24 @@ def test__get_providers_to_update_not_in_tree(self, mock_log): mock_log.warning.assert_called_once_with(*expected_log_call) self.assertIn(uuids.unknown, self.rt.absent_providers) self.assertEqual(result, []) + + +class TestCleanComputeNodeCache(BaseTestCase): + + def setUp(self): + super(TestCleanComputeNodeCache, self).setUp() + self._setup_rt() + self.context = context.RequestContext( + mock.sentinel.user_id, mock.sentinel.project_id) + + @mock.patch.object(resource_tracker.ResourceTracker, "remove_node") + def test_clean_compute_node_cache(self, mock_remove): + invalid_nodename = "invalid-node" + self.rt.compute_nodes[_NODENAME] = self.compute + self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute + with mock.patch.object( + self.rt.reportclient, "invalidate_resource_provider", + ) as mock_invalidate: + self.rt.clean_compute_node_cache([self.compute]) + mock_remove.assert_called_once_with(invalid_nodename) + mock_invalidate.assert_called_once_with(invalid_nodename) diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py index 3c234df891c..92883134a2f 100644 --- a/nova/tests/unit/console/test_websocketproxy.py +++ b/nova/tests/unit/console/test_websocketproxy.py @@ -15,6 +15,7 @@ """Tests for nova websocketproxy.""" import copy +import io import socket import mock @@ -626,6 +627,72 @@ def test_tcp_rst_no_compute_rpcapi(self): self.wh.server.top_new_client(conn, address) self.assertIsNone(self.wh._compute_rpcapi) + def test_reject_open_redirect(self): + # This will test the behavior when an attempt is made to cause an open + # redirect. It should be rejected. + mock_req = mock.MagicMock() + mock_req.makefile().readline.side_effect = [ + b'GET //example.com/%2F.. HTTP/1.1\r\n', + b'' + ] + + client_addr = ('8.8.8.8', 54321) + mock_server = mock.MagicMock() + # This specifies that the server will be able to handle requests other + # than only websockets. + mock_server.only_upgrade = False + + # Constructing a handler will process the mock_req request passed in. + handler = websocketproxy.NovaProxyRequestHandler( + mock_req, client_addr, mock_server) + + # Collect the response data to verify at the end. The + # SimpleHTTPRequestHandler writes the response data to a 'wfile' + # attribute. + output = io.BytesIO() + handler.wfile = output + # Process the mock_req again to do the capture. + handler.do_GET() + output.seek(0) + result = output.readlines() + + # Verify no redirect happens and instead a 400 Bad Request is returned. 
+ self.assertIn('400 URI must not start with //', result[0].decode()) + + def test_reject_open_redirect_3_slashes(self): + # This will test the behavior when an attempt is made to cause an open + # redirect. It should be rejected. + mock_req = mock.MagicMock() + mock_req.makefile().readline.side_effect = [ + b'GET ///example.com/%2F.. HTTP/1.1\r\n', + b'' + ] + + # Collect the response data to verify at the end. The + # SimpleHTTPRequestHandler writes the response data by calling the + # request socket sendall() method. + self.data = b'' + + def fake_sendall(data): + self.data += data + + mock_req.sendall.side_effect = fake_sendall + + client_addr = ('8.8.8.8', 54321) + mock_server = mock.MagicMock() + # This specifies that the server will be able to handle requests other + # than only websockets. + mock_server.only_upgrade = False + + # Constructing a handler will process the mock_req request passed in. + websocketproxy.NovaProxyRequestHandler( + mock_req, client_addr, mock_server) + + # Verify no redirect happens and instead a 400 Bad Request is returned. + self.data = self.data.decode() + self.assertIn('Error code: 400', self.data) + self.assertIn('Message: URI must not start with //', self.data) + @mock.patch('websockify.websocketproxy.select_ssl_version') def test_ssl_min_version_is_not_set(self, mock_select_ssl): websocketproxy.NovaWebSocketProxy() diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py index 6b80f216612..47b7e87c61c 100644 --- a/nova/tests/unit/db/test_db_api.py +++ b/nova/tests/unit/db/test_db_api.py @@ -5283,6 +5283,20 @@ def test_compute_node_create_duplicate_host_hypervisor_hostname(self): self.assertRaises(db_exc.DBDuplicateEntry, db.compute_node_create, self.ctxt, other_node) + def test_compute_node_create_duplicate_uuid(self): + """Tests to make sure that no exception is raised when trying to create + a compute node with the same host, hypervisor_hostname and uuid values + as another compute node that was previously soft-deleted. + """ + # Prior to fixing https://bugs.launchpad.net/nova/+bug/1853159, this + # raised the following error: + # sqlalchemy.exc.InvalidRequestError: This session is in 'inactive' + # state, due to the SQL transaction being rolled back; no further SQL + # can be emitted within this transaction. 
+ db.compute_node_delete(self.ctxt, self.item['id'], self.item['host']) + new_node = db.compute_node_create(self.ctxt, self.compute_node_dict) + self.assertEqual(self.item['uuid'], new_node['uuid']) + def test_compute_node_get_all(self): nodes = db.compute_node_get_all(self.ctxt) self.assertEqual(1, len(nodes)) @@ -5393,7 +5407,7 @@ def test_compute_node_get_all_deleted_compute_node(self): # Now delete the newly-created compute node to ensure the related # compute node stats are wiped in a cascaded fashion - db.compute_node_delete(self.ctxt, node['id']) + db.compute_node_delete(self.ctxt, node['id'], node['host']) # Clean up the service db.service_destroy(self.ctxt, service['id']) @@ -5567,10 +5581,18 @@ def test_compute_node_update(self): def test_compute_node_delete(self): compute_node_id = self.item['id'] - db.compute_node_delete(self.ctxt, compute_node_id) + compute_node_host = self.item['host'] + db.compute_node_delete(self.ctxt, compute_node_id, compute_node_host) nodes = db.compute_node_get_all(self.ctxt) self.assertEqual(len(nodes), 0) + def test_compute_node_delete_different_host(self): + compute_node_id = self.item['id'] + compute_node_host = 'invalid-host' + self.assertRaises(exception.ComputeHostNotFound, + db.compute_node_delete, + self.ctxt, compute_node_id, compute_node_host) + def test_compute_node_search_by_hypervisor(self): nodes_created = [] new_service = copy.copy(self.service_dict) diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index e23203c04aa..fb7fa8713ca 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -253,6 +253,8 @@ def test_neutron_http_retries(self): auth_token='token') cl = neutronapi.get_client(my_context) self.assertEqual(retries, cl.httpclient.connect_retries) + kcl = neutronapi._get_ksa_client(my_context) + self.assertEqual(retries, kcl.connect_retries) class TestAPIBase(test.TestCase): diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py index aeaf4b957fe..237c0c7cb27 100644 --- a/nova/tests/unit/objects/test_compute_node.py +++ b/nova/tests/unit/objects/test_compute_node.py @@ -414,8 +414,9 @@ def test_set_id_failure(self, mock_get, db_mock): def test_destroy(self, mock_delete): compute = compute_node.ComputeNode(context=self.context) compute.id = 123 + compute.host = 'fake' compute.destroy() - mock_delete.assert_called_once_with(self.context, 123) + mock_delete.assert_called_once_with(self.context, 123, 'fake') @mock.patch.object(db, 'compute_node_get_all') def test_get_all(self, mock_get_all): diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py index fe7d918e27b..c1b26d9726c 100644 --- a/nova/tests/unit/pci/test_manager.py +++ b/nova/tests/unit/pci/test_manager.py @@ -236,6 +236,40 @@ def test_update_devices_from_hypervisor_resources(self, _mock_dev_assign): tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json) self.assertEqual(2, len(tracker.pci_devs)) + @mock.patch("nova.pci.manager.LOG.debug") + def test_update_devices_from_hypervisor_resources_32bit_domain( + self, mock_debug): + self.flags( + group='pci', + passthrough_whitelist=[ + '{"product_id":"2032", "vendor_id":"8086"}']) + # There are systems where a 32 bit PCI domain is used. See bug 1897528 + # for example. While nova (and qemu) does not support assigning such + # devices, their presence in the system should not + # lead to an error.
+ fake_pci = { + 'compute_node_id': 1, + 'address': '10000:00:02.0', + 'product_id': '2032', + 'vendor_id': '8086', + 'request_id': None, + 'status': fields.PciDeviceStatus.AVAILABLE, + 'dev_type': fields.PciDeviceType.STANDARD, + 'parent_addr': None, + 'numa_node': 0} + + fake_pci_devs = [fake_pci] + fake_pci_devs_json = jsonutils.dumps(fake_pci_devs) + tracker = manager.PciDevTracker(self.fake_context) + # We expect that the device with 32bit PCI domain is ignored + tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json) + self.assertEqual(0, len(tracker.pci_devs)) + mock_debug.assert_called_once_with( + 'Skipping PCI device %s reported by the hypervisor: %s', + {'address': '10000:00:02.0', 'parent_addr': None}, + 'The property domain (10000) is greater than the maximum ' + 'allowable value (FFFF).') + def test_set_hvdev_new_dev(self): fake_pci_3 = dict(fake_pci, address='0000:00:00.4', vendor_id='v2') fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_1), diff --git a/nova/tests/unit/storage/__init__.py b/nova/tests/unit/storage/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/nova/tests/unit/storage/test_rbd.py b/nova/tests/unit/storage/test_rbd.py index 5a39bdbd5a4..396f22c643a 100644 --- a/nova/tests/unit/storage/test_rbd.py +++ b/nova/tests/unit/storage/test_rbd.py @@ -13,6 +13,7 @@ from eventlet import tpool import mock +from oslo_concurrency import processutils from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids @@ -110,7 +111,7 @@ def setUp(self): self.rbd_pool = 'rbd' self.rbd_connect_timeout = 5 self.flags( - images_rbd_pool=self.images_rbd_pool, + images_rbd_pool=self.rbd_pool, images_rbd_ceph_conf='/foo/bar.conf', rbd_connect_timeout=self.rbd_connect_timeout, rbd_user='foo', group='libvirt') @@ -118,13 +119,15 @@ def setUp(self): rados_patcher = mock.patch.object(rbd_utils, 'rados') self.mock_rados = rados_patcher.start() self.addCleanup(rados_patcher.stop) - self.mock_rados.Rados = mock.Mock - self.mock_rados.Rados.ioctx = mock.Mock() - self.mock_rados.Rados.connect = mock.Mock() - self.mock_rados.Rados.shutdown = mock.Mock() - self.mock_rados.Rados.open_ioctx = mock.Mock() - self.mock_rados.Rados.open_ioctx.return_value = \ - self.mock_rados.Rados.ioctx + self.mock_rados.Rados = mock.Mock() + self.rados_inst = mock.Mock() + self.mock_rados.Rados.return_value = self.rados_inst + self.rados_inst.ioctx = mock.Mock() + self.rados_inst.connect = mock.Mock() + self.rados_inst.shutdown = mock.Mock() + self.rados_inst.open_ioctx = mock.Mock() + self.rados_inst.open_ioctx.return_value = \ + self.rados_inst.ioctx self.mock_rados.Error = Exception rbd_patcher = mock.patch.object(rbd_utils, 'rbd') @@ -338,33 +341,31 @@ def test_rbd_volume_proxy_init(self, mock_connect_from_rados, def test_connect_to_rados_default(self): ret = self.driver._connect_to_rados() - self.mock_rados.Rados.connect.assert_called_once_with( + self.rados_inst.connect.assert_called_once_with( timeout=self.rbd_connect_timeout) - self.assertTrue(self.mock_rados.Rados.open_ioctx.called) - self.assertIsInstance(ret[0], self.mock_rados.Rados) - self.assertEqual(self.mock_rados.Rados.ioctx, ret[1]) - self.mock_rados.Rados.open_ioctx.assert_called_with(self.rbd_pool) + self.assertTrue(self.rados_inst.open_ioctx.called) + self.assertEqual(self.rados_inst.ioctx, ret[1]) + self.rados_inst.open_ioctx.assert_called_with(self.rbd_pool) def test_connect_to_rados_different_pool(self): ret = self.driver._connect_to_rados('alt_pool') - 
self.mock_rados.Rados.connect.assert_called_once_with( + self.rados_inst.connect.assert_called_once_with( timeout=self.rbd_connect_timeout) - self.assertTrue(self.mock_rados.Rados.open_ioctx.called) - self.assertIsInstance(ret[0], self.mock_rados.Rados) - self.assertEqual(self.mock_rados.Rados.ioctx, ret[1]) - self.mock_rados.Rados.open_ioctx.assert_called_with('alt_pool') + self.assertTrue(self.rados_inst.open_ioctx.called) + self.assertEqual(self.rados_inst.ioctx, ret[1]) + self.rados_inst.open_ioctx.assert_called_with('alt_pool') def test_connect_to_rados_error(self): - self.mock_rados.Rados.open_ioctx.side_effect = self.mock_rados.Error + self.rados_inst.open_ioctx.side_effect = self.mock_rados.Error self.assertRaises(self.mock_rados.Error, self.driver._connect_to_rados) - self.mock_rados.Rados.open_ioctx.assert_called_once_with( + self.rados_inst.open_ioctx.assert_called_once_with( self.rbd_pool) - self.mock_rados.Rados.shutdown.assert_called_once_with() + self.rados_inst.shutdown.assert_called_once_with() def test_connect_to_rados_unicode_arg(self): self.driver._connect_to_rados(u'unicode_pool') - self.mock_rados.Rados.open_ioctx.assert_called_with( + self.rados_inst.open_ioctx.assert_called_with( test.MatchType(str)) def test_ceph_args_none(self): @@ -653,6 +654,11 @@ def test_get_pool_info(self, mock_execute): 'used': ceph_df_json['pools'][1]['stats']['bytes_used']} self.assertDictEqual(expected, self.driver.get_pool_info()) + @mock.patch('oslo_concurrency.processutils.execute', autospec=True, + side_effect=processutils.ProcessExecutionError("failed")) + def test_get_pool_info_execute_failed(self, mock_execute): + self.assertRaises(exception.StorageError, self.driver.get_pool_info) + @mock.patch('oslo_concurrency.processutils.execute') def test_get_pool_info_not_found(self, mock_execute): # Make the pool something other than self.rbd_pool so it won't be found diff --git a/nova/tests/unit/test_utils.py b/nova/tests/unit/test_utils.py index b64a753aeb2..b7516ef7e82 100644 --- a/nova/tests/unit/test_utils.py +++ b/nova/tests/unit/test_utils.py @@ -1294,3 +1294,101 @@ def test_api_db_is_configured_but_the_service_cannot_access_db( 'not allowed to directly access the database. You should run this ' 'service without the [api_database]/connection config option. The ' 'service version check will only query the local cell.') + + +class RunOnceTests(test.NoDBTestCase): + + fake_logger = mock.MagicMock() + + @utils.run_once("already ran once", fake_logger) + def dummy_test_func(self, fail=False): + if fail: + raise ValueError() + return True + + def setUp(self): + super(RunOnceTests, self).setUp() + self.dummy_test_func.reset() + RunOnceTests.fake_logger.reset_mock() + + def test_wrapped_funtions_called_once(self): + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. + result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_called_once_raises(self): + self.assertFalse(self.dummy_test_func.called) + self.assertRaises(ValueError, self.dummy_test_func, fail=True) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. 
+ result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_can_be_reset(self): + # assert we start with a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + self.dummy_test_func.reset() + # assert we restored a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + # assert that we never called the logger + RunOnceTests.fake_logger.assert_not_called() + + def test_reset_calls_cleanup(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + f.reset() + self.assertFalse(f.called) + mock_clean.assert_called_once_with() + + def test_clean_is_not_called_at_reset_if_wrapped_not_called(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + self.assertFalse(f.called) + + f.reset() + self.assertFalse(f.called) + self.assertFalse(mock_clean.called) + + def test_reset_works_even_if_cleanup_raises(self): + mock_clean = mock.Mock(side_effect=ValueError()) + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + self.assertRaises(ValueError, f.reset) + self.assertFalse(f.called) + mock_clean.assert_called_once_with() diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 9b8a6a04726..7f907336f92 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -2018,17 +2018,12 @@ def test_plug_vifs_with_port(self, mock_vatt): @mock.patch.object(ironic_driver.IronicDriver, '_plug_vifs') def test_plug_vifs(self, mock__plug_vifs): node_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' - node = _get_cached_node(id=node_id) - - self.mock_conn.get_node.return_value = node instance = fake_instance.fake_instance_obj(self.ctx, node=node_id) network_info = utils.get_test_network_info() self.driver.plug_vifs(instance, network_info) - self.mock_conn.get_node.assert_called_once_with( - node_id, fields=ironic_driver._NODE_FIELDS) - mock__plug_vifs.assert_called_once_with(node, instance, network_info) + mock__plug_vifs.assert_not_called() @mock.patch.object(FAKE_CLIENT.node, 'vif_attach') def test_plug_vifs_multiple_ports(self, mock_vatt): @@ -2162,11 +2157,17 @@ def test_unplug_vifs_no_network_info(self, mock_vdet): self.driver.unplug_vifs(instance, network_info) self.assertFalse(mock_vdet.called) - @mock.patch.object(ironic_driver.IronicDriver, 'plug_vifs') + @mock.patch.object(ironic_driver.IronicDriver, '_plug_vifs') def test_attach_interface(self, mock_pv): - self.driver.attach_interface('fake_context', 'fake_instance', + node_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' + node = _get_cached_node(uuid=node_uuid) + instance = fake_instance.fake_instance_obj(self.ctx, + node=node_uuid) + self.mock_conn.get_node.return_value = node + + self.driver.attach_interface('fake_context', instance, 'fake_image_meta', 'fake_vif') - mock_pv.assert_called_once_with('fake_instance', ['fake_vif']) + mock_pv.assert_called_once_with(node, instance, ['fake_vif']) @mock.patch.object(ironic_driver.IronicDriver, 'unplug_vifs') def test_detach_interface(self, mock_uv): @@ -2459,17 +2460,23 @@ def test_get_volume_connector_no_ip_without_mac(self): def 
test_get_volume_connector_no_ip_no_fixed_ip(self): self._test_get_volume_connector_no_ip(False, no_fixed_ip=True) - @mock.patch.object(ironic_driver.IronicDriver, 'plug_vifs') + @mock.patch.object(ironic_driver.IronicDriver, '_plug_vifs') def test_prepare_networks_before_block_device_mapping(self, mock_pvifs): + node_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' + node = _get_cached_node(uuid=node_uuid) + self.mock_conn.get_node.return_value = node instance = fake_instance.fake_instance_obj(self.ctx) network_info = utils.get_test_network_info() self.driver.prepare_networks_before_block_device_mapping(instance, network_info) - mock_pvifs.assert_called_once_with(instance, network_info) + mock_pvifs.assert_called_once_with(node, instance, network_info) - @mock.patch.object(ironic_driver.IronicDriver, 'plug_vifs') + @mock.patch.object(ironic_driver.IronicDriver, '_plug_vifs') def test_prepare_networks_before_block_device_mapping_error(self, mock_pvifs): + node_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' + node = _get_cached_node(uuid=node_uuid) + self.mock_conn.get_node.return_value = node instance = fake_instance.fake_instance_obj(self.ctx) network_info = utils.get_test_network_info() mock_pvifs.side_effect = ironic_exception.BadRequest('fake error') @@ -2477,7 +2484,7 @@ def test_prepare_networks_before_block_device_mapping_error(self, ironic_exception.BadRequest, self.driver.prepare_networks_before_block_device_mapping, instance, network_info) - mock_pvifs.assert_called_once_with(instance, network_info) + mock_pvifs.assert_called_once_with(node, instance, network_info) @mock.patch.object(ironic_driver.IronicDriver, 'unplug_vifs') def test_clean_networks_preparation(self, mock_upvifs): @@ -3441,6 +3448,9 @@ def _test__refresh_cache(self, instances, nodes, hosts, mock_instances, mock_instances.return_value = instances mock_nodes.return_value = nodes mock_hosts.side_effect = hosts + parent_mock = mock.MagicMock() + parent_mock.attach_mock(mock_nodes, 'get_node_list') + parent_mock.attach_mock(mock_instances, 'get_uuids_by_host') if not can_send_146: mock_can_send.side_effect = ( exception.IronicAPIVersionNotAvailable(version='1.46')) @@ -3453,6 +3463,15 @@ def _test__refresh_cache(self, instances, nodes, hosts, mock_instances, self.driver._refresh_cache() + # assert if get_node_list() is called before get_uuids_by_host() + parent_mock.assert_has_calls( + [ + mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS, + **kwargs), + mock.call.get_uuids_by_host(mock.ANY, self.host) + ] + ) + mock_hash_ring.assert_called_once_with(mock.ANY) mock_instances.assert_called_once_with(mock.ANY, self.host) mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS, diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 3a93eaa2a65..02459d0292a 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -347,6 +347,29 @@ + """, + "pci_0000_06_00_1": """ + + pci_0000_06_00_1 + /sys/devices/pci0000:00/0000:00:06.1 + + + i915 + + + 0 + 6 + 0 + 1 + HD Graphics P630 + Intel Corporation + + + vfio-pci + 2 + + + """, "mdev_4b20d080_1b54_4048_85b3_a6a62d165c01": """ @@ -10928,6 +10951,25 @@ def test_check_can_live_migrate_guest_cpu_none_model( '_create_shared_storage_test_file', return_value='fake') @mock.patch.object(libvirt_driver.LibvirtDriver, '_compare_cpu') + def test_check_can_live_migrate_guest_cpu_none_model_skip_compare( + self, mock_cpu, mock_test_file): + self.flags(group='workarounds', 
skip_cpu_compare_on_dest=True) + instance_ref = objects.Instance(**self.test_instance) + instance_ref.vcpu_model = test_vcpu_model.fake_vcpumodel + instance_ref.vcpu_model.model = None + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + compute_info = {'cpu_info': 'asdf', 'disk_available_least': 1} + drvr.check_can_live_migrate_destination( + self.context, instance_ref, compute_info, compute_info) + mock_cpu.assert_not_called() + + @mock.patch( + 'nova.network.neutron.API.has_port_binding_extension', + new=mock.Mock(return_value=False)) + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_create_shared_storage_test_file', + return_value='fake') + @mock.patch.object(libvirt_driver.LibvirtDriver, '_compare_cpu') def test_check_can_live_migrate_dest_numa_lm( self, mock_cpu, mock_test_file): instance_ref = objects.Instance(**self.test_instance) @@ -16288,7 +16330,48 @@ def test_hard_reboot(self, mock_get_mdev, mock_destroy, mock_get_disk_info, accel_info=accel_info) mock_create_guest_with_network.assert_called_once_with(self.context, dummyxml, instance, network_info, block_device_info, - vifs_already_plugged=True) + vifs_already_plugged=True, external_events=[]) + + @mock.patch('oslo_utils.fileutils.ensure_tree', new=mock.Mock()) + @mock.patch('nova.virt.libvirt.LibvirtDriver.get_info') + @mock.patch('nova.virt.libvirt.LibvirtDriver._create_guest_with_network') + @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml') + @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy', new=mock.Mock()) + @mock.patch( + 'nova.virt.libvirt.LibvirtDriver._get_all_assigned_mediated_devices', + new=mock.Mock(return_value={})) + def test_hard_reboot_wait_for_plug( + self, mock_get_guest_xml, mock_create_guest_with_network, mock_get_info + ): + self.flags( + group="workarounds", + wait_for_vif_plugged_event_during_hard_reboot=["normal"]) + self.context.auth_token = None + instance = objects.Instance(**self.test_instance) + network_info = _fake_network_info(self, num_networks=4) + network_info[0]["vnic_type"] = "normal" + network_info[1]["vnic_type"] = "direct" + network_info[2]["vnic_type"] = "normal" + network_info[3]["vnic_type"] = "direct-physical" + block_device_info = None + return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN), + hardware.InstanceInfo(state=power_state.RUNNING)] + mock_get_info.side_effect = return_values + mock_get_guest_xml.return_value = mock.sentinel.xml + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + drvr._hard_reboot( + self.context, instance, network_info, block_device_info) + + mock_create_guest_with_network.assert_called_once_with( + self.context, mock.sentinel.xml, instance, network_info, + block_device_info, + vifs_already_plugged=False, + external_events=[ + ('network-vif-plugged', uuids.vif1), + ('network-vif-plugged', uuids.vif3), + ] + ) @mock.patch('oslo_utils.fileutils.ensure_tree') @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall') @@ -25086,6 +25169,28 @@ def fake_nodeDeviceLookupByName(name): self.assertEqual([], drvr._get_mdev_capable_devices(types=['nvidia-12'])) + @mock.patch.object(host.Host, 'device_lookup_by_name') + def test_get_mdev_capabilities_for_dev_name_optional( + self, device_lookup_by_name): + # We use another PCI device that doesn't provide a name attribute for + # each mdev type. 
+ def fake_nodeDeviceLookupByName(name): + return FakeNodeDevice(_fake_NodeDevXml[name]) + device_lookup_by_name.side_effect = fake_nodeDeviceLookupByName + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + + expected = {"dev_id": "pci_0000_06_00_1", + "vendor_id": 0x8086, + "types": {'i915-GVTg_V5_8': {'availableInstances': 2, + 'name': None, + 'deviceAPI': 'vfio-pci'}, + } + } + self.assertEqual( + expected, + drvr._get_mdev_capabilities_for_dev("pci_0000_06_00_1")) + @mock.patch.object(host.Host, 'device_lookup_by_name') @mock.patch.object(host.Host, 'list_mediated_devices') def test_get_mediated_devices(self, list_mediated_devices, diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py index d4fba48397d..37bc6ad4a4c 100644 --- a/nova/tests/unit/virt/libvirt/test_migration.py +++ b/nova/tests/unit/virt/libvirt/test_migration.py @@ -991,7 +991,48 @@ def test_update_vif_xml_no_matching_vif(self): doc = etree.fromstring(original_xml) ex = self.assertRaises(KeyError, migration._update_vif_xml, doc, data, get_vif_config) - self.assertIn("CA:FE:DE:AD:BE:EF", six.text_type(ex)) + self.assertIn("ca:fe:de:ad:be:ef", six.text_type(ex)) + + def test_update_vif_xml_lower_case_mac(self): + """Tests that the vif in the migrate data is not found in the existing + guest interfaces. + """ + conf = vconfig.LibvirtConfigGuestInterface() + conf.net_type = "bridge" + conf.source_dev = "qbra188171c-ea" + conf.target_dev = "tapa188171c-ea" + conf.mac_addr = "DE:AD:BE:EF:CA:FE" + conf.model = "virtio" + original_xml = """ + 3de6550a-8596-4937-8046-9d862036bca5 + + + + + + + + + +
+ + + """ % uuids.ovs + expected_xml = """ + 3de6550a-8596-4937-8046-9d862036bca5 + + + + + + +
+ + + """ + self._test_update_vif_xml(conf, original_xml, expected_xml) class MigrationMonitorTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/virt/libvirt/test_vif.py b/nova/tests/unit/virt/libvirt/test_vif.py index b77d704bab6..24df5d5aa74 100644 --- a/nova/tests/unit/virt/libvirt/test_vif.py +++ b/nova/tests/unit/virt/libvirt/test_vif.py @@ -379,6 +379,10 @@ class LibvirtVifTestCase(test.NoDBTestCase): uuid='f0000000-0000-0000-0000-000000000001', project_id=723) + flavor_1vcpu = objects.Flavor(vcpus=1, memory=512, root_gb=1) + + flavor_2vcpu = objects.Flavor(vcpus=2, memory=512, root_gb=1) + bandwidth = { 'quota:vif_inbound_peak': '200', 'quota:vif_outbound_peak': '20', @@ -1063,30 +1067,50 @@ def test_tap_ethernet_vif_driver(self): @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @mock.patch('nova.privsep.linux_net.create_tap_dev') - def test_plug_tap_kvm_virtio(self, mock_create_tap_dev, mock_set_mtu, - mock_device_exists): + def test_plug_tap_kvm_virtio( + self, mock_create_tap_dev, mock_set_mtu, mock_device_exists): d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={} ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) mock_create_tap_dev.reset_mock() d2 = vif.LibvirtGenericVIFDriver() mq_ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True' } ) d2.plug(mq_ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', None, - multiqueue=True) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=True) + + @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) + @mock.patch('nova.privsep.linux_net.set_device_mtu') + @mock.patch('nova.privsep.linux_net.create_tap_dev') + def test_plug_tap_mq_ignored_1vcpu( + self, mock_create_tap_dev, mock_set_mtu, mock_device_exists): + + d1 = vif.LibvirtGenericVIFDriver() + mq_ins = objects.Instance( + id=1, uuid='f0000000-0000-0000-0000-000000000001', + image_ref=uuids.image_ref, flavor=self.flavor_1vcpu, + project_id=723, system_metadata={ + 'image_hw_vif_multiqueue_enabled': 'True', + } + ) + d1.plug(mq_ins, self.vif_tap) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @@ -1101,14 +1125,14 @@ def test_plug_tap_mq_ignored_virt_type( d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True' } ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', - None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) @mock.patch('nova.privsep.linux_net.device_exists', return_value=True) @mock.patch('nova.privsep.linux_net.set_device_mtu') @@ -1119,15 +1143,15 @@ def test_plug_tap_mq_ignored_vif_model( d1 = vif.LibvirtGenericVIFDriver() ins = objects.Instance( id=1, 
uuid='f0000000-0000-0000-0000-000000000001', + flavor=self.flavor_2vcpu, project_id=723, system_metadata={ 'image_hw_vif_multiqueue_enabled': 'True', 'image_hw_vif_model': 'e1000', } ) d1.plug(ins, self.vif_tap) - mock_create_tap_dev.assert_called_once_with('tap-xxx-yyy-zzz', - None, - multiqueue=False) + mock_create_tap_dev.assert_called_once_with( + 'tap-xxx-yyy-zzz', None, multiqueue=False) def test_unplug_tap(self): d = vif.LibvirtGenericVIFDriver() diff --git a/nova/utils.py b/nova/utils.py index 181d1f1affa..ec5e3e86dc9 100644 --- a/nova/utils.py +++ b/nova/utils.py @@ -1103,3 +1103,49 @@ def raise_if_old_compute(): scope=scope, min_service_level=current_service_version, oldest_supported_service=oldest_supported_service_level) + + +def run_once(message, logger, cleanup=None): + """This is a utility function decorator to ensure a function + is run once and only once in an interpreter instance. + + Note: this is copied from the placement repo (placement/util.py) + + The decorated function object can be reset by calling its + reset function. All exceptions raised by the wrapped function, + logger and cleanup function will be propagated to the caller. + """ + def outer_wrapper(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not wrapper.called: + # Note(sean-k-mooney): the called state is always + # updated even if the wrapped function completes + # by raising an exception. If the caller catches + # the exception it is their responsibility to call + # reset if they want to re-execute the wrapped function. + try: + return func(*args, **kwargs) + finally: + wrapper.called = True + else: + logger(message) + + wrapper.called = False + + def reset(wrapper, *args, **kwargs): + # Note(sean-k-mooney): we conditionally call the + # cleanup function if one is provided only when the + # wrapped function has been called previously. We catch + # and reraise any exception that may be raised and update + # the called state in a finally block to ensure its + # always updated if reset is called. + try: + if cleanup and wrapper.called: + return cleanup(*args, **kwargs) + finally: + wrapper.called = False + + wrapper.reset = functools.partial(reset, wrapper) + return wrapper + return outer_wrapper diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index aa00ef68395..d8140e733b7 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -788,10 +788,15 @@ def _refresh_hash_ring(self, ctxt): def _refresh_cache(self): ctxt = nova_context.get_admin_context() self._refresh_hash_ring(ctxt) - instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host) node_cache = {} def _get_node_list(**kwargs): + # NOTE(TheJulia): This call can take a substantial amount + # of time as it may be attempting to retrieve thousands of + # baremetal nodes. Depending on the version of Ironic, + # this can be as long as 2-10 seconds per every thousand + # nodes, and this call may retrieve all nodes in a deployment, + # depending on if any filter paramters are applied. return self._get_node_list(fields=_NODE_FIELDS, **kwargs) # NOTE(jroll) if partition_key is set, we need to limit nodes that @@ -815,6 +820,15 @@ def _get_node_list(**kwargs): else: nodes = _get_node_list() + # NOTE(saga): As _get_node_list() will take a long + # time to return in large clusters we need to call it before + # get_uuids_by_host() method. Otherwise the instances list we get from + # get_uuids_by_host() method will become stale. 
+ # A stale instances list can cause a node that is managed by this + # compute host to be excluded in error and cause the compute node + # to be orphaned and associated resource provider to be deleted. + instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host) + for node in nodes: # NOTE(jroll): we always manage the nodes for instances we manage if node.instance_uuid in instances: @@ -1609,13 +1623,21 @@ def _unplug_vifs(self, node, instance, network_info): def plug_vifs(self, instance, network_info): """Plug VIFs into networks. + This method is present for compatability. Any call will result + in a DEBUG log entry being generated, and will otherwise be + ignored, as Ironic manages VIF attachments through a node + lifecycle. Please see ``attach_interface``, which is the + proper and current method to utilize. + :param instance: The instance object. :param network_info: Instance network information. """ - # instance.node is the ironic node's UUID. - node = self._get_node(instance.node) - self._plug_vifs(node, instance, network_info) + LOG.debug('VIF plug called for instance %(instance)s on node ' + '%(node)s, however Ironic manages VIF attachments ' + 'for nodes.', + {'instance': instance.uuid, + 'node': instance.node}) def unplug_vifs(self, instance, network_info): """Unplug VIFs from networks. @@ -1646,7 +1668,8 @@ def attach_interface(self, context, instance, image_meta, vif): # event from neutron or by _heal_instance_info_cache periodic task. In # both cases, this is done asynchronously, so the cache may not be up # to date immediately after attachment. - self.plug_vifs(instance, [vif]) + node = self._get_node(instance.node) + self._plug_vifs(node, instance, [vif]) def detach_interface(self, context, instance, vif): """Use hotunplug to remove a network interface from a running instance. @@ -1963,7 +1986,9 @@ def prepare_networks_before_block_device_mapping(self, instance, """ try: - self.plug_vifs(instance, network_info) + node = self._get_node(instance.node) + self._plug_vifs(node, instance, network_info) + except Exception: with excutils.save_and_reraise_exception(): LOG.error("Error preparing deploy for instance " diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index fbd033690a6..9dbaefef9db 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -3383,11 +3383,32 @@ def _hard_reboot(self, context, instance, network_info, # on which vif type we're using and we are working with a stale network # info cache here, so won't rely on waiting for neutron plug events. # vifs_already_plugged=True means "do not wait for neutron plug events" + external_events = [] + vifs_already_plugged = True + event_expected_for_vnic_types = ( + CONF.workarounds.wait_for_vif_plugged_event_during_hard_reboot) + if event_expected_for_vnic_types: + # NOTE(gibi): We unplugged every vif during destroy above and we + # will replug them with _create_guest_with_network. As the + # workaround config has some vnic_types configured we expect + # vif-plugged events for every vif with those vnic_types. + # TODO(gibi): only wait for events if we know that the networking + # backend sends plug time events. For that we need to finish + # https://bugs.launchpad.net/neutron/+bug/1821058 first in Neutron + # then create a driver -> plug-time event mapping in nova. 
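To make the note above concrete, here is a small, illustrative sketch (hypothetical port IDs and plain dicts standing in for the VIF model objects; not part of the patch) of the event list that the comprehension below builds when the workaround is set to ``normal``:

    # Illustrative only: hypothetical vif IDs and vnic_types.
    wait_for = ["normal"]  # [workarounds]wait_for_vif_plugged_event_during_hard_reboot
    network_info = [
        {"id": "port-1", "vnic_type": "normal"},
        {"id": "port-2", "vnic_type": "direct"},
        {"id": "port-3", "vnic_type": "normal"},
    ]
    external_events = [
        ("network-vif-plugged", vif["id"])
        for vif in network_info
        if vif["vnic_type"] in wait_for
    ]
    # external_events == [('network-vif-plugged', 'port-1'),
    #                     ('network-vif-plugged', 'port-3')]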
+ external_events = [ + ('network-vif-plugged', vif['id']) + for vif in network_info + if vif['vnic_type'] in event_expected_for_vnic_types + ] + vifs_already_plugged = False + # NOTE(efried): The instance should already have a vtpm_secret_uuid # registered if appropriate. self._create_guest_with_network( context, xml, instance, network_info, block_device_info, - vifs_already_plugged=True) + vifs_already_plugged=vifs_already_plugged, + external_events=external_events) self._prepare_pci_devices_for_use( pci_manager.get_instance_pci_devs(instance, 'all')) @@ -7274,7 +7295,8 @@ def _get_mdev_capabilities_for_dev(self, devname, types=None): if not types or cap['type'] in types: device["types"].update({cap['type']: { 'availableInstances': cap['availableInstances'], - 'name': cap['name'], + # This attribute is optional + 'name': cap.get('name'), 'deviceAPI': cap['deviceAPI']}}) return device @@ -8579,15 +8601,16 @@ def check_can_live_migrate_destination(self, context, instance, disk_available_mb = ( (disk_available_gb * units.Ki) - CONF.reserved_host_disk_mb) - # Compare CPU - try: - if not instance.vcpu_model or not instance.vcpu_model.model: - source_cpu_info = src_compute_info['cpu_info'] - self._compare_cpu(None, source_cpu_info, instance) - else: - self._compare_cpu(instance.vcpu_model, None, instance) - except exception.InvalidCPUInfo as e: - raise exception.MigrationPreCheckError(reason=e) + if not CONF.workarounds.skip_cpu_compare_on_dest: + # Compare CPU + try: + if not instance.vcpu_model or not instance.vcpu_model.model: + source_cpu_info = src_compute_info['cpu_info'] + self._compare_cpu(None, source_cpu_info, instance) + else: + self._compare_cpu(instance.vcpu_model, None, instance) + except exception.InvalidCPUInfo as e: + raise exception.MigrationPreCheckError(reason=e) # Create file on storage, to be checked on source host filename = self._create_shared_storage_test_file(instance) diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py index 9543adda7ad..e601e46a9ba 100644 --- a/nova/virt/libvirt/migration.py +++ b/nova/virt/libvirt/migration.py @@ -344,14 +344,21 @@ def _update_vif_xml(xml_doc, migrate_data, get_vif_config): instance_uuid = xml_doc.findtext('uuid') parser = etree.XMLParser(remove_blank_text=True) interface_nodes = xml_doc.findall('./devices/interface') - migrate_vif_by_mac = {vif.source_vif['address']: vif + # MAC address stored for port in neutron DB and in domain XML + # might be in different cases, so to harmonize that + # we convert MAC to lower case for dict key. + migrate_vif_by_mac = {vif.source_vif['address'].lower(): vif for vif in migrate_data.vifs} for interface_dev in interface_nodes: mac = interface_dev.find('mac') mac = mac if mac is not None else {} mac_addr = mac.get('address') if mac_addr: - migrate_vif = migrate_vif_by_mac[mac_addr] + # MAC address stored in libvirt should always be normalized + # and stored in lower case. But just to be extra safe here + # we still normalize MAC retrieved from XML to be absolutely + # sure it will be the same with the Neutron provided one. 
+ migrate_vif = migrate_vif_by_mac[mac_addr.lower()] vif = migrate_vif.get_dest_vif() # get_vif_config is a partial function of # nova.virt.libvirt.vif.LibvirtGenericVIFDriver.get_config diff --git a/nova/virt/libvirt/vif.py b/nova/virt/libvirt/vif.py index 5c87223bafe..67e0771ad93 100644 --- a/nova/virt/libvirt/vif.py +++ b/nova/virt/libvirt/vif.py @@ -666,7 +666,8 @@ def plug_tap(self, instance, vif): vif_model = self.get_vif_model(image_meta=image_meta) # TODO(ganso): explore whether multiqueue works for other vif models # that go through this code path. - multiqueue = (self._requests_multiqueue(image_meta) and + multiqueue = (instance.get_flavor().vcpus > 1 and + self._requests_multiqueue(image_meta) and vif_model == network_model.VIF_MODEL_VIRTIO) nova.privsep.linux_net.create_tap_dev(dev, mac, multiqueue=multiqueue) network = vif.get('network') diff --git a/playbooks/legacy/nova-grenade-multinode/post.yaml b/playbooks/legacy/nova-grenade-multinode/post.yaml deleted file mode 100644 index e07f5510ae7..00000000000 --- a/playbooks/legacy/nova-grenade-multinode/post.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff --git a/playbooks/legacy/nova-grenade-multinode/run.yaml b/playbooks/legacy/nova-grenade-multinode/run.yaml deleted file mode 100644 index 18f7c753ebc..00000000000 --- a/playbooks/legacy/nova-grenade-multinode/run.yaml +++ /dev/null @@ -1,65 +0,0 @@ -- hosts: primary - name: nova-grenade-multinode - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PROJECTS="openstack/grenade $PROJECTS" - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_NEUTRON=1 - # NOTE(mriedem): Run tempest smoke tests specific to compute on the - # new side of the grenade environment. The post-test hook script will - # run non-smoke migration tests in a local/block and shared/ceph - # setup. Note that grenade hard-codes "tox -esmoke" for tempest on - # the old side so the regex is not appied there. - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_REGEX="tempest\.(api\.compute|scenario)\..*smoke.*" - export DEVSTACK_GATE_GRENADE=pullup - export DEVSTACK_GATE_USE_PYTHON3=True - # By default grenade runs only smoke tests so we need to set - # RUN_SMOKE to False in order to run live migration tests using - # grenade - export DEVSTACK_LOCAL_CONFIG="RUN_SMOKE=False" - # LIVE_MIGRATE_BACK_AND_FORTH will tell Tempest to run a live - # migration of the same instance to one compute node and then back - # to the other, which is mostly only interesting for grenade since - # we have mixed level computes. 
- export DEVSTACK_LOCAL_CONFIG+=$'\n'"LIVE_MIGRATE_BACK_AND_FORTH=True" - export BRANCH_OVERRIDE=default - export DEVSTACK_GATE_TOPOLOGY="multinode" - if [ "$BRANCH_OVERRIDE" != "default" ] ; then - export OVERRIDE_ZUUL_BRANCH=$BRANCH_OVERRIDE - fi - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff --git a/playbooks/legacy/nova-live-migration/post.yaml b/playbooks/legacy/nova-live-migration/post.yaml deleted file mode 100644 index e07f5510ae7..00000000000 --- a/playbooks/legacy/nova-live-migration/post.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff --git a/playbooks/legacy/nova-live-migration/run.yaml b/playbooks/legacy/nova-live-migration/run.yaml deleted file mode 100644 index ef8853135bb..00000000000 --- a/playbooks/legacy/nova-live-migration/run.yaml +++ /dev/null @@ -1,60 +0,0 @@ -- hosts: primary - name: nova-live-migration - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - name: Configure devstack - shell: - # Force config drive. - cmd: | - set -e - set -x - cat << 'EOF' >>"/tmp/dg-local.conf" - [[local|localrc]] - FORCE_CONFIG_DRIVE=True - - EOF - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_NOTESTS=1 - export DEVSTACK_GATE_TOPOLOGY="multinode" - export DEVSTACK_GATE_USE_PYTHON3=True - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - $BASE/new/nova/gate/test_evacuate.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff --git a/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml new file mode 100644 index 00000000000..4c6135311bb --- /dev/null +++ b/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Improved detection of anti-affinity policy violation when performing live + and cold migrations. Most of the violations caused by race conditions due + to performing concurrent live or cold migrations should now be addressed + by extra checks in the compute service. 
+    Upon detection, cold migration
+    operations are automatically rescheduled, while live migrations have two
+    checks and will be rescheduled if detected by the first one; otherwise the
+    live migration will fail cleanly and revert the instance state back to its
+    previous value.
diff --git a/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml
new file mode 100644
index 00000000000..d6e71c42224
--- /dev/null
+++ b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.
diff --git a/releasenotes/notes/bug-1939604-547c729b7741831b.yaml b/releasenotes/notes/bug-1939604-547c729b7741831b.yaml
new file mode 100644
index 00000000000..e14327c2853
--- /dev/null
+++ b/releasenotes/notes/bug-1939604-547c729b7741831b.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Addressed an issue that prevented instances with 1 vcpu using the
+    multiqueue feature from being created successfully when their vif_type is TAP.
diff --git a/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml
new file mode 100644
index 00000000000..c3686a99780
--- /dev/null
+++ b/releasenotes/notes/bug-1946729-wait-for-vif-plugged-event-during-hard-reboot-fb491f6a68370bab.yaml
@@ -0,0 +1,18 @@
+---
+issues:
+  - |
+    The libvirt virt driver in Nova implements power on and hard reboot by
+    destroying the domain first and unplugging the vifs, then recreating the
+    domain and replugging the vifs. However, nova does not wait for the
+    network-vif-plugged event before unpausing the domain. This can cause
+    the domain to start running and requesting an IP via DHCP before the
+    networking backend has finished plugging the vifs. The config option
+    [workarounds]wait_for_vif_plugged_event_during_hard_reboot has been added,
+    defaulting to an empty list, which can be used to ensure that the libvirt
+    driver waits for the network-vif-plugged event for vifs with specific
+    ``vnic_type`` before it unpauses the domain during hard reboot. This should
+    only be used if the deployment uses a networking backend that sends such
+    events for the given ``vif_type`` at vif plug time. The ml2/ovs and
+    networking-odl Neutron backends are known to send plug time events for
+    ports with the ``normal`` ``vnic_type``. For more information see
+    https://bugs.launchpad.net/nova/+bug/1946729
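As a minimal illustration of the option described in the release note above (not part of the patch itself), a deployment using the ml2/ovs backend could enable the workaround with a ``nova.conf`` entry along these lines::

    [workarounds]
    # Only list vnic_types for which the networking backend is known to
    # send plug-time events; ml2/ovs sends them for the "normal" vnic_type.
    wait_for_vif_plugged_event_during_hard_reboot = normal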
diff --git a/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml
new file mode 100644
index 00000000000..ce05b9a8670
--- /dev/null
+++ b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml
@@ -0,0 +1,19 @@
+---
+security:
+  - |
+    A vulnerability in the console proxies (novnc, serial, spice) that allowed
+    open redirection has been `patched`_. The novnc, serial, and spice console
+    proxies are implemented as websockify servers and the request handler
+    inherits from the Python standard SimpleHTTPRequestHandler. There is a
+    `known issue`_ in the SimpleHTTPRequestHandler which allows open redirects
+    by way of URLs in the following format::
+
+        http://vncproxy.my.domain.com//example.com/%2F..
+
+    which, if visited, will redirect a user to example.com.
+
+    The novnc, serial, and spice console proxies will now reject requests that
+    pass a redirection URL beginning with "//" with a 400 Bad Request.
+
+    .. _patched: https://bugs.launchpad.net/nova/+bug/1927677
+    .. _known issue: https://bugs.python.org/issue32084
diff --git a/releasenotes/notes/fix-ironic-compute-restart-port-attachments-3282e9ea051561d4.yaml b/releasenotes/notes/fix-ironic-compute-restart-port-attachments-3282e9ea051561d4.yaml
new file mode 100644
index 00000000000..2c2c9e78382
--- /dev/null
+++ b/releasenotes/notes/fix-ironic-compute-restart-port-attachments-3282e9ea051561d4.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes slow compute restart when using the ``nova.virt.ironic`` compute
+    driver where the driver was previously attempting to attach VIFs on
+    start-up via the ``plug_vifs`` driver method. This method has grown
+    otherwise unused since the introduction of the ``attach_interface``
+    method of attaching VIFs. As Ironic manages the attachment of VIFs to
+    baremetal nodes in order to align with the security requirements of a
+    physical baremetal node's lifecycle, the ironic driver now ignores calls
+    to the ``plug_vifs`` method.
diff --git a/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml b/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml
new file mode 100644
index 00000000000..46ebf0bd2d0
--- /dev/null
+++ b/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    If the compute service is down on the source node and the user tries to
+    stop the instance, the instance gets stuck at powering-off and evacuation
+    fails with the message: Cannot 'evacuate' instance while it is in
+    task_state powering-off.
+    It is now possible for evacuation to ignore the VM task state.
+    For more details see: `bug 1978983`_
+
+    .. _`bug 1978983`: https://bugs.launchpad.net/nova/+bug/1978983
\ No newline at end of file
diff --git a/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml
new file mode 100644
index 00000000000..a49d64c6ac5
--- /dev/null
+++ b/releasenotes/notes/minimize-bug-1841481-race-window-f76912d4985770ad.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    Minimizes a race condition window when using the ``ironic`` virt driver
+    where the data generated for the Resource Tracker may attempt to compare
+    potentially stale instance information with the latest known baremetal
+    node information. While this doesn't completely prevent or resolve the
+    underlying race condition identified in
+    `bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
+    this change allows Nova to have the latest state information, as opposed
+    to state information which may be out of date due to the time it may
+    take to retrieve the status from Ironic. This issue was most observable
+    on baremetal clusters with several thousand physical nodes.
diff --git a/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml b/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml
new file mode 100644
index 00000000000..e7cd4041b16
--- /dev/null
+++ b/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml
@@ -0,0 +1,24 @@
+---
+issues:
+  - |
+    Nova's use of libvirt's compareCPU() API served its purpose over the
+    years, but its design limitations break live migration in subtle
+    ways. For example, the compareCPU() API compares against the host
+    physical CPUID. Some of the features from this CPUID are not
+    exposed by KVM, and then there are some features that KVM emulates
+    that are not in the host CPUID. The latter can cause bogus live
+    migration failures.
+
+    With QEMU >= 2.9 and libvirt >= 4.4.0, libvirt will do the right
+    thing in terms of CPU compatibility checks on the destination host
+    during live migration. Nova satisfies these minimum version
+    requirements by a good margin. So, this workaround provides a way to
+    skip the CPU comparison check on the destination host before
+    migrating a guest, and let libvirt handle it correctly.
+
+    This workaround will be deprecated and removed once Nova replaces
+    the older libvirt APIs with their newer counterparts. The work is
+    being tracked via this `blueprint
+    cpu-selection-with-hypervisor-consideration`_.
+
+    .. _blueprint cpu-selection-with-hypervisor-consideration: https://blueprints.launchpad.net/nova/+spec/cpu-selection-with-hypervisor-consideration
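Similarly, a minimal, illustrative ``nova.conf`` sketch for the ``[workarounds]skip_cpu_compare_on_dest`` option exercised by the driver change and tests earlier in this patch (assuming the QEMU >= 2.9 and libvirt >= 4.4.0 requirements noted above are met) would be::

    [workarounds]
    # Skip the compareCPU() pre-check on the destination host and defer
    # CPU compatibility checking to libvirt during live migration.
    skip_cpu_compare_on_dest = True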
diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh index 5ca6ded2033..5a449c520b7 100755 --- a/tools/check-cherry-picks.sh +++ b/tools/check-cherry-picks.sh @@ -4,11 +4,6 @@ # to verify that they're all on either master or stable/ branches # -# Allow this script to be disabled by a simple env var -if [ ${DISABLE_CHERRY_PICK_CHECK:-0} -eq 1 ]; then - exit 0 -fi - commit_hash="" # Check if the patch is a merge patch by counting the number of parents.
diff --git a/tox.ini b/tox.ini index 61969cf5687..aae3708a101 100644 --- a/tox.ini +++ b/tox.ini @@ -49,8 +49,6 @@ commands = description = Run style checks. envdir = {toxworkdir}/shared -passenv = - DISABLE_CHERRY_PICK_CHECK commands = {[testenv:mypy]commands} bash tools/flake8wrap.sh {posargs} @@ -58,7 +56,6 @@ commands = bash -c "! find doc/ -type f -name *.json | xargs grep -U -n $'\r'" # Check that all included JSON files are valid JSON bash -c '! find doc/ -type f -name *.json | xargs -t -n1 python -m json.tool 2>&1 > /dev/null | grep -B1 -v ^python' - bash tools/check-cherry-picks.sh [testenv:fast8] description = @@ -67,6 +64,15 @@ envdir = {toxworkdir}/shared commands = bash tools/flake8wrap.sh -HEAD +[testenv:validate-backport] +description = + Determine whether a backport is ready to be merged by checking whether it has + already been merged to master or more recent stable branches. +deps = +skipsdist = true +commands = + bash tools/check-cherry-picks.sh + [testenv:functional] description = Run functional tests using python3. @@ -120,6 +126,16 @@ deps = {[testenv:functional]deps} commands = {[testenv:functional]commands} +[testenv:functional-without-sample-db-tests] +description = + Run functional tests by excluding the API|Notification + sample tests and DB tests. This env is used in + placement-nova-tox-functional-py38 job which is defined and + run in placement. +deps = {[testenv:functional]deps} +commands = + stestr --test-path=./nova/tests/functional run --exclude-regex '((?:api|notification)_sample_tests|functional\.db\.)' {posargs} + [testenv:api-samples] setenv = {[testenv]setenv} @@ -176,6 +192,7 @@ description = # to install (test-)requirements.txt for docs.
deps = -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/victoria} + -r{toxinidir}/requirements.txt -r{toxinidir}/doc/requirements.txt commands = rm -rf doc/build/html doc/build/doctrees @@ -329,10 +346,3 @@ usedevelop = False deps = bindep commands = bindep test - -[testenv:lower-constraints] -usedevelop = False -deps = - -c{toxinidir}/lower-constraints.txt - -r{toxinidir}/test-requirements.txt - -r{toxinidir}/requirements.txt
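For reference, the ``validate-backport`` tox environment added above simply wraps ``tools/check-cherry-picks.sh``, so a reviewer can run the same check locally with an invocation like the following (illustrative)::

    # From a nova checkout with the backport commit checked out:
    tox -e validate-backport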