From 283f2c83d4042f96ee4a055accbf7f06aa5d19ac Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Tue, 19 May 2026 13:42:43 +0300 Subject: [PATCH 1/7] [virt] drop get_clinet() clean up last virt utils from using get_client() Signed-off-by: vsibirsk --- tests/network/migration/test_migration.py | 3 +- tests/scale/test_scale_benchmark.py | 1 - tests/utils.py | 4 +- tests/virt/node/descheduler/conftest.py | 2 +- tests/virt/utils.py | 2 +- utilities/virt.py | 83 +++++++++++++++-------- 6 files changed, 58 insertions(+), 37 deletions(-) diff --git a/tests/network/migration/test_migration.py b/tests/network/migration/test_migration.py index f0cda02d8c..d440a8b55e 100644 --- a/tests/network/migration/test_migration.py +++ b/tests/network/migration/test_migration.py @@ -358,13 +358,14 @@ def test_connectivity_after_migration_and_restart( ], ) def test_migration_with_masquerade( + admin_client, running_vma, running_vmb, ip_family, ): http_port_accessible( vm=running_vma, - server_ip=running_vmb.custom_service.service_ip(ip_family=ip_family), + server_ip=running_vmb.custom_service.service_ip(admin_client=admin_client, ip_family=ip_family), server_port=running_vmb.custom_service.service_port, ) diff --git a/tests/scale/test_scale_benchmark.py b/tests/scale/test_scale_benchmark.py index 2dfd6a2ff8..814727b217 100644 --- a/tests/scale/test_scale_benchmark.py +++ b/tests/scale/test_scale_benchmark.py @@ -455,7 +455,6 @@ def test_mass_vm_live_migration( for batch in scale_vms: for vm in batch: wait_for_migration_finished( - namespace=vm.namespace, migration=vm_migration_info[vm.name][MIGRATION_INSTANCE_STR], ) verify_vm_migrated( diff --git a/tests/utils.py b/tests/utils.py index 1ab4742c0e..000b5ffb6f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -177,9 +177,7 @@ def hotplug_instance_type_vm_and_verify(vm, client, instance_type): def verify_hotplug(vm, client, sockets=None, memory_guest=None): vmim = get_created_migration_job(vm=vm, client=client) - wait_for_migration_finished( - namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN - ) + wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN) wait_for_ssh_connectivity(vm=vm) vmi_spec_domain = vm.vmi.instance.spec.domain if sockets: diff --git a/tests/virt/node/descheduler/conftest.py b/tests/virt/node/descheduler/conftest.py index ade5c543f2..8a7a5d4ed8 100644 --- a/tests/virt/node/descheduler/conftest.py +++ b/tests/virt/node/descheduler/conftest.py @@ -122,7 +122,7 @@ def deployed_vms_for_descheduler_test( def all_existing_migrations_completed(admin_client, namespace): # Descheduler may trigger multiple migrations, need to wait when all succeeded for migration in VirtualMachineInstanceMigration.get(client=admin_client, namespace=namespace): - wait_for_migration_finished(namespace=namespace.name, migration=migration, timeout=TIMEOUT_5MIN) + wait_for_migration_finished(migration=migration, timeout=TIMEOUT_5MIN) @pytest.fixture(scope="class") diff --git a/tests/virt/utils.py b/tests/virt/utils.py index 9e1d8113cb..2f845c73a8 100644 --- a/tests/virt/utils.py +++ b/tests/virt/utils.py @@ -139,7 +139,7 @@ def migrate_and_verify_multi_vms(vm_list): for vm in vm_list: migration = vms_dict[vm.name]["vm_mig"] - wait_for_migration_finished(namespace=vm.namespace, migration=migration) + wait_for_migration_finished(migration=migration) migration.clean_up() for vm in vm_list: diff --git a/utilities/virt.py b/utilities/virt.py index 9fbc66267a..dabc1627e3 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -1598,7 +1598,20 @@ def to_dict(self): if self.ip_families: self.res["spec"]["ipFamilies"] = self.ip_families - def service_ip(self, ip_family=None): + def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) -> str: + """ + Get the IP address of the service. + + Args: + admin_client (DynamicClient): admin client to be used for the service. + ip_family (str | None): IP family to be used for the service. + + Returns: + str: IP address of the service. + + Raises: + ValueError: If the service IP cannot be retrieved. + """ if self.service_type == Service.Type.CLUSTER_IP: if ip_family: cluster_ips = [ @@ -1611,10 +1624,7 @@ def service_ip(self, ip_family=None): return self.instance.spec.clusterIP - vm_node = Node( - client=get_client(), - name=self.vmi.instance.status.nodeName, - ) + vm_node = self.vm.vmi.get_node(privileged_client=admin_client) if self.service_type == Service.Type.NODE_PORT: if ip_family: internal_ips = [ @@ -1627,6 +1637,10 @@ def service_ip(self, ip_family=None): return self.target_ip or vm_node.internal_ip + raise ValueError( + f"Could not get service IP for service {self.vm.custom_service.name} with type {self.service_type}" + ) + @property def service_port(self): if self.service_type == Service.Type.CLUSTER_IP: @@ -1928,7 +1942,7 @@ def migrate_vm_and_verify( ) as migration: if not wait_for_migration_success: return migration - wait_for_migration_finished(namespace=vm.namespace, migration=migration, timeout=timeout) + wait_for_migration_finished(migration=migration, timeout=timeout) verify_vm_migrated( vm=vm, @@ -1939,7 +1953,36 @@ def migrate_vm_and_verify( return None -def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN): +def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, timeout: int = TIMEOUT_12MIN) -> None: + """ + Wait for migration to finish. + If migration is stuck in Scheduling state for more than 4 minutes, abort the migration and collect data. + + Args: + migration (VirtualMachineInstanceMigration): Migration object. + timeout (int): Maximum time to wait for the migration to finish. + + Raises: + TimeoutExpiredError: If the migration does not finish within the timeout. + """ + + def _abort_migration_wait(_migration: VirtualMachineInstanceMigration) -> None: + for pod in utilities.infra.get_pod_by_name_prefix( + client=_migration.client, pod_prefix=VIRT_LAUNCHER, namespace=_migration.namespace, get_all=True + ): + # Get status/events for PODs in non-running or failed state + if pod.status not in {Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED}: + pod_events = [ + event["raw_object"]["message"] + for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") + ] + LOGGER.error( + f"POD Conditions:\n {pod.instance.status.conditions[0]}\nPOD Events:\n {', '.join(pod_events)}" + ) + raise TimeoutExpiredError( + f"VMIM {migration.name} stuck in Scheduling state and probably will be failed" + ) + sleep = TIMEOUT_10SEC samples = TimeoutSampler(wait_timeout=timeout, sleep=sleep, func=lambda: migration.instance.status.phase) counter = 0 @@ -1948,30 +1991,12 @@ def wait_for_migration_finished(namespace, migration, timeout=TIMEOUT_12MIN): for sample in samples: if sample == migration.Status.SUCCEEDED: break - elif sample == "Scheduling": + if sample == VirtualMachineInstanceMigration.Status.SCHEDULING: counter += 1 # If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed # Need to collect data before 5 min timeout reached and target POD is removed if counter >= TIMEOUT_4MIN / sleep: - # Get status/events for PODs in non-running or failed state - for pod in utilities.infra.get_pod_by_name_prefix( - client=get_client(), - pod_prefix=VIRT_LAUNCHER, - namespace=namespace, - get_all=True, - ): - if pod.status not in (Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED): - pod_events = [ - event["raw_object"]["message"] - for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") - ] - LOGGER.error( - f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" - f"POD Events:\n {', '.join(pod_events)}" - ) - raise TimeoutExpiredError( - f"VMIM {migration.name} stuck in Scheduling state and probably will be failed" - ) + _abort_migration_wait(_migration=migration) except TimeoutExpiredError: if sample: LOGGER.error(f"Status of VMIM {migration.name} is {sample}") @@ -2331,9 +2356,7 @@ def check_migration_process_after_node_drain(client, vm, admin_client): LOGGER.info(f"The VMI was running on {source_node.name}") wait_for_node_schedulable_status(node=source_node, status=False) vmim = get_created_migration_job(vm=vm, client=client, timeout=TIMEOUT_5MIN) - wait_for_migration_finished( - namespace=vm.namespace, migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN - ) + wait_for_migration_finished(migration=vmim, timeout=TIMEOUT_30MIN if "windows" in vm.name else TIMEOUT_10MIN) target_pod = vm.vmi.get_virt_launcher_pod(privileged_client=admin_client) target_pod.wait_for_status(status=Pod.Status.RUNNING, timeout=TIMEOUT_3MIN) From 0ff391c220e4981bbec9c6e1e1c8b3fac90fdcdf Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Wed, 20 May 2026 09:42:00 +0300 Subject: [PATCH 2/7] rename inner func Signed-off-by: vsibirsk --- utilities/virt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/virt.py b/utilities/virt.py index dabc1627e3..b2130274e2 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -1966,7 +1966,7 @@ def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, time TimeoutExpiredError: If the migration does not finish within the timeout. """ - def _abort_migration_wait(_migration: VirtualMachineInstanceMigration) -> None: + def _abort_stuck_scheduling_migration(_migration: VirtualMachineInstanceMigration) -> None: for pod in utilities.infra.get_pod_by_name_prefix( client=_migration.client, pod_prefix=VIRT_LAUNCHER, namespace=_migration.namespace, get_all=True ): @@ -1996,7 +1996,7 @@ def _abort_migration_wait(_migration: VirtualMachineInstanceMigration) -> None: # If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed # Need to collect data before 5 min timeout reached and target POD is removed if counter >= TIMEOUT_4MIN / sleep: - _abort_migration_wait(_migration=migration) + _abort_stuck_scheduling_migration(_migration=migration) except TimeoutExpiredError: if sample: LOGGER.error(f"Status of VMIM {migration.name} is {sample}") From 464ef13f6e18e3f4ae3ee6af2d990bec91278744 Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Wed, 20 May 2026 09:57:31 +0300 Subject: [PATCH 3/7] move node lookup under condition Signed-off-by: vsibirsk --- utilities/virt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/virt.py b/utilities/virt.py index b2130274e2..bd86d45c4d 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -1624,8 +1624,8 @@ def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) return self.instance.spec.clusterIP - vm_node = self.vm.vmi.get_node(privileged_client=admin_client) if self.service_type == Service.Type.NODE_PORT: + vm_node = self.vm.vmi.get_node(privileged_client=admin_client) if ip_family: internal_ips = [ internal_ip From a56d55a6711ae82bd946d79a531014e94d5513b0 Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Tue, 26 May 2026 00:45:45 +0300 Subject: [PATCH 4/7] resolve pr comments Signed-off-by: vsibirsk --- utilities/exceptions.py | 10 ++++++++++ utilities/virt.py | 13 +++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/utilities/exceptions.py b/utilities/exceptions.py index f31a594ce1..326594cba1 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -128,6 +128,16 @@ class UnsupportedCPUArchitectureError(Exception): """Exception raised when a CPU architecture is not supported.""" +class MigrationStuckSchedulingError(Exception): + """Exception raised when a migration is stuck in Scheduling state for more than 4 minutes.""" + + def __init__(self, migration_name): + self.migration_name = migration_name + + def __str__(self): + return f"Migration {self.migration_name} is stuck in Scheduling state." + + def raise_multiple_exceptions(exceptions): """Raising multiple exceptions diff --git a/utilities/virt.py b/utilities/virt.py index bd86d45c4d..0cdb288215 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -94,6 +94,7 @@ Images, ) from utilities.data_collector import collect_vnc_screenshot_for_vms +from utilities.exceptions import MigrationStuckSchedulingError from utilities.hco import get_hco_namespace, wait_for_hco_conditions from utilities.network import ( cloud_init_network_data, @@ -1966,7 +1967,7 @@ def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, time TimeoutExpiredError: If the migration does not finish within the timeout. """ - def _abort_stuck_scheduling_migration(_migration: VirtualMachineInstanceMigration) -> None: + def _abort_stuck_scheduling_migration_wait(_migration: VirtualMachineInstanceMigration) -> None: for pod in utilities.infra.get_pod_by_name_prefix( client=_migration.client, pod_prefix=VIRT_LAUNCHER, namespace=_migration.namespace, get_all=True ): @@ -1977,11 +1978,11 @@ def _abort_stuck_scheduling_migration(_migration: VirtualMachineInstanceMigratio for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") ] LOGGER.error( - f"POD Conditions:\n {pod.instance.status.conditions[0]}\nPOD Events:\n {', '.join(pod_events)}" - ) - raise TimeoutExpiredError( - f"VMIM {migration.name} stuck in Scheduling state and probably will be failed" + f"POD Name: {pod.name}\n" + f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" + f"POD Events:\n {', '.join(pod_events)}" ) + raise MigrationStuckSchedulingError(migration_name=migration.name) sleep = TIMEOUT_10SEC samples = TimeoutSampler(wait_timeout=timeout, sleep=sleep, func=lambda: migration.instance.status.phase) @@ -1996,7 +1997,7 @@ def _abort_stuck_scheduling_migration(_migration: VirtualMachineInstanceMigratio # If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed # Need to collect data before 5 min timeout reached and target POD is removed if counter >= TIMEOUT_4MIN / sleep: - _abort_stuck_scheduling_migration(_migration=migration) + _abort_stuck_scheduling_migration_wait(_migration=migration) except TimeoutExpiredError: if sample: LOGGER.error(f"Status of VMIM {migration.name} is {sample}") From 748e1d637c197f9097a45ba5eb6fec71270e280f Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Tue, 26 May 2026 01:13:34 +0300 Subject: [PATCH 5/7] add unittests for new expection Signed-off-by: vsibirsk --- utilities/unittests/test_exceptions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/utilities/unittests/test_exceptions.py b/utilities/unittests/test_exceptions.py index cc92be44d9..7282c4d262 100644 --- a/utilities/unittests/test_exceptions.py +++ b/utilities/unittests/test_exceptions.py @@ -9,6 +9,7 @@ from utilities.exceptions import ( ClusterSanityError, DataVolumeConditionMessageNotFoundError, + MigrationStuckSchedulingError, MissingEnvironmentVariableError, MissingResourceException, OsDictNotFoundError, @@ -314,3 +315,20 @@ def test_unsupported_cpu_architecture_error(self): """Test UnsupportedCPUArchitectureError can be raised""" with pytest.raises(UnsupportedCPUArchitectureError): raise UnsupportedCPUArchitectureError("Test error") + + +class TestMigrationStuckSchedulingError: + """Test cases for MigrationStuckSchedulingError exception""" + + def test_migration_stuck_scheduling_error_init(self): + """Test MigrationStuckSchedulingError initialization""" + migration_name = "test-migration" + error = MigrationStuckSchedulingError(migration_name) + assert error.migration_name == migration_name + + def test_migration_stuck_scheduling_error_str(self): + """Test MigrationStuckSchedulingError string representation""" + migration_name = "test-migration" + error = MigrationStuckSchedulingError(migration_name) + expected = "Migration test-migration is stuck in Scheduling state." + assert str(error) == expected From caae5509adaf50e219bc838cabc12b2264bffa71 Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Tue, 26 May 2026 10:23:05 +0300 Subject: [PATCH 6/7] add typehint Signed-off-by: vsibirsk --- utilities/exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/exceptions.py b/utilities/exceptions.py index 326594cba1..fb82e3741c 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -131,10 +131,10 @@ class UnsupportedCPUArchitectureError(Exception): class MigrationStuckSchedulingError(Exception): """Exception raised when a migration is stuck in Scheduling state for more than 4 minutes.""" - def __init__(self, migration_name): + def __init__(self, migration_name: str) -> None: self.migration_name = migration_name - def __str__(self): + def __str__(self) -> str: return f"Migration {self.migration_name} is stuck in Scheduling state." From df635af81269324014ccf4dc3a16b46e8a37b01b Mon Sep 17 00:00:00 2001 From: vsibirsk Date: Wed, 27 May 2026 10:51:27 +0300 Subject: [PATCH 7/7] resolve pr comments move pod event collection to separate module-scope func replaced/updated raised exceptions Signed-off-by: vsibirsk --- utilities/exceptions.py | 2 +- utilities/virt.py | 53 ++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/utilities/exceptions.py b/utilities/exceptions.py index fb82e3741c..92f946a10e 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -129,7 +129,7 @@ class UnsupportedCPUArchitectureError(Exception): class MigrationStuckSchedulingError(Exception): - """Exception raised when a migration is stuck in Scheduling state for more than 4 minutes.""" + """Exception raised when a migration is stuck in Scheduling state.""" def __init__(self, migration_name: str) -> None: self.migration_name = migration_name diff --git a/utilities/virt.py b/utilities/virt.py index 0cdb288215..2028b8b57d 100644 --- a/utilities/virt.py +++ b/utilities/virt.py @@ -94,7 +94,7 @@ Images, ) from utilities.data_collector import collect_vnc_screenshot_for_vms -from utilities.exceptions import MigrationStuckSchedulingError +from utilities.exceptions import MigrationStuckSchedulingError, ResourceValueError from utilities.hco import get_hco_namespace, wait_for_hco_conditions from utilities.network import ( cloud_init_network_data, @@ -1611,7 +1611,7 @@ def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) str: IP address of the service. Raises: - ValueError: If the service IP cannot be retrieved. + ResourceValueError: If the service IP cannot be retrieved. """ if self.service_type == Service.Type.CLUSTER_IP: if ip_family: @@ -1638,7 +1638,7 @@ def service_ip(self, admin_client: DynamicClient, ip_family: str | None = None) return self.target_ip or vm_node.internal_ip - raise ValueError( + raise ResourceValueError( f"Could not get service IP for service {self.vm.custom_service.name} with type {self.service_type}" ) @@ -1957,33 +1957,17 @@ def migrate_vm_and_verify( def wait_for_migration_finished(migration: VirtualMachineInstanceMigration, timeout: int = TIMEOUT_12MIN) -> None: """ Wait for migration to finish. - If migration is stuck in Scheduling state for more than 4 minutes, abort the migration and collect data. + If migration is stuck in Scheduling state, abort the migration and collect data. Args: migration (VirtualMachineInstanceMigration): Migration object. timeout (int): Maximum time to wait for the migration to finish. Raises: + MigrationStuckSchedulingError: If the migration is stuck in Scheduling state. TimeoutExpiredError: If the migration does not finish within the timeout. """ - def _abort_stuck_scheduling_migration_wait(_migration: VirtualMachineInstanceMigration) -> None: - for pod in utilities.infra.get_pod_by_name_prefix( - client=_migration.client, pod_prefix=VIRT_LAUNCHER, namespace=_migration.namespace, get_all=True - ): - # Get status/events for PODs in non-running or failed state - if pod.status not in {Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED}: - pod_events = [ - event["raw_object"]["message"] - for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") - ] - LOGGER.error( - f"POD Name: {pod.name}\n" - f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" - f"POD Events:\n {', '.join(pod_events)}" - ) - raise MigrationStuckSchedulingError(migration_name=migration.name) - sleep = TIMEOUT_10SEC samples = TimeoutSampler(wait_timeout=timeout, sleep=sleep, func=lambda: migration.instance.status.phase) counter = 0 @@ -1997,13 +1981,38 @@ def _abort_stuck_scheduling_migration_wait(_migration: VirtualMachineInstanceMig # If migration stuck in Scheduling state for more than 4 minutes - most likely it will be failed # Need to collect data before 5 min timeout reached and target POD is removed if counter >= TIMEOUT_4MIN / sleep: - _abort_stuck_scheduling_migration_wait(_migration=migration) + log_failed_pod_events(migration=migration) + raise MigrationStuckSchedulingError(migration_name=migration.name) except TimeoutExpiredError: if sample: LOGGER.error(f"Status of VMIM {migration.name} is {sample}") raise +def log_failed_pod_events(migration: VirtualMachineInstanceMigration) -> None: + """ + Log failed pod events for a migration. + + Args: + migration (VirtualMachineInstanceMigration): Migration object. + """ + + for pod in utilities.infra.get_pod_by_name_prefix( + client=migration.client, pod_prefix=VIRT_LAUNCHER, namespace=migration.namespace, get_all=True + ): + # Get status/events for PODs in non-running or failed state + if pod.status not in {Pod.Status.RUNNING, Pod.Status.COMPLETED, Pod.Status.SUCCEEDED}: + pod_events = [ + event["raw_object"]["message"] + for event in pod.events(timeout=TIMEOUT_5SEC, field_selector="type==Warning") + ] + LOGGER.error( + f"POD Name: {pod.name}\n" + f"POD Conditions:\n {pod.instance.status.conditions[0]}\n" + f"POD Events:\n {', '.join(pod_events)}" + ) + + def verify_vm_migrated( vm, node_before,