Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions libs/vm/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ocp_resources.virtual_machine import VirtualMachine
from ocp_resources.virtual_machine_instance import VirtualMachineInstance
from pytest_testconfig import config as py_config
from timeout_sampler import retry

from libs.net.vmspec import VMInterfaceSpecNotFoundError
from libs.vm.spec import (
Expand Down Expand Up @@ -143,6 +144,22 @@ def set_template_affinity(self, affinity: Affinity | None) -> None:
patches = {self: {"spec": {"template": {"spec": {"affinity": template_affinity}}}}}
ResourceEditor(patches=patches).update()

def wait_for_vmi_affinity(self) -> None:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

per my understanding, the API contract of "set affinity" should mean "affinity is applied when I return." Hence the wait operation should be a part if set_template_affinity. Otherwise, every caller must remember the wait, and future callers may forget. Consider integrating the reconciliation wait into set_template_affinity itself.

"""Wait for the VMI to reflect the current template affinity."""
template_affinity = self._spec.template.spec.affinity
expected_affinity = (
asdict(obj=template_affinity, dict_factory=self._filter_out_none_values) if template_affinity else None
Comment thread
Anatw marked this conversation as resolved.
)
self._assert_vmi_affinity(expected_affinity=expected_affinity)

@retry(wait_timeout=10, sleep=1)
Comment thread
Anatw marked this conversation as resolved.
def _assert_vmi_affinity(self, expected_affinity: dict[str, Any] | None) -> bool:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT:
Since we use assert for tests, consider renaming to expected_vmi_affinity

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 from me, let not mix up concerns. assert is about test itself, if you have to check some state in precondition/step, please use any other way/naming, not assert.

vmi_affinity = self.vmi.instance.to_dict()["spec"].get("affinity")
assert vmi_affinity == expected_affinity, (

@servolkov servolkov Jun 30, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see thread. assert is a test concern, not library. Raise a clear exception instead.

f"VMI {self.name} affinity mismatch: expected {expected_affinity}, got {vmi_affinity}"
)
return True

@property
def template_spec(self) -> VMISpec:
return self._spec.template.spec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,16 @@
import pytest

from libs.vm.affinity import new_pod_affinity, new_pod_anti_affinity
from tests.network.libs.stuntime import CLIENT_VM_LABEL, SERVER_VM_LABEL, STUNTIME_THRESHOLD_SECONDS, measure_stuntime
from tests.network.libs.stuntime import (
CLIENT_VM_LABEL,
POD_AFFINITY_TYPE,
POD_ANTI_AFFINITY_TYPE,
SERVER_VM_LABEL,
STUNTIME_THRESHOLD_SECONDS,
assert_affinity_after_migration,
measure_stuntime,
)
from utilities.jira import is_jira_open
from utilities.virt import migrate_vm_and_verify

pytestmark = [pytest.mark.tier3]
Expand Down Expand Up @@ -71,7 +80,10 @@ def test_client_migrates_off_server_node(
- Measured stuntime does not exceed the global threshold.
"""
stuntime_client_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=SERVER_VM_LABEL))
stuntime_client_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=stuntime_client_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=stuntime_client_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -130,7 +142,10 @@ def test_client_migrates_to_server_node(
- Measured stuntime does not exceed the global threshold.
"""
stuntime_client_vm.set_template_affinity(affinity=new_pod_affinity(label=SERVER_VM_LABEL))
stuntime_client_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=stuntime_client_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=stuntime_client_vm, expected_type=POD_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -160,7 +175,10 @@ def test_server_migrates_off_client_node(
- Measured stuntime does not exceed the global threshold.
"""
stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
stuntime_server_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=stuntime_server_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=stuntime_server_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -189,7 +207,6 @@ def test_server_migrates_between_non_client_nodes(
Expected:
- Measured stuntime does not exceed the global threshold.
"""
stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
migrate_vm_and_verify(vm=stuntime_server_vm, client=admin_client)
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
Expand Down Expand Up @@ -220,7 +237,10 @@ def test_server_migrates_to_client_node(
- Measured stuntime does not exceed the global threshold.
"""
stuntime_server_vm.set_template_affinity(affinity=new_pod_affinity(label=CLIENT_VM_LABEL))
stuntime_server_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=stuntime_server_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=stuntime_server_vm, expected_type=POD_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=l2_bridge_active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down
28 changes: 28 additions & 0 deletions tests/network/libs/stuntime.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
SERVER_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "server")
CLIENT_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "client")
STUNTIME_THRESHOLD_SECONDS: Final[float] = 5.0
POD_AFFINITY_TYPE: Final[str] = "podAffinity"
POD_ANTI_AFFINITY_TYPE: Final[str] = "podAntiAffinity"
STUNTIME_PING_LOG_PATH: Final[str] = "/tmp/stuntime-ping.log"
PING_INTERVAL_SECONDS: Final[float] = 0.01
DEFAULT_COMMAND_TIMEOUT_SECONDS: Final[int] = 10
Expand Down Expand Up @@ -141,3 +143,29 @@ def _compute_stuntime(lost_packets: int) -> float:
stuntime = 0.0 if lost_packets == 0 else (lost_packets + 1) * PING_INTERVAL_SECONDS
LOGGER.info(f"Stuntime: {stuntime:.2f}s (from {lost_packets} lost packets)")
return stuntime


def assert_affinity_after_migration(vm: BaseVirtualMachine, expected_type: str) -> None:
Comment thread
Anatw marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this is not a test itself, rename the helper and don't use assert. I bet we don't test affinity after migration, hence it does not deserve assertion. Otherwise, it is a mess.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is now redundant - removed entirely.

"""Verify the migration target pod has the correct affinity after migration.

Detects the virt-controller race (CNV-90576) where the migration target pod is
created from a stale VMI snapshot, resulting in incorrect affinity on the pod.

Args:
vm: The VM that was migrated.
expected_type: The affinity type expected on the pod ("podAffinity" or "podAntiAffinity").
"""
pod_affinity = vm.vmi.virt_launcher_pod.instance.to_dict()["spec"].get("affinity")
stale_type = POD_ANTI_AFFINITY_TYPE if expected_type == POD_AFFINITY_TYPE else POD_AFFINITY_TYPE
assert expected_type in pod_affinity, (
Comment thread
coderabbitai[bot] marked this conversation as resolved.
f"POD ({vm.vmi.virt_launcher_pod.name}) missing {expected_type}: {pod_affinity}"
)
if stale_type in pod_affinity:
stale_rules = pod_affinity[stale_type].get("requiredDuringSchedulingIgnoredDuringExecution", [])
match_expressions = [
expr for rule in stale_rules for expr in rule.get("labelSelector", {}).get("matchExpressions", [])
]
has_stuntime_rules = any(expr.get("key", "").startswith("stuntime.") for expr in match_expressions)
assert not has_stuntime_rules, (
f"POD ({vm.vmi.virt_launcher_pod.name}) has stale stuntime {stale_type}: {pod_affinity[stale_type]}"
)
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@
from libs.vm.affinity import new_pod_affinity, new_pod_anti_affinity
from tests.network.libs.stuntime import (
CLIENT_VM_LABEL,
POD_AFFINITY_TYPE,
POD_ANTI_AFFINITY_TYPE,
SERVER_VM_LABEL,
STUNTIME_THRESHOLD_SECONDS,
assert_affinity_after_migration,
measure_stuntime,
)
from utilities.jira import is_jira_open
from utilities.virt import migrate_vm_and_verify

pytestmark = [pytest.mark.tier3]
Expand Down Expand Up @@ -76,7 +80,10 @@ def test_client_migrates_off_server_node(self, admin_client, ip_family, localnet
- Measured stuntime does not exceed the global threshold.
"""
localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=SERVER_VM_LABEL))
localnet_stuntime_client_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw don't we need to check affinity even without this bug? I mean, perhaps the direction is right regardless the bug -> checking jira is not needed?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Firstly - the bug is an automation bug, so the product bug this code is dependent on is not relevant anymore.

To your point - I was conflicting about this too, but I think that we shouldn't test this regardless - we expect the product to behave as it should - for example, if we set an NNCP and it is reported as being configured successfully - we assume it is not lying - we don't check the node to make sure the bridge is actually there.
But I'm not 100% convinced, I'm curious on @EdDev take on this.

assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
measured_stuntime = measure_stuntime(active_ping=active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -133,7 +140,10 @@ def test_client_migrates_to_server_node(self, admin_client, ip_family, localnet_
- Measured stuntime does not exceed the global threshold.
"""
localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_affinity(label=SERVER_VM_LABEL))
localnet_stuntime_client_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -161,7 +171,10 @@ def test_server_migrates_off_client_node(self, admin_client, ip_family, localnet
- Measured stuntime does not exceed the global threshold.
"""
localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
localnet_stuntime_server_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_ANTI_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down Expand Up @@ -190,7 +203,6 @@ def test_server_migrates_between_non_client_nodes(
Expected:
- Measured stuntime does not exceed the global threshold.
"""
localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL))
migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client)
measured_stuntime = measure_stuntime(active_ping=active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
Expand Down Expand Up @@ -219,7 +231,10 @@ def test_server_migrates_to_client_node(self, admin_client, ip_family, localnet_
- Measured stuntime does not exceed the global threshold.
"""
localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_affinity(label=CLIENT_VM_LABEL))
localnet_stuntime_server_vm.wait_for_vmi_affinity()
migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client)
if is_jira_open(jira_id="CNV-90576"):
assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_AFFINITY_TYPE)
measured_stuntime = measure_stuntime(active_ping=active_ping)
assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, (
f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)"
Expand Down