-
Notifications
You must be signed in to change notification settings - Fork 71
net, tests, stuntime: Wait for VMI affinity reconciliation before migration #5347
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| from ocp_resources.virtual_machine import VirtualMachine | ||
| from ocp_resources.virtual_machine_instance import VirtualMachineInstance | ||
| from pytest_testconfig import config as py_config | ||
| from timeout_sampler import retry | ||
|
|
||
| from libs.net.vmspec import VMInterfaceSpecNotFoundError | ||
| from libs.vm.spec import ( | ||
|
|
@@ -143,6 +144,22 @@ def set_template_affinity(self, affinity: Affinity | None) -> None: | |
| patches = {self: {"spec": {"template": {"spec": {"affinity": template_affinity}}}}} | ||
| ResourceEditor(patches=patches).update() | ||
|
|
||
| def wait_for_vmi_affinity(self) -> None: | ||
| """Wait for the VMI to reflect the current template affinity.""" | ||
| template_affinity = self._spec.template.spec.affinity | ||
| expected_affinity = ( | ||
| asdict(obj=template_affinity, dict_factory=self._filter_out_none_values) if template_affinity else None | ||
|
Anatw marked this conversation as resolved.
|
||
| ) | ||
| self._assert_vmi_affinity(expected_affinity=expected_affinity) | ||
|
|
||
| @retry(wait_timeout=10, sleep=1) | ||
|
Anatw marked this conversation as resolved.
|
||
| def _assert_vmi_affinity(self, expected_affinity: dict[str, Any] | None) -> bool: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NIT:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 from me, let not mix up concerns. assert is about test itself, if you have to check some state in precondition/step, please use any other way/naming, not assert. |
||
| vmi_affinity = self.vmi.instance.to_dict()["spec"].get("affinity") | ||
| assert vmi_affinity == expected_affinity, ( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see thread. |
||
| f"VMI {self.name} affinity mismatch: expected {expected_affinity}, got {vmi_affinity}" | ||
| ) | ||
| return True | ||
|
|
||
| @property | ||
| def template_spec(self) -> VMISpec: | ||
| return self._spec.template.spec | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,8 @@ | |
| SERVER_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "server") | ||
| CLIENT_VM_LABEL: Final[tuple[str, str]] = (STUNTIME_LABEL_KEY, "client") | ||
| STUNTIME_THRESHOLD_SECONDS: Final[float] = 5.0 | ||
| POD_AFFINITY_TYPE: Final[str] = "podAffinity" | ||
| POD_ANTI_AFFINITY_TYPE: Final[str] = "podAntiAffinity" | ||
| STUNTIME_PING_LOG_PATH: Final[str] = "/tmp/stuntime-ping.log" | ||
| PING_INTERVAL_SECONDS: Final[float] = 0.01 | ||
| DEFAULT_COMMAND_TIMEOUT_SECONDS: Final[int] = 10 | ||
|
|
@@ -141,3 +143,29 @@ def _compute_stuntime(lost_packets: int) -> float: | |
| stuntime = 0.0 if lost_packets == 0 else (lost_packets + 1) * PING_INTERVAL_SECONDS | ||
| LOGGER.info(f"Stuntime: {stuntime:.2f}s (from {lost_packets} lost packets)") | ||
| return stuntime | ||
|
|
||
|
|
||
| def assert_affinity_after_migration(vm: BaseVirtualMachine, expected_type: str) -> None: | ||
|
Anatw marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if this is not a test itself, rename the helper and don't use assert. I bet we don't test affinity after migration, hence it does not deserve assertion. Otherwise, it is a mess.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is now redundant - removed entirely. |
||
| """Verify the migration target pod has the correct affinity after migration. | ||
|
|
||
| Detects the virt-controller race (CNV-90576) where the migration target pod is | ||
| created from a stale VMI snapshot, resulting in incorrect affinity on the pod. | ||
|
|
||
| Args: | ||
| vm: The VM that was migrated. | ||
| expected_type: The affinity type expected on the pod ("podAffinity" or "podAntiAffinity"). | ||
| """ | ||
| pod_affinity = vm.vmi.virt_launcher_pod.instance.to_dict()["spec"].get("affinity") | ||
| stale_type = POD_ANTI_AFFINITY_TYPE if expected_type == POD_AFFINITY_TYPE else POD_AFFINITY_TYPE | ||
| assert expected_type in pod_affinity, ( | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| f"POD ({vm.vmi.virt_launcher_pod.name}) missing {expected_type}: {pod_affinity}" | ||
| ) | ||
| if stale_type in pod_affinity: | ||
| stale_rules = pod_affinity[stale_type].get("requiredDuringSchedulingIgnoredDuringExecution", []) | ||
| match_expressions = [ | ||
| expr for rule in stale_rules for expr in rule.get("labelSelector", {}).get("matchExpressions", []) | ||
| ] | ||
| has_stuntime_rules = any(expr.get("key", "").startswith("stuntime.") for expr in match_expressions) | ||
| assert not has_stuntime_rules, ( | ||
| f"POD ({vm.vmi.virt_launcher_pod.name}) has stale stuntime {stale_type}: {pod_affinity[stale_type]}" | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,10 +21,14 @@ | |
| from libs.vm.affinity import new_pod_affinity, new_pod_anti_affinity | ||
| from tests.network.libs.stuntime import ( | ||
| CLIENT_VM_LABEL, | ||
| POD_AFFINITY_TYPE, | ||
| POD_ANTI_AFFINITY_TYPE, | ||
| SERVER_VM_LABEL, | ||
| STUNTIME_THRESHOLD_SECONDS, | ||
| assert_affinity_after_migration, | ||
| measure_stuntime, | ||
| ) | ||
| from utilities.jira import is_jira_open | ||
| from utilities.virt import migrate_vm_and_verify | ||
|
|
||
| pytestmark = [pytest.mark.tier3] | ||
|
|
@@ -76,7 +80,10 @@ def test_client_migrates_off_server_node(self, admin_client, ip_family, localnet | |
| - Measured stuntime does not exceed the global threshold. | ||
| """ | ||
| localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=SERVER_VM_LABEL)) | ||
| localnet_stuntime_client_vm.wait_for_vmi_affinity() | ||
| migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client) | ||
| if is_jira_open(jira_id="CNV-90576"): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. btw don't we need to check affinity even without this bug? I mean, perhaps the direction is right regardless the bug -> checking jira is not needed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Firstly - the bug is an automation bug, so the product bug this code is dependent on is not relevant anymore. To your point - I was conflicting about this too, but I think that we shouldn't test this regardless - we expect the product to behave as it should - for example, if we set an NNCP and it is reported as being configured successfully - we assume it is not lying - we don't check the node to make sure the bridge is actually there. |
||
| assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_ANTI_AFFINITY_TYPE) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| measured_stuntime = measure_stuntime(active_ping=active_ping) | ||
| assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, ( | ||
| f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)" | ||
|
|
@@ -133,7 +140,10 @@ def test_client_migrates_to_server_node(self, admin_client, ip_family, localnet_ | |
| - Measured stuntime does not exceed the global threshold. | ||
| """ | ||
| localnet_stuntime_client_vm.set_template_affinity(affinity=new_pod_affinity(label=SERVER_VM_LABEL)) | ||
| localnet_stuntime_client_vm.wait_for_vmi_affinity() | ||
| migrate_vm_and_verify(vm=localnet_stuntime_client_vm, client=admin_client) | ||
| if is_jira_open(jira_id="CNV-90576"): | ||
| assert_affinity_after_migration(vm=localnet_stuntime_client_vm, expected_type=POD_AFFINITY_TYPE) | ||
| measured_stuntime = measure_stuntime(active_ping=active_ping) | ||
| assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, ( | ||
| f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)" | ||
|
|
@@ -161,7 +171,10 @@ def test_server_migrates_off_client_node(self, admin_client, ip_family, localnet | |
| - Measured stuntime does not exceed the global threshold. | ||
| """ | ||
| localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL)) | ||
| localnet_stuntime_server_vm.wait_for_vmi_affinity() | ||
| migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client) | ||
| if is_jira_open(jira_id="CNV-90576"): | ||
| assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_ANTI_AFFINITY_TYPE) | ||
| measured_stuntime = measure_stuntime(active_ping=active_ping) | ||
| assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, ( | ||
| f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)" | ||
|
|
@@ -190,7 +203,6 @@ def test_server_migrates_between_non_client_nodes( | |
| Expected: | ||
| - Measured stuntime does not exceed the global threshold. | ||
| """ | ||
| localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_anti_affinity(label=CLIENT_VM_LABEL)) | ||
| migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client) | ||
| measured_stuntime = measure_stuntime(active_ping=active_ping) | ||
| assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, ( | ||
|
|
@@ -219,7 +231,10 @@ def test_server_migrates_to_client_node(self, admin_client, ip_family, localnet_ | |
| - Measured stuntime does not exceed the global threshold. | ||
| """ | ||
| localnet_stuntime_server_vm.set_template_affinity(affinity=new_pod_affinity(label=CLIENT_VM_LABEL)) | ||
| localnet_stuntime_server_vm.wait_for_vmi_affinity() | ||
| migrate_vm_and_verify(vm=localnet_stuntime_server_vm, client=admin_client) | ||
| if is_jira_open(jira_id="CNV-90576"): | ||
| assert_affinity_after_migration(vm=localnet_stuntime_server_vm, expected_type=POD_AFFINITY_TYPE) | ||
| measured_stuntime = measure_stuntime(active_ping=active_ping) | ||
| assert measured_stuntime <= STUNTIME_THRESHOLD_SECONDS, ( | ||
| f"Stuntime {measured_stuntime}s exceeds threshold ({STUNTIME_THRESHOLD_SECONDS}s)" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
per my understanding, the API contract of "set affinity" should mean "affinity is applied when I return." Hence the wait operation should be a part if
set_template_affinity. Otherwise, every caller must remember the wait, and future callers may forget. Consider integrating the reconciliation wait intoset_template_affinityitself.