From 829a7c950332610e92e91ceb11579d03e55a7692 Mon Sep 17 00:00:00 2001 From: Ohad Date: Wed, 27 May 2026 17:40:24 +0300 Subject: [PATCH 1/2] iuo, operator: Add request timeout to ClusterOperator API calls During T3 crypto policy tests, changing the APIServer TLS profile triggers a kube-apiserver rolling restart. The ClusterOperator.get() call in get_cluster_operator_status_conditions had no client-side HTTP timeout, causing it to hang indefinitely when the API accepts TCP connections but never responds. Since TimeoutSampler can only check its wall-clock timeout between iterations, a hanging func() means the 60-minute timeout in wait_for_cluster_operator_stabilize never fires. Add a 30-second _request_timeout to ClusterOperator.get() so the HTTP client raises a timeout exception instead of blocking forever, allowing TimeoutSampler to catch it and retry. assisted by: claude code claude-opus-4-6 Signed-off-by: Ohad --- utilities/operator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utilities/operator.py b/utilities/operator.py index 446a10fbd8..ca308273de 100644 --- a/utilities/operator.py +++ b/utilities/operator.py @@ -31,6 +31,7 @@ TIMEOUT_10SEC, TIMEOUT_15MIN, TIMEOUT_20MIN, + TIMEOUT_30SEC, TIMEOUT_75MIN, ) from utilities.data_collector import collect_ocp_must_gather @@ -558,10 +559,10 @@ def cluster_with_icsp(): return len(icsp_list) > 0 -def get_cluster_operator_status_conditions(admin_client, operator_conditions=None): +def get_cluster_operator_status_conditions(admin_client, operator_conditions=None, request_timeout=TIMEOUT_30SEC): operator_conditions = operator_conditions or DEFAULT_RESOURCE_CONDITIONS cluster_operator_status = {} - for cluster_operator in list(ClusterOperator.get(client=admin_client)): + for cluster_operator in list(ClusterOperator.get(client=admin_client, _request_timeout=request_timeout)): operator_name = cluster_operator.name cluster_operator_status[operator_name] = {} for condition in cluster_operator.instance.get("status", {}).get("conditions", []): From fef672e1721347147536031b12474dabee8b4f3e Mon Sep 17 00:00:00 2001 From: Ohad Date: Thu, 28 May 2026 15:36:47 +0300 Subject: [PATCH 2/2] iuo, crypto_policy: Harden teardown against transient API and webhook failures Add exception handling to wait_for_cluster_operator_stabilize so transient ApiException errors (including request timeouts) are retried instead of crashing the polling loop during TLS reconfiguration. Add HCO webhook readiness check after APIServer TLS restoration in update_apiserver_crypto_policy. The conversion webhook can briefly lose endpoints after cluster operators report stable, causing subsequent HCO modifications to fail with 500 errors. assisted by: claude code claude-opus-4-6 Signed-off-by: Ohad --- .../crypto_policy/utils.py | 27 +++++++++++++++++++ utilities/operator.py | 2 ++ 2 files changed, 29 insertions(+) diff --git a/tests/install_upgrade_operators/crypto_policy/utils.py b/tests/install_upgrade_operators/crypto_policy/utils.py index 018feb5088..8f0197768a 100644 --- a/tests/install_upgrade_operators/crypto_policy/utils.py +++ b/tests/install_upgrade_operators/crypto_policy/utils.py @@ -289,6 +289,33 @@ def update_apiserver_crypto_policy( hco_namespace=hco_namespace, list_dependent_crs_to_check=MANAGED_CRS_LIST, ) + _wait_for_hco_webhook_ready(admin_client=admin_client, hco_namespace=hco_namespace) + + +def _get_hco_resources(admin_client: DynamicClient, namespace_name: str) -> list: + return list(HyperConverged.get(client=admin_client, namespace=namespace_name)) + + +def _wait_for_hco_webhook_ready(admin_client: DynamicClient, hco_namespace: Resource) -> None: + """Waits for the HCO webhook service to become reachable. + + After APIServer TLS changes, the conversion webhook may briefly lose endpoints + even after cluster operators report stable. Reading the HyperConverged resource + exercises the conversion webhook, confirming it is functional before subsequent + HCO modifications. + """ + sampler = TimeoutSampler( + wait_timeout=TIMEOUT_2MIN, + sleep=10, + func=_get_hco_resources, + exceptions_dict={ApiException: []}, + admin_client=admin_client, + namespace_name=hco_namespace.name, + ) + for sample in sampler: + if sample: + LOGGER.info("HCO webhook service is ready.") + return def check_service_accepts_tls_version(utility_pods: list, node: Node, service: Resource, tls_version: str) -> bool: diff --git a/utilities/operator.py b/utilities/operator.py index ca308273de..0ab8c467c5 100644 --- a/utilities/operator.py +++ b/utilities/operator.py @@ -5,6 +5,7 @@ from datetime import datetime from pprint import pformat +from kubernetes.client.exceptions import ApiException from kubernetes.dynamic import DynamicClient from kubernetes.dynamic.exceptions import ResourceNotFoundError from ocp_resources.catalog_source import CatalogSource @@ -599,6 +600,7 @@ def wait_for_cluster_operator_stabilize(admin_client, wait_timeout=TIMEOUT_20MIN wait_timeout=wait_timeout, sleep=10, func=get_failed_cluster_operator, + exceptions_dict={ApiException: []}, admin_client=admin_client, ) consecutive_check = 0