From 85b2aec5b7310ab87b9de4398c08bacf872b89df Mon Sep 17 00:00:00 2001 From: "doron88@gmail.com" Date: Sat, 20 Jun 2026 18:07:19 +0300 Subject: [PATCH] tcp: register tcp.rcv_wnd_max and tcp.snd_mss_max as runtime-tunable sysctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two TCP policy values were effectively hard-coded, forcing applications that embed PyTCP to monkeypatch internals to tune throughput on fast or asymmetric paths: * The advertised receive-window ceiling was fixed at 65535 bytes (WindowState.rcv_wnd_max). A bulk inbound transfer is bound by window / RTT, so on a high bandwidth-delay-product path (fast link, tunnel) 64 KiB throttles the peer far below the link rate even though PyTCP already negotiates RFC 7323 window scaling. * The send-side MSS always tracked the egress interface MTU, with no way to emit smaller segments while still advertising a large receive MSS. Overlay/tunnel deployments whose host->peer path MTU is below the local interface MTU need exactly that asymmetry (and classical PMTUD cannot discover the smaller hop when it sits past a relay that drops ICMP PTB). Expose both as registered sysctls, consistent with the existing 'tcp.base_mss' / 'tcp.mtu_probing' knobs: * 'tcp.rcv_wnd_max' (flat; Linux net.ipv4.tcp_rmem parity) — default 65535, seeded into WindowState.rcv_wnd_max at session creation, so behaviour is unchanged until an operator raises it. * 'tcp.snd_mss_max' (per-interface, like 'tcp.base_mss') — default 0 (uncapped); a non-zero value clamps _mss_ceiling() last, bounding the segments we EMIT without lowering the advertised receive MSS. Floor 88 (Linux TCP_MIN_MSS); 0 reserved for "off". Both ride the 'sysctls={...}' bag in stack.init(); neither warrants an explicit kwarg yet. 
Tests at tests/integration/protocols/tcp/test__tcp__sysctls.py pin registration, defaults, validator rejection (rcv_wnd_max != 0; snd_mss_max 0-or->=88) and the per-interface storage semantics; test__tcp__session__throughput_knobs.py pins the session behaviour (rcv_wnd_max seeds the window; snd_mss_max caps _mss_ceiling while leaving rcv_mss at the interface ceiling). Reference: Linux net.ipv4.tcp_rmem (receive-window max). Reference: Linux include/net/tcp.h TCP_MIN_MSS=88. --- .../protocols/tcp/session/tcp__session.py | 28 ++- .../pytcp/protocols/tcp/tcp__constants.py | 67 ++++++ .../test__tcp__session__throughput_knobs.py | 207 ++++++++++++++++++ .../protocols/tcp/test__tcp__sysctls.py | 140 +++++++++++- 4 files changed, 436 insertions(+), 6 deletions(-) create mode 100644 packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__session__throughput_knobs.py diff --git a/packages/pytcp/pytcp/protocols/tcp/session/tcp__session.py b/packages/pytcp/pytcp/protocols/tcp/session/tcp__session.py index 730f7c04d..5dac0b5a9 100644 --- a/packages/pytcp/pytcp/protocols/tcp/session/tcp__session.py +++ b/packages/pytcp/pytcp/protocols/tcp/session/tcp__session.py @@ -166,6 +166,10 @@ def __init__( # See 'state/tcp__state__window.py'. self._win: WindowState = WindowState() self._win.rcv_mss = self._egress_interface_mtu() - self._ip_tcp_overhead + # Linux 'net.ipv4.tcp_rmem' (max) — operator ceiling on the receive + # window this session advertises. Seeded once at creation; the default + # preserves the historical 65535-byte cap. + self._win.rcv_wnd_max = tcp__constants.TCP__RCV_WND_MAX # RFC 4821 / RFC 8899 per-session PLPMTUD adapter. # Wraps a PmtuSearch engine bound to the remote @@ -631,18 +635,32 @@ def _mss_ceiling(self) -> int: or below the link ceiling so probes have somewhere to climb to. 
+ Finally the 'tcp.snd_mss_max' operator cap (0 = uncapped) + is applied last so it bounds the send-side MSS regardless of + probing state, WITHOUT touching 'rcv_mss' — the advertised + receive MSS stays at the interface ceiling so a large MTU can + still invite large inbound segments while output stays small. + Reference: RFC 4821 §3 (Probing without ICMP). Reference: Linux 'tcp_mtu_probing=2' MSS-ceiling semantics. """ iface_ceiling = self._egress_interface_mtu() - self._ip_tcp_overhead - if not self._plpmtud_probing_enabled: - return iface_ceiling - base_mss: int = sysctl_iface.get_for_iface( - "tcp.base_mss", + if self._plpmtud_probing_enabled: + base_mss: int = sysctl_iface.get_for_iface( + "tcp.base_mss", + self._egress_interface_name(), + ) + ceiling = min(base_mss - self._ip_tcp_overhead, iface_ceiling) + else: + ceiling = iface_ceiling + snd_mss_max: int = sysctl_iface.get_for_iface( + "tcp.snd_mss_max", self._egress_interface_name(), ) - return min(base_mss - self._ip_tcp_overhead, iface_ceiling) + if snd_mss_max: + ceiling = min(ceiling, snd_mss_max) + return ceiling def _arm_timer(self, name: str, delay_ms: int, /) -> None: """ diff --git a/packages/pytcp/pytcp/protocols/tcp/tcp__constants.py b/packages/pytcp/pytcp/protocols/tcp/tcp__constants.py index 15620e898..6fa65b5d4 100644 --- a/packages/pytcp/pytcp/protocols/tcp/tcp__constants.py +++ b/packages/pytcp/pytcp/protocols/tcp/tcp__constants.py @@ -112,6 +112,15 @@ # arithmetic-friendly. TCP__TS_RECENT__OUTDATED_THRESHOLD_MS = 24 * 86_400 * 1_000 +# Linux 'net.ipv4.tcp_rmem' (max slot) — the ceiling on the receive window a +# session will advertise. A bulk inbound transfer is bound by window / RTT, so +# the historical 65535-byte default throttles high bandwidth-delay-product paths +# (fast links, tunnels) far below the link rate; raising it lets the peer keep a +# full BDP in flight. PyTCP negotiates RFC 7323 window scaling (WSCALE 7), so the +# advertised value can represent well beyond 64 KiB. 
Kept at 65535 by default to +# preserve historical behaviour; operators raise it per the deployment's BDP. +TCP__RCV_WND_MAX = 65535 + # Per-interface conf-plane policy storage. 'dict[str, int]' keyed by # interface name with a mandatory '"default"' template slot — the # operator addresses a specific interface ('tcp..') or @@ -151,6 +160,18 @@ # block. TCP__MTU_PROBING: dict[str, int] = {"default": 0} +# Ceiling on the send-side MSS — the largest segment this stack will EMIT — +# applied independently of the receive MSS advertised to the peer. 0 (default) +# means uncapped: 'snd_mss' rises to 'interface_mtu - overhead' as today. A +# non-zero value caps the segments we send WITHOUT lowering the MSS option we +# advertise, so a large interface MTU can still invite large inbound segments +# (fast download) while host->peer output stays small. The motivating case is an +# overlay/tunnel whose host->peer path MTU is smaller than the local interface +# MTU; classical PMTUD cannot discover that when the small hop is past a relay +# that does not emit ICMP PTB. Per-interface like 'tcp.base_mss'; floor 88 +# (Linux 'TCP_MIN_MSS') matches the base-MSS knob, with 0 reserved for "off". +TCP__SND_MSS_MAX: dict[str, int] = {"default": 0} + # Sysctl registration. Every constant above is a policy knob, # operator-tunable at boot via 'stack.init(sysctls={"tcp....": ...})' @@ -208,6 +229,26 @@ def validator(value: Any) -> None: return validator +def _is_zero_or_int_at_least(name: str, *, low: int) -> Any: + """ + Build a validator that accepts 0 (a documented "disabled" sentinel) + or any integer ≥ 'low' — used for opt-in cap knobs whose floor matches + a hard limit (e.g. 'tcp.snd_mss_max' off-or-≥-TCP_MIN_MSS). + """ + + def validator(value: Any) -> None: + """ + Raise 'ValueError' unless 'value' is 0 or an int ≥ low. 
+ """ + + if isinstance(value, bool) or not isinstance(value, int) or (value != 0 and value < low): + raise ValueError( + f"sysctl {name!r} must be 0 (disabled) or an int ≥ {low}; got {value!r}", + ) + + return validator + + register( key="tcp.rto.initial_ms", module_name=__name__, @@ -288,6 +329,18 @@ def validator(value: Any) -> None: validator=is_positive_int("tcp.ts_recent.outdated_threshold_ms"), description="RFC 7323 §5.5 outdated-timestamps threshold in milliseconds (~24 days).", ) +register( + key="tcp.rcv_wnd_max", + module_name=__name__, + attr="TCP__RCV_WND_MAX", + default=TCP__RCV_WND_MAX, + validator=is_positive_int("tcp.rcv_wnd_max"), + description=( + "Linux 'net.ipv4.tcp_rmem' (max) — ceiling on the advertised " + "receive window (default 65535). Raise for high bandwidth-delay-" + "product paths; WSCALE lets it exceed 64 KiB on the wire." + ), +) register( key="tcp.base_mss", module_name=__name__, @@ -338,6 +391,20 @@ def _tcp_mtu_probing_validator(value: object) -> None: ), interface_scope=True, ) +register( + key="tcp.snd_mss_max", + module_name=__name__, + attr="TCP__SND_MSS_MAX", + default=TCP__SND_MSS_MAX["default"], + validator=_is_zero_or_int_at_least("tcp.snd_mss_max", low=88), + description=( + "Ceiling on the send-side MSS (largest segment emitted), applied " + "independently of the advertised receive MSS. 0=uncapped (default); " + "a non-zero value bounds host->peer output for tunnels whose path " + "MTU is below the interface MTU (floor 88 = Linux TCP_MIN_MSS)." 
+ ), interface_scope=True, +) def _finalize__persist_max_ge_rto_initial() -> None: diff --git a/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__session__throughput_knobs.py b/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__session__throughput_knobs.py new file mode 100644 index 000000000..09236ec6b --- /dev/null +++ b/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__session__throughput_knobs.py @@ -0,0 +1,207 @@ +################################################################################ +## ## +## PyTCP - Python TCP/IP stack ## +## Copyright (C) 2020-present Sebastian Majewski ## +## ## +## This program is free software: you can redistribute it and/or modify ## +## it under the terms of the GNU General Public License as published by ## +## the Free Software Foundation, either version 3 of the License, or ## +## (at your option) any later version. ## +## ## +## This program is distributed in the hope that it will be useful, ## +## but WITHOUT ANY WARRANTY; without even the implied warranty of ## +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## +## GNU General Public License for more details. ## +## ## +## You should have received a copy of the GNU General Public License ## +## along with this program. If not, see <https://www.gnu.org/licenses/>. ## +## ## +## Author's email: ccie18643@gmail.com ## +## Github repository: https://github.com/ccie18643/PyTCP ## +## ## +################################################################################ + + +""" +This module contains the session-level behaviour pins for the two TCP +throughput sysctls: 'tcp.rcv_wnd_max' (the advertised receive-window +ceiling, seeded into 'WindowState.rcv_wnd_max' at session creation) and +'tcp.snd_mss_max' (the send-side MSS cap applied in '_mss_ceiling()' +independently of the advertised receive MSS).
+ +The registration / validator / override-round-trip pins live in +'test__tcp__sysctls.py'; this file pins that a live session actually +honours the knobs. + +pytcp/tests/integration/protocols/tcp/test__tcp__session__throughput_knobs.py + +ver 3.0.7 +""" + +from typing import override + +from net_addr import Ip4Address +from pytcp import stack +from pytcp.protocols.tcp.session import TcpSession +from pytcp.socket import AddressFamily +from pytcp.socket.tcp__socket import TcpSocket +from pytcp.stack import sysctl as sysctl_module +from pytcp.tests.lib.network_testcase import ( + HOST_A__IP4_ADDRESS, + STACK__IP4_HOST, +) +from pytcp.tests.lib.tcp_testcase import TcpTestCase + +STACK__IP: Ip4Address = STACK__IP4_HOST.address +STACK__PORT: int = 12345 +PEER__IP: Ip4Address = HOST_A__IP4_ADDRESS +PEER__PORT: int = 80 +LOCAL__ISS: int = 0x0000_1000 + + +class _ThroughputKnobFixture(TcpTestCase): + """ + Shared fixture — resets every sysctl slot on teardown so a + knob write in one test does not leak into the next. + """ + + @override + def tearDown(self) -> None: + """ + Restore the registered sysctl defaults after each test. + """ + + sysctl_module.reset_to_defaults() + super().tearDown() + + def _make_session(self) -> TcpSession: + """ + Build an unstarted IPv4 session against PEER, useful for + pinning '__init__'-time window state and '_mss_ceiling()' + without a handshake. + """ + + self._force_iss(LOCAL__ISS) + sock = TcpSocket(family=AddressFamily.INET4) + sock._local_ip_address = STACK__IP + sock._local_port = STACK__PORT + sock._remote_ip_address = PEER__IP + sock._remote_port = PEER__PORT + session = TcpSession( + local_ip_address=STACK__IP, + local_port=STACK__PORT, + remote_ip_address=PEER__IP, + remote_port=PEER__PORT, + socket=sock, + ) + sock._tcp_session = session + stack.sockets[sock.socket_id] = sock + return session + + +class TestTcpRcvWndMax(_ThroughputKnobFixture): + """ + The 'tcp.rcv_wnd_max' session-seeding behaviour tests. 
+ """ + + def test__tcp__rcv_wnd_max__default_seeds_65535(self) -> None: + """ + Ensure a fresh session seeds 'WindowState.rcv_wnd_max' from + the registered default, preserving the historical 65535-byte + advertised-window ceiling. + + Reference: Linux net.ipv4.tcp_rmem (receive-window max). + """ + + session = self._make_session() + self.assertEqual( + session._win.rcv_wnd_max, + 65535, + msg="Default 'tcp.rcv_wnd_max' must seed the session window ceiling at 65535.", + ) + + def test__tcp__rcv_wnd_max__override_seeds_session(self) -> None: + """ + Ensure raising 'tcp.rcv_wnd_max' is picked up by a session + created afterwards — the per-session ceiling reflects the + live sysctl value, letting a high-BDP path keep a full + window in flight. + + Reference: Linux net.ipv4.tcp_rmem (receive-window max). + """ + + sysctl_module.set("tcp.rcv_wnd_max", 4 * 1024 * 1024) + session = self._make_session() + self.assertEqual( + session._win.rcv_wnd_max, + 4 * 1024 * 1024, + msg="A session must seed 'rcv_wnd_max' from the live 'tcp.rcv_wnd_max' value.", + ) + + +class TestTcpSndMssMax(_ThroughputKnobFixture): + """ + The 'tcp.snd_mss_max' '_mss_ceiling()' cap behaviour tests. + """ + + def test__tcp__snd_mss_max__default_uncapped(self) -> None: + """ + Ensure with 'tcp.snd_mss_max=0' (default) the send-side MSS + ceiling is the interface ceiling ('interface_mtu - overhead'), + i.e. the cap is inert. + + Reference: PyTCP test infrastructure (no RFC clause). 
+ """ + + session = self._make_session() + expected = session._egress_interface_mtu() - session._ip_tcp_overhead + self.assertEqual( + session._mss_ceiling(), + expected, + msg="With the cap disabled, '_mss_ceiling()' must equal 'interface_mtu - overhead'.", + ) + + def test__tcp__snd_mss_max__caps_send_ceiling(self) -> None: + """ + Ensure a non-zero 'tcp.snd_mss_max' clamps '_mss_ceiling()' + to the configured value while leaving the advertised receive + MSS ('rcv_mss') at the interface ceiling — so a large MTU can + still invite large inbound segments while output stays small. + + Reference: PyTCP test infrastructure (no RFC clause). + """ + + sysctl_module.set("tcp.default.snd_mss_max", 576) + session = self._make_session() + iface_ceiling = session._egress_interface_mtu() - session._ip_tcp_overhead + + self.assertEqual( + session._mss_ceiling(), + 576, + msg="A non-zero 'tcp.snd_mss_max' must cap the send-side MSS ceiling.", + ) + self.assertEqual( + session._win.rcv_mss, + iface_ceiling, + msg="'tcp.snd_mss_max' must NOT lower the advertised receive MSS.", + ) + + def test__tcp__snd_mss_max__cap_above_interface_is_inert(self) -> None: + """ + Ensure a 'tcp.snd_mss_max' larger than the interface ceiling + leaves '_mss_ceiling()' at the interface ceiling — the cap + only ever lowers, never raises, the send MSS. + + Reference: PyTCP test infrastructure (no RFC clause). 
+ """ + + session_default = self._make_session() + iface_ceiling = session_default._egress_interface_mtu() - session_default._ip_tcp_overhead + + sysctl_module.set("tcp.default.snd_mss_max", iface_ceiling + 1000) + session = self._make_session() + self.assertEqual( + session._mss_ceiling(), + iface_ceiling, + msg="A cap above the interface ceiling must be inert.", + ) diff --git a/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__sysctls.py b/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__sysctls.py index 8bdb6cfb1..a15e7b542 100644 --- a/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__sysctls.py +++ b/packages/pytcp/pytcp/tests/integration/protocols/tcp/test__tcp__sysctls.py @@ -28,7 +28,8 @@ 'tcp.delayed_ack.delay_ms', 'tcp.challenge_ack.rate_limit_ms', 'tcp.persist.timeout_max_ms', 'tcp.keepalive.idle_time_ms', 'tcp.keepalive.probe_interval_ms', 'tcp.keepalive.probe_max_count', -'tcp.ts_recent.outdated_threshold_ms'). +'tcp.ts_recent.outdated_threshold_ms', 'tcp.rcv_wnd_max', +'tcp.snd_mss_max'). The full TCP integration suite is the behavioural regression net — every test that depends on the renamed module attributes continues @@ -197,6 +198,20 @@ def test__tcp__sysctl__ts_recent_outdated_threshold_default_registered(self) -> msg="tcp.ts_recent.outdated_threshold_ms must default to 24*86400*1000 ms.", ) + def test__tcp__sysctl__rcv_wnd_max_default_registered(self) -> None: + """ + Ensure 'tcp.rcv_wnd_max' registers with the historical + 65535-byte advertised-window ceiling default. + + Reference: Linux net.ipv4.tcp_rmem (receive-window max). 
+ """ + + self.assertEqual( + sysctl.get("tcp.rcv_wnd_max"), + 65535, + msg="tcp.rcv_wnd_max must default to 65535 (historical window cap).", + ) + class TestTcpSysctlOverrides(NetworkTestCase): """ @@ -242,6 +257,28 @@ def test__tcp__sysctl__keepalive_idle_override_updates_attr(self) -> None: msg="Override must write through to TCP__KEEPALIVE__IDLE_TIME_MS.", ) + def test__tcp__sysctl__rcv_wnd_max_override_updates_attr(self) -> None: + """ + Ensure overriding 'tcp.rcv_wnd_max' writes through to the + backing module attribute the session init reads to seed the + per-session advertised-window ceiling. + + Reference: PyTCP test infrastructure (no RFC clause). + """ + + with sysctl.override("tcp.rcv_wnd_max", 4 * 1024 * 1024): + self.assertEqual( + tcp__constants.TCP__RCV_WND_MAX, + 4 * 1024 * 1024, + msg="Override must write through to TCP__RCV_WND_MAX.", + ) + + self.assertEqual( + tcp__constants.TCP__RCV_WND_MAX, + 65535, + msg="Override exit must restore the registered default.", + ) + class TestTcpSysctlValidators(NetworkTestCase): """ @@ -336,6 +373,17 @@ def test__tcp__sysctl__keepalive_idle_accepts_exactly_2h(self) -> None: msg="2 h exactly must be accepted (inclusive RFC floor).", ) + def test__tcp__sysctl__rcv_wnd_max_rejects_zero(self) -> None: + """ + Ensure 'tcp.rcv_wnd_max' rejects zero — a zero receive + window ceiling would advertise a permanently closed window. + + Reference: PyTCP test infrastructure (no RFC clause). + """ + + with self.assertRaises(ValueError): + sysctl.set("tcp.rcv_wnd_max", 0) + class TestTcpSysctlCrossKnobConstraints(NetworkTestCase): """ @@ -460,3 +508,93 @@ def test__tcp__sysctl__base_mss_rejects_below_min_mss(self) -> None: with self.assertRaises(ValueError): sysctl.set("tcp.default.base_mss", 87) + + +class TestTcpSysctlSndMssMax(NetworkTestCase): + """ + The 'tcp.snd_mss_max' per-interface sysctl tests — the + opt-in send-side MSS cap, applied independently of the + advertised receive MSS. 
Per-iface storage; bare base key + rejected; 0 reserved for "uncapped". + """ + + @override + def tearDown(self) -> None: + """ + Clear every per-iface slot and reset the template so + a write in one test cannot leak into the next. + """ + + sysctl.reset_to_defaults() + super().tearDown() + + def test__tcp__sysctl__snd_mss_max_default_is_zero(self) -> None: + """ + Ensure 'tcp.snd_mss_max' registers disabled (0) in the + '"default"' template slot, preserving the historical + uncapped send-MSS behaviour. + + Reference: PyTCP test infrastructure (no RFC clause). + """ + + self.assertEqual( + sysctl.get("tcp.default.snd_mss_max"), + 0, + msg="tcp.snd_mss_max must default to 0 (uncapped) in the 'default' template.", + ) + + def test__tcp__sysctl__snd_mss_max_per_iface_override(self) -> None: + """ + Ensure writing 'tcp.<iface>.snd_mss_max' lands in the + per-interface slot only — the '"default"' template stays + at 0 and an unconfigured interface still resolves through + the template. + + Reference: PyTCP test infrastructure (no RFC clause). + """ + + sysctl.set("tcp.tap_x.snd_mss_max", 1340) + + self.assertEqual( + sysctl.get("tcp.tap_x.snd_mss_max"), + 1340, + msg="Per-iface write must surface on the same key.", + ) + self.assertEqual( + sysctl.get("tcp.default.snd_mss_max"), + 0, + msg="Per-iface write must NOT mutate the 'default' template.", + ) + self.assertEqual( + sysctl.get("tcp.tap_y.snd_mss_max"), + 0, + msg="Unconfigured ifaces must fall back to the 'default' template.", + ) + + def test__tcp__sysctl__snd_mss_max_accepts_zero(self) -> None: + """ + Ensure 'tcp.snd_mss_max' accepts 0 — the documented + "uncapped" sentinel. + + Reference: PyTCP test infrastructure (no RFC clause).
+ """ + + with sysctl.override("tcp.default.snd_mss_max", 0): + self.assertEqual( + sysctl.get("tcp.default.snd_mss_max"), + 0, + msg="0 must be accepted (uncapped sentinel).", + ) + + def test__tcp__sysctl__snd_mss_max_rejects_below_min_mss(self) -> None: + """ + Ensure 'tcp.snd_mss_max' rejects 87 — one below the Linux + 'TCP_MIN_MSS = 88' floor. A non-zero cap below this floor + would size segments uselessly small; 0 is the only sub-88 + value accepted (it disables the cap). + + Reference: Linux include/net/tcp.h TCP_MIN_MSS=88. + """ + + with self.assertRaises(ValueError): + sysctl.set("tcp.default.snd_mss_max", 87)