fsspec · zhixiangli · Jul 3, 2026 · Jul 2, 2026
diff --git a/...erf/macrobenchmarks/workloads/hf-pytorch-lightning-cpu/helm_chart/llama_3_1_8b_cpu_sim.py b/...erf/macrobenchmarks/workloads/hf-pytorch-lightning-cpu/helm_chart/llama_3_1_8b_cpu_sim.py
@@ -252,15 +252,20 @@ def _materialize_adamw_state(optimizer):
                 state = optimizer.state[p]
                 if state:
                     continue
+                # Random, not zero: an all-zero buffer is trivially compressible
+                # and dedupable (page merging, a future transport compression
+                # layer, etc.), so the optimizer state (~2/3 of the checkpoint)
+                # would transfer faster than the non-degenerate floats a trained
+                # optimizer actually produces, skewing the IO benchmark.
                 state["step"] = torch.zeros((), dtype=torch.float32)
-                state["exp_avg"] = torch.zeros_like(
+                state["exp_avg"] = torch.randn_like(
                     p, memory_format=torch.preserve_format
                 )
-                state["exp_avg_sq"] = torch.zeros_like(
+                state["exp_avg_sq"] = torch.rand_like(
                     p, memory_format=torch.preserve_format
                 )
                 if group["amsgrad"]:
-                    state["max_exp_avg_sq"] = torch.zeros_like(
+                    state["max_exp_avg_sq"] = torch.rand_like(
                         p, memory_format=torch.preserve_format
                     )