pytorch · kgajdamo · Jun 15, 2026 · Jun 19, 2026 · Jun 22, 2026 · Jun 23, 2026
@@ -33,12 +33,25 @@
 h100_peak_flops_fp16_tc = 1979e12
 h100_peak_tops_float8_tc = 3958e12
 
+# Intel Arc B580 specs: bottom of https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html
+b580_peak_flops_float32 = 58e12
+b580_peak_flops_bf16 = 116e12
+
 dtype_to_peak_tops = {
-    torch.float32: h100_peak_flops_float32,
-    torch.float16: h100_peak_flops_fp16_tc,
-    torch.bfloat16: h100_peak_flops_fp16_tc,
-    torch.float8_e4m3fn: h100_peak_tops_float8_tc,
-    torch.float8_e5m2: h100_peak_tops_float8_tc,
+    "NVIDIA H100": {  # outer key: device-name string; inner key: torch dtype
+        torch.float32: h100_peak_flops_float32,
+        torch.float16: h100_peak_flops_fp16_tc,
+        torch.bfloat16: h100_peak_flops_fp16_tc,
+        torch.float8_e4m3fn: h100_peak_tops_float8_tc,
+        torch.float8_e5m2: h100_peak_tops_float8_tc,
+    },
+    "Intel(R) Arc(TM) B580 Graphics": {
+        torch.float32: b580_peak_flops_float32,
+        torch.float16: b580_peak_flops_bf16,  # float16 reuses the bf16 peak constant
+        torch.bfloat16: b580_peak_flops_bf16,
+        torch.float8_e4m3fn: b580_peak_flops_bf16,  # NOTE(review): no B580 float8 constant is defined above; bf16 peak is reused - confirm
+        torch.float8_e5m2: b580_peak_flops_bf16,  # NOTE(review): same bf16 stand-in as e4m3fn - confirm
+    },
 }
 
 # prevent splitting columns when printing a data frame
@@ -71,6 +84,14 @@ class Experiment:
     compiled: bool
     use_fast_accum: bool
     scaling_repr: str
+    device: str
+
+    @property
+    def gpu_name(self):
+        if self.device == "xpu":
+            return torch.xpu.get_device_name(0)
+        else:
+            return torch.cuda.get_device_name(0)
 
     # 3 Times since we are calculating forward backward
     @property
@@ -80,7 +101,7 @@ def ref_tops_sec(self):
 
     @property
     def ref_pct_top_peak(self):
-        return self.ref_tops_sec / dtype_to_peak_tops[self.dtype]
+        return self.ref_tops_sec / dtype_to_peak_tops[self.gpu_name][self.dtype]
 
     @property
     def float8_tops_sec(self):
@@ -89,7 +110,10 @@ def float8_tops_sec(self):
 
     @property
     def float8_pct_top_peak(self):
-        return self.float8_tops_sec / dtype_to_peak_tops[torch.float8_e4m3fn]
+        return (
+            self.float8_tops_sec
+            / dtype_to_peak_tops[self.gpu_name][torch.float8_e4m3fn]
+        )
 
 
 # TODO(future PR): add option to measure GPU kernel time, as in other
@@ -110,7 +134,7 @@ def main(
     scaling_type_grad_output: str = "dynamic",
     scaling_granularity: str = "tensorwise",
 ):
-    device = "cuda"
+    device = torch.accelerator.current_accelerator().type
     print(f"Compile is set to             | {compile}")
 
     scaling_type_input = ScalingType(scaling_type_input)
@@ -203,6 +227,7 @@ def wrapper(*args, **kwargs):
             * 1e-6
             / REPEAT_N
         )
+
         experiment = Experiment(
             name,
             (M, K, N),
@@ -212,6 +237,7 @@ def wrapper(*args, **kwargs):
             compile,
             use_fast_accum=fast_accum,
             scaling_repr=scaling_repr,
+            device=device,
         )
         print(experiment)
         print("float8 speedup", experiment.ref_time_sec / experiment.float8_time_sec)
@@ -308,11 +334,11 @@ def invoke_main() -> None:
     if args.shape_gen_name is not None:
         kwargs["shape_gen_name"] = args.shape_gen_name
     if args.M is not None:
-        kwargs["M"] = (args.M,)
+        kwargs["M"] = args.M
     if args.K is not None:
-        kwargs["K"] = (args.K,)
+        kwargs["K"] = args.K
     if args.N is not None:
-        kwargs["N"] = (args.N,)
+        kwargs["N"] = args.N
     if args.scaling_type_input is not None:
         kwargs["scaling_type_input"] = args.scaling_type_input
     if args.scaling_type_weight is not None:

@@ -51,8 +51,8 @@ def get_tops_info(tops, time, peak_tops):
 
 
 def do_fp8_matmul(A, B, fp8_dtype, out_dtype):
-    scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
+    scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
+    scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)
 
     a_config = ScaledMMConfig(
         emulate=False, use_fast_accum=True, fp8_output=True, pad_inner_dim=True
@@ -87,8 +87,8 @@ def do_fp8_pad_first_matmul(A, B, fp8_dtype, out_dtype):
     A_pad = pad_tensor_for_matmul(A, dims=1)  # mem copy
     B_pad = pad_tensor_for_matmul(B, dims=0)  # mem copy
 
-    scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
+    scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
+    scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)
 
     A_pad = A_pad.to(fp8_dtype)  # mem copy
     B_pad = B_pad.to(fp8_dtype)  # mem copy
@@ -142,7 +142,7 @@ def gen_configs():
 
 @torch.no_grad()
 def run(compile: bool = False, n_limit: Optional[int] = None):
-    device = "cuda"
+    device = torch.accelerator.current_accelerator()
     experiments = gen_configs()
     results = []
     tops_table = []

@@ -176,7 +176,7 @@ class ProfileConfig:
     logs_file_path: Optional[str] = None
     trace_modified_file_path: Optional[str] = None
     name: Optional[str] = None
-    cuda: bool = True
+    device: str = "cuda"
     iters: int = 0
     warmup_iters: int = 0
     sync: bool = False
@@ -214,14 +214,16 @@ def profile_function(
         torch._logging._init_logs(log_file_name=config.logs_file_path)
 
     activities = [ProfilerActivity.CPU]
-    if config.cuda:
+    if config.device == "xpu":
+        activities.append(ProfilerActivity.XPU)
+    elif config.device == "cuda":
         activities.append(ProfilerActivity.CUDA)
 
     if config.warmup_iters >= 0:
         for _ in range(config.warmup_iters):
             func(*args, **kwargs)
     if config.sync:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
     name_context = (
         nullcontext() if config.name is None else record_function(config.name)
     )
@@ -241,7 +243,7 @@ def profile_function(
             with name_context:
                 func(*args, **kwargs)
                 if config.sync:
-                    torch.cuda.synchronize()
+                    torch.accelerator.synchronize()
 
     if config.trace_file_path is not None:
         prof.export_chrome_trace(config.trace_file_path)
@@ -253,6 +255,7 @@ def profile_function(
             config.trace_file_path,
             config.logs_file_path,
             config.trace_modified_file_path,
+            config.device,
         )
 
         # undo custom log settings
@@ -291,6 +294,7 @@ def main(
     mode_filter: str = "fwd_bwd",
     forward_only: bool = False,
 ):
+    device = torch.accelerator.current_accelerator().type
     assert model_type in (
         "linear",
         "ln_linear",
@@ -314,6 +318,9 @@ def main(
     if mode_filter == "cast_only":
         assert experiment_filter == "lowp", "unsupported"
 
+    if device == "xpu" and mx_recipe_name is not None:
+        raise NotImplementedError("MXFP8TrainingRecipe is not supported on XPU yet")
+
     assert not (float8_recipe_name is not None and mx_recipe_name is not None), (
         "either float8_recipe_name or mx_recipe_name can be specified, but not both"
     )
@@ -341,7 +348,6 @@ def main(
     print(f"mode_filter is set to {mode_filter}")
     print(f"config: {config}")
 
-    device = "cuda"
     ref_dtype = torch.bfloat16
     if model_type == "ln_linear":
         M, K, N = 4 * 4096, 8192, 7168
@@ -509,6 +515,7 @@ def lowp_forw_backward_wrapper(x):
                     log_ref_path,
                     trace_ref_modified_path,
                     ref_trace_suffix,
+                    device=device,
                     iters=profile_iters,
                     warmup_iters=2,
                     sync=True,
@@ -555,6 +562,7 @@ def lowp_forw_backward_wrapper(x):
                     log_lowp_path,
                     trace_lowp_modified_path,
                     lowp_trace_suffix,
+                    device=device,
                     iters=profile_iters,
                     warmup_iters=2,
                     sync=True,

@@ -76,8 +76,8 @@ def profiler_output_to_filtered_time_by_kernel_name(
 
         # manually filter expected microbenchmarking overhead, in order of execution
         if e.key == "aten::sum":
-            # forward pass sum
-            assert e.count == num_iter, f"unexpected number of iter for {e.key}"
+            # forward pass sum (count may vary depending on model internals,
+            # e.g. RMSNorm uses mean which calls sum)
             continue
         elif e.key == "aten::add_":
             # accumulating gradients into leaf tensors
@@ -87,6 +87,12 @@ def profiler_output_to_filtered_time_by_kernel_name(
             continue
         elif e.key == "cudaDeviceSynchronize":
             continue
+        elif e.key == "zeCommandListHostSynchronize":
+            # Level Zero (Intel XPU) host-side synchronization
+            continue
+        elif e.key == "zeEventHostSynchronize":
+            # Level Zero (Intel XPU) host-side synchronization
+            continue
         elif e.key == "Activity Buffer Request":
             continue
         elif e.key == "Unrecognized":
@@ -295,6 +301,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
     perf_trace_file: str,
     torch_logs_file: str,
     modified_perf_trace_file: str,
+    device: str = "cuda",
 ):
     """
     Input 1: a perf trace generated by using `torch.profiler.profile` inside of
@@ -347,7 +354,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
         if match_name:
             cur_name = match_name.group(1)
 
-        match_end = re.match("''', device_str='cuda'\)", line)
+        match_end = re.match(f"''', device_str='{device}'\)", line)
         if match_end:
             cur_end = line_num
 

@@ -23,8 +23,9 @@ def benchmark_fn_in_usec(f, *args, **kwargs):
 
 def run(torch_compile_mode: str = "default"):
     M, K, N = 1024, 2048, 4096
-    x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
-    m = nn.Sequential(nn.Linear(K, N, device="cuda", dtype=torch.bfloat16))
+    device = torch.accelerator.current_accelerator()
+    x = torch.randn(M, K, device=device, dtype=torch.bfloat16)
+    m = nn.Sequential(nn.Linear(K, N, device=device, dtype=torch.bfloat16))
     quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
     m = torch.compile(m, mode=torch_compile_mode)
     # warm up