diff --git a/benchmarks/float8/bench_linear_float8.py b/benchmarks/float8/bench_linear_float8.py
--- a/benchmarks/float8/bench_linear_float8.py
+++ b/benchmarks/float8/bench_linear_float8.py
@@ -33,12 +33,25 @@
 h100_peak_flops_fp16_tc = 1979e12
 h100_peak_tops_float8_tc = 3958e12
 
+# Intel Arc B580 specs: bottom of https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html
+b580_peak_flops_float32 = 58e12
+b580_peak_flops_bf16 = 116e12
+
 dtype_to_peak_tops = {
-    torch.float32: h100_peak_flops_float32,
-    torch.float16: h100_peak_flops_fp16_tc,
-    torch.bfloat16: h100_peak_flops_fp16_tc,
-    torch.float8_e4m3fn: h100_peak_tops_float8_tc,
-    torch.float8_e5m2: h100_peak_tops_float8_tc,
+    "NVIDIA H100": {
+        torch.float32: h100_peak_flops_float32,
+        torch.float16: h100_peak_flops_fp16_tc,
+        torch.bfloat16: h100_peak_flops_fp16_tc,
+        torch.float8_e4m3fn: h100_peak_tops_float8_tc,
+        torch.float8_e5m2: h100_peak_tops_float8_tc,
+    },
+    "Intel(R) Arc(TM) B580 Graphics": {
+        torch.float32: b580_peak_flops_float32,
+        torch.float16: b580_peak_flops_bf16,
+        torch.bfloat16: b580_peak_flops_bf16,
+        torch.float8_e4m3fn: b580_peak_flops_bf16,
+        torch.float8_e5m2: b580_peak_flops_bf16,
+    },
 }
 
 # prevent splitting columns when printing a data frame
@@ -71,6 +84,23 @@ class Experiment:
     compiled: bool
     use_fast_accum: bool
     scaling_repr: str
+    device: str
+
+    @property
+    def gpu_name(self):
+        if self.device == "xpu":
+            return torch.xpu.get_device_name(0)
+        else:
+            return torch.cuda.get_device_name(0)
+
+    @property
+    def peak_tops_by_dtype(self):
+        # GPUs whose reported name has no entry (other SKUs, other vendors)
+        # fall back to the H100 table, which is what this benchmark used for
+        # every device before, instead of raising KeyError after timing ran.
+        return dtype_to_peak_tops.get(
+            self.gpu_name, dtype_to_peak_tops["NVIDIA H100"]
+        )
 
     # 3 Times since we are calculating forward backward
     @property
@@ -80,7 +110,7 @@ def ref_tops_sec(self):
 
     @property
     def ref_pct_top_peak(self):
-        return self.ref_tops_sec / dtype_to_peak_tops[self.dtype]
+        return self.ref_tops_sec / self.peak_tops_by_dtype[self.dtype]
 
     @property
     def float8_tops_sec(self):
@@ -89,7 +119,7 @@ def float8_tops_sec(self):
 
     @property
     def float8_pct_top_peak(self):
-        return self.float8_tops_sec / dtype_to_peak_tops[torch.float8_e4m3fn]
+        return self.float8_tops_sec / self.peak_tops_by_dtype[torch.float8_e4m3fn]
 
 
 # TODO(future PR): add option to measure GPU kernel time, as in other
@@ -110,7 +140,7 @@ def main(
     scaling_type_grad_output: str = "dynamic",
     scaling_granularity: str = "tensorwise",
 ):
-    device = "cuda"
+    device = torch.accelerator.current_accelerator().type
     print(f"Compile is set to | {compile}")
 
     scaling_type_input = ScalingType(scaling_type_input)
@@ -212,6 +242,7 @@ def wrapper(*args, **kwargs):
             compile,
             use_fast_accum=fast_accum,
             scaling_repr=scaling_repr,
+            device=device,
         )
         print(experiment)
         print("float8 speedup", experiment.ref_time_sec / experiment.float8_time_sec)
@@ -308,11 +339,11 @@ def invoke_main() -> None:
     if args.shape_gen_name is not None:
         kwargs["shape_gen_name"] = args.shape_gen_name
     if args.M is not None:
-        kwargs["M"] = (args.M,)
+        kwargs["M"] = args.M
     if args.K is not None:
-        kwargs["K"] = (args.K,)
+        kwargs["K"] = args.K
     if args.N is not None:
-        kwargs["N"] = (args.N,)
+        kwargs["N"] = args.N
     if args.scaling_type_input is not None:
         kwargs["scaling_type_input"] = args.scaling_type_input
     if args.scaling_type_weight is not None:
diff --git a/benchmarks/float8/bench_padding.py b/benchmarks/float8/bench_padding.py
index 62a161637b..599dab59b9 100644
--- a/benchmarks/float8/bench_padding.py
+++ b/benchmarks/float8/bench_padding.py
@@ -51,8 +51,8 @@ def get_tops_info(tops, time, peak_tops):
 
 
 def do_fp8_matmul(A, B, fp8_dtype, out_dtype):
-    scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
+    scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
+    scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)
 
     a_config = ScaledMMConfig(
         emulate=False, use_fast_accum=True, fp8_output=True, pad_inner_dim=True
@@ -87,8 +87,8 @@ def do_fp8_pad_first_matmul(A, B, fp8_dtype, out_dtype):
     A_pad = pad_tensor_for_matmul(A, dims=1)  # mem copy
     B_pad = pad_tensor_for_matmul(B, dims=0)  # mem copy
 
-    scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
+    scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
+    scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)
 
     A_pad = A_pad.to(fp8_dtype)  # mem copy
     B_pad = B_pad.to(fp8_dtype)  # mem copy
@@ -142,7 +142,7 @@ def gen_configs():
 
 @torch.no_grad()
 def run(compile: bool = False, n_limit: Optional[int] = None):
-    device = "cuda"
+    device = torch.accelerator.current_accelerator()
     experiments = gen_configs()
     results = []
     tops_table = []
diff --git a/benchmarks/float8/profile_lowp_training.py b/benchmarks/float8/profile_lowp_training.py
index 1d6665d7fe..e64f44dfe9 100644
--- a/benchmarks/float8/profile_lowp_training.py
+++ b/benchmarks/float8/profile_lowp_training.py
@@ -176,7 +176,7 @@ class ProfileConfig:
     logs_file_path: Optional[str] = None
     trace_modified_file_path: Optional[str] = None
     name: Optional[str] = None
-    cuda: bool = True
+    device: str = "cuda"
     iters: int = 0
     warmup_iters: int = 0
     sync: bool = False
@@ -214,14 +214,16 @@ def profile_function(
         torch._logging._init_logs(log_file_name=config.logs_file_path)
 
     activities = [ProfilerActivity.CPU]
-    if config.cuda:
+    if config.device == "xpu":
+        activities.append(ProfilerActivity.XPU)
+    elif config.device == "cuda":
         activities.append(ProfilerActivity.CUDA)
 
     if config.warmup_iters >= 0:
         for _ in range(config.warmup_iters):
             func(*args, **kwargs)
     if config.sync:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
     name_context = (
         nullcontext() if config.name is None else record_function(config.name)
     )
@@ -241,7 +243,7 @@ def profile_function(
             with name_context:
                 func(*args, **kwargs)
                 if config.sync:
-                    torch.cuda.synchronize()
+                    torch.accelerator.synchronize()
 
     if config.trace_file_path is not None:
         prof.export_chrome_trace(config.trace_file_path)
@@ -253,6 +255,7 @@ def profile_function(
             config.trace_file_path,
             config.logs_file_path,
             config.trace_modified_file_path,
+            config.device,
         )
 
         # undo custom log settings
@@ -291,6 +294,7 @@ def main(
     mode_filter: str = "fwd_bwd",
     forward_only: bool = False,
 ):
+    device = torch.accelerator.current_accelerator().type
     assert model_type in (
         "linear",
         "ln_linear",
@@ -314,6 +318,9 @@ def main(
     if mode_filter == "cast_only":
         assert experiment_filter == "lowp", "unsupported"
 
+    if device == "xpu" and mx_recipe_name is not None:
+        raise NotImplementedError("MXFP8TrainingRecipe is not supported on XPU yet")
+
     assert not (float8_recipe_name is not None and mx_recipe_name is not None), (
         "either float8_recipe_name or mx_recipe_name can be specified, but not both"
     )
@@ -341,7 +348,6 @@ def main(
     print(f"mode_filter is set to {mode_filter}")
     print(f"config: {config}")
 
-    device = "cuda"
     ref_dtype = torch.bfloat16
     if model_type == "ln_linear":
         M, K, N = 4 * 4096, 8192, 7168
@@ -509,6 +515,7 @@ def lowp_forw_backward_wrapper(x):
                     log_ref_path,
                     trace_ref_modified_path,
                     ref_trace_suffix,
+                    device=device,
                     iters=profile_iters,
                     warmup_iters=2,
                     sync=True,
@@ -555,6 +562,7 @@ def lowp_forw_backward_wrapper(x):
                     log_lowp_path,
                     trace_lowp_modified_path,
                     lowp_trace_suffix,
+                    device=device,
                     iters=profile_iters,
                     warmup_iters=2,
                     sync=True,
diff --git a/benchmarks/float8/utils.py b/benchmarks/float8/utils.py
--- a/benchmarks/float8/utils.py
+++ b/benchmarks/float8/utils.py
@@ -76,8 +76,8 @@ def profiler_output_to_filtered_time_by_kernel_name(
 
         # manually filter expected microbenchmarking overhead, in order of execution
         if e.key == "aten::sum":
-            # forward pass sum
-            assert e.count == num_iter, f"unexpected number of iter for {e.key}"
+            # forward pass sum (count may vary depending on model internals,
+            # e.g. RMSNorm uses mean which calls sum)
             continue
         elif e.key == "aten::add_":
             # accumulating gradients into leaf tensors
@@ -87,6 +87,12 @@ def profiler_output_to_filtered_time_by_kernel_name(
             continue
         elif e.key == "cudaDeviceSynchronize":
             continue
+        elif e.key == "zeCommandListHostSynchronize":
+            # Level Zero (Intel XPU) host-side synchronization
+            continue
+        elif e.key == "zeEventHostSynchronize":
+            # Level Zero (Intel XPU) host-side synchronization
+            continue
         elif e.key == "Activity Buffer Request":
             continue
         elif e.key == "Unrecognized":
@@ -295,6 +301,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
     perf_trace_file: str,
     torch_logs_file: str,
     modified_perf_trace_file: str,
+    device: str = "cuda",
 ):
     """
     Input 1: a perf trace generated by using `torch.profiler.profile` inside of
@@ -347,7 +354,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
         if match_name:
             cur_name = match_name.group(1)
 
-        match_end = re.match("''', device_str='cuda'\)", line)
+        match_end = re.match(rf"''', device_str='{re.escape(device)}'\)", line)
         if match_end:
             cur_end = line_num
 
diff --git a/benchmarks/inference/bench_float8_inference.py b/benchmarks/inference/bench_float8_inference.py
index 593e2425d7..37e0108a1b 100644
--- a/benchmarks/inference/bench_float8_inference.py
+++ b/benchmarks/inference/bench_float8_inference.py
@@ -23,8 +23,9 @@ def benchmark_fn_in_usec(f, *args, **kwargs):
 
 def run(torch_compile_mode: str = "default"):
     M, K, N = 1024, 2048, 4096
-    x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
-    m = nn.Sequential(nn.Linear(K, N, device="cuda", dtype=torch.bfloat16))
+    device = torch.accelerator.current_accelerator()
+    x = torch.randn(M, K, device=device, dtype=torch.bfloat16)
+    m = nn.Sequential(nn.Linear(K, N, device=device, dtype=torch.bfloat16))
     quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
     m = torch.compile(m, mode=torch_compile_mode)
     # warm up