Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 37 additions & 11 deletions benchmarks/float8/bench_linear_float8.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@
h100_peak_flops_fp16_tc = 1979e12
h100_peak_tops_float8_tc = 3958e12

# Intel Arc B580 specs: bottom of https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html
b580_peak_flops_float32 = 58e12
b580_peak_flops_bf16 = 116e12

dtype_to_peak_tops = {
torch.float32: h100_peak_flops_float32,
torch.float16: h100_peak_flops_fp16_tc,
torch.bfloat16: h100_peak_flops_fp16_tc,
torch.float8_e4m3fn: h100_peak_tops_float8_tc,
torch.float8_e5m2: h100_peak_tops_float8_tc,
"NVIDIA H100": {
torch.float32: h100_peak_flops_float32,
torch.float16: h100_peak_flops_fp16_tc,
torch.bfloat16: h100_peak_flops_fp16_tc,
torch.float8_e4m3fn: h100_peak_tops_float8_tc,
torch.float8_e5m2: h100_peak_tops_float8_tc,
},
"Intel(R) Arc(TM) B580 Graphics": {
torch.float32: b580_peak_flops_float32,
torch.float16: b580_peak_flops_bf16,
torch.bfloat16: b580_peak_flops_bf16,
torch.float8_e4m3fn: b580_peak_flops_bf16,
torch.float8_e5m2: b580_peak_flops_bf16,
},
}

# prevent splitting columns when printing a data frame
Expand Down Expand Up @@ -71,6 +84,14 @@ class Experiment:
compiled: bool
use_fast_accum: bool
scaling_repr: str
device: str

@property
def gpu_name(self):
    """Return the device name torch reports for the benchmarked accelerator.

    ``self.device`` is the accelerator type string stored on the experiment:
    "xpu" selects the XPU backend and any other value falls through to CUDA.
    The current device is queried instead of hard-coding device index 0.

    NOTE(review): the returned string is used verbatim as a key into
    ``dtype_to_peak_tops``. Confirm that the name torch reports for each
    supported GPU matches a table key exactly (H100 variants may report a
    longer name than "NVIDIA H100"); otherwise the lookup raises KeyError.
    """
    backend = torch.xpu if self.device == "xpu" else torch.cuda
    # Called without an index so the backend resolves the current device.
    return backend.get_device_name()

# 3x, since we are measuring both the forward and the backward pass
@property
Expand All @@ -80,7 +101,7 @@ def ref_tops_sec(self):

@property
def ref_pct_top_peak(self):
return self.ref_tops_sec / dtype_to_peak_tops[self.dtype]
return self.ref_tops_sec / dtype_to_peak_tops[self.gpu_name][self.dtype]

@property
def float8_tops_sec(self):
Expand All @@ -89,7 +110,10 @@ def float8_tops_sec(self):

@property
def float8_pct_top_peak(self):
return self.float8_tops_sec / dtype_to_peak_tops[torch.float8_e4m3fn]
return (
self.float8_tops_sec
/ dtype_to_peak_tops[self.gpu_name][torch.float8_e4m3fn]
)


# TODO(future PR): add option to measure GPU kernel time, as in other
Expand All @@ -110,7 +134,7 @@ def main(
scaling_type_grad_output: str = "dynamic",
scaling_granularity: str = "tensorwise",
):
device = "cuda"
device = torch.accelerator.current_accelerator().type
print(f"Compile is set to | {compile}")

scaling_type_input = ScalingType(scaling_type_input)
Expand Down Expand Up @@ -203,6 +227,7 @@ def wrapper(*args, **kwargs):
* 1e-6
/ REPEAT_N
)

experiment = Experiment(
name,
(M, K, N),
Expand All @@ -212,6 +237,7 @@ def wrapper(*args, **kwargs):
compile,
use_fast_accum=fast_accum,
scaling_repr=scaling_repr,
device=device,
)
print(experiment)
print("float8 speedup", experiment.ref_time_sec / experiment.float8_time_sec)
Expand Down Expand Up @@ -308,11 +334,11 @@ def invoke_main() -> None:
if args.shape_gen_name is not None:
kwargs["shape_gen_name"] = args.shape_gen_name
if args.M is not None:
kwargs["M"] = (args.M,)
kwargs["M"] = args.M
if args.K is not None:
kwargs["K"] = (args.K,)
kwargs["K"] = args.K
if args.N is not None:
kwargs["N"] = (args.N,)
kwargs["N"] = args.N
if args.scaling_type_input is not None:
kwargs["scaling_type_input"] = args.scaling_type_input
if args.scaling_type_weight is not None:
Expand Down
10 changes: 5 additions & 5 deletions benchmarks/float8/bench_padding.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def get_tops_info(tops, time, peak_tops):


def do_fp8_matmul(A, B, fp8_dtype, out_dtype):
scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)

a_config = ScaledMMConfig(
emulate=False, use_fast_accum=True, fp8_output=True, pad_inner_dim=True
Expand Down Expand Up @@ -87,8 +87,8 @@ def do_fp8_pad_first_matmul(A, B, fp8_dtype, out_dtype):
A_pad = pad_tensor_for_matmul(A, dims=1) # mem copy
B_pad = pad_tensor_for_matmul(B, dims=0) # mem copy

scale_a = torch.tensor([1], device="cuda", dtype=torch.float32)
scale_b = torch.tensor([1], device="cuda", dtype=torch.float32)
scale_a = torch.tensor([1], device=A.device, dtype=torch.float32)
scale_b = torch.tensor([1], device=B.device, dtype=torch.float32)

A_pad = A_pad.to(fp8_dtype) # mem copy
B_pad = B_pad.to(fp8_dtype) # mem copy
Expand Down Expand Up @@ -142,7 +142,7 @@ def gen_configs():

@torch.no_grad()
def run(compile: bool = False, n_limit: Optional[int] = None):
device = "cuda"
device = torch.accelerator.current_accelerator()
experiments = gen_configs()
results = []
tops_table = []
Expand Down
18 changes: 13 additions & 5 deletions benchmarks/float8/profile_lowp_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ class ProfileConfig:
logs_file_path: Optional[str] = None
trace_modified_file_path: Optional[str] = None
name: Optional[str] = None
cuda: bool = True
device: str = "cuda"
iters: int = 0
warmup_iters: int = 0
sync: bool = False
Expand Down Expand Up @@ -214,14 +214,16 @@ def profile_function(
torch._logging._init_logs(log_file_name=config.logs_file_path)

activities = [ProfilerActivity.CPU]
if config.cuda:
if config.device == "xpu":
activities.append(ProfilerActivity.XPU)
elif config.device == "cuda":
activities.append(ProfilerActivity.CUDA)

if config.warmup_iters >= 0:
for _ in range(config.warmup_iters):
func(*args, **kwargs)
if config.sync:
torch.cuda.synchronize()
torch.accelerator.synchronize()
name_context = (
nullcontext() if config.name is None else record_function(config.name)
)
Expand All @@ -241,7 +243,7 @@ def profile_function(
with name_context:
func(*args, **kwargs)
if config.sync:
torch.cuda.synchronize()
torch.accelerator.synchronize()

if config.trace_file_path is not None:
prof.export_chrome_trace(config.trace_file_path)
Expand All @@ -253,6 +255,7 @@ def profile_function(
config.trace_file_path,
config.logs_file_path,
config.trace_modified_file_path,
config.device,
)

# undo custom log settings
Expand Down Expand Up @@ -291,6 +294,7 @@ def main(
mode_filter: str = "fwd_bwd",
forward_only: bool = False,
):
device = torch.accelerator.current_accelerator().type
assert model_type in (
"linear",
"ln_linear",
Expand All @@ -314,6 +318,9 @@ def main(
if mode_filter == "cast_only":
assert experiment_filter == "lowp", "unsupported"

if device == "xpu" and mx_recipe_name is not None:
raise NotImplementedError("MXFP8TrainingRecipe is not supported on XPU yet")

assert not (float8_recipe_name is not None and mx_recipe_name is not None), (
"either float8_recipe_name or mx_recipe_name can be specified, but not both"
)
Expand Down Expand Up @@ -341,7 +348,6 @@ def main(
print(f"mode_filter is set to {mode_filter}")
print(f"config: {config}")

device = "cuda"
ref_dtype = torch.bfloat16
if model_type == "ln_linear":
M, K, N = 4 * 4096, 8192, 7168
Expand Down Expand Up @@ -509,6 +515,7 @@ def lowp_forw_backward_wrapper(x):
log_ref_path,
trace_ref_modified_path,
ref_trace_suffix,
device=device,
iters=profile_iters,
warmup_iters=2,
sync=True,
Expand Down Expand Up @@ -555,6 +562,7 @@ def lowp_forw_backward_wrapper(x):
log_lowp_path,
trace_lowp_modified_path,
lowp_trace_suffix,
device=device,
iters=profile_iters,
warmup_iters=2,
sync=True,
Expand Down
13 changes: 10 additions & 3 deletions benchmarks/float8/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def profiler_output_to_filtered_time_by_kernel_name(

# manually filter expected microbenchmarking overhead, in order of execution
if e.key == "aten::sum":
# forward pass sum
assert e.count == num_iter, f"unexpected number of iter for {e.key}"
# forward pass sum (count may vary depending on model internals,
# e.g. RMSNorm uses mean which calls sum)
continue
elif e.key == "aten::add_":
# accumulating gradients into leaf tensors
Expand All @@ -87,6 +87,12 @@ def profiler_output_to_filtered_time_by_kernel_name(
continue
elif e.key == "cudaDeviceSynchronize":
continue
elif e.key == "zeCommandListHostSynchronize":
# Level Zero (Intel XPU) host-side synchronization
continue
elif e.key == "zeEventHostSynchronize":
# Level Zero (Intel XPU) host-side synchronization
continue
elif e.key == "Activity Buffer Request":
continue
elif e.key == "Unrecognized":
Expand Down Expand Up @@ -295,6 +301,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
perf_trace_file: str,
torch_logs_file: str,
modified_perf_trace_file: str,
device: str = "cuda",
):
"""
Input 1: a perf trace generated by using `torch.profiler.profile` inside of
Expand Down Expand Up @@ -347,7 +354,7 @@ def update_triton_kernels_in_prof_chome_trace_with_torch_logs(
if match_name:
cur_name = match_name.group(1)

match_end = re.match("''', device_str='cuda'\)", line)
match_end = re.match(f"''', device_str='{device}'\)", line)
if match_end:
cur_end = line_num

Expand Down
5 changes: 3 additions & 2 deletions benchmarks/inference/bench_float8_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ def benchmark_fn_in_usec(f, *args, **kwargs):

def run(torch_compile_mode: str = "default"):
M, K, N = 1024, 2048, 4096
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
m = nn.Sequential(nn.Linear(K, N, device="cuda", dtype=torch.bfloat16))
device = torch.accelerator.current_accelerator()
x = torch.randn(M, K, device=device, dtype=torch.bfloat16)
m = nn.Sequential(nn.Linear(K, N, device=device, dtype=torch.bfloat16))
quantize_(m, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
m = torch.compile(m, mode=torch_compile_mode)
# warm up
Expand Down