test/prototype/mx_formats/test_inference_workflow.py (4 changes: 0 additions & 4 deletions)

@@ -177,10 +177,6 @@ def test_inference_workflow_nvfp4(
     # DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
     if quant_type == "dynamic" and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
-
-    if bias and inpt_dtype == torch.float32:
-        pytest.xfail("Bias is not supported when module weight is in fp32")
-
     if quant_type == "weight_only" and compile:
         pytest.skip("TODO: weight_only quant currently errors w/ compile")
     if quant_type == "weight_only" and use_triton_kernel:
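For reference, a minimal sketch of the configuration the un-xfailed test case now exercises; the quantize_ call and the NVFP4InferenceConfig import path are assumptions based on the files touched here, not code taken from the PR:

```python
import torch
from torchao.quantization import quantize_
# Import path assumed; the config class may live elsewhere in the prototype package.
from torchao.prototype.mx_formats import NVFP4InferenceConfig

# fp32 Linear with bias=True: previously xfailed, now expected to pass.
m = torch.nn.Linear(64, 32, bias=True, dtype=torch.float32, device="cuda")
quantize_(m, NVFP4InferenceConfig())
x = torch.randn(4, 64, dtype=torch.float32, device="cuda")
y = m(x)
```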
torchao/prototype/mx_formats/inference_workflow.py (6 changes: 0 additions & 6 deletions)

@@ -148,12 +148,6 @@ def _nvfp4_inference_linear_transform(
             f"NVFP4 only supports weight shape with last 2 dims divisible by 16, got {weight.shape}"
         )
 
-    if module.bias is not None and weight.dtype == torch.float32:
-        raise RuntimeError(
-            "Bias is not supported when module weight is in fp32 (out_dtype=Float32). "
-            "Please use bfloat16 or float16 weights, or remove the bias from the linear layer."
-        )
-
     per_tensor_scale = None
     if config.use_dynamic_per_tensor_scale:
         tensor_amax = torch.max(torch.abs(weight))
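The guard above can be removed because the addmm dispatch (next file) now adds the bias outside the scaled matmul when the output dtype is float32. A minimal illustration of that split, using plain float32 tensors rather than the real NVFP4 kernels; shapes and names here are not from the PR:

```python
import torch

# Illustrative float32 tensors only.
x = torch.randn(4, 16, dtype=torch.float32)
w = torch.randn(8, 16, dtype=torch.float32)
bias = torch.randn(8, dtype=torch.float32)

# Rather than asking the scaled matmul to fuse the bias (rejected when
# out_dtype is Float32), run the matmul without bias and add it afterwards.
out = x @ w.t()
out = out + bias
```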
torchao/prototype/mx_formats/nvfp4_tensor.py (12 changes: 9 additions & 3 deletions)

@@ -470,11 +470,17 @@ def _addmm_nvfp4_dispatch(
         assert b.per_tensor_scale is None and a.per_tensor_scale is None
         scale_result = None
 
-    # THIS IS A WORKAROUND:
-    # RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling
+    # THIS IS A WORKAROUND FOR TWO ERRORS:
+    #
+    # (1) RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling
     # When we have per-tensor scaling, we need to apply it before bias
     # since bias is not quantized
-    should_add_bias_separately = (scale_result is not None) and (bias is not None)
+    #
+    # (2) RuntimeError: Bias is not supported when out_dtype is set to Float32
+    # This is not supported by _scaled_mm
+    should_add_bias_separately = (
+        scale_result is not None or a._orig_dtype == torch.float32
+    ) and (bias is not None)
     # should_add_bias_separately = bias is not None
 
     result = torch._scaled_mm(
Review thread on the "(2) RuntimeError: Bias is not supported when out_dtype is set to Float32" comment line:

Contributor: Okay, this is what I thought would happen.

Contributor (author): Yeah, but this only happens if per_tensor_scale=None (by default it is not), so users generally won't run into the _scaled_mm error. Either way, this PR fixes that case.

Contributor: Can you just add a note somewhere on the conversion/casting path for these paths?
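A minimal sketch of the decision this hunk implements; the helper name is hypothetical and only mirrors the logic shown in the diff:

```python
import torch

def _should_split_bias(scale_result, bias, orig_dtype):
    # Add the bias outside torch._scaled_mm when either
    #   (1) a per-tensor scale must be applied first (the bias is not quantized), or
    #   (2) the original dtype is float32, since _scaled_mm rejects a bias
    #       together with a Float32 out_dtype.
    return (scale_result is not None or orig_dtype == torch.float32) and (
        bias is not None
    )

bias = torch.zeros(8)
# fp32 path with a bias: the bias is deferred to a separate add after the matmul.
assert _should_split_bias(scale_result=None, bias=bias, orig_dtype=torch.float32)
# bf16 path without a per-tensor scale: the bias can be passed to the matmul directly.
assert not _should_split_bias(scale_result=None, bias=bias, orig_dtype=torch.bfloat16)
```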