59 changes: 59 additions & 0 deletions csrc/activation_kernels.cu
@@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param(
}
}

template <typename T>
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
float alpha, float limit) {
// clamp gate: min=None, max=limit
const float gate_f = (float)gate;
const float clamped_gate = gate_f > limit ? limit : gate_f;

// clamp up: min=-limit, max=limit
const float up_f = (float)up;
const float clamped_up =
up_f > limit ? limit : (up_f < -limit ? -limit : up_f);

// glu = gate * sigmoid(gate * alpha)
const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
const float glu = clamped_gate * sigmoid_val;

// (up + 1) * glu
return (T)((clamped_up + 1.0f) * glu);
}

template <typename scalar_t,
scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
const float)>
__global__ void swigluoai_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d, const float alpha, const float limit) {
const int64_t token_idx = blockIdx.x;
// TODO: Vectorize loads and stores.
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
// gate = x[..., ::2] (even indices)
const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
// up = x[..., 1::2] (odd indices)
const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);

out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
}
}
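For readers following the indexing, here is a minimal PyTorch sketch of what this kernel computes per token, assuming the interleaved [g0, u0, g1, u1, ...] packing used above; it mirrors the forward_native method added later in vllm/model_executor/layers/activation.py, and the shapes are illustrative:

import torch

def swigluoai_and_mul_ref(x: torch.Tensor, alpha: float = 1.702,
                          limit: float = 7.0) -> torch.Tensor:
    # x: [num_tokens, 2 * d], gate/up interleaved along the last dim.
    gate = x[..., 0::2].float()           # x[t, 2*i]     -> gate
    up = x[..., 1::2].float()             # x[t, 2*i + 1] -> up
    gate = gate.clamp(max=limit)          # clamp gate from above only
    up = up.clamp(min=-limit, max=limit)  # clamp up to [-limit, limit]
    glu = gate * torch.sigmoid(gate * alpha)
    return ((up + 1.0) * glu).to(x.dtype)  # out: [num_tokens, d]

x = torch.randn(4, 2 * 128, dtype=torch.bfloat16)
out = swigluoai_and_mul_ref(x)             # shape [4, 128]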
Comment on lines +154 to +168

Contributor (critical):

The memory access pattern in swigluoai_and_mul_kernel assumes an interleaved memory layout for the gate and up values ([g0, u0, g1, u1, ...]). However, the other _and_mul activation kernels in this file, and their usage in fused_moe, expect a concatenated layout ([g0, g1, ..., u0, u1, ...]). This inconsistency will lead to incorrect data being read and wrong results when the kernel is used in fused_moe. The kernel should be updated to use a concatenated memory layout, similar to act_and_mul_kernel.

__global__ void swigluoai_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., 2 * d]
    const int d, const float alpha, const float limit) {
  const int64_t token_idx = blockIdx.x;
  // TODO: Vectorize loads and stores.
  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + idx]);
    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
  }
}
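To make the layout question concrete, a small sketch (values are illustrative) showing that the interleaved and concatenated conventions read different elements from the same buffer, so the kernel and its callers must agree on one of them:

import torch

d = 4
gate = torch.arange(0, d, dtype=torch.float32)       # g0..g3
up = torch.arange(10, 10 + d, dtype=torch.float32)   # u0..u3

interleaved = torch.stack([gate, up], dim=-1).reshape(-1)  # [g0, u0, g1, u1, ...]
concatenated = torch.cat([gate, up])                       # [g0, g1, ..., u0, u1, ...]

# Decoding with the matching convention recovers the same gate/up pairs.
assert torch.equal(interleaved[0::2], concatenated[:d])    # gates
assert torch.equal(interleaved[1::2], concatenated[d:])    # ups

# Decoding an interleaved buffer with the concatenated convention
# pairs the wrong elements:
print(interleaved[:d])  # tensor([ 0., 10.,  1., 11.]) -- gates and ups mixed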

} // namespace vllm

#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
@@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param(
PARAM); \
});

#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \
vllm::swigluoai_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d, ALPHA, \
LIMIT); \
});
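A quick worked example of the launch parameters this macro derives; the shape is an illustrative assumption, not taken from this PR:

# input: [num_tokens, 2 * d]
num_tokens, d = 4096, 2880
grid = (num_tokens,)                  # one CUDA block per token
block = (min(d, 1024),)               # capped at 1024 threads per block
iters_per_thread = -(-d // block[0])  # ceil(2880 / 1024) = 3 strided loop iterations
print(grid, block, iters_per_thread)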

void fatrelu_and_mul(torch::Tensor& out, // [..., d],
torch::Tensor& input, // [..., 2 * d]
double threshold) {
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
}
void swigluoai_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., 2 * d]
double alpha, double limit) {
LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit);
}
namespace vllm {

// Element-wise activation kernel template.
2 changes: 2 additions & 0 deletions csrc/ops.h
@@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);

void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
double threshold);
void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input,
double alpha = 1.702, double limit = 7.0);

void gelu_new(torch::Tensor& out, torch::Tensor& input);

6 changes: 6 additions & 0 deletions csrc/torch_bindings.cpp
@@ -130,6 +130,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()");
ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul);

ops.def(
"swigluoai_and_mul(Tensor! out, Tensor input, float alpha=1.702, float "
"limit=7.0) "
"-> ()");
ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul);

// GELU implementation used in GPT-2.
ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_new", torch::kCUDA, &gelu_new);
45 changes: 39 additions & 6 deletions tests/kernels/core/test_activation.py
@@ -11,7 +11,7 @@
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
GeluAndMul, MulAndSilu,
NewGELU, QuickGELU,
SiluAndMul)
SiluAndMul, SwigluOAIAndMul)
from vllm.platforms import current_platform

DTYPES = [torch.half, torch.bfloat16, torch.float]
Expand All @@ -25,7 +25,15 @@

@pytest.mark.parametrize(
"activation",
["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"])
[
"silu_and_mul",
"mul_and_silu",
"gelu",
"gelu_tanh",
"fatrelu",
"swigluoai_and_mul",
],
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@@ -59,18 +67,43 @@ def test_act_and_mul(
threshold = random.uniform(0, 1)
layer = FatreluAndMul(threshold)
fn = torch.ops._C.fatrelu_and_mul
elif activation == "swigluoai_and_mul":
layer = SwigluOAIAndMul()
fn = torch.ops._C.swigluoai_and_mul
out = layer(x)
ref_out = layer.forward_native(x)
# The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
# equivalent to the native PyTorch implementations, so we can do exact
# comparison.
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
if activation == "swigluoai_and_mul":

rtol = {
# For fp16, relax the relative tolerance from the default 1e-3 to 2e-3
torch.float16:
2e-3,
torch.bfloat16:
2e-2,
torch.float:
1.3e-6
}

def _get_rtol(output) -> float:
return rtol[output.dtype]

torch.testing.assert_close(out,
ref_out,
atol=get_default_atol(out),
rtol=_get_rtol(out))
else:
# The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are
# equivalent to the native PyTorch implementations, so we can do exact
# comparison.
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)

d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
if activation == "fatrelu":
opcheck(fn, (out, x, threshold))
elif activation == "swigluoai_and_mul":
opcheck(fn, (out, x, layer.alpha, layer.limit))
else:
opcheck(fn, (out, x))

41 changes: 38 additions & 3 deletions vllm/model_executor/layers/activation.py
@@ -239,6 +239,35 @@ def extra_repr(self) -> str:
return f'approximate={repr(self.approximate)}'


@CustomOp.register("swigluoai_and_mul")
class SwigluOAIAndMul(CustomOp):
# https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110
def __init__(self, alpha: float = 1.702, limit: float = 7.0):
super().__init__()
self.alpha = alpha
self.limit = limit

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""

gate, up = x[..., ::2], x[..., 1::2]
gate = gate.clamp(min=None, max=self.limit)
up = up.clamp(min=-self.limit, max=self.limit)
glu = gate * torch.sigmoid(gate * self.alpha)
gated_output = (up + 1) * glu
return gated_output
Comment on lines +250 to +258

Contributor (high):

The forward_native implementation assumes an interleaved memory layout for gate and up tensors by using x[..., ::2] and x[..., 1::2]. This is inconsistent with other similar activation functions in vLLM which use a concatenated layout. To ensure correctness and consistency, especially for testing against the CUDA kernel (which should also be updated), this should be changed to split the tensor along the last dimension.

Suggested change (replacing the interleaved slicing above with a split along the last dimension):

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        d = x.shape[-1] // 2
        gate, up = x[..., :d], x[..., d:]
        gate = gate.clamp(min=None, max=self.limit)
        up = up.clamp(min=-self.limit, max=self.limit)
        glu = gate * torch.sigmoid(gate * self.alpha)
        gated_output = (up + 1) * glu
        return gated_output

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit)
return out

def extra_repr(self) -> str:
return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}"


@CustomOp.register("gelu_new")
class NewGELU(CustomOp):

@@ -330,6 +359,7 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
return torch.square(F.relu(x))

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
# TODO: implement CUDA kernels
return self.forward_native(x)


@@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module:


_ACTIVATION_AND_MUL_REGISTRY = LazyDict({
"gelu": lambda: GeluAndMul(),
"silu": lambda: SiluAndMul(),
"geglu": lambda: GeluAndMul(),
"gelu":
lambda: GeluAndMul(),
"silu":
lambda: SiluAndMul(),
"geglu":
lambda: GeluAndMul(),
"swigluoai":
lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs),
})


22 changes: 5 additions & 17 deletions vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -169,25 +169,13 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
if activation == "silu":
torch.ops._C.silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, 2 * N))
elif activation == "swiglu_oai":
# NOTE: in gpt-oss, the gate_proj and up_proj is interleaved
# - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2]
# - origin: gate, up = gate_up[..., :N], gate_up[..., N:]

@torch.compile(dynamic=True)
def swiglu_oai(gate_up):
alpha = 1.702
limit = 7.0
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
gate = gate.clamp(min=None, max=limit)
up = up.clamp(min=-limit, max=limit)
glu = gate * torch.sigmoid(gate * alpha)
return (up + 1) * glu

intermediate_cache2 = swiglu_oai(intermediate_cache1)
elif activation == "swigluoai":
Collaborator (Author):
@simon-mo Compared to the previous #22538, the main modification is here.

Member:
Can we replace the torch.compile op here? Hopefully it is faster :)

Collaborator (Author):
Already replaced. I will keep optimizing this CUDA kernel later, so I haven't run benchmarks yet.
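To follow up on the question about speed, a minimal microbenchmark sketch comparing the removed torch.compile implementation with the new fused op; the shape and iteration count are illustrative assumptions, not measurements from this PR:

import torch

@torch.compile(dynamic=True)
def swiglu_oai_compiled(gate_up, alpha=1.702, limit=7.0):
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
    gate = gate.clamp(min=None, max=limit)
    up = up.clamp(min=-limit, max=limit)
    glu = gate * torch.sigmoid(gate * alpha)
    return (up + 1) * glu

def bench(fn, iters=100):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()                        # warmup / trigger compilation
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # ms per call

x = torch.randn(4096, 2 * 2880, dtype=torch.bfloat16, device="cuda")
out = torch.empty(4096, 2880, dtype=x.dtype, device=x.device)

t_compile = bench(lambda: swiglu_oai_compiled(x))
t_fused = bench(lambda: torch.ops._C.swigluoai_and_mul(out, x))
print(f"torch.compile: {t_compile:.3f} ms, fused kernel: {t_fused:.3f} ms")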

# alpha = 1.702, limit = 7.0
torch.ops._C.swigluoai_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, 2 * N))
else:
raise ValueError(f"Unsupported activation: {activation}. "
"Only silu and swiglu_oai activations are supported.")
"Only silu and swigluoai activations are supported.")

if expert_map is not None:
intermediate_cache3.zero_()
18 changes: 5 additions & 13 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1628,31 +1628,23 @@ def fused_experts_impl(
block_shape=block_shape,
B_bias=w1_bias)

# TODO fused kernel
def swiglu_oai(gate_up):
alpha = 1.702
limit = 7.0
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
gate = gate.clamp(min=None, max=limit)
up = up.clamp(min=-limit, max=limit)
glu = gate * torch.sigmoid(gate * alpha)
gated_output = (up + 1) * glu
return gated_output

# Activation function with multiplication
if activation == "silu" and is_act_and_mul:
torch.ops._C.silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
elif activation == "gelu" and is_act_and_mul:
torch.ops._C.gelu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
elif activation == "swigluoai" and is_act_and_mul:
# alpha = 1.702, limit = 7.0
torch.ops._C.swigluoai_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
# Activation function without multiplication
elif activation == "silu":
intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
elif activation == "gelu":
intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N))
elif activation == "swiglu_oai":
intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N))

else:
raise ValueError(f"Unsupported FusedMoe activation: {activation}, "
f"with is_act_and_mul={is_act_and_mul}.")
@@ -61,14 +61,14 @@ def _can_support_mxfp4(use_grouped_topk: bool = False,
e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
scoring_func: str = "softmax",
activation: str = "swiglu_oai",
activation: str = "swigluoai",
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None):
return not (use_grouped_topk or topk_group or num_expert_group
or expert_map or custom_routing_function
or e_score_correction_bias or apply_router_weight_on_input
or scoring_func != "softmax" or activation != "swiglu_oai"
or scoring_func != "softmax" or activation != "swigluoai"
or expert_load_view or logical_to_physical_map
or logical_replica_count)

2 changes: 1 addition & 1 deletion vllm/model_executor/models/gpt_oss.py
@@ -159,7 +159,7 @@ def __init__(
prefix=f"{prefix}.experts",
apply_router_weight_on_input=False,
has_bias=True,
activation="swiglu_oai")
activation="swigluoai")

def forward(self, x: torch.Tensor) -> torch.Tensor:
t = self.norm(x)