27 changes: 23 additions & 4 deletions python/paddle/nn/functional/activation.py
@@ -14,7 +14,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

import paddle
from paddle import _C_ops, in_dynamic_mode
@@ -150,14 +150,18 @@ def elu_(x: Tensor, alpha: float = 1.0, name: str | None = None) -> Tensor:


def gelu(
x: Tensor, approximate: bool = False, name: str | None = None
x: Tensor,
approximate: Literal["tanh", "none"] | bool = False,
name: str | None = None,
) -> Tensor:
r"""
gelu activation.

The GELU activation function is calculated element by element. For more information, refer to :ref:`Gaussian Error Linear Units`.

if approximate is True
The ``approximate`` parameter must be one of True, False, "tanh", or "none".

if approximate is True or "tanh"

.. math::

@@ -171,7 +175,7 @@ def gelu(

Parameters:
x (Tensor): The input Tensor with data type float32, float64.
approximate (bool, optional): Whether to enable approximation. Default is False.
approximate (str|bool, optional): Whether to enable approximation. Accepts True, False, "tanh", or "none"; "tanh" is equivalent to True and "none" to False. Default is False.
name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.

Returns:
@@ -194,8 +198,23 @@ def gelu(
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15880796, 0.34571400],
[ 0.84119201, 1.39957154]])
>>> out3 = F.gelu(x, "none")
>>> print(out3)
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15865529, 0.34573123],
[ 0.84134471, 1.39978933]])
>>> out4 = F.gelu(x, "tanh")
>>> print(out4)
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15880796, 0.34571400],
[ 0.84119201, 1.39957154]])
"""

if approximate == "tanh":
approximate = True
elif approximate == "none":
approximate = False

if in_dynamic_or_pir_mode():
return _C_ops.gelu(x, approximate)
else:
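For reference, the two string values select the exact and tanh-approximated GELU formulas documented in the docstring above. The NumPy sketch below is illustrative only and not part of this change; `gelu_exact` should agree with `F.gelu(x, "none")` / `F.gelu(x, False)`, and `gelu_tanh` with `F.gelu(x, "tanh")` / `F.gelu(x, True)`.

```python
# Illustrative reference only (not part of this PR): NumPy versions of the
# exact and tanh-approximated GELU that "none" / "tanh" select.
import math

import numpy as np


def gelu_exact(x):
    # GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
    return 0.5 * x * (1.0 + np.vectorize(math.erf)(x / np.sqrt(2.0)))


def gelu_tanh(x):
    # GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    return 0.5 * x * (
        1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3)))
    )


x = np.array([[-1.0, 0.5], [1.0, 1.5]], dtype="float32")
print(gelu_exact(x))  # should reproduce the out3 values shown in the docstring
print(gelu_tanh(x))   # should reproduce the out4 values shown in the docstring
```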
30 changes: 26 additions & 4 deletions python/paddle/nn/layer/activation.py
@@ -15,7 +15,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from paddle.framework import get_default_dtype

@@ -176,7 +176,9 @@ class GELU(Layer):
r"""
GELU Activation.

If approximate is True
The ``approximate`` parameter must be one of True, False, "tanh", or "none".

If approximate is True or "tanh"

.. math::

@@ -189,7 +191,7 @@
GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}}))

Parameters:
approximate (bool, optional): Whether to enable approximation. Default is False.
approximate (str|bool, optional): Whether to enable approximation. Accepts True, False, "tanh", or "none"; "tanh" is equivalent to True and "none" to False. Default is False.
name (str|None, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.

@@ -208,6 +210,24 @@ class GELU(Layer):
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15865529, 0.34573123],
[ 0.84134471, 1.39978933]])
>>> m = paddle.nn.GELU(False)
>>> out = m(x)
>>> print(out)
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15865529, 0.34573123],
[ 0.84134471, 1.39978933]])
>>> m = paddle.nn.GELU("none")
>>> out = m(x)
>>> print(out)
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15865529, 0.34573123],
[ 0.84134471, 1.39978933]])
>>> m = paddle.nn.GELU("tanh")
>>> out = m(x)
>>> print(out)
Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-0.15880796, 0.34571400],
[ 0.84119201, 1.39957154]])
>>> m = paddle.nn.GELU(True)
>>> out = m(x)
>>> print(out)
@@ -217,7 +237,9 @@ class GELU(Layer):
"""

def __init__(
self, approximate: bool = False, name: str | None = None
self,
approximate: Literal["tanh", "none"] | bool = False,
name: str | None = None,
) -> None:
super().__init__()
self._approximate = approximate
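As a quick usage illustration (not part of the diff; the layer names and sizes here are arbitrary), the new string form can be passed anywhere the boolean was previously accepted, for example when composing a model:

```python
import paddle
from paddle import nn

# Arbitrary toy MLP, for illustration only: "tanh" behaves like
# approximate=True, "none" like the default approximate=False.
model = nn.Sequential(
    nn.Linear(8, 16),
    nn.GELU("tanh"),
    nn.Linear(16, 4),
    nn.GELU("none"),
)

x = paddle.randn([2, 8])
print(model(x).shape)  # [2, 4]
```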
152 changes: 123 additions & 29 deletions test/legacy_test/test_gelu_op.py
@@ -20,10 +20,14 @@
import paddle
import paddle.base.dygraph as dg
import paddle.nn.functional as F
from paddle import base
from paddle import base, nn


def gelu(x, approximate):
if approximate == "tanh":
approximate = True
if approximate == "none":
approximate = False
if approximate:
y_ref = (
0.5
@@ -46,9 +50,14 @@ def _test_case1_cpu(self, approximate):
place = base.CPUPlace()
with dg.guard(place) as g:
x_var = paddle.to_tensor(x)
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
y_var1 = F.gelu(x_var, approximate)
y_test1 = y_var1.numpy()

func = nn.GELU(approximate)
y_var2 = func(x_var)
y_test2 = y_var2.numpy()
np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)

def _test_case1_gpu(self, approximate):
x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float32)
@@ -57,12 +66,17 @@ def _test_case1_gpu(self, approximate):
place = base.CUDAPlace(0)
with dg.guard(place) as g:
x_var = paddle.to_tensor(x)
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
y_var1 = F.gelu(x_var, approximate)
y_test1 = y_var1.numpy()

func = nn.GELU(approximate)
y_var2 = func(x_var)
y_test2 = y_var2.numpy()
np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)

def test_cases(self):
for approximate in [True, False]:
for approximate in [True, False, "none", "tanh"]:
self._test_case1_cpu(approximate)
if base.is_compiled_with_cuda():
self._test_case1_gpu(approximate)
@@ -86,15 +100,36 @@ def run_gelu_op(approximate):
x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
return y.numpy(), x_grad.numpy()

def run_gelu_class(approximate):
with dg.guard():
x = paddle.to_tensor(x_np)
x.stop_gradient = False
func = nn.GELU(approximate=approximate)
y = func(x)
x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
return y.numpy(), x_grad.numpy()

use_fast_math(True)
y_fast_math, x_g_fast_math = run_gelu_op(True)
y_fast_math1, x_g_fast_math1 = run_gelu_op(True)
y_fast_math2, x_g_fast_math2 = run_gelu_class(True)
use_fast_math(False)

y_ref, x_g_ref = run_gelu_op(True)
np.testing.assert_allclose(y_ref, y_fast_math, rtol=1e-05, atol=0.0005)
y_ref1, x_g_ref1 = run_gelu_op(True)
y_ref2, x_g_ref2 = run_gelu_class(True)
np.testing.assert_allclose(
y_ref1, y_fast_math1, rtol=1e-05, atol=0.0005
)

np.testing.assert_allclose(
x_g_ref1, x_g_fast_math1, rtol=1e-05, atol=0.0005
)

np.testing.assert_allclose(
y_ref2, y_fast_math2, rtol=1e-05, atol=0.0005
)

np.testing.assert_allclose(
x_g_ref, x_g_fast_math, rtol=1e-05, atol=0.0005
x_g_ref2, x_g_fast_math2, rtol=1e-05, atol=0.0005
)


@@ -105,38 +140,97 @@ def _test_case1_cpu(self, approximate):

place = base.CPUPlace()
with dg.guard(place) as g:
x_var = paddle.to_tensor(x)
x_var.stop_gradient = False
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
x_var1 = paddle.to_tensor(x)
x_var2 = paddle.to_tensor(x)

x_var1.stop_gradient = False
x_var2.stop_gradient = False

y_var1 = F.gelu(x_var1, approximate)
y_test1 = y_var1.numpy()

func = nn.GELU(approximate)
y_var2 = func(x_var2)
y_test2 = y_var2.numpy()

loss = paddle.sum(y_var)
loss.backward()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var.grad.shape, x_var.shape)
loss1 = paddle.sum(y_var1)
loss1.backward()

loss2 = paddle.sum(y_var2)
loss2.backward()
np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var1.grad.shape, x_var1.shape)

np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var2.grad.shape, x_var2.shape)

def _test_case1_gpu(self, approximate):
x = np.random.uniform(-1, 1, size=(0, 17)).astype(np.float32)
y_ref = gelu(x, approximate)

place = base.CUDAPlace(0)
with dg.guard(place) as g:
x_var = paddle.to_tensor(x)
x_var.stop_gradient = False
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
x_var1 = paddle.to_tensor(x)
x_var2 = paddle.to_tensor(x)

x_var1.stop_gradient = False
x_var2.stop_gradient = False

y_var1 = F.gelu(x_var1, approximate)
y_test1 = y_var1.numpy()

loss = paddle.sum(y_var)
loss.backward()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var.grad.shape, x_var.shape)
func = nn.GELU(approximate)
y_var2 = func(x_var2)
y_test2 = y_var2.numpy()

loss1 = paddle.sum(y_var1)
loss1.backward()

loss2 = paddle.sum(y_var2)
loss2.backward()
np.testing.assert_allclose(y_ref, y_test1, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var1.grad.shape, x_var1.shape)

np.testing.assert_allclose(y_ref, y_test2, rtol=1e-05, atol=1e-08)
np.testing.assert_allclose(x_var2.grad.shape, x_var2.shape)

def test_cases(self):
for approximate in [True, False]:
for approximate in [True, False, "none", "tanh"]:
self._test_case1_cpu(approximate)
if base.is_compiled_with_cuda():
self._test_case1_gpu(approximate)


class TestGeluError(unittest.TestCase):

def setUp(self):
x = np.random.uniform(-1, 1, size=(11, 17)).astype(np.float32)
self.x = paddle.to_tensor(x)

def test_gelu_op_error(self):

def test_type_error1():
y = F.gelu(self.x, "tan")

def test_type_error2():
y = F.gelu(self.x, 1234)

self.assertRaises(TypeError, test_type_error1)
self.assertRaises(TypeError, test_type_error2)

def test_gelu_class_error(self):

def test_type_error1():
func = nn.GELU("tan")
y = func(self.x)

def test_type_error2():
func = nn.GELU(1234)
y = func(self.x)

self.assertRaises(TypeError, test_type_error1)
self.assertRaises(TypeError, test_type_error2)


if __name__ == '__main__':
unittest.main()