Add nn.functional.sparse_attention and some test cases, test=develop #35757
Changes from all commits
089e738 fd26a39 d2c0cc4 47828d2 c21c203 9be4aac 3299354 1bf2601 23d6756 64599d1 0ea0f11 a967b86 b49797d
|  | @@ -16,10 +16,13 @@ | |
| import numpy as np | ||
| from op_test import OpTest | ||
| import paddle.fluid.core as core | ||
| from paddle.static import Program, program_guard | ||
| import paddle | ||
| import paddle.fluid as fluid | ||
| import paddle.fluid.framework as framework | ||
| import paddle.nn.functional as F | ||
| import os | ||
| import re | ||
| import platform | ||
|  | ||
|  | ||
| def get_cuda_version(): | ||
|  | @@ -34,22 +37,6 @@ def get_cuda_version(): | |
| return -1 | ||
|  | ||
|  | ||
| def get_linux_platform(): | ||
| if platform.system().lower() == 'windows': | ||
| return 0 | ||
| elif platform.system().lower() == 'linux': | ||
| return 1 | ||
| else: | ||
| return -1 | ||
|  | ||
|  | ||
| def get_suitable_env(): | ||
| if get_cuda_version() >= 11020 and get_linux_platform() == 1: | ||
| return True | ||
| else: | ||
| return False | ||
|  | ||
|  | ||
| def softmax(x): | ||
| max = np.max(x, axis=1, keepdims=True) | ||
| e_x = np.exp(x - max) | ||
|  | @@ -141,8 +128,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): | |
|  | ||
|  | ||
| @unittest.skipIf( | ||
| not core.is_compiled_with_cuda() or get_suitable_env() == False, | ||
| "core is not compiled with CUDA and cuda version need >= 11.2 in windows") | ||
| not core.is_compiled_with_cuda() or get_cuda_version() < 11020, | ||
| "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" | ||
| ) | ||
| class TestSparseAttentionOp(OpTest): | ||
| def config(self): | ||
| self.shape = (1, 1, 16, 8) | ||
|  | @@ -201,5 +189,130 @@ def config(self): | |
| self.dtype = "float64" | ||
|  | ||
|  | ||
| @unittest.skipIf( | ||
| not core.is_compiled_with_cuda() or get_cuda_version() < 11020, | ||
| "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" | ||
| ) | ||
| class TestSparseAttentionAPI(unittest.TestCase): | ||
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (1, 1, 8, 4) | ||
| self.blocksize = 2 | ||
| self.dtype = 'float64' | ||
|  | ||
| def test_static_graph(self): | ||
| paddle.enable_static() | ||
| with paddle.static.program_guard(paddle.static.Program()): | ||
| Q = paddle.static.data(name="Q", shape=self.shape, dtype=self.dtype) | ||
| K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) | ||
| V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) | ||
|  | ||
| batch_size, num_heads, rows = self.shape[0], self.shape[ | ||
| 1], self.shape[2] | ||
| block_num = rows // self.blocksize  # integer division: number of full blocks | ||
| block_last = rows % self.blocksize | ||
| sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last | ||
| offset_shape = (batch_size, num_heads, rows + 1) | ||
| columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) | ||
|  | ||
| offset = paddle.static.data( | ||
| name="Offset", shape=offset_shape, dtype="int32") | ||
| columns = paddle.static.data( | ||
| name="Columns", shape=columns_shape, dtype="int32") | ||
| Out = F.sparse_attention(Q, K, V, offset, columns) | ||
|  | ||
| Q_np = np.random.random(self.shape).astype(self.dtype) | ||
| K_np = np.random.random(self.shape).astype(self.dtype) | ||
| V_np = np.random.random(self.shape).astype(self.dtype) | ||
| offset_np, columns_np = init_csr_format( | ||
| self.shape[0], self.shape[1], self.shape[2], self.blocksize) | ||
| offset_np = offset_np.astype('int32') | ||
| columns_np = columns_np.astype('int32') | ||
|  | ||
| exe = fluid.Executor(self.place) | ||
| fetches_result = exe.run(feed={ | ||
| "Q": Q_np, | ||
| "K": K_np, | ||
| "V": V_np, | ||
| "Offset": offset_np, | ||
| "Columns": columns_np | ||
| }, | ||
| fetch_list=[Out]) | ||
| expected_result, __, __ = ref_batch_sparse_attention( | ||
| Q_np, K_np, V_np, offset_np, columns_np) | ||
|  | ||
| self.assertTrue( | ||
| np.allclose( | ||
| fetches_result, expected_result, atol=1e-5)) | ||
|  | ||
| def test_dygraph(self): | ||
| paddle.disable_static() | ||
| offset, columns = init_csr_format(self.shape[0], self.shape[1], | ||
| self.shape[2], self.blocksize) | ||
| offset = offset.astype('int32') | ||
| columns = columns.astype('int32') | ||
| query = np.random.random(self.shape).astype(self.dtype) | ||
| key = np.random.random(self.shape).astype(self.dtype) | ||
| value = np.random.random(self.shape).astype(self.dtype) | ||
|  | ||
| paddle_query = paddle.to_tensor(query, place=self.place) | ||
| paddle_key = paddle.to_tensor(key, place=self.place) | ||
| paddle_value = paddle.to_tensor(value, place=self.place) | ||
| paddle_offset = paddle.to_tensor(offset, place=self.place) | ||
| paddle_columns = paddle.to_tensor(columns, place=self.place) | ||
|  | ||
| paddle_result = F.sparse_attention(paddle_query, paddle_key, | ||
| paddle_value, paddle_offset, | ||
| paddle_columns) | ||
|  | ||
| numpy_result, __, __ = ref_batch_sparse_attention(query, key, value, | ||
| offset, columns) | ||
| numpy_result = numpy_result.astype(self.dtype) | ||
|  | ||
| self.assertTrue( | ||
| np.allclose( | ||
| paddle_result.numpy(), numpy_result, atol=1e-5)) | ||
|  | ||
|  | ||
| class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): | ||
Review comment: Shapes that are not powers of two are supported, right?
Reply: Yes, they are supported; a non-power-of-two shape test has been added to the unit tests.
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (2, 2, 8, 4) | ||
| self.blocksize = 2 | ||
| self.dtype = 'float32' | ||
|  | ||
|  | ||
| class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): | ||
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (2, 2, 64, 32) | ||
| self.blocksize = 2 | ||
| self.dtype = 'float64' | ||
|  | ||
|  | ||
| class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): | ||
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (2, 1, 64, 32) | ||
| self.blocksize = 2 | ||
| self.dtype = 'float64' | ||
|  | ||
|  | ||
| class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): | ||
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (4, 4, 128, 32) | ||
| self.blocksize = 8 | ||
| self.dtype = 'float64' | ||
|  | ||
|  | ||
| class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): | ||
| def setUp(self): | ||
| self.place = paddle.CUDAPlace(0) | ||
| self.shape = (3, 3, 35, 15) | ||
| self.blocksize = 3 | ||
| self.dtype = 'float64' | ||
|  | ||
|  | ||
| if __name__ == '__main__': | ||
| unittest.main() | ||
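For reference, here is a minimal sketch of the block-diagonal CSR layout that `init_csr_format` appears to produce (its body is not shown in this diff, so the helper name `blocked_csr_layout` and the exact layout are assumptions). The non-zero count matches the `sparse_nnz_num` arithmetic in `test_static_graph`: `block_num` full blocks of `blocksize * blocksize` entries, plus one trailing `block_last * block_last` block when `rows` is not a multiple of `blocksize`.

```python
import numpy as np

def blocked_csr_layout(batch_size, num_heads, rows, blocksize):
    # Hypothetical helper: block-diagonal mask with full `blocksize` blocks
    # and one smaller trailing block when rows % blocksize != 0.
    block_num = rows // blocksize
    block_last = rows % blocksize
    nnz = block_num * blocksize * blocksize + block_last * block_last

    offset = np.zeros(rows + 1, dtype=np.int32)   # CSR row pointer
    columns = np.zeros(nnz, dtype=np.int32)       # CSR column indices
    pos = 0
    for row in range(rows):
        block_start = (row // blocksize) * blocksize
        width = blocksize if row < block_num * blocksize else block_last
        columns[pos:pos + width] = np.arange(block_start, block_start + width)
        pos += width
        offset[row + 1] = pos

    # The same layout is replicated for every (batch, head) pair.
    return (np.tile(offset, (batch_size, num_heads, 1)),
            np.tile(columns, (batch_size, num_heads, 1)))

offset, columns = blocked_csr_layout(1, 1, 8, 2)
print(offset.shape, columns.shape)  # (1, 1, 9) (1, 1, 16)
```

For the non-power-of-two case exercised by `TestSparseAttentionAPITestShape4` (rows=35, blocksize=3), this yields 11 * 9 + 2 * 2 = 103 non-zeros, which is why integer division (`rows // self.blocksize`) matters in the nnz computation above.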
| @@ -0,0 +1,144 @@ | ||
| # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|  | ||
| import warnings | ||
| import paddle | ||
| from ...fluid.framework import in_dygraph_mode, default_main_program | ||
| from paddle.fluid.layer_helper import LayerHelper | ||
| from paddle import _C_ops | ||
|  | ||
|  | ||
| def sparse_attention(query, | ||
| key, | ||
| value, | ||
| sparse_csr_offset, | ||
| sparse_csr_columns, | ||
| name=None): | ||
| r""" | ||
| This operator sparsifies the attention matrix in the Transformer module | ||
| to reduce memory consumption and computation. | ||
| The sparse layout is expressed in CSR format and contains two parameters, | ||
| ``offset`` and ``columns``. | ||
|  | ||
| .. math:: | ||
|  | ||
| result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V | ||
|  | ||
| where ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. | ||
| The dimensions of the three parameters are the same. | ||
| ``d`` represents the size of the last dimension of the three parameters. | ||
|  | ||
| Parameters: | ||
| query(Tensor): The query tensor in the Attention module. | ||
| It's a 4-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. | ||
| The dtype can be ``float32`` or ``float64``. | ||
Review comment: Since the shape is fixed at 4-D, the description can state the dimensions directly.
Reply: Done.
| key(Tensor): The key tensor in the Attention module. | ||
| It's a 4-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. | ||
| The dtype can be ``float32`` or ``float64``. | ||
| value(Tensor): The value tensor in the Attention module. | ||
| It's a 4-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. | ||
| The dtype can be ``float32`` or ``float64``. | ||
| sparse_csr_offset(Tensor): The sparsity feature in the Attention module | ||
| is expressed in the CSR format; the offset is the CSR row pointer, in which | ||
| entry ``i`` gives the number of non-zero elements in the first ``i`` rows of the matrix. | ||
| It's a 3-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, seq\_len + 1]`. | ||
| The dtype should be ``int32``. | ||
| sparse_csr_columns(Tensor): The sparsity feature in the Attention module | ||
| is expressed in the CSR format, and the columns represent | ||
| the column index values of non-zero elements in the matrix. | ||
| It's a 3-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, sparse\_nnz]`. | ||
| The dtype should be ``int32``. | ||
| name(str, optional): The default value is None. Normally there is no need for user | ||
| to set this property. For more information, please refer to | ||
| :ref:`api_guide_Name`. | ||
|  | ||
| Returns: | ||
| A Tensor holding the result of the attention computation. | ||
| It's a 4-D tensor with a shape of | ||
| :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. | ||
| The dtype can be ``float32`` or ``float64``. | ||
|  | ||
| Examples: | ||
| .. code-block:: python | ||
|  | ||
| # required: skiptest | ||
| import paddle | ||
| import numpy as np | ||
|  | ||
| query_data = np.array([[[[0, 1,], [2, 3], | ||
| [ 0, 1], [2, 3]]]]).astype("float32") | ||
| key_data = np.array([[[[0, 1,], [2, 3], | ||
| [ 0, 1], [2, 3]]]]).astype("float32") | ||
| value_data = np.array([[[[0, 1,], [2, 3], | ||
| [ 0, 1], [2, 3]]]]).astype("float32") | ||
| sparse_csr_offset_data = np.array([[[0, 2, | ||
| 4, 6, 8]]]).astype("int32") | ||
| sparse_csr_columns_data = np.array([[[0, 1, | ||
| 0, 1, 2, 3, 2, 3]]]).astype("int32") | ||
| print(query_data.shape) | ||
| # (1, 1, 4, 2) | ||
| print(sparse_csr_offset_data.shape) | ||
| # (1, 1, 5) | ||
| print(sparse_csr_columns_data.shape) | ||
| # (1, 1, 8) | ||
| paddle.disable_static() | ||
| query = paddle.to_tensor(query_data, stop_gradient=False, | ||
| place=paddle.CUDAPlace(0)) | ||
| key = paddle.to_tensor(key_data, stop_gradient=False, | ||
| place=paddle.CUDAPlace(0)) | ||
| value = paddle.to_tensor(value_data, stop_gradient=False, | ||
| place=paddle.CUDAPlace(0)) | ||
| offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, | ||
| place=paddle.CUDAPlace(0)) | ||
| columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, | ||
| place=paddle.CUDAPlace(0)) | ||
| output = paddle.nn.functional.sparse_attention(query, key, | ||
| value, offset, columns) | ||
| print(output) | ||
|  | ||
| # [[[[1.60885942, 2.60885954], | ||
| # [1.99830270, 2.99830270], | ||
| # [1.60885942, 2.60885954], | ||
| # [1.99830270, 2.99830270]]]] | ||
| """ | ||
| if in_dygraph_mode(): | ||
| result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( | ||
| query, key, value, sparse_csr_offset, sparse_csr_columns) | ||
| return result_attention | ||
|  | ||
| helper = LayerHelper('sparse_attention', **locals()) | ||
| dtype = helper.input_dtype(input_param_name='query')  # 'query' is the kwarg captured via **locals() | ||
| out = helper.create_variable_for_type_inference(dtype) | ||
| result_sdd = helper.create_variable_for_type_inference(dtype) | ||
| result_softmax = helper.create_variable_for_type_inference(dtype) | ||
| inputs = { | ||
| 'Q': query, | ||
| 'K': key, | ||
| 'V': value, | ||
| 'Offset': sparse_csr_offset, | ||
| 'Columns': sparse_csr_columns | ||
| } | ||
| outputs = { | ||
| 'Out': out, | ||
| 'SparseDotSdd': result_sdd, | ||
| 'Softmax': result_softmax | ||
| } | ||
| helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs) | ||
| return out | ||
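As a sanity check, the expected output in the docstring example can be reproduced with a short NumPy loop over the CSR layout. This is a sketch, not part of the PR; in the example, query, key, and value happen to hold the same data, so a single array stands in for all three:

```python
import numpy as np

qkv = np.array([[0., 1.], [2., 3.], [0., 1.], [2., 3.]], dtype=np.float32)
offset = np.array([0, 2, 4, 6, 8])            # CSR row pointer
columns = np.array([0, 1, 0, 1, 2, 3, 2, 3])  # CSR column indices
d = qkv.shape[-1]

out = np.zeros_like(qkv)
for row in range(qkv.shape[0]):
    cols = columns[offset[row]:offset[row + 1]]
    # softmax(Q K^T / sqrt(d)) restricted to this row's non-zero columns
    scores = qkv[row] @ qkv[cols].T / np.sqrt(d)
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    out[row] = weights @ qkv[cols]

print(out)
# approximately:
# [[1.6088595 2.6088595]
#  [1.9983027 2.9983027]
#  [1.6088595 2.6088595]
#  [1.9983027 2.9983027]]
```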
Review comment: The function names test_static_result and test_dygraph don't read like a matching pair; if necessary, consider polishing the naming.
Reply: Done, the names have been changed.
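The docstring example above covers dynamic graph mode only. Below is a static-graph usage sketch following the pattern of `test_static_graph` in the unit tests; it requires a CUDA 11.2+ build per the tests' skipIf guards, and the fully dense 4x4 layout (every row keeps all 4 columns) is chosen purely for simplicity of illustration:

```python
import numpy as np
import paddle
import paddle.nn.functional as F

paddle.enable_static()
shape = (1, 1, 4, 2)  # [batch_size, num_heads, seq_len, head_dim]

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    Q = paddle.static.data(name="Q", shape=shape, dtype="float32")
    K = paddle.static.data(name="K", shape=shape, dtype="float32")
    V = paddle.static.data(name="V", shape=shape, dtype="float32")
    # Dense layout for a 4x4 attention matrix: offset = [0, 4, 8, 12, 16],
    # columns = [0, 1, 2, 3] repeated for each of the 4 rows (16 non-zeros).
    offset = paddle.static.data(name="Offset", shape=(1, 1, 5), dtype="int32")
    columns = paddle.static.data(name="Columns", shape=(1, 1, 16), dtype="int32")
    out = F.sparse_attention(Q, K, V, offset, columns)

exe = paddle.static.Executor(paddle.CUDAPlace(0))
result = exe.run(main_prog,
                 feed={
                     "Q": np.random.random(shape).astype("float32"),
                     "K": np.random.random(shape).astype("float32"),
                     "V": np.random.random(shape).astype("float32"),
                     "Offset": np.array([[[0, 4, 8, 12, 16]]], dtype=np.int32),
                     "Columns": np.tile(np.arange(4, dtype=np.int32),
                                        4).reshape(1, 1, 16),
                 },
                 fetch_list=[out])
print(result[0].shape)  # (1, 1, 4, 2)
```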