
Commit e088d36

mp support for fuse attention
cache structure support for fuse attention
1 parent 60b86b2 commit e088d36

4 files changed, +128 -27 lines changed

paddle/fluid/operators/fused/fmha_ref.h

Lines changed: 29 additions & 5 deletions
@@ -15,6 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/transpose_op.cu.h"
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"

 namespace paddle {
@@ -74,35 +76,57 @@ class FMHARef {
                       Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor,
                       Tensor* dropout_mask_out_tensor,
                       Tensor* dropout_out_tensor, Tensor* qktv_out_tensor,
-                      Tensor* fmha_out_tensor) {
+                      Tensor* fmha_out_tensor, const Tensor* cache_k,
+                      const Tensor* cache_v, Tensor* cache_k_out,
+                      Tensor* cache_v_out) {
     // input shape: [bs, seq_len, 3, num_head, head_dim]
-    // transpose with perm [2, 0, 1, 3, 4],
+    // transpose with perm [2, 0, 3, 1, 4],
     // output_shape: [3, bs, num_head, seq_len, head_dim]
     int ndims = 5;
     std::vector<int> perm_1 = {2, 0, 3, 1, 4};
     TransposeGPUKernelDriver<T>(dev_ctx_, ndims, qkv_input_tensor, perm_1,
                                 transpose_2_out_tensor);
-
     T* qkv_data = transpose_2_out_tensor->data<T>();
     T* qk_out_data = qk_out_tensor->data<T>();
     T* qktv_out_data = qktv_out_tensor->data<T>();
     T* softmax_out_data = softmax_out_tensor->data<T>();
     T* dropout_out_data = dropout_out_tensor->data<T>();
     T* fmha_out_data = fmha_out_tensor->data<T>();
+    const T* cache_k_data = cache_k ? cache_k->data<T>() : nullptr;
+    const T* cache_v_data = cache_k ? cache_v->data<T>() : nullptr;
+    int cache_size = 0;
+    int cache_seq_len = 0;
+    if (cache_k) {
+      cache_size = cache_k->numel();
+      cache_seq_len = cache_k->dims()[2];
+    }

     int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_;
     int k_size = q_size;
+    int new_k_size = cache_size + k_size;
     T* q_ptr = qkv_data;
     T* k_ptr = q_ptr + q_size;
     T* v_ptr = k_ptr + k_size;
+    if (cache_k) {
+      std::vector<Tensor> qkv = transpose_2_out_tensor->Split(1, 0);
+      int64_t kdims[4] = {qkv[1].dims()[1], qkv[1].dims()[2], qkv[1].dims()[3],
+                          qkv[1].dims()[4]};
+      qkv[1].Resize(phi::DDim(kdims, 4));
+      qkv[2].Resize(phi::DDim(kdims, 4));
+      phi::funcs::ConcatFunctor<phi::GPUContext, T> concat;
+      concat(dev_ctx_, {*cache_k, qkv[1]}, 2, cache_k_out);
+      concat(dev_ctx_, {*cache_v, qkv[2]}, 2, cache_v_out);
+      k_ptr = cache_k_out->data<T>();
+      v_ptr = cache_v_out->data<T>();
+    }

     // q*k^t, batched_gemm
     CBLAS_TRANSPOSE transA = CblasNoTrans;
     CBLAS_TRANSPOSE transB = CblasTrans;
     auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
     int gemm_batch_size = batch_size_ * num_head_;
     int gemm_m = seq_len_;
-    int gemm_n = seq_len_;
+    int gemm_n = cache_seq_len + seq_len_;
     int gemm_k = head_dim_;
     T alpha = static_cast<T>(1.0 / sqrt(head_dim_));
     T beta = static_cast<T>(0.0);
@@ -133,7 +157,7 @@ class FMHARef {
     transB = CblasNoTrans;
     gemm_m = seq_len_;
     gemm_n = head_dim_;
-    gemm_k = seq_len_;
+    gemm_k = cache_seq_len + seq_len_;
     alpha = static_cast<T>(1.0);
     stride_a = gemm_m * gemm_k;
     stride_b = gemm_k * gemm_n;
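
The cache branch above splits the transposed QKV, concatenates the cached K/V with the current step's K/V along the sequence axis (dim 2), and widens the QK^T GEMM from seq_len to cache_seq_len + seq_len. Below is a minimal NumPy sketch of that computation; the shapes and names are illustrative only, not the kernel itself.

```python
# Sketch of the cache path: concatenate cached K/V with the current K/V along
# the sequence axis (dim 2), then run scaled dot-product attention.
import numpy as np

bs, num_head, head_dim = 2, 4, 8
cache_seq_len, seq_len = 16, 1          # decoding one new token per step

q = np.random.randn(bs, num_head, seq_len, head_dim)
k = np.random.randn(bs, num_head, seq_len, head_dim)
v = np.random.randn(bs, num_head, seq_len, head_dim)
cache_k = np.random.randn(bs, num_head, cache_seq_len, head_dim)
cache_v = np.random.randn(bs, num_head, cache_seq_len, head_dim)

# Mirrors concat(dev_ctx_, {*cache_k, qkv[1]}, 2, cache_k_out) above.
k_all = np.concatenate([cache_k, k], axis=2)   # [bs, num_head, cache+seq, head_dim]
v_all = np.concatenate([cache_v, v], axis=2)

# q*k^t: the last dimension becomes cache_seq_len + seq_len, as in the
# modified BatchedGEMM (gemm_n = cache_seq_len + seq_len_).
qk = q @ k_all.transpose(0, 1, 3, 2) / np.sqrt(head_dim)
attn = np.exp(qk - qk.max(-1, keepdims=True))
attn /= attn.sum(-1, keepdims=True)
out = attn @ v_all                              # [bs, num_head, seq_len, head_dim]
assert out.shape == (bs, num_head, seq_len, head_dim)
```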

paddle/fluid/operators/fused/fused_attention_op.cc

Lines changed: 32 additions & 11 deletions
@@ -105,12 +105,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                           "input qkv_weight = [%s]",
                           x_dim, y_dim));

-    PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3],
-                      platform::errors::InvalidArgument(
-                          "The dimensions of qkv_weight must be 4"
-                          "(3, num_head, dim_head, dim_embed),"
-                          "and must satisfy the limitations: "
-                          "(num_head * dim_head == dim_embed)"));
+    if (ctx->Attrs().Get<int>("ring_id") == -1) {
+      PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3],
+                        platform::errors::InvalidArgument(
+                            "The dimensions of qkv_weight must be 4"
+                            "(3, num_head, dim_head, dim_embed),"
+                            "and must satisfy the limitations: "
+                            "(num_head * dim_head == dim_embed)"));
+    }

     if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
       ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]});
@@ -133,19 +135,28 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("TransposeOut2",
                       {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
     // [batch, num_head, seq_len, seq_len]
-    ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+    auto last_dim = x_dim[1];
+    if (ctx->HasInput("CacheK")) {
+      auto cache_dim = ctx->GetInputDim("CacheK");
+      last_dim += cache_dim[2];
+      ctx->SetOutputDim("CacheKOut",
+                        {cache_dim[0], cache_dim[1], last_dim, cache_dim[3]});
+      ctx->SetOutputDim("CacheVOut",
+                        {cache_dim[0], cache_dim[1], last_dim, cache_dim[3]});
+    }
+    ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], last_dim});

     if (ctx->HasInput("SrcMask")) {
-      ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+      ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], last_dim});
     }
     // the same as QKOut's shape.
     ctx->SetOutputDim("AttnDropoutOut",
-                      {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                      {x_dim[0], y_dim[1], x_dim[1], last_dim});
     if (ctx->Attrs().Get<bool>("attn_dropout_is_test") == false) {
       ctx->SetOutputDim("AttnDropoutMaskOut",
-                        {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                        {x_dim[0], y_dim[1], x_dim[1], last_dim});
     }
-    ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+    ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], last_dim});
     // [batch_size, num_heads, seq_len, head_dim]
     ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
     // [batch_size, seq_len, number of heads*head size]
@@ -194,6 +205,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
              "(optional) Bias is a 1-dimensional tensor of size "
              "H. Here, H represents the last dimension of its input tensor.")
         .AsDispensable();
+    AddInput("CacheK", "(optional) The cached K for generation inference.")
+        .AsDispensable();
+    AddInput("CacheV", "(optional) The cached V for generation inference.")
+        .AsDispensable();
     AddOutput("LnMean", "Mean of the current mini batch.").AsIntermediate();
     AddOutput("LnVariance", "Variance of the current mini batch.")
         .AsIntermediate();
@@ -217,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("BiasDropoutResidualOut",
               "Result of residual + dropout(src + bias).")
         .AsIntermediate();
+    AddOutput("CacheKOut", "The updated cache K.");
+    AddOutput("CacheVOut", "The updated cache V.");
     AddOutput("Y", "Result after attention.");

     AddAttr<bool>("pre_layer_norm",
@@ -324,6 +341,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
                       "0.0 and 0.001, But received [%s].",
                       ln_epsilon));
         });
+    AddAttr<int>(
+        "ring_id",
+        "ring id for tensor model parallel, used in distributed training and inference")
+        .SetDefault(-1);

     AddComment(R"DOC(
   Add fused attention op whose logic is as follows:
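
With CacheK present, InferShape grows the last attention dimension from seq_len to cache_seq_len + seq_len and gives the cache outputs the concatenated sequence length. The Python sketch below mirrors that shape logic; the helper name and dict keys are illustrative, not part of the op.

```python
# Hedged sketch of the InferShape change for the cache path.
# x_dim: [batch, seq_len, dim_embed]; y_dim: [3, num_head, head_dim, dim_embed];
# cache_k_dim: [batch, num_head, cache_seq_len, head_dim].
def infer_fused_attention_shapes(x_dim, y_dim, cache_k_dim=None):
    batch, seq_len = x_dim[0], x_dim[1]
    num_head = y_dim[1]
    last_dim = seq_len
    shapes = {}
    if cache_k_dim is not None:
        # The cache outputs keep the cache layout, with the grown seq length.
        last_dim += cache_k_dim[2]
        shapes["CacheKOut"] = [cache_k_dim[0], cache_k_dim[1], last_dim, cache_k_dim[3]]
        shapes["CacheVOut"] = list(shapes["CacheKOut"])
    # QKOut, SrcMaskOut, SoftmaxOut, AttnDropoutOut all share this last dim.
    shapes["QKOut"] = [batch, num_head, seq_len, last_dim]
    shapes["SoftmaxOut"] = [batch, num_head, seq_len, last_dim]
    return shapes

print(infer_fused_attention_shapes([2, 1, 128], [3, 4, 32, 128],
                                   cache_k_dim=[2, 4, 16, 32]))
# {'CacheKOut': [2, 4, 17, 32], 'CacheVOut': [2, 4, 17, 32],
#  'QKOut': [2, 4, 1, 17], 'SoftmaxOut': [2, 4, 1, 17]}
```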

paddle/fluid/operators/fused/fused_attention_op.cu

Lines changed: 48 additions & 9 deletions
@@ -27,6 +27,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fmha_ref.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"

+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {

@@ -51,6 +56,10 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     // y: qkv's weight: [3, num_head, dim_head, dim_embed]
     auto *qkv_weight = ctx.Input<Tensor>("QKVW");
     auto *qkv_bias = ctx.Input<Tensor>("QKVBias");
+    auto *cache_k = ctx.Input<Tensor>("CacheK");
+    auto *cache_v = ctx.Input<Tensor>("CacheV");
+    auto *cache_k_out = ctx.Output<Tensor>("CacheKOut");
+    auto *cache_v_out = ctx.Output<Tensor>("CacheVOut");
     auto *qkv_out = ctx.Output<Tensor>("QKVOut");
     auto *qkv_bias_out = ctx.Output<Tensor>("QKVBiasOut");

@@ -86,6 +95,7 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input<Tensor>("Seed1") : nullptr;
     bool is_fix_seed_1 = ctx.Attr<bool>("attn_dropout_fix_seed");
     int seed_val_1 = ctx.Attr<int>("attn_dropout_seed");
+    int ring_id = ctx.Attr<int>("ring_id");

     // final output.
     auto *out = ctx.Output<Tensor>("Y");
@@ -128,6 +138,11 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
     auto *final_out_data = out->mutable_data<T>(ctx.GetPlace());

+    auto *cache_k_out_data =
+        cache_k_out ? cache_k_out->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    auto *cache_v_out_data =
+        cache_v_out ? cache_v_out->mutable_data<T>(ctx.GetPlace()) : nullptr;
+
     int batch_size = input_x_dims[0];
     int max_seq_len = input_x_dims[1];
     int dim_embed = input_x_dims[2];
@@ -161,9 +176,14 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {

     output_size = hidden_size;
     // (transA, transB, compute_bias) = (false, false, false)
+    // NOTE(Yuang Liu): In the general case, input size == output size, so
+    // swapping the two has no effect. Under mp, the output size is
+    // mp_head * dkey, which is actually the input size, while the input size
+    // is the hidden size, which is actually the output size. So for the out
+    // linear, switch the input size and the output size.
     auto out_linear_compute =
         AttnMatMul<T>(ctx.cuda_device_context(), false, false, bsz_seq,
-                      output_size, input_size, false);
+                      input_size, output_size, false);
     DropoutParam dropout_param2(ctx, 0);
     FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
         ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2,
@@ -186,22 +206,41 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
                                           qkv_bias_out);
     }
     if (qkv_bias == nullptr) {
-      fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2,
-                                      qk_out, src_mask_out, softmax_out,
-                                      attn_dropout_mask_out, attn_dropout_out,
-                                      qktv_out, fmha_out);
+      fmha_ref_compute.ComputeForward(
+          *qkv_out, src_mask, transpose_out_2, qk_out, src_mask_out,
+          softmax_out, attn_dropout_mask_out, attn_dropout_out, qktv_out,
+          fmha_out, cache_k, cache_v, cache_k_out, cache_v_out);
     } else {
-      fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2,
-                                      qk_out, src_mask_out, softmax_out,
-                                      attn_dropout_mask_out, attn_dropout_out,
-                                      qktv_out, fmha_out);
+      fmha_ref_compute.ComputeForward(
+          *qkv_bias_out, src_mask, transpose_out_2, qk_out, src_mask_out,
+          softmax_out, attn_dropout_mask_out, attn_dropout_out, qktv_out,
+          fmha_out, cache_k, cache_v, cache_k_out, cache_v_out);
     }

     // fmha_out: [batch_size, seq_len, num_head, head_dim]
     // weight: [embed_dim, embed_dim]
     // out_linear_out: [batch_size, seq_len, embed_dim]
     out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr,
                                       out_linear_out, nullptr);
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    if (ring_id >= 0) {
+      ncclDataType_t dtype = platform::ToNCCLDataType(
+          framework::TransToProtoVarType(out_linear_out->dtype()));
+      auto place = ctx.GetPlace();
+      int64_t numel = out_linear_out->numel();
+      const void *sendbuff = out_linear_out->data<T>();
+      void *recvbuff = out_linear_out->mutable_data<T>(place);
+      auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      gpuStream_t stream =
+          static_cast<platform::CUDADeviceContext *>(dev_ctx)->stream();
+      ncclRedOp_t nccl_red_type = ncclSum;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+          sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(),
+          stream));
+    }
+#endif
+
     if (pre_layer_norm) {
       // output = (residual + dropout(input + bias))
       fused_dropout_layernorm_helper.ResidualDropoutBias(
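
Under tensor model parallel, each rank computes attention over its slice of the heads and multiplies by the matching row slice of the out-linear weight, so the per-rank partial projections sum to the full one; that sum is what the ncclAllReduce over out_linear_out restores. A small NumPy sketch of the identity follows (the two-rank split and all names are illustrative).

```python
# Sketch: column-split activations times row-split weights, summed across
# ranks, equals the unsplit projection (the all-reduce in the kernel).
import numpy as np

bsz_seq, num_head, head_dim = 3, 4, 8
dim_embed = num_head * head_dim
fmha_out = np.random.randn(bsz_seq, num_head * head_dim)  # attention output
w_out = np.random.randn(num_head * head_dim, dim_embed)   # out-linear weight

full = fmha_out @ w_out                      # single-GPU reference

partials = []
for rank in range(2):                        # mp degree 2
    cols = slice(rank * dim_embed // 2, (rank + 1) * dim_embed // 2)
    # Each rank only holds half the heads and half the weight rows.
    partials.append(fmha_out[:, cols] @ w_out[cols, :])

# ncclAllReduce(sum) over out_linear_out reproduces the full projection.
np.testing.assert_allclose(sum(partials), full, atol=1e-8)
```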

python/paddle/incubate/nn/functional/fused_transformer.py

Lines changed: 19 additions & 2 deletions
@@ -229,7 +229,10 @@ def fused_multi_head_attention(x,
                                ln_epsilon=1e-05,
                                training=True,
                                mode='upscale_in_train',
-                               name=None):
+                               name=None,
+                               ring_id=-1,
+                               cache_k=None,
+                               cache_v=None):
     r"""
     Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
@@ -304,6 +307,9 @@ def fused_multi_head_attention(x,
                                - train: out = input * mask
                                - inference: out = input * (1.0 - p)
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+        ring_id (int, optional): The ring id for tensor model parallel; only NCCL and the forward pass are supported. Default is -1, which means mp is not used.
+        cache_k (Tensor, optional): The cache structure of K for the generation model.
+        cache_v (Tensor, optional): The cache structure of V for the generation model.

     Returns:
         Tensor: The output Tensor, the data type and shape is same as `x`.
@@ -398,6 +404,9 @@ def fused_multi_head_attention(x,
             inputs['Ln2Scale'] = [ln_scale]
         if ln_bias:
             inputs['Ln2Bias'] = [ln_bias]
+        if cache_k:
+            inputs['CacheK'] = [cache_k]
+            inputs['CacheV'] = [cache_v]

         if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
             seed = helper.main_program.random_seed
@@ -417,6 +426,7 @@ def fused_multi_head_attention(x,
             'dropout_seed': seed if seed is not None else 0,
             'attn_dropout_implementation': mode,
             'dropout_implementation': mode,
+            'ring_id': ring_id
         }

         # set outputs
@@ -449,6 +459,8 @@ def fused_multi_head_attention(x,
         bias_dropout_residual_out = helper.create_variable_for_type_inference(
             dtype=dtype)
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
+        cache_k_out = helper.create_variable_for_type_inference(dtype=dtype)
+        cache_v_out = helper.create_variable_for_type_inference(dtype=dtype)

         helper.append_op(
             type='fused_attention',
@@ -472,7 +484,12 @@ def fused_multi_head_attention(x,
                 "Ln2Mean": ln_mean_out,
                 "Ln2Variance": ln_variance_out,
                 "BiasDropoutResidualOut": bias_dropout_residual_out,
-                'Y': final_out
+                'Y': final_out,
+                'CacheKOut': cache_k_out,
+                'CacheVOut': cache_v_out
             },
             attrs=attrs)
+
+        if cache_k:
+            return [final_out, cache_k_out, cache_v_out]
         return final_out
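
Below is a hedged sketch of how a caller might drive the new cache arguments for one decoding step. Only cache_k, cache_v, ring_id, and the three-element return are taken from this change; the remaining arguments follow the function's existing signature, the shapes are illustrative, and whether the cache path is reachable depends on the execution mode this function routes to.

```python
# Assumed single decoding step with a running K/V cache (illustrative shapes).
import paddle
from paddle.incubate.nn.functional import fused_multi_head_attention

batch, num_head, head_dim = 2, 4, 32
embed_dim = num_head * head_dim

x = paddle.randn([batch, 1, embed_dim])                    # one new token
qkv_weight = paddle.randn([3, num_head, head_dim, embed_dim])
linear_weight = paddle.randn([embed_dim, embed_dim])
cache_k = paddle.randn([batch, num_head, 16, head_dim])    # 16 cached steps
cache_v = paddle.randn([batch, num_head, 16, head_dim])

# With cache_k passed, the op also returns the grown caches for the next step.
out, cache_k, cache_v = fused_multi_head_attention(
    x, qkv_weight, linear_weight,
    cache_k=cache_k, cache_v=cache_v, ring_id=-1)
# cache_k/cache_v now cover 17 positions and feed the next decoding step.
```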
