PaddlePaddle
diff --git a/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc b/paddle/fluid/pir/transforms/gpu/fused_weight_only_linear_pass.cc
Lines changed: 74 additions & 17 deletions
diff --git a/paddle/phi/kernels/gpu/weight_quantize_kernel.cu b/paddle/phi/kernels/gpu/weight_quantize_kernel.cu
Lines changed: 5 additions & 4 deletions
@@ -108,14 +108,43 @@ class FusedWeightOnlyLinearWithBiasPattern
  //
  paddle::drr::ResultPattern res = src.ResultPattern();
 
- const auto &weight_quantize =
- res.Op(paddle::dialect::WeightQuantizeOp::name(),
- {{"algo", res.StrAttr(algo_)},
- {"arch", res.Int32Attr(sm_version_)},
- {"group_size", res.Int32Attr(-1)}});
- weight_quantize({&res.Tensor("w")},
- {&res.Tensor("quanted_weight_tensor"),
- &res.Tensor("weight_scale_tensor")});
+ if (algo_ == "weight_only_int4") {
+ // TODO(liuyuanle): Remove these host/device memory copies once the
+ // weight_quantize GPU kernel supports weight_only_int4.
+ const auto &memcpy_d2h =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}});
+ res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w"));
+ const auto &weight_quantize =
+ res.Op(paddle::dialect::WeightQuantizeOp::name(),
+ {{"algo", res.StrAttr(algo_)},
+ {"arch", res.Int32Attr(sm_version_)},
+ {"group_size", res.Int32Attr(-1)}});
+ weight_quantize({&res.Tensor("w_cpu")},
+ {&res.Tensor("quanted_weight_tensor_cpu"),
+ &res.Tensor("weight_scale_tensor_cpu")});
+
+ const auto &memcpy_h2d_1 =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+ res.Tensor("quanted_weight_tensor") =
+ memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu"));
+ const auto &memcpy_h2d_2 =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+ res.Tensor("weight_scale_tensor") =
+ memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu"));
+ } else {
+ const auto &weight_quantize =
+ res.Op(paddle::dialect::WeightQuantizeOp::name(),
+ {{"algo", res.StrAttr(algo_)},
+ {"arch", res.Int32Attr(sm_version_)},
+ {"group_size", res.Int32Attr(-1)}});
+
+ weight_quantize({&res.Tensor("w")},
+ {&res.Tensor("quanted_weight_tensor"),
+ &res.Tensor("weight_scale_tensor")});
+ }
 
  const auto &weight_only_linear =
  res.Op(paddle::dialect::WeightOnlyLinearOp::name(),
@@ -192,15 +221,43 @@ class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase {
  //
  paddle::drr::ResultPattern res = src.ResultPattern();
 
- const auto &weight_quantize =
- res.Op(paddle::dialect::WeightQuantizeOp::name(),
- {{"algo", res.StrAttr(algo_)},
- {"arch", res.Int32Attr(sm_version_)},
- {"group_size", res.Int32Attr(-1)}});
- weight_quantize({&res.Tensor("w")},
- {&res.Tensor("quanted_weight_tensor"),
- &res.Tensor("weight_scale_tensor")});
-
+ if (algo_ == "weight_only_int4") {
+ // TODO(liuyuanle): Remove these host/device memory copies once the
+ // weight_quantize GPU kernel supports weight_only_int4.
+ const auto &memcpy_d2h =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}});
+ res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w"));
+ const auto &weight_quantize =
+ res.Op(paddle::dialect::WeightQuantizeOp::name(),
+ {{"algo", res.StrAttr(algo_)},
+ {"arch", res.Int32Attr(sm_version_)},
+ {"group_size", res.Int32Attr(-1)}});
+ weight_quantize({&res.Tensor("w_cpu")},
+ {&res.Tensor("quanted_weight_tensor_cpu"),
+ &res.Tensor("weight_scale_tensor_cpu")});
+
+ const auto &memcpy_h2d_1 =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+ res.Tensor("quanted_weight_tensor") =
+ memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu"));
+ const auto &memcpy_h2d_2 =
+ res.Op(paddle::dialect::MemcpyOp::name(),
+ {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+ res.Tensor("weight_scale_tensor") =
+ memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu"));
+ } else {
+ const auto &weight_quantize =
+ res.Op(paddle::dialect::WeightQuantizeOp::name(),
+ {{"algo", res.StrAttr(algo_)},
+ {"arch", res.Int32Attr(sm_version_)},
+ {"group_size", res.Int32Attr(-1)}});
+
+ weight_quantize({&res.Tensor("w")},
+ {&res.Tensor("quanted_weight_tensor"),
+ &res.Tensor("weight_scale_tensor")});
+ }
  const auto &weight_only_linear =
  res.Op(paddle::dialect::WeightOnlyLinearOp::name(),
  {{"weight_dtype",
 
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/datatype_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
@@ -72,14 +73,14 @@ void WeightQuantizeKernel(const Context& dev_ctx,
  weight_shape,
  arch);
  } else if (algo == "weight_only_int4") {
- phi::errors::Unimplemented(
+ PADDLE_FATAL(phi::errors::Unimplemented(
  "Weight quant gpu kernel currently don't support weight_only_int4 "
- "algo, please use cpu version.");
+ "algo, please use cpu version."));
  } else {
- phi::errors::Unimplemented(
+ PADDLE_FATAL(phi::errors::Unimplemented(
  "The algo must be in ['weight_only_int8', 'weight_only_int4', "
  "'llm.int8'], but got[%s]",
- algo);
+ algo));
  }
 }
 } // namespace phi