PaddlePaddle · NHZlX · Feb 2, 2018 · Jan 22, 2018 · Jan 23, 2018 · Jan 25, 2018
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
@@ -158,7 +158,10 @@ op_library(parallel_do_op DEPS executor)
 
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+
+# conv_op depends on depthwise_conv in addition to vol2col.
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+ vol2col depthwise_conv)
+
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc

diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
@@ -318,9 +318,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
  ops::ConvOpGrad);
+
+// Depthwise convolution op. It is registered under its own names
+// (depthwise_conv2d / depthwise_conv2d_grad) with the same ops::ConvOp,
+// ops::Conv2DOpMaker and ops::ConvOpGrad classes that conv2d is registered
+// with.
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+ depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
  ops::ConvOpGrad);
 
+// Depthwise conv CPU kernels. No dedicated CPU implementation is registered
+// here: depthwise_conv2d is bound to the same GemmConvKernel template that
+// conv2d is registered with, and depthwise_conv2d_grad to GemmConvGradKernel.
+// Both are registered for float and double.
+// TODO(xingzhaolong): neon kernel for mobile
+REGISTER_OP_CPU_KERNEL(
+ depthwise_conv2d,
+ ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+ ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+ depthwise_conv2d_grad,
+ ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+ ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
+
 REGISTER_OP_CPU_KERNEL(
  conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
  ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);

diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
@@ -16,6 +16,16 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
+// CUDA kernels for depthwise_conv2d and its gradient. These use the dedicated
+// DepthwiseConvKernel / DepthwiseConvGradKernel templates (float and double),
+// whereas conv2d is registered with GemmConvKernel.
+REGISTER_OP_CUDA_KERNEL(
+ depthwise_conv2d,
+ ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+ ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+ depthwise_conv2d_grad,
+ ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+ ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
+
 REGISTER_OP_CUDA_KERNEL(
  conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
  ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);

diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/vol2col.h"
@@ -350,5 +351,72 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
  }
  }
 };
+
+// Forward kernel for depthwise 2-D convolution.
+//
+// Reads the "Input" and "Filter" tensors, allocates "Output" on the execution
+// place, and delegates the computation to math::DepthwiseConvFunctor using
+// the "strides" and "paddings" attributes.
+//
+// Enforced preconditions:
+//   * dimension 1 of the output is a multiple of dimension 1 of the input
+//     (the channel counts, per the enforce message);
+//   * every entry of the "dilations" attribute is 1, because the functor is
+//     not given the dilations and therefore cannot honour any other value.
+template <typename DeviceContext, typename T>
+class DepthwiseConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input->dims()[1], 0,
+        "The output channels must be a multiple of the input channels");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    // The functor below only receives strides and paddings, so a dilation
+    // other than 1 would be ignored silently and yield a wrong result.
+    // Fail loudly instead.
+    for (int dilation : dilations) {
+      PADDLE_ENFORCE_EQ(dilation, 1,
+                        "The depthwise convolution kernel only supports a "
+                        "dilation of 1");
+    }
+
+    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+  }
+};
+
+// Backward kernel for depthwise 2-D convolution.
+//
+// Reads "Input", "Filter" and the gradient of "Output", and fills whichever of
+// the "Input" / "Filter" gradient outputs are present. Each requested gradient
+// is allocated on the execution place, zero-filled, and then computed by
+// math::DepthwiseConvInputGradFunctor / math::DepthwiseConvFilterGradFunctor
+// using the "strides" and "paddings" attributes. Returns immediately when
+// neither gradient output is present.
+//
+// Every entry of the "dilations" attribute must be 1: the gradient functors
+// are not given the dilations and therefore cannot honour any other value.
+template <typename DeviceContext, typename T>
+class DepthwiseConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    // Nothing to compute when no gradient output was requested.
+    if (!input_grad && !filter_grad) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    // The gradient functors below only receive strides and paddings, so a
+    // dilation other than 1 would be ignored silently. Fail loudly instead.
+    for (int dilation : dilations) {
+      PADDLE_ENFORCE_EQ(dilation, 1,
+                        "The depthwise convolution kernel only supports a "
+                        "dilation of 1");
+    }
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
+        depthwiseConvInputGrad;
+    math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
+        depthwiseConvFilterGrad;
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      // Zero-fill before the functor runs.
+      // NOTE(review): presumably the functor accumulates into this buffer;
+      // confirm against math/depthwise_conv.cu.
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                             paddings, input_grad);
+    }
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
+                              filter_grad);
+    }
+  }
+};
+
 } // namespace operators
 } // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
@@ -8,6 +8,7 @@ if(WITH_GPU)
  nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
  nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
  nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+ # depthwise_conv lists only a CUDA source (depthwise_conv.cu); no .cc file is given.
+ nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
  nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
  nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
  nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)