add MKLDNN_DEVICE #3712 (Merged)

Changes from all commits (13 commits, all by tensor-tang):
- 4d8992c  check format before set header format
- 462b9b1  update mkldnn tag v0.10
- 62e6dac  add MKLDNNMatrix files
- 4bffbd3  use MKLDNNMatrix in fc forward
- 4eecd0c  use MKLDNNMatrix in fc backward
- 48d87e5  pass test, support input CPU device
- 780c8d9  make downSpatial work, and remove hasSpatial_
- 4cc5783  enable reorder
- 98b7c67  add todo
- 2efac83  Merge remote-tracking branch 'upstream/develop' into merge
- fe51f72  fix cmake
- bfbd066  refine
- c5183ca  rename
The diff below covers the MKLDNNFcLayer implementation:

```diff
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
     return;
   }
 
-  // TODO(TJ): dst format should get from wgtVal_
-  int dstFmt = PARAM_FORMAT_MKLDNN_OI;
-  int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-
-  // The weight_ is transposed from initial paddle weight
-  MatrixPtr paddleWgt = Matrix::create(
-      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
-
-  // TODO(TJ): remove this print when do not need differ weights
-  std::ostringstream ostr;
-  paddleWgt->print(ostr);
-  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
-
-  // The mkldnn weight is transposed from initial paddle matrix
-  MatrixPtr paddleWgtT;
-  paddleWgt->transpose(paddleWgtT, true);
-  weight_->getW()->copyFrom(*paddleWgtT);
-  weight_->getParameterPtr()->setHeaderFormat(dstFmt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
 
 void MKLDNNFcLayer::convertWeightsToPaddle() {
-  MatrixPtr dnnWgt = weight_->getW();
-  MatrixPtr paddleWgt;
-  dnnWgt->transpose(paddleWgt, true);
-
-  // copy paddle weight and override on weight_
-  MatrixPtr dnnWgtT = Matrix::create(
-      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
-  dnnWgtT->copyFrom(*paddleWgt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find other cpu device and reorder output to cpu device
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value do not need convert
+      // just share point
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU devie";
+  }
+}
+
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
   if (bs_ == batchSize) {
     return;
```
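The `reorderDataFrom` call above converts the weight from Paddle's transposed layout (io, or ihwo when spatial dimensions are present) into the layout the MKL-DNN primitive was created with, replacing the manual transpose-and-copy the hunk deletes. For reference, a minimal standalone sketch of such a layout conversion against the mkldnn v0.10 C++ API (the tag this PR pins); the `reorderWeights` helper and its in-place copy-back are illustrative, not the PR's actual implementation:

```cpp
#include <algorithm>
#include <vector>
#include "mkldnn.hpp"
using namespace mkldnn;

// Convert a 2-D weight buffer in place from io (Paddle's layout after
// transpose) to oi (the layout an inner-product primitive typically uses).
void reorderWeights(float* data, int oc, int ic, engine& eng) {
  memory::dims dims = {oc, ic};
  memory::desc srcMD(dims, memory::data_type::f32, memory::format::io);
  memory::desc dstMD(dims, memory::data_type::f32, memory::format::oi);
  std::vector<float> tmp(static_cast<size_t>(oc) * ic);
  memory src({srcMD, eng}, data);        // wraps the existing buffer
  memory dst({dstMD, eng}, tmp.data());  // reorder target
  std::vector<primitive> net{reorder(src, dst)};
  stream(stream::kind::eager).submit(net).wait();
  std::copy(tmp.begin(), tmp.end(), data);  // write converted data back
}
```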
```diff
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
   if (iw_ == 0) {
     iw_ = 1;
   }
-  hasSpatial_ = true;
-  if (ih_ == 1 && iw_ == 1) {
-    hasSpatial_ = false;
-  }
   CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
   ic_ = iLayerSize_ / (ih_ * iw_);
   CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
```
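This hunk drops the `hasSpatial_` bookkeeping; per commit 780c8d9, the equivalent logic moves into `MKLDNNMatrix::downSpatial()`, which is called in `resetFwd` below. A hedged sketch of the idea, with a hypothetical free function standing in for the member:

```cpp
#include "mkldnn.hpp"
using namespace mkldnn;

// When the spatial size is 1x1, a 4-D {n, c, 1, 1} memory describes the
// same buffer as a 2-D {n, c} one, so the dims and format can be
// collapsed instead of threading a hasSpatial_ flag through the layer.
memory::desc downSpatial(const memory::dims& dims, memory::format fmt) {
  bool noSpatial = dims.size() == 4 && dims[2] == 1 && dims[3] == 1;
  if (noSpatial && fmt == memory::format::nchw) {
    return memory::desc({dims[0], dims[1]}, memory::data_type::f32,
                        memory::format::nc);
  }
  if (noSpatial && fmt == memory::format::oihw) {
    return memory::desc({dims[0], dims[1]}, memory::data_type::f32,
                        memory::format::oi);
  }
  return memory::desc(dims, memory::data_type::f32, fmt);
}
```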
```diff
@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
 
 void MKLDNNFcLayer::resetFwd() {
   bool hasBias = biases_ && biases_->getW();
-  real* iData = getInputValue(0)->getData();
-  real* oData = getOutputValue()->getData();
-  real* wData = weight_->getW()->getData();
-  real* bData = hasBias ? biases_->getW()->getData() : NULL;
-
-  // TODO(TJ): below create should be covered in MkldnnMatrix
-  // create memory desc
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-
-  // create memory primitive desc and memory self
-  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
-  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+  const MatrixPtr& wgt = weight_->getW();
+  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
+  const MatrixPtr& out = output_.value;
+
+  if (inputIsOnlyMKLDNN()) {
+    const MatrixPtr& in = getInputValue(0);
+    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
+    CHECK(inVal_) << "Input should be MKLDNNMatrix";
+  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
+    inVal_ = MKLDNNMatrix::create(
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+  inVal_->downSpatial();
+  wgtVal_ = MKLDNNMatrix::create(
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+  wgtVal_->downSpatial();
+  biasVal_ =
+      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
+
+  // change original output value to mkldnn output value
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!outputIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
+  }
 
   // create forward handle
   prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
-                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
 
-  if (bData != NULL) {
-    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+  if (hasBias) {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
   } else {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
   }
+  printValueFormatFlow();
 
   pipelineFwd_.clear();
   pipelineFwd_.push_back(*fwd_);
 }
```

An inline review question was attached to the `output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);` line: what does "original output value" refer to here, i.e. which format is it in?
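For reference, here is a self-contained sketch of the forward setup this hunk performs, assuming `fc_fwd` aliases `mkldnn::inner_product_forward` (mkldnn v0.10 C++ API); the shapes and names are illustrative only:

```cpp
#include <vector>
#include "mkldnn.hpp"
using namespace mkldnn;

int main() {
  engine eng(engine::cpu, 0);
  const int bs = 2, ic = 8, oc = 4;
  std::vector<float> in(bs * ic), wgt(oc * ic), bias(oc), out(bs * oc);

  // memory descs in the same formats the layer uses after downSpatial()
  memory::desc iMD({bs, ic}, memory::data_type::f32, memory::format::nc);
  memory::desc wMD({oc, ic}, memory::data_type::f32, memory::format::oi);
  memory::desc bMD({oc}, memory::data_type::f32, memory::format::x);
  memory::desc oMD({bs, oc}, memory::data_type::f32, memory::format::nc);

  memory inVal({iMD, eng}, in.data());
  memory wgtVal({wMD, eng}, wgt.data());
  memory biasVal({bMD, eng}, bias.data());
  memory outVal({oMD, eng}, out.data());

  // descriptor -> primitive_desc -> primitive, as in resetFwd()
  inner_product_forward::desc fwdDesc(prop_kind::forward, iMD, wMD, bMD, oMD);
  inner_product_forward::primitive_desc fwdPD(fwdDesc, eng);
  inner_product_forward fwd(fwdPD, inVal, wgtVal, biasVal, outVal);

  std::vector<primitive> pipeline{fwd};
  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```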
```diff
@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
     return;
   }
   needResetBwd_ = false;
 
   bool hasBias = biases_ && biases_->getWGrad();
-  real* iData = getInputValue(0)->getData();
-  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
-  real* oDiff = getOutputGrad()->getData();
-  real* wDiff = weight_->getWGrad()->getData();
-  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
 
   /// backward weight
-  // create memory desc for backward memory
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-
-  if (inVal_) {
-    // update data
-    inVal_->set_data_handle(iData);
-  } else {
-    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  }
-
-  // create memory primitive desc and memory self
-  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
-  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
-
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  CHECK(inVal_) << "Should have input value";
+  const MatrixPtr& wgt = weight_->getWGrad();
+  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
+
+  // TODO(TJ): merge outgrad
+  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  // for MKLDNN device:
+  // can not directly cast outputgrad to mkldnnmatrix,
+  // since each layer can not write the inputgrad to mkldnn inputgrad.
+  // So just create from matrix with outputvalue format.
+  // for CPU device:
+  // fc do not need to convert from cpu device since output is always nc format
+  // only need create from cpu device
+  const MatrixPtr& out = getOutput(device).grad;
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                      : nullptr;
+
+  // create memory primitive desc
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
-                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
-                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
   fc_bwdWgt::primitive_desc bwdWgtPD =
       fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
 
-  if (bDiff != NULL) {
-    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+  if (hasBias) {
     bwdWgt_.reset(
         new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
   } else {
```
```diff
@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  if (iDiff == NULL) {
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
     return;
   }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  } else {
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+  }
+
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
 
   CHECK(wgtVal_) << "Should have weight memory";
   bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
   pipelineBwd_.push_back(*bwdData_);
 }
```
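A self-contained sketch of the backward wiring in `resetBwd()`, assuming `fc_bwdWgt`/`fc_bwdData` alias `mkldnn::inner_product_backward_weights`/`mkldnn::inner_product_backward_data` (v0.10 C++ API). Note that both backward primitive_descs take the forward primitive_desc as a hint, which is why the diff rebuilds `fwdPD` here; names and shapes are illustrative:

```cpp
#include <vector>
#include "mkldnn.hpp"
using namespace mkldnn;

int main() {
  engine eng(engine::cpu, 0);
  const int bs = 2, ic = 8, oc = 4;
  std::vector<float> in(bs * ic), wgt(oc * ic);
  std::vector<float> inG(bs * ic), wgtG(oc * ic), biasG(oc), outG(bs * oc);

  memory::desc iMD({bs, ic}, memory::data_type::f32, memory::format::nc);
  memory::desc wMD({oc, ic}, memory::data_type::f32, memory::format::oi);
  memory::desc bMD({oc}, memory::data_type::f32, memory::format::x);
  memory::desc oMD({bs, oc}, memory::data_type::f32, memory::format::nc);

  memory inVal({iMD, eng}, in.data()), wgtVal({wMD, eng}, wgt.data());
  memory inGrad({iMD, eng}, inG.data()), wgtGrad({wMD, eng}, wgtG.data());
  memory biasGrad({bMD, eng}, biasG.data()), outGrad({oMD, eng}, outG.data());

  // forward primitive_desc is required as a hint for the backward ones
  inner_product_forward::desc fwdDesc(prop_kind::forward, iMD, wMD, oMD);
  inner_product_forward::primitive_desc fwdPD(fwdDesc, eng);

  // d(weight), d(bias) from the input value and the output grad
  inner_product_backward_weights::desc bwdWgtDesc(iMD, wMD, bMD, oMD);
  inner_product_backward_weights::primitive_desc bwdWgtPD(bwdWgtDesc, eng, fwdPD);
  inner_product_backward_weights bwdWgt(bwdWgtPD, inVal, outGrad, wgtGrad, biasGrad);

  // d(input) from the weight value and the output grad
  inner_product_backward_data::desc bwdDataDesc(iMD, wMD, oMD);
  inner_product_backward_data::primitive_desc bwdDataPD(bwdDataDesc, eng, fwdPD);
  inner_product_backward_data bwdData(bwdDataPD, outGrad, wgtVal, inGrad);

  std::vector<primitive> pipeline{bwdWgt, bwdData};
  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```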
```diff
@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->set_data_handle(iData);
+    syncInputValue();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
```
```diff
@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->set_data_handle(oDiff);
-
+    syncOutputGrad();
     // just sumbmit backward pipeline
     stream_->submit(pipelineBwd_);
   }
```
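`syncInputValue()` / `syncOutputGrad()` replace the inline `set_data_handle` updates deleted above. The gist, as a hedged sketch (the free function below is hypothetical; the PR's versions live on the MKLDNN layer and fetch the current buffers themselves):

```cpp
#include "mkldnn.hpp"
using namespace mkldnn;

// An mkldnn memory keeps a raw pointer to user data. Upstream layers
// (e.g. a data layer) may swap their buffer every batch, so the handle
// must be refreshed before each submit or the primitive reads stale data.
void syncHandle(memory& mem, void* currentData) {
  if (mem.get_data_handle() != currentData) {
    mem.set_data_handle(currentData);
  }
}
```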
Review comment: For MKLDNN layers, are there many places that need this value conversion? If most layers just share (`outputOtherDevice_[i].value = output_.value;`), put it in a base-class function and let the layers that need a real conversion implement their own; that would be cleaner. The check that there must not be more than one CPU device should also go into a base-class function. And why can't there be more than one?
Author reply (tensor-tang): It's not that most layers just do `outputOtherDevice_[i].value = output_.value;` — other layers need different handling. FC can share the value directly because its output is always in nc format, the same as Paddle's CPU-device format. Once more layers are added, this can be tidied up again. As for the single-CPU-device check: in theory I don't think more than one should ever appear, but I'm worried the current design may not cover every case (for example, whether RNN cases are affected), so it is only a warning. Even if there were several CPU devices, each would still share the value. This point is also specific to the FC layer.
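A toy illustration of the rule described here (hypothetical code, not from the PR): share the buffer when the MKLDNN output layout already matches the CPU layout; otherwise a reorder into a separate buffer would be required.

```cpp
#include <memory>

struct DeviceOutput {
  std::shared_ptr<float> value;
  bool isNcFormat;  // FC output is always nc, same as Paddle's CPU layout
};

void publishToCpu(const DeviceOutput& dnnOut, DeviceOutput& cpuOut) {
  if (dnnOut.isNcFormat) {
    cpuOut.value = dnnOut.value;  // layouts match: share the pointer
  } else {
    // layouts differ: allocate a CPU buffer and reorder (omitted here)
  }
}
```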
Reviewer follow-up: If the check is needed, it should go into MKLDNNLayer's convertOutputToOtherDevice; that can be done in the next PR.