Superjomn
diff --git a/‎doc/design/model_format.md‎
Lines changed: 36 additions & 0 deletions b/‎doc/design/model_format.md‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎paddle/framework/CMakeLists.txt‎
Lines changed: 5 additions & 3 deletions b/‎paddle/framework/CMakeLists.txt‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎paddle/framework/lod_tensor.cc‎
Lines changed: 144 additions & 0 deletions b/‎paddle/framework/lod_tensor.cc‎
Lines changed: 144 additions & 0 deletions
diff --git a/‎paddle/framework/lod_tensor.h‎
Lines changed: 22 additions & 0 deletions b/‎paddle/framework/lod_tensor.h‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎paddle/framework/lod_tensor_test.cc‎
Lines changed: 23 additions & 1 deletion b/‎paddle/framework/lod_tensor_test.cc‎
Lines changed: 23 additions & 1 deletion
diff --git a/‎paddle/framework/lod_tensor_test.cu‎
Lines changed: 27 additions & 0 deletions b/‎paddle/framework/lod_tensor_test.cu‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎paddle/framework/saver.proto‎
Lines changed: 39 additions & 0 deletions b/‎paddle/framework/saver.proto‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎paddle/framework/scope.cc‎
Lines changed: 17 additions & 0 deletions b/‎paddle/framework/scope.cc‎
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,36 @@
+# Design Doc: Model Format
+
+## Motivation
+
+The model is the output of the training process. One complete model consists of two parts, namely, the **topology** and the **parameters**. To support industrial deployment, the model format must be self-contained and must not expose any training source code.
+
+As a result, in PaddlePaddle the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model; we must support large parameters and efficient serialization/deserialization.
+
+## Implementation
+
+The topology is saved as plain text, specifically as a self-contained protobuf file.
+
+The parameters are saved as a binary file. A protobuf message has a default size limit of [64M](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We ran a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), and its results show that protobuf is not a good fit for this scenario.
+
+As a result, we design a dedicated format for tensor serialization. By default, every tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and has a description proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as a byte-string header; it contains the necessary information, such as the `dims`, the `name` of the tensor, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a contiguous memory buffer, so for speed we dump the raw memory to disk and save it as the byte-string content. The binary format of one tensor is therefore:
+
+|HeaderLength|ContentLength|**LoDTensorDesc**|**TensorValue**|
+
+In detail, the tensor's byte view is shown in the table below. Note that all signed integer values are written in little-endian order.
+
+```text
+[offset] [type] [description] 
+0004 4 bytes integer HeaderLength, the length of LoDTensorDesc
+0008 4 bytes integer ContentLength, the length of LodTensor Buffer
+0009 1 bytes char TensorDesc
+00010 1 bytes char TensorDesc
+...
+00100 1 bytes char TensorValue
+00101 1 bytes char TensorValue
+00102 1 bytes char TensorValue ..
+...
+```
+
+## Summary
+
+We introduce the model format: the `ProgramDesc` describes the **topology**, and a set of binary tensors in the format above describes the **parameters**.
@@ -1,4 +1,7 @@
 # ddim lib
+proto_library(framework_proto SRCS framework.proto)
+proto_library(saver_proto SRCS framework.proto saver.proto)
+
 cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
@@ -7,16 +10,15 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor saver_proto framework_proto)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
 cc_test(variable_test SRCS variable_test.cc)
 
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
-proto_library(framework_proto SRCS framework.proto)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc)
 
@@ -13,6 +13,15 @@
  limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/saver.pb.h"
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
 
 #include <glog/logging.h>
 
@@ -112,5 +121,140 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
  lod_ = new_lod;
 }
 
+// Serialize this LoDTensor into a self-describing byte string.
+//
+// Byte layout (the two leading integers are raw host `size_t` values):
+//   [total buffer size][desc size][LoDTensorProto bytes][raw tensor bytes]
+//
+// The tensor payload is copied to host memory, so a tensor living on the GPU
+// is transferred implicitly. Raises (via PADDLE_ENFORCE) when the tensor is
+// empty or its element type has no DataType mapping below.
+std::string LoDTensor::SerializeToString() const {
+  // Validate up front so that nothing has been allocated when we bail out.
+  PADDLE_ENFORCE(this->numel() != 0, "Serialize a empty Tensor!");
+
+  LoDTensorProto desc;
+
+  // set data_type
+  // DeserializeFromString allocates `bool` storage for DataType::BOOL, so a
+  // bool tensor must map to BOOL for the round trip to work. int8_t keeps its
+  // previous BOOL mapping so existing callers are unaffected.
+  if (this->type() == typeid(bool) || this->type() == typeid(int8_t))
+    desc.set_data_type(DataType::BOOL);
+  if (this->type() == typeid(int16_t)) desc.set_data_type(DataType::INT16);
+  if (this->type() == typeid(int32_t)) desc.set_data_type(DataType::INT32);
+  if (this->type() == typeid(int64_t)) desc.set_data_type(DataType::INT64);
+  // FIXME(dzh): there is no fp16 in standard c++
+
+  if (this->type() == typeid(float))  // NOLINT
+    desc.set_data_type(DataType::FP32);
+  if (this->type() == typeid(double))  // NOLINT
+    desc.set_data_type(DataType::FP64);
+
+  // Without a data type the reader cannot allocate storage; fail here rather
+  // than emit a blob that cannot be deserialized.
+  PADDLE_ENFORCE(desc.has_data_type(),
+                 "Unsupported tensor data type for serialization.");
+
+  for (int i = 0; i < dims().size(); ++i) {
+    desc.add_dims(dims()[i]);
+  }
+
+  // set lod information
+  desc.set_lod_level(this->NumLevels());
+  for (size_t i = 0; i < this->NumLevels(); ++i) {
+    LoDInfo* lod = desc.add_levels();
+    for (size_t j = 0; j < lod_[i].size(); ++j) {
+      lod->add_level(lod_[i][j]);
+    }
+  }
+
+  desc.set_version(0);
+
+  const std::string desc_bytes = desc.SerializeAsString();
+
+  // FIXME(dzh) : implement fix chunk size buffer.
+  const size_t HEADER_SIZE = 2 * sizeof(size_t);
+  const size_t DESC_SIZE = desc_bytes.size();
+  // offset_ is a byte offset into the holder (it is subtracted from the
+  // holder's byte size here), so everything from offset_ onwards is payload.
+  const size_t DATA_SIZE = holder_->size() - offset_;
+  const size_t BUFFER_SIZE = HEADER_SIZE + DESC_SIZE + DATA_SIZE;
+
+  // Build the result in place: the string owns the memory, so nothing leaks
+  // if a copy below throws, and no second full-size copy is needed.
+  std::string ret(BUFFER_SIZE, '\0');
+  char* buffer = &ret[0];
+
+  // format: buffer_size, desc_size, desc_bytes, data_bytes.
+  memcpy(buffer, &BUFFER_SIZE, sizeof(size_t));
+  memcpy(buffer + sizeof(size_t), &DESC_SIZE, sizeof(size_t));
+  memcpy(buffer + HEADER_SIZE, desc_bytes.data(), DESC_SIZE);
+
+  platform::CPUPlace dst_place;
+  platform::Place place = holder_->place();
+  char* payload_dst = buffer + HEADER_SIZE + DESC_SIZE;
+  // The pointer is a char*, so the byte offset is added as-is (dividing it by
+  // the element width would point into the wrong part of the holder).
+  char* payload_src = static_cast<char*>(holder_->ptr()) + offset_;
+
+  if (platform::is_cpu_place(place)) {
+    memory::Copy(dst_place, payload_dst, boost::get<platform::CPUPlace>(place),
+                 payload_src, DATA_SIZE);
+  }
+#ifdef PADDLE_WITH_GPU
+  if (platform::is_gpu_place(place)) {
+    memory::Copy(dst_place, payload_dst, boost::get<platform::GPUPlace>(place),
+                 payload_src, DATA_SIZE);
+  }
+#endif
+
+  return ret;
+}
+
+// Rebuild this LoDTensor from a string produced by SerializeToString().
+//
+// Expected byte layout (the two leading integers are raw host `size_t`):
+//   [total buffer size][desc size][LoDTensorProto bytes][raw tensor bytes]
+//
+// @param s          serialized bytes.
+// @param dst_place  place on which the tensor memory is allocated; the
+//                   payload is copied there from host memory.
+//
+// Raises (via PADDLE_ENFORCE) when the header is inconsistent with the
+// length of `s`, the descriptor cannot be parsed, the data type is not
+// supported, or the payload is shorter than the dims require. The header and
+// descriptor are validated before the tensor is modified.
+void LoDTensor::DeserializeFromString(const std::string& s,
+                                      const platform::Place& dst_place) {
+  const size_t HEADER_SIZE = sizeof(size_t) * 2;
+  PADDLE_ENFORCE(s.size() >= HEADER_SIZE,
+                 "Serialized LoDTensor is too short to hold its header.");
+
+  size_t BUFFER_SIZE = 0;
+  size_t DESC_SIZE = 0;
+  memcpy(&BUFFER_SIZE, s.data(), sizeof(size_t));
+  memcpy(&DESC_SIZE, s.data() + sizeof(size_t), sizeof(size_t));
+
+  // Reject sizes that would make the subtraction below wrap around or make
+  // us read past the end of `s`.
+  PADDLE_ENFORCE(BUFFER_SIZE >= HEADER_SIZE && BUFFER_SIZE <= s.size(),
+                 "Serialized LoDTensor has an invalid total size.");
+  PADDLE_ENFORCE(DESC_SIZE <= BUFFER_SIZE - HEADER_SIZE,
+                 "Serialized LoDTensor has an invalid descriptor size.");
+
+  const size_t DATA_SIZE = BUFFER_SIZE - DESC_SIZE - HEADER_SIZE;
+
+  // parse LoDTensorDesc
+  LoDTensorProto desc;
+  PADDLE_ENFORCE(
+      desc.ParseFromArray(s.data() + HEADER_SIZE, static_cast<int>(DESC_SIZE)),
+      "Failed to parse the LoDTensor descriptor.");
+
+  std::vector<int64_t> dims;
+  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+  this->Resize(make_ddim(dims));
+
+  // parse data type; remember the element width so the payload length can be
+  // checked against the allocation made here.
+  void* ptr = nullptr;
+  size_t element_size = 0;
+  if (desc.data_type() == DataType::BOOL) {
+    ptr = this->mutable_data<bool>(dst_place);
+    element_size = sizeof(bool);
+  }
+  if (desc.data_type() == DataType::INT16) {
+    ptr = this->mutable_data<int16_t>(dst_place);
+    element_size = sizeof(int16_t);
+  }
+  if (desc.data_type() == DataType::INT32) {
+    ptr = this->mutable_data<int32_t>(dst_place);
+    element_size = sizeof(int32_t);
+  }
+  if (desc.data_type() == DataType::INT64) {
+    ptr = this->mutable_data<int64_t>(dst_place);
+    element_size = sizeof(int64_t);
+  }
+  // FIXME(dzh): there is no fp16 in standard c++
+
+  if (desc.data_type() == DataType::FP32) {
+    ptr = this->mutable_data<float>(dst_place);
+    element_size = sizeof(float);
+  }
+  if (desc.data_type() == DataType::FP64) {
+    ptr = this->mutable_data<double>(dst_place);
+    element_size = sizeof(double);
+  }
+  PADDLE_ENFORCE(ptr != nullptr,
+                 "Unsupported data type in serialized LoDTensor.");
+
+  // The destination holds exactly numel * element_size bytes. The stored
+  // payload may be longer (the writer dumps the holder from its offset to the
+  // end), so copy only what fits and refuse payloads that are too short.
+  const size_t copy_bytes = static_cast<size_t>(this->numel()) * element_size;
+  PADDLE_ENFORCE(DATA_SIZE >= copy_bytes,
+                 "Serialized LoDTensor payload is shorter than its dims.");
+
+  LoD lod;
+  for (int i = 0; i < desc.levels_size(); ++i) {
+    // Bind by reference: copying the repeated field per level is wasted work.
+    const auto& current_level = desc.levels(i).level();
+    std::vector<size_t> levels(current_level.begin(), current_level.end());
+    lod.emplace_back(levels);
+  }
+
+  this->set_lod(lod);
+
+  platform::CPUPlace src_place;
+  const char* payload = s.data() + HEADER_SIZE + DESC_SIZE;
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), ptr, src_place,
+                 payload, copy_bytes);
+  }
+#ifdef PADDLE_WITH_GPU
+  if (platform::is_gpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), ptr, src_place,
+                 payload, copy_bytes);
+  }
+#endif
+}
+
 } // namespace framework
 } // namespace paddle
@@ -25,6 +25,7 @@
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace framework {
@@ -132,6 +133,27 @@ class LoDTensor : public Tensor {
  */
  void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
+ /**
+ * @brief Serialize tensor to char bytes.
+ * The result holds two size_t length fields, then the serialized
+ * LoDTensorProto (data type, dims, LoD levels, version), then the raw
+ * tensor bytes. Please check model_format.md for the format detail.
+ * NOTE: GPUTensor will copy data to cpu implicitly.
+ * @return the serialized bytes as a std::string.
+ */
+
+ // FIXME(dzh) : Currently, this interface should only be used in
+ // save/restore model and checkpoint. The ParameterServer does not use shape
+ // information to do the optimization; as a result, when we serialize a
+ // parameter/gradient to string, we should serialize the tensor
+ // to string in the ps trainer instead of LoDTensor.
+ std::string SerializeToString() const;
+
+ /**
+ * @brief Deserialize char bytes to tensor.
+ * Resizes this tensor, allocates its memory on dst_place, restores the
+ * LoD and copies the tensor bytes out of s.
+ * @param s bytes produced by SerializeToString().
+ * @param dst_place place on which the tensor memory is allocated.
+ */
+ void DeserializeFromString(const std::string& s,
+ const platform::Place& dst_place);
+
  private:
  LoD lod_;
 };
 
@@ -17,10 +17,13 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
+#include <vector>
 
 namespace paddle {
 namespace framework {
 
+const int kLodTensorSize = 20 * 128;
+
 class LoDTensorTester : public ::testing::Test {
  public:
  virtual void SetUp() override {
@@ -38,7 +41,10 @@ class LoDTensorTester : public ::testing::Test {
 
  lod_tensor_.Resize({20 /*batch size*/, 128 /*dim*/});
  // malloc memory
- lod_tensor_.mutable_data<float>(place);
+ float* dst_ptr = lod_tensor_.mutable_data<float>(place);
+ for (int i = 0; i < kLodTensorSize; ++i) {
+ dst_ptr[i] = i;
+ }
 
  lod_tensor_.set_lod(lod);
  }
@@ -101,5 +107,21 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 }
 
+// Round-trips the fixture tensor through SerializeToString() and
+// DeserializeFromString() on the CPU, then checks that both the element
+// values and the LoD structure survive.
+TEST_F(LoDTensorTester, SerializeDeserialize) {
+  const float* src_ptr = lod_tensor_.data<float>();
+  std::string s = lod_tensor_.SerializeToString();
+
+  LoDTensor dst;
+  dst.DeserializeFromString(s, platform::CPUPlace());
+
+  // Make sure the restored tensor is as large as the source before reading
+  // kLodTensorSize elements out of it.
+  ASSERT_EQ(dst.numel(), lod_tensor_.numel());
+
+  const float* dst_ptr = dst.data<float>();
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], src_ptr[i]);
+  }
+
+  // Element counts per LoD level of the restored tensor.
+  ASSERT_EQ(dst.NumElements(0), 2UL);
+  ASSERT_EQ(dst.NumElements(1), 3UL);
+  ASSERT_EQ(dst.NumElements(2), 8UL);
+}
+
 } // namespace framework
 } // namespace paddle
@@ -48,3 +48,30 @@ TEST(LoDTensor, LoDInGPU) {
  CHECK_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
  }
 }
+
+// Serializes a LoDTensor whose memory lives on GPU 0, deserializes it back
+// onto the same GPU, and checks the restored LoD offsets.
+// NOTE(review): only the LoD is verified; the tensor data is never
+// initialized or compared in this test.
+TEST(LoDTensor, SerializeDeserialize) {
+ paddle::framework::LoDTensor lod_tensor;
+ paddle::platform::GPUPlace place(0);
+
+ paddle::framework::LoD src_lod;
+ src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
+
+ lod_tensor.Resize({14, 16});
+ lod_tensor.mutable_data<float>(place);
+
+ lod_tensor.set_lod(src_lod);
+ CHECK_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
+ CHECK_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
+
+ // `test` is a kernel defined elsewhere in this file (not visible here).
+ // The final check below expects src_lod to hold twice the tensor's LoD
+ // values afterwards, so it presumably doubles src_lod in place — TODO confirm.
+ test<<<1, 8>>>(src_lod[0].data(), src_lod[0].size());
+ cudaDeviceSynchronize();
+
+ std::string s = lod_tensor.SerializeToString();
+ paddle::framework::LoDTensor dst;
+ dst.DeserializeFromString(s, place);
+ paddle::framework::LoD dst_lod = dst.lod();
+
+ // The round-tripped LoD is compared against the kernel-modified src_lod.
+ for (size_t i = 0; i < dst_lod[0].size(); ++i) {
+ CHECK_EQ(src_lod[0].data()[i], dst_lod[0].data()[i] * 2);
+ }
+}
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package paddle.framework;
+
+import "framework.proto";
+
+/**
+ * This file contains necessary information for model, checkpoint.
+ * etc.
+ */
+
+// The entries of a single LoD level; LoDTensor::SerializeToString writes one
+// LoDInfo per level of the tensor's LoD.
+message LoDInfo { repeated int64 level = 1; }
+
+/**
+ * Save the LoDTensorDesc information through LoDTensorProto; the tensor's
+ * data memory is copied to a C buffer directly. See model_format.md for details.
+ */
+
+// Header describing one serialized LoDTensor. It is written in front of the
+// raw tensor bytes by LoDTensor::SerializeToString.
+message LoDTensorProto {
+ // Element type of the tensor data.
+ optional DataType data_type = 1;
+ repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+ // One LoDInfo per LoD level, in level order.
+ repeated LoDInfo levels = 3;
+ // Number of LoD levels (the serializer stores NumLevels() here).
+ optional int32 lod_level = 4 [ default = 0 ];
+ // Format version; the serializer currently writes 0.
+ optional int32 version = 5;
+}
@@ -65,6 +65,23 @@ void Scope::DropKids() {
  kids_.clear();
 }
 
+// Collect the names of the variables held by this scope.
+//
+// @param recursive  when true, the names from every kid scope (and their
+//                   kids, transitively) are listed first, followed by this
+//                   scope's own names.
+// @return the collected variable names.
+std::vector<std::string> Scope::GetAllNames(bool recursive) const {
+  // reserve() rather than the sizing constructor: the latter would pre-fill
+  // the result with vars_.size() empty strings ahead of the real names.
+  std::vector<std::string> known_vars;
+  known_vars.reserve(vars_.size());
+
+  if (recursive) {
+    for (auto& kid : kids_) {
+      // Forward the flag explicitly so the walk does not depend on the
+      // parameter's default value and covers the whole scope tree.
+      auto kid_vars = kid->GetAllNames(true);
+      known_vars.insert(known_vars.end(), kid_vars.begin(), kid_vars.end());
+    }
+  }
+  for (auto& p : vars_) {
+    known_vars.emplace_back(p.first);
+  }
+  return known_vars;
+}
+
 void Scope::DeleteScope(Scope* scope) {
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);