
Commit 9f0839e

Merge branch 'PaddlePaddle:develop' into flops

2 parents: e090a16 + 8b77f87

File tree: 1,107 files changed, +23,373 / -14,079 lines


cmake/configure.cmake

Lines changed: 6 additions & 4 deletions
```diff
@@ -31,10 +31,12 @@ endif(NOT WITH_PROFILER)
 if(WITH_AVX AND AVX_FOUND)
   set(SIMD_FLAG ${AVX_FLAG})
   add_definitions(-DPADDLE_WITH_AVX)
-elseif(SSE3_FOUND)
-  if(NOT WIN32)
-    set(SIMD_FLAG ${SSE3_FLAG})
-  endif()
+elseif(SSE3_FOUND AND NOT WIN32)
+  set(SIMD_FLAG ${SSE3_FLAG})
+endif()
+
+if (SSE3_FOUND)
+  # TODO: Runtime detection should be used here.
   add_definitions(-DPADDLE_WITH_SSE3)
 endif()
 
```
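The new `if (SSE3_FOUND)` block still only checks for SSE3 at configure time; the TODO asks for runtime detection. A minimal sketch of what that could look like, assuming a GCC/Clang toolchain (`__builtin_cpu_supports`; MSVC would need `__cpuid` from `<intrin.h>` instead) — not part of the commit:

```cpp
// Minimal runtime SSE3 check, as the TODO above suggests.
// Assumes GCC or Clang on x86; illustrative, not from the commit.
#include <cstdio>

int main() {
  if (__builtin_cpu_supports("sse3")) {
    std::printf("SSE3 available, taking the SIMD path\n");
  } else {
    std::printf("no SSE3, falling back to the scalar path\n");
  }
  return 0;
}
```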

cmake/external/libmct.cmake

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
   SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
-  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE)
+  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
 SET(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct")
```

cmake/external/llvm.cmake

Lines changed: 8 additions & 5 deletions
```diff
@@ -1,7 +1,7 @@
 include(FetchContent)
 
-set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz)
-set(LLVM_MD5 39d32b6be466781dddf5869318dcba53)
+set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz)
+set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e)
 
 set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm)
 set(FETCHCONTENT_QUIET OFF)
@@ -51,16 +51,18 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 # To build with MLIR, the LLVM is build from source code using the following flags:
 
 #[==[
-cmake -G Ninja ../llvm \
+cmake ../llvm -G "Unix Makefiles" \
   -DLLVM_ENABLE_PROJECTS="mlir;clang" \
   -DLLVM_BUILD_EXAMPLES=OFF \
   -DLLVM_TARGETS_TO_BUILD="X86" \
   -DCMAKE_BUILD_TYPE=Release \
   -DLLVM_ENABLE_ASSERTIONS=ON \
   -DLLVM_ENABLE_ZLIB=OFF \
   -DLLVM_ENABLE_RTTI=ON \
+  -DLLVM_INSTALL_UTILS=ON \
+  -DCMAKE_INSTALL_PREFIX=./install
 #]==]
-# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit)
+# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit)
 
 add_definitions(${LLVM_DEFINITIONS})
 
@@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS})
 
 
 # The minimum needed libraries for MLIR IR parse and transform.
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
+set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
 
 
 # tb_base is the name of a xxx.td file (without the .td suffix)
@@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base)
   mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
   if (mlir_tablegen_on_DIALECT)
     mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT})
+    mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT})
   endif()
   add_public_tablegen_target(${td_base}_IncGen)
   add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
```

cmake/external/protobuf.cmake

Lines changed: 4 additions & 0 deletions
```diff
@@ -207,6 +207,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   elseif(WITH_IPU)
     SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e)
+  elseif(WIN32)
+    SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    # Change the tag to support building with vs2019
+    SET(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792)
   else()
     SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
```

cmake/operators.cmake

Lines changed: 1 addition & 0 deletions
```diff
@@ -203,6 +203,7 @@ function(op_library TARGET)
     list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
     list(REMOVE_ITEM hip_srcs "qr_op.cu")
     list(REMOVE_ITEM hip_srcs "eigh_op.cu")
+    list(REMOVE_ITEM hip_srcs "lstsq_op.cu")
     list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
     list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
     hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
```

cmake/pten_kernel.cmake

Lines changed: 6 additions & 3 deletions
```diff
@@ -16,12 +16,12 @@
 function(kernel_declare TARGET_LIST)
   foreach(kernel_path ${TARGET_LIST})
     file(READ ${kernel_path} kernel_impl)
-    # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
+    # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL
     # NOTE(chenweihang): now we don't recommend to use digit in kernel name
-    string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
+    string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
     if (NOT first_registry STREQUAL "")
       # parse the first kernel name
-      string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
+      string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
       string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
       string(REPLACE "," "" kernel_name "${kernel_name}")
       string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
@@ -79,6 +79,9 @@ function(kernel_library TARGET)
   endif()
 
   list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+    list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+  endif()
   list(APPEND all_srcs ${common_srcs})
   list(APPEND all_srcs ${cpu_srcs})
   list(APPEND all_srcs ${gpu_srcs})
```
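For context, `kernel_declare` pulls the first kernel name out of each source file with the regex above. A small sketch reproducing the match with `std::regex`; the sample `PT_REGISTER_KERNEL` line is hypothetical, not taken from the commit:

```cpp
// Demonstrates what the renamed CMake regex matches; the sample
// registration line below is illustrative only.
#include <iostream>
#include <regex>
#include <string>

int main() {
  std::string kernel_impl =
      "PT_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, pten::Scale, float) {}";
  std::regex first_registry(
      R"((PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\([ \t\r\n]*[a-z0-9_]*,)");
  std::smatch m;
  if (std::regex_search(kernel_impl, m, first_registry)) {
    std::cout << m.str() << "\n";  // prints: PT_REGISTER_KERNEL(scale,
  }
  return 0;
}
```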
Lines changed: 95 additions & 0 deletions
```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <glog/logging.h>

namespace paddle {
namespace distributed {

// Fast allocation and deallocation of objects by allocating them in chunks.
template <class T>
class ChunkAllocator {
 public:
  explicit ChunkAllocator(size_t chunk_size = 64) {
    CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T)));
    _chunk_size = chunk_size;
    _chunks = NULL;
    _free_nodes = NULL;
    _counter = 0;
  }
  ChunkAllocator(const ChunkAllocator&) = delete;
  ~ChunkAllocator() {
    while (_chunks != NULL) {
      Chunk* x = _chunks;
      _chunks = _chunks->next;
      free(x);
    }
  }
  template <class... ARGS>
  T* acquire(ARGS&&... args) {
    if (_free_nodes == NULL) {
      create_new_chunk();
    }

    T* x = (T*)(void*)_free_nodes;  // NOLINT
    _free_nodes = _free_nodes->next;
    new (x) T(std::forward<ARGS>(args)...);
    _counter++;
    return x;
  }
  void release(T* x) {
    x->~T();
    Node* node = (Node*)(void*)x;  // NOLINT
    node->next = _free_nodes;
    _free_nodes = node;
    _counter--;
  }
  size_t size() const { return _counter; }

 private:
  struct alignas(T) Node {
    union {
      Node* next;
      char data[sizeof(T)];
    };
  };
  struct Chunk {
    Chunk* next;
    Node nodes[];
  };

  size_t _chunk_size;  // how many elements in one chunk
  Chunk* _chunks;      // a list
  Node* _free_nodes;   // a list
  size_t _counter;     // how many elements are acquired

  void create_new_chunk() {
    Chunk* chunk;
    posix_memalign(reinterpret_cast<void**>(&chunk),
                   std::max<size_t>(sizeof(void*), alignof(Chunk)),
                   sizeof(Chunk) + sizeof(Node) * _chunk_size);
    chunk->next = _chunks;
    _chunks = chunk;

    for (size_t i = 0; i < _chunk_size; i++) {
      Node* node = &chunk->nodes[i];
      node->next = _free_nodes;
      _free_nodes = node;
    }
  }
};

}  // namespace distributed
}  // namespace paddle
```
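A short usage sketch for the allocator: `acquire` placement-constructs an object in a recycled slot, `release` destroys it and pushes the slot back onto the free list; chunk memory is only returned to the OS when the allocator itself is destroyed. The `FeatureValue` type and the values below are hypothetical:

```cpp
// Hypothetical use of ChunkAllocator; assumes the header above is included.
#include <cstdint>

struct FeatureValue {
  uint64_t key;
  float weight;
  FeatureValue(uint64_t k, float w) : key(k), weight(w) {}
};

int main() {
  // 64 objects per chunk; a new chunk is carved out when the free list is empty.
  paddle::distributed::ChunkAllocator<FeatureValue> alloc(64);

  FeatureValue* v = alloc.acquire(uint64_t{42}, 0.5f);  // placement-new in a slot
  alloc.release(v);  // runs ~FeatureValue() and recycles the slot

  return 0;
}
```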
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# 目录说明
2+
3+
> 干掉原来的 index_dataset 目录
4+
dataset 抽样工具类
5+
用户自定义数据处理so
6+
流式dataserver相关类

paddle/fluid/distributed/fleet.cc

Lines changed: 16 additions & 23 deletions
```diff
@@ -460,25 +460,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
       clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
   CHECK(clk_size == batch_size || clk_size == 1);
 
-  std::vector<float> g;
-  for (framework::LoDTensor* g_tensor : *outputs) {
-    float* g_ori = g_tensor->data<float>();
-    // no cvm
-    if (batch_size_consist) {  // TODO(zhaocaibei123): add config
-      // scale_sparse_gradient_with_batch_size_
-      Eigen::Map<
-          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          g_mat(g_ori, g_tensor->numel() / fea_dim, fea_dim);
-      g_mat.rightCols(fea_dim) *= batch_size;
-    }
-
-    size_t origin = g.size();
-    size_t add = g_tensor->numel();
-    g.resize(origin + add);
-
-    memcpy(g.data() + origin, g_tensor->data<float>(), add * sizeof(float));
-  }
-
+  CHECK(outputs->size() == inputs->size());
   std::vector<uint64_t> push_keys;
   push_keys.reserve(MAX_FEASIGN_NUM / 100);
   std::vector<std::vector<float>> push_values;
@@ -495,9 +477,21 @@ void FleetWrapper::PushSparseFromTensorAsync(
   const int64_t* clk_tensor = clks->data<int64_t>();
 
   for (size_t index = 0; index < inputs->size(); ++index) {
+    framework::LoDTensor* g_tensor = outputs->at(index);
+    float* g = g_tensor->data<float>();
+    // no cvm
+    if (batch_size_consist) {  // TODO(zhaocaibei123): add config
+      // scale_sparse_gradient_with_batch_size_
+      Eigen::Map<
+          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+          g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
+      g_mat.rightCols(fea_dim) *= batch_size;
+    }
+
     const framework::LoDTensor* tensor = inputs->at(index);
     const int64_t* ids = tensor->data<int64_t>();
     size_t len = tensor->numel();
+    output_len = 0;
 
     if (tensor->lod().size() > 0) {
       for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) {
@@ -519,7 +513,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
 
         float* data = push_values.back().data() + 3;
 
-        memcpy(data, g.data() + output_len, sizeof(float) * fea_dim);
+        memcpy(data, g + output_len, sizeof(float) * fea_dim);
 
         ++input_idx;
       }
@@ -542,14 +536,13 @@ void FleetWrapper::PushSparseFromTensorAsync(
 
         float* data = push_values.back().data() + 3;
 
-        memcpy(data, g.data() + output_len, sizeof(float) * fea_dim);
+        memcpy(data, g + output_len, sizeof(float) * fea_dim);
 
         ++input_idx;
       }
     }
+    CHECK(output_len == g_tensor->numel());
   }
-  VLOG(1) << "output_len: " << output_len << " g.size(): " << g.size();
-  CHECK(output_len == g.size());
 
   std::vector<float*> push_g_vec(input_idx, nullptr);
 
```
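The refactor drops the intermediate `g` buffer and instead scales each gradient tensor in place inside the per-input loop. A self-contained sketch of that Eigen scaling step; the buffer shape and values are hypothetical, and only the `Eigen::Map` pattern mirrors the diff:

```cpp
// In-place batch-size scaling of a flat [rows x fea_dim] gradient buffer,
// mirroring the Eigen::Map usage in the diff; sizes are illustrative.
#include <Eigen/Dense>

#include <vector>

int main() {
  const int fea_dim = 4;      // feature width per row
  const int batch_size = 32;  // scale factor
  std::vector<float> g(2 * fea_dim, 1.0f);  // two rows of gradients

  // View the raw buffer as a row-major matrix, then scale every column.
  Eigen::Map<
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      g_mat(g.data(), static_cast<Eigen::Index>(g.size()) / fea_dim, fea_dim);
  g_mat.rightCols(fea_dim) *= batch_size;  // rightCols(fea_dim) spans all columns

  return 0;  // every element of g is now 32.0f
}
```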

paddle/fluid/distributed/fleet.h

Lines changed: 0 additions & 1 deletion
```diff
@@ -36,7 +36,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-class LoDTensor;
 class Scope;
 class SelectedRows;
 class Variable;
```
