
Commit 9f0839e

Merge branch 'PaddlePaddle:develop' into flops

2 parents: e090a16 + 8b77f87

File tree: 1,107 files changed, +23,373 / -14,079 lines


cmake/configure.cmake

Lines changed: 6 additions & 4 deletions
```diff
@@ -31,10 +31,12 @@ endif(NOT WITH_PROFILER)
 if(WITH_AVX AND AVX_FOUND)
   set(SIMD_FLAG ${AVX_FLAG})
   add_definitions(-DPADDLE_WITH_AVX)
-elseif(SSE3_FOUND)
-  if(NOT WIN32)
-    set(SIMD_FLAG ${SSE3_FLAG})
-  endif()
+elseif(SSE3_FOUND AND NOT WIN32)
+  set(SIMD_FLAG ${SSE3_FLAG})
+endif()
+
+if (SSE3_FOUND)
+  # TODO: Runtime detection should be used here.
   add_definitions(-DPADDLE_WITH_SSE3)
 endif()
 
```
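The new `if (SSE3_FOUND)` block still only checks for SSE3 at configure time; the TODO asks for runtime detection. A minimal sketch of what that could look like, assuming a GCC/Clang toolchain (`__builtin_cpu_supports`; MSVC would need `__cpuid` from `<intrin.h>` instead) — not part of the commit:

```cpp
// Minimal runtime SSE3 check, as the TODO above suggests.
// Assumes GCC or Clang on x86; illustrative, not from the commit.
#include <cstdio>

int main() {
  if (__builtin_cpu_supports("sse3")) {
    std::printf("SSE3 available, taking the SIMD path\n");
  } else {
    std::printf("no SSE3, falling back to the scalar path\n");
  }
  return 0;
}
```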

cmake/external/libmct.cmake

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
   SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
-  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE)
+  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct/libmct.tar.gz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
 SET(LIBMCT_PREFIX_DIR "${THIRD_PARTY_PATH}/libmct")
```

cmake/external/llvm.cmake

Lines changed: 8 additions & 5 deletions
```diff
@@ -1,7 +1,7 @@
 include(FetchContent)
 
-set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz)
-set(LLVM_MD5 39d32b6be466781dddf5869318dcba53)
+set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz)
+set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e)
 
 set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm)
 set(FETCHCONTENT_QUIET OFF)
@@ -51,16 +51,18 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 # To build with MLIR, the LLVM is build from source code using the following flags:
 
 #[==[
-cmake -G Ninja ../llvm \
+cmake ../llvm -G "Unix Makefiles" \
   -DLLVM_ENABLE_PROJECTS="mlir;clang" \
   -DLLVM_BUILD_EXAMPLES=OFF \
   -DLLVM_TARGETS_TO_BUILD="X86" \
   -DCMAKE_BUILD_TYPE=Release \
   -DLLVM_ENABLE_ASSERTIONS=ON \
   -DLLVM_ENABLE_ZLIB=OFF \
   -DLLVM_ENABLE_RTTI=ON \
+  -DLLVM_INSTALL_UTILS=ON \
+  -DCMAKE_INSTALL_PREFIX=./install
 #]==]
-# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit)
+# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit)
 
 add_definitions(${LLVM_DEFINITIONS})
 
@@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS})
 
 
 # The minimum needed libraries for MLIR IR parse and transform.
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
+set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
 
 
 # tb_base is the name of a xxx.td file (without the .td suffix)
@@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base)
   mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
   if (mlir_tablegen_on_DIALECT)
     mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT})
+    mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT})
   endif()
   add_public_tablegen_target(${td_base}_IncGen)
   add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
```

cmake/external/protobuf.cmake

Lines changed: 4 additions & 0 deletions
```diff
@@ -207,6 +207,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   elseif(WITH_IPU)
     SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e)
+  elseif(WIN32)
+    SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+    # Change the tag to support building with vs2019
+    SET(PROTOBUF_TAG 01a05a53f40ca2ac5f0af10c6cc0810bee39b792)
   else()
     SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
```

cmake/operators.cmake

Lines changed: 1 addition & 0 deletions
```diff
@@ -203,6 +203,7 @@ function(op_library TARGET)
     list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
     list(REMOVE_ITEM hip_srcs "qr_op.cu")
     list(REMOVE_ITEM hip_srcs "eigh_op.cu")
+    list(REMOVE_ITEM hip_srcs "lstsq_op.cu")
     list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
     list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
     hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
```

cmake/pten_kernel.cmake

Lines changed: 6 additions & 3 deletions
```diff
@@ -16,12 +16,12 @@
 function(kernel_declare TARGET_LIST)
   foreach(kernel_path ${TARGET_LIST})
     file(READ ${kernel_path} kernel_impl)
-    # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
+    # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL
     # NOTE(chenweihang): now we don't recommend to use digit in kernel name
-    string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
+    string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
     if (NOT first_registry STREQUAL "")
       # parse the first kernel name
-      string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
+      string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
       string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
       string(REPLACE "," "" kernel_name "${kernel_name}")
       string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
@@ -79,6 +79,9 @@ function(kernel_library TARGET)
   endif()
 
   list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h)
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+    list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/impl/${TARGET}_impl.h)
+  endif()
   list(APPEND all_srcs ${common_srcs})
   list(APPEND all_srcs ${cpu_srcs})
   list(APPEND all_srcs ${gpu_srcs})
```
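For context, `kernel_declare` pulls the first kernel name out of each source file with the regex above. A small sketch reproducing the match with `std::regex`; the sample `PT_REGISTER_KERNEL` line is hypothetical, not taken from the commit:

```cpp
// Demonstrates what the renamed CMake regex matches; the sample
// registration line below is illustrative only.
#include <iostream>
#include <regex>
#include <string>

int main() {
  std::string kernel_impl =
      "PT_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, pten::Scale, float) {}";
  std::regex first_registry(
      R"((PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\([ \t\r\n]*[a-z0-9_]*,)");
  std::smatch m;
  if (std::regex_search(kernel_impl, m, first_registry)) {
    std::cout << m.str() << "\n";  // prints: PT_REGISTER_KERNEL(scale,
  }
  return 0;
}
```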
Lines changed: 95 additions & 0 deletions
```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <glog/logging.h>

namespace paddle {
namespace distributed {

// Fast allocation and deallocation of objects by allocating them in chunks.
template <class T>
class ChunkAllocator {
 public:
  explicit ChunkAllocator(size_t chunk_size = 64) {
    CHECK(sizeof(Node) == std::max(sizeof(void*), sizeof(T)));
    _chunk_size = chunk_size;
    _chunks = NULL;
    _free_nodes = NULL;
    _counter = 0;
  }
  ChunkAllocator(const ChunkAllocator&) = delete;
  ~ChunkAllocator() {
    while (_chunks != NULL) {
      Chunk* x = _chunks;
      _chunks = _chunks->next;
      free(x);
    }
  }
  template <class... ARGS>
  T* acquire(ARGS&&... args) {
    if (_free_nodes == NULL) {
      create_new_chunk();
    }

    T* x = (T*)(void*)_free_nodes;  // NOLINT
    _free_nodes = _free_nodes->next;
    new (x) T(std::forward<ARGS>(args)...);
    _counter++;
    return x;
  }
  void release(T* x) {
    x->~T();
    Node* node = (Node*)(void*)x;  // NOLINT
    node->next = _free_nodes;
    _free_nodes = node;
    _counter--;
  }
  size_t size() const { return _counter; }

 private:
  struct alignas(T) Node {
    union {
      Node* next;
      char data[sizeof(T)];
    };
  };
  struct Chunk {
    Chunk* next;
    Node nodes[];
  };

  size_t _chunk_size;  // how many elements in one chunk
  Chunk* _chunks;      // a list
  Node* _free_nodes;   // a list
  size_t _counter;     // how many elements are acquired

  void create_new_chunk() {
    Chunk* chunk;
    posix_memalign(reinterpret_cast<void**>(&chunk),
                   std::max<size_t>(sizeof(void*), alignof(Chunk)),
                   sizeof(Chunk) + sizeof(Node) * _chunk_size);
    chunk->next = _chunks;
    _chunks = chunk;

    for (size_t i = 0; i < _chunk_size; i++) {
      Node* node = &chunk->nodes[i];
      node->next = _free_nodes;
      _free_nodes = node;
    }
  }
};

}  // namespace distributed
}  // namespace paddle
```
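A short usage sketch for the allocator: `acquire` placement-constructs an object in a recycled slot, `release` destroys it and pushes the slot back onto the free list; chunk memory is only returned to the OS when the allocator itself is destroyed. The `FeatureValue` type and the values below are hypothetical:

```cpp
// Hypothetical use of ChunkAllocator; assumes the header above is included.
#include <cstdint>

struct FeatureValue {
  uint64_t key;
  float weight;
  FeatureValue(uint64_t k, float w) : key(k), weight(w) {}
};

int main() {
  // 64 objects per chunk; a new chunk is carved out when the free list is empty.
  paddle::distributed::ChunkAllocator<FeatureValue> alloc(64);

  FeatureValue* v = alloc.acquire(uint64_t{42}, 0.5f);  // placement-new in a slot
  alloc.release(v);  // runs ~FeatureValue() and recycles the slot

  return 0;
}
```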
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# 目录说明
2+
3+
> 干掉原来的 index_dataset 目录
4+
dataset 抽样工具类
5+
用户自定义数据处理so
6+
流式dataserver相关类

paddle/fluid/distributed/fleet.cc

Lines changed: 16 additions & 23 deletions
```diff
@@ -460,25 +460,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
       clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
   CHECK(clk_size == batch_size || clk_size == 1);
 
-  std::vector<float> g;
-  for (framework::LoDTensor* g_tensor : *outputs) {
-    float* g_ori = g_tensor->data<float>();
-    // no cvm
-    if (batch_size_consist) {  // TODO(zhaocaibei123): add config
-      // scale_sparse_gradient_with_batch_size_
-      Eigen::Map<
-          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          g_mat(g_ori, g_tensor->numel() / fea_dim, fea_dim);
-      g_mat.rightCols(fea_dim) *= batch_size;
-    }
-
-    size_t origin = g.size();
-    size_t add = g_tensor->numel();
-    g.resize(origin + add);
-
-    memcpy(g.data() + origin, g_tensor->data<float>(), add * sizeof(float));
-  }
-
+  CHECK(outputs->size() == inputs->size());
   std::vector<uint64_t> push_keys;
   push_keys.reserve(MAX_FEASIGN_NUM / 100);
   std::vector<std::vector<float>> push_values;
@@ -495,9 +477,21 @@ void FleetWrapper::PushSparseFromTensorAsync(
   const int64_t* clk_tensor = clks->data<int64_t>();
 
   for (size_t index = 0; index < inputs->size(); ++index) {
+    framework::LoDTensor* g_tensor = outputs->at(index);
+    float* g = g_tensor->data<float>();
+    // no cvm
+    if (batch_size_consist) {  // TODO(zhaocaibei123): add config
+      // scale_sparse_gradient_with_batch_size_
+      Eigen::Map<
+          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+          g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
+      g_mat.rightCols(fea_dim) *= batch_size;
+    }
+
     const framework::LoDTensor* tensor = inputs->at(index);
     const int64_t* ids = tensor->data<int64_t>();
     size_t len = tensor->numel();
+    output_len = 0;
 
     if (tensor->lod().size() > 0) {
       for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) {
@@ -519,7 +513,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
 
         float* data = push_values.back().data() + 3;
 
-        memcpy(data, g.data() + output_len, sizeof(float) * fea_dim);
+        memcpy(data, g + output_len, sizeof(float) * fea_dim);
 
         ++input_idx;
       }
@@ -542,14 +536,13 @@ void FleetWrapper::PushSparseFromTensorAsync(
 
         float* data = push_values.back().data() + 3;
 
-        memcpy(data, g.data() + output_len, sizeof(float) * fea_dim);
+        memcpy(data, g + output_len, sizeof(float) * fea_dim);
 
         ++input_idx;
       }
     }
+    CHECK(output_len == g_tensor->numel());
   }
-  VLOG(1) << "output_len: " << output_len << " g.size(): " << g.size();
-  CHECK(output_len == g.size());
 
   std::vector<float*> push_g_vec(input_idx, nullptr);
 
```
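The refactor drops the intermediate `g` buffer and instead scales each gradient tensor in place inside the per-input loop. A self-contained sketch of that Eigen scaling step; the buffer shape and values are hypothetical, and only the `Eigen::Map` pattern mirrors the diff:

```cpp
// In-place batch-size scaling of a flat [rows x fea_dim] gradient buffer,
// mirroring the Eigen::Map usage in the diff; sizes are illustrative.
#include <Eigen/Dense>

#include <vector>

int main() {
  const int fea_dim = 4;      // feature width per row
  const int batch_size = 32;  // scale factor
  std::vector<float> g(2 * fea_dim, 1.0f);  // two rows of gradients

  // View the raw buffer as a row-major matrix, then scale every column.
  Eigen::Map<
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      g_mat(g.data(), static_cast<Eigen::Index>(g.size()) / fea_dim, fea_dim);
  g_mat.rightCols(fea_dim) *= batch_size;  // rightCols(fea_dim) spans all columns

  return 0;  // every element of g is now 32.0f
}
```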

paddle/fluid/distributed/fleet.h

Lines changed: 0 additions & 1 deletion
```diff
@@ -36,7 +36,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-class LoDTensor;
 class Scope;
 class SelectedRows;
 class Variable;
```
