Skip to content

Commit 0695e1a

Browse files
authored
Add StringTensor (#39830)
* add string tensor and case convert kernels * Add strings empty kernel; Reorganize the structure of case convert kernel * Add string infermeta * Update mutable_data of string tensor * rename kernel name * add string copy tmp * Fix strings copy device bug * add utf8 gpu converter * add string tensor c++ api * Remove mutable_data of string tensor * update string tensor interface * remove charcases_flag.h * remove some fluid headers * Add make_ddim * __HIPCC__ -> PADDLE_WITH_HIP * remove fluid headers * fix cpu compile * remove std::hash * Fix cudaMalloc * Remove strings/impl directory * Fix infrt/get_phi_kernel_info.py;Add custom_kernels deps * Add empty kernel test * Remove some comments * Modify lower/upper api encoding type: string->bool * STRING->PSTRING; Add CreateInferLikeMeta * Add code gen for C++ String API * remove strings_api_utils.h * Add ignore file (strings_api.h, strings_api.cc) * update strings gen script * change args order of case convert kernels * Add comments for pstring, StringTensor * cpstring_internal.h -> cpstring_impl.h * Update according to comments: 1. Remove fluid headers 2. paddle::platform::errors -> phi::errors 3. Use 'place.GetType() == phi::AllocationType::GPU' instead of 'paddle::platform::is_cpu_space()' 4. Use camel code style * Remove all singletons in strings kernels * fix rocm compile * Fix py3 compile * Fix c++ coverage * 1. Add pstring proto type 2. Add StringTensor debug info 3. Rename case_convert_kernel to strings_lower_upper 4. Remove serialize/deserialize strings kernel * DataLayout::PSTRING -> DataLayout::PSTRING_UNION * Register pstring data type * Fix strings api gen * Fix dense tensor register pstring dtype * Fix error messages * remove line * add pstring unittest * remove test string api unittest * remove empty line * Remove some headers to decrease the size of executable file
1 parent 3b89542 commit 0695e1a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+7679
-24
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ paddle/phi/api/backward/backward_api.h
99
paddle/phi/api/backward/sparse_bw_api.h
1010
paddle/phi/api/include/api.h
1111
paddle/phi/api/include/sparse_api.h
12+
paddle/phi/api/include/strings_api.h
1213
paddle/phi/api/lib/api.cc
1314
paddle/phi/api/lib/dygraph_api.*
1415
paddle/phi/api/lib/backward_api.cc
1516
paddle/phi/api/lib/sparse_api.cc
17+
paddle/phi/api/lib/strings_api.cc
1618
paddle/phi/api/lib/sparse_bw_api.cc
1719
paddle/phi/extension.h
1820
paddle/phi/include/*

paddle/fluid/framework/convert_utils.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ paddle::experimental::DataType TransToPhiDataType(
4747
return DataType::BFLOAT16;
4848
case paddle::framework::proto::VarType::BOOL:
4949
return DataType::BOOL;
50+
case paddle::framework::proto::VarType::PSTRING:
51+
return DataType::PSTRING;
5052
default:
5153
return DataType::UNDEFINED;
5254
}
@@ -81,6 +83,8 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
8183
return paddle::framework::proto::VarType::BF16;
8284
case DataType::BOOL:
8385
return paddle::framework::proto::VarType::BOOL;
86+
case DataType::PSTRING:
87+
return paddle::framework::proto::VarType::PSTRING;
8488
default:
8589
PADDLE_THROW(paddle::platform::errors::Unimplemented(
8690
"Unsupported data type `%s` when casting it into "
@@ -117,6 +121,8 @@ size_t DataTypeSize(DataType dtype) {
117121
return sizeof(paddle::platform::complex<float>);
118122
case DataType::COMPLEX128:
119123
return sizeof(paddle::platform::complex<double>);
124+
case DataType::PSTRING:
125+
return sizeof(paddle::platform::pstring);
120126
default:
121127
return 0;
122128
}
@@ -145,6 +151,8 @@ DataType String2DataType(const std::string& str) {
145151
return DataType::COMPLEX64;
146152
} else if (str == "complex128") {
147153
return DataType::COMPLEX128;
154+
} else if (str == "pstring") {
155+
return DataType::PSTRING;
148156
} else if (str == "bfloat16") {
149157
return DataType::BFLOAT16;
150158
} else {
@@ -176,6 +184,8 @@ std::string DataType2String(DataType dtype) {
176184
return "complex64";
177185
case DataType::COMPLEX128:
178186
return "complex128";
187+
case DataType::PSTRING:
188+
return "pstring";
179189
case DataType::BFLOAT16:
180190
return "bfloat16";
181191
default:

paddle/fluid/framework/data_type.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818

1919
#include "paddle/fluid/platform/bfloat16.h"
2020
#include "paddle/fluid/platform/float16.h"
21+
#include "paddle/phi/common/pstring.h"
2122

2223
using float16 = paddle::platform::float16;
2324
using bfloat16 = paddle::platform::bfloat16;
25+
using pstring = phi::dtype::pstring;
2426

2527
namespace paddle {
2628
namespace framework {
@@ -58,7 +60,8 @@ static DataTypeMap* InitDataTypeMap() {
5860
RegisterType<cc_type>(retv, proto_type, #cc_type)
5961

6062
_ForEachDataType_(RegType);
61-
63+
// Register pstring individually
64+
RegType(pstring, proto::VarType::PSTRING);
6265
#undef RegType
6366
return retv;
6467
}

paddle/fluid/framework/framework.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ message VarType {
152152
STRINGS = 26;
153153
VOCAB = 27;
154154
FEED_LIST = 28;
155+
// The data type of phi::StringTensor
156+
PSTRING = 29;
155157
}
156158

157159
required Type type = 1;

paddle/phi/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ add_subdirectory(tools)
2323
add_subdirectory(tests)
2424

2525
# make an unity target for compile deps
26-
set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor)
26+
set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor)
2727
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
2828
set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
2929

paddle/phi/api/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
add_subdirectory(lib)
2-
cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api)
2+
cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api strings_api)

paddle/phi/api/lib/CMakeLists.txt

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_a
5151
set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp)
5252
set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp)
5353

54+
# strings api file
55+
set(strings_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py)
56+
set(strings_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml)
57+
set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h)
58+
set(strings_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/strings_api.cc)
59+
set(strings_api_header_file_tmp ${strings_api_header_file}.tmp)
60+
set(strings_api_source_file_tmp ${strings_api_source_file}.tmp)
61+
5462
# wrapped infermeta file
5563
set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
5664
set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
@@ -114,6 +122,19 @@ add_custom_command(
114122
DEPENDS ${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file}
115123
VERBATIM)
116124

125+
# generate strings api
126+
# Runs the strings API generator on strings_api.yaml. The generator writes to
# the *.tmp paths, and copy_if_different then updates the real header/source
# only when their content changed.
add_custom_command(
  OUTPUT ${strings_api_header_file} ${strings_api_source_file}
  COMMAND ${PYTHON_EXECUTABLE} ${strings_api_gen_file}
                   --api_yaml_path ${strings_api_yaml_file}
                   --api_header_path ${strings_api_header_file_tmp}
                   --api_source_path ${strings_api_source_file_tmp}
  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_header_file_tmp} ${strings_api_header_file}
  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${strings_api_source_file_tmp} ${strings_api_source_file}
  # The message previously referenced the undefined variable
  # strings_strings_api_source_file, which expanded to an empty string.
  COMMENT "copy_if_different ${strings_api_header_file} ${strings_api_source_file}"
  DEPENDS ${strings_api_yaml_file} ${strings_api_gen_file} ${api_gen_base} ${api_gen_file}
  VERBATIM)
137+
117138
# generate dygraph(intermediate) api
118139
add_custom_command(
119140
OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file}
@@ -152,5 +173,5 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
152173
cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
153174
cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
154175
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api)
155-
156-
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api)
176+
cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
177+
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api)

paddle/phi/api/lib/api_declare.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ limitations under the License. */
1919

2020
// PD_DECLARE_API(Math);
2121
// PD_DECLARE_API(SparseApi);
22+
// PD_DECLARE_API(StringsApi);

paddle/phi/api/lib/api_gen_utils.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
5656
return nullptr;
5757
}
5858

59+
std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor) {
60+
return std::dynamic_pointer_cast<phi::StringTensor>(tensor.impl());
61+
}
62+
5963
/* ----------------- for infer_meta --------------------- */
6064

6165
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
@@ -92,6 +96,10 @@ paddle::optional<phi::MetaTensor> MakeMetaTensor(
9296
return {paddle::none};
9397
}
9498

99+
// Builds the phi::MetaTensor used for infer_meta from a StringTensor by
// constructing it directly from `tensor`.
phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) {
  phi::MetaTensor meta_tensor(tensor);
  return meta_tensor;
}
102+
95103
/* ------------------ for output ----------------------- */
96104

97105
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
@@ -148,5 +156,20 @@ phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
148156
return out->impl().get();
149157
}
150158

159+
// Returns the raw implementation pointer of `out` for use as a strings kernel
// output. A fresh phi::StringTensor is installed first only when `out` is not
// initialized, `type` is STRING_TENSOR, and `out` holds no implementation yet.
// In every other case the existing implementation pointer is returned as-is
// (which may be null). `backend` is not read by this function.
phi::TensorBase* SetStringsKernelOutput(Backend backend,
                                        Tensor* out,
                                        TensorType type) {
  const bool needs_string_impl = !out->initialized() &&
                                 type == TensorType::STRING_TENSOR &&
                                 out->impl() == nullptr;
  if (needs_string_impl) {
    auto fresh_impl = std::make_shared<phi::StringTensor>();
    out->set_impl(fresh_impl);
  }
  return out->impl().get();
}
173+
151174
} // namespace experimental
152175
} // namespace paddle

paddle/phi/api/lib/api_gen_utils.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,12 @@ limitations under the License. */
2222
#include "paddle/phi/core/selected_rows.h"
2323
#include "paddle/phi/core/sparse_coo_tensor.h"
2424
#include "paddle/phi/core/sparse_csr_tensor.h"
25+
#include "paddle/phi/core/string_tensor.h"
2526

2627
namespace paddle {
2728
namespace experimental {
2829

29-
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
30+
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO, STRING_TENSOR };
3031

3132
/* ------------------ for input ----------------------- */
3233

@@ -43,6 +44,8 @@ std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
4344
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
4445
const paddle::optional<Tensor>& tensor);
4546

47+
std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
48+
4649
/* ----------------- for infer_meta --------------------- */
4750

4851
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
@@ -58,6 +61,8 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
5861
paddle::optional<phi::MetaTensor> MakeMetaTensor(
5962
const paddle::optional<const phi::SelectedRows&>& tensor);
6063

64+
phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor);
65+
6166
/* ------------------ for output ----------------------- */
6267

6368
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
@@ -70,5 +75,9 @@ phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
7075

7176
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
7277

78+
phi::TensorBase* SetStringsKernelOutput(Backend backend,
79+
Tensor* out,
80+
TensorType type);
81+
7382
} // namespace experimental
7483
} // namespace paddle

0 commit comments

Comments
 (0)