Commit 3e114be

Authored by WeldonWangwang, nkogteva, wangleis, and songbell
[HETERO] support LLM and split model per available memory size (#21764)
### Details:
- Support LLM inference on the following device configurations:

  | device |
  |--|
  | HETERO:CPU |
  | HETERO:GPU |
  | HETERO:CPU,GPU |
  | HETERO:GPU,CPU |
  | HETERO:GPU.0,GPU.1 |
  | HETERO:GPU.0,GPU.1,CPU |
  | HETERO:GPU.0,GPU.1,GPU.2 |

- Use the `ov::hint::model_distribution_policy` property introduced in [PR23077](#23077)
- Use host memory for input/output data exchange between different subgraphs
- Mask supported and unsupported nodes into Subgraph ops in the graph and run query_model on those subgraphs, keeping the model seen by query_model the same as the one seen by compile
- Add a property `ov::query_model_ratio` to set the percentage of the model that can be queried during query_model
- Improve performance of some LLMs with large parameter counts by splitting the model across devices; the number of split models should be kept small to reduce communication overhead between multiple devices

### Tickets:
- *CVS-133258*

---------

Co-authored-by: Nadezhda <nadezhda.ageeva@intel.com>
Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
Co-authored-by: yanlan song <bell.song@intel.com>
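A hedged sketch of how this feature might be exercised from user code, based on the description above (the model path and device list are illustrative, not from this commit):

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // Illustrative model path; any LLM IR would do.
    std::shared_ptr<ov::Model> model = core.read_model("llm.xml");

    // Per this PR's description: PIPELINE_PARALLEL asks HETERO to split the
    // model across the listed devices according to their available memory.
    ov::CompiledModel compiled = core.compile_model(
        model,
        "HETERO:GPU.0,GPU.1,CPU",
        ov::hint::model_distribution_policy(
            {ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));

    ov::InferRequest request = compiled.create_infer_request();
    return 0;
}
```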
1 parent b250796 commit 3e114be

File tree

22 files changed: 949 additions & 78 deletions


src/bindings/python/src/pyopenvino/core/properties/properties.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -72,7 +72,8 @@ void regmodule_properties(py::module m) {
         .value("ECORE_ONLY", ov::hint::SchedulingCoreType::ECORE_ONLY);
 
     py::enum_<ov::hint::ModelDistributionPolicy>(m_hint, "ModelDistributionPolicy", py::arithmetic())
-        .value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL);
+        .value("TENSOR_PARALLEL", ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL)
+        .value("PIPELINE_PARALLEL", ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL);
 
     py::enum_<ov::hint::ExecutionMode>(m_hint, "ExecutionMode", py::arithmetic())
         .value("PERFORMANCE", ov::hint::ExecutionMode::PERFORMANCE)
```

src/core/include/openvino/core/any.hpp

Lines changed: 28 additions & 0 deletions
```diff
@@ -10,6 +10,7 @@
 
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <typeindex>
 #include <typeinfo>
@@ -209,6 +210,18 @@ struct Read<std::vector<T, A>, typename std::enable_if<std::is_default_construct
     }
 };
 
+template <typename K, typename C, typename A>
+struct Read<std::set<K, C, A>, typename std::enable_if<std::is_default_constructible<K>::value>::type> {
+    void operator()(std::istream& is, std::set<K, C, A>& set) const {
+        while (is.good()) {
+            std::string str;
+            is >> str;
+            auto v = from_string<K>(str);
+            set.insert(std::move(v));
+        }
+    }
+};
+
 template <typename K, typename T, typename C, typename A>
 struct Read<
     std::map<K, T, C, A>,
@@ -343,6 +356,21 @@ struct Write<std::vector<T, A>> {
     }
 };
 
+template <typename K, typename C, typename A>
+struct Write<std::set<K, C, A>> {
+    void operator()(std::ostream& os, const std::set<K, C, A>& set) const {
+        if (!set.empty()) {
+            std::size_t i = 0;
+            for (auto&& v : set) {
+                os << to_string(v);
+                if (i < (set.size() - 1))
+                    os << ' ';
+                ++i;
+            }
+        }
+    }
+};
+
 template <typename K, typename T, typename C, typename A>
 struct Write<std::map<K, T, C, A>> {
     void operator()(std::ostream& os, const std::map<K, T, C, A>& map) const {
```
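These specializations serialize a set as space-separated tokens. As a minimal sketch (not from the diff, and assuming the usual `ov::Any` string-conversion path through `Read`/`Write`), this is what they enable:

```cpp
#include <cassert>
#include <set>
#include <string>

#include "openvino/core/any.hpp"

int main() {
    // Write<std::set<int>> renders the set as space-separated tokens.
    ov::Any holds_set = std::set<int>{1, 2, 3};
    std::string text = holds_set.as<std::string>();  // "1 2 3"

    // Read<std::set<int>> parses the same representation back.
    ov::Any holds_string = text;
    auto parsed = holds_string.as<std::set<int>>();
    assert(parsed.count(2) == 1);
    return 0;
}
```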

src/core/tests/any.cpp

Lines changed: 17 additions & 0 deletions
```diff
@@ -158,6 +158,23 @@ TEST_F(AnyTests, AnyAsMapOfAnys) {
     ASSERT_EQ(refMap["testParamString"].as<std::string>(), testString);
 }
 
+TEST_F(AnyTests, AnyAsSetOfAnys) {
+    std::set<std::string> refSet0;
+    std::set<int> refSet1;
+    refSet0.insert("test");
+    refSet1.insert(4);
+    Any s0 = refSet0;
+    Any s1 = refSet1;
+    bool isSet0 = s0.is<std::set<std::string>>();
+    bool isSet1 = s1.is<std::set<int>>();
+    ASSERT_TRUE(isSet0);
+    ASSERT_TRUE(isSet1);
+    auto testSet0 = s0.as<std::set<std::string>>();
+    auto testSet1 = s1.as<std::set<int>>();
+    ASSERT_NE(testSet0.count("test"), 0);
+    ASSERT_NE(testSet1.count(4), 0);
+}
+
 TEST_F(AnyTests, AnyAsMapOfMapOfAnys) {
     std::map<std::string, Any> refMap1;
     refMap1["testParamInt"] = 4;
```

src/inference/dev_api/openvino/runtime/internal_properties.hpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -69,5 +69,12 @@ static constexpr Property<std::string, PropertyMutability::RO> compiled_model_ru
 static constexpr Property<bool, PropertyMutability::RO> compiled_model_runtime_properties_supported{
     "COMPILED_MODEL_RUNTIME_PROPERTIES_SUPPORTED"};
 
+/**
+ * @brief Read-write property to set the percentage of the estimated model size which is used to determine the query
+ * model results for further processing
+ * @ingroup ov_dev_api_plugin_api
+ */
+static constexpr Property<float, PropertyMutability::RW> query_model_ratio{"QUERY_MODEL_RATIO"};
+
 }  // namespace internal
 }  // namespace ov
```
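A hypothetical sketch of how a caller such as the HETERO plugin might use this internal property when probing devices; the exact call site is not part of this hunk, and passing the internal property through `ov::Core::query_model` here is an assumption for illustration:

```cpp
// Hypothetical usage: ask a device to claim only ~60% of the model's
// estimated size during query_model, leaving the remainder for the
// next device in the priority list.
ov::AnyMap device_config;
device_config[ov::internal::query_model_ratio.name()] = 0.6f;
ov::SupportedOpsMap supported_ops = core.query_model(model, "GPU.0", device_config);
```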

src/inference/dev_api/openvino/runtime/iplugin.hpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -230,12 +230,14 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this<IPlugin
  * @param model Original model
  * @param transform Transformation pipeline function
  * @param is_node_supported Function returning whether node is supported or not
+ * @param query_model_ratio The percentage of the model can be queried during query model (0 if not query)
  * @return Set of strings which contains supported node names
  */
 OPENVINO_RUNTIME_API std::unordered_set<std::string> get_supported_nodes(
     const std::shared_ptr<const ov::Model>& model,
     std::function<void(std::shared_ptr<ov::Model>&)> transform,
-    std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported);
+    std::function<bool(const std::shared_ptr<ov::Node>)> is_node_supported,
+    float query_model_ratio = 1.0f);
 
 /**
  * @private
```
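A hedged sketch of how a plugin's query_model might forward the new parameter; the transformation pipeline and support predicate below are illustrative, not from this diff:

```cpp
#include "openvino/pass/constant_folding.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/runtime/iplugin.hpp"

std::unordered_set<std::string> query_supported(const std::shared_ptr<const ov::Model>& model,
                                                float ratio) {
    return ov::get_supported_nodes(
        model,
        [](std::shared_ptr<ov::Model>& m) {
            // Illustrative: run the same transformations used for compilation.
            ov::pass::Manager manager;
            manager.register_pass<ov::pass::ConstantFolding>();
            manager.run_passes(m);
        },
        [](const std::shared_ptr<ov::Node> node) {
            // Illustrative support predicate; a real plugin checks its op set.
            return true;
        },
        ratio);  // new parameter: fraction of model size to claim (1.0f = all)
}
```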

src/inference/include/openvino/runtime/properties.hpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -400,16 +400,20 @@ inline std::istream& operator>>(std::istream& is, SchedulingCoreType& core_type)
400400
static constexpr Property<SchedulingCoreType> scheduling_core_type{"SCHEDULING_CORE_TYPE"};
401401

402402
enum class ModelDistributionPolicy {
403-
TENSOR_PARALLEL = 0, // Split tensor into several parts and distribute them between sockets/devices during model
404-
// compilation. At inference time sockets/devices process tensors in parallel and do
405-
// syncronization at the end ensuring mathematical correctness.
403+
TENSOR_PARALLEL = 0, // Distribute tensor to multiple sockets/devices during model compilation. At inference
404+
// time, sockets/devices process individual tensor in parallel.
405+
PIPELINE_PARALLEL = 1, // Distribute tensor to multiple sockets/devices during model compilation. At inference
406+
// time, sockets/devices process individual tensor one by one. And each socket/device
407+
// processes a portion of a different tensor in parallel.
406408
};
407409

408410
/** @cond INTERNAL */
409411
inline std::ostream& operator<<(std::ostream& os, const ModelDistributionPolicy& stream_mode) {
410412
switch (stream_mode) {
411413
case ModelDistributionPolicy::TENSOR_PARALLEL:
412414
return os << "TENSOR_PARALLEL";
415+
case ModelDistributionPolicy::PIPELINE_PARALLEL:
416+
return os << "PIPELINE_PARALLEL";
413417
default:
414418
OPENVINO_THROW("Unsupported model distribution policy!");
415419
}
@@ -420,6 +424,8 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
420424
is >> str;
421425
if (str == "TENSOR_PARALLEL") {
422426
stream_mode = ModelDistributionPolicy::TENSOR_PARALLEL;
427+
} else if (str == "PIPELINE_PARALLEL") {
428+
stream_mode = ModelDistributionPolicy::PIPELINE_PARALLEL;
423429
} else {
424430
OPENVINO_THROW("Unsupported model distribution policy: ", str);
425431
}
@@ -430,17 +436,19 @@ inline std::istream& operator>>(std::istream& is, ModelDistributionPolicy& strea
430436
/**
431437
* @brief This property defines model distribution policy for inference with multiple sockets/devices.
432438
* @ingroup ov_runtime_cpp_prop_api
433-
*
434439
* This property can be used to select model distribution policy between execution units (e.g. between CPU sockets/NUMA
435440
* nodes or between different GPUs).
436-
* -- TENSOR_PARALLEL : Split tensor into several parts and distribute them between sockets/devices during model
437-
* compilation. At inference time sockets/devices process tensors in parallel and do syncronization
438-
* at the end ensuring mathematical correctness.
441+
* -- TENSOR_PARALLEL : Distribute tensor to multiple sockets/devices during model compilation. At inference time,
442+
* sockets/devices process individual tensor in parallel.
443+
* -- PIPELINE_PARALLEL : Distribute tensor to multiple sockets/devices during model compilation. At inference time,
444+
* sockets/devices process individual tensor one by one. And each socket/device processes a
445+
* portion of a different tensor in parallel.
439446
*
440-
* The following code is an example how TENSOR_PARALLEL model disrtibution policy might be enabled.
447+
* The following code is an example how TENSOR_PARALLEL or PIPELINE_PARALLEL model distribution policy might be enabled.
441448
*
442449
* @code
443450
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL}));
451+
* ie.set_property(ov::hint::model_distribution_policy({ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));
444452
* @endcode
445453
*/
446454
static constexpr Property<std::set<ModelDistributionPolicy>> model_distribution_policy{"MODEL_DISTRIBUTION_POLICY"};
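Note that the property value is a `std::set`, which is exactly what the new `Read`/`Write<std::set>` specializations in any.hpp serialize. A brief hedged sketch of setting and reading it back (that HETERO reports this property is an assumption based on this PR's description):

```cpp
ov::Core core;
core.set_property("HETERO",
                  ov::hint::model_distribution_policy(
                      {ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL}));

// get_property returns the std::set<ModelDistributionPolicy> value.
auto policies = core.get_property("HETERO", ov::hint::model_distribution_policy);
bool pipeline = policies.count(ov::hint::ModelDistributionPolicy::PIPELINE_PARALLEL) > 0;
```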
