Skip to content

Commit 0f39e01

Browse files
slawekptakvinser52
andauthored
[SYCL] Extend the handler-less path to API functions with event dependencies (intel#20461)
The kernel submission APIs with event dependencies are switched to use the handler-less kernel submission path. --------- Co-authored-by: Sergei Vinogradov <sergey.vinogradov@intel.com>
1 parent 61cf28d commit 0f39e01

18 files changed

+338
-128
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ void single_task(queue Q, const KernelType &KernelObj,
157157
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
158158
void>::value)) {
159159
detail::submit_kernel_direct_single_task<KernelName>(
160-
std::move(Q), KernelObj, empty_properties_t{}, CodeLoc);
160+
std::move(Q), KernelObj, {}, empty_properties_t{}, CodeLoc);
161161
} else {
162162
submit(
163163
std::move(Q),
@@ -312,7 +312,7 @@ void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
312312
LaunchConfigAccess(Config);
313313

314314
detail::submit_kernel_direct_parallel_for<KernelName>(
315-
std::move(Q), LaunchConfigAccess.getRange(), KernelObj,
315+
std::move(Q), LaunchConfigAccess.getRange(), KernelObj, {},
316316
LaunchConfigAccess.getProperties());
317317
} else {
318318
submit(std::move(Q), [&](handler &CGH) {

sycl/include/sycl/khr/free_function_commands.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ void launch_task(const sycl::queue &q, KernelType &&k,
317317
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
318318
void>::value)) {
319319
detail::submit_kernel_direct_single_task(
320-
q, std::forward<KernelType>(k),
320+
q, std::forward<KernelType>(k), {},
321321
ext::oneapi::experimental::empty_properties_t{}, codeLoc);
322322
} else {
323323
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);

sycl/include/sycl/queue.hpp

Lines changed: 90 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include <sycl/nd_range.hpp> // for nd_range
4141
#include <sycl/property_list.hpp> // for property_list
4242
#include <sycl/range.hpp> // for range
43+
#include <sycl/sycl_span.hpp> // for sycl::span
4344

4445
#include <cstddef> // for size_t
4546
#include <functional> // for function
@@ -68,6 +69,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
6869
const queue &Queue, const nd_range<Dims> &Range,
6970
detail::HostKernelRefBase &HostKernel,
7071
detail::DeviceKernelInfo *DeviceKernelInfo,
72+
sycl::span<const event> DepEvents,
7173
const detail::KernelPropertyHolderStructTy &Props,
7274
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
7375

@@ -76,6 +78,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
7678
const queue &Queue, const nd_range<Dims> &Range,
7779
detail::HostKernelRefBase &HostKernel,
7880
detail::DeviceKernelInfo *DeviceKernelInfo,
81+
sycl::span<const event> DepEvents,
7982
const detail::KernelPropertyHolderStructTy &Props,
8083
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
8184

@@ -165,7 +168,7 @@ template <detail::WrapAs WrapAs, typename LambdaArgType,
165168
typename KernelTypeUniversalRef, int Dims>
166169
auto submit_kernel_direct(
167170
const queue &Queue, const nd_range<Dims> &Range,
168-
KernelTypeUniversalRef &&KernelFunc,
171+
KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents,
169172
const PropertiesT &ExtraProps =
170173
ext::oneapi::experimental::empty_properties_t{},
171174
const detail::code_location &CodeLoc = detail::code_location::current()) {
@@ -230,12 +233,14 @@ auto submit_kernel_direct(
230233

231234
if constexpr (EventNeeded) {
232235
return submit_kernel_direct_with_event_impl(
233-
Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
234-
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
236+
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
237+
ParsedProperties, TlsCodeLocCapture.query(),
238+
TlsCodeLocCapture.isToplevel());
235239
} else {
236240
submit_kernel_direct_without_event_impl(
237-
Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
238-
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
241+
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
242+
ParsedProperties, TlsCodeLocCapture.query(),
243+
TlsCodeLocCapture.isToplevel());
239244
}
240245
}
241246

@@ -244,7 +249,7 @@ template <typename KernelName = detail::auto_name, bool EventNeeded = false,
244249
typename KernelTypeUniversalRef, int Dims>
245250
auto submit_kernel_direct_parallel_for(
246251
const queue &Queue, const nd_range<Dims> &Range,
247-
KernelTypeUniversalRef &&KernelFunc,
252+
KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents = {},
248253
const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
249254
const detail::code_location &CodeLoc = detail::code_location::current()) {
250255

@@ -266,23 +271,25 @@ auto submit_kernel_direct_parallel_for(
266271
return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
267272
KernelName, EventNeeded, PropertiesT,
268273
KernelTypeUniversalRef, Dims>(
269-
Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), Props,
270-
CodeLoc);
274+
Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents,
275+
Props, CodeLoc);
271276
}
272277

273278
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
274279
typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
275280
typename KernelTypeUniversalRef>
276281
auto submit_kernel_direct_single_task(
277282
const queue &Queue, KernelTypeUniversalRef &&KernelFunc,
283+
sycl::span<const event> DepEvents = {},
278284
const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
279285
const detail::code_location &CodeLoc = detail::code_location::current()) {
280286

281287
return submit_kernel_direct<detail::WrapAs::single_task, void, KernelName,
282288
EventNeeded, PropertiesT, KernelTypeUniversalRef,
283289
1>(
284290
Queue, nd_range<1>{1, 1},
285-
std::forward<KernelTypeUniversalRef>(KernelFunc), Props, CodeLoc);
291+
std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
292+
CodeLoc);
286293
}
287294

288295
} // namespace detail
@@ -2802,7 +2809,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28022809
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
28032810
void>::value)) {
28042811
return detail::submit_kernel_direct_single_task<KernelName, true>(
2805-
*this, KernelFunc, Properties, TlsCodeLocCapture.query());
2812+
*this, KernelFunc, {}, Properties, TlsCodeLocCapture.query());
28062813
} else {
28072814
return submit(
28082815
[&](handler &CGH) {
@@ -2852,13 +2859,23 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28522859
"Use queue.submit() instead");
28532860

28542861
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2855-
return submit(
2856-
[&](handler &CGH) {
2857-
CGH.depends_on(DepEvent);
2858-
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2859-
Properties, KernelFunc);
2860-
},
2861-
TlsCodeLocCapture.query());
2862+
2863+
// TODO The handler-less path does not support kernel functions
2864+
// with the kernel_handler type argument yet.
2865+
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
2866+
void>::value)) {
2867+
return detail::submit_kernel_direct_single_task<KernelName, true>(
2868+
*this, KernelFunc, sycl::span<const event>(&DepEvent, 1), Properties,
2869+
TlsCodeLocCapture.query());
2870+
} else {
2871+
return submit(
2872+
[&](handler &CGH) {
2873+
CGH.depends_on(DepEvent);
2874+
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2875+
Properties, KernelFunc);
2876+
},
2877+
TlsCodeLocCapture.query());
2878+
}
28622879
}
28632880

28642881
/// single_task version with a kernel represented as a lambda.
@@ -2903,13 +2920,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
29032920
"Use queue.submit() instead");
29042921

29052922
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2906-
return submit(
2907-
[&](handler &CGH) {
2908-
CGH.depends_on(DepEvents);
2909-
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2910-
Properties, KernelFunc);
2911-
},
2912-
TlsCodeLocCapture.query());
2923+
2924+
// TODO The handler-less path does not support kernel functions
2925+
// with the kernel_handler type argument yet.
2926+
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
2927+
void>::value)) {
2928+
return detail::submit_kernel_direct_single_task<KernelName, true>(
2929+
*this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query());
2930+
} else {
2931+
return submit(
2932+
[&](handler &CGH) {
2933+
CGH.depends_on(DepEvents);
2934+
CGH.template single_task<KernelName, KernelType, PropertiesT>(
2935+
Properties, KernelFunc);
2936+
},
2937+
TlsCodeLocCapture.query());
2938+
}
29132939
}
29142940

29152941
/// single_task version with a kernel represented as a lambda.
@@ -3348,7 +3374,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
33483374
KernelType, sycl::nd_item<Dims>>::value)) {
33493375

33503376
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
3351-
*this, Range, Rest..., Properties, TlsCodeLocCapture.query());
3377+
*this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query());
33523378
} else
33533379
return submit(
33543380
[&](handler &CGH) {
@@ -3377,7 +3403,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
33773403
!(detail::KernelLambdaHasKernelHandlerArgT<
33783404
KernelType, sycl::nd_item<Dims>>::value)) {
33793405
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
3380-
*this, Range, Rest...,
3406+
*this, Range, Rest..., {},
33813407
ext::oneapi::experimental::empty_properties_t{},
33823408
TlsCodeLocCapture.query());
33833409
} else {
@@ -3431,12 +3457,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
34313457
parallel_for(nd_range<Dims> Range, event DepEvent, RestT &&...Rest) {
34323458
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
34333459
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
3434-
return submit(
3435-
[&](handler &CGH) {
3436-
CGH.depends_on(DepEvent);
3437-
CGH.template parallel_for<KernelName>(Range, Rest...);
3438-
},
3439-
TlsCodeLocCapture.query());
3460+
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
3461+
3462+
// TODO The handler-less path does not support reductions, and
3463+
// kernel functions with the kernel_handler type argument yet.
3464+
if constexpr (sizeof...(RestT) == 1 &&
3465+
!(detail::KernelLambdaHasKernelHandlerArgT<
3466+
KernelType, sycl::nd_item<Dims>>::value)) {
3467+
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
3468+
*this, Range, Rest..., sycl::span<const event>(&DepEvent, 1),
3469+
ext::oneapi::experimental::empty_properties_t{},
3470+
TlsCodeLocCapture.query());
3471+
} else {
3472+
return submit(
3473+
[&](handler &CGH) {
3474+
CGH.depends_on(DepEvent);
3475+
CGH.template parallel_for<KernelName>(Range, Rest...);
3476+
},
3477+
TlsCodeLocCapture.query());
3478+
}
34403479
}
34413480

34423481
/// parallel_for version with a kernel represented as a lambda + nd_range that
@@ -3485,12 +3524,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
34853524
RestT &&...Rest) {
34863525
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
34873526
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
3488-
return submit(
3489-
[&](handler &CGH) {
3490-
CGH.depends_on(DepEvents);
3491-
CGH.template parallel_for<KernelName>(Range, Rest...);
3492-
},
3493-
TlsCodeLocCapture.query());
3527+
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
3528+
3529+
// TODO The handler-less path does not support reductions, and
3530+
// kernel functions with the kernel_handler type argument yet.
3531+
if constexpr (sizeof...(RestT) == 1 &&
3532+
!(detail::KernelLambdaHasKernelHandlerArgT<
3533+
KernelType, sycl::nd_item<Dims>>::value)) {
3534+
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
3535+
*this, Range, Rest..., DepEvents,
3536+
ext::oneapi::experimental::empty_properties_t{},
3537+
TlsCodeLocCapture.query());
3538+
} else {
3539+
return submit(
3540+
[&](handler &CGH) {
3541+
CGH.depends_on(DepEvents);
3542+
CGH.template parallel_for<KernelName>(Range, Rest...);
3543+
},
3544+
TlsCodeLocCapture.query());
3545+
}
34943546
}
34953547

34963548
/// Copies data from a memory region pointed to by a placeholder accessor to

sycl/source/detail/event_deps.hpp

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
//==---------------- event_deps.hpp - SYCL event dependency utils ----------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <detail/context_impl.hpp>
12+
#include <detail/device_impl.hpp>
13+
#include <detail/event_impl.hpp>
14+
#include <detail/graph/graph_impl.hpp>
15+
#include <detail/queue_impl.hpp>
16+
#include <sycl/detail/cg_types.hpp>
17+
#include <sycl/exception.hpp>
18+
19+
#include <vector>
20+
21+
namespace sycl {
22+
inline namespace _V1 {
23+
namespace detail {
24+
25+
/// Adds an event dependency to the list of dependencies, performing
26+
/// a series of checks.
27+
///
28+
/// If the event is associated with a graph, and the queue is not,
29+
/// the queue will be switched to a recording mode (transitive queue
30+
/// recording feature).
31+
///
32+
/// The LockQueue template argument defines whether the queue lock
33+
/// should be acquired for the transition to a recording mode. It is
34+
/// set to false in cases where the event dependencies are set directly
35+
/// in the command submission flow and the lock is already acquired.
36+
///
37+
/// \param EventImpl Event to register as a dependency
38+
/// \param EventsRegistered A list of already registered events, where
39+
/// the event will be added.
40+
/// \param QueueImpl A queue associated with the event dependencies. Can
41+
/// be nullptr if no associated queue.
42+
/// \param ContextImpl A context associated with a queue or graph.
43+
/// \param DeviceImpl A device associated with a queue or graph.
44+
/// \param GraphImpl A graph associated with a queue or a handler. Can
45+
/// be nullptr if no associated graph.
46+
/// \param CommandGroupType Type of command group.
47+
template <bool LockQueue = true>
48+
void registerEventDependency(
49+
const EventImplPtr &EventImpl, std::vector<EventImplPtr> &EventsRegistered,
50+
queue_impl *QueueImpl, const context_impl &ContextImpl,
51+
const device_impl &DeviceImpl,
52+
const ext::oneapi::experimental::detail::graph_impl *GraphImpl,
53+
CGType CommandGroupType) {
54+
55+
if (!EventImpl)
56+
return;
57+
if (EventImpl->isDiscarded()) {
58+
throw sycl::exception(make_error_code(errc::invalid),
59+
"Queue operation cannot depend on discarded event.");
60+
}
61+
62+
// Async alloc calls adapter immediately. Any explicit/implicit dependencies
63+
// are handled at that point, including in order queue deps. Further calls to
64+
// depends_on after an async alloc are explicitly disallowed.
65+
if (CommandGroupType == CGType::AsyncAlloc) {
66+
throw sycl::exception(make_error_code(errc::invalid),
67+
"Cannot submit a dependency after an asynchronous "
68+
"allocation has already been executed!");
69+
}
70+
71+
auto EventGraph = EventImpl->getCommandGraph();
72+
if (QueueImpl && EventGraph) {
73+
auto QueueGraph = QueueImpl->getCommandGraph();
74+
75+
if (&EventGraph->getContextImpl() != &ContextImpl) {
76+
throw sycl::exception(
77+
make_error_code(errc::invalid),
78+
"Cannot submit to a queue with a dependency from a graph that is "
79+
"associated with a different context.");
80+
}
81+
82+
if (&EventGraph->getDeviceImpl() != &DeviceImpl) {
83+
throw sycl::exception(
84+
make_error_code(errc::invalid),
85+
"Cannot submit to a queue with a dependency from a graph that is "
86+
"associated with a different device.");
87+
}
88+
89+
if (QueueGraph && QueueGraph != EventGraph) {
90+
throw sycl::exception(sycl::make_error_code(errc::invalid),
91+
"Cannot submit to a recording queue with a "
92+
"dependency from a different graph.");
93+
}
94+
95+
// If the event dependency has a graph, that means that the queue that
96+
// created it was in recording mode. If the current queue is not recording,
97+
// we need to set it to recording (implements the transitive queue recording
98+
// feature).
99+
if (!QueueGraph) {
100+
if constexpr (LockQueue) {
101+
EventGraph->beginRecording(*QueueImpl);
102+
} else {
103+
EventGraph->beginRecordingUnlockedQueue(*QueueImpl);
104+
}
105+
}
106+
}
107+
108+
if (GraphImpl) {
109+
if (EventGraph == nullptr) {
110+
throw sycl::exception(
111+
make_error_code(errc::invalid),
112+
"Graph nodes cannot depend on events from outside the graph.");
113+
}
114+
if (EventGraph.get() != GraphImpl) {
115+
throw sycl::exception(
116+
make_error_code(errc::invalid),
117+
"Graph nodes cannot depend on events from another graph.");
118+
}
119+
}
120+
EventsRegistered.push_back(EventImpl);
121+
}
122+
123+
} // namespace detail
124+
} // namespace _V1
125+
} // namespace sycl

0 commit comments

Comments
 (0)