4040#include < sycl/nd_range.hpp> // for nd_range
4141#include < sycl/property_list.hpp> // for property_list
4242#include < sycl/range.hpp> // for range
43+ #include < sycl/sycl_span.hpp> // for sycl::span
4344
4445#include < cstddef> // for size_t
4546#include < functional> // for function
@@ -68,6 +69,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
6869 const queue &Queue, const nd_range<Dims> &Range,
6970 detail::HostKernelRefBase &HostKernel,
7071 detail::DeviceKernelInfo *DeviceKernelInfo,
72+ sycl::span<const event> DepEvents,
7173 const detail::KernelPropertyHolderStructTy &Props,
7274 const detail::code_location &CodeLoc, bool IsTopCodeLoc);
7375
@@ -76,6 +78,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
7678 const queue &Queue, const nd_range<Dims> &Range,
7779 detail::HostKernelRefBase &HostKernel,
7880 detail::DeviceKernelInfo *DeviceKernelInfo,
81+ sycl::span<const event> DepEvents,
7982 const detail::KernelPropertyHolderStructTy &Props,
8083 const detail::code_location &CodeLoc, bool IsTopCodeLoc);
8184
@@ -165,7 +168,7 @@ template <detail::WrapAs WrapAs, typename LambdaArgType,
165168 typename KernelTypeUniversalRef, int Dims>
166169auto submit_kernel_direct (
167170 const queue &Queue, const nd_range<Dims> &Range,
168- KernelTypeUniversalRef &&KernelFunc,
171+ KernelTypeUniversalRef &&KernelFunc, sycl::span< const event> DepEvents,
169172 const PropertiesT &ExtraProps =
170173 ext::oneapi::experimental::empty_properties_t {},
171174 const detail::code_location &CodeLoc = detail::code_location::current()) {
@@ -230,12 +233,14 @@ auto submit_kernel_direct(
230233
231234 if constexpr (EventNeeded) {
232235 return submit_kernel_direct_with_event_impl (
233- Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
234- TlsCodeLocCapture.query (), TlsCodeLocCapture.isToplevel ());
236+ Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
237+ ParsedProperties, TlsCodeLocCapture.query (),
238+ TlsCodeLocCapture.isToplevel ());
235239 } else {
236240 submit_kernel_direct_without_event_impl (
237- Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
238- TlsCodeLocCapture.query (), TlsCodeLocCapture.isToplevel ());
241+ Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
242+ ParsedProperties, TlsCodeLocCapture.query (),
243+ TlsCodeLocCapture.isToplevel ());
239244 }
240245}
241246
@@ -244,7 +249,7 @@ template <typename KernelName = detail::auto_name, bool EventNeeded = false,
244249 typename KernelTypeUniversalRef, int Dims>
245250auto submit_kernel_direct_parallel_for (
246251 const queue &Queue, const nd_range<Dims> &Range,
247- KernelTypeUniversalRef &&KernelFunc,
252+ KernelTypeUniversalRef &&KernelFunc, sycl::span< const event> DepEvents = {},
248253 const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t {},
249254 const detail::code_location &CodeLoc = detail::code_location::current()) {
250255
@@ -266,23 +271,25 @@ auto submit_kernel_direct_parallel_for(
266271 return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
267272 KernelName, EventNeeded, PropertiesT,
268273 KernelTypeUniversalRef, Dims>(
269- Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), Props ,
270- CodeLoc);
274+ Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents ,
275+ Props, CodeLoc);
271276}
272277
// Handler-less ("direct") submission helper for single_task kernels.
//
// Forwards to submit_kernel_direct with WrapAs::single_task, `void` as the
// lambda-argument type and Dims == 1, using the fixed range nd_range<1>{1, 1}.
// KernelFunc is perfect-forwarded; Props and CodeLoc are passed through
// unchanged, and whatever submit_kernel_direct returns is returned as-is.
//
// DepEvents defaults to an empty span. Presumably these are the events the
// kernel must wait on (the handler-based fallback in queue uses
// CGH.depends_on for the same values) -- confirm against the *_impl functions.
//
// NOTE(review): this text is a rendered diff. The line marked "285-" is the
// pre-change call without DepEvents; the lines marked "291+" / "292+" are its
// replacement, which inserts DepEvents between KernelFunc and Props.
273278template <typename KernelName = detail::auto_name, bool EventNeeded = false ,
274279 typename PropertiesT = ext::oneapi::experimental::empty_properties_t ,
275280 typename KernelTypeUniversalRef>
276281auto submit_kernel_direct_single_task (
277282 const queue &Queue, KernelTypeUniversalRef &&KernelFunc,
283+ sycl::span<const event> DepEvents = {},
278284 const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t {},
279285 const detail::code_location &CodeLoc = detail::code_location::current()) {
280286
281287 return submit_kernel_direct<detail::WrapAs::single_task, void , KernelName,
282288 EventNeeded, PropertiesT, KernelTypeUniversalRef,
283289 1 >(
284290 Queue, nd_range<1 >{1 , 1 },
285- std::forward<KernelTypeUniversalRef>(KernelFunc), Props, CodeLoc);
291+ std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
292+ CodeLoc);
286293}
287294
288295} // namespace detail
@@ -2802,7 +2809,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28022809 if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
28032810 void >::value)) {
28042811 return detail::submit_kernel_direct_single_task<KernelName, true >(
2805- *this , KernelFunc, Properties, TlsCodeLocCapture.query ());
2812+ *this , KernelFunc, {}, Properties, TlsCodeLocCapture.query ());
28062813 } else {
28072814 return submit (
28082815 [&](handler &CGH) {
@@ -2852,13 +2859,23 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28522859 " Use queue.submit() instead" );
28532860
28542861 detail::tls_code_loc_t TlsCodeLocCapture (CodeLoc);
2855- return submit (
2856- [&](handler &CGH) {
2857- CGH.depends_on (DepEvent);
2858- CGH.template single_task <KernelName, KernelType, PropertiesT>(
2859- Properties, KernelFunc);
2860- },
2861- TlsCodeLocCapture.query ());
2862+
2863+ // TODO The handler-less path does not support kernel functions
2864+ // with the kernel_handler type argument yet.
2865+ if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
2866+ void >::value)) {
2867+ return detail::submit_kernel_direct_single_task<KernelName, true >(
2868+ *this , KernelFunc, sycl::span<const event>(&DepEvent, 1 ), Properties,
2869+ TlsCodeLocCapture.query ());
2870+ } else {
2871+ return submit (
2872+ [&](handler &CGH) {
2873+ CGH.depends_on (DepEvent);
2874+ CGH.template single_task <KernelName, KernelType, PropertiesT>(
2875+ Properties, KernelFunc);
2876+ },
2877+ TlsCodeLocCapture.query ());
2878+ }
28622879 }
28632880
28642881 // / single_task version with a kernel represented as a lambda.
@@ -2903,13 +2920,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
29032920 " Use queue.submit() instead" );
29042921
29052922 detail::tls_code_loc_t TlsCodeLocCapture (CodeLoc);
2906- return submit (
2907- [&](handler &CGH) {
2908- CGH.depends_on (DepEvents);
2909- CGH.template single_task <KernelName, KernelType, PropertiesT>(
2910- Properties, KernelFunc);
2911- },
2912- TlsCodeLocCapture.query ());
2923+
2924+ // TODO The handler-less path does not support kernel functions
2925+ // with the kernel_handler type argument yet.
2926+ if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
2927+ void >::value)) {
2928+ return detail::submit_kernel_direct_single_task<KernelName, true >(
2929+ *this , KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query ());
2930+ } else {
2931+ return submit (
2932+ [&](handler &CGH) {
2933+ CGH.depends_on (DepEvents);
2934+ CGH.template single_task <KernelName, KernelType, PropertiesT>(
2935+ Properties, KernelFunc);
2936+ },
2937+ TlsCodeLocCapture.query ());
2938+ }
29132939 }
29142940
29152941 // / single_task version with a kernel represented as a lambda.
@@ -3348,7 +3374,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
33483374 KernelType, sycl::nd_item<Dims>>::value)) {
33493375
33503376 return detail::submit_kernel_direct_parallel_for<KernelName, true >(
3351- *this , Range, Rest..., Properties, TlsCodeLocCapture.query ());
3377+ *this , Range, Rest..., {}, Properties, TlsCodeLocCapture.query ());
33523378 } else
33533379 return submit (
33543380 [&](handler &CGH) {
@@ -3377,7 +3403,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
33773403 !(detail::KernelLambdaHasKernelHandlerArgT<
33783404 KernelType, sycl::nd_item<Dims>>::value)) {
33793405 return detail::submit_kernel_direct_parallel_for<KernelName, true >(
3380- *this , Range, Rest...,
3406+ *this , Range, Rest..., {},
33813407 ext::oneapi::experimental::empty_properties_t {},
33823408 TlsCodeLocCapture.query ());
33833409 } else {
@@ -3431,12 +3457,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
34313457 parallel_for (nd_range<Dims> Range, event DepEvent, RestT &&...Rest) {
34323458 constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
34333459 detail::tls_code_loc_t TlsCodeLocCapture (CodeLoc);
3434- return submit (
3435- [&](handler &CGH) {
3436- CGH.depends_on (DepEvent);
3437- CGH.template parallel_for <KernelName>(Range, Rest...);
3438- },
3439- TlsCodeLocCapture.query ());
3460+ using KernelType = std::tuple_element_t <0 , std::tuple<RestT...>>;
3461+
3462+ // TODO The handler-less path does not support reductions, and
3463+ // kernel functions with the kernel_handler type argument yet.
3464+ if constexpr (sizeof ...(RestT) == 1 &&
3465+ !(detail::KernelLambdaHasKernelHandlerArgT<
3466+ KernelType, sycl::nd_item<Dims>>::value)) {
3467+ return detail::submit_kernel_direct_parallel_for<KernelName, true >(
3468+ *this , Range, Rest..., sycl::span<const event>(&DepEvent, 1 ),
3469+ ext::oneapi::experimental::empty_properties_t {},
3470+ TlsCodeLocCapture.query ());
3471+ } else {
3472+ return submit (
3473+ [&](handler &CGH) {
3474+ CGH.depends_on (DepEvent);
3475+ CGH.template parallel_for <KernelName>(Range, Rest...);
3476+ },
3477+ TlsCodeLocCapture.query ());
3478+ }
34403479 }
34413480
34423481 // / parallel_for version with a kernel represented as a lambda + nd_range that
@@ -3485,12 +3524,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
34853524 RestT &&...Rest) {
34863525 constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
34873526 detail::tls_code_loc_t TlsCodeLocCapture (CodeLoc);
3488- return submit (
3489- [&](handler &CGH) {
3490- CGH.depends_on (DepEvents);
3491- CGH.template parallel_for <KernelName>(Range, Rest...);
3492- },
3493- TlsCodeLocCapture.query ());
3527+ using KernelType = std::tuple_element_t <0 , std::tuple<RestT...>>;
3528+
3529+ // TODO The handler-less path does not support reductions, and
3530+ // kernel functions with the kernel_handler type argument yet.
3531+ if constexpr (sizeof ...(RestT) == 1 &&
3532+ !(detail::KernelLambdaHasKernelHandlerArgT<
3533+ KernelType, sycl::nd_item<Dims>>::value)) {
3534+ return detail::submit_kernel_direct_parallel_for<KernelName, true >(
3535+ *this , Range, Rest..., DepEvents,
3536+ ext::oneapi::experimental::empty_properties_t {},
3537+ TlsCodeLocCapture.query ());
3538+ } else {
3539+ return submit (
3540+ [&](handler &CGH) {
3541+ CGH.depends_on (DepEvents);
3542+ CGH.template parallel_for <KernelName>(Range, Rest...);
3543+ },
3544+ TlsCodeLocCapture.query ());
3545+ }
34943546 }
34953547
34963548 // / Copies data from a memory region pointed to by a placeholder accessor to
0 commit comments