diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 498a3cee61728..d2afe95ce657b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2514,30 +2514,31 @@ static ur_result_t SetKernelParamsAndLaunch( NDRDesc.GlobalOffset[1] != 0 || NDRDesc.GlobalOffset[2] != 0; - std::vector property_list; + ur_kernel_launch_ext_properties_t property_list = { + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES, nullptr, 0}; + void **last_pNext = &property_list.pNext; + // The chained property structs are read by the enqueue call below, so they + // must outlive the conditional blocks that link them into the list. + ur_kernel_launch_cluster_property_t launch_property_cluster_range = { + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY, nullptr, {0, 0, 0}}; + ur_kernel_launch_workgroup_property_t workgroup_property = { + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY, nullptr, 0}; if (KernelUsesClusterLaunch) { - ur_kernel_launch_property_value_t launch_property_value_cluster_range; - launch_property_value_cluster_range.clusterDim[0] = - NDRDesc.ClusterDimensions[0]; - launch_property_value_cluster_range.clusterDim[1] = - NDRDesc.ClusterDimensions[1]; - launch_property_value_cluster_range.clusterDim[2] = - NDRDesc.ClusterDimensions[2]; - - property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION, - launch_property_value_cluster_range}); + launch_property_cluster_range.clusterDim[0] = NDRDesc.ClusterDimensions[0]; + launch_property_cluster_range.clusterDim[1] = NDRDesc.ClusterDimensions[1]; + launch_property_cluster_range.clusterDim[2] = NDRDesc.ClusterDimensions[2]; + *last_pNext = &launch_property_cluster_range; + last_pNext = &launch_property_cluster_range.pNext; } if (IsCooperative) { - ur_kernel_launch_property_value_t launch_property_value_cooperative; - launch_property_value_cooperative.cooperative = 1; - property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE, - launch_property_value_cooperative}); + property_list.flags |= UR_KERNEL_LAUNCH_FLAG_COOPERATIVE; } // If there is no implicit arg, let the driver handle it via a property if (WorkGroupMemorySize && 
!ImplicitLocalArg.has_value()) { - property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, - {{WorkGroupMemorySize}}}); + workgroup_property.workgroup_mem_size = WorkGroupMemorySize; + *last_pNext = &workgroup_property; + last_pNext = &workgroup_property.pNext; } ur_event_handle_t UREvent = nullptr; ur_result_t Error = @@ -2545,8 +2546,8 @@ static ur_result_t SetKernelParamsAndLaunch( Queue.getHandleRef(), Kernel, NDRDesc.Dims, HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0], LocalSize, UrArgs.size(), UrArgs.data(), - property_list.size(), - property_list.empty() ? nullptr : property_list.data(), + (property_list.flags || property_list.pNext) ? &property_list + : nullptr, RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? &UREvent : nullptr); if (Error == UR_RESULT_SUCCESS && OutEventImpl) { diff --git a/unified-runtime/examples/codegen/codegen.cpp b/unified-runtime/examples/codegen/codegen.cpp index a18366ae6ca59..7d15a13e16c92 100644 --- a/unified-runtime/examples/codegen/codegen.cpp +++ b/unified-runtime/examples/codegen/codegen.cpp @@ -150,7 +150,7 @@ int main() { const size_t lWorkSize[] = {1, 1, 1}; ur_event_handle_t event; ur_check(urEnqueueKernelLaunch(queue, hKernel, 3, gWorkOffset, gWorkSize, - lWorkSize, 0, nullptr, &event)); + lWorkSize, nullptr, 0, nullptr, &event)); ur_check(urEnqueueMemBufferRead(queue, dB, true, 0, a_size * sizeof(int), b.data, 1, &event, nullptr)); diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 4ceda0bc46b5d..ee3771e74ea24 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -570,6 +570,12 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_USM_POOL_BUFFER_DESC = 36, /// ::ur_physical_mem_properties_t 
UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES = 37, + /// ::ur_kernel_launch_ext_properties_t + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES = 38, + /// ::ur_kernel_launch_cluster_property_t + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY = 40, + /// ::ur_kernel_launch_workgroup_property_t + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY = 41, /// ::ur_exp_command_buffer_desc_t UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC = 0x1000, /// ::ur_exp_command_buffer_update_kernel_launch_desc_t @@ -3051,14 +3057,14 @@ typedef enum ur_device_throttle_reasons_flag_t { /// @brief Kernel launch properties support typedef uint32_t ur_kernel_launch_properties_flags_t; typedef enum ur_kernel_launch_properties_flag_t { - /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE and + /// Supports ::UR_KERNEL_LAUNCH_FLAG_COOPERATIVE and /// ::urKernelSuggestMaxCooperativeGroupCount UR_KERNEL_LAUNCH_PROPERTIES_FLAG_COOPERATIVE = UR_BIT(0), - /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION + /// Supports ::ur_kernel_launch_cluster_property_t UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION = UR_BIT(1), - /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY + /// Supports ::ur_kernel_launch_workgroup_property_t UR_KERNEL_LAUNCH_PROPERTIES_FLAG_WORK_GROUP_MEMORY = UR_BIT(2), - /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE + /// Supports ::UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE UR_KERNEL_LAUNCH_PROPERTIES_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE = UR_BIT(3), /// @cond UR_KERNEL_LAUNCH_PROPERTIES_FLAG_FORCE_UINT32 = 0x7fffffff @@ -7750,64 +7756,77 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback( #pragma region enqueue #endif /////////////////////////////////////////////////////////////////////////////// -/// @brief Specifies a launch property id -/// -/// @remarks -/// _Analogues_ -/// - **CUlaunchAttributeID** -typedef enum ur_kernel_launch_property_id_t { - /// The property has no effect. 
- UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE = 0, +/// @brief Kernel launch flags +typedef uint32_t ur_kernel_launch_flags_t; +typedef enum ur_kernel_launch_flag_t { /// Whether to launch a cooperative kernel. - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, - /// work-group cluster dimensions. - UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, - /// Implicit work group memory allocation. - UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, + UR_KERNEL_LAUNCH_FLAG_COOPERATIVE = UR_BIT(0), /// Whether to opportunistically execute kernel launches serially on a /// native queue - UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE = 4, + UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE = UR_BIT(1), /// @cond - UR_KERNEL_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff + UR_KERNEL_LAUNCH_FLAG_FORCE_UINT32 = 0x7fffffff /// @endcond -} ur_kernel_launch_property_id_t; +} ur_kernel_launch_flag_t; +/// @brief Bit Mask for validating ur_kernel_launch_flags_t +#define UR_KERNEL_LAUNCH_FLAGS_MASK 0xfffffffc + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Extended kernel launch properties +/// +/// @remarks +/// _Analogues_ +/// - **cuLaunchAttribute** +typedef struct ur_kernel_launch_ext_properties_t { + /// [in] type of this structure, must be + /// ::UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES + ur_structure_type_t stype; + /// [in,out][optional] pointer to extension-specific structure + void *pNext; + /// [in] Kernel launch flags. Allowed values are: + /// ::UR_KERNEL_LAUNCH_FLAG_COOPERATIVE, + /// ::UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE. 
+ ur_kernel_launch_flags_t flags; + +} ur_kernel_launch_ext_properties_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Specifies a launch property value +/// @brief Kernel launch cluster property /// /// @remarks /// _Analogues_ /// - **CUlaunchAttributeValue** -typedef union ur_kernel_launch_property_value_t { +typedef struct ur_kernel_launch_cluster_property_t { + /// [in] type of this structure, must be + /// ::UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY + ur_structure_type_t stype; + /// [in,out][optional] pointer to extension-specific structure + void *pNext; /// [in] dimensions of the cluster (units of work-group) (x, y, z). Each /// value must be a divisor of the corresponding global work-size /// dimension (in units of work-group). uint32_t clusterDim[3]; - /// [in] non-zero value indicates a cooperative kernel - int cooperative; - /// [in] non-zero value indicates the amount of work group memory to - /// allocate in bytes - size_t workgroup_mem_size; - /// [in] non-zero value indicates an opportunistic native queue serialized - /// kernel - int opportunistic_queue_serialize; -} ur_kernel_launch_property_value_t; +} ur_kernel_launch_cluster_property_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Kernel launch property +/// @brief Kernel launch work group memory property /// /// @remarks /// _Analogues_ -/// - **cuLaunchAttribute** -typedef struct ur_kernel_launch_property_t { - /// [in] launch property id - ur_kernel_launch_property_id_t id; - /// [in][tagged_by(id)] launch property value - ur_kernel_launch_property_value_t value; +/// - **CUlaunchAttributeValue** +typedef struct ur_kernel_launch_workgroup_property_t { + /// [in] type of this structure, must be + /// ::UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY + ur_structure_type_t stype; + /// [in,out][optional] pointer to extension-specific structure + void *pNext; + /// [in] non-zero value 
indicates the amount of work group memory to + /// allocate in bytes + size_t workgroup_mem_size; -} ur_kernel_launch_property_t; +} ur_kernel_launch_workgroup_property_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command to execute a kernel @@ -7829,7 +7848,9 @@ typedef struct ur_kernel_launch_property_t { /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -7866,11 +7887,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -13213,10 +13231,11 @@ typedef struct ur_exp_kernel_arg_properties_t { /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` /// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -13264,11 +13283,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -14503,8 +14519,7 @@ typedef struct ur_enqueue_kernel_launch_params_t { const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; - uint32_t *pnumPropsInLaunchPropList; - const ur_kernel_launch_property_t **plaunchPropList; + const ur_kernel_launch_ext_properties_t **plaunchPropList; uint32_t *pnumEventsInWaitList; const ur_event_handle_t **pphEventWaitList; ur_event_handle_t **pphEvent; @@ -14910,8 +14925,7 @@ typedef struct ur_enqueue_kernel_launch_with_args_exp_params_t { const size_t **ppLocalWorkSize; uint32_t *pnumArgs; const ur_exp_kernel_arg_properties_t **ppArgs; - uint32_t *pnumPropsInLaunchPropList; - const ur_kernel_launch_property_t **plaunchPropList; + const ur_kernel_launch_ext_properties_t **plaunchPropList; uint32_t *pnumEventsInWaitList; const ur_event_handle_t **pphEventWaitList; ur_event_handle_t **pphEvent; diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 73e040d1f6bf1..68df30bfdaea2 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -891,9 +891,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetPhysicalMemProcAddrTable_t)( /// @brief Function-pointer for urEnqueueKernelLaunch typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunch_t)( ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, - const size_t *, const size_t *, uint32_t, - const ur_kernel_launch_property_t *, uint32_t, 
const ur_event_handle_t *, - ur_event_handle_t *); + const size_t *, const size_t *, const ur_kernel_launch_ext_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueEventsWait @@ -1108,9 +1107,9 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchWithArgsExp_t)( ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, const size_t *, uint32_t, - const ur_exp_kernel_arg_properties_t *, uint32_t, - const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *); + const ur_exp_kernel_arg_properties_t *, + const ur_kernel_launch_ext_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *); /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index f9e1904d94c0c..026cb6eb4d4a8 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -1112,23 +1112,43 @@ urPrintExecutionInfo(enum ur_execution_info_t value, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_kernel_launch_property_id_t enum +/// @brief Print ur_kernel_launch_flag_t enum /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchPropertyId( - enum ur_kernel_launch_property_id_t value, char *buffer, +UR_APIEXPORT ur_result_t UR_APICALL +urPrintKernelLaunchFlags(enum ur_kernel_launch_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size); + 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_ext_properties_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchExtProperties( + const struct ur_kernel_launch_ext_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_cluster_property_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchClusterProperty( + const struct ur_kernel_launch_cluster_property_t params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_kernel_launch_property_t struct +/// @brief Print ur_kernel_launch_workgroup_property_t struct /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchProperty( - const struct ur_kernel_launch_property_t params, char *buffer, +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchWorkgroupProperty( + const struct ur_kernel_launch_workgroup_property_t params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index a100a0a426b5f..cf7db5a0bc776 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -223,10 +223,9 @@ template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_profiling_info_t value, size_t size); -inline ur_result_t -printUnion(std::ostream &os, - const 
union ur_kernel_launch_property_value_t params, - const enum ur_kernel_launch_property_id_t tag); +template <> +inline ur_result_t printFlag(std::ostream &os, + uint32_t flag); template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -524,10 +523,16 @@ operator<<(std::ostream &os, inline std::ostream &operator<<(std::ostream &os, enum ur_execution_info_t value); inline std::ostream &operator<<(std::ostream &os, - enum ur_kernel_launch_property_id_t value); -inline std::ostream & -operator<<(std::ostream &os, - [[maybe_unused]] const struct ur_kernel_launch_property_t params); + enum ur_kernel_launch_flag_t value); +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_kernel_launch_ext_properties_t params); +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_kernel_launch_cluster_property_t params); +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_kernel_launch_workgroup_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_map_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_usm_migration_flag_t value); @@ -1438,6 +1443,15 @@ inline std::ostream &operator<<(std::ostream &os, case UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES: os << "UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES"; break; + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES: + os << "UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES"; + break; + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY: + os << "UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY"; + break; + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY: + os << "UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY"; + break; case UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC: os << "UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC"; break; @@ -1711,6 +1725,24 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { printPtr(os, pstruct); } break; + case 
UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES: { + const ur_kernel_launch_ext_properties_t *pstruct = + (const ur_kernel_launch_ext_properties_t *)ptr; + printPtr(os, pstruct); + } break; + + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY: { + const ur_kernel_launch_cluster_property_t *pstruct = + (const ur_kernel_launch_cluster_property_t *)ptr; + printPtr(os, pstruct); + } break; + + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY: { + const ur_kernel_launch_workgroup_property_t *pstruct = + (const ur_kernel_launch_workgroup_property_t *)ptr; + printPtr(os, pstruct); + } break; + case UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC: { const ur_exp_command_buffer_desc_t *pstruct = (const ur_exp_command_buffer_desc_t *)ptr; @@ -11020,26 +11052,17 @@ inline std::ostream &operator<<(std::ostream &os, return os; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_kernel_launch_property_id_t type +/// @brief Print operator for the ur_kernel_launch_flag_t type /// @returns /// std::ostream & inline std::ostream &operator<<(std::ostream &os, - enum ur_kernel_launch_property_id_t value) { + enum ur_kernel_launch_flag_t value) { switch (value) { - case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE: - os << "UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE"; - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: - os << "UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE"; - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: - os << "UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; + case UR_KERNEL_LAUNCH_FLAG_COOPERATIVE: + os << "UR_KERNEL_LAUNCH_FLAG_COOPERATIVE"; break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: - os << "UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE: - os << "UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE"; + case UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE: + os << 
"UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE"; break; default: os << "unknown enumerator"; @@ -11047,68 +11070,123 @@ inline std::ostream &operator<<(std::ostream &os, } return os; } + namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, + uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) == + (uint32_t)UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + val ^= (uint32_t)UR_KERNEL_LAUNCH_FLAG_COOPERATIVE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_KERNEL_LAUNCH_FLAG_COOPERATIVE; + } + if ((val & UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE) == + (uint32_t)UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE) { + val ^= (uint32_t)UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details /////////////////////////////////////////////////////////////////////////////// -// @brief Print ur_kernel_launch_property_value_t union -inline ur_result_t -printUnion(std::ostream &os, - const union ur_kernel_launch_property_value_t params, - const enum ur_kernel_launch_property_id_t tag) { - os << "(union ur_kernel_launch_property_value_t){"; +/// @brief Print operator for the ur_kernel_launch_ext_properties_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_kernel_launch_ext_properties_t params) { + os << "(struct ur_kernel_launch_ext_properties_t){"; - switch (tag) { - case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: + os << ".stype 
= "; - os << ".clusterDim = {"; - ur::details::printArray<3>(os, params.clusterDim); - os << "}"; + os << (params.stype); - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: + os << ", "; + os << ".pNext = "; - os << ".cooperative = "; + ur::details::printStruct(os, (params.pNext)); - os << (params.cooperative); + os << ", "; + os << ".flags = "; - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + ur::details::printFlag(os, (params.flags)); - os << ".workgroup_mem_size = "; + os << "}"; + return os; +} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_launch_cluster_property_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_kernel_launch_cluster_property_t params) { + os << "(struct ur_kernel_launch_cluster_property_t){"; - os << (params.workgroup_mem_size); + os << ".stype = "; - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE: + os << (params.stype); - os << ".opportunistic_queue_serialize = "; + os << ", "; + os << ".pNext = "; - os << (params.opportunistic_queue_serialize); + ur::details::printStruct(os, (params.pNext)); - break; - default: - os << ""; - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } + os << ", "; + os << ".clusterDim = {"; + ur::details::printArray<3>(os, params.clusterDim); os << "}"; - return UR_RESULT_SUCCESS; + + os << "}"; + return os; } -} // namespace ur::details /////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_kernel_launch_property_t type +/// @brief Print operator for the ur_kernel_launch_workgroup_property_t type /// @returns /// std::ostream & inline std::ostream & -operator<<(std::ostream &os, const struct ur_kernel_launch_property_t params) { - os << "(struct ur_kernel_launch_property_t){"; +operator<<(std::ostream &os, + const struct ur_kernel_launch_workgroup_property_t params) { 
+ os << "(struct ur_kernel_launch_workgroup_property_t){"; - os << ".id = "; + os << ".stype = "; - os << (params.id); + os << (params.stype); os << ", "; - os << ".value = "; - ur::details::printUnion(os, (params.value), params.id); + os << ".pNext = "; + + ur::details::printStruct(os, (params.pNext)); + + os << ", "; + os << ".workgroup_mem_size = "; + + os << (params.workgroup_mem_size); os << "}"; return os; @@ -15702,26 +15780,10 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->ppLocalWorkSize)); - os << ", "; - os << ".numPropsInLaunchPropList = "; - - os << *(params->pnumPropsInLaunchPropList); - os << ", "; os << ".launchPropList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->plaunchPropList))); - if (*(params->plaunchPropList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { - if (i != 0) { - os << ", "; - } - os << (*(params->plaunchPropList))[i]; - } - os << "}"; - } + ur::details::printPtr(os, *(params->plaunchPropList)); os << ", "; os << ".numEventsInWaitList = "; @@ -17406,26 +17468,10 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct os << "}"; } - os << ", "; - os << ".numPropsInLaunchPropList = "; - - os << *(params->pnumPropsInLaunchPropList); - os << ", "; os << ".launchPropList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->plaunchPropList))); - if (*(params->plaunchPropList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { - if (i != 0) { - os << ", "; - } - os << (*(params->plaunchPropList))[i]; - } - os << "}"; - } + ur::details::printPtr(os, *(params->plaunchPropList)); os << ", "; os << ".numEventsInWaitList = "; diff --git a/unified-runtime/scripts/core/PROG.rst b/unified-runtime/scripts/core/PROG.rst index 28d8397927b5e..6ff3b358814ff 100644 --- a/unified-runtime/scripts/core/PROG.rst +++ b/unified-runtime/scripts/core/PROG.rst @@ -293,7 +293,7 @@ event dependencies that are passed to 
each Enqueue command. const size_t gWorkSize = {128, 128, 128}; const size_t lWorkSize = {1, 8, 8}; ${x}EnqueueKernelLaunch(hQueue, hKernel, nDim, gWorkOffset, gWorkSize, - lWorkSize, 0, nullptr, 0, nullptr, nullptr); + lWorkSize, nullptr, 0, nullptr, nullptr); Queue object lifetime --------------------- diff --git a/unified-runtime/scripts/core/device.yml b/unified-runtime/scripts/core/device.yml index 2ad5a2171f490..4f98155eca8ab 100644 --- a/unified-runtime/scripts/core/device.yml +++ b/unified-runtime/scripts/core/device.yml @@ -986,14 +986,14 @@ class: $xDevice name: $x_kernel_launch_properties_flags_t etors: - name: COOPERATIVE - desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE and $xKernelSuggestMaxCooperativeGroupCount" + desc: "Supports $X_KERNEL_LAUNCH_FLAG_COOPERATIVE and $xKernelSuggestMaxCooperativeGroupCount" value: "$X_BIT(0)" - name: CLUSTER_DIMENSION - desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION" + desc: "Supports $x_kernel_launch_cluster_property_t" value: "$X_BIT(1)" - name: WORK_GROUP_MEMORY - desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY" + desc: "Supports $x_kernel_launch_workgroup_property_t" value: "$X_BIT(2)" - name: OPPORTUNISTIC_QUEUE_SERIALIZE - desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE" + desc: "Supports $X_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE" value: "$X_BIT(3)" diff --git a/unified-runtime/scripts/core/enqueue.yml b/unified-runtime/scripts/core/enqueue.yml index a6148bd366f64..47e7d1a566411 100644 --- a/unified-runtime/scripts/core/enqueue.yml +++ b/unified-runtime/scripts/core/enqueue.yml @@ -1,5 +1,5 @@ # -# Copyright (C) 2021 Intel Corporation +# Copyright (C) 2021-2025 Intel Corporation # # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT @@ -13,60 +13,50 @@ desc: "Intel $OneApi Unified Runtime APIs" ordinal: "10" --- #-------------------------------------------------------------------------- type: enum -desc: "Specifies a launch property id" -name: $x_kernel_launch_property_id_t -analogue: - - "**CUlaunchAttributeID**" +desc: "Kernel launch flags" +name: $x_kernel_launch_flags_t etors: - - name: IGNORE - desc: "The property has no effect." - name: COOPERATIVE + value: "$X_BIT(0)" desc: "Whether to launch a cooperative kernel." - - name: CLUSTER_DIMENSION - desc: "work-group cluster dimensions." - - name: WORK_GROUP_MEMORY - desc: "Implicit work group memory allocation." - name: OPPORTUNISTIC_QUEUE_SERIALIZE + value: "$X_BIT(1)" desc: "Whether to opportunistically execute kernel launches serially on a native queue" --- #-------------------------------------------------------------------------- -type: union -desc: "Specifies a launch property value" -name: $x_kernel_launch_property_value_t -tag: $x_kernel_launch_property_id_t +type: struct +desc: "Extended kernel launch properties" +name: $x_kernel_launch_ext_properties_t +base: $x_base_properties_t +analogue: + - "**cuLaunchAttribute**" +members: + - type: $x_kernel_launch_flags_t + name: flags + desc: "[in] Kernel launch flags. Allowed values are: $X_KERNEL_LAUNCH_FLAG_COOPERATIVE, $X_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE." +--- #-------------------------------------------------------------------------- +type: struct +desc: "Kernel launch cluster property" +name: $x_kernel_launch_cluster_property_t +base: $x_base_properties_t analogue: - "**CUlaunchAttributeValue**" members: - type: uint32_t[3] name: clusterDim desc: "[in] dimensions of the cluster (units of work-group) (x, y, z). Each value must be a divisor of the corresponding global work-size dimension (in units of work-group)." 
- tag: $X_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION - - type: int - name: cooperative - desc: "[in] non-zero value indicates a cooperative kernel" - tag: $X_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE - - type: size_t - name: workgroup_mem_size - desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" - tag: $X_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY - - type: int - name: opportunistic_queue_serialize - desc: "[in] non-zero value indicates an opportunistic native queue serialized kernel" - tag: $X_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE + init: [0, 0, 0] --- #-------------------------------------------------------------------------- type: struct -desc: "Kernel launch property" -name: $x_kernel_launch_property_t +desc: "Kernel launch work group memory property" +name: $x_kernel_launch_workgroup_property_t +base: $x_base_properties_t analogue: - - "**cuLaunchAttribute**" + - "**CUlaunchAttributeValue**" members: - - type: $x_kernel_launch_property_id_t - name: id - desc: "[in] launch property id" - init: $X_KERNEL_LAUNCH_PROPERTY_ID_IGNORE - - type: $x_kernel_launch_property_value_t - name: value - desc: "[in][tagged_by(id)] launch property value" - init: nullptr + - type: size_t + name: workgroup_mem_size + desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" + init: 0 --- #-------------------------------------------------------------------------- type: function desc: "Enqueue a command to execute a kernel" @@ -98,12 +88,9 @@ params: desc: | [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. If nullptr, the runtime implementation will choose the work-group size. 
- - type: uint32_t - name: numPropsInLaunchPropList - desc: "[in] size of the launch prop list" - - type: const $x_kernel_launch_property_t* + - type: const $x_kernel_launch_ext_properties_t* name: launchPropList - desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" + desc: "[in][optional] pointer to a single linked list of launch properties" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -134,7 +121,7 @@ returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If any property in `launchPropList` isn't supported by the device." - $X_RESULT_ERROR_INVALID_NULL_POINTER: - - "`launchPropList == NULL && numPropsInLaunchPropList > 0`" + - "`launchPropList == NULL`" --- #-------------------------------------------------------------------------- type: function desc: "Enqueue a command which waits a list of events to complete before it completes" diff --git a/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml index 6656b6a6d0299..7c13c1a9620ae 100644 --- a/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml +++ b/unified-runtime/scripts/core/exp-enqueue-kernel-launch-with-args.yml @@ -126,12 +126,9 @@ params: - type: "const $x_exp_kernel_arg_properties_t*" name: pArgs desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties." 
- - type: uint32_t - name: numPropsInLaunchPropList - desc: "[in] size of the launch prop list" - - type: const $x_kernel_launch_property_t* + - type: const $x_kernel_launch_ext_properties_t* name: launchPropList - desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" + desc: "[in][optional] pointer to a single linked list of launch properties" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -166,5 +163,5 @@ returns: - $X_RESULT_ERROR_INVALID_OPERATION: - "If any property in `launchPropList` isn't supported by the device." - $X_RESULT_ERROR_INVALID_NULL_POINTER: - - "`launchPropList == NULL && numPropsInLaunchPropList > 0`" + - "`launchPropList == NULL`" - "`pArgs == NULL && numArgs > 0`" diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index ce969c39abf33..42084b15815cf 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -805,3 +805,12 @@ etors: - name: PHYSICAL_MEM_PROPERTIES desc: $x_physical_mem_properties_t value: '37' +- name: KERNEL_LAUNCH_EXT_PROPERTIES + desc: $x_kernel_launch_ext_properties_t + value: '38' +- name: KERNEL_LAUNCH_CLUSTER_PROPERTY + desc: $x_kernel_launch_cluster_property_t + value: '40' +- name: KERNEL_LAUNCH_WORKGROUP_PROPERTY + desc: $x_kernel_launch_workgroup_property_t + value: '41' diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index e4ccb670e0291..fe4dcbd08381a 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -454,29 +454,41 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t 
*pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - size_t WorkGroupMemory = [&]() -> size_t { - const ur_kernel_launch_property_t *WorkGroupMemoryProp = std::find_if( - launchPropList, launchPropList + numPropsInLaunchPropList, - [](const ur_kernel_launch_property_t &Prop) { - return Prop.id == UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; - }); - if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList) - return WorkGroupMemoryProp->value.workgroup_mem_size; - return 0; - }(); + size_t WorkGroupMemory = 0; + uint32_t numProps = 0; + + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { + numProps++; + } + if (_launchPropList->stype == + as_stype()) { + ur_kernel_launch_workgroup_property_t *WorkGroupMemoryProp = + reinterpret_cast( + _launchPropList); + WorkGroupMemory = WorkGroupMemoryProp->workgroup_mem_size; + break; + } + _launchPropList = static_cast( + _launchPropList->pNext); + } - if (numPropsInLaunchPropList == 0 || - (WorkGroupMemory && numPropsInLaunchPropList == 1)) { + if (numProps == 0 || + (WorkGroupMemory && numProps == 1 && launchPropList->flags == 0)) { return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent, WorkGroupMemory); } + #if CUDA_VERSION >= 11080 // Preconditions UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), @@ -489,7 +501,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } std::vector launch_attribute; - launch_attribute.reserve(numPropsInLaunchPropList); + launch_attribute.reserve(numProps); // Early exit for zero size kernel if 
(*pGlobalWorkSize == 0) { @@ -508,29 +520,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); - for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { - switch (launchPropList[i].id) { - case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE; + _launchPropList = + const_cast(launchPropList); + + if (_launchPropList->flags & UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; + attr.value.cooperative = 1; + } + + if (_launchPropList->flags & + UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE) { + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; + attr.value.programmaticStreamSerializationAllowed = 1; + } + + if (_launchPropList->flags & + ~(UR_KERNEL_LAUNCH_FLAG_COOPERATIVE | + UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE)) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + while (_launchPropList != nullptr) { + switch (_launchPropList->stype) { + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES: { break; } - case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY: { auto &attr = launch_attribute.emplace_back(); attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + ur_kernel_launch_cluster_property_t *clusterProperty = + reinterpret_cast( + _launchPropList); // Note that cuda orders from right to left wrt SYCL dimensional order. 
if (workDim == 3) { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.x = clusterProperty->clusterDim[2]; + attr.value.clusterDim.y = clusterProperty->clusterDim[1]; + attr.value.clusterDim.z = clusterProperty->clusterDim[0]; } else if (workDim == 2) { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = clusterProperty->clusterDim[1]; + attr.value.clusterDim.y = clusterProperty->clusterDim[0]; + attr.value.clusterDim.z = clusterProperty->clusterDim[2]; } else { - attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0]; - attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; - attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = clusterProperty->clusterDim[0]; + attr.value.clusterDim.y = clusterProperty->clusterDim[1]; + attr.value.clusterDim.z = clusterProperty->clusterDim[2]; } UR_CHECK_ERROR(cuFuncSetAttribute( @@ -538,26 +573,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( break; } - case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; - attr.value.cooperative = launchPropList[i].value.cooperative; - break; - } - case UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE: { - auto &attr = launch_attribute.emplace_back(); - attr.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; - attr.value.programmaticStreamSerializationAllowed = - launchPropList[i].value.opportunistic_queue_serialize; - break; - } - case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { + case UR_STRUCTURE_TYPE_KERNEL_LAUNCH_WORKGROUP_PROPERTY: { break; } default: { return 
UR_RESULT_ERROR_INVALID_ENUMERATION; } } + _launchPropList = static_cast( + _launchPropList->pNext); } // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled @@ -633,13 +657,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( #endif // CUDA_VERSION >= 11080 } +// const ur_kernel_launch_ext_properties_t *launchPropList, + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { @@ -682,8 +707,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( return Err; } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index 89b45d9d29b2a..43f4dbc300291 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -250,8 +250,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const 
ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue->getContext() == hKernel->getContext(), @@ -259,20 +259,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - // Adapters that don't support cooperative kernels are currently expected - // to ignore COOPERATIVE launch properties. Ideally we should avoid passing - // these at the SYCL RT level instead, see - // https://github.com/intel/llvm/issues/18421 - if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || - launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { - continue; - } + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. 
Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + _launchPropList = static_cast( + _launchPropList->pNext); + } + // Early exit for zero size range kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -346,8 +351,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { @@ -390,8 +394,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( return Err; } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp index 1a5d02da15d37..873ba4d6270a3 100644 --- a/unified-runtime/source/adapters/level_zero/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/kernel.cpp @@ -168,11 +168,8 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *Args, - /// [in] size of the launch prop list - uint32_t NumPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *LaunchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *LaunchPropList, uint32_t NumEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of /// events that must be complete before the kernel execution. If @@ -219,8 +216,7 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( // Normalize so each dimension has at least one work item return level_zero::urEnqueueKernelLaunch( Queue, Kernel, workDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, - NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, - EventWaitList, OutEvent); + LaunchPropList, NumEventsInWaitList, EventWaitList, OutEvent); } ur_result_t urEnqueueKernelLaunch( @@ -243,11 +239,8 @@ ur_result_t urEnqueueKernelLaunch( /// will execute the kernel function. If nullptr, the runtime /// implementation will choose the work-group size. 
const size_t *LocalWorkSize, - /// [in] size of the launch prop list - uint32_t NumPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *LaunchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *LaunchPropList, /// [in] size of the event wait list uint32_t NumEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -262,20 +255,28 @@ ur_result_t urEnqueueKernelLaunch( ze_command_list_handle_t, ze_kernel_handle_t, const ze_group_count_t *, ze_event_handle_t, uint32_t, ze_event_handle_t *); ZeKernelLaunchFuncT ZeKernelLaunchFunc = &zeCommandListAppendLaunchKernel; - for (uint32_t PropIndex = 0; PropIndex < NumPropsInLaunchPropList; - PropIndex++) { - if (LaunchPropList[PropIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && - LaunchPropList[PropIndex].value.cooperative) { - ZeKernelLaunchFunc = &zeCommandListAppendLaunchCooperativeKernel; - } - if (LaunchPropList[PropIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && - LaunchPropList[PropIndex].id != - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(LaunchPropList); + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + // We don't support any other flags. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + if (_launchPropList->flags & UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + ZeKernelLaunchFunc = &zeCommandListAppendLaunchCooperativeKernel; + } + + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { // We don't support any other properties. 
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + _launchPropList = static_cast( + _launchPropList->pNext); } + UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index 5f0e93ea5d8da..5926d465f7f85 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -342,8 +342,8 @@ ur_result_t urEventSetCallback(ur_event_handle_t hEvent, ur_result_t urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, @@ -828,8 +828,7 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWaitWithBarrierExt( diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 9e80e170c3916..7c4b66c9e7547 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ 
b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -329,7 +329,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( syncPointWaitList, numSyncPointsInWaitList); UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, nullptr, numSyncPointsInWaitList, eventsWaitList, commandBuffer->createEventIfRequested(retSyncPoint))); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index d57fa9dd9051d..a41598d73cbee 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -242,29 +242,34 @@ ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( ur_result_t ur_command_list_manager::appendKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - if (launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && - launchPropList[propIndex].value.cooperative) { - UR_CALL(appendKernelLaunchUnlocked(hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, - phEvent, true /* cooperative */)); - return UR_RESULT_SUCCESS; - } - if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && - 
launchPropList[propIndex].id != - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + if (_launchPropList->flags & UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + UR_CALL(appendKernelLaunchUnlocked( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, true /* cooperative */)); + return UR_RESULT_SUCCESS; + } + + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + // We don't support any other flags. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { // We don't support any other properties. return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + _launchPropList = static_cast( + _launchPropList->pNext); } UR_CALL(appendKernelLaunchUnlocked( @@ -1084,8 +1089,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExpOld( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { { @@ -1125,8 +1129,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExpOld( } UR_CALL(appendKernelLaunch(hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; @@ -1210,8 +1213,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - 
uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY( @@ -1219,20 +1221,25 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( bool cooperativeKernelLaunchRequested = false; - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - switch (launchPropList[propIndex].id) { - case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE: - break; - case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: - if (launchPropList[propIndex].value.cooperative) { - cooperativeKernelLaunchRequested = true; - } - break; - default: + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + // We don't support any other flags. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + if (_launchPropList->flags & UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { + cooperativeKernelLaunchRequested = true; + } + + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { // We don't support any other properties. return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + _launchPropList = static_cast( + _launchPropList->pNext); } ur_platform_handle_t hPlatform = hContext->getPlatform(); @@ -1256,8 +1263,8 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( // check it on its own since it is called also from enqueueKernelLaunch(). 
return appendKernelLaunchWithArgsExpOld( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + numArgs, pArgs, launchPropList, numEventsInWaitList, phEventWaitList, + phEvent); } return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 9734df2941aae..82ce0d605a00e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -211,8 +211,8 @@ struct ur_command_list_manager { ur_result_t appendKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); ur_result_t @@ -238,8 +238,7 @@ struct ur_command_list_manager { const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); @@ -249,8 +248,7 @@ struct ur_command_list_manager { const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const 
ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 29a8bf7114862..056df7f27a477 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -59,14 +59,13 @@ ur_result_t urQueueFlush(ur_queue_handle_t hQueue) try { ur_result_t urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { return hQueue->get().enqueueKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -447,14 +446,13 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { return hQueue->get().enqueueKernelLaunchWithArgsExp( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + numArgs, pArgs, launchPropList, numEventsInWaitList, phEventWaitList, + phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 530897288f0fa..36eb54512160d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -27,12 +27,11 @@ struct ur_queue_t_ { ur_native_handle_t *) = 0; virtual ur_result_t queueFinish() = 0; virtual ur_result_t queueFlush() = 0; - virtual ur_result_t enqueueKernelLaunch(ur_kernel_handle_t, uint32_t, - const size_t *, const size_t *, - const size_t *, uint32_t, - const ur_kernel_launch_property_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; + virtual ur_result_t + enqueueKernelLaunch(ur_kernel_handle_t, uint32_t, const size_t *, + const size_t *, const size_t *, + const ur_kernel_launch_ext_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWait(uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t 
enqueueEventsWaitWithBarrier(uint32_t, @@ -166,7 +165,7 @@ struct ur_queue_t_ { virtual ur_result_t enqueueKernelLaunchWithArgsExp( ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, - uint32_t, const ur_kernel_launch_property_t *, uint32_t, + const ur_kernel_launch_ext_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 7f40d9139d63a..72deba534ec78 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -56,14 +56,13 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + launchPropList, numEventsInWaitList, phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -459,14 +458,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const 
ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendKernelLaunchWithArgsExp( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, + numArgs, pArgs, launchPropList, numEventsInWaitList, phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index 564c0472edb81..2cb624b4f263a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -69,15 +69,14 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + launchPropList, numEventsInWaitList, phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t 
@@ -511,16 +510,15 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendKernelLaunchWithArgsExp( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, phEventWaitList, + pLocalWorkSize, numArgs, pArgs, launchPropList, numEventsInWaitList, + phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 73999d1c387ad..754743485527d 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -5664,11 +5664,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -5683,17 +5680,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_event_handle_t *phEvent) try { ur_result_t result = UR_RESULT_SUCCESS; - ur_enqueue_kernel_launch_params_t params = {&hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + ur_enqueue_kernel_launch_params_t params = { + &hQueue, &hKernel, &workDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &pLocalWorkSize, &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback("urEnqueueKernelLaunch")); @@ -12230,11 +12220,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -12250,19 +12237,12 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( ur_result_t result = UR_RESULT_SUCCESS; ur_enqueue_kernel_launch_with_args_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numArgs, - &pArgs, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + &hQueue, &hKernel, + &workDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &pLocalWorkSize, + &numArgs, &pArgs, + &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index f1e7ea3c31f4d..97942169d89f5 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -99,16 +99,26 @@ static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) { UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const 
ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // We don't support any launch properties. - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE) { + + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + if (_launchPropList->flags) { + // We don't support any flags. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { + // We don't support any launch properties. return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + _launchPropList = static_cast( + _launchPropList->pNext); } UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -711,8 +721,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { @@ -743,7 +752,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( } } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 5fe74c4a32c33..889ba927e4378 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ 
b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -154,7 +154,7 @@ UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t, const ur_kernel_launch_property_t *, + const size_t *pLocalWorkSize, const ur_kernel_launch_ext_properties_t *, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); @@ -490,8 +490,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { for (uint32_t i = 0; i < numArgs; i++) { @@ -517,7 +516,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } diff --git a/unified-runtime/source/adapters/opencl/enqueue.cpp b/unified-runtime/source/adapters/opencl/enqueue.cpp index 1bd75b6b56aaf..527034d826ec7 100644 --- a/unified-runtime/source/adapters/opencl/enqueue.cpp +++ b/unified-runtime/source/adapters/opencl/enqueue.cpp @@ -46,24 +46,30 @@ void MapUREventsToCL(uint32_t numEvents, const ur_event_handle_t *UREvents, UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, 
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const size_t *pLocalWorkSize, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - // Adapters that don't support cooperative kernels are currently expected - // to ignore COOPERATIVE launch properties. Ideally we should avoid passing - // these at the SYCL RT level instead, see - // https://github.com/intel/llvm/issues/18421 - if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || - launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { - continue; - } + + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. 
Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + _launchPropList = static_cast( + _launchPropList->pNext); + } + std::vector compiledLocalWorksize; if (!pLocalWorkSize) { cl_device_id device = nullptr; @@ -511,24 +517,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, + const ur_kernel_launch_ext_properties_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - // Adapters that don't support cooperative kernels are currently expected - // to ignore COOPERATIVE launch properties. Ideally we should avoid passing - // these at the SYCL RT level instead, see - // https://github.com/intel/llvm/issues/18421 - if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || - launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { - continue; - } + + ur_kernel_launch_ext_properties_t *_launchPropList = + const_cast(launchPropList); + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. 
Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (_launchPropList->flags & ~UR_KERNEL_LAUNCH_FLAG_COOPERATIVE) { return UR_RESULT_ERROR_INVALID_OPERATION; } + while (_launchPropList != nullptr) { + if (_launchPropList->stype != + as_stype()) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + _launchPropList = static_cast( + _launchPropList->pNext); + } + clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr; UR_RETURN_ON_FAILURE( cl_ext::getExtFuncFromContext( diff --git a/unified-runtime/source/common/stype_map_helpers.def b/unified-runtime/source/common/stype_map_helpers.def index efd69e6ae4cb3..c053a5251c183 100644 --- a/unified-runtime/source/common/stype_map_helpers.def +++ b/unified-runtime/source/common/stype_map_helpers.def @@ -116,6 +116,15 @@ template <> struct stype_map : stype_map_impl {}; template <> +struct stype_map + : stype_map_impl {}; +template <> +struct stype_map + : stype_map_impl {}; +template <> +struct stype_map + : stype_map_impl {}; +template <> struct stype_map : stype_map_impl {}; template <> diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index 3df7897f406c4..a1292965253b7 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -509,11 +509,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -547,8 +544,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_result_t UrRes = getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + LaunchInfo.LocalWorkSize.data(), launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); if (UrRes != UR_RESULT_SUCCESS) { if (UrRes == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) { UR_LOG_L( @@ -1698,11 +1695,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1779,14 +1773,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, LaunchInfo.LocalWorkSize.data(), numArgs, pArgs, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); */ UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); + LaunchInfo.LocalWorkSize.data(), launchPropList, numEventsInWaitList, + phEventWaitList, phEvent)); UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 9f7abdf54a8ae..25d0307d8862b 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -474,11 +474,8 @@ ur_result_t urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -506,8 +503,8 @@ ur_result_t urEnqueueKernelLaunch( UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); + LaunchInfo.LocalWorkSize.data(), launchPropList, numEventsInWaitList, + phEventWaitList, phEvent)); UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1859,11 +1856,8 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1943,14 +1937,14 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, LaunchInfo.LocalWorkSize.data(), numArgs, pArgs, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); */ UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); + LaunchInfo.LocalWorkSize.data(), launchPropList, numEventsInWaitList, + phEventWaitList, phEvent)); UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 11ccfca969cf6..1172b741444cb 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1318,11 +1318,8 @@ ur_result_t urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1348,8 +1345,8 @@ ur_result_t urEnqueueKernelLaunch( UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); + pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, + phEvent)); UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1383,11 +1380,8 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1464,14 +1458,14 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( // TODO: revert to the correct call to pfnKernelLaunchWithArgsExp(): UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, + pLocalWorkSize, numArgs, pArgs, launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); */ UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); + pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, + phEvent)); UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index e30dbd895071c..d13bdabef86e0 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -4680,11 +4680,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. 
const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -4702,27 +4699,20 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( if (nullptr == pfnKernelLaunch) return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - ur_enqueue_kernel_launch_params_t params = {&hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + ur_enqueue_kernel_launch_params_t params = { + &hQueue, &hKernel, &workDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &pLocalWorkSize, &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms); auto &logger = getContext()->logger; UR_LOG_L(logger, INFO, " ---> urEnqueueKernelLaunch\n"); - ur_result_t result = pfnKernelLaunch( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + ur_result_t result = + pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms, &result, instance); @@ -10363,11 +10353,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] 
pointer to a list of kernel arg /// properties. const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -10387,19 +10374,12 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; ur_enqueue_kernel_launch_with_args_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numArgs, - &pArgs, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; + &hQueue, &hKernel, + &workDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &pLocalWorkSize, + &numArgs, &pArgs, + &launchPropList, &numEventsInWaitList, + &phEventWaitList, &phEvent}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, "urEnqueueKernelLaunchWithArgsExp", ¶ms); @@ -10409,8 +10389,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( ur_result_t result = pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + pLocalWorkSize, numArgs, pArgs, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, "urEnqueueKernelLaunchWithArgsExp", ¶ms, &result, diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp 
b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 464acb714b2a5..76f3a3cbff5bf 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -4552,11 +4552,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -4579,7 +4576,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( if (NULL == pGlobalWorkSize) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (launchPropList == NULL && numPropsInLaunchPropList > 0) + if (launchPropList == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; if (NULL == hQueue) @@ -4588,6 +4585,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( if (NULL == hKernel) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + if (NULL != launchPropList && + UR_KERNEL_LAUNCH_FLAGS_MASK & launchPropList->flags) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + if (phEventWaitList == NULL && numEventsInWaitList > 0) return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; @@ -4613,10 +4614,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( URLOG_CTX_INVALID_REFERENCE(hKernel); } - ur_result_t result = pfnKernelLaunch( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, 
phEvent); + ur_result_t result = + pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && phEvent) { @@ -11124,11 +11125,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -11152,7 +11150,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( if (NULL == pGlobalWorkSize) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (launchPropList == NULL && numPropsInLaunchPropList > 0) + if (launchPropList == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; if (pArgs == NULL && numArgs > 0) @@ -11167,6 +11165,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) return UR_RESULT_ERROR_INVALID_ENUMERATION; + if (NULL != launchPropList && + UR_KERNEL_LAUNCH_FLAGS_MASK & launchPropList->flags) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + if (phEventWaitList == NULL && numEventsInWaitList > 0) return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; @@ -11202,8 +11204,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( ur_result_t result = pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - 
pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + pLocalWorkSize, numArgs, pArgs, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && phEvent) { diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index 2940e77998695..2bf10e1eb968d 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -397,9 +397,11 @@ EXPORTS urPrintKernelGetSuggestedLocalWorkSizeParams urPrintKernelGroupInfo urPrintKernelInfo + urPrintKernelLaunchClusterProperty + urPrintKernelLaunchExtProperties + urPrintKernelLaunchFlags urPrintKernelLaunchPropertiesFlags - urPrintKernelLaunchProperty - urPrintKernelLaunchPropertyId + urPrintKernelLaunchWorkgroupProperty urPrintKernelNativeProperties urPrintKernelReleaseParams urPrintKernelRetainParams diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 7839a53f218dc..0c3bf3ec9f90b 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -397,9 +397,11 @@ urPrintKernelGetSuggestedLocalWorkSizeParams; urPrintKernelGroupInfo; urPrintKernelInfo; + urPrintKernelLaunchClusterProperty; + urPrintKernelLaunchExtProperties; + urPrintKernelLaunchFlags; urPrintKernelLaunchPropertiesFlags; - urPrintKernelLaunchProperty; - urPrintKernelLaunchPropertyId; + urPrintKernelLaunchWorkgroupProperty; urPrintKernelNativeProperties; urPrintKernelReleaseParams; urPrintKernelRetainParams; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index ecf768a1006a4..dd834463536e0 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -2584,11 +2584,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// 
execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -2610,8 +2607,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( // forward to device-platform return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } @@ -5891,11 +5887,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -5917,10 +5910,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( return UR_RESULT_ERROR_UNINITIALIZED; // forward to device-platform - return pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + return pfnKernelLaunchWithArgsExp(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, numArgs, + pArgs, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); } /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 1c5b83b224928..11cb5d67e662a 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -5136,7 +5136,10 @@ ur_result_t UR_APICALL urEventSetCallback( /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `launchPropList == NULL` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -5173,11 
+5176,8 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -5196,8 +5196,7 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_ERROR_UNINITIALIZED; return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, + pGlobalWorkSize, pLocalWorkSize, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); @@ -10816,10 +10815,12 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `launchPropList == NULL` /// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -10867,11 +10868,8 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// properties. 
const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -10889,10 +10887,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( if (nullptr == pfnKernelLaunchWithArgsExp) return UR_RESULT_ERROR_UNINITIALIZED; - return pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); + return pfnKernelLaunchWithArgsExp(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, numArgs, + pArgs, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); } catch (...) 
{ return exceptionToResult(std::current_exception()); } diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 4608b6bb86333..9a33bbff6dac6 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -890,19 +890,33 @@ ur_result_t urPrintExecutionInfo(enum ur_execution_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t -urPrintKernelLaunchPropertyId(enum ur_kernel_launch_property_id_t value, - char *buffer, const size_t buff_size, - size_t *out_size) { +ur_result_t urPrintKernelLaunchFlags(enum ur_kernel_launch_flag_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { std::stringstream ss; ss << value; return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t -urPrintKernelLaunchProperty(const struct ur_kernel_launch_property_t params, - char *buffer, const size_t buff_size, - size_t *out_size) { +ur_result_t urPrintKernelLaunchExtProperties( + const struct ur_kernel_launch_ext_properties_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintKernelLaunchClusterProperty( + const struct ur_kernel_launch_cluster_property_t params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintKernelLaunchWorkgroupProperty( + const struct ur_kernel_launch_workgroup_property_t params, char *buffer, + const size_t buff_size, size_t *out_size) { std::stringstream ss; ss << params; return str_copy(&ss, buffer, buff_size, out_size); diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 44b985fb95353..f44f194e664bf 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -4485,7 +4485,10 @@ 
ur_result_t UR_APICALL urEventSetCallback( /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `launchPropList == NULL` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -4522,11 +4525,8 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -9409,10 +9409,12 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` +/// + `launchPropList == NULL` /// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// + `NULL != launchPropList && ::UR_KERNEL_LAUNCH_FLAGS_MASK & +/// launchPropList->flags` /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_KERNEL /// - ::UR_RESULT_ERROR_INVALID_EVENT @@ -9460,11 +9462,8 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg /// 
properties. const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, + /// [in][optional] pointer to a single linked list of launch properties + const ur_kernel_launch_ext_properties_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index aadb43cc398ba..0d5cda72069c3 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -99,39 +99,32 @@ TEST_P(urEnqueueKernelLaunchTest, Success) { } TEST_P(urEnqueueKernelLaunchTest, SuccessWithLaunchProperties) { - std::vector props(1); - props[0].id = UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE; - + ur_kernel_launch_ext_properties_t props = { + UR_STRUCTURE_TYPE_KERNEL_LAUNCH_EXT_PROPERTIES, nullptr, 0}; ur_kernel_launch_properties_flags_t supported_properties = 0; + ASSERT_SUCCESS(urDeviceGetInfo( device, UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES, sizeof(supported_properties), &supported_properties, nullptr)); if (supported_properties & UR_KERNEL_LAUNCH_PROPERTIES_FLAG_COOPERATIVE) { - ur_kernel_launch_property_t coop_prop; - coop_prop.id = UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE; - coop_prop.value.cooperative = 1; - props.push_back(coop_prop); + props.flags |= UR_KERNEL_LAUNCH_FLAG_COOPERATIVE; } if (supported_properties & - UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION) { - ur_kernel_launch_property_t cluster_dims_prop; - cluster_dims_prop.id = UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; - cluster_dims_prop.value.clusterDim[0] = 16; - 
cluster_dims_prop.value.clusterDim[1] = 1; - cluster_dims_prop.value.clusterDim[2] = 1; - - props.push_back(cluster_dims_prop); + UR_KERNEL_LAUNCH_PROPERTIES_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE) { + props.flags |= UR_KERNEL_LAUNCH_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE; } if (supported_properties & - UR_KERNEL_LAUNCH_PROPERTIES_FLAG_OPPORTUNISTIC_QUEUE_SERIALIZE) { - ur_kernel_launch_property_t opportunistic_queue_serialize_prop; - opportunistic_queue_serialize_prop.id = - UR_KERNEL_LAUNCH_PROPERTY_ID_OPPORTUNISTIC_QUEUE_SERIALIZE; - opportunistic_queue_serialize_prop.value.opportunistic_queue_serialize = 1; - props.push_back(opportunistic_queue_serialize_prop); + UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION) { + ur_kernel_launch_cluster_property_t cluster_dims_prop; + cluster_dims_prop.stype = UR_STRUCTURE_TYPE_KERNEL_LAUNCH_CLUSTER_PROPERTY; + cluster_dims_prop.clusterDim[0] = 16; + cluster_dims_prop.clusterDim[1] = 1; + cluster_dims_prop.clusterDim[2] = 1; + cluster_dims_prop.pNext = nullptr; + props.pNext = &cluster_dims_prop; } ur_mem_handle_t buffer = nullptr; @@ -140,8 +133,8 @@ TEST_P(urEnqueueKernelLaunchTest, SuccessWithLaunchProperties) { UUR_RETURN_ON_FATAL_FAILURE(AddPodArg(val)); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, nullptr, 1, - &props[0], 0, nullptr, nullptr)); + &global_offset, &global_size, nullptr, + &props, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); diff --git a/unified-runtime/test/conformance/testing/include/uur/fixtures.h b/unified-runtime/test/conformance/testing/include/uur/fixtures.h index 424d4df5e4744..087a039934b2a 100644 --- a/unified-runtime/test/conformance/testing/include/uur/fixtures.h +++ b/unified-runtime/test/conformance/testing/include/uur/fixtures.h @@ -1504,8 +1504,8 @@ struct KernelLaunchHelper { void Launch1DRange(size_t global_size, size_t local_size = 1) { size_t offset = 0; 
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &offset, - &global_size, &local_size, 0, nullptr, - 0, nullptr, nullptr)); + &global_size, &local_size, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); }