Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions onemath/sycl/blas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,6 @@ if (SYCL_COMPILER MATCHES "adaptivecpp" OR ${CMAKE_CXX_COMPILER} MATCHES "acpp|s
set(BLAS_ENABLE_COMPLEX OFF PARENT_SCOPE)
endif()
endif()
if (BLAS_ENABLE_USM)
message(STATUS "USM API is not supported on AdaptiveCpp/hipSYCL. USM API is disabled")
set(BLAS_ENABLE_USM OFF)
if (NOT ${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
set(BLAS_ENABLE_USM OFF PARENT_SCOPE)
endif()
endif()
endif()

add_library(onemath_sycl_blas INTERFACE)
Expand Down
6 changes: 0 additions & 6 deletions onemath/sycl/blas/include/sb_handle/handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,19 @@ class SB_Handle {
using event_t = std::vector<sycl::event>;
inline SB_Handle(queue_t q)
:
#ifndef __ADAPTIVECPP__
tempMemPool_(nullptr),
#endif
q_(q),
workGroupSize_(helper::get_work_group_size(q)),
localMemorySupport_(helper::has_local_memory(q)),
computeUnits_(helper::get_num_compute_units(q)) {
}

#ifndef __ADAPTIVECPP__
inline SB_Handle(Temp_Mem_Pool* tmp)
: tempMemPool_(tmp),
q_(tmp->get_queue()),
workGroupSize_(helper::get_work_group_size(q_)),
localMemorySupport_(helper::has_local_memory(q_)),
computeUnits_(helper::get_num_compute_units(q_)) {}
#endif

template <helper::AllocType alloc, typename value_t>
typename std::enable_if<
Expand Down Expand Up @@ -192,9 +188,7 @@ class SB_Handle {
const size_t workGroupSize_;
const bool localMemorySupport_;
const size_t computeUnits_;
#ifndef __ADAPTIVECPP__
Temp_Mem_Pool* tempMemPool_;
#endif
};

} // namespace blas
Expand Down
3 changes: 0 additions & 3 deletions onemath/sycl/blas/include/sb_handle/temp_memory_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#ifndef TEMP_MEMORY_POOL_H
#define TEMP_MEMORY_POOL_H

#ifndef __ADAPTIVECPP__
#include <map>
#include <mutex>

Expand Down Expand Up @@ -112,6 +111,4 @@ class Temp_Mem_Pool {
};
} // namespace blas

#endif // __ADAPTIVECPP__

#endif
21 changes: 0 additions & 21 deletions onemath/sycl/blas/src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ typename sb_handle_t::event_t _sdsdot(
sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx,
increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t &_dependencies) {
#ifndef __ADAPTIVECPP__
if (!_N) {
using element_t = typename ValueType<container_2_t>::type;
sb_handle.wait(_dependencies);
Expand All @@ -167,11 +166,6 @@ typename sb_handle_t::event_t _sdsdot(
auto ret = sb_handle.execute(assignOp2, dotOp);
return blas::concatenate_vectors(dotOp, ret);
}
#else
throw std::runtime_error(
"Sdsdot is not supported with AdaptiveCpp as it uses SYCL 2020 "
"reduction.");
#endif
}

/**
Expand Down Expand Up @@ -959,7 +953,6 @@ typename ValueType<container_0_t>::type _dot(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _vy, increment_t _incy,
const typename sb_handle_t::event_t &_dependencies) {
#ifndef __ADAPTIVECPP__
constexpr bool is_usm = std::is_pointer<container_0_t>::value;
using element_t = typename ValueType<container_0_t>::type;
element_t res{0};
Expand All @@ -982,10 +975,6 @@ typename ValueType<container_0_t>::type _dot(
: helper::AllocType::buffer>(gpu_res,
sb_handle.get_queue());
return res;
#else
throw std::runtime_error(
"Dot is not supported with AdaptiveCpp as it uses SYCL 2020 reduction.");
#endif
}

/**
Expand Down Expand Up @@ -1109,7 +1098,6 @@ template <typename sb_handle_t, typename container_t, typename index_t,
typename ValueType<container_t>::type _asum(
sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx,
const typename sb_handle_t::event_t &_dependencies) {
#ifndef __ADAPTIVECPP__
constexpr bool is_usm = std::is_pointer<container_t>::value;
using element_t = typename ValueType<container_t>::type;
auto res = std::vector<element_t>(1, element_t(0));
Expand All @@ -1130,10 +1118,6 @@ typename ValueType<container_t>::type _asum(
: helper::AllocType::buffer>(
gpu_res, sb_handle.get_queue());
return res[0];
#else
throw std::runtime_error(
"Asum is not supported with AdaptiveCpp as it uses SYCL 2020 reduction.");
#endif
}

/**
Expand All @@ -1149,7 +1133,6 @@ template <typename sb_handle_t, typename container_t, typename index_t,
typename ValueType<container_t>::type _nrm2(
sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx,
const typename sb_handle_t::event_t &_dependencies) {
#ifndef __ADAPTIVECPP__
constexpr bool is_usm = std::is_pointer<container_t>::value;
using element_t = typename ValueType<container_t>::type;
auto res = std::vector<element_t>(1, element_t(0));
Expand All @@ -1169,10 +1152,6 @@ typename ValueType<container_t>::type _nrm2(
: helper::AllocType::buffer>(
gpu_res, sb_handle.get_queue());
return res[0];
#else
throw std::runtime_error(
"Nrm2 is not supported with AdaptiveCpp as it uses SYCL 2020 reduction.");
#endif
}

} // namespace internal
Expand Down
11 changes: 0 additions & 11 deletions onemath/sycl/blas/src/interface/blas2_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,6 @@ typename sb_handle_t::event_t _trsv_impl(
sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda,
container_t1 _vx, increment_t _incx,
const typename sb_handle_t::event_t& _dependencies) {
#ifdef __ADAPTIVECPP__
throw std::runtime_error("trsv requires SYCL 2020");
#else
static_assert(subgroup_size % subgroups == 0,
"`subgroups` needs to be a multiple of `subgroup_size`.");
using one = constant<increment_t, const_val::one>;
Expand Down Expand Up @@ -404,7 +401,6 @@ typename sb_handle_t::event_t _trsv_impl(
sb_handle.release_temp_mem(ret, sync_buffer);

return ret;
#endif
}

/*! _SYMV.
Expand Down Expand Up @@ -746,9 +742,6 @@ typename sb_handle_t::event_t _tbsv_impl(
sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA,
index_t _lda, container_t1 _vx, increment_t _incx,
const typename sb_handle_t::event_t& _dependencies) {
#ifdef __ADAPTIVECPP__
throw std::runtime_error("tbsv requires SYCL 2020");
#else
static_assert(subgroup_size % subgroups == 0,
"`subgroups` needs to be a multiple of `subgroup_size`.");

Expand Down Expand Up @@ -799,7 +792,6 @@ typename sb_handle_t::event_t _tbsv_impl(
sb_handle.release_temp_mem(ret, sync_buffer);

return ret;
#endif
}

template <uint32_t subgroup_size, uint32_t subgroups, uplo_type uplo,
Expand All @@ -809,9 +801,7 @@ template <uint32_t subgroup_size, uint32_t subgroups, uplo_type uplo,
typename sb_handle_t::event_t _tpsv_impl(
sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx,
increment_t _incx, const typename sb_handle_t::event_t& _dependencies) {
#ifdef __ADAPTIVECPP__
throw std::runtime_error("tpsv requires SYCL 2020");
#else
static_assert(subgroup_size % subgroups == 0,
"`subgroups` needs to be a multiple of `subgroup_size`.");

Expand Down Expand Up @@ -864,7 +854,6 @@ typename sb_handle_t::event_t _tpsv_impl(
sb_handle.release_temp_mem(ret, sync_buffer);

return ret;
#endif
}

/**** RANK 1 MODIFICATION ****/
Expand Down
2 changes: 0 additions & 2 deletions onemath/sycl/blas/src/operations/blas2/txsv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ ONEMATH_SYCL_BLAS_INLINE
is_upper, is_transposed, is_unitdiag>::eval(local_memory_t local_mem,
sycl::nd_item<1> ndItem) {
value_t ret = 0;
#ifndef __ADAPTIVECPP__

constexpr bool is_forward =
(is_upper && is_transposed) || (!is_upper && !is_transposed);
Expand Down Expand Up @@ -291,7 +290,6 @@ ONEMATH_SYCL_BLAS_INLINE

sycl::atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::device);

#endif
return ret;
}

Expand Down
8 changes: 4 additions & 4 deletions onemath/sycl/blas/src/sb_handle/handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@ typename std::enable_if<
alloc == helper::AllocType::buffer,
typename helper::AllocHelper<value_t, alloc>::type>::type
SB_Handle::acquire_temp_mem(size_t size) {
#ifndef __ADAPTIVECPP__
if (tempMemPool_ != nullptr)
return tempMemPool_->acquire_buff_mem<value_t>(size);
else
#endif
return make_sycl_iterator_buffer<value_t>(size);
}

Expand All @@ -58,11 +56,9 @@ typename std::enable_if<
typename SB_Handle::event_t>::type
SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies,
const container_t& mem) {
#ifndef __ADAPTIVECPP__
if (tempMemPool_ != nullptr)
return tempMemPool_->release_buff_mem(dependencies, mem);
else
#endif
return {};
}

Expand Down Expand Up @@ -92,7 +88,11 @@ SB_Handle::release_temp_mem(const typename SB_Handle::event_t& dependencies,
sycl::context context = q_.get_context();
return {q_.submit([&](sycl::handler& cgh) {
cgh.depends_on(dependencies);
#ifndef __ADAPTIVECPP__
cgh.host_task([=]() { sycl::free(mem, context); });
#else
cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle &) { sycl::free(mem, context); });
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct to do a free here inside a custom operation? Custom operations will be evaluated at submission time, which is a fundamental difference from host task (See https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/enqueue-custom-operation.md). So this looks to me like there might be a hazard of deleting memory while the device might still be using it?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess you're correct. I am not sure how to fix this though, do you have a suggestion?

Copy link

@illuhad illuhad Jun 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In pseudo code, maybe something like this could work (and, if the C++ standard library uses a thread pool, should not be slower than host_task):

std::async(..., [=](){
  if(q.is_in_order()) {
    q.wait();
  } else {
    for(auto d : dependencies){
      d.wait();
    }
  }
  free(...);
});

Not sure if we need the in-order branch. If we reliably get dependency lists even for in-order queues we might not need it.

The real issue here is that SYCL does not have the asynchronous free mechanisms that e.g. CUDA has.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is a workaround for an asynchronous free. The suggested solution makes sense to me. I suggest making a helper function for it, since the pattern is repeated a few times.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be some complexity here, since at some point we may need to wait on the std::future returned by std::async to get things working.

@Rbiessy I would recommend to also reconsider the current workaround for the DPC++ side. According to the current SYCL 2020 specification,

Capturing accessors in a host task is allowed, however, capturing or using any other SYCL class that has reference semantics (see Section 4.5.2) is undefined behavior.

(https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interfaces.hosttasks.overview)

The sycl::context class, similarly to sycl::queue or sycl::device, follows common reference semantics, and thus cannot be used inside the host_task. So, the current code is undefined behavior and might break at any time.

The background here is that many backends impose the restriction that enqueued callbacks may not make calls into the backend runtime API or risk deadlocks. For example, in CUDA, cudaStreamAddCallback requires that the registered callback may not perform any calls into the CUDA runtime, otherwise, deadlocks or other forms of UB might appear.

The SYCL restriction that common-reference-semantics objects are not allowed in host tasks enforces this limitation.

Unfortunately, because of limitations like this, the current host_task is far less useful than it might appear at first glance, which is why AdaptiveCpp never implemented it. It's full of potential performance or correctness issues unfortunately :(

I'm not sure what the best approach here is. I suspect that designing a new approach might require a better understanding of what the goal here is with the async free emulation than I currently have.

One solution that I've considered would be to just add asynchronous free to SYCL implementations as an extension, but that is also not trivial, since the OpenCL USM extensions don't support this functionality. Also, it would mean that the code would rely on specific extensions, and even if both AdaptiveCpp and DPC++ implemented them, the code would no longer be based on standard SYCL and it wouldn't work anymore with other compilers.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I am aware it is technically UB according to SYCL spec. In practice we agreed that it should work fine with DPC++ and likely other SYCL implementations that would support host_task.
I was just reminded that there is already an extension for async alloc and free, see https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_async_memory_alloc.asciidoc.
DPC++ already supports that so if we want to change the DPC++ path I would strongly suggest to use this. We don't have enough bandwidth to work on this at Codeplay but we could help with the review. It may be a good opportunity to update it if AdaptiveCpp were to support that extension?

Copy link

@illuhad illuhad Jun 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In practice we agreed that it should work fine with DPC++ and likely other SYCL implementations that would support host_task

It's not clear to me why that should be the case. I can see that for specific implementation choices of host_task (e.g. using a SYCL-managed worker thread instead of a backend callback) it might be fine. But that is an implementation detail and not guaranteed.

It may be a good opportunity to update it if AdaptiveCpp were to support that extension?

hmm... We already have our own memory pool interface which is different from the one in that extension. The real new feature I suppose would be a mechanism to extend lifetime of an object until a specific kernel has finished executing. Then we could use RAII to return a memory object to the pool.

I wonder how DPC++ implements the async free on OpenCL, given that the Intel OpenCL USM extension doesn't have it.

#endif
})};
}
}
Expand Down
10 changes: 8 additions & 2 deletions onemath/sycl/blas/src/sb_handle/temp_memory_pool.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#ifndef TEMP_MEMORY_POOL_HPP
#define TEMP_MEMORY_POOL_HPP
#ifndef __ADAPTIVECPP__
#include "helper.h"

namespace blas {
Expand Down Expand Up @@ -54,7 +53,11 @@ typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_buff_mem(
const container_t& mem) {
return {q_.submit([&](sycl::handler& cgh) {
cgh.depends_on(dependencies);
#ifndef __ADAPTIVECPP__
cgh.host_task([&, mem]() { release_buff_mem_(mem); });
#else
cgh.AdaptiveCpp_enqueue_custom_operation([&, mem](sycl::interop_handle &) { release_buff_mem_(mem); });
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps same issue here?

#endif
})};
}

Expand Down Expand Up @@ -110,10 +113,13 @@ typename Temp_Mem_Pool::event_t Temp_Mem_Pool::release_usm_mem(
const container_t& mem) {
return {q_.submit([&](sycl::handler& cgh) {
cgh.depends_on(dependencies);
#ifndef __ADAPTIVECPP__
cgh.host_task([&, mem]() { release_usm_mem_(mem); });
#else
cgh.AdaptiveCpp_enqueue_custom_operation([&, mem](sycl::interop_handle &) { release_usm_mem_(mem); });
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here?

#endif
})};
}
}
#endif // SB_ENABLE_USM
#endif // __ADAPTIVECPP__
#endif