Skip to content

Commit 6a2b416

Browse files
committed
Manage Tracy colors in a centralized location
1 parent 52910f0 commit 6a2b416

14 files changed

+137
-60
lines changed

include/buffer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ class buffer {
9898
/// It notifies the runtime of buffer creation and destruction and also persists changes of the buffer debug name.
9999
struct tracker {
100100
tracker(const celerity::range<Dims>& range, const void* const host_init_ptr) : range(range) {
101-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", DarkSlateBlue);
101+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", buffer_ctor);
102102

103103
if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
104104
auto user_aid = detail::null_allocation_id;
@@ -116,7 +116,7 @@ class buffer {
116116
tracker& operator=(tracker&&) = delete;
117117

118118
~tracker() {
119-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", DarkCyan);
119+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", buffer_dtor);
120120
detail::runtime::get_instance().destroy_buffer(id);
121121
// The user must guarantee liveness of the user pointer only until the buffer instance goes out of scope
122122
// TODO This is more synchronization than necessary - consider issuing a fence-like task that does not block concurrent tasks.

include/distr_queue.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
6969
template <typename CGF>
7070
void submit(CGF cgf) { // NOLINT(readability-convert-member-functions-to-static)
7171
// (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.)
72-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", Orange3);
72+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", distr_queue_submit);
7373
auto cg = detail::invoke_command_group_function(std::move(cgf));
7474
[[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg));
7575
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid);
@@ -83,7 +83,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
8383
* @warning { This is very slow, as it drains all queues and synchronizes across the entire cluster. }
8484
*/
8585
void slow_full_sync() { // NOLINT(readability-convert-member-functions-to-static)
86-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", Red2);
86+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", distr_queue_slow_full_sync);
8787
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier);
8888
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} slow_full_sync", tid);
8989
}
@@ -126,7 +126,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
126126
/// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object.
127127
struct tracker {
128128
tracker(const detail::devices_or_selector& devices_or_selector) {
129-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", DarkSlateBlue);
129+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", distr_queue_ctor);
130130

131131
if(!detail::runtime::has_instance()) {
132132
detail::runtime::init(nullptr, nullptr, devices_or_selector);
@@ -144,7 +144,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
144144
tracker& operator=(tracker&&) = delete;
145145

146146
~tracker() {
147-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", DarkCyan);
147+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", distr_queue_dtor);
148148

149149
detail::runtime::get_instance().destroy_queue();
150150

include/fence.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ class buffer_fence_promise final : public detail::task_promise {
110110
template <typename T>
111111
std::future<T> fence(const experimental::host_object<T>& obj) {
112112
static_assert(std::is_object_v<T>, "host_object<T&> and host_object<void> are not allowed as parameters to fence()");
113-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
113+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);
114114

115115
const host_object_effect effect{detail::get_host_object_id(obj), experimental::side_effect_order::sequential};
116116
auto promise = std::make_unique<detail::host_object_fence_promise<T>>(detail::get_host_object_instance(obj));
@@ -123,7 +123,7 @@ std::future<T> fence(const experimental::host_object<T>& obj) {
123123

124124
template <typename DataT, int Dims>
125125
std::future<buffer_snapshot<DataT, Dims>> fence(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) {
126-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
126+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);
127127

128128
detail::buffer_access access{detail::get_buffer_id(buf), access_mode::read,
129129
std::make_unique<detail::range_mapper<Dims, celerity::access::fixed<Dims>>>(celerity::access::fixed<Dims>(sr), buf.get_range())};

include/host_object.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ struct host_object_tracker {
3838
bool references_user_object;
3939

4040
explicit host_object_tracker(std::unique_ptr<host_object_instance> instance) : references_user_object(instance == nullptr) {
41-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", DarkSlateBlue);
41+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", host_object_ctor);
4242
if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
4343
id = detail::runtime::get_instance().create_host_object(std::move(instance));
4444
}
@@ -49,7 +49,7 @@ struct host_object_tracker {
4949
host_object_tracker& operator=(const host_object_tracker&) = delete;
5050

5151
~host_object_tracker() {
52-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("~host_object::host_object", DarkCyan);
52+
// Zone name follows the "<class>::~<class>" pattern used by the other destructor zones (e.g. "buffer::~buffer").
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::~host_object", host_object_dtor);
5353
detail::runtime::get_instance().destroy_host_object(id);
5454
// The user must guarantee liveness of the referenced object only until the host_object instance goes out of scope
5555
if(references_user_object) { detail::runtime::get_instance().sync(detail::epoch_action::none); }

include/queue.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class queue {
3636
template <typename CGF>
3737
void submit(CGF&& cgf) { // NOLINT(readability-convert-member-functions-to-static)
3838
// (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.)
39-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", Orange3);
39+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", queue_submit);
4040
auto cg = detail::invoke_command_group_function(std::forward<CGF>(cgf));
4141
[[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg));
4242
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid);
@@ -49,7 +49,7 @@ class queue {
4949
/// Note that this overload of `wait` does not issue a global barrier, so when using this for simple user-side benchmarking, cluster nodes might disagree on
5050
/// start time measurements. Use `wait(experimental::barrier)` instead for benchmarking purposes.
5151
void wait() { // NOLINT(readability-convert-member-functions-to-static)
52-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2);
52+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait);
5353
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::none);
5454
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait", tid);
5555
}
@@ -58,7 +58,7 @@ class queue {
5858
///
5959
/// This has an even higher latency than `wait()`, but may be useful for user-side performance measurements.
6060
void wait(detail::barrier_tag /* barrier */) { // NOLINT(readability-convert-member-functions-to-static)
61-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2);
61+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait);
6262
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier);
6363
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait (barrier)", tid);
6464
}
@@ -95,7 +95,7 @@ class queue {
9595
/// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object.
9696
struct tracker {
9797
tracker() {
98-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", DarkSlateBlue);
98+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", queue_ctor);
9999
if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr, detail::auto_select_devices{}); }
100100
detail::runtime::get_instance().create_queue();
101101
}
@@ -106,7 +106,7 @@ class queue {
106106
tracker& operator=(tracker&&) = delete;
107107

108108
~tracker() {
109-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", DarkCyan);
109+
// Destructor zone: use the dedicated dtor color entry (queue_dtor aliases distr_queue_dtor), not the ctor one.
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", queue_dtor);
110110

111111
detail::runtime::get_instance().destroy_queue();
112112

include/tracy.h

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,86 @@ inline const char* leak_name(const std::string& name) {
8282
inline void set_thread_name_and_order(const std::string& name, const int32_t index) {
8383
const int32_t order = tracy_detail::lane_order::thread + index;
8484
assert(order <= static_cast<int32_t>(tracy_detail::lane_order::thread_max));
85-
::tracy::SetThreadNameWithHint(leak_name(name), order);
85+
tracy::SetThreadNameWithHint(leak_name(name), order);
8686
}
8787

8888
} // namespace celerity::detail::tracy_detail
8989

90+
namespace celerity::detail {
91+
92+
/// Central registry of the colors used for Tracy profiling zones.
///
/// Each enumerator is named after the zone it colors (e.g. `buffer_ctor` for the "buffer::buffer" zone) and holds a `tracy::Color` value.
/// The underlying type is that of `tracy::Color::ColorType`; `CELERITY_DETAIL_TRACY_ZONE_SCOPED` receives an enumerator name as its
/// `COLOR_NAME` argument and casts the enumerator back to that underlying type.
/// Enumerators initialized from another enumerator (e.g. `cuda_memcpy_1d = cuda_memcpy`, `queue_ctor = distr_queue_ctor`) share that
/// entry's color, so changing the referenced entry changes both.
enum class trace_color : std::underlying_type_t<tracy::Color::ColorType> {
93+
generic_red = tracy::Color::Red,
94+
generic_green = tracy::Color::Green,
95+
generic_blue = tracy::Color::Blue,
96+
generic_yellow = tracy::Color::Yellow,
97+
98+
buffer_ctor = tracy::Color::DarkSlateBlue,
99+
buffer_dtor = tracy::Color::DarkCyan,
100+
101+
cuda_memcpy = tracy::Color::ForestGreen,
102+
cuda_memcpy_1d = cuda_memcpy,
103+
cuda_memcpy_2d = cuda_memcpy,
104+
cuda_memcpy_3d = cuda_memcpy,
105+
cuda_record_event = tracy::Color::ForestGreen,
106+
107+
distr_queue_ctor = tracy::Color::DarkSlateBlue,
108+
distr_queue_dtor = tracy::Color::DarkCyan,
109+
distr_queue_slow_full_sync = tracy::Color::Red2,
110+
distr_queue_submit = tracy::Color::Orange3,
111+
112+
executor_fetch = tracy::Color::Gray,
113+
executor_issue = tracy::Color::Blue,
114+
executor_issue_copy = tracy::Color::Green4,
115+
executor_issue_device_kernel = tracy::Color::Yellow2,
116+
executor_make_accessor_info = tracy::Color::Magenta3,
117+
executor_oob_check = tracy::Color::Red,
118+
executor_oob_init = executor_oob_check,
119+
executor_retire = tracy::Color::Brown,
120+
executor_starve = tracy::Color::DarkSlateGray,
121+
122+
host_object_ctor = tracy::Color::DarkSlateBlue,
123+
host_object_dtor = tracy::Color::DarkCyan,
124+
125+
iggen_allocate = tracy::Color::Teal,
126+
iggen_anticipate = iggen_allocate,
127+
iggen_coherence = tracy::Color::Red2,
128+
iggen_launch_kernel = tracy::Color::Blue2,
129+
iggen_perform_buffer_access = tracy::Color::Red3,
130+
iggen_satisfy_buffer_requirements = tracy::Color::ForestGreen,
131+
iggen_split_task = tracy::Color::Maroon,
132+
133+
mpi_finalize = tracy::Color::LightSkyBlue,
134+
mpi_init = tracy::Color::LightSkyBlue,
135+
136+
out_of_order_engine_assign = tracy::Color::Blue3,
137+
out_of_order_engine_complete = tracy::Color::Blue3,
138+
out_of_order_engine_submit = tracy::Color::Blue3,
139+
140+
queue_ctor = distr_queue_ctor,
141+
queue_dtor = distr_queue_dtor,
142+
queue_fence = tracy::Color::Green2,
143+
queue_submit = distr_queue_submit,
144+
queue_wait = distr_queue_slow_full_sync,
145+
146+
runtime_select_devices = tracy::Color::PaleVioletRed,
147+
runtime_shutdown = tracy::Color::DimGray,
148+
runtime_startup = tracy::Color::DarkGray,
149+
150+
scheduler_buffer_created = tracy::Color::DarkGreen,
151+
scheduler_buffer_destroyed = scheduler_buffer_created,
152+
scheduler_buffer_name_changed = tracy::Color::DarkGreen,
153+
scheduler_build_task = tracy::Color::WebMaroon,
154+
scheduler_compile_command = tracy::Color::MidnightBlue,
155+
scheduler_host_object_created = tracy::Color::DarkGreen,
156+
scheduler_host_object_destroyed = scheduler_host_object_created,
157+
scheduler_prune = tracy::Color::Gray,
158+
159+
sycl_init = tracy::Color::Orange2,
160+
sycl_submit = tracy::Color::Orange2,
161+
};
162+
163+
}
164+
90165
#define CELERITY_DETAIL_IF_TRACY_SUPPORTED(...) __VA_ARGS__
91166

92167
#else
@@ -100,7 +175,9 @@ inline void set_thread_name_and_order(const std::string& name, const int32_t ind
100175
#define CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(...) CELERITY_DETAIL_IF_TRACY_SUPPORTED(if(::celerity::detail::tracy_detail::is_enabled_full()) { __VA_ARGS__; })
101176

102177
#define CELERITY_DETAIL_TRACY_ZONE_SCOPED(TAG, COLOR_NAME) \
103-
CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, ::tracy::Color::COLOR_NAME, ::celerity::detail::tracy_detail::is_enabled()))
178+
CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, \
179+
static_cast<std::underlying_type_t<::celerity::detail::trace_color>>(::celerity::detail::trace_color::COLOR_NAME), \
180+
::celerity::detail::tracy_detail::is_enabled()))
104181

105182
#define CELERITY_DETAIL_TRACY_ZONE_NAME(...) \
106183
CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(::celerity::detail::tracy_detail::apply_string([&](const auto& n) { ZoneName(n.data(), n.size()); }, __VA_ARGS__))

src/backend/sycl_backend.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ sycl_backend::~sycl_backend() {
213213
const system_info& sycl_backend::get_system_info() const { return m_impl->system; }
214214

215215
void sycl_backend::init() {
216-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", Orange2);
216+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", sycl_init);
217217

218218
// Instantiate the first in-order queue on each device. At least for CUDA systems this will perform device initialization, which can take > 100 ms / device.
219219
for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
@@ -276,7 +276,7 @@ async_event sycl_backend::enqueue_device_kernel(const device_id device, const si
276276
std::vector<closure_hydrator::accessor_info> accessor_infos, const box<3>& execution_range, const std::vector<void*>& reduction_ptrs) //
277277
{
278278
return enqueue_device_work(device, lane, [=, this, acc_infos = std::move(accessor_infos)](sycl::queue& queue) mutable {
279-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
279+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
280280
auto event = queue.submit([&](sycl::handler& sycl_cgh) {
281281
auto& hydrator = closure_hydrator::get_instance();
282282
hydrator.arm(target::device, std::move(acc_infos));

src/backend/sycl_cuda_backend.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,17 @@ void nd_copy_device_async(const cudaStream_t stream, const void* const source_ba
3838
if(layout.contiguous_size == 0) return;
3939

4040
if(layout.num_complex_strides == 0) {
41-
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", ForestGreen, "cudaMemcpyAsync");
41+
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", cuda_memcpy_1d, "cudaMemcpyAsync");
4242
CELERITY_CUDA_CHECK(cudaMemcpyAsync, static_cast<std::byte*>(dest_base) + layout.offset_in_dest,
4343
static_cast<const std::byte*>(source_base) + layout.offset_in_source, layout.contiguous_size, cudaMemcpyDefault, stream);
4444
} else if(layout.num_complex_strides == 1) {
45-
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", ForestGreen, "cudaMemcpy2DAsync");
45+
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", cuda_memcpy_2d, "cudaMemcpy2DAsync");
4646
CELERITY_CUDA_CHECK(cudaMemcpy2DAsync, static_cast<std::byte*>(dest_base) + layout.offset_in_dest, layout.strides[0].dest_stride,
4747
static_cast<const std::byte*>(source_base) + layout.offset_in_source, layout.strides[0].source_stride, layout.contiguous_size,
4848
layout.strides[0].count, cudaMemcpyDefault, stream);
4949
} else {
5050
assert(layout.num_complex_strides == 2);
51-
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", ForestGreen, "cudaMemcpy3DAsync");
51+
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", cuda_memcpy_3d, "cudaMemcpy3DAsync");
5252
// Arriving in the 3D case means no dimensionality reduction was possible, and cudaMemcpy3D is more closely aligned to the parameters to
5353
// nd_copy_device_async than to nd_copy_layout, so we don't compute cudaMemcpy3DParms from `layout`.
5454
cudaMemcpy3DParms parms = {};
@@ -81,7 +81,7 @@ void nd_copy_device_async(cudaStream_t stream, const void* const source_base, vo
8181
nd_copy_device_async(stream, source, dest, source_box, dest_box, copy_box, elem_size);
8282
},
8383
[stream](const void* const source, void* const dest, size_t size_bytes) {
84-
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", ForestGreen, "cudaMemcpyAsync");
84+
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", cuda_memcpy, "cudaMemcpyAsync");
8585
CELERITY_CUDA_CHECK(cudaMemcpyAsync, dest, source, size_bytes, cudaMemcpyDefault, stream);
8686
});
8787
}
@@ -101,7 +101,7 @@ struct cuda_native_event_deleter {
101101
using unique_cuda_native_event = std::unique_ptr<std::remove_pointer_t<cudaEvent_t>, cuda_native_event_deleter>;
102102

103103
unique_cuda_native_event record_native_event(const cudaStream_t stream, bool enable_profiling) {
104-
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", ForestGreen, "cudaEventRecord")
104+
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", cuda_record_event, "cudaEventRecord");
105105
cudaEvent_t event;
106106
CELERITY_CUDA_CHECK(cudaEventCreateWithFlags, &event, enable_profiling ? cudaEventDefault : cudaEventDisableTiming);
107107
CELERITY_CUDA_CHECK(cudaEventRecord, event, stream);

src/backend/sycl_generic_backend.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void nd_copy_device_chunked(sycl::queue& queue, const void* const source_base, v
2727
const auto layout = layout_nd_copy(source_box.get_range(), dest_box.get_range(), copy_box.get_offset() - source_box.get_offset(),
2828
copy_box.get_offset() - dest_box.get_offset(), copy_box.get_range(), elem_size);
2929
for_each_contiguous_chunk(layout, [&](const size_t chunk_offset_in_source, const size_t chunk_offset_in_dest, const size_t chunk_size) {
30-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
30+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
3131
// first, last: We remember the first and last submission event to report completion time spanning the entire region copy
3232
last = queue.memcpy(
3333
static_cast<std::byte*>(dest_base) + chunk_offset_in_dest, static_cast<const std::byte*>(source_base) + chunk_offset_in_source, chunk_size);
@@ -47,7 +47,7 @@ async_event nd_copy_device_generic(sycl::queue& queue, const void* const source_
4747
[&queue, elem_size, enable_profiling, &first, &last](const void* const source, void* const dest, const box<3>& source_box, const box<3>& dest_box,
4848
const box<3>& copy_box) { nd_copy_device_chunked(queue, source, dest, source_box, dest_box, copy_box, elem_size, enable_profiling, first, last); },
4949
[&queue, enable_profiling, &first, &last](const void* const source, void* const dest, size_t size_bytes) {
50-
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
50+
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
5151
last = queue.memcpy(dest, source, size_bytes);
5252
if(enable_profiling) { first = last; }
5353
});

0 commit comments

Comments
 (0)