Skip to content

Commit 2cddadb

Browse files
authored
Migrated radix sort sample (#1999)
* Add sample * Adjust helper_cuda file * Add intitial documentation * Add sample * Adjust helper_cuda file * Add intitial documentation * Modify json * Update README.md * Modify helper file
1 parent 9a91b8b commit 2cddadb

File tree

12 files changed

+3558
-0
lines changed

12 files changed

+3558
-0
lines changed

DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_cuda.h

Lines changed: 1022 additions & 0 deletions
Large diffs are not rendered by default.

DirectProgramming/C++SYCL/GraphTraversal/radix_sort_thrust_migrated/01_sycl_dpct_output/Common/helper_string.h

Lines changed: 428 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+
*
3+
* Redistribution and use in source and binary forms, with or without
4+
* modification, are permitted provided that the following conditions
5+
* are met:
6+
* * Redistributions of source code must retain the above copyright
7+
* notice, this list of conditions and the following disclaimer.
8+
* * Redistributions in binary form must reproduce the above copyright
9+
* notice, this list of conditions and the following disclaimer in the
10+
* documentation and/or other materials provided with the distribution.
11+
* * Neither the name of NVIDIA CORPORATION nor the names of its
12+
* contributors may be used to endorse or promote products derived
13+
* from this software without specific prior written permission.
14+
*
15+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*/
27+
28+
#include <oneapi/dpl/execution>
29+
#include <oneapi/dpl/algorithm>
30+
#include <sycl/sycl.hpp>
31+
#include <dpct/dpct.hpp>
32+
#include <dpct/dpl_utils.hpp>
33+
34+
#include "helper_cuda.h"
35+
36+
#include <algorithm>
37+
#include <time.h>
38+
#include <limits.h>
39+
#include <chrono>
40+
41+
template <typename T, bool floatKeys> bool testSort(int argc, char **argv) try {
42+
int cmdVal;
43+
int keybits = 32;
44+
45+
unsigned int numElements = 1048576;
46+
bool keysOnly = checkCmdLineFlag(argc, (const char **)argv, "keysonly");
47+
bool quiet = checkCmdLineFlag(argc, (const char **)argv, "quiet");
48+
49+
if (checkCmdLineFlag(argc, (const char **)argv, "n")) {
50+
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "n");
51+
numElements = cmdVal;
52+
53+
if (cmdVal < 0) {
54+
printf("Error: elements must be > 0, elements=%d is invalid\n", cmdVal);
55+
exit(EXIT_SUCCESS);
56+
}
57+
}
58+
59+
if (checkCmdLineFlag(argc, (const char **)argv, "keybits")) {
60+
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "keybits");
61+
keybits = cmdVal;
62+
63+
if (keybits <= 0) {
64+
printf("Error: keybits must be > 0, keybits=%d is invalid\n", keybits);
65+
exit(EXIT_SUCCESS);
66+
}
67+
}
68+
69+
unsigned int numIterations = (numElements >= 16777216) ? 10 : 100;
70+
71+
if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) {
72+
cmdVal = getCmdLineArgumentInt(argc, (const char **)argv, "iterations");
73+
numIterations = cmdVal;
74+
}
75+
76+
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
77+
printf("Command line:\nradixSortThrust [-option]\n");
78+
printf("Valid options:\n");
79+
printf("-n=<N> : number of elements to sort\n");
80+
printf("-keybits=bits : keybits must be > 0\n");
81+
printf(
82+
"-keysonly : only sort an array of keys (default sorts key-value "
83+
"pairs)\n");
84+
printf(
85+
"-float : use 32-bit float keys (default is 32-bit unsigned "
86+
"int)\n");
87+
printf(
88+
"-quiet : Output only the number of elements and the time to "
89+
"sort\n");
90+
printf("-help : Output a help message\n");
91+
exit(EXIT_SUCCESS);
92+
}
93+
94+
if (!quiet)
95+
printf("\nSorting %d %d-bit %s keys %s\n\n", numElements, keybits,
96+
floatKeys ? "float" : "unsigned int",
97+
keysOnly ? "(only)" : "and values");
98+
99+
int deviceID = -1;
100+
101+
if (0 == deviceID = dpct::dev_mgr::instance().current_device_id()) {
102+
dpct::device_info devprop;
103+
dpct::dev_mgr::instance().get_device(deviceID).get_device_info(devprop);
104+
unsigned int totalMem = (keysOnly ? 2 : 4) * numElements * sizeof(T);
105+
106+
if (devprop.get_global_mem_size() < totalMem) {
107+
printf("Error: insufficient amount of memory to sort %d elements.\n",
108+
numElements);
109+
printf("%d bytes needed, %d bytes available\n", (int)totalMem,
110+
(int)devprop.get_global_mem_size());
111+
exit(EXIT_SUCCESS);
112+
}
113+
}
114+
115+
std::vector<T> h_keys(numElements);
116+
std::vector<T> h_keysSorted(numElements);
117+
std::vector<unsigned int> h_values;
118+
119+
if (!keysOnly) h_values = std::vector<unsigned int>(numElements);
120+
121+
// Fill up with some random data
122+
/*
123+
DPCT1008:14: clock function is not defined in SYCL. This is a
124+
hardware-specific feature. Consult with your hardware vendor to find a
125+
replacement.
126+
*/
127+
thrust::default_random_engine rng(clock());
128+
129+
if (floatKeys) {
130+
thrust::uniform_real_distribution<float> u01(0, 1);
131+
132+
for (int i = 0; i < (int)numElements; i++) h_keys[i] = u01(rng);
133+
} else {
134+
thrust::uniform_int_distribution<unsigned int> u(0, UINT_MAX);
135+
136+
for (int i = 0; i < (int)numElements; i++) h_keys[i] = u(rng);
137+
}
138+
139+
if (!keysOnly)
140+
dpct::iota(oneapi::dpl::execution::seq, h_values.begin(), h_values.end());
141+
142+
// Copy data onto the GPU
143+
dpct::device_vector<T> d_keys;
144+
dpct::device_vector<unsigned int> d_values;
145+
146+
// run multiple iterations to compute an average sort time
147+
dpct::event_ptr start_event, stop_event;
148+
std::chrono::time_point<std::chrono::steady_clock> start_event_ct1;
149+
std::chrono::time_point<std::chrono::steady_clock> stop_event_ct1;
150+
checkCudaErrors(DPCT_CHECK_ERROR(start_event = new sycl::event()));
151+
checkCudaErrors(DPCT_CHECK_ERROR(stop_event = new sycl::event()));
152+
153+
float totalTime = 0;
154+
155+
for (unsigned int i = 0; i < numIterations; i++) {
156+
// reset data before sort
157+
d_keys = h_keys;
158+
159+
if (!keysOnly) d_values = h_values;
160+
161+
/*
162+
DPCT1012:15: Detected kernel execution time measurement pattern and
163+
generated an initial code for time measurements in SYCL. You can change the
164+
way time is measured depending on your goals.
165+
*/
166+
/*
167+
DPCT1024:16: The original code returned the error code that was further
168+
consumed by the program logic. This original code was replaced with 0. You
169+
may need to rewrite the program logic consuming the error code.
170+
*/
171+
start_event_ct1 = std::chrono::steady_clock::now();
172+
checkCudaErrors(0);
173+
174+
if (keysOnly)
175+
oneapi::dpl::sort(
176+
oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
177+
d_keys.begin(), d_keys.end());
178+
else
179+
dpct::sort(
180+
oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
181+
d_keys.begin(), d_keys.end(), d_values.begin());
182+
183+
/*
184+
DPCT1012:17: Detected kernel execution time measurement pattern and
185+
generated an initial code for time measurements in SYCL. You can change the
186+
way time is measured depending on your goals.
187+
*/
188+
/*
189+
DPCT1024:18: The original code returned the error code that was further
190+
consumed by the program logic. This original code was replaced with 0. You
191+
may need to rewrite the program logic consuming the error code.
192+
*/
193+
stop_event_ct1 = std::chrono::steady_clock::now();
194+
checkCudaErrors(0);
195+
checkCudaErrors(0);
196+
197+
float time = 0;
198+
checkCudaErrors(
199+
DPCT_CHECK_ERROR((time = std::chrono::duration<float, std::milli>(
200+
stop_event_ct1 - start_event_ct1)
201+
.count())));
202+
totalTime += time;
203+
}
204+
205+
totalTime /= (1.0e3f * numIterations);
206+
printf(
207+
"radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = "
208+
"%u elements\n",
209+
1.0e-6f * numElements / totalTime, totalTime, numElements);
210+
211+
getLastCudaError("after radixsort");
212+
213+
// Get results back to host for correctness checking
214+
std::copy(
215+
oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
216+
d_keys.begin(), d_keys.end(), h_keysSorted.begin());
217+
218+
if (!keysOnly)
219+
std::copy(
220+
oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
221+
d_values.begin(), d_values.end(), h_values.begin());
222+
223+
getLastCudaError("copying results to host memory");
224+
225+
// Check results
226+
bool bTestResult = oneapi::dpl::is_sorted(
227+
oneapi::dpl::execution::seq, h_keysSorted.begin(), h_keysSorted.end());
228+
229+
checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(start_event)));
230+
checkCudaErrors(DPCT_CHECK_ERROR(dpct::destroy_event(stop_event)));
231+
232+
if (!bTestResult && !quiet) {
233+
return false;
234+
}
235+
236+
return bTestResult;
237+
}
238+
catch (sycl::exception const &exc) {
239+
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
240+
<< ", line:" << __LINE__ << std::endl;
241+
std::exit(1);
242+
}
243+
244+
int main(int argc, char **argv) {
245+
// Start logs
246+
printf("%s Starting...\n\n", argv[0]);
247+
248+
findCudaDevice(argc, (const char **)argv);
249+
250+
bool bTestResult = false;
251+
252+
if (checkCmdLineFlag(argc, (const char **)argv, "float"))
253+
bTestResult = testSort<float, true>(argc, argv);
254+
else
255+
bTestResult = testSort<unsigned int, false>(argc, argv);
256+
257+
printf(bTestResult ? "Test passed\n" : "Test failed!\n");
258+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -std=c++17")
2+
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core")
3+
4+
include_directories(${CMAKE_SOURCE_DIR}/02_sycl_dpct_migrated/Common/)
5+
add_subdirectory("src")

0 commit comments

Comments
 (0)