1+ /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+ *
3+ * Redistribution and use in source and binary forms, with or without
4+ * modification, are permitted provided that the following conditions
5+ * are met:
6+ * * Redistributions of source code must retain the above copyright
7+ * notice, this list of conditions and the following disclaimer.
8+ * * Redistributions in binary form must reproduce the above copyright
9+ * notice, this list of conditions and the following disclaimer in the
10+ * documentation and/or other materials provided with the distribution.
11+ * * Neither the name of NVIDIA CORPORATION nor the names of its
12+ * contributors may be used to endorse or promote products derived
13+ * from this software without specific prior written permission.
14+ *
15+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+ */
27+
28+ #include < oneapi/dpl/execution>
29+ #include < oneapi/dpl/algorithm>
30+ #include < sycl/sycl.hpp>
31+ #include < dpct/dpct.hpp>
32+ #include < dpct/dpl_utils.hpp>
33+
34+ #include " helper_cuda.h"
35+
36+ #include < algorithm>
37+ #include < time.h>
38+ #include < limits.h>
39+ #include < chrono>
40+
41+ template <typename T, bool floatKeys> bool testSort (int argc, char **argv) try {
42+ int cmdVal;
43+ int keybits = 32 ;
44+
45+ unsigned int numElements = 1048576 ;
46+ bool keysOnly = checkCmdLineFlag (argc, (const char **)argv, " keysonly" );
47+ bool quiet = checkCmdLineFlag (argc, (const char **)argv, " quiet" );
48+
49+ if (checkCmdLineFlag (argc, (const char **)argv, " n" )) {
50+ cmdVal = getCmdLineArgumentInt (argc, (const char **)argv, " n" );
51+ numElements = cmdVal;
52+
53+ if (cmdVal < 0 ) {
54+ printf (" Error: elements must be > 0, elements=%d is invalid\n " , cmdVal);
55+ exit (EXIT_SUCCESS);
56+ }
57+ }
58+
59+ if (checkCmdLineFlag (argc, (const char **)argv, " keybits" )) {
60+ cmdVal = getCmdLineArgumentInt (argc, (const char **)argv, " keybits" );
61+ keybits = cmdVal;
62+
63+ if (keybits <= 0 ) {
64+ printf (" Error: keybits must be > 0, keybits=%d is invalid\n " , keybits);
65+ exit (EXIT_SUCCESS);
66+ }
67+ }
68+
69+ unsigned int numIterations = (numElements >= 16777216 ) ? 10 : 100 ;
70+
71+ if (checkCmdLineFlag (argc, (const char **)argv, " iterations" )) {
72+ cmdVal = getCmdLineArgumentInt (argc, (const char **)argv, " iterations" );
73+ numIterations = cmdVal;
74+ }
75+
76+ if (checkCmdLineFlag (argc, (const char **)argv, " help" )) {
77+ printf (" Command line:\n radixSortThrust [-option]\n " );
78+ printf (" Valid options:\n " );
79+ printf (" -n=<N> : number of elements to sort\n " );
80+ printf (" -keybits=bits : keybits must be > 0\n " );
81+ printf (
82+ " -keysonly : only sort an array of keys (default sorts key-value "
83+ " pairs)\n " );
84+ printf (
85+ " -float : use 32-bit float keys (default is 32-bit unsigned "
86+ " int)\n " );
87+ printf (
88+ " -quiet : Output only the number of elements and the time to "
89+ " sort\n " );
90+ printf (" -help : Output a help message\n " );
91+ exit (EXIT_SUCCESS);
92+ }
93+
94+ if (!quiet)
95+ printf (" \n Sorting %d %d-bit %s keys %s\n\n " , numElements, keybits,
96+ floatKeys ? " float" : " unsigned int" ,
97+ keysOnly ? " (only)" : " and values" );
98+
99+ int deviceID = -1 ;
100+
101+ if (0 == deviceID = dpct::dev_mgr::instance ().current_device_id ()) {
102+ dpct::device_info devprop;
103+ dpct::dev_mgr::instance ().get_device (deviceID).get_device_info (devprop);
104+ unsigned int totalMem = (keysOnly ? 2 : 4 ) * numElements * sizeof (T);
105+
106+ if (devprop.get_global_mem_size () < totalMem) {
107+ printf (" Error: insufficient amount of memory to sort %d elements.\n " ,
108+ numElements);
109+ printf (" %d bytes needed, %d bytes available\n " , (int )totalMem,
110+ (int )devprop.get_global_mem_size ());
111+ exit (EXIT_SUCCESS);
112+ }
113+ }
114+
115+ std::vector<T> h_keys (numElements);
116+ std::vector<T> h_keysSorted (numElements);
117+ std::vector<unsigned int > h_values;
118+
119+ if (!keysOnly) h_values = std::vector<unsigned int >(numElements);
120+
121+ // Fill up with some random data
122+ /*
123+ DPCT1008:14: clock function is not defined in SYCL. This is a
124+ hardware-specific feature. Consult with your hardware vendor to find a
125+ replacement.
126+ */
127+ thrust::default_random_engine rng (clock ());
128+
129+ if (floatKeys) {
130+ thrust::uniform_real_distribution<float > u01 (0 , 1 );
131+
132+ for (int i = 0 ; i < (int )numElements; i++) h_keys[i] = u01 (rng);
133+ } else {
134+ thrust::uniform_int_distribution<unsigned int > u (0 , UINT_MAX);
135+
136+ for (int i = 0 ; i < (int )numElements; i++) h_keys[i] = u (rng);
137+ }
138+
139+ if (!keysOnly)
140+ dpct::iota (oneapi::dpl::execution::seq, h_values.begin (), h_values.end ());
141+
142+ // Copy data onto the GPU
143+ dpct::device_vector<T> d_keys;
144+ dpct::device_vector<unsigned int > d_values;
145+
146+ // run multiple iterations to compute an average sort time
147+ dpct::event_ptr start_event, stop_event;
148+ std::chrono::time_point<std::chrono::steady_clock> start_event_ct1;
149+ std::chrono::time_point<std::chrono::steady_clock> stop_event_ct1;
150+ checkCudaErrors (DPCT_CHECK_ERROR (start_event = new sycl::event ()));
151+ checkCudaErrors (DPCT_CHECK_ERROR (stop_event = new sycl::event ()));
152+
153+ float totalTime = 0 ;
154+
155+ for (unsigned int i = 0 ; i < numIterations; i++) {
156+ // reset data before sort
157+ d_keys = h_keys;
158+
159+ if (!keysOnly) d_values = h_values;
160+
161+ /*
162+ DPCT1012:15: Detected kernel execution time measurement pattern and
163+ generated an initial code for time measurements in SYCL. You can change the
164+ way time is measured depending on your goals.
165+ */
166+ /*
167+ DPCT1024:16: The original code returned the error code that was further
168+ consumed by the program logic. This original code was replaced with 0. You
169+ may need to rewrite the program logic consuming the error code.
170+ */
171+ start_event_ct1 = std::chrono::steady_clock::now ();
172+ checkCudaErrors (0 );
173+
174+ if (keysOnly)
175+ oneapi::dpl::sort (
176+ oneapi::dpl::execution::make_device_policy (dpct::get_default_queue ()),
177+ d_keys.begin (), d_keys.end ());
178+ else
179+ dpct::sort (
180+ oneapi::dpl::execution::make_device_policy (dpct::get_default_queue ()),
181+ d_keys.begin (), d_keys.end (), d_values.begin ());
182+
183+ /*
184+ DPCT1012:17: Detected kernel execution time measurement pattern and
185+ generated an initial code for time measurements in SYCL. You can change the
186+ way time is measured depending on your goals.
187+ */
188+ /*
189+ DPCT1024:18: The original code returned the error code that was further
190+ consumed by the program logic. This original code was replaced with 0. You
191+ may need to rewrite the program logic consuming the error code.
192+ */
193+ stop_event_ct1 = std::chrono::steady_clock::now ();
194+ checkCudaErrors (0 );
195+ checkCudaErrors (0 );
196+
197+ float time = 0 ;
198+ checkCudaErrors (
199+ DPCT_CHECK_ERROR ((time = std::chrono::duration<float , std::milli>(
200+ stop_event_ct1 - start_event_ct1)
201+ .count ())));
202+ totalTime += time;
203+ }
204+
205+ totalTime /= (1 .0e3f * numIterations);
206+ printf (
207+ " radixSortThrust, Throughput = %.4f MElements/s, Time = %.5f s, Size = "
208+ " %u elements\n " ,
209+ 1 .0e-6f * numElements / totalTime, totalTime, numElements);
210+
211+ getLastCudaError (" after radixsort" );
212+
213+ // Get results back to host for correctness checking
214+ std::copy (
215+ oneapi::dpl::execution::make_device_policy (dpct::get_default_queue ()),
216+ d_keys.begin (), d_keys.end (), h_keysSorted.begin ());
217+
218+ if (!keysOnly)
219+ std::copy (
220+ oneapi::dpl::execution::make_device_policy (dpct::get_default_queue ()),
221+ d_values.begin (), d_values.end (), h_values.begin ());
222+
223+ getLastCudaError (" copying results to host memory" );
224+
225+ // Check results
226+ bool bTestResult = oneapi::dpl::is_sorted (
227+ oneapi::dpl::execution::seq, h_keysSorted.begin (), h_keysSorted.end ());
228+
229+ checkCudaErrors (DPCT_CHECK_ERROR (dpct::destroy_event (start_event)));
230+ checkCudaErrors (DPCT_CHECK_ERROR (dpct::destroy_event (stop_event)));
231+
232+ if (!bTestResult && !quiet) {
233+ return false ;
234+ }
235+
236+ return bTestResult;
237+ }
238+ catch (sycl::exception const &exc) {
239+ std::cerr << exc.what () << " Exception caught at file:" << __FILE__
240+ << " , line:" << __LINE__ << std::endl;
241+ std::exit (1 );
242+ }
243+
244+ int main (int argc, char **argv) {
245+ // Start logs
246+ printf (" %s Starting...\n\n " , argv[0 ]);
247+
248+ findCudaDevice (argc, (const char **)argv);
249+
250+ bool bTestResult = false ;
251+
252+ if (checkCmdLineFlag (argc, (const char **)argv, " float" ))
253+ bTestResult = testSort<float , true >(argc, argv);
254+ else
255+ bTestResult = testSort<unsigned int , false >(argc, argv);
256+
257+ printf (bTestResult ? " Test passed\n " : " Test failed!\n " );
258+ }
0 commit comments