#!/bin/bash

# Simple RISC-V omatcopy performance comparison script
# Direct comparison of omatcopy_ct.c vs omatcopy_ct_rvv.c
#
# Usage: run from a writable directory; generated sources, binaries and the
# results CSV are all written to the current working directory.
#
# Environment:
#   RISCV_TOOLCHAIN  Root of the RISC-V cross toolchain (default: /qemu/riscv).

# -e: stop on the first failing command, -u: reject unset variables,
# pipefail: a failing benchmark piped into another command is not masked.
set -euo pipefail

# Configuration
RISCV_TOOLCHAIN="${RISCV_TOOLCHAIN:-/qemu/riscv}"
CC="${RISCV_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc"
# Kept as a plain space-separated string because it is expanded unquoted
# (word-split) on the compiler command line.
CFLAGS="-O3 -march=rv64gcv -static"
TEST_SIZES=(64 128 256 512 1024)
TEST_ITERATIONS=1000

echo "=== Simple RISC-V omatcopy Performance Comparison ==="
echo "Toolchain: ${RISCV_TOOLCHAIN}"
echo "Compiler: ${CC}"
echo "Flags: ${CFLAGS}"
echo

# Check if RISC-V toolchain exists (must be a regular, executable file)
if [[ ! -f "${CC}" || ! -x "${CC}" ]]; then
  echo "Error: RISC-V compiler not found at ${CC}" >&2
  echo "Please ensure RISC-V toolchain is installed at ${RISCV_TOOLCHAIN}" >&2
  exit 1
fi

# Create standalone test for scalar version.
# The delimiter is quoted so the C source is written out literally
# (no shell expansion of $, backticks or backslashes).
cat > test_scalar.c << 'EOF'
#define _POSIX_C_SOURCE 200809L

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef long BLASLONG;
typedef double FLOAT;

// Scalar implementation (simplified from omatcopy_ct.c).
// For every i in [0, cols) and j in [0, rows) it stores
//   b[i + j * ldb] = alpha * a[i * lda + j]
// with dedicated paths for alpha == 0 (zero fill) and alpha == 1 (pure copy).
// Always returns 0; non-positive rows or cols are a no-op.
int omatcopy_ct_scalar(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j;
    FLOAT *aptr, *bptr;

    if (rows <= 0) return(0);
    if (cols <= 0) return(0);

    aptr = a;

    if (alpha == 0.0) {
        for (i = 0; i < cols; i++) {
            bptr = &b[i];
            for (j = 0; j < rows; j++) {
                bptr[j * ldb] = 0.0;
            }
        }
        return(0);
    }

    if (alpha == 1.0) {
        for (i = 0; i < cols; i++) {
            bptr = &b[i];
            for (j = 0; j < rows; j++) {
                bptr[j * ldb] = aptr[j];
            }
            aptr += lda;
        }
        return(0);
    }

    for (i = 0; i < cols; i++) {
        bptr = &b[i];
        for (j = 0; j < rows; j++) {
            bptr[j * ldb] = alpha * aptr[j];
        }
        aptr += lda;
    }

    return(0);
}

// Returns 1 when b holds the alpha == 1 result for a, 0 otherwise.
// Reading b back also keeps the benchmarked stores observable.
static int check_result(BLASLONG rows, BLASLONG cols, const FLOAT *a, BLASLONG lda, const FLOAT *b, BLASLONG ldb)
{
    for (BLASLONG i = 0; i < cols; i++) {
        for (BLASLONG j = 0; j < rows; j++) {
            if (b[i + j * ldb] != a[i * lda + j]) return 0;
        }
    }
    return 1;
}

// Seconds from a monotonic clock, so wall-clock adjustments made during the
// run cannot distort the measured interval.
double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

int main(int argc, char *argv[]) {
    // Diagnostics go to stderr: stdout carries only the CSV result row.
    if (argc != 4) {
        fprintf(stderr, "Usage: %s <rows> <cols> <iterations>\n", argv[0]);
        return 1;
    }

    BLASLONG rows = atol(argv[1]);
    BLASLONG cols = atol(argv[2]);
    int iterations = atoi(argv[3]);

    if (rows <= 0 || cols <= 0 || iterations <= 0) {
        fprintf(stderr, "rows, cols and iterations must be positive integers\n");
        return 1;
    }

    // Reject sizes whose byte count would overflow size_t.
    if ((size_t)cols > SIZE_MAX / sizeof(FLOAT) / (size_t)rows) {
        fprintf(stderr, "Matrix size too large\n");
        return 1;
    }
    size_t count = (size_t)rows * (size_t)cols;

    // Allocate matrices
    FLOAT *a = (FLOAT*)malloc(count * sizeof(FLOAT));
    FLOAT *b = (FLOAT*)malloc(count * sizeof(FLOAT));

    if (!a || !b) {
        fprintf(stderr, "Memory allocation failed\n");
        free(a);
        free(b);
        return 1;
    }

    // Initialize matrix A
    for (size_t i = 0; i < count; i++) {
        a[i] = (FLOAT)(i % 100) / 10.0;
    }

    // Warm up
    for (int i = 0; i < 10; i++) {
        omatcopy_ct_scalar(rows, cols, 1.0, a, rows, b, cols);
    }

    // A wrong kernel must not produce a timing row.
    if (!check_result(rows, cols, a, rows, b, cols)) {
        fprintf(stderr, "SCALAR result verification failed\n");
        free(a);
        free(b);
        return 2;
    }

    // Benchmark
    double start_time = get_time();
    for (int i = 0; i < iterations; i++) {
        omatcopy_ct_scalar(rows, cols, 1.0, a, rows, b, cols);
    }
    double end_time = get_time();

    double total_time = end_time - start_time;
    double avg_time = total_time / iterations;
    // Nominal figure: counts 2 operations per element although the
    // alpha == 1 path only copies. Use it for relative comparison only.
    double gflops = total_time > 0.0
        ? (2.0 * rows * cols * iterations) / (total_time * 1e9)
        : 0.0;

    printf("SCALAR,%ld,%ld,%d,%.6f,%.6f,%.3f\n",
           rows, cols, iterations, total_time, avg_time, gflops);

    free(a);
    free(b);
    return 0;
}
EOF

# Create standalone test for RVV version.
# The delimiter is quoted so the C source is written out literally
# (no shell expansion of $, backticks or backslashes).
cat > test_rvv.c << 'EOF'
#define _POSIX_C_SOURCE 200809L

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <riscv_vector.h>

typedef long BLASLONG;
typedef double FLOAT;

// RVV macros for double precision
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VSEV_FLOAT __riscv_vse64_v_f64m8
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8

// RVV implementation (from omatcopy_ct_rvv.c).
// For every i in [0, cols) and j in [0, rows) it stores
//   b[i + j * ldb] = alpha * a[i * lda + j]
// Each inner step loads up to vl contiguous elements of a and writes them
// with a strided store whose byte stride is sizeof(FLOAT) * ldb.
// Always returns 0; non-positive rows or cols are a no-op.
int omatcopy_ct_rvv(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j;
    FLOAT *aptr, *bptr;
    size_t vl;

    FLOAT_V_T va;
    if (rows <= 0) return(0);
    if (cols <= 0) return(0);

    aptr = a;

    if (alpha == 0.0) {
        // Build the zero vector once at maximum length; the stores below
        // use only the first vl lanes.
        vl = VSETVL_MAX;
        va = VFMVVF_FLOAT(0, vl);
        for (i = 0; i < cols; i++) {
            bptr = &b[i];
            for (j = 0; j < rows; j += vl) {
                vl = VSETVL(rows - j);
                VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
            }
        }
        return(0);
    }

    if (alpha == 1.0) {
        for (i = 0; i < cols; i++) {
            bptr = &b[i];
            for (j = 0; j < rows; j += vl) {
                vl = VSETVL(rows - j);
                va = VLEV_FLOAT(aptr + j, vl);
                VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
            }
            aptr += lda;
        }
        return(0);
    }

    for (i = 0; i < cols; i++) {
        bptr = &b[i];
        for (j = 0; j < rows; j += vl) {
            vl = VSETVL(rows - j);
            va = VLEV_FLOAT(aptr + j, vl);
            va = VFMULVF_FLOAT(va, alpha, vl);
            VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
        }
        aptr += lda;
    }

    return(0);
}

// Returns 1 when b holds the alpha == 1 result for a, 0 otherwise.
// Reading b back also keeps the benchmarked stores observable.
static int check_result(BLASLONG rows, BLASLONG cols, const FLOAT *a, BLASLONG lda, const FLOAT *b, BLASLONG ldb)
{
    for (BLASLONG i = 0; i < cols; i++) {
        for (BLASLONG j = 0; j < rows; j++) {
            if (b[i + j * ldb] != a[i * lda + j]) return 0;
        }
    }
    return 1;
}

// Seconds from a monotonic clock, so wall-clock adjustments made during the
// run cannot distort the measured interval.
double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

int main(int argc, char *argv[]) {
    // Diagnostics go to stderr: stdout carries only the CSV result row.
    if (argc != 4) {
        fprintf(stderr, "Usage: %s <rows> <cols> <iterations>\n", argv[0]);
        return 1;
    }

    BLASLONG rows = atol(argv[1]);
    BLASLONG cols = atol(argv[2]);
    int iterations = atoi(argv[3]);

    if (rows <= 0 || cols <= 0 || iterations <= 0) {
        fprintf(stderr, "rows, cols and iterations must be positive integers\n");
        return 1;
    }

    // Reject sizes whose byte count would overflow size_t.
    if ((size_t)cols > SIZE_MAX / sizeof(FLOAT) / (size_t)rows) {
        fprintf(stderr, "Matrix size too large\n");
        return 1;
    }
    size_t count = (size_t)rows * (size_t)cols;

    // Allocate matrices
    FLOAT *a = (FLOAT*)malloc(count * sizeof(FLOAT));
    FLOAT *b = (FLOAT*)malloc(count * sizeof(FLOAT));

    if (!a || !b) {
        fprintf(stderr, "Memory allocation failed\n");
        free(a);
        free(b);
        return 1;
    }

    // Initialize matrix A
    for (size_t i = 0; i < count; i++) {
        a[i] = (FLOAT)(i % 100) / 10.0;
    }

    // Warm up
    for (int i = 0; i < 10; i++) {
        omatcopy_ct_rvv(rows, cols, 1.0, a, rows, b, cols);
    }

    // A wrong kernel must not produce a timing row.
    if (!check_result(rows, cols, a, rows, b, cols)) {
        fprintf(stderr, "RVV result verification failed\n");
        free(a);
        free(b);
        return 2;
    }

    // Benchmark
    double start_time = get_time();
    for (int i = 0; i < iterations; i++) {
        omatcopy_ct_rvv(rows, cols, 1.0, a, rows, b, cols);
    }
    double end_time = get_time();

    double total_time = end_time - start_time;
    double avg_time = total_time / iterations;
    // Nominal figure: counts 2 operations per element although the
    // alpha == 1 path only copies. Use it for relative comparison only.
    double gflops = total_time > 0.0
        ? (2.0 * rows * cols * iterations) / (total_time * 1e9)
        : 0.0;

    printf("RVV,%ld,%ld,%d,%.6f,%.6f,%.3f\n",
           rows, cols, iterations, total_time, avg_time, gflops);

    free(a);
    free(b);
    return 0;
}
EOF

echo "Building standalone test programs..."

#######################################
# Cross-compile one generated test program, aborting the script on failure.
# The compiler runs inside the `if` condition so that errexit does not
# terminate the script before the failure message is printed.
# Globals:   CC (read), CFLAGS (read)
# Arguments: $1 - label used in messages, $2 - output binary, $3 - C source
# Outputs:   progress on stdout, failure message on stderr
#######################################
compile_test() {
  local label=$1
  local output=$2
  local source=$3

  echo "Compiling ${label} version..."
  # CFLAGS is deliberately unquoted: it is a space-separated flag list.
  # shellcheck disable=SC2086
  if ! "${CC}" ${CFLAGS} -o "${output}" "${source}"; then
    echo "Failed to compile ${label} version" >&2
    exit 1
  fi
}

# NOTE(review): both programs are built with the same CFLAGS, so the scalar
# baseline is compiled with whatever -march the RVV build uses; the compiler
# may auto-vectorize it. Confirm that this is the intended baseline.
compile_test "scalar" test_scalar test_scalar.c
compile_test "RVV" test_rvv test_rvv.c

echo "Compilation successful!"
echo

# Create results file (truncates any previous run) and write the CSV header.
RESULTS_FILE="omatcopy_benchmark_results.csv"
CSV_HEADER="Version,Rows,Cols,Iterations,TotalTime(s),AvgTime(s),GFLOPS"
echo "${CSV_HEADER}" > "${RESULTS_FILE}"

echo "Running benchmarks..."
echo "Results will be saved to: ${RESULTS_FILE}"
echo
echo "Format: ${CSV_HEADER}"
echo "----------------------------------------"

# Run benchmarks for different matrix sizes (square matrices only).
for size in "${TEST_SIZES[@]}"; do
  echo "Testing ${size}x${size} matrices..."

  # Scalar first, then RVV, so each size yields two consecutive CSV rows.
  for bench in ./test_scalar ./test_rvv; do
    # Capture the row before logging it: piping the benchmark straight into
    # tee would report tee's exit status and hide a failed run.
    if ! row=$("${bench}" "${size}" "${size}" "${TEST_ITERATIONS}"); then
      echo "Error: ${bench} failed for ${size}x${size}" >&2
      exit 1
    fi
    printf '%s\n' "${row}" | tee -a "${RESULTS_FILE}"
  done

  echo
done

# Final report: where the results are, how to rerun on real hardware, and
# what was built.
echo "Benchmark completed!"
echo "Results saved to: ${RESULTS_FILE}"
echo
echo "To transfer files to sg2044:"
echo "1. Copy the compiled binaries and results:"
echo "   scp test_scalar test_rvv ${RESULTS_FILE} user@sg2044:/path/to/test/"
echo "2. Run on sg2044:"
echo "   ./test_scalar 1024 1024 1000"
echo "   ./test_rvv 1024 1024 1000"
echo
echo "Binary information:"
# `file` is optional tooling: report its absence instead of failing, and do
# not let an inspection error abort an otherwise successful run.
if command -v file > /dev/null 2>&1; then
  file test_scalar test_rvv || echo "file could not inspect the binaries" >&2
else
  echo "file command not available"
fi
ls -lh test_scalar test_rvv

# Clean up source files (the compiled binaries and the CSV are kept)
rm -f -- test_scalar.c test_rvv.c

echo
echo "Script completed successfully!"