Skip to content

Commit 2265318

Browse files
committed
Optimize RISC-V RVV omatcopy implementation with latest RVV API

Co-authored-by: gong-flying <gongxiaofei24@iscas.ac.cn>
1 parent bd45b82 commit 2265318

File tree

10 files changed

+494
-685
lines changed

10 files changed

+494
-685
lines changed

benchmark_omatcopy.sh

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
#!/bin/bash

# Simple RISC-V omatcopy performance comparison script
# Direct comparison of omatcopy_ct.c vs omatcopy_ct_rvv.c

set -e

# Configuration.
# RISCV_TOOLCHAIN and TEST_ITERATIONS keep their previous values as defaults
# but may be overridden from the environment, e.g.
#   RISCV_TOOLCHAIN=/opt/riscv TEST_ITERATIONS=100 ./benchmark_omatcopy.sh
# CC and CFLAGS are deliberately NOT taken from the environment: they are
# commonly exported for the host compiler, which must not be used here.
RISCV_TOOLCHAIN="${RISCV_TOOLCHAIN:-/qemu/riscv}"
CC="${RISCV_TOOLCHAIN}/bin/riscv64-unknown-linux-gnu-gcc"
CFLAGS="-O3 -march=rv64gcv -static"
TEST_SIZES=(64 128 256 512 1024)
TEST_ITERATIONS="${TEST_ITERATIONS:-1000}"

echo "=== Simple RISC-V omatcopy Performance Comparison ==="
echo "Toolchain: ${RISCV_TOOLCHAIN}"
echo "Compiler: ${CC}"
echo "Flags: ${CFLAGS}"
echo

# Check if RISC-V toolchain exists.
# The compiler must be a regular file AND executable; a plain -f test would
# accept a file that cannot be run and defer the failure to the build step.
if [ ! -f "${CC}" ] || [ ! -x "${CC}" ]; then
    echo "Error: RISC-V compiler not found at ${CC}" >&2
    echo "Please ensure RISC-V toolchain is installed at ${RISCV_TOOLCHAIN}" >&2
    exit 1
fi
27+
28+
# Create standalone test for scalar version
29+
cat > test_scalar.c << 'EOF'
30+
#include <stdio.h>
31+
#include <stdlib.h>
32+
#include <string.h>
33+
#include <time.h>
34+
#include <sys/time.h>
35+
36+
typedef long BLASLONG;
typedef double FLOAT;

/*
 * Scalar implementation (simplified from omatcopy_ct.c).
 *
 * Out-of-place scaled transpose. The source is read as `cols` contiguous
 * runs of `rows` elements (run i starts at a + i * lda) and every element is
 * written to the transposed position:
 *
 *     b[j * ldb + i] = alpha * a[i * lda + j]   for 0 <= i < cols, 0 <= j < rows
 *
 * Parameters:
 *   rows, cols  extent of the source; nothing is done if either is <= 0
 *   alpha       scale factor; 0.0 and 1.0 take dedicated paths (zero fill and
 *               plain copy) that skip the multiplication
 *   a, lda      source data and the distance, in elements, between runs;
 *               `a` is only read, hence const
 *   b, ldb      destination data and the distance, in elements, between
 *               consecutive elements of one transposed run
 *
 * Returns 0 in all cases.
 */
int omatcopy_ct_scalar(BLASLONG rows, BLASLONG cols, FLOAT alpha, const FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    if (rows <= 0 || cols <= 0) {
        return 0;
    }

    if (alpha == 0.0) {
        /* Zero fill: the source is never read on this path. */
        for (BLASLONG i = 0; i < cols; i++) {
            FLOAT *bptr = &b[i];
            for (BLASLONG j = 0; j < rows; j++) {
                bptr[j * ldb] = 0.0;
            }
        }
        return 0;
    }

    const FLOAT *aptr = a;

    if (alpha == 1.0) {
        /* Plain transposed copy, no arithmetic. */
        for (BLASLONG i = 0; i < cols; i++) {
            FLOAT *bptr = &b[i];
            for (BLASLONG j = 0; j < rows; j++) {
                bptr[j * ldb] = aptr[j];
            }
            aptr += lda;
        }
        return 0;
    }

    /* General case: scale while transposing. */
    for (BLASLONG i = 0; i < cols; i++) {
        FLOAT *bptr = &b[i];
        for (BLASLONG j = 0; j < rows; j++) {
            bptr[j * ldb] = alpha * aptr[j];
        }
        aptr += lda;
    }

    return 0;
}
81+
82+
/*
 * Returns a timestamp in seconds, intended only for computing elapsed time
 * between two calls.
 *
 * When <time.h> exposes CLOCK_MONOTONIC the monotonic clock is used, so a
 * wall-clock adjustment in the middle of a run cannot distort the measured
 * interval. If that clock is not exposed, or the call fails, the function
 * falls back to gettimeofday().
 */
double get_time(void) {
#ifdef CLOCK_MONOTONIC
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
    }
#endif
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}
87+
88+
int main(int argc, char *argv[]) {
89+
if (argc != 4) {
90+
printf("Usage: %s <rows> <cols> <iterations>\n", argv[0]);
91+
return 1;
92+
}
93+
94+
BLASLONG rows = atol(argv[1]);
95+
BLASLONG cols = atol(argv[2]);
96+
int iterations = atoi(argv[3]);
97+
98+
// Allocate matrices
99+
FLOAT *a = (FLOAT*)malloc(rows * cols * sizeof(FLOAT));
100+
FLOAT *b = (FLOAT*)malloc(rows * cols * sizeof(FLOAT));
101+
102+
if (!a || !b) {
103+
printf("Memory allocation failed\n");
104+
return 1;
105+
}
106+
107+
// Initialize matrix A
108+
for (BLASLONG i = 0; i < rows * cols; i++) {
109+
a[i] = (FLOAT)(i % 100) / 10.0;
110+
}
111+
112+
// Warm up
113+
for (int i = 0; i < 10; i++) {
114+
omatcopy_ct_scalar(rows, cols, 1.0, a, rows, b, cols);
115+
}
116+
117+
// Benchmark
118+
double start_time = get_time();
119+
for (int i = 0; i < iterations; i++) {
120+
omatcopy_ct_scalar(rows, cols, 1.0, a, rows, b, cols);
121+
}
122+
double end_time = get_time();
123+
124+
double total_time = end_time - start_time;
125+
double avg_time = total_time / iterations;
126+
double gflops = (2.0 * rows * cols * iterations) / (total_time * 1e9);
127+
128+
printf("SCALAR,%ld,%ld,%d,%.6f,%.6f,%.3f\n",
129+
rows, cols, iterations, total_time, avg_time, gflops);
130+
131+
free(a);
132+
free(b);
133+
return 0;
134+
}
135+
EOF
136+
137+
# Create standalone test for RVV version
138+
cat > test_rvv.c << 'EOF'
139+
#include <stdio.h>
140+
#include <stdlib.h>
141+
#include <string.h>
142+
#include <time.h>
143+
#include <sys/time.h>
144+
#include <riscv_vector.h>
145+
146+
typedef long BLASLONG;
147+
typedef double FLOAT;
148+
149+
// RVV macros for double precision
150+
#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
151+
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
152+
#define FLOAT_V_T vfloat64m8_t
153+
#define VLEV_FLOAT __riscv_vle64_v_f64m8
154+
#define VSEV_FLOAT __riscv_vse64_v_f64m8
155+
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
156+
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8
157+
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
158+
159+
// RVV implementation (from omatcopy_ct_rvv.c)
160+
int omatcopy_ct_rvv(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
161+
{
162+
BLASLONG i, j;
163+
FLOAT *aptr, *bptr;
164+
size_t vl;
165+
166+
FLOAT_V_T va;
167+
if (rows <= 0) return(0);
168+
if (cols <= 0) return(0);
169+
170+
aptr = a;
171+
172+
if (alpha == 0.0) {
173+
vl = VSETVL_MAX;
174+
va = VFMVVF_FLOAT(0, vl);
175+
for (i = 0; i < cols; i++) {
176+
bptr = &b[i];
177+
for (j = 0; j < rows; j += vl) {
178+
vl = VSETVL(rows - j);
179+
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
180+
}
181+
}
182+
return(0);
183+
}
184+
185+
if (alpha == 1.0) {
186+
for (i = 0; i < cols; i++) {
187+
bptr = &b[i];
188+
for (j = 0; j < rows; j += vl) {
189+
vl = VSETVL(rows - j);
190+
va = VLEV_FLOAT(aptr + j, vl);
191+
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
192+
}
193+
aptr += lda;
194+
}
195+
return(0);
196+
}
197+
198+
for (i = 0; i < cols; i++) {
199+
bptr = &b[i];
200+
for (j = 0; j < rows; j += vl) {
201+
vl = VSETVL(rows - j);
202+
va = VLEV_FLOAT(aptr + j, vl);
203+
va = VFMULVF_FLOAT(va, alpha, vl);
204+
VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
205+
}
206+
aptr += lda;
207+
}
208+
209+
return(0);
210+
}
211+
212+
/*
 * Returns a timestamp in seconds, intended only for computing elapsed time
 * between two calls.
 *
 * When <time.h> exposes CLOCK_MONOTONIC the monotonic clock is used, so a
 * wall-clock adjustment in the middle of a run cannot distort the measured
 * interval. If that clock is not exposed, or the call fails, the function
 * falls back to gettimeofday().
 */
double get_time(void) {
#ifdef CLOCK_MONOTONIC
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
    }
#endif
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}
217+
218+
int main(int argc, char *argv[]) {
219+
if (argc != 4) {
220+
printf("Usage: %s <rows> <cols> <iterations>\n", argv[0]);
221+
return 1;
222+
}
223+
224+
BLASLONG rows = atol(argv[1]);
225+
BLASLONG cols = atol(argv[2]);
226+
int iterations = atoi(argv[3]);
227+
228+
// Allocate matrices
229+
FLOAT *a = (FLOAT*)malloc(rows * cols * sizeof(FLOAT));
230+
FLOAT *b = (FLOAT*)malloc(rows * cols * sizeof(FLOAT));
231+
232+
if (!a || !b) {
233+
printf("Memory allocation failed\n");
234+
return 1;
235+
}
236+
237+
// Initialize matrix A
238+
for (BLASLONG i = 0; i < rows * cols; i++) {
239+
a[i] = (FLOAT)(i % 100) / 10.0;
240+
}
241+
242+
// Warm up
243+
for (int i = 0; i < 10; i++) {
244+
omatcopy_ct_rvv(rows, cols, 1.0, a, rows, b, cols);
245+
}
246+
247+
// Benchmark
248+
double start_time = get_time();
249+
for (int i = 0; i < iterations; i++) {
250+
omatcopy_ct_rvv(rows, cols, 1.0, a, rows, b, cols);
251+
}
252+
double end_time = get_time();
253+
254+
double total_time = end_time - start_time;
255+
double avg_time = total_time / iterations;
256+
double gflops = (2.0 * rows * cols * iterations) / (total_time * 1e9);
257+
258+
printf("RVV,%ld,%ld,%d,%.6f,%.6f,%.3f\n",
259+
rows, cols, iterations, total_time, avg_time, gflops);
260+
261+
free(a);
262+
free(b);
263+
return 0;
264+
}
265+
EOF
266+
267+
echo "Building standalone test programs..."

# The compiler is tested directly in the `if` condition. The script runs
# under `set -e`, so a separate `[ $? -ne 0 ]` check after the command would
# never be reached on failure and the diagnostic would never be printed.
# CFLAGS is intentionally left unquoted so it splits into separate options.

# Compile scalar version
echo "Compiling scalar version..."
if ! "${CC}" ${CFLAGS} -o test_scalar test_scalar.c; then
    echo "Failed to compile scalar version" >&2
    exit 1
fi

# Compile RVV version
echo "Compiling RVV version..."
if ! "${CC}" ${CFLAGS} -o test_rvv test_rvv.c; then
    echo "Failed to compile RVV version" >&2
    exit 1
fi

echo "Compilation successful!"
echo
287+
288+
# Create results file
RESULTS_FILE="omatcopy_benchmark_results.csv"
echo "Version,Rows,Cols,Iterations,TotalTime(s),AvgTime(s),GFLOPS" > "${RESULTS_FILE}"

echo "Running benchmarks..."
echo "Results will be saved to: ${RESULTS_FILE}"
echo
echo "Format: Version,Rows,Cols,Iterations,TotalTime(s),AvgTime(s),GFLOPS"
echo "----------------------------------------"

# Run benchmarks for different matrix sizes
for size in "${TEST_SIZES[@]}"; do
    echo "Testing ${size}x${size} matrices..."

    # Scalar version first, then RVV, so rows keep the same order per size.
    for bench in ./test_scalar ./test_rvv; do
        "${bench}" "${size}" "${size}" "${TEST_ITERATIONS}" | tee -a "${RESULTS_FILE}"
        # The pipeline's exit status is tee's, so a benchmark that crashed or
        # could not be executed on this machine would otherwise go unnoticed.
        # Warn and carry on, so the remaining sizes still run.
        bench_status=${PIPESTATUS[0]}
        if [ "${bench_status}" -ne 0 ]; then
            echo "Warning: ${bench} ${size} ${size} ${TEST_ITERATIONS} exited with status ${bench_status}; its row may be missing from ${RESULTS_FILE}" >&2
        fi
    done

    echo
done
310+
311+
# Final summary plus instructions for moving the artifacts to the target.
cat << SUMMARY
Benchmark completed!
Results saved to: ${RESULTS_FILE}

To transfer files to sg2044:
1. Copy the compiled binaries and results:
 scp test_scalar test_rvv ${RESULTS_FILE} user@sg2044:/path/to/test/
2. Run on sg2044:
 ./test_scalar 1024 1024 1000
 ./test_rvv 1024 1024 1000

Binary information:
SUMMARY

# `file` is optional; report its absence instead of failing the script.
if ! file test_scalar test_rvv 2>/dev/null; then
    echo "file command not available"
fi
ls -lh test_scalar test_rvv

# Clean up source files
rm -f test_scalar.c test_rvv.c

echo
echo "Script completed successfully!"

0 commit comments

Comments
 (0)