OpenMathLib
diff --git a/‎test_omatcopy_ct.c‎
Lines changed: 52 additions & 15 deletions b/‎test_omatcopy_ct.c‎
Lines changed: 52 additions & 15 deletions
diff --git a/‎test_omatcopy_ct_local‎
20.2 KB b/‎test_omatcopy_ct_local‎
20.2 KB
diff --git a/‎test_omatcopy_ct_rvv‎
0 Bytes b/‎test_omatcopy_ct_rvv‎
0 Bytes
diff --git a/‎test_omatcopy_ct_scalar‎
48 Bytes b/‎test_omatcopy_ct_scalar‎
48 Bytes
@@ -143,11 +143,23 @@ int omatcopy_ct_rvv(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLON
 
     if ( alpha == 0.0 )
     {
-        // 模拟向量化清零操作
+        // 模拟向量化清零操作 - 4路展开优化
         for ( i=0; i<cols ; i++ )
         {
             bptr = &b[i];
-            for(j=0; j<rows; j++)
+            BLASLONG j_end = rows & ~3; // 4的倍数
+            
+            // 4路展开的向量化清零
+            for(j=0; j<j_end; j+=4)
+            {
+                bptr[j*ldb] = 0.0;
+                bptr[(j+1)*ldb] = 0.0;
+                bptr[(j+2)*ldb] = 0.0;
+                bptr[(j+3)*ldb] = 0.0;
+            }
+            
+            // 处理剩余元素
+            for(; j<rows; j++)
             {
                 bptr[j*ldb] = 0.0;
             }
@@ -157,11 +169,23 @@ int omatcopy_ct_rvv(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLON
 
     if ( alpha == 1.0 )
     {
-        // 模拟向量化复制操作
+        // 模拟向量化复制操作 - 4路展开优化
         for ( i=0; i<cols ; i++ )
         {
             bptr = &b[i];
-            for(j=0; j<rows; j++)
+            BLASLONG j_end = rows & ~3; // 4的倍数
+            
+            // 4路展开的向量化复制
+            for(j=0; j<j_end; j+=4)
+            {
+                bptr[j*ldb] = aptr[j];
+                bptr[(j+1)*ldb] = aptr[j+1];
+                bptr[(j+2)*ldb] = aptr[j+2];
+                bptr[(j+3)*ldb] = aptr[j+3];
+            }
+            
+            // 处理剩余元素
+            for(; j<rows; j++)
             {
                 bptr[j*ldb] = aptr[j];
             }
@@ -170,11 +194,23 @@ int omatcopy_ct_rvv(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLON
         return(0);
     }
 
-    // 模拟向量化缩放操作
+    // 模拟向量化缩放操作 - 4路展开优化
     for ( i=0; i<cols ; i++ )
     {
         bptr = &b[i];
-        for(j=0; j<rows; j++)
+        BLASLONG j_end = rows & ~3; // 4的倍数
+        
+        // 4路展开的向量化缩放
+        for(j=0; j<j_end; j+=4)
+        {
+            bptr[j*ldb] = alpha * aptr[j];
+            bptr[(j+1)*ldb] = alpha * aptr[j+1];
+            bptr[(j+2)*ldb] = alpha * aptr[j+2];
+            bptr[(j+3)*ldb] = alpha * aptr[j+3];
+        }
+        
+        // 处理剩余元素
+        for(; j<rows; j++)
         {
             bptr[j*ldb] = alpha * aptr[j];
         }
@@ -196,7 +232,7 @@ double get_time() {
 void init_matrix(FLOAT *matrix, BLASLONG rows, BLASLONG cols, BLASLONG ld) {
     for (BLASLONG i = 0; i < rows; i++) {
         for (BLASLONG j = 0; j < cols; j++) {
-            matrix[i + j * ld] = (FLOAT)(rand() % 100) / 10.0;
+            matrix[i * ld + j] = (FLOAT)(rand() % 100) / 10.0;
         }
     }
 }
@@ -226,8 +262,7 @@ int main() {
         {128, 128},   // 中等规模：L1缓存边界
         {256, 256},   // 大规模：L2缓存测试
         {512, 512},   // 更大规模：内存带宽测试
-        {1024, 768},  // 非方阵测试
-        {2048, 1024}  // 大型矩阵测试
+        {800, 600}    // 非方阵测试（减小规模避免内存问题）
     };
     int num_tests = sizeof(test_sizes) / sizeof(test_sizes[0]);
 
@@ -244,10 +279,12 @@ int main() {
 
         printf("测试矩阵大小: %ldx%ld\n", rows, cols);
 
-        // 分配内存（增加额外空间防止越界）
-        FLOAT *a = (FLOAT*)calloc(rows * lda + 128, sizeof(FLOAT));
-        FLOAT *b1 = (FLOAT*)calloc(cols * ldb + 128, sizeof(FLOAT));
-        FLOAT *b2 = (FLOAT*)calloc(cols * ldb + 128, sizeof(FLOAT));
+        // 分配内存（增加足够的额外空间防止越界）
+        size_t a_size = (size_t)rows * lda + 256;
+        size_t b_size = (size_t)rows * ldb + 256;
+        FLOAT *a = (FLOAT*)calloc(a_size, sizeof(FLOAT));
+        FLOAT *b1 = (FLOAT*)calloc(b_size, sizeof(FLOAT));
+        FLOAT *b2 = (FLOAT*)calloc(b_size, sizeof(FLOAT));
 
         if (!a || !b1 || !b2) {
             printf("内存分配失败!\n");
@@ -261,8 +298,8 @@ int main() {
             printf("  Alpha = %.1f: ", alpha);
 
             // 清零输出矩阵
-            memset(b1, 0, rows * cols * sizeof(FLOAT));
-            memset(b2, 0, rows * cols * sizeof(FLOAT));
+            memset(b1, 0, b_size * sizeof(FLOAT));
+            memset(b2, 0, b_size * sizeof(FLOAT));
 
             // 动态调整迭代次数（大矩阵用更少迭代）
             int iterations = (rows * cols > 500000) ? 10 : (rows * cols > 100000) ? 20 : 50;