@@ -53,65 +53,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /*****************************************************
  * Order ColMajor
  * Trans with RVV optimization
- *
+ * Optimized version with:
+ * - Block processing for cache efficiency
+ * - Loop unrolling for better ILP
+ * - Reduced VSETVL overhead
+ * - Software prefetching
 ******************************************************/
 
+// Block size for cache-friendly processing
+#define BLOCK_SIZE_ROWS 256
+#define BLOCK_SIZE_COLS 64
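+// NOTE: these block sizes are a tuning assumption, sized so that one block of
+// a (BLOCK_SIZE_ROWS contiguous elements per column) stays resident in L1/L2;
+// they should be re-tuned per target core.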
+
+// Fast path for small matrices
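+// (at these sizes a plain scalar double loop is assumed to beat the vector
+// setup cost; returns 1 when the copy was handled here, 0 to fall through)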
+static inline int small_matrix_transpose(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+    if (rows <= 8 && cols <= 8) {
+        if (alpha == 0.0) {
+            // Match the vector path: write zeros without reading a,
+            // so NaN/Inf in a cannot leak into b when alpha == 0
+            for (BLASLONG i = 0; i < cols; i++) {
+                for (BLASLONG j = 0; j < rows; j++) {
+                    b[j * ldb + i] = 0.0;
+                }
+            }
+            return 1;
+        }
+        // Optimized 8x8 or smaller transpose
+        for (BLASLONG i = 0; i < cols; i++) {
+            for (BLASLONG j = 0; j < rows; j++) {
+                b[j * ldb + i] = alpha * a[i * lda + j];
+            }
+        }
+        return 1;
+    }
+    return 0;
+}
+
 int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
 {
-    BLASLONG i, j;
+    BLASLONG i, j, ii, jj;
     FLOAT *aptr, *bptr;
-    size_t vl;
-    FLOAT_V_T va, vb;
+    size_t vl, vl_max;
+    FLOAT_V_T va, va2, va3, va4;
 
     if (rows <= 0) return(0);
     if (cols <= 0) return(0);
 
-    aptr = a;
+    // Try small matrix fast path
+    if (small_matrix_transpose(rows, cols, alpha, a, lda, b, ldb)) {
+        return(0);
+    }
+
+    // Get maximum vector length once
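+    // (the RVV intrinsics behind VLEV/VSSEV take vl as an explicit argument,
+    // so a per-iteration vsetvl is only needed for a column's final partial
+    // vector)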
+    vl_max = VSETVL_MAX;
 
     if (alpha == 0.0)
     {
-        vl = VSETVL_MAX;
-        va = VFMVVF_FLOAT(0, vl);
-        for (i = 0; i < cols; i++)
-        {
-            bptr = &b[i];
-            for (j = 0; j < rows; j += vl)
-            {
-                vl = VSETVL(rows - j);
-                VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+        va = VFMVVF_FLOAT(0, vl_max);
+        // Block processing for better cache locality
+        for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
+            BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
+            for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
+                BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
+
+                for (i = ii; i < col_end; i++) {
+                    bptr = &b[i + jj * ldb];
+                    BLASLONG remaining = row_end - jj;
+
+                    // Main loop with reduced VSETVL calls
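+                    // (a full vector needs no re-configuration; the tail call
+                    // to VSETVL is guaranteed by the RVV spec to return exactly
+                    // remaining - j once that count is at most VLMAX)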
+                    for (j = 0; j < remaining; j += vl_max) {
+                        vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
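+                        // Strided store: vl zeros land in row i of b, one per
+                        // column (byte stride sizeof(FLOAT) * ldb)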
+                        VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+                    }
+                }
             }
         }
         return(0);
     }
 
     if (alpha == 1.0)
     {
-        for (i = 0; i < cols; i++)
-        {
-            bptr = &b[i];
-            for (j = 0; j < rows; j += vl)
-            {
-                vl = VSETVL(rows - j);
-                va = VLEV_FLOAT(aptr + j, vl);
-                VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+        // Block processing with loop unrolling
+        for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
+            BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
+            for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
+                BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
+
+                // Process 4 columns at once when possible
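+                // (four independent load/store streams per iteration give the
+                // core more ILP to hide memory latency)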
+                for (i = ii; i < col_end - 3; i += 4) {
+                    aptr = &a[i * lda + jj];
+                    FLOAT *bptr1 = &b[i + jj * ldb];
+                    FLOAT *bptr2 = &b[i + 1 + jj * ldb];
+                    FLOAT *bptr3 = &b[i + 2 + jj * ldb];
+                    FLOAT *bptr4 = &b[i + 3 + jj * ldb];
+
+                    BLASLONG remaining = row_end - jj;
+
+                    // Prefetch next block
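+                    // (__builtin_prefetch: second arg 0 = prefetch for read,
+                    // third arg 3 = high temporal locality; a hint only)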
+                    if (i + 4 < col_end) {
+                        __builtin_prefetch(&a[(i + 4) * lda + jj], 0, 3);
+                    }
+
+                    for (j = 0; j < remaining; j += vl_max) {
+                        vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
+
+                        va  = VLEV_FLOAT(aptr + j, vl);
+                        va2 = VLEV_FLOAT(aptr + lda + j, vl);
+                        va3 = VLEV_FLOAT(aptr + 2 * lda + j, vl);
+                        va4 = VLEV_FLOAT(aptr + 3 * lda + j, vl);
+
+                        VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va,  vl);
+                        VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
+                        VSSEV_FLOAT(bptr3 + j * ldb, sizeof(FLOAT) * ldb, va3, vl);
+                        VSSEV_FLOAT(bptr4 + j * ldb, sizeof(FLOAT) * ldb, va4, vl);
+                    }
+                }
+
+                // Handle remaining columns
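+                // (cleanup when the block width is not a multiple of 4)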
+                for (; i < col_end; i++) {
+                    aptr = &a[i * lda + jj];
+                    bptr = &b[i + jj * ldb];
+                    BLASLONG remaining = row_end - jj;
+
+                    for (j = 0; j < remaining; j += vl_max) {
+                        vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
+                        va = VLEV_FLOAT(aptr + j, vl);
+                        VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+                    }
+                }
             }
-            aptr += lda;
         }
         return(0);
     }
 
-    // General case with alpha scaling
-    for (i = 0; i < cols; i++)
-    {
-        bptr = &b[i];
-        for (j = 0; j < rows; j += vl)
-        {
-            vl = VSETVL(rows - j);
-            va = VLEV_FLOAT(aptr + j, vl);
-            va = VFMULVF_FLOAT(va, alpha, vl);
-            VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+    // General case with alpha scaling and optimizations
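+    // (each loaded column is scaled by alpha with VFMULVF before the strided store)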
+    for (ii = 0; ii < cols; ii += BLOCK_SIZE_COLS) {
+        BLASLONG col_end = (ii + BLOCK_SIZE_COLS < cols) ? ii + BLOCK_SIZE_COLS : cols;
+        for (jj = 0; jj < rows; jj += BLOCK_SIZE_ROWS) {
+            BLASLONG row_end = (jj + BLOCK_SIZE_ROWS < rows) ? jj + BLOCK_SIZE_ROWS : rows;
+
+            // Process 2 columns at once for better pipeline utilization
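+            // (only 2-way here: the extra VFMULVF per column raises register
+            // and issue-port pressure, so a 4-way unroll is assumed to pay off
+            // less than in the alpha == 1.0 path)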
+            for (i = ii; i < col_end - 1; i += 2) {
+                aptr = &a[i * lda + jj];
+                FLOAT *bptr1 = &b[i + jj * ldb];
+                FLOAT *bptr2 = &b[i + 1 + jj * ldb];
+
+                BLASLONG remaining = row_end - jj;
+
+                // Prefetch next block
+                if (i + 2 < col_end) {
+                    __builtin_prefetch(&a[(i + 2) * lda + jj], 0, 3);
+                }
+
+                for (j = 0; j < remaining; j += vl_max) {
+                    vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
+
+                    va  = VLEV_FLOAT(aptr + j, vl);
+                    va2 = VLEV_FLOAT(aptr + lda + j, vl);
+
+                    va  = VFMULVF_FLOAT(va, alpha, vl);
+                    va2 = VFMULVF_FLOAT(va2, alpha, vl);
+
+                    VSSEV_FLOAT(bptr1 + j * ldb, sizeof(FLOAT) * ldb, va,  vl);
+                    VSSEV_FLOAT(bptr2 + j * ldb, sizeof(FLOAT) * ldb, va2, vl);
+                }
+            }
+
+            // Handle remaining columns
+            for (; i < col_end; i++) {
+                aptr = &a[i * lda + jj];
+                bptr = &b[i + jj * ldb];
+                BLASLONG remaining = row_end - jj;
+
+                for (j = 0; j < remaining; j += vl_max) {
+                    vl = (j + vl_max <= remaining) ? vl_max : VSETVL(remaining - j);
+                    va = VLEV_FLOAT(aptr + j, vl);
+                    va = VFMULVF_FLOAT(va, alpha, vl);
+                    VSSEV_FLOAT(bptr + j * ldb, sizeof(FLOAT) * ldb, va, vl);
+                }
+            }
         }
-        aptr += lda;
     }
 
     return(0);