|
1 | 1 | // Generated by the Tensor Algebra Compiler (tensor-compiler.org) |
2 | | -// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
| 2 | +// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
3 | 3 | #ifndef TACO_C_HEADERS |
4 | 4 | #define TACO_C_HEADERS |
5 | 5 | #include <stdio.h> |
@@ -118,14 +118,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) { |
118 | 118 | int x1_dimension = (int)(x->dimensions[0]); |
119 | 119 | double* restrict x_vals = (double*)(x->vals); |
120 | 120 |
|
| 121 | + #pragma omp parallel for schedule(static) |
| 122 | + for (int32_t py = 0; py < y1_dimension; py++) { |
| 123 | + y_vals[py] = 0.0; |
| 124 | + } |
| 125 | + |
121 | 126 | #pragma omp parallel for schedule(runtime) |
122 | | - for (int32_t i = 0; i < A1_dimension; i++) { |
123 | | - double y_val = 0.0; |
124 | | - for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
125 | | - int32_t j = A2_crd[jA]; |
126 | | - y_val += A_vals[jA] * x_vals[j]; |
| 127 | + for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) { |
| 128 | + for (int32_t i1 = 0; i1 < 32; i1++) { |
| 129 | + int32_t i = i0 * 32 + i1; |
| 130 | + if (i >= A1_dimension) |
| 131 | + continue; |
| 132 | + |
| 133 | + for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
| 134 | + int32_t j = A2_crd[jA]; |
| 135 | + y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j]; |
| 136 | + } |
127 | 137 | } |
128 | | - y_vals[i] = y_val; |
129 | 138 | } |
130 | 139 | return 0; |
131 | 140 | } |
@@ -153,14 +162,23 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) { |
153 | 162 | int32_t y_capacity = y1_dimension; |
154 | 163 | y_vals = (double*)malloc(sizeof(double) * y_capacity); |
155 | 164 |
|
| 165 | + #pragma omp parallel for schedule(static) |
| 166 | + for (int32_t py = 0; py < y_capacity; py++) { |
| 167 | + y_vals[py] = 0.0; |
| 168 | + } |
| 169 | + |
156 | 170 | #pragma omp parallel for schedule(runtime) |
157 | | - for (int32_t i = 0; i < A1_dimension; i++) { |
158 | | - double y_val = 0.0; |
159 | | - for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
160 | | - int32_t j = A2_crd[jA]; |
161 | | - y_val += A_vals[jA] * x_vals[j]; |
| 171 | + for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) { |
| 172 | + for (int32_t i1 = 0; i1 < 32; i1++) { |
| 173 | + int32_t i = i0 * 32 + i1; |
| 174 | + if (i >= A1_dimension) |
| 175 | + continue; |
| 176 | + |
| 177 | + for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
| 178 | + int32_t j = A2_crd[jA]; |
| 179 | + y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j]; |
| 180 | + } |
162 | 181 | } |
163 | | - y_vals[i] = y_val; |
164 | 182 | } |
165 | 183 |
|
166 | 184 | y->vals = (uint8_t*)y_vals; |
@@ -218,12 +236,12 @@ int pack_A(taco_tensor_t *A, int* A_COO1_pos, int* A_COO1_crd, int* A_COO2_crd, |
218 | 236 | jA_COO++; |
219 | 237 | } |
220 | 238 | if (A_capacity <= jA) { |
221 | | - A_vals = (double*)realloc(A_vals, sizeof(double) * (A_capacity * 2)); |
| 239 | + A_vals = (double*)realloc(A_vals, sizeof(double) * A_capacity * 2); |
222 | 240 | A_capacity *= 2; |
223 | 241 | } |
224 | 242 | A_vals[jA] = A_COO_val; |
225 | 243 | if (A2_crd_size <= jA) { |
226 | | - A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2)); |
| 244 | + A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * A2_crd_size * 2); |
227 | 245 | A2_crd_size *= 2; |
228 | 246 | } |
229 | 247 | A2_crd[jA] = j; |
@@ -294,12 +312,12 @@ int unpack(int** y_COO1_pos_ptr, int** y_COO1_crd_ptr, double** y_COO_vals_ptr, |
294 | 312 |
|
295 | 313 | for (int32_t i = 0; i < y1_dimension; i++) { |
296 | 314 | if (y_COO_capacity <= iy_COO) { |
297 | | - y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * (y_COO_capacity * 2)); |
| 315 | + y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * y_COO_capacity * 2); |
298 | 316 | y_COO_capacity *= 2; |
299 | 317 | } |
300 | 318 | y_COO_vals[iy_COO] = y_vals[i]; |
301 | 319 | if (y_COO1_crd_size <= iy_COO) { |
302 | | - y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * (y_COO1_crd_size * 2)); |
| 320 | + y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * y_COO1_crd_size * 2); |
303 | 321 | y_COO1_crd_size *= 2; |
304 | 322 | } |
305 | 323 | y_COO1_crd[iy_COO] = i; |
|
0 commit comments