99#include <math.h>
1010#include <complex.h>
1111#include <string.h>
12+ #if _OPENMP
13+ #include <omp.h>
14+ #endif
1215#define TACO_MIN (_a ,_b ) ((_a) < (_b) ? (_a) : (_b))
1316#define TACO_MAX (_a ,_b ) ((_a) > (_b) ? (_a) : (_b))
1417#define TACO_DEREF (_a ) (((___context___*)(*__ctx__))->_a)
@@ -26,6 +29,10 @@ typedef struct {
2629 int32_t vals_size ; // values array size
2730} taco_tensor_t ;
2831#endif
32+ #if !_OPENMP
33+ int omp_get_thread_num () { return 0 ; }
34+ int omp_get_max_threads () { return 1 ; }
35+ #endif
2936int cmp (const void * a , const void * b ) {
3037 return * ((const int * )a ) - * ((const int * )b );
3138}
@@ -122,14 +129,18 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
122129 int * restrict C2_crd = (int * )(C -> indices [1 ][1 ]);
123130 double * restrict C_vals = (double * )(C -> vals );
124131
132+ double * restrict workspace_all = 0 ;
133+ int32_t * restrict workspace_index_list_all = 0 ;
134+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
135+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
136+ workspace_all = (double * )malloc (sizeof (double ) * (C2_dimension * omp_get_max_threads ()));
137+
125138 #pragma omp parallel for schedule(runtime)
126139 for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
127140 int32_t workspace_index_list_size = 0 ;
128- double * restrict workspace = 0 ;
129- int32_t * restrict workspace_index_list = 0 ;
130- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
131- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
132- workspace = (double * )malloc (sizeof (double ) * C2_dimension );
141+ double * restrict workspace = workspace_all + C2_dimension * omp_get_thread_num ();
142+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
143+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
133144 for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
134145 int32_t k = B2_crd [kB ];
135146 for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -153,11 +164,12 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
153164 A_vals [pA2 ] = workspace [j ];
154165 workspace_already_set [j ] = 0 ;
155166 }
156- free (workspace_index_list );
157- free (workspace_already_set );
158- free (workspace );
159167 }
160168
169+ free (workspace_index_list_all );
170+ free (workspace_already_set_all );
171+ free (workspace_all );
172+
161173 for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
162174 A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
163175 }
@@ -184,12 +196,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
184196 int32_t * restrict A2_nnz = 0 ;
185197 A2_nnz = (int32_t * )malloc (sizeof (int32_t ) * B1_dimension );
186198
199+ int32_t * restrict qworkspace_index_list_all = 0 ;
200+ qworkspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
201+ bool * restrict qworkspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
202+
187203 #pragma omp parallel for schedule(runtime)
188204 for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
189205 int32_t qworkspace_index_list_size = 0 ;
190- int32_t * restrict qworkspace_index_list = 0 ;
191- qworkspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
192- bool * restrict qworkspace_already_set = calloc (C2_dimension , sizeof (bool ));
206+ int32_t * restrict qworkspace_index_list = qworkspace_index_list_all + C2_dimension * omp_get_thread_num ();
207+ bool * restrict qworkspace_already_set = qworkspace_already_set_all + C2_dimension * omp_get_thread_num ();
193208 for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
194209 int32_t k = B2_crd [kB ];
195210 for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -208,10 +223,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
208223 qworkspace_already_set [j ] = 0 ;
209224 }
210225 A2_nnz [i ] = tjA2_nnz_val ;
211- free (qworkspace_index_list );
212- free (qworkspace_already_set );
213226 }
214227
228+ free (qworkspace_index_list_all );
229+ free (qworkspace_already_set_all );
230+
215231 A2_pos = (int32_t * )malloc (sizeof (int32_t ) * (A1_dimension + 1 ));
216232 A2_pos [0 ] = 0 ;
217233 for (int32_t i = 0 ; i < A1_dimension ; i ++ ) {
@@ -220,12 +236,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
220236 A2_crd = (int32_t * )malloc (sizeof (int32_t ) * A2_pos [A1_dimension ]);
221237 A_vals = (double * )malloc (sizeof (double ) * A2_pos [A1_dimension ]);
222238
239+ int32_t * restrict workspace_index_list_all = 0 ;
240+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
241+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
242+
223243 #pragma omp parallel for schedule(runtime)
224244 for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
225245 int32_t workspace_index_list_size = 0 ;
226- int32_t * restrict workspace_index_list = 0 ;
227- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
228- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
246+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
247+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
229248 for (int32_t kB0 = B2_pos [i ]; kB0 < B2_pos [(i + 1 )]; kB0 ++ ) {
230249 int32_t k = B2_crd [kB0 ];
231250 for (int32_t jC0 = C2_pos [k ]; jC0 < C2_pos [(k + 1 )]; jC0 ++ ) {
@@ -246,10 +265,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
246265 A2_crd [pA2 ] = j ;
247266 workspace_already_set [j ] = 0 ;
248267 }
249- free (workspace_index_list );
250- free (workspace_already_set );
251268 }
252269
270+ free (workspace_index_list_all );
271+ free (workspace_already_set_all );
272+
253273 for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
254274 A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
255275 }
@@ -281,12 +301,15 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
281301 int32_t * restrict A2_nnz = 0 ;
282302 A2_nnz = (int32_t * )malloc (sizeof (int32_t ) * B1_dimension );
283303
304+ int32_t * restrict qworkspace_index_list_all = 0 ;
305+ qworkspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
306+ bool * restrict qworkspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
307+
284308 #pragma omp parallel for schedule(runtime)
285309 for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
286310 int32_t qworkspace_index_list_size = 0 ;
287- int32_t * restrict qworkspace_index_list = 0 ;
288- qworkspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
289- bool * restrict qworkspace_already_set = calloc (C2_dimension , sizeof (bool ));
311+ int32_t * restrict qworkspace_index_list = qworkspace_index_list_all + C2_dimension * omp_get_thread_num ();
312+ bool * restrict qworkspace_already_set = qworkspace_already_set_all + C2_dimension * omp_get_thread_num ();
290313 for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
291314 int32_t k = B2_crd [kB ];
292315 for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -305,10 +328,11 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
305328 qworkspace_already_set [j ] = 0 ;
306329 }
307330 A2_nnz [i ] = tjA2_nnz_val ;
308- free (qworkspace_index_list );
309- free (qworkspace_already_set );
310331 }
311332
333+ free (qworkspace_index_list_all );
334+ free (qworkspace_already_set_all );
335+
312336 A2_pos = (int32_t * )malloc (sizeof (int32_t ) * (A1_dimension + 1 ));
313337 A2_pos [0 ] = 0 ;
314338 for (int32_t i = 0 ; i < A1_dimension ; i ++ ) {
@@ -317,14 +341,18 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
317341 A2_crd = (int32_t * )malloc (sizeof (int32_t ) * A2_pos [A1_dimension ]);
318342 A_vals = (double * )malloc (sizeof (double ) * A2_pos [A1_dimension ]);
319343
344+ double * restrict workspace_all = 0 ;
345+ int32_t * restrict workspace_index_list_all = 0 ;
346+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
347+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
348+ workspace_all = (double * )malloc (sizeof (double ) * (C2_dimension * omp_get_max_threads ()));
349+
320350 #pragma omp parallel for schedule(runtime)
321351 for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
322352 int32_t workspace_index_list_size = 0 ;
323- double * restrict workspace = 0 ;
324- int32_t * restrict workspace_index_list = 0 ;
325- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
326- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
327- workspace = (double * )malloc (sizeof (double ) * C2_dimension );
353+ double * restrict workspace = workspace_all + C2_dimension * omp_get_thread_num ();
354+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
355+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
328356 for (int32_t kB0 = B2_pos [i ]; kB0 < B2_pos [(i + 1 )]; kB0 ++ ) {
329357 int32_t k = B2_crd [kB0 ];
330358 for (int32_t jC0 = C2_pos [k ]; jC0 < C2_pos [(k + 1 )]; jC0 ++ ) {
@@ -350,11 +378,12 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
350378 A_vals [pA2 ] = workspace [j ];
351379 workspace_already_set [j ] = 0 ;
352380 }
353- free (workspace_index_list );
354- free (workspace_already_set );
355- free (workspace );
356381 }
357382
383+ free (workspace_index_list_all );
384+ free (workspace_already_set_all );
385+ free (workspace_all );
386+
358387 for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
359388 A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
360389 }
0 commit comments