@@ -47,8 +47,16 @@ subroutine co_dot_unaccelerated(x,y,x_dot_y)
4747 subroutine co_dot_accelerated (x ,y ,x_dot_y ,API )
4848 real , intent (in ) :: x(:),y(:)
4949 real , intent (out ) :: x_dot_y
50- integer (c_int), intent (in ) :: API
51- select case (API)
50+ integer (c_int), intent (in ), optional :: API
51+ integer (c_int) :: chosen_API
52+
53+ if (present (API)) then ! API is optional: only reference it when the caller supplied it
54+ chosen_API = API ! use the caller-selected backend
55+ else
56+ chosen_API = CUDA ! default backend when API is omitted
57+ end if
58+
59+ select case (chosen_API)
5260 case (CUDA)
5361 call cudaDot(x,y,x_dot_y,size (x)) ! Accelerated reduction on local data
5462 case (OpenMP)
@@ -68,18 +76,14 @@ program cu_dot_test
6876 implicit none
6977
7078 ! Unaccelerated variables
71- real (c_float), allocatable :: a (:),b (:)
79+ real (c_float), allocatable :: a_unacc (:),b_unacc (:)
7280 real (c_float) :: dot
7381 real (c_double) :: t_start, t_end
7482
7583 ! Library-accelerated variables
7684 real (c_float), allocatable :: a_acc(:)[:], b_acc(:)[:]
7785 real (c_float) :: dot_acc[* ]
7886
79- ! Manually accelerated variables
80- real (c_float), allocatable :: a_man(:)[:], b_man(:)[:]
81- real (c_float) :: dot_man[* ]
82-
8387 integer (c_int),parameter :: n = 99900000
8488 integer (c_int) :: n_local,np,me
8589
@@ -98,15 +102,15 @@ program cu_dot_test
98102
99103 ! Parallel execution
100104 t_start = walltime()
101- call co_dot_accelerated(a_acc,b_acc,dot_acc,CUDA)
105+ call co_dot_accelerated(a_acc( 1 :n_local) ,b_acc( 1 :n_local) ,dot_acc,CUDA)
102106 t_end = walltime()
103107 if (me== 1 ) print * , ' Accelerated dot_prod' ,dot_acc,' time:' ,t_end- t_start
104108
105109 sync all
106110
107111 ! Serial execution
108112 t_start = walltime()
109- call co_dot_unaccelerated(a_man,b_man ,dot)
113+ call co_dot_unaccelerated(a_unacc( 1 :n_local),b_unacc( 1 :n_local) ,dot)
110114 t_end = walltime()
111115 if (me== 1 ) print * , ' Serial result' ,dot,' time:' ,t_end- t_start
112116
@@ -118,8 +122,10 @@ program cu_dot_test
118122
119123 subroutine initialize_all_variables ()
120124 integer (c_int) :: i
121- call accelerated_allocate(a_acc(n_local)[* ],b_acc(n_local)[* ])
122- call accelerated_allocate(a_man(n_local)[* ],b_man(n_local)[* ])
125+ ! The allocation arguments must be coarrays to support the scatter operation below
126+ call accelerated_allocate(a_acc,n_local)
127+ call accelerated_allocate(b_acc,n_local)
128+ allocate (a_unacc(n_local)[* ],b_unacc(n_local)[* ])
123129
124130 if (me == 1 ) then
125131 ! Initialize the local unaccelerated data on every image
@@ -129,10 +135,11 @@ subroutine initialize_all_variables()
129135 ! Scatter a and b to a_acc and b_acc
130136 do i= 1 ,np
131137 a_acc(1 :n_local)[i] = a(n_local* (i-1 )+ 1 :n_local* i)
132- a_man(1 :n_local)[i] = a(n_local* (i-1 )+ 1 :n_local* i)
133138 b_acc(1 :n_local)[i] = b(n_local* (i-1 )+ 1 :n_local* i)
134- b_man(1 :n_local)[i] = b(n_local* (i-1 )+ 1 :n_local* i)
135- enddo
139+ end do
140+ sync all
141+ a_unacc= a_acc
142+ b_unacc= b_acc
136143 endif
137144 end subroutine
138145
0 commit comments