Commit 5190054
different users in manager and worker nodes in MW clusters
1 parent 402a58c commit 5190054

File tree: 10 files changed, +149 -49 lines changed

CCconfig.toml

Lines changed: 10 additions & 2 deletions

@@ -18,10 +18,18 @@ mpiflags = ""
 
 [ec2]
 
-imageid = "ami-09121cfdb459a0804" # found at us-east-1 (North Virginia). To use in other regions, copy it.
+imageid = "ami-0b869698add04fbdc" # found at us-east-1 (North Virginia). To use in other regions, copy it.
 
 # placement_group = "pg-XXXXXXXXXXXX" or "automatic"
 # security_group_id = "sg-XXXXXXXXXXXX" or "automatic"
 # subnet_id = "subnet-XXXXXXXXXXXX"
 
-[gcp]
+[gcp]
+imageid = "hpc-shelf-311900/global/images/cloudclusters-basic-v3"
+zone = "us-central1-a"
+project = "hpc-shelf-311900"
+user = "heron"
+exename = "/home/heron/.juliaup/bin/julia"
+directory = "/home/heron"
+mpiflags = "--map-by node --hostfile /home/heron/hostfile"
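The new [gcp] table mirrors the existing [ec2] one: per-provider defaults that the package reads from CCconfig.toml. As a minimal sketch of how such defaults can be consumed with Julia's standard TOML parser (the key names come from the file above; the path is whatever copy of CCconfig.toml is in use):

using TOML

# Parse the configuration defaults (path assumed relative to the project root).
config = TOML.parsefile("CCconfig.toml")

# Each provider is a plain TOML table; these keys mirror the [gcp] section above.
gcp = config["gcp"]
@info "GCP defaults" gcp["imageid"] gcp["zone"] gcp["project"] gcp["user"]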

Project.toml

Lines changed: 4 additions & 2 deletions

@@ -10,6 +10,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
+GoogleCloud = "55e21f81-8b0a-565e-b5ad-6816892a5ee7"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 MPIClusterManagers = "e7922434-ae4b-11e9-05c5-9780451d2c66"
 PlatformAware = "e7c50b67-2c03-471e-9cf2-69e515d86ecf"

@@ -26,8 +27,9 @@ YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 AWS = "1"
 Base64 = "1.10.4"
 Distributed = "1.10.4"
-FilePathsBase = "0.9.21"
 Downloads = "1.6.0"
+FilePathsBase = "0.9.21"
+GoogleCloud = "0.11.0"
 JSON = "0.21"
 MPIClusterManagers = "0.2.4"
 PlatformAware = "0.6.1"

@@ -36,6 +38,6 @@ Reexport = "1"
 Serialization = "1.10.4"
 Sockets = "1.10.4"
 TOML = "1.0.3"
-Test = "1.11.0"
+Test = "1.11.0"
 YAML = "0.4"
 julia = "1"

docs/src/index.md

Lines changed: 1 addition & 1 deletion

@@ -423,7 +423,7 @@ The parallel code sums the ranks of the processes using the _Reduce_ collective
 
 A ___Manager-Workers___ cluster comprises an _access node_ and a homogeneous set of _compute nodes_. The compute nodes are accessible only from the access node. The instance type of the access node may differ from the instance type of the compute nodes.
 
-In a ___Manager-Workers___ cluster, the master process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node. In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for communication between the driver and the worker processes. A global MPI communicator exists between worker processes, as in ___Peer-Workers-MPI___ clusters.
+In a ___Manager-Workers___ cluster, the manager process, running in the REPL or main program, is called the _driver process_. It is responsible for launching the so-called _entry process_ in the cluster's access node. In turn, the entry process launches _worker processes_ across the compute nodes, using _MPIClusterManagers.jl_. The worker processes perform the computation, while the entry process is responsible for communication between the driver and the worker processes. A global MPI communicator exists between worker processes, as in ___Peer-Workers-MPI___ clusters.
 
 A ___Manager-Workers___ cluster is useful when compute nodes are not directly accessible from the external network. This is a common situation in on-premises clusters, but it also occurs in clusters built from the services of cluster providers specifically tailored to HPC applications.
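The driver, entry, and worker chain described in this passage can be sketched with stock _Distributed_ plus _MPIClusterManagers.jl_ calls. A minimal sketch, assuming a reachable access node `access.example.com` and login user `heron` (both hypothetical placeholders); CloudClusters.jl automates these steps when it deploys a ___Manager-Workers___ cluster:

using Distributed

# Driver process: launch the entry process on the cluster's access node over
# SSH (host and user below are hypothetical placeholders).
entry = addprocs(["heron@access.example.com"]; dir = "/home/heron")[1]

# Entry process: launch MPI-enabled worker processes across the compute nodes.
# MPIWorkerManager starts one Julia worker per MPI rank; the workers share a
# global MPI communicator, as in Peer-Workers-MPI clusters.
@everywhere [entry] begin
    using Distributed, MPIClusterManagers
    addprocs(MPIWorkerManager(4))
end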

src/CloudClusters.jl

Lines changed: 2 additions & 1 deletion

@@ -22,7 +22,8 @@ include("cluster_providers/ec2/ec2_persist.jl")
 include("cluster_providers/ec2/ec2_resolve.jl")
 include("cluster_providers/ec2/ec2_deploy.jl")
 include("cluster_providers/gcp/gcp_configs.jl")
-#include("cluster_providers/gcp/gcp_backend.jl")
+include("cluster_providers/gcp/gcp_backend.jl")
+include("cluster_providers/gcp/gcp_persist.jl")
 include("cluster_providers/gcp/gcp_resolve.jl")
 include("cluster_providers/gcp/gcp_deploy.jl")
 include("cluster_providers/local/local_configs.jl")

src/cluster_providers/ec2/ec2_backend.jl

Lines changed: 113 additions & 28 deletions

@@ -27,6 +27,8 @@ mutable struct EC2ManagerWorkers <: ManagerWorkers #Cluster
     count::Int
     image_id_manager::String
     image_id_worker::String
+    user_manager::String
+    user_worker::String
     subnet_id::Union{String, Nothing}
     placement_group::Union{String, Nothing}
     auto_pg::Bool

@@ -44,6 +46,7 @@ mutable struct EC2PeerWorkers <: PeerWorkers # Cluster
     instance_type::String
     count::Int
     image_id::String
+    user::String
     subnet_id::Union{String, Nothing}
     placement_group::Union{String, Nothing}
     auto_pg::Bool

@@ -60,6 +63,7 @@ mutable struct EC2PeerWorkersMPI <: PeerWorkersMPI # Cluster
     instance_type::String
     count::Int
     image_id::String
+    user::String
     subnet_id::Union{String, Nothing}
     placement_group::Union{String, Nothing}
     auto_pg::Bool
@@ -169,6 +173,50 @@ Instance creation
 =#
 
 # Helper functions.
+# Helper functions.
+function ec2_set_up_ssh_connection(cluster_name, comment)
+
+    internal_key_name = cluster_name
+
+    ssh_path = joinpath(homedir(), ".ssh")
+
+    !isdir(ssh_path) && mkdir(ssh_path)
+
+    keypath = joinpath(ssh_path, "$internal_key_name.key")
+    pubpath = joinpath(ssh_path, "$internal_key_name.key.pub")
+
+    # Create the internal SSH public and private keys.
+    # chars = ['a':'z'; 'A':'Z'; '0':'9']
+    # random_suffix = join(chars[Random.rand(1:length(chars), 5)])
+    run(`ssh-keygen -t rsa -b 2048 -f $keypath -C $comment -N ""`)
+    run(`chmod 400 $keypath`)
+    private_key = base64encode(read(keypath, String))
+    public_key = base64encode(read(pubpath, String))
+
+    private_key, public_key
+end
+
+function ec2_get_user_data(cluster_name, user, private_key, public_key)
+
+    # Script that installs the public and private keys on the headnode and workers.
+    user_data = "#!/bin/bash
+echo $private_key | base64 -d > /home/$user/.ssh/$cluster_name
+echo $public_key | base64 -d > /home/$user/.ssh/$cluster_name.pub
+echo 'Host *
+    IdentityFile /home/$user/.ssh/$cluster_name
+    StrictHostKeyChecking no' > /home/$user/.ssh/config
+cat /home/$user/.ssh/$cluster_name.pub >> /home/$user/.ssh/authorized_keys
+chown -R $user:$user /home/$user/.ssh
+chmod 600 /home/$user/.ssh/*
+sed -i 's/#ClientAliveInterval 0/ClientAliveInterval 1000/g' /etc/ssh/sshd_config
+sed -i 's/#ClientAliveCountMax 3/ClientAliveCountMax 100/g' /etc/ssh/sshd_config
+systemctl restart ssh
+"
+
+    return user_data
+end
+
+#=
 function ec2_set_up_ssh_connection(cluster_name)
 
     internal_key_name = cluster_name

@@ -203,8 +251,12 @@ systemctl restart ssh
 "
     [internal_key_name, user_data]
 end
+=#
 
 function ec2_create_params(cluster::ManagerWorkers, user_data_base64)
+
+    user_data_manager_base64, user_data_worker_base64 = user_data_base64
+
     params_manager = Dict(
         "InstanceType" => cluster.instance_type_manager,
         "ImageId" => cluster.image_id_manager,
@@ -214,7 +266,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64)
         "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name),
                   Dict("Key" => "Name", "Value" => "manager")]
         ),
-        "UserData" => user_data_base64,
+        "UserData" => user_data_manager_base64,
     )
 
     params_workers = Dict(

@@ -226,7 +278,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64)
         "Tag" => [Dict("Key" => "cluster", "Value" => cluster.name),
                   Dict("Key" => "Name", "Value" => "worker")]
         ),
-        "UserData" => user_data_base64,
+        "UserData" => user_data_worker_base64,
     )
 
     if !isnothing(cluster.subnet_id)
@@ -283,9 +335,11 @@ function ec2_remove_temp_files(internal_key_name)
     rm(pubpath)
 end
 
-
+function ec2_set_hostfile(cluster_nodes, internal_key_name, user)
+    ec2_set_hostfile(cluster_nodes, internal_key_name, user, user)
+end
 
-function ec2_set_hostfile(cluster_nodes, internal_key_name)
+function ec2_set_hostfile(cluster_nodes, internal_key_name, user_manager, user_worker)
     # Test whether the SSH connection is active.
     for instance in keys(cluster_nodes)
         public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"]

@@ -316,16 +370,17 @@ function ec2_set_hostfile(cluster_nodes, internal_key_name)
 
     # Update the hostname and the hostfile.
     for instance in keys(cluster_nodes)
+        user = instance == :manager ? user_manager : user_worker
         public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"]
         # private_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"]
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo hostnamectl set-hostname $instance"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfilefile_content' > /home/ubuntu/hostfile"`)
-        # try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "echo '$hostfile_content' >> hosts.tmp"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown ubuntu:ubuntu /etc/hosts"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "cat hosts.tmp > /etc/hosts"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "sudo chown root:root /etc/hosts"`)
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip "rm hosts.tmp"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo hostnamectl set-hostname $instance"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfilefile_content' > /home/$user/hostfile"`)
+        # try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "echo '$hostfile_content' >> hosts.tmp"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown $user:$user /etc/hosts"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "cat hosts.tmp > /etc/hosts"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "sudo chown root:root /etc/hosts"`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip "rm hosts.tmp"`)
     end
 
     #wait(h)
@@ -342,22 +397,37 @@ function ec2_create_instances(cluster::ManagerWorkers)
     cluster_nodes = Dict()
 
     # Setting up the SSH connection.
-    internal_key_name, user_data = ec2_set_up_ssh_connection(cluster.name)
+
+    private_key, public_key = ec2_set_up_ssh_connection(cluster.name, cluster.user_manager)
+
+    user_data_manager = ec2_get_user_data(cluster.name, cluster.user_manager, private_key, public_key)
+    user_data_worker = ec2_get_user_data(cluster.name, cluster.user_worker, private_key, public_key)
+
+    internal_key_name = cluster.name
 
     # NFS setup
     if cluster.shared_fs
         file_system_ip = cluster.environment.file_system_ip
-        nfs_user_data = "apt-get -y install nfs-common
-mkdir /home/ubuntu/shared/
-mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/ubuntu/shared/
-chown -R ubuntu:ubuntu /home/ubuntu/shared
+        nfs_user_data_manager = "apt-get -y install nfs-common
+mkdir /home/$(cluster.user_manager)/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$(cluster.user_manager)/shared/
+chown -R $(cluster.user_manager):$(cluster.user_manager) /home/$(cluster.user_manager)/shared
 "
-        user_data *= nfs_user_data
+        nfs_user_data_worker = "apt-get -y install nfs-common
+mkdir /home/$(cluster.user_worker)/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$(cluster.user_worker)/shared/
+chown -R $(cluster.user_worker):$(cluster.user_worker) /home/$(cluster.user_worker)/shared
+"
+        user_data_manager *= nfs_user_data_manager
+        user_data_worker *= nfs_user_data_worker
     end
-    user_data_base64 = base64encode(user_data)
+
+    user_data_manager_base64 = base64encode(user_data_manager)
+    user_data_worker_base64 = base64encode(user_data_worker)
 
     # Creating the instances
-    params_manager, params_workers = ec2_create_params(cluster, user_data_base64)
+    params_manager, params_workers = ec2_create_params(cluster, (user_data_manager_base64, user_data_worker_base64))
+
     # Create the headnode
     instance_headnode = run_instances(1, 1, params_manager)
     cluster_nodes[:manager] = instance_headnode["instancesSet"]["item"]["instanceId"]

@@ -382,7 +452,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
     ec2_await_status(cluster_nodes, "running")
     ec2_await_check(cluster_nodes, "ok")
 
-    ec2_set_hostfile(cluster_nodes, internal_key_name)
+    ec2_set_hostfile(cluster_nodes, internal_key_name, cluster.user_manager, cluster.user_worker)
 
     #ec2_remove_temp_files(internal_key_name)

@@ -393,15 +463,18 @@ function ec2_create_instances(cluster::PeerWorkers)
     cluster_nodes = Dict()
 
     # Setting up the SSH connection.
-    internal_key_name, user_data = ec2_set_up_ssh_connection(cluster.name)
+    private_key, public_key = ec2_set_up_ssh_connection(cluster.name, cluster.user)
+    user_data = ec2_get_user_data(cluster.name, cluster.user, private_key, public_key)
+
+    internal_key_name = cluster.name
 
     # NFS setup
     if cluster.shared_fs
         file_system_ip = cluster.environment.file_system_ip
         nfs_user_data = "apt-get -y install nfs-common
-mkdir /home/ubuntu/shared/
-mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/ubuntu/shared/
-chown -R ubuntu:ubuntu /home/ubuntu/shared
+mkdir /home/$(cluster.user)/shared/
+mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip:/ /home/$(cluster.user)/shared/
+chown -R $(cluster.user):$(cluster.user) /home/$(cluster.user)/shared
 "
         user_data *= nfs_user_data
     end

@@ -427,7 +500,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
     ec2_await_status(cluster_nodes, "running")
     ec2_await_check(cluster_nodes, "ok")
 
-    ec2_set_hostfile(cluster_nodes, internal_key_name)
+    ec2_set_hostfile(cluster_nodes, internal_key_name, cluster.user)
 
     # ec2_remove_temp_files(internal_key_name)

@@ -581,16 +654,28 @@ ec2_can_resume(cluster::Cluster) = ec2_cluster_status(cluster, ["stopped"])
 # All instances must be in "interrupted" or "running" state.
 # If some instance is not in "interrupted" or "running" state, raise an exception.
 # PUBLIC
-function ec2_resume_cluster(cluster::Cluster)
+
+function ec2_resume_cluster(cluster::PeerWorkers)
+    ec2_resume_cluster(cluster, cluster.user, cluster.user)
+end
+
+function ec2_resume_cluster(cluster::ManagerWorkers)
+    ec2_resume_cluster(cluster, cluster.user_manager, cluster.user_worker)
+end
+
+function ec2_resume_cluster(cluster::Cluster, user_manager, user_worker)
+    home = ENV["HOME"]
     ssh_path = joinpath(homedir(), ".ssh")
     keypath = joinpath(ssh_path, "$(cluster.name).key")
 
     ec2_start_instances(cluster)
     ec2_await_status(cluster.cluster_nodes, "running")
     ec2_await_check(cluster.cluster_nodes, "ok")
     for instance in keys(cluster.cluster_nodes)
+        user = instance == :manager ? user_manager : user_worker
         public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster.cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"]
-        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu@$public_ip uptime`)
+        run(`ssh-keygen -f $home/.ssh/known_hosts -R $public_ip`)
+        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip uptime`)
     end
 end
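With the per-user fields in place, the split between one shared key pair and user-specific boot scripts can be exercised by calling the new helpers directly. A minimal sketch (the cluster name and the user names "ubuntu" and "ec2-user" are hypothetical; note that ec2_set_up_ssh_connection shells out to ssh-keygen and writes under ~/.ssh):

# One key pair is generated and shared by all nodes of the cluster...
private_key, public_key = ec2_set_up_ssh_connection("mycluster", "ubuntu")

# ...but each node role gets a boot script tailored to its own login user,
# which is what lets manager and worker nodes run different users.
ud_manager = ec2_get_user_data("mycluster", "ubuntu", private_key, public_key)
ud_worker  = ec2_get_user_data("mycluster", "ec2-user", private_key, public_key)

# The scripts install the same keys under different homes:
# /home/ubuntu/.ssh/mycluster on the manager versus
# /home/ec2-user/.ssh/mycluster on the workers. EC2 then receives each
# script base64-encoded in its UserData field (see ec2_create_params above).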
