@@ -27,6 +27,8 @@ mutable struct EC2ManagerWorkers <: ManagerWorkers #Cluster
2727 count:: Int
2828 image_id_manager:: String
2929 image_id_worker:: String
30+ user_manager:: String
31+ user_worker:: String
3032 subnet_id:: Union{String, Nothing}
3133 placement_group:: Union{String, Nothing}
3234 auto_pg:: Bool
@@ -44,6 +46,7 @@ mutable struct EC2PeerWorkers <: PeerWorkers # Cluster
4446 instance_type:: String
4547 count:: Int
4648 image_id:: String
49+ user:: String
4750 subnet_id:: Union{String, Nothing}
4851 placement_group:: Union{String, Nothing}
4952 auto_pg:: Bool
@@ -60,6 +63,7 @@ mutable struct EC2PeerWorkersMPI <: PeerWorkersMPI # Cluster
6063 instance_type:: String
6164 count:: Int
6265 image_id:: String
66+ user:: String
6367 subnet_id:: Union{String, Nothing}
6468 placement_group:: Union{String, Nothing}
6569 auto_pg:: Bool
@@ -169,6 +173,50 @@ Criação de Instâncias
169173=#
170174
171175# Funções auxiliares.
176+ # Funções auxiliares.
"""
    ec2_set_up_ssh_connection(cluster_name, comment) -> (private_key, public_key)

Generate the cluster's internal RSA key pair in `~/.ssh` and return both halves
Base64-encoded.

The files are written as `<cluster_name>.key` and `<cluster_name>.key.pub`.
`comment` is passed to `ssh-keygen -C` and ends up in the public key's comment
field.

Throws if `ssh-keygen` cannot be executed or exits with a non-zero status.
"""
function ec2_set_up_ssh_connection(cluster_name, comment)
    ssh_path = joinpath(homedir(), ".ssh")
    # mkpath is a no-op when the directory already exists.
    mkpath(ssh_path)

    keypath = joinpath(ssh_path, "$(cluster_name).key")
    pubpath = joinpath(ssh_path, "$(cluster_name).key.pub")

    # ssh-keygen asks "Overwrite (y/n)?" on stdin when the target file already
    # exists, which hangs (or fails) in a non-interactive session. The only
    # successful outcome of that prompt is an overwrite, so drop stale files first.
    rm(keypath; force = true)
    rm(pubpath; force = true)

    # Create the internal SSH key pair (empty passphrase).
    run(`ssh-keygen -t rsa -b 2048 -f $keypath -C $comment -N ""`)
    # Read-only for the owner, so ssh accepts the identity file.
    chmod(keypath, 0o400)

    private_key = base64encode(read(keypath, String))
    public_key = base64encode(read(pubpath, String))

    return private_key, public_key
end
198+
"""
    ec2_get_user_data(cluster_name, user, private_key, public_key) -> String

Build the bash script, passed to the instance as EC2 user data, that installs
the cluster's internal SSH key pair for `user`.

`private_key` and `public_key` must be Base64-encoded; the script decodes them
into `/home/<user>/.ssh/<cluster_name>` and `<cluster_name>.pub`, makes the key
the default identity for every host, authorizes it for incoming connections and
raises the sshd keep-alive settings before restarting the ssh service.
"""
function ec2_get_user_data(cluster_name, user, private_key, public_key)
    ssh_dir = "/home/$(user)/.ssh"
    key_file = "$(ssh_dir)/$(cluster_name)"

    # The shebang must be the very first bytes of the user data.
    # `mkdir -p` guards against images where the login user has no ~/.ssh yet;
    # without it every redirection below would fail.
    user_data = "#!/bin/bash
mkdir -p $(ssh_dir)
echo $(private_key) | base64 -d > $(key_file)
echo $(public_key) | base64 -d > $(key_file).pub
echo 'Host *
    IdentityFile $(key_file)
    StrictHostKeyChecking no' > $(ssh_dir)/config
cat $(key_file).pub >> $(ssh_dir)/authorized_keys
chown -R $(user):$(user) $(ssh_dir)
chmod 600 $(ssh_dir)/*
sed -i 's/#ClientAliveInterval 0/ClientAliveInterval 1000/g' /etc/ssh/sshd_config
sed -i 's/#ClientAliveCountMax 3/ClientAliveCountMax 100/g' /etc/ssh/sshd_config
systemctl restart ssh
"

    return user_data
end
218+
219+ #=
172220function ec2_set_up_ssh_connection(cluster_name)
173221
174222 internal_key_name = cluster_name
@@ -203,8 +251,12 @@ systemctl restart ssh
203251"
204252 [internal_key_name, user_data]
205253end
254+ =#
206255
207256function ec2_create_params (cluster:: ManagerWorkers , user_data_base64)
257+
258+ user_data_manager_base64, user_data_worker_base64 = user_data_base64
259+
208260 params_manager = Dict (
209261 " InstanceType" => cluster. instance_type_manager,
210262 " ImageId" => cluster. image_id_manager,
@@ -214,7 +266,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64)
214266 " Tag" => [Dict (" Key" => " cluster" , " Value" => cluster. name),
215267 Dict (" Key" => " Name" , " Value" => " manager" ) ]
216268 ),
217- " UserData" => user_data_base64 ,
269+ " UserData" => user_data_manager_base64 ,
218270 )
219271
220272 params_workers = Dict (
@@ -226,7 +278,7 @@ function ec2_create_params(cluster::ManagerWorkers, user_data_base64)
226278 " Tag" => [Dict (" Key" => " cluster" , " Value" => cluster. name),
227279 Dict (" Key" => " Name" , " Value" => " worker" ) ]
228280 ),
229- " UserData" => user_data_base64 ,
281+ " UserData" => user_data_worker_base64 ,
230282 )
231283
232284 if ! isnothing (cluster. subnet_id)
@@ -283,9 +335,11 @@ function ec2_remove_temp_files(internal_key_name)
283335 rm (pubpath)
284336end
285337
286-
# Single-user convenience method: the same login name is used for the manager
# and for the workers.
ec2_set_hostfile(cluster_nodes, internal_key_name, user) =
    ec2_set_hostfile(cluster_nodes, internal_key_name, user, user)
287341
288- function ec2_set_hostfile (cluster_nodes, internal_key_name)
342+ function ec2_set_hostfile (cluster_nodes, internal_key_name, user_manager, user_worker )
289343 # Testando se a conexão SSH está ativa.
290344 for instance in keys (cluster_nodes)
291345 public_ip = Ec2. describe_instances (Dict (" InstanceId" => cluster_nodes[instance]))[" reservationSet" ][" item" ][" instancesSet" ][" item" ][" ipAddress" ]
@@ -316,16 +370,17 @@ function ec2_set_hostfile(cluster_nodes, internal_key_name)
316370
317371 # Atualiza o hostname e o hostfile.
318372 for instance in keys (cluster_nodes)
373+ user = instance == :manager ? user_manager : user_worker
319374 public_ip = Ec2. describe_instances (Dict (" InstanceId" => cluster_nodes[instance]))[" reservationSet" ][" item" ][" instancesSet" ][" item" ][" ipAddress" ]
320375 # private_ip = Ec2.describe_instances(Dict("InstanceId" => cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["privateIpAddress"]
321- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "sudo hostnamectl set-hostname $instance "` )
322- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "echo '$hostfilefile_content ' > /home/ubuntu /hostfile"` )
323- # try_run(`ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`)
324- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "echo '$hostfile_content ' >> hosts.tmp"` )
325- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "sudo chown ubuntu:ubuntu /etc/hosts"` )
326- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "cat hosts.tmp > /etc/hosts"` )
327- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "sudo chown root:root /etc/hosts"` )
328- try_run (` ssh -i $keypath -o StrictHostKeyChecking=no ubuntu @$public_ip "rm hosts.tmp"` )
376+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "sudo hostnamectl set-hostname $instance "` )
377+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "echo '$hostfilefile_content ' > /home/$user /hostfile"` )
378+ # try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "awk '{ print \$2 \" \" \$1 }' hostfile >> hosts.tmp"`)
379+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "echo '$hostfile_content ' >> hosts.tmp"` )
380+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "sudo chown $user : $user /etc/hosts"` )
381+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "cat hosts.tmp > /etc/hosts"` )
382+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "sudo chown root:root /etc/hosts"` )
383+ try_run (` ssh -i $keypath -o StrictHostKeyChecking=no $user @$public_ip "rm hosts.tmp"` )
329384 end
330385
331386 # wait(h)
@@ -342,22 +397,37 @@ function ec2_create_instances(cluster::ManagerWorkers)
342397 cluster_nodes = Dict ()
343398
344399 # Configurando a conexão SSH.
345- internal_key_name, user_data = ec2_set_up_ssh_connection (cluster. name)
400+
401+ private_key, public_key = ec2_set_up_ssh_connection (cluster. name, cluster. user_manager)
402+
403+ user_data_manager = ec2_get_user_data (cluster. name, cluster. user_manager, private_key, public_key)
404+ user_data_worker = ec2_get_user_data (cluster. name, cluster. user_worker, private_key, public_key)
405+
406+ internal_key_name = cluster. name
346407
347408 # Configuração do NFS
348409 if cluster. shared_fs
349410 file_system_ip = cluster. environment. file_system_ip
350- nfs_user_data = " apt-get -y install nfs-common
351- mkdir /home/ubuntu /shared/
352- mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip :/ /home/ubuntu /shared/
353- chown -R ubuntu:ubuntu /home/ubuntu /shared
411+ nfs_user_data_manager = " apt-get -y install nfs-common
412+ mkdir /home/$user_manager /shared/
413+ mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip :/ /home/$user_manager /shared/
414+ chown -R $user_manager : $user_manager /home/$user_manager /shared
354415"
355- user_data *= nfs_user_data
416+ nfs_user_data_worker = " apt-get -y install nfs-common
417+ mkdir /home/$user_worker /shared/
418+ mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip :/ /home/$user_worker /shared/
419+ chown -R $user_worker :$user_worker /home/$user_worker /shared
420+ "
421+ user_data_manager *= nfs_user_data_manager
422+ user_data_worker *= nfs_user_data_worker
356423 end
357- user_data_base64 = base64encode (user_data)
424+
425+ user_data_manager_base64 = base64encode (user_data_manager)
426+ user_data_worker_base64 = base64encode (user_data_worker)
358427
359428 # Criando as instâncias
360- params_manager, params_workers = ec2_create_params (cluster, user_data_base64)
429+ params_manager, params_workers = ec2_create_params (cluster, (user_data_manager_base64, user_data_worker_base64))
430+
361431 # Criar o headnode
362432 instance_headnode = run_instances (1 , 1 , params_manager)
363433 cluster_nodes[:manager ] = instance_headnode[" instancesSet" ][" item" ][" instanceId" ]
@@ -382,7 +452,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
382452 ec2_await_status (cluster_nodes, " running" )
383453 ec2_await_check (cluster_nodes, " ok" )
384454
385- ec2_set_hostfile (cluster_nodes, internal_key_name)
455+ ec2_set_hostfile (cluster_nodes, internal_key_name, cluster . user_manager, cluster . user_worker )
386456
387457 # ec2_remove_temp_files(internal_key_name)
388458
@@ -393,15 +463,18 @@ function ec2_create_instances(cluster::PeerWorkers)
393463 cluster_nodes = Dict ()
394464
395465 # Configurando a conexão SSH.
396- internal_key_name, user_data = ec2_set_up_ssh_connection (cluster. name)
466+ private_key, public_key = ec2_set_up_ssh_connection (cluster. name, cluster. user)
467+ user_data = ec2_get_user_data (cluster. name, cluster. user, private_key, public_key)
468+
469+ internal_key_name = cluster. name
397470
398471 # Configuração do NFS
399472 if cluster. shared_fs
400473 file_system_ip = cluster. environment. file_system_ip
401474 nfs_user_data = " apt-get -y install nfs-common
402- mkdir /home/ubuntu /shared/
403- mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip :/ /home/ubuntu /shared/
404- chown -R ubuntu:ubuntu /home/ubuntu /shared
475+ mkdir /home/$user /shared/
476+ mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 $file_system_ip :/ /home/$user /shared/
477+ chown -R $user : $user /home/$user /shared
405478"
406479 user_data *= nfs_user_data
407480 end
@@ -427,7 +500,7 @@ chown -R ubuntu:ubuntu /home/ubuntu/shared
427500 ec2_await_status (cluster_nodes, " running" )
428501 ec2_await_check (cluster_nodes, " ok" )
429502
430- ec2_set_hostfile (cluster_nodes, internal_key_name)
503+ ec2_set_hostfile (cluster_nodes, internal_key_name, cluster . user )
431504
432505 # ec2_remove_temp_files(internal_key_name)
433506
@@ -581,16 +654,28 @@ ec2_can_resume(cluster::Cluster) = ec2_cluster_status(cluster, ["stopped"])
581654# All instances must be in "interrupted" or "running" state.
582655# If some instance is not in "interrupted" or "running" state, raise an exception.
583656# PUBLIC
584- function ec2_resume_cluster (cluster:: Cluster )
657+
# Peer clusters have a single login user, used for every node.
ec2_resume_cluster(cluster::PeerWorkers) =
    ec2_resume_cluster(cluster, cluster.user, cluster.user)
661+
# Manager/worker clusters may use different login users for the two roles.
ec2_resume_cluster(cluster::ManagerWorkers) =
    ec2_resume_cluster(cluster, cluster.user_manager, cluster.user_worker)
665+
"""
    ec2_resume_cluster(cluster::Cluster, user_manager, user_worker)

Start the cluster's instances, wait until they are running and pass their
status checks, then probe each node over SSH with `uptime`.

`user_manager` is the login name used for the node keyed `:manager`;
`user_worker` is used for every other node. Before each probe, any entry for
the node's public IP is dropped from `~/.ssh/known_hosts`.
"""
function ec2_resume_cluster(cluster::Cluster, user_manager, user_worker)
    # homedir() instead of ENV["HOME"]: the latter throws KeyError when unset.
    ssh_path = joinpath(homedir(), ".ssh")
    keypath = joinpath(ssh_path, "$(cluster.name).key")
    known_hosts = joinpath(ssh_path, "known_hosts")

    ec2_start_instances(cluster)
    ec2_await_status(cluster.cluster_nodes, "running")
    ec2_await_check(cluster.cluster_nodes, "ok")
    for instance in keys(cluster.cluster_nodes)
        user = instance == :manager ? user_manager : user_worker
        public_ip = Ec2.describe_instances(Dict("InstanceId" => cluster.cluster_nodes[instance]))["reservationSet"]["item"]["instancesSet"]["item"]["ipAddress"]
        # Best-effort cleanup of a stale host key for this IP: skip it when
        # there is no known_hosts file, and never let a non-zero exit status
        # abort the resume.
        isfile(known_hosts) && run(ignorestatus(`ssh-keygen -f $known_hosts -R $public_ip`))
        try_run(`ssh -i $keypath -o StrictHostKeyChecking=no $user@$public_ip uptime`)
    end
end
596681
0 commit comments