Skip to content

Commit facaf3e

Browse files
committed
1 parent 5095c6c commit facaf3e

File tree

8 files changed

+293
-102
lines changed

8 files changed

+293
-102
lines changed

ci/slurm/Dockerfile

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,55 @@ FROM ubuntu:24.04
33
# Build-time only: keeps apt non-interactive during RUN steps without persisting into the container's runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
44
RUN --mount=type=cache,target=/var/cache/apt \
55
rm -f /etc/apt/apt.conf.d/docker-clean \
6-
&& apt-get update && apt-get -y install python3-pip slurm-wlm python3-venv
6+
&& apt-get update && apt-get -y install \
7+
gosu \
8+
mysql-client \
9+
python3-venv \
10+
python3-pip \
11+
slurm-wlm \
12+
slurmdbd \
13+
slurm
714

815
ENV PIP_CACHE_DIR=/tmp/pip-cache \
916
VIRTUAL_ENV=/srv/env \
10-
PATH=/srv/env/bin:${PATH}
17+
PATH=/srv/env/bin:${PATH} \
18+
IPP_DISABLE_JS=1
1119

1220
RUN --mount=type=cache,target=${PIP_CACHE_DIR} \
1321
python3 -m venv $VIRTUAL_ENV \
1422
&& $VIRTUAL_ENV/bin/python3 -m pip install ipyparallel pytest-asyncio pytest-cov
15-
RUN mkdir /var/spool/slurmctl \
16-
&& mkdir /var/spool/slurmd
17-
COPY slurm.conf /etc/slurm-llnl/slurm.conf
23+
24+
# create Slurm's spool, run, and state directories, the empty state files, and the munge runtime directory
25+
RUN mkdir -p /etc/sysconfig/slurm \
26+
/var/spool/slurmd \
27+
/var/run/slurmd \
28+
/var/run/slurmdbd \
29+
/var/lib/slurmd \
30+
/data \
31+
&& touch /var/lib/slurmd/node_state \
32+
/var/lib/slurmd/front_end_state \
33+
/var/lib/slurmd/job_state \
34+
/var/lib/slurmd/resv_state \
35+
/var/lib/slurmd/trigger_state \
36+
/var/lib/slurmd/assoc_mgr_state \
37+
/var/lib/slurmd/assoc_usage \
38+
/var/lib/slurmd/qos_usage \
39+
/var/lib/slurmd/fed_mgr_state \
40+
&& chown -R slurm:slurm /var/*/slurm* \
41+
&& mkdir /run/munge \
42+
&& chown munge:munge /run/munge \
43+
&& chmod a+rwxt /run/munge
44+
# && mungekey -c
45+
46+
COPY --chown=slurm:slurm --chmod=0600 etc_slurm/ /etc/slurm/
47+
48+
1849
COPY entrypoint.sh /entrypoint
19-
ENV IPP_DISABLE_JS=1
2050
ENTRYPOINT ["/entrypoint"]
2151

2252
# /io is the mount point where docker-compose bind-mounts the repository checkout (../..:/io)
2353
RUN mkdir /io
2454
ENV PYTHONPATH=/io
2555
WORKDIR "/io"
56+
57+
CMD [ "tail", "-f", "/var/log/slurm/slurmd.log" ]

ci/slurm/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Slurm cluster example for testing
2+
3+
Adapted and simplified from https://github.com/giovtorres/slurm-docker-cluster/
4+
5+
License: MIT

ci/slurm/docker-compose.yaml

Lines changed: 57 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,109 @@
11
services:
2+
mysql:
3+
image: mariadb:10.11
4+
hostname: mysql
5+
container_name: mysql
6+
environment:
7+
MYSQL_RANDOM_ROOT_PASSWORD: "yes"
8+
MYSQL_DATABASE: slurm_acct_db
9+
MYSQL_USER: slurm
10+
MYSQL_PASSWORD: password
11+
volumes:
12+
- var_lib_mysql:/var/lib/mysql  # MariaDB's data directory; the previous /var_lib/mysql path was a typo
13+
14+
slurmdbd:
15+
image: ipp-cluster:slurm
16+
build: .
17+
command:
18+
- slurmdbd
19+
container_name: slurmdbd
20+
hostname: slurmdbd
21+
volumes:
22+
- etc_munge:/etc/munge
23+
# - $PWD/etc_slurm:/etc/slurm
24+
- var_log_slurm:/var/log/slurm
25+
expose:
26+
- "6819"
27+
depends_on:
28+
- mysql
29+
230
slurmctld:
331
image: ipp-cluster:slurm
432
build: .
33+
command:
34+
- slurmctld
535
container_name: slurmctld
636
hostname: slurmctld
7-
command:
8-
- tail
9-
- "-f"
10-
- /var/log/slurm-llnl/slurmctld.log
1137
volumes:
1238
- etc_munge:/etc/munge
13-
- etc_slurm:/etc/slurm
39+
# - $PWD/etc_slurm:/etc/slurm
40+
# - $PWD/slurm.conf:/etc/slurm/slurm.conf
1441
- slurm_jobdir:/data
1542
- var_log_slurm:/var/log/slurm
1643
- ../..:/io
1744
expose:
1845
- "6817"
19-
networks:
20-
common-network:
21-
ipv4_address: 10.1.1.10
46+
- "6818"
47+
depends_on:
48+
- slurmdbd
49+
# networks:
50+
# common-network:
51+
# ipv4_address: 10.1.1.10
2252

2353
c1:
2454
image: ipp-cluster:slurm
2555
build: .
26-
hostname: c1
2756
command:
28-
- tail
29-
- "-f"
30-
- /var/log/slurm-llnl/slurmd.log
57+
- slurmd
58+
hostname: c1
3159
container_name: c1
3260

3361
volumes:
3462
- etc_munge:/etc/munge
35-
- etc_slurm:/etc/slurm
63+
# - $PWD/etc_slurm:/etc/slurm
64+
# - $PWD/slurm.conf:/etc/slurm/slurm.conf
3665
- slurm_jobdir:/data
3766
- var_log_slurm:/var/log/slurm
3867
- ../..:/io
3968
expose:
69+
- "6817"
4070
- "6818"
4171
depends_on:
4272
- "slurmctld"
43-
networks:
44-
common-network:
45-
ipv4_address: 10.1.1.11
73+
# networks:
74+
# common-network:
75+
# ipv4_address: 10.1.1.11
4676

4777
c2:
4878
image: ipp-cluster:slurm
4979
build: .
5080
command:
51-
- tail
52-
- "-f"
53-
- /var/log/slurm-llnl/slurmd.log
81+
- slurmd
5482
hostname: c2
5583
container_name: c2
84+
# needs to be privileged for the cgroup plugin
85+
privileged: true
5686
volumes:
5787
- etc_munge:/etc/munge
58-
- etc_slurm:/etc/slurm
88+
# - $PWD/etc_slurm:/etc/slurm
89+
# - $PWD/slurm.conf:/etc/slurm/slurm.conf
5990
- slurm_jobdir:/data
6091
- var_log_slurm:/var/log/slurm
6192
- ../..:/io
6293
expose:
94+
- "6817"
6395
- "6818"
6496
depends_on:
6597
- "slurmctld"
66-
networks:
67-
common-network:
68-
ipv4_address: 10.1.1.12
98+
# networks:
99+
# common-network:
100+
# ipv4_address: 10.1.1.12
69101

70102
volumes:
71103
etc_munge:
72-
etc_slurm:
104+
# etc_slurm:
73105
slurm_jobdir:
106+
var_lib_mysql:
74107
var_log_slurm:
75108

76109
networks:

ci/slurm/entrypoint.sh

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,64 @@
11
#!/bin/bash
22
set -ex
3-
# set permissions on munge dir, may be mounted
4-
chown -R munge:munge /etc/munge
5-
6-
echo "starting munge"
7-
service munge start
8-
9-
echo "hostname=$(hostname)"
10-
if [[ "$(hostname)" == *"slurmctl"* ]]; then
11-
echo "starting slurmctld"
12-
service slurmctld start
13-
else
14-
echo "starting slurmd"
15-
service slurmd start
3+
4+
if [ "$1" = "slurmdbd" ]
5+
then
6+
echo "---> Starting the MUNGE Authentication service (munged) ..."
7+
gosu munge /usr/sbin/munged
8+
9+
echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."
10+
11+
{
12+
. /etc/slurm/slurmdbd.conf
13+
# Probe the database with the credentials sourced from slurmdbd.conf. The expansions are quoted so
# values containing spaces or glob characters are passed intact, and the redirections are ordered so
# both stdout and stderr of the probe are discarded (the loop prints its own status line).
until echo "SELECT 1" | mysql -h "$StorageHost" -u"$StorageUser" -p"$StoragePass" > /dev/null 2>&1
14+
do
15+
echo "-- Waiting for database to become active ..."
16+
sleep 2
17+
done
18+
}
19+
echo "-- Database is now active ..."
20+
21+
exec gosu slurm /usr/sbin/slurmdbd -Dvvv
22+
fi
23+
24+
if [ "$1" = "slurmctld" ]
25+
then
26+
echo "---> Starting the MUNGE Authentication service (munged) ..."
27+
gosu munge /usr/sbin/munged
28+
29+
echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."
30+
31+
until 2>/dev/null >/dev/tcp/slurmdbd/6819
32+
do
33+
echo "-- slurmdbd is not available. Sleeping ..."
34+
sleep 2
35+
done
36+
echo "-- slurmdbd is now active ..."
37+
38+
echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
39+
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then
40+
exec gosu slurm /usr/sbin/slurmctld -Dvvv
41+
else
42+
exec gosu slurm /usr/sbin/slurmctld -i -Dvvv
43+
fi
44+
fi
45+
46+
if [ "$1" = "slurmd" ]
47+
then
48+
echo "---> Starting the MUNGE Authentication service (munged) ..."
49+
gosu munge /usr/sbin/munged
50+
51+
echo "---> Waiting for slurmctld to become active before starting slurmd..."
52+
53+
until 2>/dev/null >/dev/tcp/slurmctld/6817
54+
do
55+
echo "-- slurmctld is not available. Sleeping ..."
56+
sleep 2
57+
done
58+
echo "-- slurmctld is now active ..."
59+
60+
echo "---> Starting the Slurm Node Daemon (slurmd) ..."
61+
exec /usr/sbin/slurmd -Dvvv
1662
fi
1763

1864
exec "$@"

ci/slurm/etc_slurm/cgroup.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CgroupPlugin=cgroup/v1

ci/slurm/etc_slurm/slurm.conf

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# slurm.conf
2+
#
3+
# See the slurm.conf man page for more information.
4+
#
5+
ClusterName=linux
6+
ControlMachine=slurmctld
7+
ControlAddr=slurmctld
8+
#BackupController=
9+
#BackupAddr=
10+
#
11+
SlurmUser=slurm
12+
#SlurmdUser=root
13+
SlurmctldPort=6817
14+
SlurmdPort=6818
15+
AuthType=auth/munge
16+
#JobCredentialPrivateKey=
17+
#JobCredentialPublicCertificate=
18+
StateSaveLocation=/var/lib/slurmd
19+
SlurmdSpoolDir=/var/spool/slurmd
20+
SwitchType=switch/none
21+
MpiDefault=none
22+
SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
23+
SlurmdPidFile=/var/run/slurmd/slurmd.pid
24+
ProctrackType=proctrack/linuxproc
25+
#PluginDir=
26+
#CacheGroups=0
27+
#FirstJobId=
28+
ReturnToService=0
29+
#MaxJobCount=
30+
#PlugStackConfig=
31+
#PropagatePrioProcess=
32+
#PropagateResourceLimits=
33+
#PropagateResourceLimitsExcept=
34+
#Prolog=
35+
#Epilog=
36+
#SrunProlog=
37+
#SrunEpilog=
38+
#TaskProlog=
39+
#TaskEpilog=
40+
#TaskPlugin=
41+
#TrackWCKey=no
42+
#TreeWidth=50
43+
#TmpFS=
44+
#UsePAM=
45+
#
46+
# TIMERS
47+
SlurmctldTimeout=300
48+
SlurmdTimeout=300
49+
InactiveLimit=0
50+
MinJobAge=300
51+
KillWait=30
52+
Waittime=0
53+
#
54+
# SCHEDULING
55+
SchedulerType=sched/backfill
56+
#SchedulerAuth=
57+
#SchedulerPort=
58+
#SchedulerRootFilter=
59+
SelectType=select/cons_tres # <-- CHANGE 1: updated for TRES compatibility
60+
SelectTypeParameters=CR_Core_Memory
61+
#FastSchedule=1 # <-- CHANGE 2: commented out because it is obsolete
62+
#PriorityType=priority/multifactor
63+
#PriorityDecayHalfLife=14-0
64+
#PriorityUsageResetPeriod=14-0
65+
#PriorityWeightFairshare=100000
66+
#PriorityWeightAge=1000
67+
#PriorityWeightPartition=10000
68+
#PriorityWeightJobSize=1000
69+
#PriorityMaxAge=1-0
70+
#
71+
# LOGGING
72+
SlurmctldDebug=3
73+
SlurmctldLogFile=/var/log/slurm/slurmctld.log
74+
SlurmdDebug=3
75+
SlurmdLogFile=/var/log/slurm/slurmd.log
76+
JobCompType=jobcomp/filetxt
77+
JobCompLoc=/var/log/slurm/jobcomp.log
78+
#
79+
# ACCOUNTING
80+
JobAcctGatherType=jobacct_gather/linux
81+
JobAcctGatherFrequency=30
82+
#
83+
AccountingStorageType=accounting_storage/slurmdbd
84+
AccountingStorageHost=slurmdbd
85+
AccountingStoragePort=6819
86+
#AccountingStorageLoc=slurm_acct_db
87+
#AccountingStoragePass=
88+
#AccountingStorageUser=
89+
#
90+
# COMPUTE NODES
91+
NodeName=c1 NodeAddr=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
92+
NodeName=c2 NodeAddr=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
93+
#
94+
# PARTITIONS
95+
#PartitionName=normal Default=yes Nodes=c1,c2 Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=2 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP
96+
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP

0 commit comments

Comments
 (0)