Skip to content

Commit a7e1ce9

Browse files
feat: add optional SSH access to worker pods
Enable SSH access to NodeSet worker pods with a CRD toggle, following the same pattern as LoginSet. SSH host keys are shared across all pods in a NodeSet to prevent "host key changed" warnings when pods are recreated or scaled. Ref: https://slurm.schedmd.com/pam_slurm_adopt.html
1 parent c4e92d8 commit a7e1ce9

File tree

12 files changed

+288
-28
lines changed

12 files changed

+288
-28
lines changed

api/v1beta1/nodeset_keys.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,11 @@ func (o *NodeSet) HeadlessServiceKey() types.NamespacedName {
2323
Namespace: o.Namespace,
2424
}
2525
}
26+
27+
func (o *NodeSet) SshHostKeys() types.NamespacedName {
28+
key := o.Key()
29+
return types.NamespacedName{
30+
Name: fmt.Sprintf("%s-ssh-host-keys", key.Name),
31+
Namespace: o.Namespace,
32+
}
33+
}

api/v1beta1/nodeset_types.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ type NodeSetSpec struct {
3737
// +optional
3838
Slurmd ContainerWrapper `json:"slurmd,omitempty"`
3939

40+
// SSH configuration for worker pods.
41+
// +optional
42+
Ssh NodeSetSsh `json:"ssh,omitzero"`
43+
4044
// The logfile sidecar configuration.
4145
// +optional
4246
LogFile ContainerWrapper `json:"logfile,omitzero"`
@@ -112,6 +116,14 @@ type NodeSetPartition struct {
112116
Config string `json:"config,omitzero"`
113117
}
114118

119+
// NodeSetSsh holds the SSH settings of a NodeSet's worker pods.
// It currently consists of a single on/off switch.
type NodeSetSsh struct {
	// Enabled controls whether SSH access is enabled for this NodeSet.
	// When enabled, SSH host keys will be created and mounted, and port 22 will be exposed.
	// +default:=false
	Enabled bool `json:"enabled"`
}
126+
115127
// NodeSetUpdateStrategy indicates the strategy that the NodeSet
116128
// controller will be used to perform updates. It includes any additional
117129
// parameters necessary to perform the update for the indicated strategy.

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/slinky.slurm.net_nodesets.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,18 @@ spec:
173173
Ref: https://github.com/kubernetes/api/blob/master/core/v1/types.go#L2885
174174
type: object
175175
x-kubernetes-preserve-unknown-fields: true
176+
ssh:
177+
description: SSH configuration for worker pods.
178+
properties:
179+
enabled:
180+
default: false
181+
description: |-
182+
Enabled controls whether SSH access is enabled for this NodeSet.
183+
When enabled, SSH host keys will be created and mounted, and port 22 will be exposed.
184+
type: boolean
185+
required:
186+
- enabled
187+
type: object
176188
taintKubeNodes:
177189
default: false
178190
description: |-

helm/slurm-operator-crds/templates/slinky.slurm.net_nodesets.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,18 @@ spec:
173173
Ref: https://github.com/kubernetes/api/blob/master/core/v1/types.go#L2885
174174
type: object
175175
x-kubernetes-preserve-unknown-fields: true
176+
ssh:
177+
description: SSH configuration for worker pods.
178+
properties:
179+
enabled:
180+
default: false
181+
description: |-
182+
Enabled controls whether SSH access is enabled for this NodeSet.
183+
When enabled, SSH host keys will be created and mounted, and port 22 will be exposed.
184+
type: boolean
185+
required:
186+
- enabled
187+
type: object
176188
taintKubeNodes:
177189
default: false
178190
description: |-

helm/slurm/templates/nodeset/nodeset-cr.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ spec:
5151
config: {{ include "slurm.worker.partitionConfig" $nodeset.partition }}
5252
{{- end }}{{- /* if (include "slurm.worker.partitionConfig" $nodeset.partition) */}}
5353
{{- end }}{{- /* with $nodeset.partition */}}
54+
{{- with $nodeset.ssh }}
55+
ssh:
56+
{{- toYaml . | nindent 4 }}
57+
{{- end }}{{- /* with $nodeset.ssh */}}
5458
replicas: {{ $nodeset.replicas }}
5559
slurmd:
5660
{{- $_ := set $nodeset.slurmd "imagePullPolicy" (default $.Values.imagePullPolicy $nodeset.slurmd.imagePullPolicy) -}}

helm/slurm/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,11 @@ nodesets:
643643
configMap: {}
644644
# State: UP
645645
# MaxTime: UNLIMITED
646+
# SSH configuration for this NodeSet.
647+
# ssh:
648+
# -- Enable SSH access to worker pods with pam_slurm_adopt.
649+
# Ref: https://slurm.schedmd.com/pam_slurm_adopt.html
650+
# enabled: false
646651
# -- Enable propagation of container `resources.limits` into slurmd.
647652
useResourceLimits: true
648653
# Update strategy configuration.

internal/builder/worker_app.go

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func (b *Builder) BuildWorkerPodTemplate(nodeset *slinkyv1beta1.NodeSet, control
6868
InitContainers: []corev1.Container{
6969
b.logfileContainer(spec.LogFile, slurmdLogFilePath),
7070
},
71-
Volumes: nodesetVolumes(controller),
71+
Volumes: nodesetVolumes(nodeset, controller),
7272
Tolerations: []corev1.Toleration{
7373
slurmtaints.TolerationWorkerNode,
7474
},
@@ -79,7 +79,7 @@ func (b *Builder) BuildWorkerPodTemplate(nodeset *slinkyv1beta1.NodeSet, control
7979
return b.buildPodTemplate(opts)
8080
}
8181

82-
func nodesetVolumes(controller *slinkyv1beta1.Controller) []corev1.Volume {
82+
func nodesetVolumes(nodeset *slinkyv1beta1.NodeSet, controller *slinkyv1beta1.Controller) []corev1.Volume {
8383
out := []corev1.Volume{
8484
{
8585
Name: slurmEtcVolume,
@@ -103,28 +103,83 @@ func nodesetVolumes(controller *slinkyv1beta1.Controller) []corev1.Volume {
103103
},
104104
logFileVolume(),
105105
}
106+
107+
// Add SSH host keys volume if SSH is enabled
108+
if nodeset.Spec.Ssh.Enabled {
109+
out = append(out, corev1.Volume{
110+
Name: sshHostKeysVolume,
111+
VolumeSource: corev1.VolumeSource{
112+
Projected: &corev1.ProjectedVolumeSource{
113+
DefaultMode: ptr.To[int32](0o600),
114+
Sources: []corev1.VolumeProjection{
115+
{
116+
Secret: &corev1.SecretProjection{
117+
LocalObjectReference: corev1.LocalObjectReference{
118+
Name: nodeset.SshHostKeys().Name,
119+
},
120+
Items: []corev1.KeyToPath{
121+
{Key: sshHostRsaKeyFile, Path: sshHostRsaKeyFile, Mode: ptr.To[int32](0o600)},
122+
{Key: sshHostRsaPubKeyFile, Path: sshHostRsaPubKeyFile, Mode: ptr.To[int32](0o644)},
123+
{Key: sshHostEd25519KeyFile, Path: sshHostEd25519KeyFile, Mode: ptr.To[int32](0o600)},
124+
{Key: sshHostEd25519PubKeyFile, Path: sshHostEd25519PubKeyFile, Mode: ptr.To[int32](0o644)},
125+
{Key: sshHostEcdsaKeyFile, Path: sshHostEcdsaKeyFile, Mode: ptr.To[int32](0o600)},
126+
{Key: sshHostEcdsaPubKeyFile, Path: sshHostEcdsaPubKeyFile, Mode: ptr.To[int32](0o644)},
127+
},
128+
},
129+
},
130+
},
131+
},
132+
},
133+
})
134+
}
135+
106136
return out
107137
}
108138

109139
func (b *Builder) slurmdContainer(nodeset *slinkyv1beta1.NodeSet, controller *slinkyv1beta1.Controller) corev1.Container {
110140
merge := nodeset.Spec.Slurmd.Container
111141

142+
// Base ports always include slurmd
143+
ports := []corev1.ContainerPort{
144+
{
145+
Name: labels.WorkerApp,
146+
ContainerPort: SlurmdPort,
147+
Protocol: corev1.ProtocolTCP,
148+
},
149+
}
150+
151+
// Add SSH port if enabled
152+
if nodeset.Spec.Ssh.Enabled {
153+
ports = append(ports, corev1.ContainerPort{
154+
Name: "ssh",
155+
ContainerPort: SshPort,
156+
Protocol: corev1.ProtocolTCP,
157+
})
158+
}
159+
160+
// Base volume mounts
161+
volumeMounts := []corev1.VolumeMount{
162+
{Name: slurmEtcVolume, MountPath: slurmEtcDir, ReadOnly: true},
163+
{Name: slurmLogFileVolume, MountPath: slurmLogFileDir},
164+
}
165+
166+
// Add SSH host key mounts if enabled
167+
if nodeset.Spec.Ssh.Enabled {
168+
volumeMounts = append(volumeMounts,
169+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostRsaKeyFilePath, SubPath: sshHostRsaKeyFile, ReadOnly: true},
170+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostRsaKeyPubFilePath, SubPath: sshHostRsaPubKeyFile, ReadOnly: true},
171+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEd25519KeyFilePath, SubPath: sshHostEd25519KeyFile, ReadOnly: true},
172+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEd25519PubKeyFilePath, SubPath: sshHostEd25519PubKeyFile, ReadOnly: true},
173+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEcdsaKeyFilePath, SubPath: sshHostEcdsaKeyFile, ReadOnly: true},
174+
corev1.VolumeMount{Name: sshHostKeysVolume, MountPath: sshHostEcdsaPubKeyFilePath, SubPath: sshHostEcdsaPubKeyFile, ReadOnly: true},
175+
)
176+
}
177+
112178
opts := ContainerOpts{
113179
base: corev1.Container{
114-
Name: labels.WorkerApp,
115-
Args: slurmdArgs(nodeset, controller),
116-
Ports: []corev1.ContainerPort{
117-
{
118-
Name: labels.WorkerApp,
119-
ContainerPort: SlurmdPort,
120-
Protocol: corev1.ProtocolTCP,
121-
},
122-
{
123-
Name: "ssh",
124-
ContainerPort: SshPort,
125-
Protocol: corev1.ProtocolTCP,
126-
},
127-
},
180+
Name: labels.WorkerApp,
181+
Args: slurmdArgs(nodeset, controller),
182+
Ports: ports,
128183
StartupProbe: &corev1.Probe{
129184
ProbeHandler: corev1.ProbeHandler{
130185
HTTPGet: &corev1.HTTPGetAction{
@@ -175,10 +230,7 @@ func (b *Builder) slurmdContainer(nodeset *slinkyv1beta1.NodeSet, controller *sl
175230
},
176231
},
177232
},
178-
VolumeMounts: []corev1.VolumeMount{
179-
{Name: slurmEtcVolume, MountPath: slurmEtcDir, ReadOnly: true},
180-
{Name: slurmLogFileVolume, MountPath: slurmLogFileDir},
181-
},
233+
VolumeMounts: volumeMounts,
182234
},
183235
merge: merge,
184236
}

internal/builder/worker_app_test.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,14 +93,6 @@ func TestBuilder_BuildWorkerPodTemplate(t *testing.T) {
9393
t.Errorf("Containers[0].Ports[0].ContainerPort = %v , want = %v",
9494
got.Spec.Containers[0].Ports[0].Name, SlurmdPort)
9595

96-
case got.Spec.Containers[0].Ports[1].Name != "ssh":
97-
t.Errorf("Containers[0].Ports[1].Name = %v , want = ssh",
98-
got.Spec.Containers[0].Ports[1].Name)
99-
100-
case got.Spec.Containers[0].Ports[1].ContainerPort != SshPort:
101-
t.Errorf("Containers[0].Ports[1].ContainerPort = %v , want = %v",
102-
got.Spec.Containers[0].Ports[1].ContainerPort, SshPort)
103-
10496
case got.Spec.Subdomain == "":
10597
t.Errorf("Subdomain = %v , want = non-empty", got.Spec.Subdomain)
10698

internal/builder/worker_secret.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// SPDX-FileCopyrightText: Copyright (C) SchedMD LLC.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package builder
5+
6+
import (
7+
"fmt"
8+
9+
corev1 "k8s.io/api/core/v1"
10+
11+
slinkyv1beta1 "github.com/SlinkyProject/slurm-operator/api/v1beta1"
12+
"github.com/SlinkyProject/slurm-operator/internal/builder/labels"
13+
"github.com/SlinkyProject/slurm-operator/internal/utils/crypto"
14+
"github.com/SlinkyProject/slurm-operator/internal/utils/structutils"
15+
)
16+
17+
func (b *Builder) BuildWorkerSshHostKeys(nodeset *slinkyv1beta1.NodeSet) (*corev1.Secret, error) {
18+
keyPairRsa, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairRsa))
19+
if err != nil {
20+
return nil, fmt.Errorf("failed to create RSA key pair: %w", err)
21+
}
22+
keyPairEd25519, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairEd25519))
23+
if err != nil {
24+
return nil, fmt.Errorf("failed to create ED25519 key pair: %w", err)
25+
}
26+
keyPairEcdsa, err := crypto.NewKeyPair(crypto.WithType(crypto.KeyPairEcdsa))
27+
if err != nil {
28+
return nil, fmt.Errorf("failed to create ECDSA key pair: %w", err)
29+
}
30+
31+
opts := SecretOpts{
32+
Key: nodeset.SshHostKeys(),
33+
Metadata: nodeset.Spec.Template.PodMetadata,
34+
Data: map[string][]byte{
35+
sshHostEcdsaKeyFile: keyPairEcdsa.PrivateKey(),
36+
sshHostEcdsaPubKeyFile: keyPairEcdsa.PublicKey(),
37+
sshHostEd25519KeyFile: keyPairEd25519.PrivateKey(),
38+
sshHostEd25519PubKeyFile: keyPairEd25519.PublicKey(),
39+
sshHostRsaKeyFile: keyPairRsa.PrivateKey(),
40+
sshHostRsaPubKeyFile: keyPairRsa.PublicKey(),
41+
},
42+
Immutable: true,
43+
}
44+
45+
opts.Metadata.Labels = structutils.MergeMaps(opts.Metadata.Labels, labels.NewBuilder().WithWorkerLabels(nodeset).Build())
46+
47+
return b.BuildSecret(opts, nodeset)
48+
}

0 commit comments

Comments
 (0)