Split the concourse GPU-worker configuration into per-distribution trees.

* The existing files move from etc/ to fedora/etc/ as pure renames
  (similarity index 100%).
* ubuntu/ gains a README plus Ubuntu 20.04 variants of the garden,
  containerd and concourse-worker configuration.
* README.md points Ubuntu users at ubuntu/README.md.

NOTE(review): the line breaks inside the README.md context lines were
reconstructed; verify them against the target tree before applying.
NOTE(review): the blob ids on the "index" lines are carried over unchanged and
may not match the ubuntu/ file contents below; regenerate the patch from a
working tree if they matter.

diff --git a/README.md b/README.md
index 6f3997c..eaf4767 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ As part of the [spearow/juice](https://github.com/spearow/juice) efforts, it bec
 cuda™/cudnn access and eventually also rocm and OpenCL™ support from within the container
 without granting excessive privileges that would allow to remount the device tree.
 
-All instructions here are for [`Fedora 32` / `Fedora 33`](https://getfedora.org).
+All instructions here are for [`Fedora 32` / `Fedora 33`](https://getfedora.org). For Ubuntu-specific instructions, see [ubuntu/README.md](ubuntu/README.md).
 
 Assumes concourse is unpacked under `/usr/local`, such that
 `/usr/local/concourse/bin/{gdn,concourse}` exist.
diff --git a/etc/concourse/garden.ini b/fedora/etc/concourse/garden.ini
similarity index 100%
rename from etc/concourse/garden.ini
rename to fedora/etc/concourse/garden.ini
diff --git a/etc/containerd/config.toml b/fedora/etc/containerd/config.toml
similarity index 100%
rename from etc/containerd/config.toml
rename to fedora/etc/containerd/config.toml
diff --git a/etc/containers/oci/hooks.d/nvidia-tools.json b/fedora/etc/containers/oci/hooks.d/nvidia-tools.json
similarity index 100%
rename from etc/containers/oci/hooks.d/nvidia-tools.json
rename to fedora/etc/containers/oci/hooks.d/nvidia-tools.json
diff --git a/etc/systemd/system/concourse-worker@.service b/fedora/etc/systemd/system/concourse-worker@.service
similarity index 100%
rename from etc/systemd/system/concourse-worker@.service
rename to fedora/etc/systemd/system/concourse-worker@.service
diff --git a/etc/systemd/system/containerd.service b/fedora/etc/systemd/system/containerd.service
similarity index 100%
rename from etc/systemd/system/containerd.service
rename to fedora/etc/systemd/system/containerd.service
diff --git a/etc/systemd/system/garden.service b/fedora/etc/systemd/system/garden.service
similarity index 100%
rename from etc/systemd/system/garden.service
rename to fedora/etc/systemd/system/garden.service
diff --git a/etc/yum.repos.d/nvidia-container-runtime.repo b/fedora/etc/yum.repos.d/nvidia-container-runtime.repo
similarity index 100%
rename from etc/yum.repos.d/nvidia-container-runtime.repo
rename to fedora/etc/yum.repos.d/nvidia-container-runtime.repo
diff --git a/ubuntu/README.md b/ubuntu/README.md
new file mode 100644
index 0000000..6182d64
--- /dev/null
+++ b/ubuntu/README.md
@@ -0,0 +1,25 @@
+# Ubuntu 20.04
+
+The following describes a variation on the Fedora-style configuration described in [../README.md](../README.md), and explains how I set up a Concourse worker with GPU acceleration on Ubuntu 20.04.
+
+## nvidia drivers
+
+Install the latest nvidia drivers recommended by `sudo ubuntu-drivers devices`, or simply run `sudo ubuntu-drivers autoinstall` if you're feeling lucky.
+
+## nvidia runtime
+
+Per the [docs](https://nvidia.github.io/nvidia-container-runtime/), install the [nvidia-runtime](https://github.com/NVIDIA/nvidia-container-runtime):
+
+```
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | \
+  sudo apt-key add -
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | \
+  sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+sudo apt-get update
+sudo apt-get install nvidia-container-runtime
+```
+
+## config
+
+Copy the contents of the `etc` folder to `/etc`, and customize to suit. The primary difference from the Fedora instructions is that containerd running on Ubuntu seems to ignore the `/etc/containers` directory, so it's necessary to specify the nvidia runtime in `/etc/containerd/config.toml`.
diff --git a/ubuntu/etc/concourse/garden.ini b/ubuntu/etc/concourse/garden.ini
new file mode 100644
index 0000000..acc4f10
--- /dev/null
+++ b/ubuntu/etc/concourse/garden.ini
@@ -0,0 +1,19 @@
+[server]
+# gdn server options; garden.service loads this file via --config
+# avoid running out of ip addresses
+# use a bigger subnet than the default
+network-pool = 172.16.1.0/16
+# dns-server is repeated, one resolver per line (assumes gdn keeps all of them - confirm)
+# local first
+dns-server = 127.0.0.1
+# avoid dns resolution failures
+dns-server = 1.1.1.1
+dns-server = 9.9.9.9
+
+# failed attempts, ignore
+#runtime-plugin = runc
+#runtime-plugin-extra-arg = --debug
+#runtime-plugin = /usr/bin/nvidia-container-runtime
+
+# avoid running out of file descriptors
+cleanup-process-dirs-on-wait = true
diff --git a/ubuntu/etc/containerd/config.toml b/ubuntu/etc/containerd/config.toml
new file mode 100644
index 0000000..6c85d31
--- /dev/null
+++ b/ubuntu/etc/containerd/config.toml
@@ -0,0 +1,22 @@
+# containerd configuration, Ubuntu variant
+root = "/media/cachy/containerd"
+state = "/run/containerd"
+#subreaper = true
+#oom_score = 0
+
+[grpc]
+address = "/run/containerd/containerd.sock"
+uid = 0
+gid = 0
+
+[debug]
+# address = "/run/containerd/debug.sock"
+# uid = 0
+# gid = 0
+level = "debug"
+
+# The nvidia runtime is selected here because, per ubuntu/README.md, containerd
+# on Ubuntu seems to ignore the /etc/containers directory.
+[plugins]
+[plugins.linux]
+runtime = "nvidia-container-runtime"
diff --git a/ubuntu/etc/systemd/system/concourse-worker@.service b/ubuntu/etc/systemd/system/concourse-worker@.service
new file mode 100644
index 0000000..5076069
--- /dev/null
+++ b/ubuntu/etc/systemd/system/concourse-worker@.service
@@ -0,0 +1,46 @@
+# Template unit: one concourse worker per instance; %i is used as the worker name.
+[Unit]
+Description=concourse worker %i
+After=suspend.target
+After=hibernate.target
+After=hybrid-sleep.target
+# network.target, matching garden.service. NOTE(review): network.service
+# presumably does not exist on Ubuntu 20.04 - confirm.
+After=network.target
+Requires=network.target
+After=garden.service
+Requires=garden.service
+RequiresMountsFor=/media/cachy
+ConditionPathIsDirectory=/media/cachy/concourse
+
+[Service]
+Type=simple
+Restart=always
+
+Environment=CONCOURSE_KEY_DIR=/etc/concourse/keys/worker
+Environment=CONCOURSE_WORK_DIR=/media/cachy/concourse
+#Environment=CONCOURSE_ENABLE_LIDAR=true
+
+# The "-" prefix lets the unit start even if a pre-start command fails.
+ExecStartPre=-/usr/bin/mkdir ${CONCOURSE_WORK_DIR}
+ExecStartPre=-/usr/local/concourse/bin/concourse --version
+# ci.example.com:2222 is a placeholder; set --tsa-host to the real TSA endpoint.
+# --external-garden-url is the port garden.service binds (--bind-port 7777).
+ExecStart=/usr/local/concourse/bin/concourse \
+    worker \
+    --name=%i \
+    --work-dir=${CONCOURSE_WORK_DIR} \
+    --tsa-host=ci.example.com:2222 \
+    --tsa-worker-private-key=${CONCOURSE_KEY_DIR}/%i \
+    --tsa-public-key=${CONCOURSE_KEY_DIR}/tsa_host_key.pub \
+    --external-garden-url=http://localhost:7777/
+
+# Delay before Restart=always brings the worker back up.
+RestartSec=5
+RestartKillSignal=SIGUSR1
+KillMode=process
+KillSignal=SIGUSR2
+TimeoutStopSec=180
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ubuntu/etc/systemd/system/containerd.service b/ubuntu/etc/systemd/system/containerd.service
new file mode 100644
index 0000000..27c9455
--- /dev/null
+++ b/ubuntu/etc/systemd/system/containerd.service
@@ -0,0 +1,13 @@
+# containerd daemon; garden.service requires it and is ordered after it
+[Unit]
+Description=containerd container runtime
+Documentation=https://containerd.io
+After=network.target
+
+[Service]
+ExecStart=/usr/bin/containerd
+Delegate=yes
+KillMode=process
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ubuntu/etc/systemd/system/garden.service b/ubuntu/etc/systemd/system/garden.service
new file mode 100644
index 0000000..049a865
--- /dev/null
+++ b/ubuntu/etc/systemd/system/garden.service
@@ -0,0 +1,39 @@
+# gdn (garden) server; concourse-worker@.service requires it and reaches it on port 7777
+[Unit]
+Description=garden container management
+
+After=suspend.target
+After=hibernate.target
+After=hybrid-sleep.target
+After=network.target
+Requires=network.target
+After=containerd.service
+Requires=containerd.service
+RequiresMountsFor=/media/cachy
+ConditionPathIsDirectory=/media/cachy/concourse
+
+[Service]
+Type=simple
+Restart=always
+LimitNOFILE=50000
+TasksMax=50000
+User=root
+Group=root
+#ExecStartPre=-btrfschk --check --repair --backup /dev/yadayada
+ExecStartPre=-/usr/local/concourse/bin/gdn -v
+# --containerd-socket is the [grpc] address from /etc/containerd/config.toml
+ExecStart=/usr/local/concourse/bin/gdn \
+    --config /etc/concourse/garden.ini \
+    server \
+    --use-containerd-for-processes \
+    --containerd-socket=/run/containerd/containerd.sock \
+    --log-level=info \
+    --bind-ip 127.0.0.1 \
+    --bind-port 7777 \
+    --depot /media/cachy/concourse/depot \
+    --properties-path /media/cachy/concourse/garden-properties.json \
+    --time-format rfc3339 \
+    --no-image-plugin
+
+RestartSec=3
+TimeoutStopSec=120