Skip to content
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
a8b06f4
inital commit
sirutBuasai Nov 5, 2025
2ea4c54
update sglang container and entrypoint
sirutBuasai Nov 5, 2025
2b624d3
add buildspec.yaml
sirutBuasai Nov 5, 2025
d81a343
tmp test qwen
sirutBuasai Nov 6, 2025
110edb6
Merge branch 'sgl' of https://github.com/sirutBuasai/deep-learning-co…
sirutBuasai Nov 6, 2025
dae1cec
Merge branch 'master' into sgl
sirutBuasai Nov 6, 2025
5ea6132
revert vllm
sirutBuasai Nov 6, 2025
91cf705
fix sm path
sirutBuasai Nov 6, 2025
ef528b4
fix sglang entrpoint
sirutBuasai Nov 6, 2025
3d95345
Merge branch 'master' into sgl
sirutBuasai Nov 6, 2025
62eaf27
finalize dockerfile
sirutBuasai Nov 10, 2025
2031b39
add toml file
sirutBuasai Nov 10, 2025
1352c62
add get job type func
sirutBuasai Nov 10, 2025
f803c15
use dict job type
sirutBuasai Nov 10, 2025
b6716a2
add sglang
sirutBuasai Nov 10, 2025
ca48eb4
fix target name
sirutBuasai Nov 10, 2025
3234774
Merge branch 'master' into sgl
sirutBuasai Nov 10, 2025
c6927ad
add tests to buildspec
sirutBuasai Nov 11, 2025
dd97fc1
fix test runner and get framework func
sirutBuasai Nov 11, 2025
e24c955
add job type
sirutBuasai Nov 11, 2025
b4444a9
fix sanity and security tests
sirutBuasai Nov 11, 2025
d9bf7c1
revert run new tests
sirutBuasai Nov 11, 2025
71b1182
formatting
sirutBuasai Nov 11, 2025
2f86d52
fix jobtype func and add sglang general integration sagemaker dir
sirutBuasai Nov 11, 2025
456bdc6
add sglang and vllm to frameworks
sirutBuasai Nov 11, 2025
7309d67
add skip general types
sirutBuasai Nov 11, 2025
2ed025f
fix cuda compat and entrypoint
sirutBuasai Nov 11, 2025
49c31fa
Merge branch 'sgl' of https://github.com/sirutBuasai/deep-learning-co…
sirutBuasai Nov 11, 2025
5637095
fix dlc container type
sirutBuasai Nov 11, 2025
cce1e87
install boto3
sirutBuasai Nov 11, 2025
1927956
add sglang to types
sirutBuasai Nov 11, 2025
8aa5c9c
sgl fix bug
sirutBuasai Nov 11, 2025
a95e10c
add pytest
sirutBuasai Nov 11, 2025
ad5e24d
add print debug
sirutBuasai Nov 11, 2025
c89a8f5
add conftest
sirutBuasai Nov 11, 2025
eb524f7
fix conftest
sirutBuasai Nov 11, 2025
1c13adb
fix fixtures
sirutBuasai Nov 12, 2025
cd8a500
printing responses
sirutBuasai Nov 12, 2025
d7e0f05
fix endpoint name
sirutBuasai Nov 12, 2025
481fa34
remove sm local
sirutBuasai Nov 12, 2025
f2a1eb0
revert sglang
sirutBuasai Nov 12, 2025
4b60ba1
Merge branch 'master' into sgl
sirutBuasai Nov 12, 2025
3dfcb32
revert new test structure
sirutBuasai Nov 12, 2025
5d33f6e
fix syntax
sirutBuasai Nov 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["sglang"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
Expand All @@ -58,7 +58,7 @@ notify_test_failures = false
[test]
# Set to true to use the new test structure path for frameworks
# Off by default (set to false)
use_new_test_structure = false
use_new_test_structure = true

### On by default
sanity_tests = true
Expand Down Expand Up @@ -90,7 +90,7 @@ enable_ipv6 = false
### b. Configure the default security group to allow SSH traffic using IPv4
###
### 3. Create an EFA-enabled security group:
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
### https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security
### b. Configure this security group to also allow SSH traffic via IPv4
ipv6_vpc_name = ""
Expand Down Expand Up @@ -185,3 +185,6 @@ dlc-pr-tensorflow-2-eia-inference = ""

# vllm
dlc-pr-vllm = ""

# sglang
dlc-pr-sglang = ""
42 changes: 42 additions & 0 deletions sglang/build_artifacts/sagemaker_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# SageMaker entrypoint for the SGLang serving container.
# Check if telemetry file exists before executing
# Execute telemetry script if it exists, suppress errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Only attempt CUDA compat setup when both the driver CLI and the CUDA
# compiler are present, i.e. we are on a GPU host with the toolkit installed.
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
bash /usr/local/bin/start_cuda_compat.sh
fi

echo "Starting server"

# Translate SM_SGLANG_* environment variables into launch_server CLI flags:
# SM_SGLANG_FOO_BAR=baz becomes `--foo-bar baz`.
PREFIX="SM_SGLANG_"
ARG_PREFIX="--"

ARGS=()

# Split each matching `env` line on the FIRST '=' only; `read` leaves any
# further '=' characters inside $value, so values containing '=' survive.
while IFS='=' read -r key value; do
# Strip the prefix, lowercase the name, and map underscores to dashes.
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

ARGS+=("${ARG_PREFIX}${arg_name}")
# An empty value produces a bare flag (boolean switch) with no argument.
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}")
# NOTE(review): values containing embedded newlines would break this
# line-oriented env parsing — assumed not to occur for SageMaker config.

# Add default port only if not already set
# NOTE(review): the substring match could false-positive if some VALUE
# contains " --port " — acceptable for controlled SM_SGLANG_* inputs.
if ! [[ " ${ARGS[@]} " =~ " --port " ]]; then
ARGS+=(--port "${SM_SGLANG_PORT:-8080}")
fi

# Add default host only if not already set
if ! [[ " ${ARGS[@]} " =~ " --host " ]]; then
ARGS+=(--host "${SM_SGLANG_HOST:-0.0.0.0}")
fi

# Add default model-path only if not already set
if ! [[ " ${ARGS[@]} " =~ " --model-path " ]]; then
ARGS+=(--model-path "${SM_SGLANG_MODEL_PATH:-/opt/ml/model}")
fi

# exec replaces this shell so the server receives container signals directly.
echo "Running command: exec python3 -m sglang.launch_server ${ARGS[@]}"
exec python3 -m sglang.launch_server "${ARGS[@]}"
25 changes: 25 additions & 0 deletions sglang/build_artifacts/start_cuda_compat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Enable the CUDA forward-compat libraries when the host NVIDIA driver is
# older than the maximum driver version the compat package targets.

# verlte A B: succeeds (0) when A sorts strictly before B under version sort.
# NOTE(review): despite the "lte" in the name, equal versions return 1
# (failure), so a driver exactly at the compat max SKIPS the compat library —
# confirm that is the intended behavior.
verlte() {
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
# The symlink target encodes the max supported driver version,
# e.g. libcuda.so.535.104.05 -> "535.104.05" (fields 3+ after dots).
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
# Prefer the kernel-module version from /proc; fall back to nvidia-smi.
# Both probes are best-effort (|| true) so the script never hard-fails here.
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
fi
echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
echo "Adding CUDA compat to LD_LIBRARY_PATH"
# NOTE(review): this export only affects the current process unless the
# caller sources this script — confirm callers source rather than exec it.
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
echo $LD_LIBRARY_PATH
else
echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
fi
else
echo "Skipping CUDA compat setup as package not found"
fi
55 changes: 55 additions & 0 deletions sglang/buildspec-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK sglang
version: &VERSION "0.5.5"
short_version: &SHORT_VERSION "0.5"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
build_repository: &BUILD_REPOSITORY
image_type: &IMAGE_TYPE gpu
root: .
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
build_context: &BUILD_CONTEXT
deep_learning_container:
source: src/deep_learning_container.py
target: deep_learning_container.py
install_efa:
source: scripts/install_efa.sh
target: install_efa.sh
start_cuda_compat:
source: sglang/build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
sagemaker_entrypoint:
source: sglang/build_artifacts/sagemaker_entrypoint.sh
target: sagemaker_entrypoint.sh

images:
sglang_sm:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 26000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu129
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /Dockerfile ]
target: sglang-sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker
1 change: 1 addition & 0 deletions sglang/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
buildspec_pointer: buildspec-sm.yml
105 changes: 105 additions & 0 deletions sglang/x86_64/gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
FROM lmsysorg/sglang:v0.5.5-cu129-amd64 AS base

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use digest pinning / checksum verification, since this is not an Amazon controlled image.

Copy link
Member Author

@sirutBuasai sirutBuasai Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is by design since we want to consume security patching from upstream. Pinning with a digest version will prevent our downstream image from consuming these patches. By pinning to a specific version rather than latest we are restricting updates on core packages and only consume security patching.

Moreover, docker containers are static post-build by design. This means that after build, the base layer is hashed and will remain static until we trigger a rebuild and re-release of this particular image. This will prevent potential security vulnerabilities that may sneak its way in from upstream.

We are ingesting the base image from this vendor (https://hub.docker.com/r/lmsysorg/sglang/tags) which is a sponsored OSS vendor on Docker hub. Hope this help provide credibility that we are consuming images from a trusted source similar to how we consume our other images from cuda base container or ubuntu base containers.


# ====================================================
# ====================== common ======================
# ====================================================

ARG PYTHON="python3"
ARG EFA_VERSION="1.43.3"

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ENV DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
DLC_CONTAINER_TYPE=general \
# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONIOENCODING=UTF-8 \
LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"

WORKDIR /

# Copy artifacts
# ===============
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY install_efa.sh install_efa.sh
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh

RUN chmod +x /usr/local/bin/deep_learning_container.py \
&& chmod +x /usr/local/bin/bash_telemetry.sh \
&& chmod +x /usr/local/bin/start_cuda_compat.sh

# Install cuda compat
# ====================
# RUN apt-get update \
# && apt-get -y upgrade --only-upgrade systemd \
# && apt-get install -y --allow-change-held-packages --no-install-recommends \
# cuda-compat-12-9 \
# && rm -rf /var/lib/apt/lists/* \
# && apt-get clean

# Install EFA and remove vulnerable nvjpeg
# =========================================
RUN bash install_efa.sh ${EFA_VERSION} \
&& rm install_efa.sh \
&& mkdir -p /tmp/nvjpeg \
&& cd /tmp/nvjpeg \
# latest cu12 libnvjpeg available is cu124
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
&& rm -rf /tmp/nvjpeg \
# create symlink for python
&& rm -rf /usr/bin/python \
&& ln -s /usr/bin/python3 /usr/bin/python \
# remove cuobjdump and nvdisasm
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
&& rm -rf /usr/local/cuda/bin/nvdisasm*

# Run OSS compliance script
# ==========================
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc \
    # OSS compliance - use Python zipfile instead of unzip
    && HOME_DIR=/root \
    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
    && python3 -c "import zipfile, os; zipfile.ZipFile('/root/oss_compliance.zip').extractall('/root/'); os.remove('/root/oss_compliance.zip')" \
    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
    && chmod +x /usr/local/bin/testOSSCompliance \
    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
    # clean up
    && rm -rf ${HOME_DIR}/oss_compliance* \
    && rm -rf /tmp/tmp* \
    && rm -rf /tmp/uv* \
    && rm -rf /var/lib/apt/lists/* \
    # BUGFIX: was `rm -rf /root/.cache | true` — a pipe, which merely fed rm's
    # stdout to `true` and took `true`'s exit status by accident. The intent is
    # "best-effort cleanup"; braces scope `|| true` to this one command so
    # failures of the earlier chained steps still fail the build.
    && { rm -rf /root/.cache || true; }

# =======================================================
# ====================== sagemaker ======================
# =======================================================

FROM base AS sglang-sagemaker

RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold \
&& apt-get update \
&& apt-get upgrade -y \
&& apt-get clean

RUN pip install --no-cache-dir -U \
boto3

RUN rm -rf /tmp/*

COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
2 changes: 2 additions & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"autogluon",
"stabilityai_pytorch",
"base",
"vllm",
"sglang",
}
DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"}
IMAGE_TYPES = {"training", "inference"}
Expand Down
2 changes: 1 addition & 1 deletion src/deep_learning_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--framework",
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm"],
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm", "sglang"],
help="framework of container image.",
required=True,
)
Expand Down
48 changes: 27 additions & 21 deletions src/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,25 @@

import concurrent.futures
import datetime
import itertools
import os
import re
import tempfile

from copy import deepcopy

import constants
import utils
import itertools
import patch_helper

from codebuild_environment import get_codebuild_project_name, get_cloned_folder_path
from config import is_build_enabled, is_autopatch_build_enabled
import utils
from buildspec import Buildspec
from codebuild_environment import get_cloned_folder_path, get_codebuild_project_name
from common_stage_image import CommonStageImage
from config import is_autopatch_build_enabled, is_build_enabled
from context import Context
from metrics import Metrics
from image import DockerImage
from common_stage_image import CommonStageImage
from buildspec import Buildspec
from metrics import Metrics
from output import OutputFormatter
from utils import get_dummy_boto_client


FORMATTER = OutputFormatter(constants.PADDING)
build_context = os.getenv("BUILD_CONTEXT")

Expand Down Expand Up @@ -241,17 +238,7 @@ def image_builder(buildspec, image_types=[], device_types=[]):
)
# Determine job_type (inference, training, or base) based on the image repository URI.
# This is used to set the job_type label on the container image.
if "training" in image_repo_uri:
label_job_type = "training"
elif "inference" in image_repo_uri:
label_job_type = "inference"
elif "base" in image_repo_uri or "vllm" in image_repo_uri:
label_job_type = "general"
else:
raise RuntimeError(
f"Cannot find inference, training or base job type in {image_repo_uri}. "
f"This is required to set job_type label."
)
label_job_type = get_job_type(image_repo_uri)

bash_template_file = os.path.join(
os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "bash_telemetry.sh"
Expand Down Expand Up @@ -690,3 +677,22 @@ def modify_repository_name_for_context(image_repo_uri, build_context):
constants.PR_REPO_PREFIX, constants.NIGHTLY_REPO_PREFIX
)
return "/".join(repo_uri_values)


def get_job_type(image_repo_uri):
    """Derive the job_type label for a container image from its repository URI.

    The URI is scanned for known keywords in a fixed order; the first match
    wins. Training/inference images map to their own label, while base, vllm,
    and sglang images are labelled "general".

    :param image_repo_uri: ECR repository URI of the image being built.
    :return: one of "training", "inference", or "general".
    :raises RuntimeError: if no known keyword appears in the URI.
    """
    job_type_mapping = {
        "training": "training",
        "inference": "inference",
        "base": "general",
        "vllm": "general",
        "sglang": "general",
    }

    # Dicts preserve insertion order, so keywords are tested in the order
    # listed above; return the label of the first keyword found in the URI.
    matched_label = next(
        (label for keyword, label in job_type_mapping.items() if keyword in image_repo_uri),
        None,
    )
    if matched_label is not None:
        return matched_label

    raise RuntimeError(
        f"Cannot determine job type from {image_repo_uri}. "
        f"Expected one of: {', '.join(job_type_mapping.keys())}"
    )
4 changes: 3 additions & 1 deletion test/dlc_tests/sanity/test_anaconda.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def test_repo_anaconda_not_present(image):

# First check to see if image has conda installed, if not, skip test since no packages installed from conda present
conda_present = test_utils.run_cmd_on_container(
container_name, ctx, 'find . -name conda -not -path "**/.github/*" -ignore_readdir_race'
container_name,
ctx,
'find . -name conda -not -path "**/.github/*" -not -path "**/.oh-my-zsh/*" -ignore_readdir_race',
).stdout.strip()
if not conda_present:
pytest.skip(f"Image {image} does not have conda installed, skipping test.")
Expand Down
6 changes: 3 additions & 3 deletions test/dlc_tests/sanity/test_boottime_container_security.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import pytest

from invoke import run


@pytest.mark.usefixtures("sagemaker", "security_sanity")
@pytest.mark.model("N/A")
@pytest.mark.canary("Run security test regularly on production images")
def test_security(image):
if "vllm" in image:
upstream_types = ["vllm"]
if any(t in image for t in upstream_types):
pytest.skip(
"vLLM images do not require pip check as they are managed by vLLM devs. Skipping test."
f"{', '.join(upstream_types)} images do not require boot time security check as they are managed by upstream devs. Skipping test."
)
repo_name, image_tag = image.split("/")[-1].split(":")
container_name = f"{repo_name}-{image_tag}-security"
Expand Down
Loading