Skip to content
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
a8b06f4
inital commit
sirutBuasai Nov 5, 2025
2ea4c54
update sglang container and entrypoint
sirutBuasai Nov 5, 2025
2b624d3
add buildspec.yaml
sirutBuasai Nov 5, 2025
d81a343
tmp test qwen
sirutBuasai Nov 6, 2025
110edb6
Merge branch 'sgl' of https://github.com/sirutBuasai/deep-learning-co…
sirutBuasai Nov 6, 2025
dae1cec
Merge branch 'master' into sgl
sirutBuasai Nov 6, 2025
5ea6132
revert vllm
sirutBuasai Nov 6, 2025
91cf705
fix sm path
sirutBuasai Nov 6, 2025
ef528b4
fix sglang entrpoint
sirutBuasai Nov 6, 2025
3d95345
Merge branch 'master' into sgl
sirutBuasai Nov 6, 2025
62eaf27
finalize dockerfile
sirutBuasai Nov 10, 2025
2031b39
add toml file
sirutBuasai Nov 10, 2025
1352c62
add get job type func
sirutBuasai Nov 10, 2025
f803c15
use dict job type
sirutBuasai Nov 10, 2025
b6716a2
add sglang
sirutBuasai Nov 10, 2025
ca48eb4
fix target name
sirutBuasai Nov 10, 2025
3234774
Merge branch 'master' into sgl
sirutBuasai Nov 10, 2025
c6927ad
add tests to buildspec
sirutBuasai Nov 11, 2025
dd97fc1
fix test runner and get framework func
sirutBuasai Nov 11, 2025
e24c955
add job type
sirutBuasai Nov 11, 2025
b4444a9
fix sanity and security tests
sirutBuasai Nov 11, 2025
d9bf7c1
revert run new tests
sirutBuasai Nov 11, 2025
71b1182
formatting
sirutBuasai Nov 11, 2025
2f86d52
fix jobtype func and add sglang general integration sagemaker dir
sirutBuasai Nov 11, 2025
456bdc6
add sglang and vllm to frameworks
sirutBuasai Nov 11, 2025
7309d67
add skip general types
sirutBuasai Nov 11, 2025
2ed025f
fix cuda compat and entrypoint
sirutBuasai Nov 11, 2025
49c31fa
Merge branch 'sgl' of https://github.com/sirutBuasai/deep-learning-co…
sirutBuasai Nov 11, 2025
5637095
fix dlc container type
sirutBuasai Nov 11, 2025
cce1e87
install boto3
sirutBuasai Nov 11, 2025
1927956
add sglang to types
sirutBuasai Nov 11, 2025
8aa5c9c
sgl fix bug
sirutBuasai Nov 11, 2025
a95e10c
add pytest
sirutBuasai Nov 11, 2025
ad5e24d
add print debug
sirutBuasai Nov 11, 2025
c89a8f5
add conftest
sirutBuasai Nov 11, 2025
eb524f7
fix conftest
sirutBuasai Nov 11, 2025
1c13adb
fix fixtures
sirutBuasai Nov 12, 2025
cd8a500
printing responses
sirutBuasai Nov 12, 2025
d7e0f05
fix endpoint name
sirutBuasai Nov 12, 2025
481fa34
remove sm local
sirutBuasai Nov 12, 2025
f2a1eb0
revert sglang
sirutBuasai Nov 12, 2025
4b60ba1
Merge branch 'master' into sgl
sirutBuasai Nov 12, 2025
3dfcb32
revert new test structure
sirutBuasai Nov 12, 2025
5d33f6e
fix syntax
sirutBuasai Nov 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["sglang"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
Expand All @@ -58,7 +58,7 @@ notify_test_failures = false
[test]
# Set to true to use the new test structure path for frameworks
# Off by default (set to false)
use_new_test_structure = false
use_new_test_structure = true

### On by default
sanity_tests = true
Expand Down Expand Up @@ -90,7 +90,7 @@ enable_ipv6 = false
### b. Configure the default security group to allow SSH traffic using IPv4
###
### 3. Create an EFA-enabled security group:
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
### https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security
### b. Configure this security group to also allow SSH traffic via IPv4
ipv6_vpc_name = ""
Expand Down Expand Up @@ -185,3 +185,6 @@ dlc-pr-tensorflow-2-eia-inference = ""

# vllm
dlc-pr-vllm = ""

# sglang
dlc-pr-sglang = ""
42 changes: 42 additions & 0 deletions sglang/build_artifacts/sagemaker_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# SageMaker entrypoint for the SGLang serving container.
# Check if telemetry file exists before executing
# Execute telemetry script if it exists, suppress errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Only attempt CUDA compat setup when both the driver CLI and the CUDA
# compiler are present, i.e. we are on a GPU host with the toolkit installed.
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
bash /usr/local/bin/start_cuda_compat.sh
fi

echo "Starting server"

# Translate SM_SGLANG_* environment variables into launch_server CLI flags:
# SM_SGLANG_FOO_BAR=baz becomes `--foo-bar baz`.
PREFIX="SM_SGLANG_"
ARG_PREFIX="--"

ARGS=()

# Split each matching `env` line on the FIRST '=' only; `read` leaves any
# further '=' characters inside $value, so values containing '=' survive.
while IFS='=' read -r key value; do
# Strip the prefix, lowercase the name, and map underscores to dashes.
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

ARGS+=("${ARG_PREFIX}${arg_name}")
# An empty value produces a bare flag (boolean switch) with no argument.
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}")
# NOTE(review): values containing embedded newlines would break this
# line-oriented env parsing — assumed not to occur for SageMaker config.

# Add default port only if not already set
# NOTE(review): the substring match could false-positive if some VALUE
# contains " --port " — acceptable for controlled SM_SGLANG_* inputs.
if ! [[ " ${ARGS[@]} " =~ " --port " ]]; then
ARGS+=(--port "${SM_SGLANG_PORT:-8080}")
fi

# Add default host only if not already set
if ! [[ " ${ARGS[@]} " =~ " --host " ]]; then
ARGS+=(--host "${SM_SGLANG_HOST:-0.0.0.0}")
fi

# Add default model-path only if not already set
if ! [[ " ${ARGS[@]} " =~ " --model-path " ]]; then
ARGS+=(--model-path "${SM_SGLANG_MODEL_PATH:-/opt/ml/model}")
fi

# exec replaces this shell so the server receives container signals directly.
echo "Running command: exec python3 -m sglang.launch_server ${ARGS[@]}"
exec python3 -m sglang.launch_server "${ARGS[@]}"
25 changes: 25 additions & 0 deletions sglang/build_artifacts/start_cuda_compat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Enable the CUDA forward-compat libraries when the host NVIDIA driver is
# older than the maximum driver version the compat package targets.

# verlte A B: succeeds (0) when A sorts strictly before B under version sort.
# NOTE(review): despite the "lte" in the name, equal versions return 1
# (failure), so a driver exactly at the compat max SKIPS the compat library —
# confirm that is the intended behavior.
verlte() {
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
# The symlink target encodes the max supported driver version,
# e.g. libcuda.so.535.104.05 -> "535.104.05" (fields 3+ after dots).
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
# Prefer the kernel-module version from /proc; fall back to nvidia-smi.
# Both probes are best-effort (|| true) so the script never hard-fails here.
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
fi
echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
echo "Adding CUDA compat to LD_LIBRARY_PATH"
# NOTE(review): this export only affects the current process unless the
# caller sources this script — confirm callers source rather than exec it.
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
echo $LD_LIBRARY_PATH
else
echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
fi
else
echo "Skipping CUDA compat setup as package not found"
fi
55 changes: 55 additions & 0 deletions sglang/buildspec-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK sglang
version: &VERSION "0.5.5"
short_version: &SHORT_VERSION "0.5"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

repository_info:
build_repository: &BUILD_REPOSITORY
image_type: &IMAGE_TYPE gpu
root: .
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

context:
build_context: &BUILD_CONTEXT
deep_learning_container:
source: src/deep_learning_container.py
target: deep_learning_container.py
install_efa:
source: scripts/install_efa.sh
target: install_efa.sh
start_cuda_compat:
source: sglang/build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
sagemaker_entrypoint:
source: sglang/build_artifacts/sagemaker_entrypoint.sh
target: sagemaker_entrypoint.sh

images:
sglang_sm:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 26000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu129
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
os_version: &OS_VERSION ubuntu22.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /Dockerfile ]
target: sglang-sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker
1 change: 1 addition & 0 deletions sglang/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
buildspec_pointer: buildspec-sm.yml
105 changes: 105 additions & 0 deletions sglang/x86_64/gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
FROM lmsysorg/sglang:v0.5.5-cu129-amd64 AS base

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use digest pinning / checksum verification, since this is not an Amazon controlled image.

Copy link
Member Author

@sirutBuasai sirutBuasai Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is by design since we want to consume security patching from upstream. Pinning with a digest version will prevent our downstream image from consuming these patches. By pinning to a specific version rather than latest we are restricting updates on core packages and only consume security patching.

Moreover, docker containers are static post-build by design. This means that after build, the base layer is hashed and will remain static until we trigger a rebuild and re-release of this particular image. This will prevent potential security vulnerabilities that may sneak its way in from upstream.

We are ingesting the base image from this vendor (https://hub.docker.com/r/lmsysorg/sglang/tags) which is a sponsored OSS vendor on Docker hub. Hope this help provide credibility that we are consuming images from a trusted source similar to how we consume our other images from cuda base container or ubuntu base containers.


# ====================================================
# ====================== common ======================
# ====================================================

ARG PYTHON="python3"
ARG EFA_VERSION="1.43.3"

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ENV DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
DLC_CONTAINER_TYPE=general \
# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONIOENCODING=UTF-8 \
LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"

WORKDIR /

# Copy artifacts
# ===============
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY install_efa.sh install_efa.sh
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh

RUN chmod +x /usr/local/bin/deep_learning_container.py \
&& chmod +x /usr/local/bin/bash_telemetry.sh \
&& chmod +x /usr/local/bin/start_cuda_compat.sh

# Install cuda compat
# ====================
# RUN apt-get update \
# && apt-get -y upgrade --only-upgrade systemd \
# && apt-get install -y --allow-change-held-packages --no-install-recommends \
# cuda-compat-12-9 \
# && rm -rf /var/lib/apt/lists/* \
# && apt-get clean

# Install EFA and remove vulnerable nvjpeg
# =========================================
RUN bash install_efa.sh ${EFA_VERSION} \
&& rm install_efa.sh \
&& mkdir -p /tmp/nvjpeg \
&& cd /tmp/nvjpeg \
# latest cu12 libnvjpeg available is cu124
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
&& rm -rf /tmp/nvjpeg \
# create symlink for python
&& rm -rf /usr/bin/python \
&& ln -s /usr/bin/python3 /usr/bin/python \
# remove cuobjdump and nvdisasm
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
&& rm -rf /usr/local/cuda/bin/nvdisasm*

# Run OSS compliance script
# ==========================
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc \
    # OSS compliance - use Python zipfile instead of unzip
    && HOME_DIR=/root \
    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
    && python3 -c "import zipfile, os; zipfile.ZipFile('/root/oss_compliance.zip').extractall('/root/'); os.remove('/root/oss_compliance.zip')" \
    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
    && chmod +x /usr/local/bin/testOSSCompliance \
    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
    # clean up
    && rm -rf ${HOME_DIR}/oss_compliance* \
    && rm -rf /tmp/tmp* \
    && rm -rf /tmp/uv* \
    && rm -rf /var/lib/apt/lists/* \
    # BUGFIX: was `rm -rf /root/.cache | true` — a pipe, which merely fed rm's
    # stdout to `true` and took `true`'s exit status by accident. The intent is
    # "best-effort cleanup"; braces scope `|| true` to this one command so
    # failures of the earlier chained steps still fail the build.
    && { rm -rf /root/.cache || true; }

# =======================================================
# ====================== sagemaker ======================
# =======================================================

FROM base AS sglang-sagemaker

RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold \
&& apt-get update \
&& apt-get upgrade -y \
&& apt-get clean

RUN pip install --no-cache-dir -U \
boto3

RUN rm -rf /tmp/*

COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
2 changes: 2 additions & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"autogluon",
"stabilityai_pytorch",
"base",
"vllm",
"sglang",
}
DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"}
IMAGE_TYPES = {"training", "inference"}
Expand Down
2 changes: 1 addition & 1 deletion src/deep_learning_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--framework",
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm"],
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm", "sglang"],
help="framework of container image.",
required=True,
)
Expand Down
48 changes: 27 additions & 21 deletions src/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,25 @@

import concurrent.futures
import datetime
import itertools
import os
import re
import tempfile

from copy import deepcopy

import constants
import utils
import itertools
import patch_helper

from codebuild_environment import get_codebuild_project_name, get_cloned_folder_path
from config import is_build_enabled, is_autopatch_build_enabled
import utils
from buildspec import Buildspec
from codebuild_environment import get_cloned_folder_path, get_codebuild_project_name
from common_stage_image import CommonStageImage
from config import is_autopatch_build_enabled, is_build_enabled
from context import Context
from metrics import Metrics
from image import DockerImage
from common_stage_image import CommonStageImage
from buildspec import Buildspec
from metrics import Metrics
from output import OutputFormatter
from utils import get_dummy_boto_client


FORMATTER = OutputFormatter(constants.PADDING)
build_context = os.getenv("BUILD_CONTEXT")

Expand Down Expand Up @@ -241,17 +238,7 @@ def image_builder(buildspec, image_types=[], device_types=[]):
)
# Determine job_type (inference, training, or base) based on the image repository URI.
# This is used to set the job_type label on the container image.
if "training" in image_repo_uri:
label_job_type = "training"
elif "inference" in image_repo_uri:
label_job_type = "inference"
elif "base" in image_repo_uri or "vllm" in image_repo_uri:
label_job_type = "general"
else:
raise RuntimeError(
f"Cannot find inference, training or base job type in {image_repo_uri}. "
f"This is required to set job_type label."
)
label_job_type = get_job_type(image_repo_uri)

bash_template_file = os.path.join(
os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "bash_telemetry.sh"
Expand Down Expand Up @@ -690,3 +677,22 @@ def modify_repository_name_for_context(image_repo_uri, build_context):
constants.PR_REPO_PREFIX, constants.NIGHTLY_REPO_PREFIX
)
return "/".join(repo_uri_values)


def get_job_type(image_repo_uri):
    """Derive the job_type label for a container image from its repository URI.

    The URI is scanned for known keywords in a fixed order; the first match
    wins. Training/inference images map to their own label, while base, vllm,
    and sglang images are labelled "general".

    :param image_repo_uri: ECR repository URI of the image being built.
    :return: one of "training", "inference", or "general".
    :raises RuntimeError: if no known keyword appears in the URI.
    """
    job_type_mapping = {
        "training": "training",
        "inference": "inference",
        "base": "general",
        "vllm": "general",
        "sglang": "general",
    }

    # Dicts preserve insertion order, so keywords are tested in the order
    # listed above; return the label of the first keyword found in the URI.
    matched_label = next(
        (label for keyword, label in job_type_mapping.items() if keyword in image_repo_uri),
        None,
    )
    if matched_label is not None:
        return matched_label

    raise RuntimeError(
        f"Cannot determine job type from {image_repo_uri}. "
        f"Expected one of: {', '.join(job_type_mapping.keys())}"
    )
4 changes: 3 additions & 1 deletion test/dlc_tests/sanity/test_anaconda.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def test_repo_anaconda_not_present(image):

# First check to see if image has conda installed, if not, skip test since no packages installed from conda present
conda_present = test_utils.run_cmd_on_container(
container_name, ctx, 'find . -name conda -not -path "**/.github/*" -ignore_readdir_race'
container_name,
ctx,
'find . -name conda -not -path "**/.github/*" -not -path "**/.oh-my-zsh/*" -ignore_readdir_race',
).stdout.strip()
if not conda_present:
pytest.skip(f"Image {image} does not have conda installed, skipping test.")
Expand Down
6 changes: 3 additions & 3 deletions test/dlc_tests/sanity/test_boottime_container_security.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import pytest

from invoke import run


@pytest.mark.usefixtures("sagemaker", "security_sanity")
@pytest.mark.model("N/A")
@pytest.mark.canary("Run security test regularly on production images")
def test_security(image):
if "vllm" in image:
upstream_types = ["vllm"]
if any(t in image for t in upstream_types):
pytest.skip(
"vLLM images do not require pip check as they are managed by vLLM devs. Skipping test."
f"{', '.join(upstream_types)} images do not require boot time security check as they are managed by upstream devs. Skipping test."
)
repo_name, image_tag = image.split("/")[-1].split(":")
container_name = f"{repo_name}-{image_tag}-security"
Expand Down
Loading