Skip to content

Commit f01dc9e

Browse files
authored
Update neuron backend (#3098)
* feat(neuron): use AWS Neuron SDK 2.21.1
* feat(neuron): bump optimum-neuron version
* feat(neuron): tag latest image for local tests
* test(neuron): simplify sampling test
1 parent 5c5528e commit f01dc9e

File tree

3 files changed

+17
-22
lines changed

3 files changed

+17
-22
lines changed

Dockerfile.neuron

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
55
# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
66
FROM alpine AS optimum-neuron
77
RUN mkdir -p /optimum-neuron
8-
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
8+
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
99
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
1010

1111
# Build cargo components (adapted from TGI original Dockerfile)
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
108108
# Install neuronx packages
109109
RUN apt-get update -y \
110110
&& apt-get install -y --no-install-recommends \
111-
aws-neuronx-dkms=2.18.20.0 \
112-
aws-neuronx-collectives=2.22.33.0-d2128d1aa \
113-
aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
114-
aws-neuronx-tools=2.19.0.0 \
111+
aws-neuronx-dkms=2.19.64.0 \
112+
aws-neuronx-collectives=2.23.135.0-3e70920f2 \
113+
aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
114+
aws-neuronx-tools=2.20.204.0 \
115115
libxml2 \
116116
&& rm -rf /var/lib/apt/lists/* \
117117
&& apt-get clean
@@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
120120

121121
# Install manually torch CPU version to avoid pulling CUDA
122122
RUN pip3 install \
123-
torch==2.1.2 \
124-
torchvision==0.16.2 \
123+
torch==2.5.1 \
124+
torchvision==0.20.1 \
125125
--index-url https://download.pytorch.org/whl/cpu
126126

127127
RUN pip3 install \
128-
neuronx-cc==2.15.143.0 \
129-
torch-neuronx==2.1.2.2.3.2 \
130-
transformers-neuronx==0.12.313 \
131-
neuronx-distributed==0.9.0 \
132-
libneuronxla==2.0.5347.0 \
128+
neuronx-cc==2.16.372.0 \
129+
torch-neuronx==2.5.1.2.4.0 \
130+
transformers-neuronx==0.13.322 \
131+
neuronx-distributed==0.10.1 \
132+
libneuronxla==2.1.681.0 \
133133
--extra-index-url=https://pip.repos.neuron.amazonaws.com
134134

135135
# Install HuggingFace packages

backends/neuron/Makefile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ image:
2525
--ulimit nofile=100000:100000 \
2626
--build-arg VERSION=$(VERSION) \
2727
-t text-generation-inference:$(VERSION)-neuron ${root_dir}
28+
docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron
2829

2930
install_server:
3031
make -C ${mkfile_dir}/server install VERSION:=${VERSION}

integration-tests/neuron/test_generate.py

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service):
4949
max_new_tokens=128,
5050
seed=42,
5151
)
52-
sample_expectations = {
53-
"gpt2": "Deep Learning",
54-
"llama": "Deep Learning",
55-
"mistral": "Deep learning",
56-
"qwen2": "Deep Learning",
57-
"granite": "Deep learning",
58-
}
59-
assert sample_expectations[service_name] in response
52+
# The response must be different
53+
assert not response.startswith(greedy_expectations[service_name])
6054

61-
# Sampling with stop sequence
62-
stop_sequence = sample_expectations[service_name][-5:]
55+
# Sampling with stop sequence (using one of the words returned from the previous test)
56+
stop_sequence = response.split(" ")[-5]
6357
response = await tgi_service.client.text_generation(
6458
"What is Deep Learning?",
6559
do_sample=True,

0 commit comments

Comments
 (0)