
Commit 3d2e7c8

Optimum neuron 0.2.2 (#3281)
* chore(neuron): use optimum-neuron 0.2.1
* test(neuron): adjust expectations. Since the latest optimum-neuron uses a new modeling for granite and qwen, the greedy outputs are slightly different.
* test(neuron): add phi3 and qwen3 tests
* chore(neuron): use optimum-neuron 0.2.2
Parent commit: f6005d6

3 files changed: 27 additions, 5 deletions


Dockerfile.neuron

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.0.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)

integration-tests/fixtures/neuron/export_models.py

Lines changed: 18 additions & 0 deletions
@@ -46,6 +46,15 @@
             "auto_cast_type": "fp16",
         },
     },
+    "qwen3": {
+        "model_id": "Qwen/Qwen3-1.7B",
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
+    },
     "granite": {
         "model_id": "ibm-granite/granite-3.1-2b-instruct",
         "export_kwargs": {
@@ -55,6 +64,15 @@
             "auto_cast_type": "bf16",
         },
     },
+    "phi3": {
+        "model_id": "microsoft/Phi-3-mini-4k-instruct",
+        "export_kwargs": {
+            "batch_size": 4,
+            "sequence_length": 4096,
+            "num_cores": 2,
+            "auto_cast_type": "bf16",
+        },
+    },
 }
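
For reference, the export_kwargs in the new qwen3 and phi3 entries correspond to optimum-neuron's static compilation arguments (batch size, sequence length, core count, cast dtype). The snippet below is a minimal sketch of how such an entry is typically turned into a compiled Neuron model; it assumes the NeuronModelForCausalLM export API and a host with the Neuron SDK installed, and the fixture itself may drive the export differently.

# Minimal sketch (assumption, not the fixture's actual code): compile a model
# for Neuron using the same kwargs as the "qwen3" entry above.
from optimum.neuron import NeuronModelForCausalLM

export_kwargs = {
    "batch_size": 4,
    "sequence_length": 4096,
    "num_cores": 2,
    "auto_cast_type": "bf16",
}

# export=True triggers compilation with the given static shapes and dtype.
neuron_model = NeuronModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-1.7B",
    export=True,
    **export_kwargs,
)
neuron_model.save_pretrained("qwen3-neuron")  # hypothetical output directory

The same four parameters appear in every fixture entry because Neuron compilation fixes the batch size and sequence length up front.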

integration-tests/neuron/test_generate.py

Lines changed: 8 additions & 4 deletions
@@ -21,8 +21,10 @@ async def test_model_single_request(tgi_service):
     assert response.details.generated_tokens == 17
     greedy_expectations = {
         "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
-        "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on",
-        "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art",
+        "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
+        "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks",
+        "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to",
+        "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
     }
     assert response.generated_text == greedy_expectations[service_name]
@@ -78,8 +80,10 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     assert len(responses) == 4
     expectations = {
         "llama": "Deep learning is a subset of machine learning that uses artificial",
-        "qwen2": "Deep Learning is a subset of Machine Learning that is based on",
-        "granite": "Deep Learning is a subset of Machine Learning, which is a branch of Art",
+        "qwen2": "Deep Learning is a subset of Machine Learning that involves",
+        "granite": "Deep learning is a subset of machine learning techniques",
+        "qwen3": "Deep Learning is a subset of machine learning that uses neural networks",
+        "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
     }
     expected = expectations[tgi_service.client.service_name]
     for r in responses:
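
The updated strings are greedy (non-sampled) continuations of the test prompt, keyed by the fixture's service name; the single-request test asserts exactly 17 generated tokens. As a rough way to regenerate such an expectation against a running endpoint, the following hypothetical snippet could be used; it assumes a server already listening on http://localhost:8080 and uses huggingface_hub's AsyncInferenceClient rather than whatever client the integration tests actually wrap.

# Hypothetical reproduction of a greedy expectation string.
# Assumptions: a TGI neuron server is already running locally, and the prompt
# below is a placeholder; neither detail is taken from this diff.
import asyncio

from huggingface_hub import AsyncInferenceClient

async def main():
    client = AsyncInferenceClient("http://localhost:8080")
    # Greedy decoding (do_sample=False) with 17 new tokens, as asserted above.
    text = await client.text_generation(
        "What is Deep Learning?",  # placeholder prompt, not from the diff
        max_new_tokens=17,
        do_sample=False,
    )
    print(repr(text))

asyncio.run(main())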
