Skip to content

Commit 0b3e3db

Browse files
authored
xpu 2.6 update (#3051)
* xpu 2.6 update
  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* install whl
  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* update get xpu memory api
  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* int
  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
* fix awq crash if modules_to_not_convert is None
  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---------
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
1 parent f91434e commit 0b3e3db

File tree

3 files changed

+15
-20
lines changed

3 files changed

+15
-20
lines changed

Dockerfile_intel

Lines changed: 10 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen
4545

4646
# Text Generation Inference base image for Intel
4747

48-
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu
48+
FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS xpu
4949

5050
USER root
5151

@@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/
8787

8888
RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d
8989

90-
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
90+
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc
9191

9292
# Text Generation Inference base env
9393
ENV HF_HOME=/data \
@@ -96,13 +96,11 @@ ENV HF_HOME=/data \
9696

9797

9898

99+
99100
WORKDIR /usr/src
100-
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir
101-
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir
102-
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir
103-
RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir
101+
RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
104102

105-
RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
103+
RUN pip install triton-xpu==3.2.0b1 --no-cache-dir
106104

107105
# Install server
108106
COPY proto proto
@@ -114,15 +112,14 @@ RUN cd server && \
114112
pip install -U pip uv && \
115113
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
116114

117-
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
115+
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
118116
ENV CCL_ZE_IPC_EXCHANGE=sockets
119-
#ENV TORCH_LLM_ALLREDUCE=1
120-
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
117+
ENV TORCH_LLM_ALLREDUCE=1
118+
ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
121119
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
122120

123-
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 1ccf72b2d11cd00b47aef6d6cd054c088aa6f083
124-
RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
125-
121+
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.6.0%2Bxpu-cp311-cp311-linux_x86_64.whl
122+
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.6.10%2Bxpu-cp311-cp311-linux_x86_64.whl
126123
# Install benchmarker
127124
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
128125
# Install router

server/text_generation_server/utils/import_utils.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,14 +18,10 @@ def get_cuda_free_memory(device, memory_fraction):
1818

1919

2020
def get_xpu_free_memory(device, memory_fraction):
21-
total_memory = torch.xpu.get_device_properties(device).total_memory
22-
device_id = device.index
23-
memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "1.0"))
21+
total_free_memory, total_xpu_memory = torch.xpu.mem_get_info(device)
22+
memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "0.9"))
2423
free_memory = max(
25-
0,
26-
int(
27-
total_memory * 0.9 * memory_fraction - torch.xpu.memory_reserved(device_id)
28-
),
24+
0, int(total_free_memory - (1 - memory_fraction) * total_xpu_memory)
2925
)
3026
return free_memory
3127

server/text_generation_server/utils/quantization.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,8 @@ def _get_quantizer_config(model_id, revision):
7979
modules_to_not_convert = data["quantization_config"].get(
8080
"modules_to_not_convert", []
8181
)
82+
if modules_to_not_convert is None:
83+
modules_to_not_convert = []
8284
except Exception:
8385
filename = "quantize_config.json"
8486
try:

0 commit comments

Comments
 (0)