Skip to content

Commit 20acc21

Browse files
vLLM Sleep mode blog (#106)
Signed-off-by: PinSiang <pinsiang.tan@embeddedllm.com> Co-authored-by: youkaichao <youkaichao@gmail.com>
1 parent f84665f commit 20acc21

15 files changed

+1994
-0
lines changed

_posts/2025-10-26-sleep-mode.md

Lines changed: 471 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
document.addEventListener('DOMContentLoaded', function() {
2+
// Ablation inference data: BF16 vs FP8.
// Each entry carries the display name used on the x-axis plus three measured
// inference times per precision. The chart labels these values in seconds.
const ablationInferenceData = {
  "ModelA": {
    name: "Qwen3-0.6B",
    bf16: [0.41, 0.4, 0.41],
    fp8: [0.43, 0.43, 0.45]
  },
  "ModelB": {
    name: "Phi-3-vision-128k",
    bf16: [0.9, 0.74, 0.8],
    fp8: [0.69, 0.59, 0.44]
  }
};
15+
16+
/**
 * Summarise repeated measurements as a mean with asymmetric error extents.
 *
 * @param {number[]} values - Measurements for one model/precision pair.
 * @returns {{mean: number, errorMinus: number, errorPlus: number}}
 *   `mean` is the arithmetic mean, `errorMinus` the distance from the mean
 *   down to the smallest value, and `errorPlus` the distance from the mean up
 *   to the largest value. For an empty array all three fields are NaN.
 */
function calcStatsAblInf(values) {
  const total = values.reduce((sum, value) => sum + value, 0);
  const mean = total / values.length;
  const min = Math.min(...values);
  const max = Math.max(...values);
  return { mean, errorMinus: mean - min, errorPlus: max - mean };
}
22+
23+
// Model keys and their per-precision statistics, all in the key order of
// ablationInferenceData so that index i refers to the same model everywhere.
const modelsAblInf = Object.keys(ablationInferenceData);
const bf16StatsInf = modelsAblInf.map(m => calcStatsAblInf(ablationInferenceData[m].bf16));
const fp8StatsInf = modelsAblInf.map(m => calcStatsAblInf(ablationInferenceData[m].fp8));

/**
 * Build one bar trace of the grouped chart.
 *
 * @param {string} label - Legend name, also used in the hover text.
 * @param {{mean: number, errorMinus: number, errorPlus: number}[]} stats -
 *   One statistics record per model, in modelsAblInf order.
 * @param {string} barColor - Fill colour of the bars and of their value labels.
 * @param {string} errorColor - Colour of the error bars.
 * @returns {object} Plotly bar trace with asymmetric error bars.
 */
const makeTraceAblInf = (label, stats, barColor, errorColor) => ({
  x: modelsAblInf.map(m => ablationInferenceData[m].name),
  y: stats.map(s => s.mean),
  name: label,
  type: "bar",
  marker: { color: barColor },
  error_y: {
    type: "data",
    symmetric: false,
    array: stats.map(s => s.errorPlus),
    arrayminus: stats.map(s => s.errorMinus),
    color: errorColor,
    thickness: 2,
    width: 6
  },
  text: stats.map(s => `${s.mean.toFixed(2)}s`),
  textposition: "outside",
  textfont: { size: 12, color: barColor, weight: "bold" },
  hovertemplate: `<b>%{x}</b><br>${label}: %{y:.2f}s<extra></extra>`
});

const bf16TraceInf = makeTraceAblInf("BF16", bf16StatsInf, "#1f77b4", "#0d4a6e");
const fp8TraceInf = makeTraceAblInf("FP8", fp8StatsInf, "#ff7f0e", "#cc6600");

// Highest error-bar tip across both precisions; the axis adds 25% headroom
// above it so the "outside" value labels are not clipped.
const yTopAblInf = Math.max(
  ...bf16StatsInf.map(s => s.mean + s.errorPlus),
  ...fp8StatsInf.map(s => s.mean + s.errorPlus)
);

Plotly.newPlot("plotly-ablation-inference", [bf16TraceInf, fp8TraceInf], {
  barmode: "group",
  bargap: 0.15,
  bargroupgap: 0.1,
  margin: { l: 60, r: 30, t: 40, b: 50 },
  xaxis: {
    title: "",
    tickangle: 0
  },
  yaxis: {
    title: "Inference Time (seconds)",
    range: [0, yTopAblInf * 1.25]
  },
  hovermode: "closest",
  legend: {
    x: 0.5,
    y: 1.15,
    xanchor: "center",
    yanchor: "top",
    orientation: "h"
  }
}, {displayModeBar: true, responsive: true});
91+
});
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
document.addEventListener('DOMContentLoaded', function() {
2+
// Ablation study: BF16 vs FP8 quantization.
// One ordered event list per scenario. Each event name starts with the model
// letter ("A" or "B") followed by the stage text; `duration` is how long the
// event took (the chart's x-axis is labelled in seconds). Events are laid out
// back to back in list order when converted to timeline segments.
const timingDataAblation = {
  "Sleep Mode (BF16)": [
    { event: "A Model Load", duration: 32.56 },
    { event: "A Model Warm Up", duration: 2.69 },
    { event: "B Model Load", duration: 57.96 },
    { event: "B Model Warm Up", duration: 5.92 },
    { event: "A Model Wake up", duration: 0.28 },
    { event: "A Model Prompt", duration: 0.41 },
    { event: "A Model Sleep", duration: 0.09 },
    { event: "B Model Wake Up", duration: 0.89 },
    { event: "B Model Prompt", duration: 0.9 },
    { event: "B Model Sleep", duration: 0.48 },
    { event: "A Model Wake up", duration: 0.27 },
    { event: "A Model Prompt", duration: 0.4 },
    { event: "A Model Sleep", duration: 0.1 },
    { event: "B Model Wake Up", duration: 0.93 },
    { event: "B Model Prompt", duration: 0.74 },
    { event: "B Model Sleep", duration: 0.5 },
    { event: "A Model Wake up", duration: 0.27 },
    { event: "A Model Prompt", duration: 0.41 },
    { event: "A Model Sleep", duration: 0.1 },
    { event: "B Model Wake Up", duration: 0.88 },
    { event: "B Model Prompt", duration: 0.8 }
  ],
  "Sleep Mode (FP8)": [
    { event: "A Model Load", duration: 37.71 },
    { event: "A Model Warm Up", duration: 2.34 },
    { event: "B Model Load", duration: 57.79 },
    { event: "B Model Warm Up", duration: 6.37 },
    { event: "A Model Wake up", duration: 0.18 },
    { event: "A Model Prompt", duration: 0.43 },
    { event: "A Model Sleep", duration: 0.06 },
    { event: "B Model Wake Up", duration: 0.79 },
    { event: "B Model Prompt", duration: 0.69 },
    { event: "B Model Sleep", duration: 0.31 },
    { event: "A Model Wake up", duration: 0.19 },
    { event: "A Model Prompt", duration: 0.43 },
    { event: "A Model Sleep", duration: 0.06 },
    { event: "B Model Wake Up", duration: 0.77 },
    { event: "B Model Prompt", duration: 0.59 },
    { event: "B Model Sleep", duration: 0.31 },
    { event: "A Model Wake up", duration: 0.16 },
    { event: "A Model Prompt", duration: 0.45 },
    { event: "A Model Sleep", duration: 0.07 },
    { event: "B Model Wake Up", duration: 0.78 },
    { event: "B Model Prompt", duration: 0.44 }
  ]
};
51+
52+
/**
 * Convert per-scenario event lists into positioned timeline segments.
 *
 * Within each scenario the events are placed back to back starting at 0, so a
 * segment's `start` is the sum of the durations of the events before it.
 *
 * @param {Object<string, {event: string, duration: number}[]>} timingData -
 *   Scenario name mapped to its ordered events. The first space-separated
 *   word of `event` is taken as the model label, the remainder as the stage.
 * @returns {{scenario: string, who: string, stage: string,
 *   action: (string|undefined), start: number, end: number, duration: number,
 *   category: (string|undefined)}[]}
 *   One segment per event, scenarios concatenated in object key order.
 *   `action` and `category` are undefined when the stage text matches none of
 *   the known keywords.
 */
function createSegmentsAblation(timingData) {
  // [keyword, action] pairs checked in order; the first keyword contained in
  // the stage text wins. "Warm" maps to "Load" so warm-up segments share the
  // load category.
  const stageRules = [
    ['Load', 'Load'],
    ['Wake', 'Wake'],
    ['Prompt', 'Prompt'],
    ['Sleep', 'Sleep'],
    ['Warm', 'Load']
  ];

  const segments = [];

  Object.entries(timingData).forEach(([scenario, events]) => {
    let cumulativeTime = 0;

    events.forEach(({ event, duration }) => {
      const [who, ...stageParts] = event.split(' ');
      const stage = stageParts.join(' ');

      const rule = stageRules.find(([keyword]) => stage.includes(keyword));
      const action = rule ? rule[1] : undefined;
      const category = rule ? `${who} ${action}` : undefined;

      segments.push({
        scenario,
        who,
        stage,
        action,
        start: cumulativeTime,
        end: cumulativeTime + duration,
        duration,
        category
      });

      cumulativeTime += duration;
    });
  });

  return segments;
}
98+
99+
const segmentsAblation = createSegmentsAblation(timingDataAblation);

// Fixed colour per "<model> <action>" category. The legend entries below are
// created in this key order.
const colorMapAblation = {
  "A Load": "#1f77b4",
  "B Load": "#ff7f0e",
  "A Wake": "#2ca02c",
  "B Wake": "#17becf",
  "A Sleep": "#9467bd",
  "B Sleep": "#8c564b",
  "A Prompt": "#e377c2",
  "B Prompt": "#7f7f7f"
};
const categoriesAblation = Object.keys(colorMapAblation);

// Parallel arrays, one element per segment, in the shape the bar trace expects.
const xAblation = segmentsAblation.map(d => d.duration);
const baseAblation = segmentsAblation.map(d => d.start);
const yAblation = segmentsAblation.map(d => d.scenario);
const colorsAblation = segmentsAblation.map(d => colorMapAblation[d.category]);
const customAblation = segmentsAblation.map(d => [d.scenario, d.category, d.stage, d.start, d.end]);

// A single horizontal bar trace holds every segment: `x` is the bar length and
// `base` its starting offset, so each bar spans [start, end] on its scenario row.
const barsAblation = {
  type: "bar",
  orientation: "h",
  x: xAblation, base: baseAblation, y: yAblation,
  marker: { color: colorsAblation, line: {width:1, color:"rgba(0,0,0,0.35)"} },
  hovertemplate:
    "<b>%{customdata[0]}</b><br>%{customdata[1]} — %{customdata[2]}<br>"+
    "Start %{customdata[3]:.2f}s → End %{customdata[4]:.2f}s<br>"+
    "<b>%{x:.2f}s</b><extra></extra>",
  customdata: customAblation,
  showlegend: false
};

// The bar trace is hidden from the legend, so one point-less marker trace per
// category is added purely to produce a coloured legend entry.
const legendTracesAblation = categoriesAblation.map(name => ({
  type: "scatter", mode: "markers", x:[null], y:[null],
  name, marker: {color: colorMapAblation[name], size: 10},
  hoverinfo:"skip", showlegend:true
}));

// Upper x-axis bound: the longest scenario rounded up to the next multiple of
// 5 (115 for the current data), so the axis keeps fitting if durations change.
// Falls back to the previous fixed bound of 115 when no usable end time exists.
const timelineEndAblation = Math.max(0, ...segmentsAblation.map(d => d.end));
const xMaxAblation = timelineEndAblation > 0 ? Math.ceil(timelineEndAblation / 5) * 5 : 115;

Plotly.newPlot("plotly-ablation-quant", [barsAblation, ...legendTracesAblation], {
  barmode: "overlay",
  bargap: 0.05,
  margin: {l: 140, r: 30, t: 20, b: 40},
  xaxis: { title: "Time (seconds)", range: [0, xMaxAblation] },
  yaxis: {
    categoryorder: "array",
    categoryarray: ["Sleep Mode (FP8)", "Sleep Mode (BF16)"]
  },
  hovermode: "closest",
  dragmode: "pan"
}, {displayModeBar: true, responsive: true});
140+
});
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
document.addEventListener('DOMContentLoaded', function() {
2+
// Ablation switching data: BF16 vs FP8.
// Each entry carries the display name used on the x-axis plus three measured
// wake times per precision. The chart labels these values in seconds.
const ablationSwitchingData = {
  "ModelA": {
    name: "Qwen3-0.6B",
    bf16: [0.28, 0.27, 0.27],
    fp8: [0.18, 0.19, 0.16]
  },
  "ModelB": {
    name: "Phi-3-vision-128k",
    bf16: [0.89, 0.93, 0.88],
    fp8: [0.79, 0.77, 0.78]
  }
};
15+
16+
/**
 * Summarise repeated measurements as a mean with asymmetric error extents.
 *
 * @param {number[]} values - Measurements for one model/precision pair.
 * @returns {{mean: number, errorMinus: number, errorPlus: number}}
 *   `mean` is the arithmetic mean, `errorMinus` the distance from the mean
 *   down to the smallest value, and `errorPlus` the distance from the mean up
 *   to the largest value. For an empty array all three fields are NaN.
 */
function calcStatsAblSwitch(values) {
  const total = values.reduce((sum, value) => sum + value, 0);
  const mean = total / values.length;
  const min = Math.min(...values);
  const max = Math.max(...values);
  return { mean, errorMinus: mean - min, errorPlus: max - mean };
}
22+
23+
// Model keys and their per-precision statistics, all in the key order of
// ablationSwitchingData so that index i refers to the same model everywhere.
const modelsAblSwitch = Object.keys(ablationSwitchingData);
const bf16StatsSwitch = modelsAblSwitch.map(m => calcStatsAblSwitch(ablationSwitchingData[m].bf16));
const fp8StatsSwitch = modelsAblSwitch.map(m => calcStatsAblSwitch(ablationSwitchingData[m].fp8));

/**
 * Build one bar trace of the grouped chart.
 *
 * @param {string} label - Legend name, also used in the hover text.
 * @param {{mean: number, errorMinus: number, errorPlus: number}[]} stats -
 *   One statistics record per model, in modelsAblSwitch order.
 * @param {string} barColor - Fill colour of the bars and of their value labels.
 * @param {string} errorColor - Colour of the error bars.
 * @returns {object} Plotly bar trace with asymmetric error bars.
 */
const makeTraceAblSwitch = (label, stats, barColor, errorColor) => ({
  x: modelsAblSwitch.map(m => ablationSwitchingData[m].name),
  y: stats.map(s => s.mean),
  name: label,
  type: "bar",
  marker: { color: barColor },
  error_y: {
    type: "data",
    symmetric: false,
    array: stats.map(s => s.errorPlus),
    arrayminus: stats.map(s => s.errorMinus),
    color: errorColor,
    thickness: 2,
    width: 6
  },
  text: stats.map(s => `${s.mean.toFixed(2)}s`),
  textposition: "outside",
  textfont: { size: 12, color: barColor, weight: "bold" },
  hovertemplate: `<b>%{x}</b><br>${label}: %{y:.2f}s<extra></extra>`
});

const bf16TraceSwitch = makeTraceAblSwitch("BF16", bf16StatsSwitch, "#1f77b4", "#0d4a6e");
const fp8TraceSwitch = makeTraceAblSwitch("FP8", fp8StatsSwitch, "#ff7f0e", "#cc6600");

// Percentage by which the mean FP8 wake time is lower than the mean BF16 wake
// time, per model, as a whole-number string for the annotation text.
const speedupsSwitchAbl = bf16StatsSwitch.map((bf16, i) => {
  const reduction = (bf16.mean - fp8StatsSwitch[i].mean) / bf16.mean * 100;
  return reduction.toFixed(0);
});

// Highest error-bar tip across BOTH precisions, so an FP8 bar taller than its
// BF16 counterpart would not be clipped; the axis adds 30% headroom for the
// value labels and the annotations above the bars.
const yTopAblSwitch = Math.max(
  ...bf16StatsSwitch.map(s => s.mean + s.errorPlus),
  ...fp8StatsSwitch.map(s => s.mean + s.errorPlus)
);

Plotly.newPlot("plotly-ablation-switching", [bf16TraceSwitch, fp8TraceSwitch], {
  barmode: "group",
  bargap: 0.15,
  bargroupgap: 0.1,
  margin: { l: 60, r: 30, t: 40, b: 50 },
  xaxis: {
    title: "",
    tickangle: 0
  },
  yaxis: {
    title: "Wake Time (seconds)",
    range: [0, yTopAblSwitch * 1.3]
  },
  hovermode: "closest",
  legend: {
    x: 0.5,
    y: 1.15,
    xanchor: "center",
    yanchor: "top",
    orientation: "h"
  },
  // One label per model, placed a fixed 0.07 (axis units) above the tip of the
  // BF16 error bar.
  annotations: modelsAblSwitch.map((m, i) => ({
    x: ablationSwitchingData[m].name,
    y: bf16StatsSwitch[i].mean + bf16StatsSwitch[i].errorPlus + 0.07,
    text: `<b>${speedupsSwitchAbl[i]}% faster</b>`,
    showarrow: false,
    font: { size: 11, color: "#ff7f0e", weight: "bold" },
    xanchor: "center"
  }))
}, {displayModeBar: true, responsive: true});
105+
});

0 commit comments

Comments
 (0)