
Commit 4c77042

Pierangelo Di Pilato authored

Bump kv-cache-manager to v0.4.0-rc2 (#467)

* Bump kv-cache-manager to v0.4.0-rc1
* Update test configuration for HF tokenizer dir
* Update documentation
* rc2
* debug -> trace

Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>

1 parent 1575b3a commit 4c77042

File tree

8 files changed: +84 -78 lines changed

  deploy/config/sim-epp-kvcache-config.yaml
  docs/architecture.md
  go.mod
  go.sum
  pkg/plugins/scorer/precise_prefix_cache.go
  pkg/plugins/scorer/utils.go
  test/e2e/e2e_test.go
  test/e2e/utils_test.go

deploy/config/sim-epp-kvcache-config.yaml

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@ plugins:
       blockSize: 16 # must match vLLM block size if not default (16)
       hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods
       tokenizersPoolConfig:
-        tokenizersCacheDir: "/cache/tokenizers"
+        hf:
+          tokenizersCacheDir: "/cache/tokenizers"
       kvBlockIndexConfig:
         enableMetrics: false # enable kv-block index metrics (prometheus)
         metricsLoggingInterval: 6000000000 # log kv-block metrics as well (1m in nanoseconds)
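For orientation, here is a minimal sketch of the Go shape this nested YAML now maps onto. Only HFTokenizerConfig and HuggingFaceToken appear verbatim in the scorer diff below; the cache-dir field name and the JSON tags are assumptions for illustration.

    // Hypothetical mirror of the v0.4.0-rc2 tokenizer-pool config, for
    // illustration only. HFTokenizerConfig and HuggingFaceToken match the
    // accessor path used in pkg/plugins/scorer/precise_prefix_cache.go below;
    // the remaining names and tags are assumed.
    type TokenizersPoolConfig struct {
        WorkersCount      int                `json:"workersCount"`
        HFTokenizerConfig *HFTokenizerConfig `json:"hf"`
    }

    type HFTokenizerConfig struct {
        HuggingFaceToken   string `json:"huggingFaceToken"`
        TokenizersCacheDir string `json:"tokenizersCacheDir"`
    }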

docs/architecture.md

Lines changed: 5 additions & 3 deletions

@@ -330,7 +330,8 @@ plugins:
       blockSize: 64
       hashSeed: "12345"
       tokenizersPoolConfig:
-        huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable
+        hf:
+          huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable
       kvBlockIndexConfig:
         enableMetrics: true
 ```
@@ -359,8 +360,9 @@ plugins:
       enableMetrics: true
     tokenizersPoolConfig:
       workersCount: 8
-      huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable
-      tokenizersCacheDir: /tmp/tokenizers
+      hf:
+        huggingFaceToken: your_hf_token_here # automatically set by `HF_TOKEN` environment variable
+        tokenizersCacheDir: /tmp/tokenizers
 ```
 
 ---

go.mod

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/hashicorp/golang-lru/v2 v2.0.7
 	github.com/jellydator/ttlcache/v3 v3.4.0
-	github.com/llm-d/llm-d-kv-cache-manager v0.3.2
+	github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2
 	github.com/onsi/ginkgo/v2 v2.27.2
 	github.com/onsi/gomega v1.38.2
 	github.com/openai/openai-go v1.12.0
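For reference, a dependency bump like this is normally produced with the Go toolchain rather than by hand: `go get github.com/llm-d/llm-d-kv-cache-manager@v0.4.0-rc2` followed by `go mod tidy`, which also rewrites the matching go.sum entries below.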

go.sum

Lines changed: 2 additions & 2 deletions

@@ -181,8 +181,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
-github.com/llm-d/llm-d-kv-cache-manager v0.3.2 h1:omSTXtuII3ol37CaoI9h+2VxE0m8EoeVOor+CkQh99I=
-github.com/llm-d/llm-d-kv-cache-manager v0.3.2/go.mod h1:q6u7LnzMxNcHHb5/LRdHNNeZzzGMSENFSP1NGfsJEmA=
+github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2 h1:l2Sm8W6SRg4TAme4RsndwZ++5+4aQvDI4vnf8TKrhww=
+github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2/go.mod h1:ZlK7MCuz5D/weLeHyNKEmVF/eJZDyYn3XyRowTihq9o=
 github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
 github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
 github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo=

pkg/plugins/scorer/precise_prefix_cache.go

Lines changed: 45 additions & 59 deletions

@@ -41,14 +41,23 @@ var _ framework.Scorer = &PrecisePrefixCacheScorer{}
 // a new instance of the PrefixCacheTrackingPlugin.
 func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage,
 	handle plugins.Handle) (plugins.Plugin, error) {
+
+	indexerConfig, err := kvcache.NewDefaultConfig()
+	if err != nil {
+		return nil, fmt.Errorf("failed to initialize indexer config: %w", err)
+	}
+
 	parameters := PrecisePrefixCachePluginConfig{
-		IndexerConfig:  kvcache.NewDefaultConfig(),
+		IndexerConfig:  indexerConfig,
 		KVEventsConfig: kvevents.DefaultConfig(),
 	}
 
 	// read hugging face token from environment variable if set
-	if token := os.Getenv("HF_TOKEN"); token != "" {
-		parameters.IndexerConfig.TokenizersPoolConfig.HuggingFaceToken = token
+	if token := os.Getenv("HF_TOKEN"); token != "" &&
+		parameters.IndexerConfig != nil &&
+		parameters.IndexerConfig.TokenizersPoolConfig != nil &&
+		parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig != nil {
+		parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig.HuggingFaceToken = token
 	}
 
 	if rawParameters != nil {
@@ -93,9 +102,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
 	}
 
 	return &PrecisePrefixCacheScorer{
-		typedName:            plugins.TypedName{Type: PrecisePrefixCachePluginType},
-		kvCacheIndexer:       kvCacheIndexer,
-		chatTemplateRenderer: chatTemplateRenderer,
+		typedName:      plugins.TypedName{Type: PrecisePrefixCachePluginType},
+		kvCacheIndexer: kvCacheIndexer,
 	}, nil
 }
 
@@ -105,9 +113,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
 // state, and the `kvevents.Pool` to subscribe to KV-cache events
 // to keep the internal KV-cache index state up-to-date.
 type PrecisePrefixCacheScorer struct {
-	typedName            plugins.TypedName
-	kvCacheIndexer       *kvcache.Indexer
-	chatTemplateRenderer *preprocessing.ChatTemplatingProcessor
+	typedName      plugins.TypedName
+	kvCacheIndexer *kvcache.Indexer
 }
 
 // TypedName returns the typed name of the plugin.
@@ -125,26 +132,20 @@ func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScor
 // The returned scores are normalized to a range of 0-1.
 func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	logger := log.FromContext(ctx).WithName(s.typedName.String())
+	debugLogger := logger.V(logutil.DEBUG)
 
 	if request == nil {
-		logger.V(logutil.DEBUG).Info("Request is nil, skipping scoring")
-		return nil
-	}
-
-	// Extract the flattened prompt from the request
-	prompt, err := s.extractPrompt(ctx, request)
-	if err != nil {
-		logger.Error(err, "Failed to extract prompt from request")
+		debugLogger.Info("Request is nil, skipping scoring")
 		return nil
 	}
 
-	scores, err := s.kvCacheIndexer.GetPodScores(ctx, prompt, request.TargetModel, nil)
+	// Extract the flattened scores from the request
+	scores, err := s.getScores(ctx, request)
 	if err != nil {
-		logger.Error(err, "Failed to get pod scores")
+		logger.Error(err, "Failed to extract scores from request")
 		return nil
 	}
-
-	logger.V(logutil.DEBUG).Info("Got pod scores", "scores", scores)
+	debugLogger.Info("Got pod scores", "scores", scores)
 
 	podToKey := func(pod types.Pod) (string, bool) {
 		metricsPod := pod.GetPod()
@@ -161,20 +162,22 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
 // extractPrompt extracts the flattened prompt from the request.
 // For chat completions, it renders the messages using the model's chat template.
 // For regular completions, it uses the prompt directly.
-func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *types.LLMRequest) (string, error) {
-	traceLogger := log.FromContext(ctx).V(logutil.TRACE).WithName(s.typedName.String())
+func (s *PrecisePrefixCacheScorer) getScores(ctx context.Context, request *types.LLMRequest) (map[string]float64, error) {
+	logger := log.FromContext(ctx).WithName(s.typedName.String())
+	traceLogger := logger.V(logutil.TRACE)
+
+	traceLogger.Info("Getting scores",
+		"target_model", request.TargetModel,
+		"has_chat_completions", request.Body != nil && request.Body.ChatCompletions != nil,
+		"has_completions", request.Body != nil && request.Body.Completions != nil)
 
 	// The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
 	// If an unexpected dual payload slips through (parser regression/new client), log it and use chat semantics.
 	if request.Body != nil && request.Body.ChatCompletions != nil {
 		if request.Body.Completions != nil {
 			traceLogger.Info("Both chat/completions and completions present; defaulting to chat/completions")
 		}
-		traceLogger.Info("Processing chat completion request",
-			"messages_count", len(request.Body.ChatCompletions.Messages),
-			"target_model", request.TargetModel)
 
-		// Create render request
 		renderReq := &preprocessing.RenderJinjaTemplateRequest{
 			Conversations: make([]preprocessing.ChatMessage, 0),
 			Tools:         request.Body.ChatCompletions.Tools,
@@ -194,47 +197,30 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
 			})
 		}
 
-		// Fetch the chat template from the model
-		fetchReq := preprocessing.FetchChatTemplateRequest{
-			Model: request.TargetModel,
-		}
-
-		chatTemplate, chatTemplateKWArgs, err := s.chatTemplateRenderer.FetchChatTemplate(ctx, fetchReq)
-		if err != nil {
-			return "", fmt.Errorf("failed to fetch chat template: %w", err)
-		}
-
-		traceLogger.Info("Chat template fetched",
-			"model", request.TargetModel,
-			"templateLength", len(chatTemplate),
-			"hasKwargs", len(chatTemplateKWArgs) > 0)
-
-		// Set the fetched template in the render request
-		renderReq.ChatTemplate = chatTemplate
-		renderReq.ChatTemplateKWArgs = chatTemplateKWArgs
+		traceLogger.Info("Processing chat completion request",
+			"messages_count", len(renderReq.Conversations),
+			"tools_count", len(renderReq.Tools),
+			"documents_count", len(renderReq.Documents),
+			"target_model", request.TargetModel)
 
-		// Render the template to get flattened prompt
-		resp, err := s.chatTemplateRenderer.RenderChatTemplate(ctx, renderReq)
+		scores, err := s.kvCacheIndexer.GetPodScores(ctx, renderReq, "", request.TargetModel, nil)
 		if err != nil {
-			return "", fmt.Errorf("failed to render chat template: %w", err)
+			return nil, fmt.Errorf("failed to get pod scores for chat/completions: %w", err)
 		}
-
-		if len(resp.RenderedChats) == 0 {
-			return "", errors.New("no rendered chat returned from template rendering")
-		}
-
-		prompt := resp.RenderedChats[0]
-		traceLogger.Info("Chat template rendered successfully",
-			"promptLength", len(prompt))
-		return prompt, nil
+		return scores, nil
 	}
 
 	// For regular completions, use the prompt directly
 	if request.Body != nil && request.Body.Completions != nil {
 		prompt := request.Body.Completions.Prompt
 		traceLogger.Info("Using completion prompt directly", "promptLength", len(prompt))
-		return prompt, nil
+
+		scores, err := s.kvCacheIndexer.GetPodScores(ctx, nil, prompt, request.TargetModel, nil)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get pod scores for completions: %w", err)
+		}
+		return scores, nil
 	}
 
-	return "", errors.New("no valid prompt found in request")
+	return nil, errors.New("no valid input found in request")
 }
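The net effect is that the scorer no longer fetches or renders chat templates itself: both request flavors funnel into a single GetPodScores call on the indexer, which now accepts either a render request or a raw prompt. A condensed sketch of the new control flow, using the five-argument signature exactly as it appears in the call sites above (buildRenderRequest is a hypothetical stand-in for the inline construction in getScores):

    // Condensed view of getScores: one indexer entry point for both paths.
    func (s *PrecisePrefixCacheScorer) scoreEither(ctx context.Context, req *types.LLMRequest) (map[string]float64, error) {
        if req.Body != nil && req.Body.ChatCompletions != nil {
            renderReq := buildRenderRequest(req) // hypothetical helper; see getScores above
            // Chat path: pass the render request, leave the prompt argument empty.
            return s.kvCacheIndexer.GetPodScores(ctx, renderReq, "", req.TargetModel, nil)
        }
        if req.Body != nil && req.Body.Completions != nil {
            // Completions path: no render request, pass the raw prompt.
            return s.kvCacheIndexer.GetPodScores(ctx, nil, req.Body.Completions.Prompt, req.TargetModel, nil)
        }
        return nil, errors.New("no valid input found in request")
    }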

pkg/plugins/scorer/utils.go

Lines changed: 10 additions & 6 deletions

@@ -1,6 +1,10 @@
 package scorer
 
-import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+import (
+	"math"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
 
 // podToKey is a function type that converts a Pod to a string key.
 // It returns the key and a boolean indicating success.
@@ -11,7 +15,7 @@ type podToKeyFunc func(pod types.Pod) (string, bool)
 // a pod to a key, and a map of scores indexed by those keys. It returns a map
 // of pods to their normalized scores.
 func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc,
-	scores map[string]int) map[types.Pod]float64 {
+	scores map[string]float64) map[types.Pod]float64 {
 	scoredPods := make(map[types.Pod]float64)
 	minScore, maxScore := getMinMax(scores)
 
@@ -27,7 +31,7 @@ type podToKeyFunc func(pod types.Pod) (string, bool)
 				continue
 			}
 
-			scoredPods[pod] = float64(score-minScore) / float64(maxScore-minScore)
+			scoredPods[pod] = (score - minScore) / (maxScore - minScore)
 		} else {
 			scoredPods[pod] = 0.0
 		}
@@ -36,9 +40,9 @@ type podToKeyFunc func(pod types.Pod) (string, bool)
 	return scoredPods
 }
 
-func getMinMax(scores map[string]int) (int, int) {
-	minScore := int(^uint(0) >> 1) // max int
-	maxScore := -1
+func getMinMax(scores map[string]float64) (float64, float64) {
+	minScore := math.MaxFloat64
+	maxScore := math.Inf(-1)
 
 	for _, score := range scores {
 		if score < minScore {
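The switch from int to float64 scores also replaces the integer sentinels with math.MaxFloat64 and math.Inf(-1). The normalization itself is plain min-max scaling; a self-contained sketch of the arithmetic (pod wiring omitted, and a guard for the all-equal case added here for the standalone demo):

    package main

    import (
        "fmt"
        "math"
    )

    // minMax mirrors getMinMax above: one pass, tracking both extremes.
    func minMax(scores map[string]float64) (float64, float64) {
        lo, hi := math.MaxFloat64, math.Inf(-1)
        for _, s := range scores {
            if s < lo {
                lo = s
            }
            if s > hi {
                hi = s
            }
        }
        return lo, hi
    }

    func main() {
        scores := map[string]float64{"pod-a": 0.25, "pod-b": 1.0, "pod-c": 0.5}
        lo, hi := minMax(scores)
        for name, s := range scores {
            norm := 0.0
            if hi > lo { // avoid 0/0 when every pod has the same score
                norm = (s - lo) / (hi - lo)
            }
            fmt.Printf("%s -> %.2f\n", name, norm) // pod-a -> 0.00, pod-c -> 0.33, pod-b -> 1.00
        }
    }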

test/e2e/e2e_test.go

Lines changed: 8 additions & 2 deletions

@@ -341,7 +341,12 @@ func runCompletion(prompt string, theModel openai.CompletionNewParamsModel) (str
 		Model: theModel,
 	}
 
-	resp, err := openaiclient.Completions.New(testConfig.Context, completionParams, option.WithResponseInto(&httpResp))
+	ginkgo.By(fmt.Sprintf("Sending Completion Request: (port %s) %#v", port, completionParams))
+
+	resp, err := openaiclient.Completions.New(testConfig.Context, completionParams, option.WithResponseInto(&httpResp), option.WithRequestTimeout(readyTimeout))
+
+	ginkgo.By(fmt.Sprintf("Verifying Completion Response: %#v", resp))
+
 	gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
 	gomega.Expect(resp.Choices).Should(gomega.HaveLen(1))
 	gomega.Expect(resp.Choices[0].FinishReason).Should(gomega.Equal(openai.CompletionChoiceFinishReasonStop))
@@ -445,7 +450,8 @@ plugins:
       blockSize: 16 # must match vLLM block size if not default (16)
       hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods
       tokenizersPoolConfig:
-        tokenizersCacheDir: "/cache/tokenizers"
+        hf:
+          tokenizersCacheDir: "/cache/tokenizers"
       kvBlockIndexConfig:
         enableMetrics: false # enable kv-block index metrics (prometheus)
         metricsLoggingInterval: 6000000000 # log kv-block metrics as well (1m in nanoseconds)
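Two behavioral notes on this test change: the added ginkgo.By calls surface the request and response in the spec output, and option.WithRequestTimeout (a standard per-request option in the openai-go client) bounds each completion call, here reusing the suite's readyTimeout rather than relying on the context deadline alone.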

test/e2e/utils_test.go

Lines changed: 11 additions & 4 deletions

@@ -46,6 +46,8 @@ func scaleDeployment(objects []string, increment int) {
 
 // getModelServerPods Returns the list of Prefill and Decode vLLM pods separately
 func getModelServerPods(podLabels, prefillLabels, decodeLabels map[string]string) ([]string, []string) {
+	ginkgo.By("Getting Model server pods")
+
 	pods := getPods(podLabels)
 
 	prefillValidator, err := apilabels.ValidatedSelectorFromSet(prefillLabels)
@@ -98,17 +100,22 @@ func getPods(labels map[string]string) []corev1.Pod {
 }
 
 func podsInDeploymentsReady(objects []string) {
-	var deployment appsv1.Deployment
-	helper := func(deploymentName string) bool {
+	isDeploymentReady := func(deploymentName string) bool {
+		var deployment appsv1.Deployment
 		err := testConfig.K8sClient.Get(testConfig.Context, types.NamespacedName{Namespace: nsName, Name: deploymentName}, &deployment)
+		ginkgo.By(fmt.Sprintf("Waiting for deployment %q to be ready (err: %v): replicas=%#v, status=%#v", deploymentName, err, *deployment.Spec.Replicas, deployment.Status))
 		return err == nil && *deployment.Spec.Replicas == deployment.Status.Replicas &&
 			deployment.Status.Replicas == deployment.Status.ReadyReplicas
 	}
+
 	for _, kindAndName := range objects {
 		split := strings.Split(kindAndName, "/")
 		if strings.ToLower(split[0]) == deploymentKind {
-			ginkgo.By(fmt.Sprintf("Waiting for pods of %s to be ready", split[1]))
-			gomega.Eventually(helper, readyTimeout, interval).WithArguments(split[1]).Should(gomega.BeTrue())
+			gomega.Eventually(isDeploymentReady).
+				WithArguments(split[1]).
+				WithPolling(interval).
+				WithTimeout(readyTimeout).
+				Should(gomega.BeTrue())
 		}
 	}
 }
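The rewritten wait loop uses Gomega's builder-style API in place of the positional Eventually(fn, timeout, interval) form. A minimal sketch of the pattern in isolation, with illustrative names and durations (deploymentIsReady is a hypothetical check):

    // Poll isReady("my-deploy") every second until it returns true or
    // two minutes elapse; names and durations here are illustrative.
    isReady := func(name string) bool { return deploymentIsReady(name) }
    gomega.Eventually(isReady).
        WithArguments("my-deploy").
        WithPolling(time.Second).
        WithTimeout(2 * time.Minute).
        Should(gomega.BeTrue())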
