Skip to content

Commit de9a944

Browse files
committed
apply kv manager v0.4.0 (GetPodScores)
Co-authored-by: bongwoobak <bongwoobak@gmail.com> Co-authored-by: Hyeonki Hong <hyeonki.hong@moreh.io> Signed-off-by: HyunKyun Moon <mhg5303@gmail.com>
1 parent 1575b3a commit de9a944

File tree

4 files changed

+29
-60
lines changed

4 files changed

+29
-60
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ require (
1010
github.com/google/uuid v1.6.0
1111
github.com/hashicorp/golang-lru/v2 v2.0.7
1212
github.com/jellydator/ttlcache/v3 v3.4.0
13-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2
13+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2
1414
github.com/onsi/ginkgo/v2 v2.27.2
1515
github.com/onsi/gomega v1.38.2
1616
github.com/openai/openai-go v1.12.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
181181
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
182182
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
183183
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
184-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2 h1:omSTXtuII3ol37CaoI9h+2VxE0m8EoeVOor+CkQh99I=
185-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2/go.mod h1:q6u7LnzMxNcHHb5/LRdHNNeZzzGMSENFSP1NGfsJEmA=
184+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2 h1:l2Sm8W6SRg4TAme4RsndwZ++5+4aQvDI4vnf8TKrhww=
185+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2/go.mod h1:ZlK7MCuz5D/weLeHyNKEmVF/eJZDyYn3XyRowTihq9o=
186186
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
187187
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
188188
github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo=

pkg/plugins/scorer/precise_prefix_cache.go

Lines changed: 17 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,18 @@ var _ framework.Scorer = &PrecisePrefixCacheScorer{}
4141
// a new instance of the PrecisePrefixCacheScorer.
4242
func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage,
4343
handle plugins.Handle) (plugins.Plugin, error) {
44+
indexerConfig, err := kvcache.NewDefaultConfig()
45+
if err != nil {
46+
return nil, fmt.Errorf("failed to create default indexer config: %w", err)
47+
}
4448
parameters := PrecisePrefixCachePluginConfig{
45-
IndexerConfig: kvcache.NewDefaultConfig(),
49+
IndexerConfig: indexerConfig,
4650
KVEventsConfig: kvevents.DefaultConfig(),
4751
}
4852

4953
// read hugging face token from environment variable if set
5054
if token := os.Getenv("HF_TOKEN"); token != "" {
51-
parameters.IndexerConfig.TokenizersPoolConfig.HuggingFaceToken = token
55+
parameters.IndexerConfig.TokenizersPoolConfig.HFTokenizerConfig.HuggingFaceToken = token
5256
}
5357

5458
if rawParameters != nil {
@@ -87,15 +91,9 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
8791
pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex())
8892
pool.Start(ctx)
8993

90-
chatTemplateRenderer := preprocessing.NewChatTemplatingProcessor()
91-
if err := chatTemplateRenderer.Initialize(); err != nil {
92-
return nil, fmt.Errorf("failed to initialize chat templating processor: %w", err)
93-
}
94-
9594
return &PrecisePrefixCacheScorer{
96-
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
97-
kvCacheIndexer: kvCacheIndexer,
98-
chatTemplateRenderer: chatTemplateRenderer,
95+
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
96+
kvCacheIndexer: kvCacheIndexer,
9997
}, nil
10098
}
10199

@@ -105,9 +103,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
105103
// state, and the `kvevents.Pool` to subscribe to KV-cache events
106104
// to keep the internal KV-cache index state up-to-date.
107105
type PrecisePrefixCacheScorer struct {
108-
typedName plugins.TypedName
109-
kvCacheIndexer *kvcache.Indexer
110-
chatTemplateRenderer *preprocessing.ChatTemplatingProcessor
106+
typedName plugins.TypedName
107+
kvCacheIndexer *kvcache.Indexer
111108
}
112109

113110
// TypedName returns the typed name of the plugin.
@@ -132,13 +129,13 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
132129
}
133130

134131
// Extract the chat-template render request (chat completions) or the raw prompt (completions) from the request
135-
prompt, err := s.extractPrompt(ctx, request)
132+
renderReq, prompt, err := s.extractRequest(ctx, request)
136133
if err != nil {
137134
logger.Error(err, "Failed to extract prompt from request")
138135
return nil
139136
}
140137

141-
scores, err := s.kvCacheIndexer.GetPodScores(ctx, prompt, request.TargetModel, nil)
138+
scores, err := s.kvCacheIndexer.GetPodScores(ctx, renderReq, prompt, request.TargetModel, nil)
142139
if err != nil {
143140
logger.Error(err, "Failed to get pod scores")
144141
return nil
@@ -158,10 +155,10 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
158155
return indexedScoresToNormalizedScoredPods(pods, podToKey, scores)
159156
}
160157

161-
// extractPrompt extracts the flattened prompt from the request.
158+
// extractRequest extracts the scoring input from the request.
162159
// For chat completions, it returns the chat-template render request and an empty prompt; the template is not rendered here.
163160
// For regular completions, it returns a nil render request and the prompt as-is.
164-
func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *types.LLMRequest) (string, error) {
161+
func (s *PrecisePrefixCacheScorer) extractRequest(ctx context.Context, request *types.LLMRequest) (*preprocessing.RenderJinjaTemplateRequest, string, error) {
165162
traceLogger := log.FromContext(ctx).V(logutil.TRACE).WithName(s.typedName.String())
166163

167164
// The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
@@ -194,47 +191,15 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
194191
})
195192
}
196193

197-
// Fetch the chat template from the model
198-
fetchReq := preprocessing.FetchChatTemplateRequest{
199-
Model: request.TargetModel,
200-
}
201-
202-
chatTemplate, chatTemplateKWArgs, err := s.chatTemplateRenderer.FetchChatTemplate(ctx, fetchReq)
203-
if err != nil {
204-
return "", fmt.Errorf("failed to fetch chat template: %w", err)
205-
}
206-
207-
traceLogger.Info("Chat template fetched",
208-
"model", request.TargetModel,
209-
"templateLength", len(chatTemplate),
210-
"hasKwargs", len(chatTemplateKWArgs) > 0)
211-
212-
// Set the fetched template in the render request
213-
renderReq.ChatTemplate = chatTemplate
214-
renderReq.ChatTemplateKWArgs = chatTemplateKWArgs
215-
216-
// Render the template to get flattened prompt
217-
resp, err := s.chatTemplateRenderer.RenderChatTemplate(ctx, renderReq)
218-
if err != nil {
219-
return "", fmt.Errorf("failed to render chat template: %w", err)
220-
}
221-
222-
if len(resp.RenderedChats) == 0 {
223-
return "", errors.New("no rendered chat returned from template rendering")
224-
}
225-
226-
prompt := resp.RenderedChats[0]
227-
traceLogger.Info("Chat template rendered successfully",
228-
"promptLength", len(prompt))
229-
return prompt, nil
194+
return renderReq, "", nil
230195
}
231196

232197
// For regular completions, use the prompt directly
233198
if request.Body != nil && request.Body.Completions != nil {
234199
prompt := request.Body.Completions.Prompt
235200
traceLogger.Info("Using completion prompt directly", "promptLength", len(prompt))
236-
return prompt, nil
201+
return nil, prompt, nil
237202
}
238203

239-
return "", errors.New("no valid prompt found in request")
204+
return nil, "", errors.New("no valid prompt found in request")
240205
}

pkg/plugins/scorer/utils.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
package scorer
22

3-
import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3+
import (
4+
"math"
5+
6+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
7+
)
48

59
// podToKey is a function type that converts a Pod to a string key.
610
// It returns the key and a boolean indicating success.
@@ -11,7 +15,7 @@ type podToKeyFunc func(pod types.Pod) (string, bool)
1115
// a pod to a key, and a map of scores indexed by those keys. It returns a map
1216
// of pods to their normalized scores.
1317
func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc,
14-
scores map[string]int) map[types.Pod]float64 {
18+
scores map[string]float64) map[types.Pod]float64 {
1519
scoredPods := make(map[types.Pod]float64)
1620
minScore, maxScore := getMinMax(scores)
1721

@@ -36,9 +40,9 @@ func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc
3640
return scoredPods
3741
}
3842

39-
func getMinMax(scores map[string]int) (int, int) {
40-
minScore := int(^uint(0) >> 1) // max int
41-
maxScore := -1
43+
func getMinMax(scores map[string]float64) (float64, float64) {
44+
minScore := math.MaxFloat64
45+
maxScore := float64(-1)
4246

4347
for _, score := range scores {
4448
if score < minScore {

0 commit comments

Comments
 (0)