@@ -102,9 +102,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
102102 }
103103
104104 return & PrecisePrefixCacheScorer {
105- typedName : plugins.TypedName {Type : PrecisePrefixCachePluginType },
106- kvCacheIndexer : kvCacheIndexer ,
107- chatTemplateRenderer : chatTemplateRenderer ,
105+ typedName : plugins.TypedName {Type : PrecisePrefixCachePluginType },
106+ kvCacheIndexer : kvCacheIndexer ,
108107 }, nil
109108}
110109
@@ -114,9 +113,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
114113// state, and the `kvevents.Pool` to subscribe to KV-cache events
115114// to keep the internal KV-cache index state up-to-date.
116115type PrecisePrefixCacheScorer struct {
117- typedName plugins.TypedName
118- kvCacheIndexer * kvcache.Indexer
119- chatTemplateRenderer * preprocessing.ChatTemplatingProcessor
116+ typedName plugins.TypedName
117+ kvCacheIndexer * kvcache.Indexer
120118}
121119
122120// TypedName returns the typed name of the plugin.
@@ -134,26 +132,20 @@ func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScor
134132// The returned scores are normalized to a range of 0-1.
135133func (s * PrecisePrefixCacheScorer ) Score (ctx context.Context , _ * types.CycleState , request * types.LLMRequest , pods []types.Pod ) map [types.Pod ]float64 {
136134 logger := log .FromContext (ctx ).WithName (s .typedName .String ())
135+ debugLogger := logger .V (logutil .DEBUG )
137136
138137 if request == nil {
139- logger . V ( logutil . DEBUG ) .Info ("Request is nil, skipping scoring" )
138+ debugLogger .Info ("Request is nil, skipping scoring" )
140139 return nil
141140 }
142141
143- // Extract the flattened prompt from the request
144- prompt , err := s .extractPrompt (ctx , request )
142+ // Compute the pod scores for the request
143+ scores , err := s .getScores (ctx , request )
145144 if err != nil {
146- logger .Error (err , "Failed to extract prompt from request" )
145+ logger .Error (err , "Failed to extract scores from request" )
147146 return nil
148147 }
149-
150- scores , err := s .kvCacheIndexer .GetPodScores (ctx , prompt , request .TargetModel , nil )
151- if err != nil {
152- logger .Error (err , "Failed to get pod scores" )
153- return nil
154- }
155-
156- logger .V (logutil .DEBUG ).Info ("Got pod scores" , "scores" , scores )
148+ debugLogger .Info ("Got pod scores" , "scores" , scores )
157149
158150 podToKey := func (pod types.Pod ) (string , bool ) {
159151 metricsPod := pod .GetPod ()
@@ -170,20 +162,23 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
170162// getScores computes the pod scores for the request using the KV-cache indexer.
171163// For chat completions, it builds a render request from the messages and passes it to the indexer.
172164// For regular completions, it passes the prompt to the indexer directly.
173- func (s * PrecisePrefixCacheScorer ) extractPrompt (ctx context.Context , request * types.LLMRequest ) (string , error ) {
174- traceLogger := log .FromContext (ctx ).V (logutil .TRACE ).WithName (s .typedName .String ())
165+ func (s * PrecisePrefixCacheScorer ) getScores (ctx context.Context , request * types.LLMRequest ) (map [string ]float64 , error ) {
166+ logger := log .FromContext (ctx ).WithName (s .typedName .String ())
167+ debugLogger := logger .V (logutil .DEBUG )
168+ traceLogger := logger .V (logutil .TRACE )
169+
170+ debugLogger .Info ("Getting scores" ,
171+ "target_model" , request .TargetModel ,
172+ "has_chat_completions" , request .Body != nil && request .Body .ChatCompletions != nil ,
173+ "has_completions" , request .Body != nil && request .Body .Completions != nil )
175174
176175 // The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
177176 // If an unexpected dual payload slips through (parser regression/new client), log it and use chat semantics.
178177 if request .Body != nil && request .Body .ChatCompletions != nil {
179178 if request .Body .Completions != nil {
180179 traceLogger .Info ("Both chat/completions and completions present; defaulting to chat/completions" )
181180 }
182- traceLogger .Info ("Processing chat completion request" ,
183- "messages_count" , len (request .Body .ChatCompletions .Messages ),
184- "target_model" , request .TargetModel )
185181
186- // Create render request
187182 renderReq := & preprocessing.RenderJinjaTemplateRequest {
188183 Conversations : make ([]preprocessing.ChatMessage , 0 ),
189184 Tools : request .Body .ChatCompletions .Tools ,
@@ -203,47 +198,30 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
203198 })
204199 }
205200
206- // Fetch the chat template from the model
207- fetchReq := preprocessing.FetchChatTemplateRequest {
208- Model : request .TargetModel ,
209- }
210-
211- chatTemplate , chatTemplateKWArgs , err := s .chatTemplateRenderer .FetchChatTemplate (ctx , fetchReq )
212- if err != nil {
213- return "" , fmt .Errorf ("failed to fetch chat template: %w" , err )
214- }
215-
216- traceLogger .Info ("Chat template fetched" ,
217- "model" , request .TargetModel ,
218- "templateLength" , len (chatTemplate ),
219- "hasKwargs" , len (chatTemplateKWArgs ) > 0 )
220-
221- // Set the fetched template in the render request
222- renderReq .ChatTemplate = chatTemplate
223- renderReq .ChatTemplateKWArgs = chatTemplateKWArgs
201+ traceLogger .Info ("Processing chat completion request" ,
202+ "messages_count" , len (renderReq .Conversations ),
203+ "tools_count" , len (renderReq .Tools ),
204+ "documents_count" , len (renderReq .Documents ),
205+ "target_model" , request .TargetModel )
224206
225- // Render the template to get flattened prompt
226- resp , err := s .chatTemplateRenderer .RenderChatTemplate (ctx , renderReq )
207+ scores , err := s .kvCacheIndexer .GetPodScores (ctx , renderReq , "" , request .TargetModel , nil )
227208 if err != nil {
228- return "" , fmt .Errorf ("failed to render chat template : %w" , err )
209+ return nil , fmt .Errorf ("failed to get pod scores for chat/completions : %w" , err )
229210 }
230-
231- if len (resp .RenderedChats ) == 0 {
232- return "" , errors .New ("no rendered chat returned from template rendering" )
233- }
234-
235- prompt := resp .RenderedChats [0 ]
236- traceLogger .Info ("Chat template rendered successfully" ,
237- "promptLength" , len (prompt ))
238- return prompt , nil
211+ return scores , nil
239212 }
240213
241214 // For regular completions, use the prompt directly
242215 if request .Body != nil && request .Body .Completions != nil {
243216 prompt := request .Body .Completions .Prompt
244217 traceLogger .Info ("Using completion prompt directly" , "promptLength" , len (prompt ))
245- return prompt , nil
218+
219+ scores , err := s .kvCacheIndexer .GetPodScores (ctx , nil , prompt , request .TargetModel , nil )
220+ if err != nil {
221+ return nil , fmt .Errorf ("failed to get pod scores for completions: %w" , err )
222+ }
223+ return scores , nil
246224 }
247225
248- return "" , errors .New ("no valid prompt found in request" )
226+ return nil , errors .New ("no valid input found in request" )
249227}
0 commit comments