@@ -41,14 +41,18 @@ var _ framework.Scorer = &PrecisePrefixCacheScorer{}
4141// a new instance of the PrefixCacheTrackingPlugin.
4242func PrecisePrefixCachePluginFactory (name string , rawParameters json.RawMessage ,
4343 handle plugins.Handle ) (plugins.Plugin , error ) {
44+ indexerConfig , err := kvcache .NewDefaultConfig ()
45+ if err != nil {
46+ return nil , fmt .Errorf ("failed to create default indexer config: %w" , err )
47+ }
4448 parameters := PrecisePrefixCachePluginConfig {
45- IndexerConfig : kvcache . NewDefaultConfig () ,
49+ IndexerConfig : indexerConfig ,
4650 KVEventsConfig : kvevents .DefaultConfig (),
4751 }
4852
4953 // read hugging face token from environment variable if set
5054 if token := os .Getenv ("HF_TOKEN" ); token != "" {
51- parameters .IndexerConfig .TokenizersPoolConfig .HuggingFaceToken = token
55+ parameters .IndexerConfig .TokenizersPoolConfig .HFTokenizerConfig . HuggingFaceToken = token
5256 }
5357
5458 if rawParameters != nil {
@@ -87,15 +91,9 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
8791 pool := kvevents .NewPool (config .KVEventsConfig , kvCacheIndexer .KVBlockIndex ())
8892 pool .Start (ctx )
8993
90- chatTemplateRenderer := preprocessing .NewChatTemplatingProcessor ()
91- if err := chatTemplateRenderer .Initialize (); err != nil {
92- return nil , fmt .Errorf ("failed to initialize chat templating processor: %w" , err )
93- }
94-
9594 return & PrecisePrefixCacheScorer {
96- typedName : plugins.TypedName {Type : PrecisePrefixCachePluginType },
97- kvCacheIndexer : kvCacheIndexer ,
98- chatTemplateRenderer : chatTemplateRenderer ,
95+ typedName : plugins.TypedName {Type : PrecisePrefixCachePluginType },
96+ kvCacheIndexer : kvCacheIndexer ,
9997 }, nil
10098}
10199
@@ -105,9 +103,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
105103// state, and the `kvevents.Pool` to subscribe to KV-cache events
106104// to keep the internal KV-cache index state up-to-date.
107105type PrecisePrefixCacheScorer struct {
108- typedName plugins.TypedName
109- kvCacheIndexer * kvcache.Indexer
110- chatTemplateRenderer * preprocessing.ChatTemplatingProcessor
106+ typedName plugins.TypedName
107+ kvCacheIndexer * kvcache.Indexer
111108}
112109
113110// TypedName returns the typed name of the plugin.
@@ -132,13 +129,13 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
132129 }
133130
134131 // Extract the flattened prompt from the request
135- prompt , err := s .extractPrompt (ctx , request )
132+ renderReq , prompt , err := s .extractRequest (ctx , request )
136133 if err != nil {
137134 logger .Error (err , "Failed to extract prompt from request" )
138135 return nil
139136 }
140137
141- scores , err := s .kvCacheIndexer .GetPodScores (ctx , prompt , request .TargetModel , nil )
138+ scores , err := s .kvCacheIndexer .GetPodScores (ctx , renderReq , prompt , request .TargetModel , nil )
142139 if err != nil {
143140 logger .Error (err , "Failed to get pod scores" )
144141 return nil
@@ -158,10 +155,10 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
158155 return indexedScoresToNormalizedScoredPods (pods , podToKey , scores )
159156}
160157
161- // extractPrompt extracts the flattened prompt from the request.
158+ // extractRequest extracts the scoring input from the request.
162159// For chat completions, it returns a render request built from the messages and an empty prompt; the chat template is no longer rendered here.
163160// For regular completions, it returns a nil render request and the prompt as-is.
164- func (s * PrecisePrefixCacheScorer ) extractPrompt (ctx context.Context , request * types.LLMRequest ) (string , error ) {
161+ func (s * PrecisePrefixCacheScorer ) extractRequest (ctx context.Context , request * types.LLMRequest ) (* preprocessing. RenderJinjaTemplateRequest , string , error ) {
165162 traceLogger := log .FromContext (ctx ).V (logutil .TRACE ).WithName (s .typedName .String ())
166163
167164 // The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
@@ -194,47 +191,15 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
194191 })
195192 }
196193
197- // Fetch the chat template from the model
198- fetchReq := preprocessing.FetchChatTemplateRequest {
199- Model : request .TargetModel ,
200- }
201-
202- chatTemplate , chatTemplateKWArgs , err := s .chatTemplateRenderer .FetchChatTemplate (ctx , fetchReq )
203- if err != nil {
204- return "" , fmt .Errorf ("failed to fetch chat template: %w" , err )
205- }
206-
207- traceLogger .Info ("Chat template fetched" ,
208- "model" , request .TargetModel ,
209- "templateLength" , len (chatTemplate ),
210- "hasKwargs" , len (chatTemplateKWArgs ) > 0 )
211-
212- // Set the fetched template in the render request
213- renderReq .ChatTemplate = chatTemplate
214- renderReq .ChatTemplateKWArgs = chatTemplateKWArgs
215-
216- // Render the template to get flattened prompt
217- resp , err := s .chatTemplateRenderer .RenderChatTemplate (ctx , renderReq )
218- if err != nil {
219- return "" , fmt .Errorf ("failed to render chat template: %w" , err )
220- }
221-
222- if len (resp .RenderedChats ) == 0 {
223- return "" , errors .New ("no rendered chat returned from template rendering" )
224- }
225-
226- prompt := resp .RenderedChats [0 ]
227- traceLogger .Info ("Chat template rendered successfully" ,
228- "promptLength" , len (prompt ))
229- return prompt , nil
194+ return renderReq , "" , nil
230195 }
231196
232197 // For regular completions, use the prompt directly
233198 if request .Body != nil && request .Body .Completions != nil {
234199 prompt := request .Body .Completions .Prompt
235200 traceLogger .Info ("Using completion prompt directly" , "promptLength" , len (prompt ))
236- return prompt , nil
201+ return nil , prompt , nil
237202 }
238203
239- return "" , errors .New ("no valid prompt found in request" )
204+ return nil , "" , errors .New ("no valid prompt found in request" )
240205}
0 commit comments