Commit 24462a7

Merge branch 'main' into enhance/golangci-lint-configuration

2 parents: 0b021fc + 0210106

4 files changed: +39, -21 lines

README.md

Lines changed: 20 additions & 9 deletions

@@ -12,11 +12,30 @@ the llm-d inference framework.
 
 This provides an "Endpoint Picker (EPP)" component to the llm-d inference
 framework which schedules incoming inference requests to the platform via a
-[Kubernetes] Gateway according to scheduler plugins. For more details on the llm-d inference scheduler architecture, routing logic, and different plugins (filters and scorers), including plugin configuration, see the [Architecture Documentation]).
+[Kubernetes] Gateway according to scheduler plugins. For more details on the
+llm-d inference scheduler architecture, routing logic, and different plugins
+(filters and scorers), including plugin configuration, see the [Architecture Documentation]).
+
+### Relation to GIE (IGW)
 
 The EPP extends the [Gateway API Inference Extension (GIE)] project,
 which provides the API resources and machinery for scheduling. We add some
 custom features that are specific to llm-d here, such as [P/D Disaggregation].
+The two projects collaborate closely as often a feature in llm-d might require
+enablement and extensions in the GIE code base.
+Unique and experimental features may start in llm-d and migrate, over time, to
+GIE. As a project goal, we prefer to upstream functionality to GIE when
+- it has matured sufficiently and has proven wide applicability and usefulness; and
+- it can be implemented in EPP alone (i.e., llm-d provides a full inference framework,
+  beyond scheduling).
+
+Note that in general features should go to the upstream [Gateway API Inference
+Extension (GIE)] project _first_ if applicable. The GIE is a major dependency of
+ours, and where most _general purpose_ inference features live. If you have
+something that you feel is general purpose or use, it probably should go to the
+GIE. If you have something that's _llm-d specific_ then it should go here. If
+you're not sure whether your feature belongs here or in the GIE, feel free to
+create a [discussion] or ask on [Slack].
 
 A compatible [Gateway API] implementation is used as the Gateway. The Gateway
 API implementation must utilize [Envoy] and support [ext-proc], as this is the
@@ -41,14 +60,6 @@ For large changes please [create an issue] first describing the change so the
 maintainers can do an assessment, and work on the details with you. See
 [DEVELOPMENT.md](DEVELOPMENT.md) for details on how to work with the codebase.
 
-Note that in general features should go to the upstream [Gateway API Inference
-Extension (GIE)] project _first_ if applicable. The GIE is a major dependency of
-ours, and where most _general purpose_ inference features live. If you have
-something that you feel is general purpose or use, it probably should go to the
-GIE. If you have something that's _llm-d specific_ then it should go here. If
-you're not sure whether your feature belongs here or in the GIE, feel free to
-create a [discussion] or ask on [Slack].
-
 Contributions are welcome!
 
 [create an issue]:https://github.com/llm-d/llm-d-inference-scheduler/issues/new

pkg/sidecar/proxy/connector_lmcache.go

Lines changed: 2 additions & 2 deletions

@@ -49,8 +49,8 @@ func (s *Server) runLMCacheProtocol(w http.ResponseWriter, r *http.Request, pref
 	ctx := r.Context()
 	preq := r.Clone(ctx)
 
-	completionRequest["max_tokens"] = 1
-	completionRequest["max_completion_tokens"] = 1
+	completionRequest[requestFieldMaxTokens] = 1
+	completionRequest[requestFieldMaxCompletionTokens] = 1
 
 	pbody, err := json.Marshal(completionRequest)
 	if err != nil {
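The change above swaps bare string literals for the shared `requestField*` constants; the behavior itself is that the prefill-side copy of the request is capped to generate a single token by setting both the legacy and the newer OpenAI-style limit fields. A minimal sketch of that mutation, assuming a JSON request body decoded into a map (`capForPrefill` is a hypothetical helper; the constant names mirror those in pkg/sidecar/proxy/proxy.go):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors of the proxy's request-field constants (the real definitions
// live in pkg/sidecar/proxy/proxy.go).
const (
	requestFieldMaxTokens           = "max_tokens"
	requestFieldMaxCompletionTokens = "max_completion_tokens"
)

// capForPrefill forces a prefill-side request to generate a single token
// by overriding both token-limit fields, then re-encodes the body.
func capForPrefill(body []byte) ([]byte, error) {
	var completionRequest map[string]any
	if err := json.Unmarshal(body, &completionRequest); err != nil {
		return nil, err
	}
	completionRequest[requestFieldMaxTokens] = 1
	completionRequest[requestFieldMaxCompletionTokens] = 1
	return json.Marshal(completionRequest)
}

func main() {
	out, err := capForPrefill([]byte(`{"model":"m","prompt":"hi","max_tokens":256}`))
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```

Setting both fields matters because OpenAI-compatible servers may honor `max_completion_tokens` over `max_tokens`; capping only one would let a caller-supplied value leak into the prefill pass.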

pkg/sidecar/proxy/connector_nixlv2.go

Lines changed: 6 additions & 0 deletions

@@ -67,6 +67,7 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
 	streamValue, streamOk := completionRequest[requestFieldStream]
 	streamOptionsValue, streamOptionsOk := completionRequest[requestFieldStreamOptions]
 	maxTokensValue, maxTokensOk := completionRequest[requestFieldMaxTokens]
+	maxCompletionTokensValue, maxCompletionTokensOk := completionRequest[requestFieldMaxCompletionTokens]
 
 	completionRequest[requestFieldKVTransferParams] = map[string]any{
 		requestFieldDoRemoteDecode: true,
@@ -80,6 +81,7 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
 	completionRequest[requestFieldStream] = false
 	delete(completionRequest, requestFieldStreamOptions)
 	completionRequest[requestFieldMaxTokens] = 1
+	completionRequest[requestFieldMaxCompletionTokens] = 1
 
 	pbody, err := json.Marshal(completionRequest)
 	if err != nil {
@@ -146,6 +148,10 @@ func (s *Server) runNIXLProtocolV2(w http.ResponseWriter, r *http.Request, prefi
 	if maxTokensOk {
 		completionRequest[requestFieldMaxTokens] = maxTokensValue
 	}
+	delete(completionRequest, requestFieldMaxCompletionTokens)
+	if maxCompletionTokensOk {
+		completionRequest[requestFieldMaxCompletionTokens] = maxCompletionTokensValue
+	}
 	completionRequest[requestFieldKVTransferParams] = pKVTransferParams
 
 	dbody, err := json.Marshal(completionRequest)
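The NIXL v2 change follows a save/override/restore pattern: the caller's `max_completion_tokens` is captured via the comma-ok idiom before the prefill pass overrides it to 1, and on the decode pass it is either restored or removed entirely when the caller never set it. A small illustrative sketch of that pattern (`restoreField` is a hypothetical helper, not part of the codebase):

```go
package main

import "fmt"

// restoreField mirrors the save/override/restore pattern added to
// runNIXLProtocolV2: the override is dropped unconditionally, and the
// original value is re-applied only if it was originally present.
func restoreField(req map[string]any, key string, value any, ok bool) {
	delete(req, key)
	if ok {
		req[key] = value
	}
}

func main() {
	req := map[string]any{"max_completion_tokens": 42}

	// Save the caller's limit, then cap the prefill pass at one token.
	saved, ok := req["max_completion_tokens"]
	req["max_completion_tokens"] = 1

	// Decode pass: put the original limit back (or remove the field
	// entirely when the caller never set one).
	restoreField(req, "max_completion_tokens", saved, ok)
	fmt.Println(req["max_completion_tokens"]) // prints 42
}
```

Tracking the `ok` flag separately is what distinguishes "field absent" from "field set to the zero value", so the decode-side request ends up byte-for-byte faithful to the caller's intent.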

pkg/sidecar/proxy/proxy.go

Lines changed: 11 additions & 10 deletions

@@ -38,16 +38,17 @@ const (
 	requestHeaderPrefillURL = "x-prefiller-url"
 	requestHeaderRequestID  = "x-request-id"
 
-	requestFieldKVTransferParams = "kv_transfer_params"
-	requestFieldMaxTokens        = "max_tokens"
-	requestFieldDoRemotePrefill  = "do_remote_prefill"
-	requestFieldDoRemoteDecode   = "do_remote_decode"
-	requestFieldRemoteBlockIDs   = "remote_block_ids"
-	requestFieldRemoteEngineID   = "remote_engine_id"
-	requestFieldRemoteHost       = "remote_host"
-	requestFieldRemotePort       = "remote_port"
-	requestFieldStream           = "stream"
-	requestFieldStreamOptions    = "stream_options"
+	requestFieldKVTransferParams    = "kv_transfer_params"
+	requestFieldMaxTokens           = "max_tokens"
+	requestFieldMaxCompletionTokens = "max_completion_tokens"
+	requestFieldDoRemotePrefill     = "do_remote_prefill"
+	requestFieldDoRemoteDecode      = "do_remote_decode"
+	requestFieldRemoteBlockIDs      = "remote_block_ids"
+	requestFieldRemoteEngineID      = "remote_engine_id"
+	requestFieldRemoteHost          = "remote_host"
+	requestFieldRemotePort          = "remote_port"
+	requestFieldStream              = "stream"
+	requestFieldStreamOptions       = "stream_options"
 
 	// ConnectorNIXLV2 enables the P/D NIXL v2 protocol
 	ConnectorNIXLV2 = "nixlv2"
