The `llm-d-inference-scheduler` exposes the following Prometheus metrics to monitor its behavior and performance, particularly concerning Prefill/Decode Disaggregation.
All metrics are in the `llm_d_inference_scheduler` subsystem.
## Scraping and viewing the metrics
Metrics defined in the scheduler plugin extend the Inference Gateway metrics. For details on how to scrape and view them, see the [metrics and observability guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/metrics-and-observability.md).
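
As a quick sanity check, the minimal Go sketch below fetches the scheduler's Prometheus endpoint and prints only the `llm_d_inference_scheduler_*` series. The host, port, and path are assumptions (adjust them to however you exposed the endpoint, e.g. via `kubectl port-forward`); follow the guide linked above for the documented setup.

```go
// metrics_peek.go: fetch the Prometheus endpoint and print only the
// llm_d_inference_scheduler_* series.
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Assumed address: adjust to wherever the metrics endpoint is reachable.
	resp, err := http.Get("http://localhost:9090/metrics")
	if err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.Contains(line, "llm_d_inference_scheduler_") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("reading response: %v", err)
	}
}
```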
## Metric Details
### `pd_decision_total`
* **Type:** Counter
* **Labels:**
  * `decision_type`: string ("decode-only" or "prefill-decode")
* **Release Stage:** ALPHA
* **Description:** Counts the number of requests processed, broken down by the Prefill/Decode disaggregation decision (an illustrative declaration sketch follows this list).
  * `prefill-decode`: The request was split into separate Prefill and Decode stages.
  * `decode-only`: The request used the Decode-only path.
* **Usage:** Provides a high-level view of how many requests use the disaggregated path versus the unified path.
* **Actionability:**
  * Monitor the ratio of "prefill-decode" to "decode-only" to understand the P/D engagement rate.
  * Sudden changes in this ratio might indicate configuration issues, changes in workload patterns, or problems with the decision logic.
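
For reference, the sketch below shows how a counter with this name and label set would typically be declared and incremented with the Prometheus Go client. It mirrors the documentation above but is illustrative only: the `recordDecision` helper and the registration flow are assumptions, not the scheduler's actual source code.

```go
// Illustrative sketch: a counter shaped like pd_decision_total.
// With this subsystem it is exposed as
// llm_d_inference_scheduler_pd_decision_total (assuming no extra namespace prefix).
package main

import "github.com/prometheus/client_golang/prometheus"

var pdDecisionTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: "llm_d_inference_scheduler",
		Name:      "pd_decision_total",
		Help:      "Requests processed, by Prefill/Decode disaggregation decision.",
	},
	[]string{"decision_type"},
)

// recordDecision is a hypothetical helper: it increments the counter with
// one of the two documented label values.
func recordDecision(disaggregated bool) {
	if disaggregated {
		pdDecisionTotal.WithLabelValues("prefill-decode").Inc()
	} else {
		pdDecisionTotal.WithLabelValues("decode-only").Inc()
	}
}

func main() {
	prometheus.MustRegister(pdDecisionTotal)
	recordDecision(true)  // one prefill-decode request
	recordDecision(false) // one decode-only request
}
```

With counters labeled this way, the "prefill-decode" to "decode-only" ratio mentioned under Actionability can be computed from the two time series in your monitoring stack.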