From e317a92aa05a8c34517a10e0f18ecacc28f21a83 Mon Sep 17 00:00:00 2001 From: jeffluoo Date: Wed, 29 Oct 2025 20:04:26 +0000 Subject: [PATCH] feat: Add initial metrics and update dependencies This commit introduces Prometheus metrics to the scheduler, starting with a request counter. Signed-off-by: jeffluoo --- cmd/epp/main.go | 5 ++- docs/metrics.md | 25 ++++++++++++++ go.mod | 5 +-- pkg/metrics/metrics.go | 42 +++++++++++++++++++++++ pkg/metrics/metrics_test.go | 22 ++++++++++++ pkg/plugins/profile/pd_profile_handler.go | 4 +++ 6 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 docs/metrics.md create mode 100644 pkg/metrics/metrics.go create mode 100644 pkg/metrics/metrics_test.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 721892fa..1952fcf3 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -30,6 +30,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner" + "github.com/llm-d/llm-d-inference-scheduler/pkg/metrics" "github.com/llm-d/llm-d-inference-scheduler/pkg/plugins" ) @@ -37,7 +38,9 @@ func main() { // Register llm-d-inference-scheduler plugins plugins.RegisterAllPlugins() - if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil { + if err := runner.NewRunner(). + WithCustomCollectors(metrics.GetCollectors()...). + Run(ctrl.SetupSignalHandler()); err != nil { os.Exit(1) } } diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 00000000..17042813 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,25 @@ +# Metrics + +The `llm-d-inference-scheduler` exposes the following Prometheus metrics to monitor its behavior and performance, particularly concerning Prefill/Decode Disaggregation. + +All metrics are in the `llm_d_inference_scheduler` subsystem. + +## Scrape and see the metric + +Metrics defined in llm-d-scheduler are in addition to Inference Gateway metrics. 
For details on how to scrape and view these metrics, see the [metrics and observability section](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/metrics-and-observability.md). + +## Metrics Details + +### `pd_decision_total` + +* **Type:** Counter +* **Labels:** + * `decision_type`: string ("decode-only" or "prefill-decode") +* **Release Stage:** ALPHA +* **Description:** Counts the number of requests processed, broken down by the Prefill/Decode disaggregation decision. + * `prefill-decode`: The request was split into separate Prefill and Decode stages. + * `decode-only`: The request used the Decode-only path. +* **Usage:** Provides a high-level view of how many requests use the disaggregated ("prefill-decode") path versus the unified ("decode-only") path. +* **Actionability:** + * Monitor the ratio of "prefill-decode" to "decode-only" to understand the P/D engagement rate. + * Sudden changes in this ratio might indicate configuration issues, changes in workload patterns, or problems with the decision logic. 
\ No newline at end of file diff --git a/go.mod b/go.mod index 5ceb7e2e..43f7852a 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 github.com/openai/openai-go v1.12.0 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 golang.org/x/sync v0.18.0 google.golang.org/grpc v1.76.0 @@ -21,6 +22,7 @@ require ( k8s.io/apiextensions-apiserver v0.34.2 k8s.io/apimachinery v0.34.2 k8s.io/client-go v0.34.2 + k8s.io/component-base v0.34.2 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d sigs.k8s.io/controller-runtime v0.22.4 @@ -67,6 +69,7 @@ require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -76,7 +79,6 @@ require ( github.com/pebbe/zmq4 v1.4.0 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.2 // indirect github.com/prometheus/procfs v0.17.0 // indirect @@ -124,7 +126,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.34.2 // indirect - k8s.io/component-base v0.34.2 // indirect k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 00000000..03aadebd --- /dev/null 
+++ b/pkg/metrics/metrics.go @@ -0,0 +1,42 @@ +// Package metrics defines the custom Prometheus collectors of the llm-d-inference-scheduler and helpers for recording them. +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + compbasemetrics "k8s.io/component-base/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics" +) + +const ( + // SchedulerSubsystem is the Prometheus subsystem, used as the name prefix of the metrics defined in this package. + SchedulerSubsystem = "llm_d_inference_scheduler" + + // DecisionTypeDecodeOnly is the decision_type label value for requests that are routed to the decode instance only. + DecisionTypeDecodeOnly = "decode-only" + // DecisionTypePrefillDecode is the decision_type label value for requests that go through both prefill and decode (P/D). + DecisionTypePrefillDecode = "prefill-decode" +) + +var ( + // SchedulerPDDecisionCount counts P/D disaggregation decisions, partitioned by the decision_type label. + SchedulerPDDecisionCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: SchedulerSubsystem, + Name: "pd_decision_total", + Help: metrics.HelpMsgWithStability("Total number of P/D disaggregation decisions made", compbasemetrics.ALPHA), + }, + []string{"decision_type"}, // "decode-only" or "prefill-decode" + ) +) + +// GetCollectors returns all custom collectors for the llm-d-inference-scheduler. +func GetCollectors() []prometheus.Collector { + return []prometheus.Collector{ + SchedulerPDDecisionCount, + } +} + +// RecordPDDecision records the type of P/D disaggregation decision made. 
+func RecordPDDecision(decisionType string) { + SchedulerPDDecisionCount.WithLabelValues(decisionType).Inc() +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go new file mode 100644 index 00000000..98b3bc3b --- /dev/null +++ b/pkg/metrics/metrics_test.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "strings" + "testing" + + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestSchedulerPDDecisionCount(t *testing.T) { + RecordPDDecision(DecisionTypePrefillDecode) + RecordPDDecision(DecisionTypeDecodeOnly) + RecordPDDecision(DecisionTypePrefillDecode) + if err := testutil.CollectAndCompare(SchedulerPDDecisionCount, strings.NewReader(` + # HELP llm_d_inference_scheduler_pd_decision_total [ALPHA] Total number of P/D disaggregation decisions made + # TYPE llm_d_inference_scheduler_pd_decision_total counter + llm_d_inference_scheduler_pd_decision_total{decision_type="decode-only"} 1 + llm_d_inference_scheduler_pd_decision_total{decision_type="prefill-decode"} 2 + `), "decision_type"); err != nil { + t.Errorf("RecordPDDecision() failed: %v", err) + } +} diff --git a/pkg/plugins/profile/pd_profile_handler.go b/pkg/plugins/profile/pd_profile_handler.go index f91aaf28..a3fe3e75 100644 --- a/pkg/plugins/profile/pd_profile_handler.go +++ b/pkg/plugins/profile/pd_profile_handler.go @@ -9,6 +9,8 @@ import ( "net" "strconv" + "github.com/llm-d/llm-d-inference-scheduler/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" @@ -156,10 +158,12 @@ func (h *PdProfileHandler) Pick(ctx context.Context, cycleState *types.CycleStat if (1.0-hitPercentagePrefix)*float64(len(userInput)) < float64(h.pdThreshold) { log.FromContext(ctx).Info("Non-cached suffix is smaller than threshold, using decode profile only", "hitPercentage", hitPercentagePrefix) + metrics.RecordPDDecision(metrics.DecisionTypeDecodeOnly) 
return map[string]*framework.SchedulerProfile{} // do not run prefill } } + metrics.RecordPDDecision(metrics.DecisionTypePrefillDecode) // run the prefill profile return map[string]*framework.SchedulerProfile{ h.prefillProfile: profiles[h.prefillProfile],