Skip to content

Commit 3695e69

Browse files
authored
Add statusz endpoint with basic diagnostic information. (#131)
1 parent 9c78fd0 commit 3695e69

File tree

5 files changed

+461
-77
lines changed

5 files changed

+461
-77
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ FROM quay.io/prometheus/busybox:latest
22
LABEL maintainer "Stackdriver Engineering <engineering@stackdriver.com>"
33

44
COPY stackdriver-prometheus-sidecar /bin/stackdriver-prometheus-sidecar
5+
COPY cmd/stackdriver-prometheus-sidecar/statusz-tmpl.html /statusz-tmpl.html
56

67
USER nobody
78
EXPOSE 9091

cmd/stackdriver-prometheus-sidecar/main.go

Lines changed: 93 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -145,13 +145,13 @@ func init() {
145145
}
146146

147147
type kubernetesConfig struct {
148-
location string
149-
clusterName string
148+
Location string
149+
ClusterName string
150150
}
151151

152152
type genericConfig struct {
153-
location string
154-
namespace string
153+
Location string
154+
Namespace string
155155
}
156156

157157
type fileConfig struct {
@@ -173,95 +173,103 @@ type fileConfig struct {
173173
} `json:"aggregated_counters"`
174174
}
175175

176+
// Note: When adding a new config field, consider adding it to
177+
// statusz-tmpl.html
178+
type mainConfig struct {
179+
ConfigFilename string
180+
ProjectIDResource string
181+
KubernetesLabels kubernetesConfig
182+
GenericLabels genericConfig
183+
StackdriverAddress *url.URL
184+
MetricsPrefix string
185+
UseGKEResource bool
186+
StoreInFilesDirectory string
187+
WALDirectory string
188+
PrometheusURL *url.URL
189+
ListenAddress string
190+
EnableStatusz bool
191+
Filters []string
192+
Filtersets []string
193+
Aggregations retrieval.CounterAggregatorConfig
194+
MetricRenames map[string]string
195+
StaticMetadata []scrape.MetricMetadata
196+
UseRestrictedIPs bool
197+
manualResolver *manual.Resolver
198+
MonitoringBackends []string
199+
200+
LogLevel promlog.AllowedLevel
201+
}
202+
176203
func main() {
177204
if os.Getenv("DEBUG") != "" {
178205
runtime.SetBlockProfileRate(20)
179206
runtime.SetMutexProfileFraction(20)
180207
}
181208

182-
cfg := struct {
183-
configFilename string
184-
projectIdResource string
185-
kubernetesLabels kubernetesConfig
186-
genericLabels genericConfig
187-
stackdriverAddress *url.URL
188-
metricsPrefix string
189-
useGkeResource bool
190-
storeInFilesDirectory string
191-
walDirectory string
192-
prometheusURL *url.URL
193-
listenAddress string
194-
filters []string
195-
filtersets []string
196-
aggregations retrieval.CounterAggregatorConfig
197-
metricRenames map[string]string
198-
staticMetadata []scrape.MetricMetadata
199-
useRestrictedIps bool
200-
manualResolver *manual.Resolver
201-
monitoringBackends []string
202-
203-
logLevel promlog.AllowedLevel
204-
}{}
209+
var cfg mainConfig
205210

206211
a := kingpin.New(filepath.Base(os.Args[0]), "The Prometheus monitoring server")
207212

208213
a.Version(version.Print("prometheus"))
209214

210215
a.HelpFlag.Short('h')
211216

212-
a.Flag("config-file", "A configuration file.").StringVar(&cfg.configFilename)
217+
a.Flag("config-file", "A configuration file.").StringVar(&cfg.ConfigFilename)
213218

214219
projectId := a.Flag("stackdriver.project-id", "The Google project ID where Stackdriver will store the metrics.").
215220
Required().
216221
String()
217222

218223
a.Flag("stackdriver.api-address", "Address of the Stackdriver Monitoring API.").
219-
Default("https://monitoring.googleapis.com:443/").URLVar(&cfg.stackdriverAddress)
224+
Default("https://monitoring.googleapis.com:443/").URLVar(&cfg.StackdriverAddress)
220225

221226
a.Flag("stackdriver.use-restricted-ips", "If true, send all requests through restricted VIPs (EXPERIMENTAL).").
222-
Default("false").BoolVar(&cfg.useRestrictedIps)
227+
Default("false").BoolVar(&cfg.UseRestrictedIPs)
223228

224229
a.Flag("stackdriver.kubernetes.location", "Value of the 'location' label in the Kubernetes Stackdriver MonitoredResources.").
225-
StringVar(&cfg.kubernetesLabels.location)
230+
StringVar(&cfg.KubernetesLabels.Location)
226231

227232
a.Flag("stackdriver.kubernetes.cluster-name", "Value of the 'cluster_name' label in the Kubernetes Stackdriver MonitoredResources.").
228-
StringVar(&cfg.kubernetesLabels.clusterName)
233+
StringVar(&cfg.KubernetesLabels.ClusterName)
229234

230235
a.Flag("stackdriver.generic.location", "Location for metrics written with the generic resource, e.g. a cluster or data center name.").
231-
StringVar(&cfg.genericLabels.location)
236+
StringVar(&cfg.GenericLabels.Location)
232237

233238
a.Flag("stackdriver.generic.namespace", "Namespace for metrics written with the generic resource, e.g. a cluster or data center name.").
234-
StringVar(&cfg.genericLabels.namespace)
239+
StringVar(&cfg.GenericLabels.Namespace)
235240

236241
a.Flag("stackdriver.metrics-prefix", "Customized prefix for Stackdriver metrics. If not set, external.googleapis.com/prometheus will be used").
237-
StringVar(&cfg.metricsPrefix)
242+
StringVar(&cfg.MetricsPrefix)
238243

239244
a.Flag("stackdriver.use-gke-resource",
240245
"Whether to use the legacy gke_container MonitoredResource type instead of k8s_container").
241-
Default("false").BoolVar(&cfg.useGkeResource)
246+
Default("false").BoolVar(&cfg.UseGKEResource)
242247

243248
a.Flag("stackdriver.store-in-files-directory", "If specified, store the CreateTimeSeriesRequest protobuf messages to files under this directory, instead of sending protobuf messages to Stackdriver Monitoring API.").
244-
StringVar(&cfg.storeInFilesDirectory)
249+
StringVar(&cfg.StoreInFilesDirectory)
245250

246251
a.Flag("prometheus.wal-directory", "Directory from where to read the Prometheus TSDB WAL.").
247-
Default("data/wal").StringVar(&cfg.walDirectory)
252+
Default("data/wal").StringVar(&cfg.WALDirectory)
248253

249254
a.Flag("prometheus.api-address", "Address to listen on for UI, API, and telemetry.").
250-
Default("http://127.0.0.1:9090/").URLVar(&cfg.prometheusURL)
255+
Default("http://127.0.0.1:9090/").URLVar(&cfg.PrometheusURL)
251256

252257
a.Flag("monitoring.backend", "Monitoring backend(s) for internal metrics").Default("prometheus").
253-
EnumsVar(&cfg.monitoringBackends, "prometheus", "stackdriver")
258+
EnumsVar(&cfg.MonitoringBackends, "prometheus", "stackdriver")
254259

255260
a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry.").
256-
Default("0.0.0.0:9091").StringVar(&cfg.listenAddress)
261+
Default("0.0.0.0:9091").StringVar(&cfg.ListenAddress)
262+
263+
a.Flag("web.enable-statusz", "If true, then enables a /statusz endpoint on the web server with diagnostic information.").
264+
Default("true").BoolVar(&cfg.EnableStatusz)
257265

258266
a.Flag("include", "PromQL metric and label matcher which must pass for a series to be forwarded to Stackdriver. If repeated, the series must pass any of the filter sets to be forwarded.").
259-
StringsVar(&cfg.filtersets)
267+
StringsVar(&cfg.Filtersets)
260268

261269
a.Flag("filter", "PromQL-style matcher for a single label which must pass for a series to be forwarded to Stackdriver. If repeated, the series must pass all filters to be forwarded. Deprecated, please use --include instead.").
262-
StringsVar(&cfg.filters)
270+
StringsVar(&cfg.Filters)
263271

264-
promlogflag.AddFlags(a, &cfg.logLevel)
272+
promlogflag.AddFlags(a, &cfg.LogLevel)
265273

266274
_, err := a.Parse(os.Args[1:])
267275
if err != nil {
@@ -270,25 +278,25 @@ func main() {
270278
os.Exit(2)
271279
}
272280

273-
logger := promlog.New(cfg.logLevel)
274-
if cfg.configFilename != "" {
275-
cfg.metricRenames, cfg.staticMetadata, cfg.aggregations, err = parseConfigFile(cfg.configFilename)
281+
logger := promlog.New(cfg.LogLevel)
282+
if cfg.ConfigFilename != "" {
283+
cfg.MetricRenames, cfg.StaticMetadata, cfg.Aggregations, err = parseConfigFile(cfg.ConfigFilename)
276284
if err != nil {
277-
msg := fmt.Sprintf("Parse config file %s", cfg.configFilename)
285+
msg := fmt.Sprintf("Parse config file %s", cfg.ConfigFilename)
278286
level.Error(logger).Log("msg", msg, "err", err)
279287
os.Exit(2)
280288
}
281289

282290
// Enable Stackdriver monitoring backend if counter aggregator configuration is present.
283-
if len(cfg.aggregations) > 0 {
291+
if len(cfg.Aggregations) > 0 {
284292
sdEnabled := false
285-
for _, backend := range cfg.monitoringBackends {
293+
for _, backend := range cfg.MonitoringBackends {
286294
if backend == "stackdriver" {
287295
sdEnabled = true
288296
}
289297
}
290298
if !sdEnabled {
291-
cfg.monitoringBackends = append(cfg.monitoringBackends, "stackdriver")
299+
cfg.MonitoringBackends = append(cfg.MonitoringBackends, "stackdriver")
292300
}
293301
}
294302
}
@@ -304,7 +312,7 @@ func main() {
304312
*projectId = getGCEProjectID()
305313
}
306314

307-
for _, backend := range cfg.monitoringBackends {
315+
for _, backend := range cfg.MonitoringBackends {
308316
switch backend {
309317
case "prometheus":
310318
promExporter, err := oc_prometheus.NewExporter(oc_prometheus.Options{
@@ -332,10 +340,10 @@ func main() {
332340

333341
var staticLabels = map[string]string{
334342
retrieval.ProjectIDLabel: *projectId,
335-
retrieval.KubernetesLocationLabel: cfg.kubernetesLabels.location,
336-
retrieval.KubernetesClusterNameLabel: cfg.kubernetesLabels.clusterName,
337-
retrieval.GenericLocationLabel: cfg.genericLabels.location,
338-
retrieval.GenericNamespaceLabel: cfg.genericLabels.namespace,
343+
retrieval.KubernetesLocationLabel: cfg.KubernetesLabels.Location,
344+
retrieval.KubernetesClusterNameLabel: cfg.KubernetesLabels.ClusterName,
345+
retrieval.GenericLocationLabel: cfg.GenericLabels.Location,
346+
retrieval.GenericNamespaceLabel: cfg.GenericLabels.Namespace,
339347
}
340348
fillMetadata(&staticLabels)
341349
for k, v := range staticLabels {
@@ -344,14 +352,14 @@ func main() {
344352
}
345353
}
346354

347-
filtersets, err := parseFiltersets(logger, cfg.filtersets, cfg.filters)
355+
filtersets, err := parseFiltersets(logger, cfg.Filtersets, cfg.Filters)
348356
if err != nil {
349357
level.Error(logger).Log("msg", "Error parsing --include (or --filter)", "err", err)
350358
os.Exit(2)
351359
}
352360

353-
cfg.projectIdResource = fmt.Sprintf("projects/%v", *projectId)
354-
if cfg.useRestrictedIps {
361+
cfg.ProjectIDResource = fmt.Sprintf("projects/%v", *projectId)
362+
if cfg.UseRestrictedIPs {
355363
// manual.GenerateAndRegisterManualResolver generates a Resolver and a random scheme.
356364
// It also registers the resolver. rb.InitialAddrs seeds the resolver with the
356364
// restricted VIP addresses that GCP API hostnames should resolve to.
@@ -364,23 +372,23 @@ func main() {
364372
{Addr: "199.36.153.7:443"},
365373
})
366374
}
367-
targetsURL, err := cfg.prometheusURL.Parse(targets.DefaultAPIEndpoint)
375+
targetsURL, err := cfg.PrometheusURL.Parse(targets.DefaultAPIEndpoint)
368376
if err != nil {
369377
panic(err)
370378
}
371379
targetCache := targets.NewCache(logger, httpClient, targetsURL)
372380

373-
metadataURL, err := cfg.prometheusURL.Parse(metadata.DefaultEndpointPath)
381+
metadataURL, err := cfg.PrometheusURL.Parse(metadata.DefaultEndpointPath)
374382
if err != nil {
375383
panic(err)
376384
}
377-
metadataCache := metadata.NewCache(httpClient, metadataURL, cfg.staticMetadata)
385+
metadataCache := metadata.NewCache(httpClient, metadataURL, cfg.StaticMetadata)
378386

379387
// We instantiate a context here since the tailer is used by two other components.
380388
// The context will be used in the lifecycle of prometheusReader further down.
381389
ctx, cancel := context.WithCancel(context.Background())
382390

383-
tailer, err := tail.Tail(ctx, cfg.walDirectory)
391+
tailer, err := tail.Tail(ctx, cfg.WALDirectory)
384392
if err != nil {
385393
level.Error(logger).Log("msg", "Tailing WAL failed", "err", err)
386394
os.Exit(1)
@@ -399,23 +407,23 @@ func main() {
399407

400408
var scf stackdriver.StorageClientFactory
401409

402-
if len(cfg.storeInFilesDirectory) > 0 {
403-
err := os.MkdirAll(cfg.storeInFilesDirectory, 0700)
410+
if len(cfg.StoreInFilesDirectory) > 0 {
411+
err := os.MkdirAll(cfg.StoreInFilesDirectory, 0700)
404412
if err != nil {
405413
level.Error(logger).Log(
406414
"msg", "Failure creating directory.",
407415
"err", err)
408416
os.Exit(1)
409417
}
410418
scf = &fileClientFactory{
411-
dir: cfg.storeInFilesDirectory,
419+
dir: cfg.StoreInFilesDirectory,
412420
logger: log.With(logger, "component", "storage"),
413421
}
414422
} else {
415423
scf = &stackdriverClientFactory{
416424
logger: log.With(logger, "component", "storage"),
417-
projectIdResource: cfg.projectIdResource,
418-
url: cfg.stackdriverAddress,
425+
projectIdResource: cfg.ProjectIDResource,
426+
url: cfg.StackdriverAddress,
419427
timeout: 10 * time.Second,
420428
manualResolver: cfg.manualResolver,
421429
}
@@ -434,7 +442,7 @@ func main() {
434442

435443
counterAggregator, err := retrieval.NewCounterAggregator(
436444
log.With(logger, "component", "counter_aggregator"),
437-
&cfg.aggregations)
445+
&cfg.Aggregations)
438446
if err != nil {
439447
level.Error(logger).Log("msg", "Creating counter aggregator failed", "err", err)
440448
os.Exit(1)
@@ -443,15 +451,15 @@ func main() {
443451

444452
prometheusReader := retrieval.NewPrometheusReader(
445453
log.With(logger, "component", "Prometheus reader"),
446-
cfg.walDirectory,
454+
cfg.WALDirectory,
447455
tailer,
448456
filtersets,
449-
cfg.metricRenames,
457+
cfg.MetricRenames,
450458
retrieval.TargetsWithDiscoveredLabels(targetCache, labels.FromMap(staticLabels)),
451459
metadataCache,
452460
queueManager,
453-
cfg.metricsPrefix,
454-
cfg.useGkeResource,
461+
cfg.MetricsPrefix,
462+
cfg.UseGKEResource,
455463
counterAggregator,
456464
)
457465

@@ -471,6 +479,14 @@ func main() {
471479

472480
http.Handle("/metrics", promhttp.Handler())
473481

482+
if cfg.EnableStatusz {
483+
http.Handle("/statusz", &statuszHandler{
484+
logger: logger,
485+
projectId: *projectId,
486+
cfg: &cfg,
487+
})
488+
}
489+
474490
var g group.Group
475491
{
476492
ctx, cancel := context.WithCancel(context.Background())
@@ -507,16 +523,16 @@ func main() {
507523
// depends on to exit properly.
508524
g.Add(
509525
func() error {
510-
startOffset, err := retrieval.ReadProgressFile(cfg.walDirectory)
526+
startOffset, err := retrieval.ReadProgressFile(cfg.WALDirectory)
511527
if err != nil {
512528
level.Warn(logger).Log("msg", "reading progress file failed", "err", err)
513529
startOffset = 0
514530
}
515531
// Write the file again once to ensure we have write permission on startup.
516-
if err := retrieval.SaveProgressFile(cfg.walDirectory, startOffset); err != nil {
532+
if err := retrieval.SaveProgressFile(cfg.WALDirectory, startOffset); err != nil {
517533
return err
518534
}
519-
waitForPrometheus(ctx, logger, cfg.prometheusURL)
535+
waitForPrometheus(ctx, logger, cfg.PrometheusURL)
520536
// Sleep a fixed amount of time to allow the first scrapes to complete.
521537
select {
522538
case <-time.After(time.Minute):
@@ -557,7 +573,7 @@ func main() {
557573
{
558574
cancel := make(chan struct{})
559575
server := &http.Server{
560-
Addr: cfg.listenAddress,
576+
Addr: cfg.ListenAddress,
561577
}
562578
g.Add(
563579
func() error {

0 commit comments

Comments
 (0)