From 8e43aabeb8e535c8d1fe6b1b934252466264ed56 Mon Sep 17 00:00:00 2001 From: schmikei Date: Mon, 10 Nov 2025 14:56:06 -0500 Subject: [PATCH 1/5] modernize the discourse mixin --- discourse-mixin/.lint | 2 + discourse-mixin/alerts.libsonnet | 45 + discourse-mixin/config.libsonnet | 38 +- discourse-mixin/dashboards.libsonnet | 86 ++ .../dashboards_out/discourse-jobs.json | 801 +++++------------ .../dashboards_out/discourse-overview.json | 803 +++++------------- discourse-mixin/g.libsonnet | 3 + discourse-mixin/jsonnetfile.json | 35 +- discourse-mixin/links.libsonnet | 13 + discourse-mixin/main.libsonnet | 45 + discourse-mixin/mixin.libsonnet | 19 +- discourse-mixin/panels.libsonnet | 140 +++ .../prometheus_alerts.yaml | 8 +- discourse-mixin/rows.libsonnet | 62 ++ discourse-mixin/signals/jobs.libsonnet | 132 +++ discourse-mixin/signals/overview.libsonnet | 115 +++ 16 files changed, 1169 insertions(+), 1178 deletions(-) create mode 100644 discourse-mixin/alerts.libsonnet create mode 100644 discourse-mixin/dashboards.libsonnet create mode 100644 discourse-mixin/g.libsonnet create mode 100644 discourse-mixin/links.libsonnet create mode 100644 discourse-mixin/main.libsonnet create mode 100644 discourse-mixin/panels.libsonnet create mode 100644 discourse-mixin/rows.libsonnet create mode 100644 discourse-mixin/signals/jobs.libsonnet create mode 100644 discourse-mixin/signals/overview.libsonnet diff --git a/discourse-mixin/.lint b/discourse-mixin/.lint index c25f83b67..cfb8e524b 100644 --- a/discourse-mixin/.lint +++ b/discourse-mixin/.lint @@ -10,3 +10,5 @@ exclusions: - panel: "Sidekiq Workers" template-instance-rule: reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'" + panel-datasource-rule: + reason: "Modern mixins use signal-based architecture where datasource references are handled by the framework" diff --git a/discourse-mixin/alerts.libsonnet b/discourse-mixin/alerts.libsonnet 
new file mode 100644 index 000000000..6111038ca --- /dev/null +++ b/discourse-mixin/alerts.libsonnet @@ -0,0 +1,45 @@ +{ + new(this): { // this.config supplies uid, filteringSelector and the two alert thresholds used below + groups: [ + { + name: this.config.uid + '-alerts', + rules: [ // both sides of each ratio are aggregated by (job, instance) so the division matches one-to-one even when several instances are scraped + { + alert: 'DiscourseHigh5xxErrors', // percentage of requests answered with any 5xx status, per job/instance + expr: ||| + 100 * sum by (job, instance) (rate(discourse_http_requests{status=~"5..", %(filteringSelector)s}[5m])) / sum by (job, instance) (rate(discourse_http_requests{%(filteringSelector)s}[5m])) > %(alertsCritical5xxResponses)s + ||| % this.config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'More than %(alertsCritical5xxResponses)s%% of all requests result in a 5XX.' % this.config, + description: + ('{{ printf "%%.2f" $value }}%% of all requests are resulting in 5XX status codes, ' + + 'which is above the threshold %(alertsCritical5xxResponses)s%%, ' + + 'indicating a potentially larger issue for {{$labels.instance}}') % this.config, + }, + }, + { + alert: 'DiscourseHigh4xxErrors', // percentage of requests answered with any 4xx status, per job/instance + expr: ||| + 100 * sum by (job, instance) (rate(discourse_http_requests{status=~"4..", %(filteringSelector)s}[5m])) / sum by (job, instance) (rate(discourse_http_requests{%(filteringSelector)s}[5m])) > %(alertsWarning4xxResponses)s + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'More than %(alertsWarning4xxResponses)s%% of all requests result in a 4XX.' 
% this.config, + description: + ('{{ printf "%%.2f" $value }}%% of all requests are resulting in 4XX status codes, ' + + 'which is above the threshold %(alertsWarning4xxResponses)s%%, ' + + 'indicating a potentially larger issue for {{$labels.instance}}') % this.config, + }, + }, + ], + }, + ], + }, +} diff --git a/discourse-mixin/config.libsonnet b/discourse-mixin/config.libsonnet index ea47fc2e9..182103ff8 100644 --- a/discourse-mixin/config.libsonnet +++ b/discourse-mixin/config.libsonnet @@ -1,12 +1,32 @@ { - _config+:: { - dashboardTags: ['discourse-mixin'], - dashboardPeriod: 'now-1h', - dashboardTimezone: 'default', - dashboardRefresh: '1m', - - // for alerts - alertsCritical5xxResponses: '10', // % - alertsWarning4xxResponses: '30', // % + local this = self, + + // Filtering + filteringSelector: 'job="integrations/discourse"', + groupLabels: ['job'], + instanceLabels: ['instance'], + + // Dashboard settings + dashboardTags: ['discourse-mixin'], + dashboardPeriod: 'now-1h', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + dashboardNamePrefix: 'Discourse', + uid: 'discourse', + + // Logs configuration + enableLokiLogs: false, + + // Alert thresholds + alertsCritical5xxResponses: 10, // % + alertsWarning4xxResponses: 30, // % + + // Metrics source + metricsSource: 'prometheus', + + // Signal categories + signals: { + overview: (import './signals/overview.libsonnet')(this), + jobs: (import './signals/jobs.libsonnet')(this), }, } diff --git a/discourse-mixin/dashboards.libsonnet b/discourse-mixin/dashboards.libsonnet new file mode 100644 index 000000000..22b56e9c5 --- /dev/null +++ b/discourse-mixin/dashboards.libsonnet @@ -0,0 +1,86 @@ +local g = import './g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + local root = self, + new(this): + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = this.config.uid; + local vars = 
commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + instanceLabels=this.config.instanceLabels, + varMetric='discourse_page_views', + customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, + ); + local annotations = {}; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardPeriod; + local timezone = this.config.dashboardTimezone; + + { + 'discourse-overview.json': + g.dashboard.new(prefix + ' overview') + + g.dashboard.withDescription('Overview of Discourse application performance and traffic.') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.overviewRow, + this.grafana.rows.latencyRow, + ] + ) + ) + ) + + root.applyCommon( + vars.multiInstance, + uid + '-overview', + tags, + links { overview+:: {} }, // hide the `overview` entry so std.objectValues() in applyCommon leaves out this dashboard's link to itself + annotations, + timezone, + refresh, + period + ), + + 'discourse-jobs.json': + g.dashboard.new(prefix + ' jobs processing') + + g.dashboard.withDescription('Discourse job processing, workers, and memory usage.') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.jobStatsRow, + this.grafana.rows.jobCountsRow, + this.grafana.rows.jobDurationRow, + this.grafana.rows.memoryRow, + ] + ) + ) + ) + + root.applyCommon( + vars.multiInstance, + uid + '-jobs', + tags, + links { jobs+:: {} }, // hide the `jobs` entry so std.objectValues() in applyCommon leaves out this dashboard's link to itself + annotations, + timezone, + refresh, + period + ), + }, + + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): // dashboard mixin shared by both dashboards: tags, uid, links, time settings, variables, annotations, shared crosshair + g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + 
g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)) + + g.dashboard.graphTooltip.withSharedCrosshair(), +} diff --git 
a/discourse-mixin/dashboards_out/discourse-jobs.json b/discourse-mixin/dashboards_out/discourse-jobs.json index 7827ee5e9..b5a39fb50 100644 --- a/discourse-mixin/dashboards_out/discourse-jobs.json +++ b/discourse-mixin/dashboards_out/discourse-jobs.json @@ -1,38 +1,41 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "editable": false, - "gnetId": null, + "description": "Discourse job processing, workers, and memory usage.", "graphTooltip": 1, - "hideControls": false, - "id": null, "links": [ { - "asDropdown": false, - "icon": "external link", - "includeVars": true, "keepTime": true, - "tags": [ - "discourse-mixin" - ], - "targetBlank": false, - "title": "Other Discourse dashboards", - "type": "dashboards", - "url": "" + "title": "Discourse overview", + "type": "link", + "url": "/d/discourse-overview" } ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Job statistics", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Current number of Sidekiq Workers.", "fieldConfig": { "defaults": { "color": { + "fixedColor": "text", "mode": "thresholds" }, "mappings": [ @@ -46,73 +49,52 @@ "type": "special" } ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 5, "w": 8, "x": 0, - "y": 0 + "y": 1 }, "id": 2, - "links": [ ], - "maxDataPoints": 100, "options": { - "colorMode": "none", "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" - ], - "fields": "", - "values": false + ] }, "textMode": "auto" }, - "pluginVersion": "9.1.8", + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": 
"${datasource}" }, - "editorMode": "code", - "expr": "count(discourse_rss{type=\"sidekiq\",instance=~\"$instance\",job=~\"$job\"})", + "expr": "count(discourse_rss{type=\"sidekiq\",job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "range": true, - "refId": "A", - "step": 40, - "target": "" + "instant": false, + "legendFormat": "{{job}}: Sidekiq worker count", + "refId": "Sidekiq worker count" } ], - "title": "Sidekiq Workers", + "title": "Sidekiq workers", "type": "stat" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Current number of Web Workers.", "fieldConfig": { "defaults": { "color": { + "fixedColor": "text", "mode": "thresholds" }, "mappings": [ @@ -126,73 +108,52 @@ "type": "special" } ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 5, "w": 8, "x": 8, - "y": 0 + "y": 1 }, "id": 3, - "links": [ ], - "maxDataPoints": 100, "options": { - "colorMode": "none", "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" - ], - "fields": "", - "values": false + ] }, "textMode": "auto" }, - "pluginVersion": "9.1.8", + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "editorMode": "code", - "expr": "count(discourse_rss{type='web',instance=~\"$instance\",job=~\"$job\"})", + "expr": "count(discourse_rss{type=\"web\",job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"})", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "range": true, - "refId": "A", - "step": 40, - "target": "" + "instant": false, + "legendFormat": "{{job}}: Web worker count", + "refId": 
"Web worker count" } ], - "title": "Web Workers", + "title": "Web workers", "type": "stat" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Current number of jobs in Sidekiq queue.", "fieldConfig": { "defaults": { "color": { + "fixedColor": "text", "mode": "thresholds" }, "mappings": [ @@ -206,675 +167,407 @@ "type": "special" } ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 5, "w": 8, "x": 16, - "y": 0 + "y": 1 }, "id": 4, - "links": [ ], - "maxDataPoints": 100, "options": { - "colorMode": "none", "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" - ], - "fields": "", - "values": false + ] }, "textMode": "auto" }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "editorMode": "code", - "expr": "max(discourse_sidekiq_jobs_enqueued{instance=~\"$instance\",job=~\"$job\"})", + "expr": "max by (job) (\n discourse_sidekiq_jobs_enqueued{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "range": true, - "refId": "A", - "step": 40, - "target": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Sidekiq jobs enqueued" } ], - "title": "Sidekiq Queued", + "title": "Sidekiq queued", "type": "stat" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 6 + }, + "id": 5, + "panels": [ ], + "title": "Job counts", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "The number of scheduled jobs ran over an interval.", "fieldConfig": { "defaults": { - "color": { - 
"mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ ] + "unit": "none" + } }, "gridPos": { "h": 6, "w": 12, "x": 0, - "y": 6 + "y": 7 }, - "id": 5, + "id": 6, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(discourse_scheduled_job_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])", + "expr": "sum by (job) (\n increase(discourse_scheduled_job_count{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__interval:] offset -$__interval)\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{job_name}}" + "instant": false, + "legendFormat": "{{job_name}}", + "refId": "Scheduled job count" } ], - "title": "Scheduled Jobs", + "title": "Scheduled jobs", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": 
"${datasource}" }, "description": "The amount of sidekiq jobs ran over an interval.", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "fixedColor": "text", + "mode": "fixed" }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ ] + "unit": "none" + } }, "gridPos": { "h": 6, "w": 12, "x": 12, - "y": 6 + "y": 7 }, - "id": 6, + "id": 7, "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" + "colorMode": "fixed", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(discourse_sidekiq_job_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])", + "expr": "sum by (job) (\n increase(discourse_sidekiq_job_count{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__interval:] offset -$__interval)\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{job_name}}" + "instant": false, + "legendFormat": "{{job_name}}", + "refId": "Sidekiq job count" } ], - "title": "Sidekiq Jobs", - "type": "timeseries" + "title": "Sidekiq 
jobs", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 13 + }, + "id": 8, + "panels": [ ], + "title": "Duration", + "type": "row" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Time spent in scheduled jobs broken out by job name.", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fixedColor": "text", + "mode": "fixed" }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 0, - "y": 12 + "y": 14 }, - "id": 7, - "links": [ ], - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "9.1.8", + "id": 9, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(rate(discourse_scheduled_job_duration_seconds{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])) by (job_name)", + "expr": "sum by (job,job_name) (\n 
rate(discourse_scheduled_job_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{job_name}}" + "instant": false, + "legendFormat": "{{job_name}}", + "refId": "Scheduled job duration" } ], - "title": "Scheduled Job Duration", - "type": "timeseries" + "title": "Scheduled job duration", + "type": "stat" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Time spent in Sidekiq jobs broken out by job name.", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fixedColor": "text", + "mode": "fixed" }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 12, - "y": 12 + "y": 14 }, - "id": 8, - "links": [ ], - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "9.1.8", + "id": 10, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": 
"sum(rate(discourse_sidekiq_job_duration_seconds{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])) by (job_name)", + "expr": "sum by (job,job_name) (\n rate(discourse_sidekiq_job_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{job_name}}" + "instant": false, + "legendFormat": "{{job_name}}", + "refId": "Sidekiq job duration" } ], - "title": "Sidekiq Job Duration", - "type": "timeseries" + "title": "Sidekiq job duration", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 20 + }, + "id": 11, + "panels": [ ], + "title": "Memory", + "type": "row" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Total RSS Memory used by process. Broken up by pid.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "bytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 0, - "y": 18 + "y": 21 }, - "id": 9, + "id": 12, "options": { "legend": { "calcs": [ ], 
- "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(discourse_rss{instance=~\"$instance\",job=~\"$job\"}) by (pid)", + "expr": "sum by (job) (\n discourse_rss{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "pid: {{pid}}" + "instant": false, + "legendFormat": "pid: {{pid}}", + "refId": "RSS memory" } ], - "title": "Used RSS Memory", + "title": "Used RSS memory", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Current heap size of V8 engine. Broken up by process type", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "bytes" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 12, - "y": 18 + "y": 21 }, - "id": 10, + 
"id": 13, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(discourse_v8_used_heap_size{instance=~\"$instance\",job=~\"$job\"}) by (type)", + "expr": "sum by (job) (\n discourse_v8_used_heap_size{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}" + "instant": false, + "legendFormat": "{{type}}", + "refId": "V8 heap size" } ], - "title": "V8 Heap Size", + "title": "V8 heap size", "type": "timeseries" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "discourse-mixin" ], "templating": { "list": [ { - "hide": 0, "label": "Data source", - "name": "prometheus_datasource", + "name": "datasource", "query": "prometheus", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { - "text": "", - "value": "" + "datasource": { + "type": "prometheus", + "uid": "${datasource}" }, - "datasource": "$prometheus_datasource", - "hide": 0, "includeAll": true, - "label": "instance", + "label": "Job", "multi": true, - "name": "instance", - "options": [ ], - "query": "label_values(discourse_page_views, instance)", + "name": "job", + "query": "label_values(discourse_page_views{job=\"integrations/discourse\"}, job)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "hide": 0, "includeAll": true, - "label": "Job", + 
"label": "Instance", "multi": true, - "name": "job", - "options": [ ], - "query": "label_values(discourse_page_views{}, job)", + "name": "instance", + "query": "label_values(discourse_page_views{job=\"integrations/discourse\",job=~\"$job\"}, instance)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, @@ -882,33 +575,7 @@ "from": "now-1h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", - "title": "Discourse Jobs Processing", - "uid": "discourse-jobs", - "version": 0 + "title": "Discourse jobs processing", + "uid": "discourse-jobs" } \ No newline at end of file diff --git a/discourse-mixin/dashboards_out/discourse-overview.json b/discourse-mixin/dashboards_out/discourse-overview.json index 261198525..f127c2ba0 100644 --- a/discourse-mixin/dashboards_out/discourse-overview.json +++ b/discourse-mixin/dashboards_out/discourse-overview.json @@ -1,27 +1,15 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "editable": false, - "gnetId": null, + "description": "Overview of Discourse application performance and traffic.", "graphTooltip": 1, - "hideControls": false, - "id": null, "links": [ { - "asDropdown": false, - "icon": "external link", - "includeVars": true, "keepTime": true, - "tags": [ - "discourse-mixin" - ], - "targetBlank": false, - "title": "Other discourse dashboards", - "type": "dashboards", - "url": "" + "title": "Discourse jobs", + "type": "link", + "url": "/d/discourse-jobs" } ], "panels": [ @@ -29,71 +17,34 @@ "collapsed": false, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, "y": 0 }, - "id": 2, + "id": 1, + "panels": [ ], "title": "Overview", "type": "row" }, { "datasource": { - "uid": 
"${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Rate of HTTP traffic over time for the entire application. Grouped by response code.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No traffic", "unit": "reqps" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, @@ -101,90 +52,53 @@ "x": 0, "y": 1 }, - "id": 3, + "id": 2, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(rate(discourse_http_requests{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])) by (status)", + "expr": "sum by (job,status) (\n rate(discourse_http_requests{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": 
"{{status}}" + "instant": false, + "legendFormat": "{{ api }} - {{ verb }} - {{ status }}", + "refId": "HTTP requests" } ], - "title": "Traffic by Response Code", + "title": "Traffic by response code", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Active web requests for the entire application", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No traffic", "unit": "reqps" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, @@ -192,90 +106,53 @@ "x": 12, "y": 1 }, - "id": 4, + "id": 3, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "discourse_active_app_reqs{instance=~\"$instance\",job=~\"$job\"}", + "expr": "sum by (job) (\n 
discourse_active_app_reqs{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Active requests" } ], - "title": "Active Requests", + "title": "Active requests", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Queued web requests for the entire application.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No traffic", "unit": "reqps" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, @@ -283,100 +160,53 @@ "x": 0, "y": 7 }, - "id": 5, + "id": 4, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": 
"discourse_queued_app_reqs{instance=~\"$instance\",job=~\"$job\"}", + "expr": "sum by (job) (\n discourse_queued_app_reqs{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Queued requests" } ], - "title": "Queued Requests", + "title": "Queued requests", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, "description": "Rate of pageviews for the entire application. Grouped by type and service.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "views/sec" - }, - "overrides": [ ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No traffic", + "unit": "none" + } }, "gridPos": { "h": 6, @@ -384,472 +214,307 @@ "x": 12, "y": 7 }, - "id": 6, + "id": 5, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, 
"tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "rate(discourse_page_views{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])", + "expr": "sum by (job) (\n rate(discourse_page_views{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "Page views" } ], - "title": "Page Views", + "title": "Page views", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, - "y": 12 + "y": 13 }, - "id": 7, + "id": 6, + "panels": [ ], "title": "Latency", "type": "row" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "description": "The median amount of time for “latest” page requests for the selected site.", + "description": "The median amount of time for \"latest\" page requests for the selected site.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + 
"gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No packets", "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 0, - "y": 13 + "y": 14 }, - "id": 8, + "id": 7, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(discourse_http_duration_seconds{quantile=\"0.5\",action=\"latest\",instance=~\"$instance\",job=~\"$job\"}) by (controller)", + "expr": "sum by (job) (\n discourse_http_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\",quantile=\"0.5\",action=\"latest\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{controller}}" + "instant": false, + "legendFormat": "{{ instance }}", + "refId": "Latest median request time" } ], - "title": "Latest Median Request Time", + "title": "Latest median request time", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "description": "The median amount of time for “topics show” requests for the selected site.", + "description": "The median amount of time for \"topics show\" requests for the selected site.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - 
"scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No packets", "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 12, - "y": 13 + "y": 14 }, - "id": 9, + "id": 8, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(discourse_http_duration_seconds{quantile=\"0.5\",controller=\"topics\",instance=~\"$instance\",job=~\"$job\"}) by (controller)", + "expr": "sum by (job) (\n discourse_http_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\",quantile=\"0.5\",controller=\"topics\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{controller}}" + "instant": false, + "legendFormat": "{{ instance }}", + "refId": "Topic median request time" } ], - "title": "Topic Show Median Request Time", + "title": "Topic show median request time", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "description": "The 99th percentile amount of time for “latest” page requests for the selected site.", + "description": "The 99th percentile amount of time for \"latest\" page requests for the selected site.", "fieldConfig": { 
"defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No packets", "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 0, - "y": 18 + "y": 20 }, - "id": 10, + "id": 9, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum(discourse_http_duration_seconds{quantile=\"0.99\",action=\"latest\",instance=~\"$instance\",job=~\"$job\"}) by (controller)", + "expr": "sum by (job) (\n discourse_http_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\",quantile=\"0.99\",action=\"latest\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{controller}}" + "instant": false, + "legendFormat": "{{ instance }}", + "refId": "Latest 99th percentile request time" } ], - "title": "Latest 99th percentile Request 
Time", + "title": "Latest 99th percentile request time", "type": "timeseries" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "description": "The 99th percentile amount of time for “topics show” requests for the selected site.", + "description": "The 99th percentile amount of time for \"topics show\" requests for the selected site.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" + }, + "decimals": 1, + "noValue": "No packets", "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 6, "w": 12, "x": 12, - "y": 18 + "y": 20 }, - "id": 11, + "id": 10, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "discourse_http_duration_seconds{quantile=\"0.99\",controller=\"topics\",instance=~\"$instance\",job=~\"$job\"}", + 
"expr": "sum by (job) (\n discourse_http_duration_seconds{job=\"integrations/discourse\",job=~\"$job\",instance=~\"$instance\",quantile=\"0.99\",controller=\"topics\"}\n)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{controller}}" + "instant": false, + "legendFormat": "{{ instance }}", + "refId": "Topic 99th percentile request time" } ], - "title": "Topic Show 99th percentile Request Time", + "title": "Topic show 99th percentile request time", "type": "timeseries" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "discourse-mixin" ], "templating": { "list": [ { - "hide": 0, "label": "Data source", - "name": "prometheus_datasource", + "name": "datasource", "query": "prometheus", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { - "text": "", - "value": "" + "datasource": { + "type": "prometheus", + "uid": "${datasource}" }, - "datasource": "$prometheus_datasource", - "hide": 0, "includeAll": true, - "label": "instance", + "label": "Job", "multi": true, - "name": "instance", - "options": [ ], - "query": "label_values(discourse_page_views{}, instance)", + "name": "job", + "query": "label_values(discourse_page_views{job=\"integrations/discourse\"}, job)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "hide": 0, "includeAll": true, - "label": "Job", + "label": "Instance", "multi": true, - "name": "job", - "options": [ ], - "query": "label_values(discourse_page_views{}, job)", + "name": "instance", + "query": "label_values(discourse_page_views{job=\"integrations/discourse\",job=~\"$job\"}, instance)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", 
- "type": "query", - "useTags": false + "type": "query" } ] }, @@ -857,33 +522,7 @@ "from": "now-1h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", - "title": "Discourse Overview", - "uid": "discourse-overview", - "version": 0 + "title": "Discourse overview", + "uid": "discourse-overview" } \ No newline at end of file diff --git a/discourse-mixin/g.libsonnet b/discourse-mixin/g.libsonnet new file mode 100644 index 000000000..ba90fd9b0 --- /dev/null +++ b/discourse-mixin/g.libsonnet @@ -0,0 +1,3 @@ +// grafonnet must be imported with "g" alias +local g = import './vendor/grafonnet-v11.0.0/main.libsonnet'; +g diff --git a/discourse-mixin/jsonnetfile.json b/discourse-mixin/jsonnetfile.json index 64258d167..46d5af011 100644 --- a/discourse-mixin/jsonnetfile.json +++ b/discourse-mixin/jsonnetfile.json @@ -1,15 +1,24 @@ { - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" - } - }, - "version": "master" + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet", + "subdir": "gen/grafonnet-v11.0.0" } - ], - "legacyImports": true -} \ No newline at end of file + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/discourse-mixin/links.libsonnet b/discourse-mixin/links.libsonnet new file mode 100644 index 000000000..c4bea87c6 --- /dev/null +++ b/discourse-mixin/links.libsonnet @@ -0,0 +1,13 @@ +local g = import './g.libsonnet'; + +{ + new(this): { + overview: + g.dashboard.link.link.new(this.config.dashboardNamePrefix + ' overview', 
'/d/' + this.config.uid + '-overview') + + g.dashboard.link.link.options.withKeepTime(true), + + jobs: + g.dashboard.link.link.new(this.config.dashboardNamePrefix + ' jobs', '/d/' + this.config.uid + '-jobs') + + g.dashboard.link.link.options.withKeepTime(true), + }, +} diff --git a/discourse-mixin/main.libsonnet b/discourse-mixin/main.libsonnet new file mode 100644 index 000000000..1b7dd4ce8 --- /dev/null +++ b/discourse-mixin/main.libsonnet @@ -0,0 +1,45 @@ +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local g = import './g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + withConfigMixin(config): { + config+: config, + }, + + new(): { + local this = self, + config: config, + + signals: + { + [sig]: commonlib.signals.unmarshallJsonMulti( + this.config.signals[sig], + type=this.config.metricsSource + ) + for sig in std.objectFields(this.config.signals) + }, + + grafana: { + variables: commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + instanceLabels=this.config.instanceLabels, + varMetric='discourse_page_views', + customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, + ), + links: (import './links.libsonnet').new(this), + panels: (import './panels.libsonnet').new(this), + rows: (import './rows.libsonnet').new(this), + dashboards: dashboards.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: {}, + }, + }, +} diff --git a/discourse-mixin/mixin.libsonnet b/discourse-mixin/mixin.libsonnet index 119d2cdde..cba9e1794 100644 --- a/discourse-mixin/mixin.libsonnet +++ b/discourse-mixin/mixin.libsonnet @@ -1,3 +1,16 @@ -(import 'alerts/alerts.libsonnet') + -(import 'dashboards/dashboards.libsonnet') + -(import 'config.libsonnet') +local config = import './config.libsonnet'; +local lib = import './main.libsonnet'; + +local discourse = + 
lib.new() + + lib.withConfigMixin({ + filteringSelector: config.filteringSelector, + uid: config.uid, + enableLokiLogs: config.enableLokiLogs, + }); + +{ + grafanaDashboards+:: discourse.grafana.dashboards, + prometheusAlerts+:: discourse.prometheus.alerts, + prometheusRules+:: discourse.prometheus.recordingRules, +} diff --git a/discourse-mixin/panels.libsonnet b/discourse-mixin/panels.libsonnet new file mode 100644 index 000000000..d3e3feec2 --- /dev/null +++ b/discourse-mixin/panels.libsonnet @@ -0,0 +1,140 @@ +local g = import './g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(this): { + local signals = this.signals, + + // Overview dashboard panels + trafficByResponseCode: + commonlib.panels.network.timeSeries.traffic.new( + 'Traffic by response code', + targets=[signals.overview.httpRequests.asTarget()] + ) + + g.panel.timeSeries.panelOptions.withDescription('Rate of HTTP traffic over time for the entire application. Grouped by response code.') + + g.panel.timeSeries.standardOptions.withUnit('reqps'), + + activeRequests: + commonlib.panels.network.timeSeries.traffic.new( + 'Active requests', + targets=[signals.overview.activeRequests.asTarget()] + ) + + g.panel.timeSeries.panelOptions.withDescription('Active web requests for the entire application') + + g.panel.timeSeries.standardOptions.withUnit('reqps'), + + queuedRequests: + commonlib.panels.network.timeSeries.traffic.new( + 'Queued requests', + targets=[signals.overview.queuedRequests.asTarget()] + ) + + g.panel.timeSeries.panelOptions.withDescription('Queued web requests for the entire application.') + + g.panel.timeSeries.standardOptions.withUnit('reqps'), + + pageViews: + commonlib.panels.network.timeSeries.traffic.new( + 'Page views', + targets=[signals.overview.pageViews.asTarget()] + ) + + g.panel.timeSeries.panelOptions.withDescription('Rate of pageviews for the entire application. 
Grouped by type and service.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + latestMedianRequestTime: + commonlib.panels.network.timeSeries.base.new('Latest median request time', targets=[signals.overview.latestMedianRequestTime.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The median amount of time for "latest" page requests for the selected site.') + + g.panel.timeSeries.standardOptions.withUnit('s'), + + topicMedianRequestTime: + commonlib.panels.network.timeSeries.base.new('Topic show median request time', targets=[signals.overview.topicMedianRequestTime.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The median amount of time for "topics show" requests for the selected site.') + + g.panel.timeSeries.standardOptions.withUnit('s'), + + latest99thPercentileRequestTime: + commonlib.panels.network.timeSeries.base.new('Latest 99th percentile request time', targets=[signals.overview.latest99thPercentileRequestTime.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The 99th percentile amount of time for "latest" page requests for the selected site.') + + g.panel.timeSeries.standardOptions.withUnit('s'), + + topic99thPercentileRequestTime: + commonlib.panels.network.timeSeries.base.new('Topic show 99th percentile request time', targets=[signals.overview.topic99thPercentileRequestTime.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The 99th percentile amount of time for "topics show" requests for the selected site.') + + g.panel.timeSeries.standardOptions.withUnit('s'), + + // Jobs dashboard panels + sidekiqJobDuration: + commonlib.panels.generic.stat.base.new('Sidekiq job duration', targets=[signals.jobs.sidekiqJobDuration.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Time spent in Sidekiq jobs broken out by job name.') + + g.panel.stat.standardOptions.withUnit('s'), + + scheduledJobDuration: + commonlib.panels.generic.stat.base.new('Scheduled job duration', 
targets=[signals.jobs.scheduledJobDuration.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Time spent in scheduled jobs broken out by job name.') + + g.panel.stat.standardOptions.withUnit('s'), + + sidekiqJobCount: + commonlib.panels.generic.stat.info.new('Sidekiq jobs', targets=[signals.jobs.sidekiqJobCount.asTarget()]) + + g.panel.stat.panelOptions.withDescription('The amount of sidekiq jobs ran over an interval.') + + g.panel.stat.standardOptions.withUnit('none'), + + scheduledJobCount: + commonlib.panels.generic.timeSeries.base.new('Scheduled jobs', targets=[signals.jobs.scheduledJobCount.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The number of scheduled jobs ran over an interval.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + usedRSSMemory: + commonlib.panels.generic.timeSeries.base.new('Used RSS memory', targets=[signals.jobs.rssMemory.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('Total RSS Memory used by process. Broken up by pid.') + + g.panel.timeSeries.standardOptions.withUnit('bytes'), + + v8HeapSize: + commonlib.panels.generic.timeSeries.base.new('V8 heap size', targets=[signals.jobs.v8HeapSize.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('Current heap size of V8 engine. 
Broken up by process type') + + g.panel.timeSeries.standardOptions.withUnit('bytes'), + + sidekiqWorkers: + commonlib.panels.generic.stat.base.new('Sidekiq workers', targets=[signals.jobs.sidekiqWorkerCount.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Current number of Sidekiq Workers.') + + g.panel.stat.standardOptions.withUnit('none') + + g.panel.stat.standardOptions.color.withMode('thresholds') + + g.panel.stat.standardOptions.withMappings([ + g.panel.stat.standardOptions.mapping.SpecialValueMap.withType() + + g.panel.stat.standardOptions.mapping.SpecialValueMap.withOptions({ + match: 'null', + result: { text: 'N/A' }, + }), + ]) + + g.panel.stat.options.reduceOptions.withCalcs(['lastNotNull']) + + g.panel.stat.options.withGraphMode('none') + + g.panel.stat.options.withTextMode('auto'), + + webWorkers: + commonlib.panels.generic.stat.base.new('Web workers', targets=[signals.jobs.webWorkerCount.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Current number of Web Workers.') + + g.panel.stat.standardOptions.withUnit('none') + + g.panel.stat.standardOptions.color.withMode('thresholds') + + g.panel.stat.standardOptions.withMappings([ + g.panel.stat.standardOptions.mapping.SpecialValueMap.withType() + + g.panel.stat.standardOptions.mapping.SpecialValueMap.withOptions({ + match: 'null', + result: { text: 'N/A' }, + }), + ]) + + g.panel.stat.options.reduceOptions.withCalcs(['lastNotNull']) + + g.panel.stat.options.withGraphMode('none') + + g.panel.stat.options.withTextMode('auto'), + + sidekiqQueued: + commonlib.panels.generic.stat.base.new('Sidekiq queued', targets=[signals.jobs.sidekiqJobsEnqueued.asTarget()]) + + g.panel.stat.panelOptions.withDescription('Current number of jobs in Sidekiq queue.') + + g.panel.stat.standardOptions.withUnit('none') + + g.panel.stat.standardOptions.color.withMode('thresholds') + + g.panel.stat.standardOptions.withMappings([ + g.panel.stat.standardOptions.mapping.SpecialValueMap.withType() + + 
g.panel.stat.standardOptions.mapping.SpecialValueMap.withOptions({ + match: 'null', + result: { text: 'N/A' }, + }), + ]) + + g.panel.stat.options.reduceOptions.withCalcs(['lastNotNull']) + + g.panel.stat.options.withGraphMode('none') + + g.panel.stat.options.withTextMode('auto'), + }, +} diff --git a/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml b/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml index 93fff9a6d..d19035710 100644 --- a/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -1,7 +1,7 @@ groups: - - name: DiscourseAlerts + - name: discourse-alerts rules: - - alert: DiscourseRequestsHigh5xxErrors + - alert: DiscourseHigh5xxErrors annotations: description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500 status codes, which is above the threshold 10%, indicating a potentially larger issue for {{$labels.instance}}' summary: More than 10% of all requests result in a 5XX. @@ -10,12 +10,12 @@ groups: for: 5m labels: severity: critical - - alert: DiscourseRequestsHigh4xxErrors + - alert: DiscourseHigh4xxErrors annotations: description: '{{ printf "%.2f" $value }}% of all requests are resulting in 400 status code, which is above the threshold 30%, indicating a potentially larger issue for {{$labels.instance}}' summary: More than 30% of all requests result in a 4XX. 
expr: | - 100 * rate(discourse_http_requests{status=~"^4.*"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30 + 100 * rate(discourse_http_requests{status=~"4..", job="integrations/discourse"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 30 for: 5m labels: severity: warning diff --git a/discourse-mixin/rows.libsonnet b/discourse-mixin/rows.libsonnet new file mode 100644 index 000000000..98cdfff0b --- /dev/null +++ b/discourse-mixin/rows.libsonnet @@ -0,0 +1,62 @@ +local g = import './g.libsonnet'; + +{ + new(this): { + local panels = this.grafana.panels, + + // discourse-overview rows + overviewRow: + g.panel.row.new('Overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.trafficByResponseCode { gridPos: { h: 6, w: 12 } }, + panels.activeRequests { gridPos: { h: 6, w: 12 } }, + panels.queuedRequests { gridPos: { h: 6, w: 12 } }, + panels.pageViews { gridPos: { h: 6, w: 12 } }, + ]), + + latencyRow: + g.panel.row.new('Latency') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.latestMedianRequestTime { gridPos: { h: 6, w: 12 } }, + panels.topicMedianRequestTime { gridPos: { h: 6, w: 12 } }, + panels.latest99thPercentileRequestTime { gridPos: { h: 6, w: 12 } }, + panels.topic99thPercentileRequestTime { gridPos: { h: 6, w: 12 } }, + ]), + + // discourse-jobs rows + jobStatsRow: + g.panel.row.new('Job statistics') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.sidekiqWorkers { gridPos: { h: 5, w: 8 } }, + panels.webWorkers { gridPos: { h: 5, w: 8 } }, + panels.sidekiqQueued { gridPos: { h: 5, w: 8 } }, + ]), + + jobCountsRow: + g.panel.row.new('Job counts') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.scheduledJobCount { gridPos: { h: 6, w: 12 } }, + panels.sidekiqJobCount { gridPos: { h: 6, w: 12 } }, + ]), + + jobDurationRow: + g.panel.row.new('Duration') + + 
g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.scheduledJobDuration { gridPos: { h: 6, w: 12 } }, + panels.sidekiqJobDuration { gridPos: { h: 6, w: 12 } }, + ]), + + memoryRow: + g.panel.row.new('Memory') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + panels.usedRSSMemory { gridPos: { h: 6, w: 12 } }, + panels.v8HeapSize { gridPos: { h: 6, w: 12 } }, + ]), + }, +} diff --git a/discourse-mixin/signals/jobs.libsonnet b/discourse-mixin/signals/jobs.libsonnet new file mode 100644 index 000000000..ebde76d45 --- /dev/null +++ b/discourse-mixin/signals/jobs.libsonnet @@ -0,0 +1,136 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + enableLokiLogs: this.enableLokiLogs, + aggLevel: 'group', + aggFunction: 'sum', + signals: { + // Job processing signals + sidekiqJobDuration: { + name: 'Sidekiq job duration', + type: 'counter', + unit: 's', + description: 'Time spent in Sidekiq jobs broken out by job name.', + sources: { + prometheus: { + expr: 'discourse_sidekiq_job_duration_seconds{%(queriesSelector)s}', + aggKeepLabels: ['job_name'], + legendCustomTemplate: '{{job_name}}', + }, + }, + }, + + scheduledJobDuration: { + name: 'Scheduled job duration', + type: 'counter', + unit: 's', + description: 'Time spent in scheduled jobs broken out by job name.', + sources: { + prometheus: { + expr: 'discourse_scheduled_job_duration_seconds{%(queriesSelector)s}', + aggKeepLabels: ['job_name'], + legendCustomTemplate: '{{job_name}}', + }, + }, + }, + + sidekiqJobCount: { + name: 'Sidekiq job count', + type: 'counter', + unit: 'none', + description: 'The amount of sidekiq jobs ran over an interval.', + sources: { + prometheus: { + expr: 'discourse_sidekiq_job_count{%(queriesSelector)s}', + rangeFunction: 'increase', + aggKeepLabels: ['job_name'], + legendCustomTemplate: '{{job_name}}', + }, + }, + }, + + scheduledJobCount: { + name: 'Scheduled job count', + type: 'counter', + unit: 
'none', + description: 'The number of scheduled jobs ran over an interval.', + sources: { + prometheus: { + expr: 'discourse_scheduled_job_count{%(queriesSelector)s}', + rangeFunction: 'increase', + aggKeepLabels: ['job_name'], + legendCustomTemplate: '{{job_name}}', + }, + }, + }, + + sidekiqJobsEnqueued: { + name: 'Sidekiq jobs enqueued', + type: 'gauge', + aggFunction: 'max', + unit: 'none', + description: 'Current number of jobs in Sidekiq queue.', + sources: { + prometheus: { + expr: 'discourse_sidekiq_jobs_enqueued{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + sidekiqWorkerCount: { + name: 'Sidekiq worker count', + type: 'raw', + unit: 'none', + description: 'Current number of Sidekiq workers.', + sources: { + prometheus: { + expr: 'count(discourse_rss{type="sidekiq",%(queriesSelector)s})', + }, + }, + }, + + webWorkerCount: { + name: 'Web worker count', + type: 'raw', + unit: 'none', + description: 'Current number of web workers.', + sources: { + prometheus: { + expr: 'count(discourse_rss{type="web",%(queriesSelector)s})', + }, + }, + }, + + // Memory signals + rssMemory: { + name: 'RSS memory', + type: 'gauge', + unit: 'bytes', + description: 'Total RSS memory used by process.', + sources: { + prometheus: { + expr: 'discourse_rss{%(queriesSelector)s}', + aggKeepLabels: ['pid'], + legendCustomTemplate: 'pid: {{pid}}', + }, + }, + }, + + v8HeapSize: { + name: 'V8 heap size', + type: 'gauge', + unit: 'bytes', + description: 'Current heap size of V8 engine broken up by process type.', + sources: { + prometheus: { + expr: 'discourse_v8_used_heap_size{%(queriesSelector)s}', + aggKeepLabels: ['type'], + legendCustomTemplate: '{{type}}', + }, + }, + }, + }, + } diff --git a/discourse-mixin/signals/overview.libsonnet b/discourse-mixin/signals/overview.libsonnet new file mode 100644 index 000000000..465b11127 --- /dev/null +++ b/discourse-mixin/signals/overview.libsonnet @@ -0,0 +1,115 @@ +function(this) + local legendCustomTemplate = '{{ instance }}'; + { + filteringSelector: this.filteringSelector, + 
groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + legendCustomTemplate: legendCustomTemplate, + enableLokiLogs: this.enableLokiLogs, + aggLevel: 'group', + aggFunction: 'sum', + signals: { + // HTTP traffic and latency signals + httpRequests: { + name: 'HTTP requests', + type: 'counter', + unit: 'reqps', + description: 'Rate of HTTP requests by status code.', + sources: { + prometheus: { + expr: 'discourse_http_requests{%(queriesSelector)s}', + aggKeepLabels: ['status'], + legendCustomTemplate: '{{ api }} - {{ verb }} - {{ status }}', + }, + }, + }, + + latestMedianRequestTime: { + name: 'Latest median request time', + type: 'gauge', + unit: 's', + description: 'The median amount of time for "latest" page requests.', + sources: { + prometheus: { + expr: 'discourse_http_duration_seconds{%(queriesSelector)s,quantile="0.5",action="latest"}', + }, + }, + }, + + topicMedianRequestTime: { + name: 'Topic median request time', + type: 'gauge', + unit: 's', + description: 'The median amount of time for "topics show" requests.', + sources: { + prometheus: { + expr: 'discourse_http_duration_seconds{%(queriesSelector)s,quantile="0.5",controller="topics"}', + }, + }, + }, + + latest99thPercentileRequestTime: { + name: 'Latest 99th percentile request time', + type: 'gauge', + unit: 's', + description: 'The 99th percentile amount of time for "latest" page requests.', + sources: { + prometheus: { + expr: 'discourse_http_duration_seconds{%(queriesSelector)s,quantile="0.99",action="latest"}', + }, + }, + }, + + topic99thPercentileRequestTime: { + name: 'Topic 99th percentile request time', + type: 'gauge', + unit: 's', + description: 'The 99th percentile amount of time for "topics show" requests.', + sources: { + prometheus: { + expr: 'discourse_http_duration_seconds{%(queriesSelector)s,quantile="0.99",controller="topics"}', + }, + }, + }, + + // Request queue signals + activeRequests: { + name: 'Active requests', + type: 'gauge', + unit: 'reqps', + 
description: 'Active web requests for the entire application.', + sources: { + prometheus: { + expr: 'discourse_active_app_reqs{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + queuedRequests: { + name: 'Queued requests', + type: 'gauge', + unit: 'reqps', + description: 'Queued web requests for the entire application.', + sources: { + prometheus: { + expr: 'discourse_queued_app_reqs{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + pageViews: { + name: 'Page views', + type: 'counter', + unit: 'views/sec', + description: 'Rate of pageviews for the entire application.', + sources: { + prometheus: { + expr: 'discourse_page_views{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + }, + } From c0613122a20c22af931614318fa8bf753856e131 Mon Sep 17 00:00:00 2001 From: schmikei Date: Mon, 10 Nov 2025 14:59:52 -0500 Subject: [PATCH 2/5] remove .lint addition --- discourse-mixin/.lint | 2 -- 1 file changed, 2 deletions(-) diff --git a/discourse-mixin/.lint b/discourse-mixin/.lint index cfb8e524b..c25f83b67 100644 --- a/discourse-mixin/.lint +++ b/discourse-mixin/.lint @@ -10,5 +10,3 @@ exclusions: - panel: "Sidekiq Workers" template-instance-rule: reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'" - panel-datasource-rule: - reason: "Modern mixins use signal-based architecture where datasource references are handled by the framework" From 3279937bd7264c0aaf900ff470d163f0019283d1 Mon Sep 17 00:00:00 2001 From: schmikei Date: Mon, 10 Nov 2025 15:01:45 -0500 Subject: [PATCH 3/5] update readme for alerts rename --- discourse-mixin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discourse-mixin/README.md b/discourse-mixin/README.md index 93432634a..6e25b89ab 100644 --- a/discourse-mixin/README.md +++ b/discourse-mixin/README.md @@ -9,8 +9,8 @@ The Discourse mixin contains 
the following dashboards: The Discourse mixin contains the following alerts: -- DiscourseRequestsHigh5xxErrors -- DiscourseRequestsHigh4xxErrors +- DiscourseHigh5xxErrors +- DiscourseHigh4xxErrors ## Discourse Overview From 05a474cc5802eec70cf2833b28801d70a8a32677 Mon Sep 17 00:00:00 2001 From: schmikei Date: Mon, 10 Nov 2025 15:03:28 -0500 Subject: [PATCH 4/5] update import so that it uses the jsonnetfile.json --- discourse-mixin/g.libsonnet | 4 +--- discourse-mixin/jsonnetfile.json | 41 +++++++++++++++++++------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/discourse-mixin/g.libsonnet b/discourse-mixin/g.libsonnet index ba90fd9b0..e6a2060ee 100644 --- a/discourse-mixin/g.libsonnet +++ b/discourse-mixin/g.libsonnet @@ -1,3 +1 @@ -// grafonnet must be imported with "g" alias -local g = import './vendor/grafonnet-v11.0.0/main.libsonnet'; -g +import 'github.com/grafana/grafonnet/gen/grafonnet-v11.4.0/main.libsonnet' diff --git a/discourse-mixin/jsonnetfile.json b/discourse-mixin/jsonnetfile.json index 46d5af011..3f9bbb1fe 100644 --- a/discourse-mixin/jsonnetfile.json +++ b/discourse-mixin/jsonnetfile.json @@ -1,24 +1,33 @@ { "version": 1, "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet", - "subdir": "gen/grafonnet-v11.0.0" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.4.0" + } + }, + "version": "main" }, - "version": "main" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/jsonnet-libs.git", - "subdir": "common-lib" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" }, - "version": "master" - } + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-cloud-integration-utils" + } + }, + "version": "master" + } ], "legacyImports": true } From 
52beb8e295550c744f1c9ce81dc156c884743ddd Mon Sep 17 00:00:00 2001 From: schmikei Date: Tue, 11 Nov 2025 16:27:41 -0500 Subject: [PATCH 5/5] pr feedback; minus validation on histogram switch --- discourse-mixin/alerts.libsonnet | 2 +- .../dashboards_out/discourse-jobs.json | 70 ++++++++++++++----- .../dashboards_out/discourse-overview.json | 64 ++++------------- discourse-mixin/panels.libsonnet | 34 +++++---- .../prometheus_alerts.yaml | 2 +- 5 files changed, 89 insertions(+), 83 deletions(-) diff --git a/discourse-mixin/alerts.libsonnet b/discourse-mixin/alerts.libsonnet index 6111038ca..0f173f6ad 100644 --- a/discourse-mixin/alerts.libsonnet +++ b/discourse-mixin/alerts.libsonnet @@ -7,7 +7,7 @@ { alert: 'DiscourseHigh5xxErrors', expr: ||| - 100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > %(alertsCritical5xxResponses)s + 100 * rate(discourse_http_requests{status=~"5..", %(filteringSelector)s}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > %(alertsCritical5xxResponses)s ||| % this.config, 'for': '5m', labels: { diff --git a/discourse-mixin/dashboards_out/discourse-jobs.json b/discourse-mixin/dashboards_out/discourse-jobs.json index b5a39fb50..3aa0bec69 100644 --- a/discourse-mixin/dashboards_out/discourse-jobs.json +++ b/discourse-mixin/dashboards_out/discourse-jobs.json @@ -276,9 +276,12 @@ "description": "The amount of sidekiq jobs ran over an interval.", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "text", - "mode": "fixed" + "custom": { + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" } @@ -291,12 +294,13 @@ }, "id": 7, "options": { - "colorMode": "fixed", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] + "legend": { + "calcs": [ ], + "displayMode": "list" + }, + "tooltip": { + "mode": "multi", + "sort": 
"desc" } }, "pluginVersion": "v11.0.0", @@ -314,7 +318,7 @@ } ], "title": "Sidekiq jobs", - "type": "stat" + "type": "timeseries" }, { "collapsed": false, @@ -337,9 +341,12 @@ "description": "Time spent in scheduled jobs broken out by job name.", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "text", - "mode": "fixed" + "custom": { + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "s" } @@ -351,6 +358,18 @@ "y": 14 }, "id": 9, + "options": { + "legend": { + "asTable": true, + "calcs": [ ], + "displayMode": "list", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "pluginVersion": "v11.0.0", "targets": [ { @@ -366,7 +385,7 @@ } ], "title": "Scheduled job duration", - "type": "stat" + "type": "timeseries" }, { "datasource": { @@ -376,9 +395,12 @@ "description": "Time spent in Sidekiq jobs broken out by job name.", "fieldConfig": { "defaults": { - "color": { - "fixedColor": "text", - "mode": "fixed" + "custom": { + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "s" } @@ -390,6 +412,18 @@ "y": 14 }, "id": 10, + "options": { + "legend": { + "asTable": true, + "calcs": [ ], + "displayMode": "list", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, "pluginVersion": "v11.0.0", "targets": [ { @@ -405,7 +439,7 @@ } ], "title": "Sidekiq job duration", - "type": "stat" + "type": "timeseries" }, { "collapsed": false, diff --git a/discourse-mixin/dashboards_out/discourse-overview.json b/discourse-mixin/dashboards_out/discourse-overview.json index f127c2ba0..f2124ddea 100644 --- a/discourse-mixin/dashboards_out/discourse-overview.json +++ b/discourse-mixin/dashboards_out/discourse-overview.json @@ -55,8 +55,10 @@ "id": 2, "options": { "legend": { + "asTable": true, "calcs": [ ], - "displayMode": "list" + "displayMode": 
"list", + "placement": "right" }, "tooltip": { "mode": "multi", @@ -365,21 +367,12 @@ }, { "datasource": { - "type": "prometheus", - "uid": "${datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The 99th percentile amount of time for \"latest\" page requests for the selected site.", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 30, - "gradientMode": "opacity", - "lineInterpolation": "smooth", - "lineWidth": 2, - "showPoints": "never" - }, - "decimals": 1, - "noValue": "No packets", "unit": "s" } }, @@ -390,17 +383,7 @@ "y": 20 }, "id": 9, - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "v11.0.0", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { @@ -414,26 +397,17 @@ "refId": "Latest 99th percentile request time" } ], - "title": "Latest 99th percentile request time", - "type": "timeseries" + "title": "Latest request time", + "type": "histogram" }, { "datasource": { - "type": "prometheus", - "uid": "${datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "The 99th percentile amount of time for \"topics show\" requests for the selected site.", + "description": "The amount of time for \"topics show\" requests for the selected site.", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 30, - "gradientMode": "opacity", - "lineInterpolation": "smooth", - "lineWidth": 2, - "showPoints": "never" - }, - "decimals": 1, - "noValue": "No packets", "unit": "s" } }, @@ -444,17 +418,7 @@ "y": 20 }, "id": 10, - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "v11.0.0", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { @@ -468,8 +432,8 @@ "refId": "Topic 99th percentile request time" } ], - "title": "Topic show 99th percentile request time", - "type": "timeseries" + "title": 
"Topic show request time", + "type": "histogram" } ], "refresh": "1m", diff --git a/discourse-mixin/panels.libsonnet b/discourse-mixin/panels.libsonnet index d3e3feec2..944dc0693 100644 --- a/discourse-mixin/panels.libsonnet +++ b/discourse-mixin/panels.libsonnet @@ -11,6 +11,8 @@ local commonlib = import 'common-lib/common/main.libsonnet'; 'Traffic by response code', targets=[signals.overview.httpRequests.asTarget()] ) + + g.panel.timeSeries.options.legend.withAsTable(true) + + g.panel.timeSeries.options.legend.withPlacement('right') + g.panel.timeSeries.panelOptions.withDescription('Rate of HTTP traffic over time for the entire application. Grouped by response code.') + g.panel.timeSeries.standardOptions.withUnit('reqps'), @@ -49,28 +51,34 @@ local commonlib = import 'common-lib/common/main.libsonnet'; + g.panel.timeSeries.standardOptions.withUnit('s'), latest99thPercentileRequestTime: - commonlib.panels.network.timeSeries.base.new('Latest 99th percentile request time', targets=[signals.overview.latest99thPercentileRequestTime.asTarget()]) - + g.panel.timeSeries.panelOptions.withDescription('The 99th percentile amount of time for "latest" page requests for the selected site.') - + g.panel.timeSeries.standardOptions.withUnit('s'), + g.panel.histogram.new('Latest request time') + + g.panel.histogram.queryOptions.withTargets([signals.overview.latest99thPercentileRequestTime.asTarget()]) + + g.panel.histogram.panelOptions.withDescription('The 99th percentile amount of time for "latest" page requests for the selected site.') + + g.panel.histogram.standardOptions.withUnit('s'), topic99thPercentileRequestTime: - commonlib.panels.network.timeSeries.base.new('Topic show 99th percentile request time', targets=[signals.overview.topic99thPercentileRequestTime.asTarget()]) - + g.panel.timeSeries.panelOptions.withDescription('The 99th percentile amount of time for "topics show" requests for the selected site.') - + g.panel.timeSeries.standardOptions.withUnit('s'), + 
g.panel.histogram.new('Topic show request time') + + g.panel.histogram.queryOptions.withTargets([signals.overview.topic99thPercentileRequestTime.asTarget()]) + + g.panel.histogram.panelOptions.withDescription('The amount of time for "topics show" requests for the selected site.') + + g.panel.histogram.standardOptions.withUnit('s'), // Jobs dashboard panels sidekiqJobDuration: - commonlib.panels.generic.stat.base.new('Sidekiq job duration', targets=[signals.jobs.sidekiqJobDuration.asTarget()]) - + g.panel.stat.panelOptions.withDescription('Time spent in Sidekiq jobs broken out by job name.') - + g.panel.stat.standardOptions.withUnit('s'), + commonlib.panels.generic.timeSeries.base.new('Sidekiq job duration', targets=[signals.jobs.sidekiqJobDuration.asTarget()]) + + g.panel.timeSeries.options.legend.withAsTable(true) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.panelOptions.withDescription('Time spent in Sidekiq jobs broken out by job name.') + + g.panel.timeSeries.standardOptions.withUnit('s'), scheduledJobDuration: - commonlib.panels.generic.stat.base.new('Scheduled job duration', targets=[signals.jobs.scheduledJobDuration.asTarget()]) - + g.panel.stat.panelOptions.withDescription('Time spent in scheduled jobs broken out by job name.') - + g.panel.stat.standardOptions.withUnit('s'), + commonlib.panels.generic.timeSeries.base.new('Scheduled job duration', targets=[signals.jobs.scheduledJobDuration.asTarget()]) + + g.panel.timeSeries.options.legend.withAsTable(true) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.panelOptions.withDescription('Time spent in scheduled jobs broken out by job name.') + + g.panel.timeSeries.standardOptions.withUnit('s'), sidekiqJobCount: - commonlib.panels.generic.stat.info.new('Sidekiq jobs', targets=[signals.jobs.sidekiqJobCount.asTarget()]) - + g.panel.stat.panelOptions.withDescription('The amount of sidekiq jobs ran over an interval.') - + g.panel.stat.standardOptions.withUnit('none'), + commonlib.panels.generic.timeSeries.base.new('Sidekiq jobs', targets=[signals.jobs.sidekiqJobCount.asTarget()]) + + 
g.panel.timeSeries.panelOptions.withDescription('The amount of sidekiq jobs ran over an interval.') + + g.panel.timeSeries.standardOptions.withUnit('none'), diff --git a/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml b/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml index d19035710..36ccd2ae1 100644 --- a/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/discourse-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -6,7 +6,7 @@ groups: description: '{{ printf "%.2f" $value }}% of all requests are resulting in 500 status codes, which is above the threshold 10%, indicating a potentially larger issue for {{$labels.instance}}' summary: More than 10% of all requests result in a 5XX. expr: | - 100 * rate(discourse_http_requests{status="500"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10 + 100 * rate(discourse_http_requests{status=~"5..", job="integrations/discourse"}[5m]) / on() group_left() (sum(rate(discourse_http_requests[5m])) by (instance)) > 10 for: 5m labels: severity: critical