diff --git a/.gitignore b/.gitignore index d68c86c04..1a57d51f0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ vendor jsonnetfile.lock.json *.zip +.worktrees diff --git a/apache-airflow-mixin/.lint b/apache-airflow-mixin/.lint index 7831c2389..354cebed3 100644 --- a/apache-airflow-mixin/.lint +++ b/apache-airflow-mixin/.lint @@ -1 +1,7 @@ -exclusions: {} +exclusions: + template-job-rule: + reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'" + panel-datasource-rule: + reason: "Modern mixins use signal-based architecture where datasource references are handled by the framework" + template-datasource-rule: + reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'" diff --git a/apache-airflow-mixin/alerts/alerts.libsonnet b/apache-airflow-mixin/alerts.libsonnet similarity index 67% rename from apache-airflow-mixin/alerts/alerts.libsonnet rename to apache-airflow-mixin/alerts.libsonnet index f02d2f94f..ed70ae359 100644 --- a/apache-airflow-mixin/alerts/alerts.libsonnet +++ b/apache-airflow-mixin/alerts.libsonnet @@ -1,14 +1,14 @@ { - prometheusAlerts+:: { - groups+: [ + new(this): { + groups: [ { - name: 'apache-airflow', + name: this.config.uid + '-alerts', rules: [ { alert: 'ApacheAirflowStarvingPoolTasks', expr: ||| - airflow_pool_starving_tasks > %(alertsCriticalPoolStarvingTasks)s - ||| % $._config, + airflow_pool_starving_tasks{%(filteringSelector)s} > %(alertsCriticalPoolStarvingTasks)s + ||| % this.config, 'for': '5m', labels: { severity: 'critical', @@ -17,14 +17,14 @@ summary: 'There are starved tasks detected in the Apache Airflow pool.', description: ||| The number of starved tasks is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of %(alertsCriticalPoolStarvingTasks)s. - ||| % $._config, + ||| % this.config, }, }, { - alert: 'ApacheAirflowDAGScheduleDelayWarningLevel', + alert: 'ApacheAirflowDAGScheduleDelayWarning', expr: ||| - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > %(alertsWarningDAGScheduleDelayLevel)s - ||| % $._config, + increase(airflow_dagrun_schedule_delay_sum{%(filteringSelector)s}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{%(filteringSelector)s}[5m]),1) > %(alertsWarningDAGScheduleDelayLevel)s + ||| % this.config, 'for': '1m', labels: { severity: 'warning', @@ -33,14 +33,14 @@ summary: 'The delay in DAG schedule time to DAG run time has reached the warning threshold.', description: ||| The average delay in DAG schedule to run time is {{ printf "%%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsWarningDAGScheduleDelayLevel)s. - ||| % $._config, + ||| % this.config, }, }, { - alert: 'ApacheAirflowDAGScheduleDelayCriticalLevel', + alert: 'ApacheAirflowDAGScheduleDelayCritical', expr: ||| - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > %(alertsCriticalDAGScheduleDelayLevel)s - ||| % $._config, + increase(airflow_dagrun_schedule_delay_sum{%(filteringSelector)s}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{%(filteringSelector)s}[5m]),1) > %(alertsCriticalDAGScheduleDelayLevel)s + ||| % this.config, 'for': '1m', labels: { severity: 'critical', @@ -49,14 +49,14 @@ summary: 'The delay in DAG schedule time to DAG run time has reached the critical threshold.', description: ||| The average delay in DAG schedule to run time is {{ printf "%%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsCriticalDAGScheduleDelayLevel)s. - ||| % $._config, + ||| % this.config, }, }, { alert: 'ApacheAirflowDAGFailures', expr: ||| - increase(airflow_dagrun_duration_failed_count[5m]) > %(alertsCriticalFailedDAGs)s - ||| % $._config, + increase(airflow_dagrun_duration_failed_count{%(filteringSelector)s}[5m]) > %(alertsCriticalFailedDAGs)s + ||| % this.config, 'for': '1m', labels: { severity: 'critical', @@ -65,7 +65,7 @@ summary: 'There have been DAG failures detected.', description: ||| The number of DAG failures seen is {{ printf "%%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsCriticalFailedDAGs)s. - ||| % $._config, + ||| % this.config, }, }, ], diff --git a/apache-airflow-mixin/config.libsonnet b/apache-airflow-mixin/config.libsonnet index db3db9025..2a6828636 100644 --- a/apache-airflow-mixin/config.libsonnet +++ b/apache-airflow-mixin/config.libsonnet @@ -1,20 +1,42 @@ { - _config+:: { - dashboardTags: ['apache-airflow-mixin'], - dashboardPeriod: 'now-1h', - dashboardTimezone: 'default', - dashboardRefresh: '1m', + local this = self, + filteringSelector: 'job=~"$job", instance=~"$instance"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardTags: ['apache-airflow-mixin'], + uid: 'apache-airflow', + dashboardNamePrefix: 'Apache Airflow', - //alert thresholds - alertsCriticalPoolStarvingTasks: 0, - alertsWarningDAGScheduleDelayLevel: 10, //s - alertsCriticalDAGScheduleDelayLevel: 60, //s - alertsCriticalFailedDAGs: 0, + // additional params + dashboardPeriod: 'now-1h', + dashboardTimezone: 'default', + dashboardRefresh: '1m', - enableLokiLogs: true, - enableMultiCluster: false, + // logs lib related + enableLokiLogs: true, + logLabels: ['job', 'instance', 'filename'], + extraLogLabels: [], // Required by logs-lib + logsVolumeGroupBy: 'level', + showLogsVolume: true, - multiclusterSelector: 'job=~"$job"', - airflowSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"', + // alert thresholds + alertsCriticalPoolStarvingTasks: 0, // count + alertsWarningDAGScheduleDelayLevel: 10, // s + alertsCriticalDAGScheduleDelayLevel: 60, // s + alertsCriticalFailedDAGs: 0, // count + + // multi-cluster support + enableMultiCluster: false, + + // metrics source for signals library + metricsSource: 'prometheus', + + legendCustomTemplate: std.join(' ', std.map(function(label) '{{' + label + '}}', this.instanceLabels)), + signals+: { + dags: (import './signals/dags.libsonnet')(this), + tasks: (import './signals/tasks.libsonnet')(this), + scheduler: (import './signals/scheduler.libsonnet')(this), + executor: (import './signals/executor.libsonnet')(this), + pools: (import './signals/pools.libsonnet')(this), }, } diff --git a/apache-airflow-mixin/dashboards.libsonnet b/apache-airflow-mixin/dashboards.libsonnet new file mode 100644 index 000000000..8d4ecc310 --- /dev/null +++ b/apache-airflow-mixin/dashboards.libsonnet @@ -0,0 +1,142 @@ +local g = import './g.libsonnet'; +local logslib = import 'logs-lib/logs/main.libsonnet'; + +{ + local root = self, + new(this):: + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = g.util.string.slugify(this.config.uid); + local vars = this.grafana.variables; + local annotations = this.grafana.annotations; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardPeriod; + local timezone = this.config.dashboardTimezone; + + { + // Apache Airflow overview dashboard + 'apache-airflow-overview.json': + g.dashboard.new(prefix + ' overview') + + g.dashboard.withDescription('Dashboard providing an overview of Apache Airflow DAGs, tasks, and scheduler performance.') + + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.apacheAirflowOverview, + this.grafana.rows.apacheAirflowSchedulerDetails, + ] + ) + ) + ) + + root.applyCommon( + vars.multiInstance + [ + g.dashboard.variable.query.new('dag_id') + + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus) + + g.dashboard.variable.query.queryTypes.withLabelValues('dag_id', 'airflow_dagrun_duration_success_sum{job=~"$job", instance=~"$instance"}') + + g.dashboard.variable.query.generalOptions.withLabel('DAG ID') + + g.dashboard.variable.query.selectionOptions.withMulti(true) + + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+') + + g.dashboard.variable.query.refresh.onLoad() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('task_id') + + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus) + + g.dashboard.variable.query.queryTypes.withLabelValues('task_id', 'airflow_ti_failures{job=~"$job", instance=~"$instance"}') + + g.dashboard.variable.query.generalOptions.withLabel('Task ID') + + g.dashboard.variable.query.selectionOptions.withMulti(true) + + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+') + + g.dashboard.variable.query.refresh.onLoad() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('state') + + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus) + + g.dashboard.variable.query.queryTypes.withLabelValues('state', 'airflow_task_finish_total{job=~"$job", instance=~"$instance"}') + + g.dashboard.variable.query.generalOptions.withLabel('State') + + g.dashboard.variable.query.selectionOptions.withMulti(true) + + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+') + + g.dashboard.variable.query.refresh.onLoad() + + g.dashboard.variable.query.refresh.onTime(), + + g.dashboard.variable.query.new('pool_name') + + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus) + + g.dashboard.variable.query.queryTypes.withLabelValues('pool_name', 'airflow_pool_running_slots{job=~"$job", instance=~"$instance"}') + + g.dashboard.variable.query.generalOptions.withLabel('Pool name') + + g.dashboard.variable.query.selectionOptions.withMulti(true) + + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+') + + g.dashboard.variable.query.refresh.onLoad() + + g.dashboard.variable.query.refresh.onTime(), + ], + uid + '_overview', + tags, + links { apacheAirflowOverview+:: {} }, + annotations, + timezone, + refresh, + period + ), + } + + + if this.config.enableLokiLogs then + { + 'apache-airflow-logs.json': + logslib.new( + prefix + ' logs', + datasourceName=vars.datasources.loki.name, + datasourceRegex=vars.datasources.loki.regex, + filterSelector=this.config.filteringSelector, + labels=this.config.logLabels + this.config.extraLogLabels, + formatParser=null, + showLogsVolume=this.config.showLogsVolume, + logsVolumeGroupBy=this.config.logsVolumeGroupBy, + extraFilters=[] + ) + { + dashboards+: + { + logs+: + g.dashboard.withPanels( + g.util.panel.resolveCollapsedFlagOnRows( + g.util.grid.wrapPanels( + [ + this.grafana.rows.apacheAirflowLogs, + ] + ) + ) + ) + + root.applyCommon( + [], + uid + '_logs', + tags, + links { logs+:: {} }, + annotations, + timezone, + refresh, + period + ), + }, + variables+: { + toArray+: [ + g.dashboard.variable.query.new('dag_file') + + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.loki) + + g.dashboard.variable.query.queryTypes.withLabelValues('filename', '{job=~"$job", instance=~"$instance"}') + + g.dashboard.variable.query.generalOptions.withLabel('DAG file') + + g.dashboard.variable.query.selectionOptions.withMulti(true) + + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+') + + g.dashboard.variable.query.refresh.onLoad() + + g.dashboard.variable.query.refresh.onTime(), + ], + }, + }.dashboards.logs, + } else {}, + + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): + g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)), +} diff --git a/apache-airflow-mixin/dashboards/airflow-overview.libsonnet b/apache-airflow-mixin/dashboards/airflow-overview.libsonnet deleted file mode 100644 index 02fe75a12..000000000 --- a/apache-airflow-mixin/dashboards/airflow-overview.libsonnet +++ /dev/null @@ -1,1183 +0,0 @@ -local g = (import 'grafana-builder/grafana.libsonnet'); -local grafana = (import 'grafonnet/grafana.libsonnet'); -local dashboard = grafana.dashboard; -local template = grafana.template; -local prometheus = grafana.prometheus; - -local dashboardUid = 'apache-airflow-overview'; - -local promDatasourceName = 'prometheus_datasource'; -local lokiDatasourceName = 'loki_datasource'; - -local getMatcher(cfg) = '%(airflowSelector)s, instance=~"$instance"' % cfg; - -local promDatasource = { - uid: '${%s}' % promDatasourceName, -}; - -local lokiDatasource = { - uid: '${%s}' % lokiDatasourceName, -}; - -local dagFileParsingErrorsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_dag_processing_import_errors{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}}', - ), - ], - type: 'timeseries', - title: 'DAG file parsing errors', - description: 'The number of errors from trying to parse DAG files in an Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local slaMissesPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_sla_missed{' + matcher + '}[$__interval:])', - datasource=promDatasource, - legendFormat='{{instance}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'SLA misses', - description: 'The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local taskFailuresPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_ti_failures{' + matcher + '}[$__interval:])', - datasource=promDatasource, - legendFormat='{{instance}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'Task failures', - description: 'The overall task instances failures for an Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local dagSuccessDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_duration_success_sum{' + matcher + ', dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG success duration', - description: 'The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local dagFailedDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_duration_failed_sum{' + matcher + ', dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG failed duration', - description: 'The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local taskDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dag_task_duration_sum{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:]) / clamp_min(increase(airflow_dag_task_duration_count{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:]),1) != 0', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{task_id}}', - interval='1m', - ), - ], - type: 'bargauge', - title: 'Task duration', - description: 'The average time taken for recent task runs by Task ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - displayMode: 'gradient', - minVizHeight: 10, - minVizWidth: 0, - orientation: 'horizontal', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showUnfilled: true, - text: {}, - }, - pluginVersion: '9.2.3', - transformations: [], -}; - -local taskCountSummaryPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{' + matcher + ', dag_id=~"$dag_id"}[$__interval:])) != 0', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{state}}', - interval='1m', - ), - ], - type: 'piechart', - title: 'Task count summary', - description: 'The number of task counts by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - }, - mappings: [], - }, - overrides: [], - }, - options: { - legend: { - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - pieType: 'pie', - reduceOptions: { - calcs: [ - 'sum', - ], - fields: '', - values: false, - }, - tooltip: { - mode: 'multi', - sort: 'asc', - }, - }, -}; - -local taskCountsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_task_finish_total{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id", state=~"$state"}[$__interval:])', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{task_id}} - {{state}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'Task counts', - description: 'The number of task counts by Task ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, - pluginVersion: '9.2.3', -}; - -local taskLogsPanel(matcher) = { - datasource: lokiDatasource, - targets: [ - { - datasource: lokiDatasource, - editorMode: 'code', - expr: '{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id", filename=~".*/airflow/logs/dag_id.*"} |= ``', - queryType: 'range', - refId: 'A', - }, - ], - type: 'logs', - title: 'Task logs', - description: 'Logs for each individual task run on the DAGs.', - options: { - dedupStrategy: 'none', - enableLogDetails: true, - prettifyLogMessage: false, - showCommonLabels: false, - showLabels: false, - showTime: false, - sortOrder: 'Descending', - wrapLogMessage: false, - }, -}; - -local schedulerDetailsRow = { - datasource: promDatasource, - targets: [], - type: 'row', - title: 'Scheduler details', - collapsed: false, -}; - -local dagScheduleDelayPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_schedule_delay_sum{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{instance}} - {{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG schedule delay', - description: 'The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local schedulerTasksPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_scheduler_tasks_executable{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - executable', - ), - prometheus.target( - 'airflow_scheduler_tasks_starving{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - starving', - ), - ], - type: 'timeseries', - title: 'Scheduler tasks', - description: 'The number of current tasks that the scheduler is handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local executorTasksPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_executor_running_tasks{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - running', - ), - prometheus.target( - 'airflow_executor_queued_tasks{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - queued', - ), - prometheus.target( - 'airflow_executor_open_slots{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - open', - ), - ], - type: 'timeseries', - title: 'Executor tasks', - description: 'The number of current tasks that the executors are handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local poolTaskSlotsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_pool_running_slots{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - running', - ), - prometheus.target( - 'airflow_pool_queued_slots{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - queued', - ), - prometheus.target( - 'airflow_pool_starving_tasks{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - starving', - ), - prometheus.target( - 'airflow_pool_open_slots{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - open', - ), - ], - type: 'timeseries', - title: 'Pool task slots', - description: 'The number of current task slots that the pools are handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local schedulerLogsPanel(matcher) = { - datasource: lokiDatasource, - targets: [ - { - datasource: lokiDatasource, - editorMode: 'code', - expr: '{' + matcher + ', dag_file=~"$dag_file", filename=~".*/airflow/logs/scheduler/latest/.*"} |= ``', - queryType: 'range', - refId: 'A', - }, - ], - type: 'logs', - title: 'Scheduler logs', - description: 'Shows the scheduler logs by DAG file.', - options: { - dedupStrategy: 'none', - enableLogDetails: true, - prettifyLogMessage: false, - showCommonLabels: false, - showLabels: false, - showTime: false, - sortOrder: 'Descending', - wrapLogMessage: false, - }, -}; - -{ - grafanaDashboards+:: { - 'apache-airflow-overview.json': - dashboard.new( - 'Apache Airflow overview', - time_from='%s' % $._config.dashboardPeriod, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - description='', - uid=dashboardUid, - ) - - .addTemplates( - std.flattenArrays([ - [ - template.datasource( - promDatasourceName, - 'prometheus', - null, - label='Prometheus data source', - refresh='load' - ), - ], - if $._config.enableLokiLogs then [ - template.datasource( - lokiDatasourceName, - 'loki', - null, - label='Loki data source', - refresh='load' - ), - ] else [], - [ - template.new( - 'job', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable,job)', - label='Job', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'cluster', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable{%(multiclusterSelector)s}, cluster)' % $._config, - label='Cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='.*', - hide=if $._config.enableMultiCluster then '' else 'variable', - sort=0 - ), - template.new( - 'instance', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable{%(airflowSelector)s}, instance)' % $._config, - label='Instance', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'dag_id', - promDatasource, - 'label_values(airflow_task_start_total{%(airflowSelector)s, instance=~"$instance"}, dag_id)' % $._config, - label='DAG', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'task_id', - promDatasource, - 'label_values(airflow_task_start_total{%(airflowSelector)s, instance=~"$instance", dag_id=~"$dag_id"}, task_id)' % $._config, - label='Task', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'state', - promDatasource, - 'label_values(airflow_task_finish_total{%(airflowSelector)s, instance=~"$instance", dag_id=~"$dag_id", task_id=~"$task_id"}, state)' % $._config, - label='Task state', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'pool_name', - promDatasource, - 'label_values(airflow_pool_open_slots{%(airflowSelector)s, instance=~"$instance"}, pool_name)' % $._config, - label='Pool', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - - ], - if $._config.enableLokiLogs then [ - template.new( - 'dag_file', - lokiDatasource, - 'label_values(dag_file)', - label='DAG file', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - ] else [], - ]) - ) - .addPanels( - std.flattenArrays([ - [ - dagFileParsingErrorsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 0, y: 0 } }, - slaMissesPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 8, y: 0 } }, - taskFailuresPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 16, y: 0 } }, - dagSuccessDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } }, - dagFailedDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } }, - taskDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 16 } }, - taskCountSummaryPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 16 } }, - taskCountsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 24 } }, - ], - if $._config.enableLokiLogs then [ - taskLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 32 } }, - ] else [], - [ - schedulerDetailsRow { gridPos: { h: 1, w: 24, x: 0, y: 40 } }, - dagScheduleDelayPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 41 } }, - schedulerTasksPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 41 } }, - executorTasksPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 49 } }, - poolTaskSlotsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 49 } }, - ], - if $._config.enableLokiLogs then [ - schedulerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 57 } }, - ] else [], - [ - ], - ]) - ), - }, -} diff --git a/apache-airflow-mixin/dashboards/dashboards.libsonnet b/apache-airflow-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index 68ee182ef..000000000 --- a/apache-airflow-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1 +0,0 @@ -(import 'airflow-overview.libsonnet') diff --git a/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json b/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json new file mode 100644 index 000000000..0a8511133 --- /dev/null +++ b/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json @@ -0,0 +1,117 @@ +{ + "annotations": { + "list": [ ] + }, + "links": [ + { + "keepTime": true, + "title": "Apache Airflow overview", + "type": "link", + "url": "/d/apacheairflow_overview" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "apache-airflow-mixin" + ], + "title": "All dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Apache Airflow logs", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Logs for each individual task run on the DAGs.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "enableLogDetails": true, + "showCommonLabels": false, + "showTime": false, + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", filename=~\".*/airflow/logs/dag_id.*\"} |= ``" + } + ], + "title": "Task logs", + "type": "logs" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "description": "Logs for the scheduler in the Apache Airflow system.", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "enableLogDetails": true, + "showCommonLabels": false, + "showTime": false, + "wrapLogMessage": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "{job=~\"$job\", instance=~\"$instance\", filename=~\".*/airflow/logs/scheduler/.*\"} |= ``" + } + ], + "title": "Scheduler logs", + "type": "logs" + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": [ + "apache-airflow-mixin" + ], + "templating": { + "list": [ ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timezone": "default", + "title": "Apache Airflow logs", + "uid": "apacheairflow_logs" + } \ No newline at end of file diff --git a/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json b/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json index 69e0b0031..b86f9cdcf 100644 --- a/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json +++ b/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json @@ -1,62 +1,51 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "description": "", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ ], + "description": "Dashboard providing an overview of Apache Airflow DAGs, tasks, and scheduler performance.", + "links": [ + { + "keepTime": true, + "title": "Apache Airflow logs", + "type": "link", + "url": "/d/apacheairflow_logs" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "apache-airflow-mixin" + ], + "title": "All dashboards", + "type": "dashboards" + } + ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Apache Airflow overview", + "type": "row" + }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of errors from trying to parse DAG files in an Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -69,37 +58,27 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 0, - "y": 0 + "y": 1 }, "id": 2, - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_dag_processing_import_errors{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_dag_processing_import_errors{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "DAG file parsing errors" } ], "title": "DAG file parsing errors", @@ -107,49 +86,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -162,38 +107,28 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 8, - "y": 0 + "y": 1 }, "id": 3, - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_sla_missed{job=~\"$job\", instance=~\"$instance\"}[$__interval:])", + "expr": "rate(airflow_sla_missed{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__rate_interval])", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "legendFormat": "{{instance}}", + "refId": "SLA misses" } ], "title": "SLA misses", @@ -201,49 +136,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The overall task instances failures for an Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -256,38 +157,28 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 16, - "y": 0 + "y": 1 }, "id": 4, - "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_ti_failures{job=~\"$job\", instance=~\"$instance\"}[$__interval:])", + "expr": "rate(airflow_ti_failures{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__rate_interval])", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "legendFormat": "{{instance}}", + "refId": "Task failures" } ], "title": "Task failures", @@ -295,48 +186,14 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -349,38 +206,36 @@ ] }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 9 }, "id": 5, "options": { "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_dagrun_duration_success_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_duration_success_sum{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}}" + "legendFormat": "{{dag_id}}", + "refId": "Average successful DAG run duration" } ], "title": "DAG success duration", @@ -388,48 +243,14 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -442,38 +263,36 @@ ] }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 9 }, "id": 6, "options": { "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_dagrun_duration_failed_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_duration_failed_sum{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}}" + "legendFormat": "{{dag_id}}", + "refId": "Average failed DAG run duration" } ], "title": "DAG failed duration", @@ -481,7 +300,8 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The average time taken for recent task runs by Task ID in the Apache Airflow system.", "fieldConfig": { @@ -489,10 +309,8 @@ "color": { "mode": "thresholds" }, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -501,105 +319,82 @@ ] }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, - "w": 12, + "w": 8, "x": 0, - "y": 16 + "y": 17 }, "id": 7, "options": { "displayMode": "gradient", - "minVizHeight": 10, - "minVizWidth": 0, "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" - ], - "fields": "", - "values": false + ] }, - "showUnfilled": true, - "text": { } + "showUnfilled": true }, - "pluginVersion": "9.2.3", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_dag_task_duration_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:]) / clamp_min(increase(airflow_dag_task_duration_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:]),1) != 0", + "expr": "increase(airflow_dag_task_duration_sum{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:])/clamp_min(increase(airflow_dag_task_duration_count{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:]),1)", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}} - {{task_id}}" + "legendFormat": "{{dag_id}} - {{task_id}}", + "refId": "Average task duration" } ], "title": "Task duration", - "transformations": [ ], "type": "bargauge" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of task counts by DAG ID in the Apache Airflow system.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [ ] - }, - "overrides": [ ] - }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 16 + "w": 8, + "x": 8, + "y": 17 }, "id": 8, "options": { "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, - "pieType": "pie", "reduceOptions": { "calcs": [ "sum" - ], - "fields": "", - "values": false + ] }, "tooltip": { - "mode": "multi", - "sort": "asc" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])) != 0", + "expr": "sum by(job, instance, dag_id, state) (\n rate(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__rate_interval])\n)", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}} - {{state}}" + "legendFormat": "{{dag_id}} - {{state}}", + "refId": "Task finish total" } ], "title": "Task count summary", @@ -607,49 +402,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of task counts by Task ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -658,140 +419,64 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, - "w": 24, - "x": 0, - "y": 24 + "w": 8, + "x": 16, + "y": 17 }, "id": 9, "options": { "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, - "pluginVersion": "9.2.3", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__interval:])", + "expr": "rate(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__rate_interval])", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}} - {{task_id}} - {{state}}" + "legendFormat": "{{dag_id}} - {{task_id}} - {{state}}", + "refId": "Task finish total" } ], "title": "Task counts", "type": "timeseries" }, - { - "datasource": { - "uid": "${loki_datasource}" - }, - "description": "Logs for each individual task run on the DAGs.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 10, - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": false, - "sortOrder": "Descending", - "wrapLogMessage": false - }, - "targets": [ - { - "datasource": { - "uid": "${loki_datasource}" - }, - "editorMode": "code", - "expr": "{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", filename=~\".*/airflow/logs/dag_id.*\"} |= ``", - "queryType": "range", - "refId": "A" - } - ], - "title": "Task logs", - "type": "logs" - }, { "collapsed": false, - "datasource": { - "uid": "${prometheus_datasource}" - }, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, - "y": 40 + "y": 25 }, - "id": 11, - "targets": [ ], + "id": 10, + "panels": [ ], "title": "Scheduler details", "type": "row" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -804,38 +489,36 @@ ] }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 41 + "y": 26 }, - "id": 12, + "id": 11, "options": { "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "increase(airflow_dagrun_schedule_delay_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_schedule_delay_sum{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_schedule_delay_count{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", "format": "time_series", + "instant": false, "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{dag_id}}" + "legendFormat": "{{instance}} - {{dag_id}}", + "refId": "Average DAG schedule delay" } ], "title": "DAG schedule delay", @@ -843,49 +526,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of current tasks that the scheduler is handling in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -898,46 +547,43 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 41 + "y": 26 }, - "id": 13, + "id": 12, "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_scheduler_tasks_executable{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_scheduler_tasks_executable{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - executable" + "instant": false, + "legendFormat": "{{instance}} - executable", + "refId": "Scheduler executable tasks" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_scheduler_tasks_starving{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_scheduler_tasks_starving{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - starving" + "instant": false, + "legendFormat": "{{instance}} - starving", + "refId": "Scheduler starving tasks" } ], "title": "Scheduler tasks", @@ -945,49 +591,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of current tasks that the executors are handling in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -1000,55 +612,54 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 34 }, - "id": 14, + "id": 13, "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_executor_running_tasks{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_running_tasks{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - running" + "instant": false, + "legendFormat": "{{instance}} - running", + "refId": "Executor running tasks" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_executor_queued_tasks{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_queued_tasks{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - queued" + "instant": false, + "legendFormat": "{{instance}} - queued", + "refId": "Executor queued tasks" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_executor_open_slots{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_open_slots{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - open" + "instant": false, + "legendFormat": "{{instance}} - open", + "refId": "Executor open slots" } ], "title": "Executor tasks", @@ -1056,49 +667,15 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, - "description": "The number of current task slots that the pools are handling in the Apache Airflow system.", + "description": "The number of task slots available in the pools of the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "decimals": 0, - "mappings": [ ], "min": 0, "thresholds": { - "mode": "absolute", "steps": [ { "color": "green", @@ -1111,312 +688,178 @@ ] }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 49 + "y": 34 }, - "id": 15, + "id": 14, "options": { - "legend": { - "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true - }, "tooltip": { - "mode": "multi", - "sort": "none" + "mode": "multi" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_pool_running_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_running_slots{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - running" + "instant": false, + "legendFormat": "{{pool_name}} - running", + "refId": "Pool running slots" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_pool_queued_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_queued_slots{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - queued" + "instant": false, + "legendFormat": "{{pool_name}} - queued", + "refId": "Pool queued slots" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_pool_starving_tasks{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_starving_tasks{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - starving" + "instant": false, + "legendFormat": "{{pool_name}} - starving", + "refId": "Pool starving tasks" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "prometheus", + "uid": "${datasource}" }, - "expr": "airflow_pool_open_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_open_slots{job=~\"$job\", instance=~\"$instance\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - open" + "instant": false, + "legendFormat": "{{pool_name}} - open", + "refId": "Pool open slots" } ], "title": "Pool task slots", "type": "timeseries" - }, - { - "datasource": { - "uid": "${loki_datasource}" - }, - "description": "Shows the scheduler logs by DAG file.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 57 - }, - "id": 16, - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": false, - "sortOrder": "Descending", - "wrapLogMessage": false - }, - "targets": [ - { - "datasource": { - "uid": "${loki_datasource}" - }, - "editorMode": "code", - "expr": "{job=~\"$job\", instance=~\"$instance\", dag_file=~\"$dag_file\", filename=~\".*/airflow/logs/scheduler/latest/.*\"} |= ``", - "queryType": "range", - "refId": "A" - } - ], - "title": "Scheduler logs", - "type": "logs" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "apache-airflow-mixin" ], "templating": { "list": [ { - "current": { }, - "hide": 0, "label": "Prometheus data source", "name": "prometheus_datasource", - "options": [ ], "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "current": { }, - "hide": 0, - "label": "Loki data source", - "name": "loki_datasource", - "options": [ ], - "query": "loki", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Job", "multi": true, "name": "job", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable,job)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { }, - "datasource": { - "uid": "${prometheus_datasource}" - }, - "hide": 2, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable{job=~\"$job\"}, cluster)", + "query": "label_values(airflow_dagbag_size{job=~\"$job\", instance=~\"$instance\"}, job)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Instance", "multi": true, "name": "instance", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable{job=~\"$job\"}, instance)", + "query": "label_values(airflow_dagbag_size{job=~\"$job\", instance=~\"$instance\",job=~\"$job\"}, instance)", "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "hide": 2, + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "DAG", + "label": "DAG ID", "multi": true, "name": "dag_id", - "options": [ ], - "query": "label_values(airflow_task_start_total{job=~\"$job\", instance=~\"$instance\"}, dag_id)", + "query": "label_values(airflow_dagrun_duration_success_sum{job=~\"$job\", instance=~\"$instance\"}, dag_id)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "Task", + "label": "Task ID", "multi": true, "name": "task_id", - "options": [ ], - "query": "label_values(airflow_task_start_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}, task_id)", + "query": "label_values(airflow_ti_failures{job=~\"$job\", instance=~\"$instance\"}, task_id)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "Task state", + "label": "State", "multi": true, "name": "state", - "options": [ ], - "query": "label_values(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}, state)", + "query": "label_values(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\"}, state)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "Pool", + "label": "Pool name", "multi": true, "name": "pool_name", - "options": [ ], - "query": "label_values(airflow_pool_open_slots{job=~\"$job\", instance=~\"$instance\"}, pool_name)", + "query": "label_values(airflow_pool_running_slots{job=~\"$job\", instance=~\"$instance\"}, pool_name)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { }, - "datasource": { - "uid": "${loki_datasource}" - }, - "hide": 0, - "includeAll": true, - "label": "DAG file", - "multi": true, - "name": "dag_file", - "options": [ ], - "query": "label_values(dag_file)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, @@ -1424,33 +867,7 @@ "from": "now-1h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", "title": "Apache Airflow overview", - "uid": "apache-airflow-overview", - "version": 0 + "uid": "apacheairflow_overview" } \ No newline at end of file diff --git a/apache-airflow-mixin/g.libsonnet b/apache-airflow-mixin/g.libsonnet new file mode 100644 index 000000000..1e2039c69 --- /dev/null +++ b/apache-airflow-mixin/g.libsonnet @@ -0,0 +1,4 @@ +(import 'github.com/grafana/grafonnet/gen/grafonnet-v11.4.0/main.libsonnet') ++ { + util: import 'github.com/grafana/grafonnet/gen/grafonnet-v11.4.0/custom/util/main.libsonnet', +} diff --git a/apache-airflow-mixin/jsonnetfile.json b/apache-airflow-mixin/jsonnetfile.json index 65cebf84b..79c4d8a18 100644 --- a/apache-airflow-mixin/jsonnetfile.json +++ b/apache-airflow-mixin/jsonnetfile.json @@ -1,15 +1,51 @@ { - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" - } - }, - "version": "master" - } - ], - "legacyImports": true + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.4.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-cloud-integration-utils" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true } diff --git a/apache-airflow-mixin/links.libsonnet b/apache-airflow-mixin/links.libsonnet new file mode 100644 index 000000000..b40096644 --- /dev/null +++ b/apache-airflow-mixin/links.libsonnet @@ -0,0 +1,23 @@ +local g = import './g.libsonnet'; + +{ + local link = g.dashboard.link, + new(this): { + apacheAirflowOverview: + link.link.new(this.config.dashboardNamePrefix + ' overview', '/d/' + this.grafana.dashboards['apache-airflow-overview.json'].uid) + + link.link.options.withKeepTime(true), + + otherDashboards: + link.dashboards.new('All dashboards', this.config.dashboardTags) + + link.dashboards.options.withIncludeVars(true) + + link.dashboards.options.withKeepTime(true) + + link.dashboards.options.withAsDropdown(true), + } + + + if this.config.enableLokiLogs then + { + logs: + link.link.new(this.config.dashboardNamePrefix + ' logs', '/d/' + this.grafana.dashboards['apache-airflow-logs.json'].uid) + + link.link.options.withKeepTime(true), + } else {}, +} diff --git a/apache-airflow-mixin/main.libsonnet b/apache-airflow-mixin/main.libsonnet new file mode 100644 index 000000000..42d3b0da6 --- /dev/null +++ b/apache-airflow-mixin/main.libsonnet @@ -0,0 +1,49 @@ +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local links = import './links.libsonnet'; +local panels = import './panels.libsonnet'; +local rows = import './rows.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + withConfigMixin(config): { + config+: config, + }, + + new(): { + + local this = self, + config: config, + + signals: + { + [sig]: commonlib.signals.unmarshallJsonMulti( + this.config.signals[sig], + type=this.config.metricsSource + ) + for sig in std.objectFields(this.config.signals) + }, + + grafana: { + variables: commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + instanceLabels=this.config.instanceLabels, + varMetric='airflow_dagbag_size', + customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, + ), + annotations: {}, + links: links.new(this), + panels: panels.new(this), + dashboards: dashboards.new(this), + rows: rows.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: {}, + }, + }, +} diff --git a/apache-airflow-mixin/mixin.libsonnet b/apache-airflow-mixin/mixin.libsonnet index 4d987cf31..4f4b5f133 100644 --- a/apache-airflow-mixin/mixin.libsonnet +++ b/apache-airflow-mixin/mixin.libsonnet @@ -1,3 +1,22 @@ -(import 'dashboards/dashboards.libsonnet') + -(import 'alerts/alerts.libsonnet') + -(import 'config.libsonnet') +local apacheAirflowlib = import './main.libsonnet'; +local config = (import './config.libsonnet'); + +local apacheAirflow = + apacheAirflowlib.new() + + apacheAirflowlib.withConfigMixin( + { + filteringSelector: config.filteringSelector, + uid: config.uid, + enableLokiLogs: config.enableLokiLogs, + } + ); + +// populate monitoring-mixin: +{ + grafanaDashboards+:: { + [fname]: apacheAirflow.grafana.dashboards[fname] + for fname in std.objectFields(apacheAirflow.grafana.dashboards) + }, + prometheusAlerts+:: apacheAirflow.prometheus.alerts, + prometheusRules+:: apacheAirflow.prometheus.recordingRules, +} diff --git a/apache-airflow-mixin/panels.libsonnet b/apache-airflow-mixin/panels.libsonnet new file mode 100644 index 000000000..366b77cbb --- /dev/null +++ b/apache-airflow-mixin/panels.libsonnet @@ -0,0 +1,283 @@ +local g = (import './g.libsonnet'); +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(this):: + { + local signals = this.signals, + + // + // Overview Dashboard Panels + // + + // Panel 1: DAG file parsing errors + dagFileParsingErrorsPanel: + g.panel.timeSeries.new('DAG file parsing errors') + + g.panel.timeSeries.panelOptions.withDescription('The number of errors from trying to parse DAG files in an Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.dags.parsingErrors.asTarget(), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(1), + ]), + + // Panel 2: SLA misses + slaIssuesPanel: + g.panel.timeSeries.new('SLA misses') + + g.panel.timeSeries.panelOptions.withDescription('The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.tasks.slaMisses.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]), + + // Panel 3: Task failures + taskFailuresPanel: + g.panel.timeSeries.new('Task failures') + + g.panel.timeSeries.panelOptions.withDescription('The overall task instances failures for an Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.tasks.failures.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(1), + ]), + + // Panel 4: DAG success duration + dagSuccessDurationPanel: + g.panel.timeSeries.new('DAG success duration') + + g.panel.timeSeries.panelOptions.withDescription('The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.dags.avgSuccessDuration.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // Panel 5: DAG failed duration + dagFailedDurationPanel: + g.panel.timeSeries.new('DAG failed duration') + + g.panel.timeSeries.panelOptions.withDescription('The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.dags.avgFailedDuration.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // Panel 6: Task duration (bargauge) + taskDurationPanel: + g.panel.barGauge.new('Task duration') + + g.panel.barGauge.panelOptions.withDescription('The average time taken for recent task runs by Task ID in the Apache Airflow system.') + + g.panel.barGauge.queryOptions.withTargets([ + signals.tasks.avgDuration.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.barGauge.standardOptions.withUnit('s') + + g.panel.barGauge.standardOptions.withMin(0) + + g.panel.barGauge.standardOptions.color.withMode('thresholds') + + g.panel.barGauge.standardOptions.thresholds.withSteps([ + g.panel.barGauge.thresholdStep.withColor('green') + + g.panel.barGauge.thresholdStep.withValue(null), + ]) + + g.panel.barGauge.options.withDisplayMode('gradient') + + g.panel.barGauge.options.withOrientation('horizontal') + + g.panel.barGauge.options.withShowUnfilled(true) + + g.panel.barGauge.options.reduceOptions.withCalcs(['lastNotNull']), + + // Panel 7: Task count summary (piechart) + taskCountSummaryPanel: + g.panel.pieChart.new('Task count summary') + + g.panel.pieChart.panelOptions.withDescription('The number of task counts by DAG ID in the Apache Airflow system.') + + g.panel.pieChart.queryOptions.withTargets([ + signals.tasks.finishTotal.withExprWrappersMixin(['sum by(job, instance, dag_id, state) (', ')']) + .asTarget() + + g.query.prometheus.withInterval('1m') + + g.query.prometheus.withLegendFormat('{{dag_id}} - {{state}}'), + ]) + + g.panel.pieChart.options.legend.withPlacement('right') + + g.panel.pieChart.options.reduceOptions.withCalcs(['sum']) + + g.panel.pieChart.options.tooltip.withMode('multi'), + + // Panel 8: Task counts + taskCountsPanel: + g.panel.timeSeries.new('Task counts') + + g.panel.timeSeries.panelOptions.withDescription('The number of task counts by Task ID in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.tasks.finishTotal.asTarget() + + g.query.prometheus.withInterval('1m'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + ]) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // + // Scheduler Details Panels + // + + // Panel 9: DAG schedule delay + dagScheduleDelayPanel: + g.panel.timeSeries.new('DAG schedule delay') + + g.panel.timeSeries.panelOptions.withDescription('The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.dags.avgScheduleDelay.asTarget() + + g.query.prometheus.withInterval('1m') + + g.query.prometheus.withLegendFormat('{{instance}} - {{dag_id}}'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // Panel 10: Scheduler tasks + schedulerTasksPanel: + g.panel.timeSeries.new('Scheduler tasks') + + g.panel.timeSeries.panelOptions.withDescription('The number of current tasks that the scheduler is handling in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.scheduler.tasksExecutable.asTarget() + + g.query.prometheus.withLegendFormat('{{instance}} - executable'), + signals.scheduler.tasksStarving.asTarget() + + g.query.prometheus.withLegendFormat('{{instance}} - starving'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // Panel 11: Executor tasks + executorTasksPanel: + g.panel.timeSeries.new('Executor tasks') + + g.panel.timeSeries.panelOptions.withDescription('The number of current tasks that the executors are handling in the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.executor.runningTasks.asTarget() + + g.query.prometheus.withLegendFormat('{{instance}} - running'), + signals.executor.queuedTasks.asTarget() + + g.query.prometheus.withLegendFormat('{{instance}} - queued'), + signals.executor.openSlots.asTarget() + + g.query.prometheus.withLegendFormat('{{instance}} - open'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // Panel 12: Pool task slots + poolTaskSlotsPanel: + g.panel.timeSeries.new('Pool task slots') + + g.panel.timeSeries.panelOptions.withDescription('The number of task slots available in the pools of the Apache Airflow system.') + + g.panel.timeSeries.queryOptions.withTargets([ + signals.pools.runningSlots.asTarget() + + g.query.prometheus.withLegendFormat('{{pool_name}} - running'), + signals.pools.queuedSlots.asTarget() + + g.query.prometheus.withLegendFormat('{{pool_name}} - queued'), + signals.pools.starvingTasks.asTarget() + + g.query.prometheus.withLegendFormat('{{pool_name}} - starving'), + signals.pools.openSlots.asTarget() + + g.query.prometheus.withLegendFormat('{{pool_name}} - open'), + ]) + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.standardOptions.withDecimals(0) + + g.panel.timeSeries.standardOptions.thresholds.withSteps([ + g.panel.timeSeries.thresholdStep.withColor('green') + + g.panel.timeSeries.thresholdStep.withValue(null), + g.panel.timeSeries.thresholdStep.withColor('red') + + g.panel.timeSeries.thresholdStep.withValue(80), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // + // Logs Dashboard Panels + // + + // Panel 13: Task logs + taskLogsPanel: + g.panel.logs.new('Task logs') + + g.panel.logs.panelOptions.withDescription('Logs for each individual task run on the DAGs.') + + g.panel.logs.queryOptions.withTargets([ + g.query.loki.new( + '${loki_datasource}', + '{' + this.config.filteringSelector + ', dag_id=~"$dag_id", task_id=~"$task_id", filename=~".*/airflow/logs/dag_id.*"} |= ``' + ), + ]) + + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowCommonLabels(false) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + + // Panel 14: Scheduler logs + schedulerLogsPanel: + g.panel.logs.new('Scheduler logs') + + g.panel.logs.panelOptions.withDescription('Logs for the scheduler in the Apache Airflow system.') + + g.panel.logs.queryOptions.withTargets([ + g.query.loki.new( + '${loki_datasource}', + '{' + this.config.filteringSelector + ', filename=~".*/airflow/logs/scheduler/.*"} |= ``' + ), + ]) + + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowCommonLabels(false) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + }, +} diff --git a/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml b/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml index 305c05bca..af716320f 100644 --- a/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -1,5 +1,5 @@ groups: - - name: apache-airflow + - name: apache-airflow-alerts rules: - alert: ApacheAirflowStarvingPoolTasks annotations: @@ -7,27 +7,27 @@ groups: The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0. summary: There are starved tasks detected in the Apache Airflow pool. expr: | - airflow_pool_starving_tasks > 0 + airflow_pool_starving_tasks{job=~"$job", instance=~"$instance"} > 0 for: 5m labels: severity: critical - - alert: ApacheAirflowDAGScheduleDelayWarningLevel + - alert: ApacheAirflowDAGScheduleDelayWarning annotations: description: | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10. summary: The delay in DAG schedule time to DAG run time has reached the warning threshold. expr: | - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10 + increase(airflow_dagrun_schedule_delay_sum{job=~"$job", instance=~"$instance"}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job=~"$job", instance=~"$instance"}[5m]),1) > 10 for: 1m labels: severity: warning - - alert: ApacheAirflowDAGScheduleDelayCriticalLevel + - alert: ApacheAirflowDAGScheduleDelayCritical annotations: description: | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60. summary: The delay in DAG schedule time to DAG run time has reached the critical threshold. expr: | - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60 + increase(airflow_dagrun_schedule_delay_sum{job=~"$job", instance=~"$instance"}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job=~"$job", instance=~"$instance"}[5m]),1) > 60 for: 1m labels: severity: critical @@ -37,7 +37,7 @@ groups: The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0. summary: There have been DAG failures detected. expr: | - increase(airflow_dagrun_duration_failed_count[5m]) > 0 + increase(airflow_dagrun_duration_failed_count{job=~"$job", instance=~"$instance"}[5m]) > 0 for: 1m labels: severity: critical diff --git a/apache-airflow-mixin/rows.libsonnet b/apache-airflow-mixin/rows.libsonnet new file mode 100644 index 000000000..c32e7425d --- /dev/null +++ b/apache-airflow-mixin/rows.libsonnet @@ -0,0 +1,48 @@ +local g = import './g.libsonnet'; + +{ + new(this): + { + // --- + // Overview Dashboard Rows + // --- + + // Main overview row + apacheAirflowOverview: + g.panel.row.new('Apache Airflow overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + this.grafana.panels.dagFileParsingErrorsPanel { gridPos+: { w: 8 } }, + this.grafana.panels.slaIssuesPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskFailuresPanel { gridPos+: { w: 8 } }, + this.grafana.panels.dagSuccessDurationPanel { gridPos+: { w: 12 } }, + this.grafana.panels.dagFailedDurationPanel { gridPos+: { w: 12 } }, + this.grafana.panels.taskDurationPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskCountSummaryPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskCountsPanel { gridPos+: { w: 8 } }, + ]), + + // Scheduler details row + apacheAirflowSchedulerDetails: + g.panel.row.new('Scheduler details') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + this.grafana.panels.dagScheduleDelayPanel { gridPos+: { w: 12 } }, + this.grafana.panels.schedulerTasksPanel { gridPos+: { w: 12 } }, + this.grafana.panels.executorTasksPanel { gridPos+: { w: 12 } }, + this.grafana.panels.poolTaskSlotsPanel { gridPos+: { w: 12 } }, + ]), + + // --- + // Logs Dashboard Rows + // --- + + apacheAirflowLogs: + g.panel.row.new('Apache Airflow logs') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + this.grafana.panels.taskLogsPanel { gridPos+: { w: 24 } }, + this.grafana.panels.schedulerLogsPanel { gridPos+: { w: 24 } }, + ]), + }, +} diff --git a/apache-airflow-mixin/signals/dags.libsonnet b/apache-airflow-mixin/signals/dags.libsonnet new file mode 100644 index 000000000..4a12c9545 --- /dev/null +++ b/apache-airflow-mixin/signals/dags.libsonnet @@ -0,0 +1,142 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + parsingErrors: { + name: 'DAG file parsing errors', + type: 'gauge', + description: 'The number of errors from trying to parse DAG files in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dag_processing_import_errors{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + successDurationSum: { + name: 'Successful DAG run duration sum', + type: 'counter', + description: 'Sum of successful DAG run durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_success_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + successDurationCount: { + name: 'Successful DAG run duration count', + type: 'counter', + description: 'Count of successful DAG runs.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_success_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + failedDurationSum: { + name: 'Failed DAG run duration sum', + type: 'counter', + description: 'Sum of failed DAG run durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_failed_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + failedDurationCount: { + name: 'Failed DAG run duration count', + type: 'counter', + description: 'Count of failed DAG runs.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_failed_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + scheduleDelaySum: { + name: 'DAG schedule delay sum', + type: 'counter', + description: 'Sum of DAG schedule delays.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_schedule_delay_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + scheduleDelayCount: { + name: 'DAG schedule delay count', + type: 'counter', + description: 'Count of DAG schedule delays.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_schedule_delay_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + // Raw signal for average successful DAG duration (sum/count) + avgSuccessDuration: { + name: 'Average successful DAG run duration', + type: 'raw', + description: 'Average duration of successful DAG runs calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_duration_success_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:]),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + // Raw signal for average failed DAG duration (sum/count) + avgFailedDuration: { + name: 'Average failed DAG run duration', + type: 'raw', + description: 'Average duration of failed DAG runs calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_duration_failed_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:]),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + // Raw signal for average DAG schedule delay (sum/count) + avgScheduleDelay: { + name: 'Average DAG schedule delay', + type: 'raw', + description: 'Average DAG schedule delay calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_schedule_delay_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_schedule_delay_count{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:]),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + }, + } diff --git a/apache-airflow-mixin/signals/executor.libsonnet b/apache-airflow-mixin/signals/executor.libsonnet new file mode 100644 index 000000000..5d810664d --- /dev/null +++ b/apache-airflow-mixin/signals/executor.libsonnet @@ -0,0 +1,48 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + runningTasks: { + name: 'Executor running tasks', + type: 'gauge', + description: 'Number of tasks currently running in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_running_tasks{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + queuedTasks: { + name: 'Executor queued tasks', + type: 'gauge', + description: 'Number of tasks queued in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_queued_tasks{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + openSlots: { + name: 'Executor open slots', + type: 'gauge', + description: 'Number of open slots available in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_open_slots{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + }, + } diff --git a/apache-airflow-mixin/signals/pools.libsonnet b/apache-airflow-mixin/signals/pools.libsonnet new file mode 100644 index 000000000..a1642dfd2 --- /dev/null +++ b/apache-airflow-mixin/signals/pools.libsonnet @@ -0,0 +1,61 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + runningSlots: { + name: 'Pool running slots', + type: 'gauge', + description: 'Number of slots currently running in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_running_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: '{{pool_name}}', + }, + }, + }, + + queuedSlots: { + name: 'Pool queued slots', + type: 'gauge', + description: 'Number of slots queued in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_queued_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: '{{pool_name}}', + }, + }, + }, + + starvingTasks: { + name: 'Pool starving tasks', + type: 'gauge', + description: 'Number of tasks starving (waiting for resources) in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_starving_tasks{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: '{{pool_name}}', + }, + }, + }, + + openSlots: { + name: 'Pool open slots', + type: 'gauge', + description: 'Number of open slots available in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_open_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: '{{pool_name}}', + }, + }, + }, + }, + } diff --git a/apache-airflow-mixin/signals/scheduler.libsonnet b/apache-airflow-mixin/signals/scheduler.libsonnet new file mode 100644 index 000000000..b93ee4c96 --- /dev/null +++ b/apache-airflow-mixin/signals/scheduler.libsonnet @@ -0,0 +1,35 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + tasksExecutable: { + name: 'Scheduler executable tasks', + type: 'gauge', + description: 'Number of tasks that are ready for execution in the scheduler.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_scheduler_tasks_executable{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + tasksStarving: { + name: 'Scheduler starving tasks', + type: 'gauge', + description: 'Number of tasks that are starving (waiting for resources) in the scheduler.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_scheduler_tasks_starving{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + }, + } diff --git a/apache-airflow-mixin/signals/tasks.libsonnet b/apache-airflow-mixin/signals/tasks.libsonnet new file mode 100644 index 000000000..ef576dd3c --- /dev/null +++ b/apache-airflow-mixin/signals/tasks.libsonnet @@ -0,0 +1,104 @@ +function(this) + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + failures: { + name: 'Task failures', + type: 'counter', + rangeFunction: 'increase', + description: 'Overall task instance failures in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_ti_failures{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + slaMisses: { + name: 'SLA misses', + type: 'counter', + rangeFunction: 'increase', + description: 'SLA misses in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_sla_missed{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + durationSum: { + name: 'Task duration sum', + type: 'counter', + description: 'Sum of task durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dag_task_duration_sum{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + durationCount: { + name: 'Task duration count', + type: 'counter', + description: 'Count of task durations.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dag_task_duration_count{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + finishTotal: { + name: 'Task finish total', + type: 'counter', + rangeFunction: 'increase', + description: 'Total number of task finishes by state.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_task_finish_total{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id", state=~"$state"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}} - {{state}}', + }, + }, + }, + + // Raw signal for average task duration (sum/count) + avgDuration: { + name: 'Average task duration', + type: 'raw', + description: 'Average duration of tasks calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dag_task_duration_sum{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:])/clamp_min(increase(airflow_dag_task_duration_count{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:]),1)', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + startTotal: { + name: 'Task start total', + type: 'counter', + description: 'Total number of task starts.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_task_start_total{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + }, + }