diff --git a/apache-airflow-mixin/alerts.libsonnet b/apache-airflow-mixin/alerts.libsonnet
new file mode 100644
index 000000000..ed70ae359
--- /dev/null
+++ b/apache-airflow-mixin/alerts.libsonnet
@@ -0,0 +1,81 @@
+// Prometheus alerting rules for the Apache Airflow mixin.
+// Selectors and thresholds are interpolated from `this.config`.
+{
+  new(this): {
+    groups: [
+      {
+        name: this.config.uid + '-alerts',
+        rules: [
+          // Fires when a pool reports more starving tasks than the threshold for 5m.
+          {
+            alert: 'ApacheAirflowStarvingPoolTasks',
+            expr: |||
+              airflow_pool_starving_tasks{%(filteringSelector)s} > %(alertsCriticalPoolStarvingTasks)s
+            ||| % this.config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'There are starved tasks detected in the Apache Airflow pool.',
+              description: |||
+                The number of starved tasks is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of %(alertsCriticalPoolStarvingTasks)s.
+              ||| % this.config,
+            },
+          },
+          // Average schedule delay over 5m (delay sum / run count, count clamped to >= 1).
+          {
+            alert: 'ApacheAirflowDAGScheduleDelayWarning',
+            expr: |||
+              increase(airflow_dagrun_schedule_delay_sum{%(filteringSelector)s}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{%(filteringSelector)s}[5m]),1) > %(alertsWarningDAGScheduleDelayLevel)s
+            ||| % this.config,
+            'for': '1m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'The delay in DAG schedule time to DAG run time has reached the warning threshold.',
+              description: |||
+                The average delay in DAG schedule to run time is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsWarningDAGScheduleDelayLevel)s.
+              ||| % this.config,
+            },
+          },
+          // Same expression as the warning rule, compared against the critical threshold.
+          {
+            alert: 'ApacheAirflowDAGScheduleDelayCritical',
+            expr: |||
+              increase(airflow_dagrun_schedule_delay_sum{%(filteringSelector)s}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{%(filteringSelector)s}[5m]),1) > %(alertsCriticalDAGScheduleDelayLevel)s
+            ||| % this.config,
+            'for': '1m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'The delay in DAG schedule time to DAG run time has reached the critical threshold.',
+              description: |||
+                The average delay in DAG schedule to run time is {{ printf "%%.0f" $value }} over the last 5m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsCriticalDAGScheduleDelayLevel)s.
+              ||| % this.config,
+            },
+          },
+          // Fires when the increase in failed DAG runs over 5m exceeds the threshold.
+          {
+            alert: 'ApacheAirflowDAGFailures',
+            expr: |||
+              increase(airflow_dagrun_duration_failed_count{%(filteringSelector)s}[5m]) > %(alertsCriticalFailedDAGs)s
+            ||| % this.config,
+            'for': '1m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'There have been DAG failures detected.',
+              description: |||
+                The number of DAG failures seen is {{ printf "%%.0f" $value }} over the last 5m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of %(alertsCriticalFailedDAGs)s.
+              ||| % this.config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/apache-airflow-mixin/config.libsonnet b/apache-airflow-mixin/config.libsonnet
index db3db9025..1dd6bfa38 100644
--- a/apache-airflow-mixin/config.libsonnet
+++ b/apache-airflow-mixin/config.libsonnet
@@ -1,20 +1,38 @@
 {
-  _config+:: {
-    dashboardTags: ['apache-airflow-mixin'],
-    dashboardPeriod: 'now-1h',
-    dashboardTimezone: 'default',
-    dashboardRefresh: '1m',
+  local this = self,
+  filteringSelector: 'job="integrations/apache-airflow"',
+  groupLabels: ['job'],
+  instanceLabels: ['instance'],
+  overviewLabels: [],
+  dashboardTags: ['apache-airflow-mixin'],
+  uid: 'apache-airflow',
+  dashboardNamePrefix: 'Apache Airflow',
 
-    //alert thresholds
-    alertsCriticalPoolStarvingTasks: 0,
-    alertsWarningDAGScheduleDelayLevel: 10,  //s
-    alertsCriticalDAGScheduleDelayLevel: 60,  //s
-    alertsCriticalFailedDAGs: 0,
+  // additional params
+  dashboardPeriod: 'now-6h',
+  dashboardTimezone: 'default',
+  dashboardRefresh: '1m',
 
-    enableLokiLogs: true,
-    enableMultiCluster: false,
+  // logs lib related
+  enableLokiLogs: true,
+  logLabels: ['job', 'instance'],
+  extraLogLabels: ['dag_file', 'filename'],  // Required by logs-lib
+  logsVolumeGroupBy: 'level',
+  showLogsVolume: true,
 
-    multiclusterSelector: 'job=~"$job"',
-    airflowSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
+  // alert thresholds
+  alertsCriticalPoolStarvingTasks: 0,  // count
+  alertsWarningDAGScheduleDelayLevel: 10,  // s
+  alertsCriticalDAGScheduleDelayLevel: 60,  // s
+  alertsCriticalFailedDAGs: 0,  // count
+
+  // multi-cluster support
+  enableMultiCluster: false,
+
+  // metrics source for signals library
+  metricsSource: 'prometheus',
+
+  signals+: {
+    overview: (import './signals/overview.libsonnet')(this),
   },
 }
diff --git a/apache-airflow-mixin/dashboards.libsonnet b/apache-airflow-mixin/dashboards.libsonnet
new file mode 100644
index 000000000..544bc424f
--- /dev/null
+++
b/apache-airflow-mixin/dashboards.libsonnet
@@ -0,0 +1,107 @@
+local g = import './g.libsonnet';
+local logslib = import 'logs-lib/logs/main.libsonnet';
+
+// Builds a multi-select query variable (with an "All" option matching '.+')
+// listing the values of label `name` found on `metric` for the selected job
+// and instance. `label` is the display name shown in the dashboard.
+local labelValuesVariable(datasource, name, label, metric) =
+  local query = g.dashboard.variable.query;
+  query.new(name)
+  + query.withDatasourceFromVariable(datasource)
+  + query.queryTypes.withLabelValues(name, metric + '{job=~"$job", instance=~"$instance"}')
+  + query.generalOptions.withLabel(label)
+  + query.selectionOptions.withMulti(true)
+  + query.selectionOptions.withIncludeAll(true, '.+')
+  + query.refresh.onLoad()
+  + query.refresh.onTime();
+
+{
+  local root = self,
+
+  // Returns the mixin's dashboards keyed by output file name: the overview
+  // dashboard and, when `enableLokiLogs` is set, the logs dashboard.
+  new(this)::
+    local prefix = this.config.dashboardNamePrefix;
+    local links = this.grafana.links;
+    local tags = this.config.dashboardTags;
+    local uid = g.util.string.slugify(this.config.uid);
+    local vars = this.grafana.variables;
+    local annotations = this.grafana.annotations;
+    local refresh = this.config.dashboardRefresh;
+    local period = this.config.dashboardPeriod;
+    local timezone = this.config.dashboardTimezone;
+    local promDatasource = vars.datasources.prometheus;
+
+    {
+      // Apache Airflow overview dashboard
+      'apache-airflow-overview.json':
+        g.dashboard.new(prefix + ' overview')
+        + g.dashboard.withDescription('Dashboard providing an overview of Apache Airflow DAGs, tasks, and scheduler performance.')
+        + g.dashboard.withPanels(
+          g.util.panel.resolveCollapsedFlagOnRows(
+            g.util.grid.wrapPanels(
+              [
+                this.grafana.rows.apacheAirflowOverview,
+                this.grafana.rows.apacheAirflowSchedulerDetails,
+              ]
+            )
+          )
+        )
+        + root.applyCommon(
+          vars.multiInstance + [
+            labelValuesVariable(promDatasource, 'dag_id', 'DAG ID', 'airflow_dagrun_duration_success_sum'),
+            labelValuesVariable(promDatasource, 'task_id', 'Task', 'airflow_task_finish_total'),
+            labelValuesVariable(promDatasource, 'state', 'State', 'airflow_task_finish_total'),
+            labelValuesVariable(promDatasource, 'pool_name', 'Pool name', 'airflow_pool_running_slots'),
+          ],
+          uid + '_overview',
+          tags,
+          links { apacheAirflowOverview+:: {} },
+          annotations,
+          timezone,
+          refresh,
+          period
+        ),
+    }
+    +
+    if this.config.enableLokiLogs then
+      {
+        'apache-airflow-logs.json':
+          logslib.new(
+            prefix + ' logs',
+            datasourceName=this.grafana.variables.datasources.loki.name,
+            datasourceRegex=this.grafana.variables.datasources.loki.regex,
+            filterSelector=this.config.filteringSelector,
+            labels=this.config.groupLabels + this.config.extraLogLabels,
+            // Honor the logs-volume options declared in config.libsonnet.
+            showLogsVolume=this.config.showLogsVolume,
+            logsVolumeGroupBy=this.config.logsVolumeGroupBy,
+          )
+          {
+            dashboards+:
+              {
+                logs+:
+                  root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { logs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
+              },
+            panels+:
+              {
+                logs+:
+                  g.panel.logs.options.withEnableLogDetails(true)
+                  + g.panel.logs.options.withShowTime(false)
+                  + g.panel.logs.options.withWrapLogMessage(false),
+              },
+          }.dashboards.logs,
+      } else {},
+
+  // Applies the settings shared by every dashboard: tags, uid, links,
+  // timezone, refresh interval, default time range, variables and annotations.
+  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
+    g.dashboard.withTags(tags)
+    + g.dashboard.withUid(uid)
+    + g.dashboard.withLinks(std.objectValues(links))
+    + g.dashboard.withTimezone(timezone)
+    + g.dashboard.withRefresh(refresh)
+    + g.dashboard.time.withFrom(period)
+    + g.dashboard.withVariables(vars)
+    + g.dashboard.withAnnotations(std.objectValues(annotations)),
+}
diff --git a/apache-airflow-mixin/dashboards/airflow-overview.libsonnet b/apache-airflow-mixin/dashboards/airflow-overview.libsonnet
deleted file mode 100644
index 02fe75a12..000000000
--- a/apache-airflow-mixin/dashboards/airflow-overview.libsonnet
+++ /dev/null
@@ -1,1183 +0,0 @@
-local g = (import 'grafana-builder/grafana.libsonnet');
-local grafana = (import 'grafonnet/grafana.libsonnet');
-local dashboard = grafana.dashboard;
-local template = grafana.template;
-local prometheus = grafana.prometheus;
-
-local dashboardUid = 'apache-airflow-overview';
-
-local promDatasourceName = 'prometheus_datasource';
-local lokiDatasourceName = 'loki_datasource';
-
-local getMatcher(cfg) = '%(airflowSelector)s, instance=~"$instance"' % cfg;
-
-local promDatasource = {
-  uid: '${%s}' % promDatasourceName,
-};
-
-local lokiDatasource = {
-  uid: '${%s}' % lokiDatasourceName,
-};
-
-local dagFileParsingErrorsPanel(matcher) = {
-  datasource: promDatasource,
-  targets: [
-    prometheus.target(
-      'airflow_dag_processing_import_errors{' + matcher + '}',
-      datasource=promDatasource,
-      legendFormat='{{instance}}',
-    ),
-  ],
-  type: 'timeseries',
-  title: 'DAG file parsing errors',
-  description: 'The number of errors from trying to parse DAG
files in an Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local slaMissesPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_sla_missed{' + matcher + '}[$__interval:])', - datasource=promDatasource, - legendFormat='{{instance}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'SLA misses', - description: 'The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 
0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local taskFailuresPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_ti_failures{' + matcher + '}[$__interval:])', - datasource=promDatasource, - legendFormat='{{instance}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'Task failures', - description: 'The overall task instances failures for an Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 1, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'single', - sort: 'none', - }, - }, -}; - -local dagSuccessDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_duration_success_sum{' + matcher + ', 
dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG success duration', - description: 'The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local dagFailedDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_duration_failed_sum{' + matcher + ', dag_id=~"$dag_id"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG failed duration', - description: 'The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - 
axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local taskDurationPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dag_task_duration_sum{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:]) / clamp_min(increase(airflow_dag_task_duration_count{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:]),1) != 0', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{task_id}}', - interval='1m', - ), - ], - type: 'bargauge', - title: 'Task duration', - description: 'The average time taken for recent task runs by Task ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - displayMode: 'gradient', - minVizHeight: 10, - minVizWidth: 0, - orientation: 'horizontal', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showUnfilled: true, - text: {}, - }, - pluginVersion: '9.2.3', - transformations: [], -}; - -local 
taskCountSummaryPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{' + matcher + ', dag_id=~"$dag_id"}[$__interval:])) != 0', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{state}}', - interval='1m', - ), - ], - type: 'piechart', - title: 'Task count summary', - description: 'The number of task counts by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - }, - mappings: [], - }, - overrides: [], - }, - options: { - legend: { - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - pieType: 'pie', - reduceOptions: { - calcs: [ - 'sum', - ], - fields: '', - values: false, - }, - tooltip: { - mode: 'multi', - sort: 'asc', - }, - }, -}; - -local taskCountsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_task_finish_total{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id", state=~"$state"}[$__interval:])', - datasource=promDatasource, - legendFormat='{{dag_id}} - {{task_id}} - {{state}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'Task counts', - description: 'The number of task counts by Task ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], 
- min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, - pluginVersion: '9.2.3', -}; - -local taskLogsPanel(matcher) = { - datasource: lokiDatasource, - targets: [ - { - datasource: lokiDatasource, - editorMode: 'code', - expr: '{' + matcher + ', dag_id=~"$dag_id", task_id=~"$task_id", filename=~".*/airflow/logs/dag_id.*"} |= ``', - queryType: 'range', - refId: 'A', - }, - ], - type: 'logs', - title: 'Task logs', - description: 'Logs for each individual task run on the DAGs.', - options: { - dedupStrategy: 'none', - enableLogDetails: true, - prettifyLogMessage: false, - showCommonLabels: false, - showLabels: false, - showTime: false, - sortOrder: 'Descending', - wrapLogMessage: false, - }, -}; - -local schedulerDetailsRow = { - datasource: promDatasource, - targets: [], - type: 'row', - title: 'Scheduler details', - collapsed: false, -}; - -local dagScheduleDelayPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'increase(airflow_dagrun_schedule_delay_sum{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{' + matcher + ', dag_id=~"$dag_id"}[$__interval:]),1)', - datasource=promDatasource, - legendFormat='{{instance}} - {{dag_id}}', - interval='1m', - ), - ], - type: 'timeseries', - title: 'DAG schedule delay', - description: 'The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - 
hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 's', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local schedulerTasksPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_scheduler_tasks_executable{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - executable', - ), - prometheus.target( - 'airflow_scheduler_tasks_starving{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - starving', - ), - ], - type: 'timeseries', - title: 'Scheduler tasks', - description: 'The number of current tasks that the scheduler is handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 
'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local executorTasksPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_executor_running_tasks{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - running', - ), - prometheus.target( - 'airflow_executor_queued_tasks{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - queued', - ), - prometheus.target( - 'airflow_executor_open_slots{' + matcher + '}', - datasource=promDatasource, - legendFormat='{{instance}} - open', - ), - ], - type: 'timeseries', - title: 'Executor tasks', - description: 'The number of current tasks that the executors are handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local poolTaskSlotsPanel(matcher) = { - datasource: promDatasource, - targets: [ - prometheus.target( - 'airflow_pool_running_slots{' + 
matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - running', - ), - prometheus.target( - 'airflow_pool_queued_slots{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - queued', - ), - prometheus.target( - 'airflow_pool_starving_tasks{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - starving', - ), - prometheus.target( - 'airflow_pool_open_slots{' + matcher + ', pool_name=~"$pool_name"}', - datasource=promDatasource, - legendFormat='{{instance}} - {{pool_name}} - open', - ), - ], - type: 'timeseries', - title: 'Pool task slots', - description: 'The number of current task slots that the pools are handling in the Apache Airflow system.', - fieldConfig: { - defaults: { - color: { - mode: 'palette-classic', - }, - custom: { - axisCenteredZero: false, - axisColorMode: 'text', - axisLabel: '', - axisPlacement: 'auto', - barAlignment: 0, - drawStyle: 'line', - fillOpacity: 0, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineInterpolation: 'linear', - lineWidth: 1, - pointSize: 5, - scaleDistribution: { - type: 'linear', - }, - showPoints: 'auto', - spanNulls: false, - stacking: { - group: 'A', - mode: 'none', - }, - thresholdsStyle: { - mode: 'off', - }, - }, - decimals: 0, - mappings: [], - min: 0, - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - { - color: 'red', - value: 80, - }, - ], - }, - unit: 'none', - }, - overrides: [], - }, - options: { - legend: { - calcs: [], - displayMode: 'list', - placement: 'right', - showLegend: true, - }, - tooltip: { - mode: 'multi', - sort: 'none', - }, - }, -}; - -local schedulerLogsPanel(matcher) = { - datasource: lokiDatasource, - targets: [ - { - datasource: lokiDatasource, - editorMode: 'code', - expr: '{' + matcher + ', 
dag_file=~"$dag_file", filename=~".*/airflow/logs/scheduler/latest/.*"} |= ``', - queryType: 'range', - refId: 'A', - }, - ], - type: 'logs', - title: 'Scheduler logs', - description: 'Shows the scheduler logs by DAG file.', - options: { - dedupStrategy: 'none', - enableLogDetails: true, - prettifyLogMessage: false, - showCommonLabels: false, - showLabels: false, - showTime: false, - sortOrder: 'Descending', - wrapLogMessage: false, - }, -}; - -{ - grafanaDashboards+:: { - 'apache-airflow-overview.json': - dashboard.new( - 'Apache Airflow overview', - time_from='%s' % $._config.dashboardPeriod, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - description='', - uid=dashboardUid, - ) - - .addTemplates( - std.flattenArrays([ - [ - template.datasource( - promDatasourceName, - 'prometheus', - null, - label='Prometheus data source', - refresh='load' - ), - ], - if $._config.enableLokiLogs then [ - template.datasource( - lokiDatasourceName, - 'loki', - null, - label='Loki data source', - refresh='load' - ), - ] else [], - [ - template.new( - 'job', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable,job)', - label='Job', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'cluster', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable{%(multiclusterSelector)s}, cluster)' % $._config, - label='Cluster', - refresh=2, - includeAll=true, - multi=true, - allValues='.*', - hide=if $._config.enableMultiCluster then '' else 'variable', - sort=0 - ), - template.new( - 'instance', - promDatasource, - 'label_values(airflow_scheduler_tasks_executable{%(airflowSelector)s}, instance)' % $._config, - label='Instance', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'dag_id', - promDatasource, - 'label_values(airflow_task_start_total{%(airflowSelector)s, instance=~"$instance"}, dag_id)' 
% $._config, - label='DAG', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'task_id', - promDatasource, - 'label_values(airflow_task_start_total{%(airflowSelector)s, instance=~"$instance", dag_id=~"$dag_id"}, task_id)' % $._config, - label='Task', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'state', - promDatasource, - 'label_values(airflow_task_finish_total{%(airflowSelector)s, instance=~"$instance", dag_id=~"$dag_id", task_id=~"$task_id"}, state)' % $._config, - label='Task state', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - template.new( - 'pool_name', - promDatasource, - 'label_values(airflow_pool_open_slots{%(airflowSelector)s, instance=~"$instance"}, pool_name)' % $._config, - label='Pool', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - - ], - if $._config.enableLokiLogs then [ - template.new( - 'dag_file', - lokiDatasource, - 'label_values(dag_file)', - label='DAG file', - refresh=2, - includeAll=true, - multi=true, - allValues='.+', - sort=0 - ), - ] else [], - ]) - ) - .addPanels( - std.flattenArrays([ - [ - dagFileParsingErrorsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 0, y: 0 } }, - slaMissesPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 8, y: 0 } }, - taskFailuresPanel(getMatcher($._config)) { gridPos: { h: 8, w: 8, x: 16, y: 0 } }, - dagSuccessDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } }, - dagFailedDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } }, - taskDurationPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 16 } }, - taskCountSummaryPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 16 } }, - taskCountsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 24 } }, - ], - if $._config.enableLokiLogs then [ - taskLogsPanel(getMatcher($._config)) { gridPos: { h: 
8, w: 24, x: 0, y: 32 } }, - ] else [], - [ - schedulerDetailsRow { gridPos: { h: 1, w: 24, x: 0, y: 40 } }, - dagScheduleDelayPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 41 } }, - schedulerTasksPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 41 } }, - executorTasksPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 49 } }, - poolTaskSlotsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 49 } }, - ], - if $._config.enableLokiLogs then [ - schedulerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 57 } }, - ] else [], - [ - ], - ]) - ), - }, -} diff --git a/apache-airflow-mixin/dashboards/dashboards.libsonnet b/apache-airflow-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index 68ee182ef..000000000 --- a/apache-airflow-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1 +0,0 @@ -(import 'airflow-overview.libsonnet') diff --git a/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json b/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json new file mode 100644 index 000000000..5c1466c41 --- /dev/null +++ b/apache-airflow-mixin/dashboards_out/apache-airflow-logs.json @@ -0,0 +1,305 @@ +{ + "annotations": { + "list": [ ] + }, + "links": [ + { + "keepTime": true, + "title": "Apache Airflow overview", + "type": "link", + "url": "/d/apacheairflow_overview" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "apache-airflow-mixin" + ], + "title": "All dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "description": "Logs volume grouped by \"level\" label.", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { + "mode": "normal" + } + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(E|e)merg|(F|f)atal|(A|a)lert|(C|c)rit.*" + }, + 
"properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(E|e)(rr.*|RR.*)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(W|w)(arn.*|ARN.*|rn|RN)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(N|n)(otice|ote)|(I|i)(nf.*|NF.*)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "dbg.*|DBG.*|(D|d)(EBUG|ebug)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(T|t)(race|RACE)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "logs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "maxDataPoints": 100, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "sum by (level) (count_over_time({job=\"integrations/apache-airflow\",job=~\"$job\",dag_file=~\"$dag_file\",filename=~\"$filename\"}\n|~ \"$regex_search\"\n\n[$__auto]))\n", + "legendFormat": "{{ level }}" + } + ], + "title": "Logs volume", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Value", + "renamePattern": "logs" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + 
"type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 18, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 2, + "options": { + "dedupStrategy": "exact", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showTime": false, + "wrapLogMessage": false + }, + "pluginVersion": "v11.0.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "{job=\"integrations/apache-airflow\",job=~\"$job\",dag_file=~\"$dag_file\",filename=~\"$filename\"} \n|~ \"$regex_search\"\n\n\n" + } + ], + "title": "Logs", + "type": "logs" + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": [ + "apache-airflow-mixin" + ], + "templating": { + "list": [ + { + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values({job=\"integrations/apache-airflow\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "DAG", + "multi": true, + "name": "dag_file", + "query": "label_values({job=\"integrations/apache-airflow\",job=~\"$job\"}, dag_file)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Filename", + "multi": true, + "name": "filename", + "query": "label_values({job=\"integrations/apache-airflow\",job=~\"$job\",dag_file=~\"$dag_file\"}, filename)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "label": "Regex search", + "name": "regex_search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": 
"", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "default", + "title": "Apache Airflow logs", + "uid": "apacheairflow-logs" + } \ No newline at end of file diff --git a/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json b/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json index 69e0b0031..a80139a7c 100644 --- a/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json +++ b/apache-airflow-mixin/dashboards_out/apache-airflow-overview.json @@ -1,105 +1,87 @@ { - "__inputs": [ ], - "__requires": [ ], "annotations": { "list": [ ] }, - "description": "", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ ], + "description": "Dashboard providing an overview of Apache Airflow DAGs, tasks, and scheduler performance.", + "links": [ + { + "keepTime": true, + "title": "Apache Airflow logs", + "type": "link", + "url": "/d/apacheairflow-logs" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "apache-airflow-mixin" + ], + "title": "All dashboards", + "type": "dashboards" + } + ], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 0, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Apache Airflow overview", + "type": "row" + }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of errors from trying to parse DAG files in an Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" 
- }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 0, - "y": 0 + "y": 1 }, "id": 2, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_dag_processing_import_errors{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_dag_processing_import_errors{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "legendFormat": "{{instance}}", + "refId": "DAG file parsing errors" } ], "title": "DAG file parsing errors", @@ -107,93 +89,52 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, 
- "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 8, - "y": 0 + "y": 1 }, "id": 3, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_sla_missed{job=~\"$job\", instance=~\"$instance\"}[$__interval:])", + "expr": "increase(airflow_sla_missed{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}[$__interval:] offset -$__interval)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{instance}}", + "refId": "SLA misses" } ], "title": "SLA misses", @@ -201,93 +142,52 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The overall task instances failures for an Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - 
"lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 8, "x": 16, - "y": 0 + "y": 1 }, "id": 4, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { - "mode": "single", - "sort": "none" + "mode": "multi", + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_ti_failures{job=~\"$job\", instance=~\"$instance\"}[$__interval:])", + "expr": "increase(airflow_ti_failures{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}[$__interval:] offset -$__interval)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{instance}}", + "refId": "Task failures" } ], "title": "Task failures", @@ -295,92 +195,53 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - 
"gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 9 }, "id": 5, "options": { "legend": { "calcs": [ ], "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_dagrun_duration_success_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_success_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_duration_success_sum{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_duration_success_count{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval),1)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}}", + "refId": "Average successful DAG run duration" } ], "title": "DAG success 
duration", @@ -388,92 +249,53 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 9 }, "id": 6, "options": { "legend": { "calcs": [ ], "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_dagrun_duration_failed_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])/clamp_min(increase(airflow_dagrun_duration_failed_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_duration_failed_sum{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", 
dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_duration_failed_count{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval),1)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}}", + "refId": "Average failed DAG run duration" } ], "title": "DAG failed duration", @@ -481,125 +303,80 @@ }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The average time taken for recent task runs by Task ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, - "w": 12, + "w": 8, "x": 0, - "y": 16 + "y": 17 }, "id": 7, "options": { - "displayMode": "gradient", - "minVizHeight": 10, - "minVizWidth": 0, - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": { } + "orientation": "horizontal" }, - "pluginVersion": "9.2.3", + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_dag_task_duration_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:]) / clamp_min(increase(airflow_dag_task_duration_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:]),1) != 0", + "expr": "increase(airflow_dag_task_duration_sum{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:] offset 
-$__interval)/clamp_min(increase(airflow_dag_task_duration_count{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}[$__interval:] offset -$__interval),1) != 0", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}} - {{task_id}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}} - {{task_id}}", + "refId": "Average task duration" } ], "title": "Task duration", - "transformations": [ ], "type": "bargauge" }, { "datasource": { - "uid": "${prometheus_datasource}" + "type": "datasource", + "uid": "-- Mixed --" }, "description": "The number of task counts by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [ ] - }, - "overrides": [ ] + "unit": "none" + } }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 16 + "w": 8, + "x": 8, + "y": 17 }, "id": 8, "options": { "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "sum" - ], - "fields": "", - "values": false - }, - "tooltip": { - "mode": "multi", - "sort": "asc" + "asTable": true, + "placement": "right" } }, + "pluginVersion": "v11.4.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:])) != 0", + "expr": "sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__interval:] offset -$__interval) > 0)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - 
"legendFormat": "{{dag_id}} - {{state}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}} - {{state}}", + "refId": "Task finish total pie" } ], "title": "Task count summary", @@ -607,235 +384,120 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of task counts by Task ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, - "w": 24, - "x": 0, - "y": 24 + "w": 8, + "x": 16, + "y": 17 }, "id": 9, "options": { "legend": { "calcs": [ ], "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, - "pluginVersion": "9.2.3", + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__interval:])", + "expr": 
"increase(airflow_task_finish_total{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", state=~\"$state\"}[$__interval:] offset -$__interval) > 0", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{dag_id}} - {{task_id}} - {{state}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}} - {{task_id}} - {{state}}", + "refId": "Task finish total" } ], "title": "Task counts", "type": "timeseries" }, - { - "datasource": { - "uid": "${loki_datasource}" - }, - "description": "Logs for each individual task run on the DAGs.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 10, - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": false, - "sortOrder": "Descending", - "wrapLogMessage": false - }, - "targets": [ - { - "datasource": { - "uid": "${loki_datasource}" - }, - "editorMode": "code", - "expr": "{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\", filename=~\".*/airflow/logs/dag_id.*\"} |= ``", - "queryType": "range", - "refId": "A" - } - ], - "title": "Task logs", - "type": "logs" - }, { "collapsed": false, - "datasource": { - "uid": "${prometheus_datasource}" - }, "gridPos": { "h": 1, - "w": 24, + "w": 0, "x": 0, - "y": 40 + "y": 25 }, - "id": 11, - "targets": [ ], + "id": 10, + "panels": [ ], "title": "Scheduler details", "type": "row" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - 
"drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "s" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 41 + "y": 26 }, - "id": 12, + "id": 11, "options": { "legend": { "calcs": [ ], "displayMode": "list", - "placement": "right", - "showLegend": true + "placement": "right" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "increase(airflow_dagrun_schedule_delay_sum{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:]),1)", + "expr": "increase(airflow_dagrun_schedule_delay_sum{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_schedule_delay_count{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", dag_id=~\"$dag_id\"}[$__interval:] offset -$__interval),1)", "format": "time_series", - "interval": "1m", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{dag_id}}" + "instant": false, + "interval": "2m", + "legendFormat": "{{dag_id}}", + 
"refId": "Average DAG schedule delay" } ], "title": "DAG schedule delay", @@ -843,101 +505,62 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of current tasks that the scheduler is handling in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 41 + "y": 26 }, - "id": 13, + "id": 12, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_scheduler_tasks_executable{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_scheduler_tasks_executable{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - executable" + 
"instant": false, + "legendFormat": "{{instance}} - executable", + "refId": "Scheduler executable tasks" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_scheduler_tasks_starving{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_scheduler_tasks_starving{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - starving" + "instant": false, + "legendFormat": "{{instance}} - starving", + "refId": "Scheduler starving tasks" } ], "title": "Scheduler tasks", @@ -945,110 +568,73 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of current tasks that the executors are handling in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 34 }, - "id": 14, + "id": 13, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "bottom", - 
"showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_executor_running_tasks{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_running_tasks{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - running" + "instant": false, + "legendFormat": "{{instance}} - running", + "refId": "Executor running tasks" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_executor_queued_tasks{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_queued_tasks{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - queued" + "instant": false, + "legendFormat": "{{instance}} - queued", + "refId": "Executor queued tasks" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_executor_open_slots{job=~\"$job\", instance=~\"$instance\"}", + "expr": "airflow_executor_open_slots{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - open" + "instant": false, + "legendFormat": "{{instance}} - open", + "refId": "Executor open slots" } ], "title": "Executor tasks", @@ -1056,401 +642,205 @@ }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, "description": "The number of current task slots that the pools are handling in the Apache Airflow system.", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": 
"auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "mappings": [ ], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "fillOpacity": 30, + "gradientMode": "opacity", + "lineInterpolation": "smooth", + "lineWidth": 2, + "showPoints": "never" }, "unit": "none" - }, - "overrides": [ ] + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 49 + "y": 34 }, - "id": 15, + "id": 14, "options": { "legend": { "calcs": [ ], - "displayMode": "list", - "placement": "right", - "showLegend": true + "displayMode": "list" }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, + "pluginVersion": "v11.0.0", "targets": [ { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_pool_running_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_running_slots{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - running" + "instant": false, + "legendFormat": "{{instance}} - {{pool_name}} - running", + "refId": "Pool running slots" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_pool_queued_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_queued_slots{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", 
pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - queued" + "instant": false, + "legendFormat": "{{instance}} - {{pool_name}} - queued", + "refId": "Pool queued slots" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_pool_starving_tasks{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_starving_tasks{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - starving" + "instant": false, + "legendFormat": "{{instance}} - {{pool_name}} - starving", + "refId": "Pool starving tasks" }, { "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "expr": "airflow_pool_open_slots{job=~\"$job\", instance=~\"$instance\", pool_name=~\"$pool_name\"}", + "expr": "airflow_pool_open_slots{job=\"integrations/apache-airflow\",job=~\"$job\",instance=~\"$instance\", pool_name=~\"$pool_name\"}", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} - {{pool_name}} - open" + "instant": false, + "legendFormat": "{{instance}} - {{pool_name}} - open", + "refId": "Pool open slots" } ], "title": "Pool task slots", "type": "timeseries" - }, - { - "datasource": { - "uid": "${loki_datasource}" - }, - "description": "Shows the scheduler logs by DAG file.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 57 - }, - "id": 16, - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": false, - "sortOrder": "Descending", - "wrapLogMessage": false - }, - "targets": [ - { - "datasource": { - "uid": "${loki_datasource}" - }, - "editorMode": "code", - "expr": "{job=~\"$job\", instance=~\"$instance\", dag_file=~\"$dag_file\", 
filename=~\".*/airflow/logs/scheduler/latest/.*\"} |= ``", - "queryType": "range", - "refId": "A" - } - ], - "title": "Scheduler logs", - "type": "logs" } ], "refresh": "1m", - "rows": [ ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "apache-airflow-mixin" ], "templating": { "list": [ { - "current": { }, - "hide": 0, "label": "Prometheus data source", "name": "prometheus_datasource", - "options": [ ], "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "current": { }, - "hide": 0, - "label": "Loki data source", - "name": "loki_datasource", - "options": [ ], - "query": "loki", - "refresh": 1, "regex": "", "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Job", "multi": true, "name": "job", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable,job)", + "query": "label_values(airflow_dagbag_size{job=\"integrations/apache-airflow\"}, job)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { }, - "datasource": { - "uid": "${prometheus_datasource}" - }, - "hide": 2, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable{job=~\"$job\"}, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "sort": 1, + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Instance", "multi": true, "name": "instance", - "options": [ ], - "query": "label_values(airflow_scheduler_tasks_executable{job=~\"$job\"}, instance)", 
+ "query": "label_values(airflow_dagbag_size{job=\"integrations/apache-airflow\",job=~\"$job\"}, instance)", "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "hide": 2, + "label": "Loki data source", + "name": "loki_datasource", + "query": "loki", "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "datasource" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "DAG", "multi": true, "name": "dag_id", - "options": [ ], - "query": "label_values(airflow_task_start_total{job=~\"$job\", instance=~\"$instance\"}, dag_id)", + "query": "label_values(airflow_dagrun_duration_success_sum{job=~\"$job\", instance=~\"$instance\"}, dag_id)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, "label": "Task", "multi": true, "name": "task_id", - "options": [ ], - "query": "label_values(airflow_task_start_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\"}, task_id)", + "query": "label_values(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\"}, task_id)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "Task state", + "label": "State", "multi": true, "name": "state", - "options": [ ], - "query": "label_values(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\", dag_id=~\"$dag_id\", task_id=~\"$task_id\"}, state)", + "query": 
"label_values(airflow_task_finish_total{job=~\"$job\", instance=~\"$instance\"}, state)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { "allValue": ".+", - "current": { }, "datasource": { + "type": "prometheus", "uid": "${prometheus_datasource}" }, - "hide": 0, "includeAll": true, - "label": "Pool", + "label": "Pool name", "multi": true, "name": "pool_name", - "options": [ ], - "query": "label_values(airflow_pool_open_slots{job=~\"$job\", instance=~\"$instance\"}, pool_name)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { }, - "datasource": { - "uid": "${loki_datasource}" - }, - "hide": 0, - "includeAll": true, - "label": "DAG file", - "multi": true, - "name": "dag_file", - "options": [ ], - "query": "label_values(dag_file)", + "query": "label_values(airflow_pool_running_slots{job=~\"$job\", instance=~\"$instance\"}, pool_name)", "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, "time": { - "from": "now-1h", + "from": "now-6h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, "timezone": "default", "title": "Apache Airflow overview", - "uid": "apache-airflow-overview", - "version": 0 + "uid": "apacheairflow_overview" } \ No newline at end of file diff --git a/apache-airflow-mixin/g.libsonnet b/apache-airflow-mixin/g.libsonnet new file mode 100644 index 000000000..e6a2060ee --- /dev/null +++ b/apache-airflow-mixin/g.libsonnet @@ -0,0 +1 @@ +import 
'github.com/grafana/grafonnet/gen/grafonnet-v11.4.0/main.libsonnet' diff --git a/apache-airflow-mixin/jsonnetfile.json b/apache-airflow-mixin/jsonnetfile.json index 65cebf84b..79c4d8a18 100644 --- a/apache-airflow-mixin/jsonnetfile.json +++ b/apache-airflow-mixin/jsonnetfile.json @@ -1,15 +1,51 @@ { - "version": 1, - "dependencies": [ - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" - } - }, - "version": "master" - } - ], - "legacyImports": true + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.4.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-cloud-integration-utils" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true } diff --git a/apache-airflow-mixin/links.libsonnet b/apache-airflow-mixin/links.libsonnet new file mode 100644 index 000000000..b40096644 --- /dev/null +++ b/apache-airflow-mixin/links.libsonnet @@ -0,0 +1,23 @@ +local g = import './g.libsonnet'; + +{ + local link = g.dashboard.link, + new(this): { + apacheAirflowOverview: + link.link.new(this.config.dashboardNamePrefix + ' overview', '/d/' + this.grafana.dashboards['apache-airflow-overview.json'].uid) + + link.link.options.withKeepTime(true), + + otherDashboards: + link.dashboards.new('All dashboards', this.config.dashboardTags) + + link.dashboards.options.withIncludeVars(true) + + 
link.dashboards.options.withKeepTime(true) + + link.dashboards.options.withAsDropdown(true), + } + + + if this.config.enableLokiLogs then + { + logs: + link.link.new(this.config.dashboardNamePrefix + ' logs', '/d/' + this.grafana.dashboards['apache-airflow-logs.json'].uid) + + link.link.options.withKeepTime(true), + } else {}, +} diff --git a/apache-airflow-mixin/main.libsonnet b/apache-airflow-mixin/main.libsonnet new file mode 100644 index 000000000..42d3b0da6 --- /dev/null +++ b/apache-airflow-mixin/main.libsonnet @@ -0,0 +1,49 @@ +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local links = import './links.libsonnet'; +local panels = import './panels.libsonnet'; +local rows = import './rows.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + withConfigMixin(config): { + config+: config, + }, + + new(): { + + local this = self, + config: config, + + signals: + { + [sig]: commonlib.signals.unmarshallJsonMulti( + this.config.signals[sig], + type=this.config.metricsSource + ) + for sig in std.objectFields(this.config.signals) + }, + + grafana: { + variables: commonlib.variables.new( + filteringSelector=this.config.filteringSelector, + groupLabels=this.config.groupLabels, + instanceLabels=this.config.instanceLabels, + varMetric='airflow_dagbag_size', + customAllValue='.+', + enableLokiLogs=this.config.enableLokiLogs, + ), + annotations: {}, + links: links.new(this), + panels: panels.new(this), + dashboards: dashboards.new(this), + rows: rows.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: {}, + }, + }, +} diff --git a/apache-airflow-mixin/mixin.libsonnet b/apache-airflow-mixin/mixin.libsonnet index 4d987cf31..2a2b77ff0 100644 --- a/apache-airflow-mixin/mixin.libsonnet +++ b/apache-airflow-mixin/mixin.libsonnet @@ -1,3 +1,35 @@ -(import 'dashboards/dashboards.libsonnet') + -(import 'alerts/alerts.libsonnet') + 
-(import 'config.libsonnet') +local apacheAirflowlib = import './main.libsonnet'; +local config = (import './config.libsonnet'); +local util = import 'grafana-cloud-integration-utils/util.libsonnet'; + +local apacheAirflow = + apacheAirflowlib.new() + + apacheAirflowlib.withConfigMixin( + { + filteringSelector: config.filteringSelector, + uid: config.uid, + enableLokiLogs: config.enableLokiLogs, + } + ); + +local label_patch = { + cluster+: { + allValue: '.*', + }, + dag_id+: { + label: 'DAG', + }, + dag_file+: { + label: 'DAG', + }, +}; + +// populate monitoring-mixin: +{ + grafanaDashboards+:: { + [fname]: apacheAirflow.grafana.dashboards[fname] + util.patch_variables(apacheAirflow.grafana.dashboards[fname], label_patch) + for fname in std.objectFields(apacheAirflow.grafana.dashboards) + }, + prometheusAlerts+:: apacheAirflow.prometheus.alerts, + prometheusRules+:: apacheAirflow.prometheus.recordingRules, +} diff --git a/apache-airflow-mixin/panels.libsonnet b/apache-airflow-mixin/panels.libsonnet new file mode 100644 index 000000000..e508c6fd9 --- /dev/null +++ b/apache-airflow-mixin/panels.libsonnet @@ -0,0 +1,134 @@ +local g = (import './g.libsonnet'); +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new(this):: + { + local signals = this.signals, + + // + // Overview Dashboard Panels + // + dagFileParsingErrorsPanel: + commonlib.panels.generic.timeSeries.base.new('DAG file parsing errors', targets=[signals.overview.dagParsingErrors.asTarget()]) + + g.panel.timeSeries.panelOptions.withDescription('The number of errors from trying to parse DAG files in an Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + slaIssuesPanel: + commonlib.panels.generic.timeSeries.base.new('SLA misses', targets=[ + signals.overview.taskSlaMisses.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The number of Service Level Agreement misses for any DAG runs in the Apache Airflow system.') + + 
g.panel.timeSeries.standardOptions.withUnit('none'), + + taskFailuresPanel: + commonlib.panels.generic.timeSeries.base.new('Task failures', targets=[ + signals.overview.taskFailures.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The overall task instances failures for an Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none'), + + dagSuccessDurationPanel: + commonlib.panels.generic.timeSeries.base.new('DAG success duration', targets=[ + signals.overview.dagAvgSuccessDuration.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The average time taken for recent successful DAG runs by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + dagFailedDurationPanel: + commonlib.panels.generic.timeSeries.base.new('DAG failed duration', targets=[ + signals.overview.dagAvgFailedDuration.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The average time taken for recent failed DAG runs by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + taskDurationPanel: + g.panel.barGauge.new('Task duration') + + g.panel.barGauge.queryOptions.withTargets([ + signals.overview.taskAvgDuration.asTarget() { interval: '2m' }, + ]) + + g.panel.barGauge.options.withOrientation('horizontal') + + g.panel.barGauge.panelOptions.withDescription('The average time taken for recent task runs by Task ID in the Apache Airflow system.') + + g.panel.barGauge.standardOptions.withUnit('s'), + + taskCountSummaryPanel: + g.panel.pieChart.new( + 'Task count summary' + ) + + g.panel.pieChart.queryOptions.withTargets([ + 
signals.overview.taskFinishTotalSum.asTarget() { interval: '2m' }, + ]) + + g.panel.pieChart.options.legend.withPlacement('right') + + g.panel.pieChart.options.legend.withAsTable(true) + + g.panel.pieChart.panelOptions.withDescription('The number of task counts by DAG ID in the Apache Airflow system.') + + g.panel.pieChart.standardOptions.withUnit('none'), + + taskCountsPanel: + commonlib.panels.generic.timeSeries.base.new('Task counts', targets=[ + signals.overview.taskFinishTotal.asTarget() { interval: '2m' }, + ]) + + g.panel.timeSeries.panelOptions.withDescription('The number of task counts by Task ID in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + // + // Scheduler Details Panels + // + + dagScheduleDelayPanel: + commonlib.panels.generic.timeSeries.base.new( + 'DAG schedule delay', + targets=[ + signals.overview.dagAvgScheduleDelay.asTarget() { interval: '2m' }, + ] + ) + + g.panel.timeSeries.panelOptions.withDescription('The amount of average delay between recent scheduled DAG runtime and the actual DAG runtime by DAG ID in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('s') + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + schedulerTasksPanel: + commonlib.panels.generic.timeSeries.base.new( + 'Scheduler tasks', + targets=[ + signals.overview.schedulerTasksExecutable.asTarget(), + signals.overview.schedulerTasksStarving.asTarget(), + ] + ) + + g.panel.timeSeries.panelOptions.withDescription('The number of current tasks that the scheduler is handling in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + executorTasksPanel: + commonlib.panels.generic.timeSeries.base.new( + 'Executor tasks', + targets=[ + 
signals.overview.executorRunningTasks.asTarget(), + signals.overview.executorQueuedTasks.asTarget(), + signals.overview.executorOpenSlots.asTarget(), + ] + ) + + g.panel.timeSeries.panelOptions.withDescription('The number of current tasks that the executors are handling in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + + poolTaskSlotsPanel: + commonlib.panels.generic.timeSeries.base.new( + 'Pool task slots', + targets=[ + signals.overview.poolRunningSlots.asTarget(), + signals.overview.poolQueuedSlots.asTarget(), + signals.overview.poolStarvingTasks.asTarget(), + signals.overview.poolOpenSlots.asTarget(), + ] + ) + + g.panel.timeSeries.panelOptions.withDescription('The number of current task slots that the pools are handling in the Apache Airflow system.') + + g.panel.timeSeries.standardOptions.withUnit('none') + + g.panel.timeSeries.options.tooltip.withMode('multi'), + }, +} diff --git a/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml b/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml index 305c05bca..fb355e856 100644 --- a/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/apache-airflow-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -1,5 +1,5 @@ groups: - - name: apache-airflow + - name: apache-airflow-alerts rules: - alert: ApacheAirflowStarvingPoolTasks annotations: @@ -7,27 +7,27 @@ groups: The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0. summary: There are starved tasks detected in the Apache Airflow pool. 
expr: | - airflow_pool_starving_tasks > 0 + airflow_pool_starving_tasks{job="integrations/apache-airflow"} > 0 for: 5m labels: severity: critical - - alert: ApacheAirflowDAGScheduleDelayWarningLevel + - alert: ApacheAirflowDAGScheduleDelayWarning annotations: description: | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10. summary: The delay in DAG schedule time to DAG run time has reached the warning threshold. expr: | - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10 + increase(airflow_dagrun_schedule_delay_sum{job="integrations/apache-airflow"}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job="integrations/apache-airflow"}[5m]),1) > 10 for: 1m labels: severity: warning - - alert: ApacheAirflowDAGScheduleDelayCriticalLevel + - alert: ApacheAirflowDAGScheduleDelayCritical annotations: description: | The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60. summary: The delay in DAG schedule time to DAG run time has reached the critical threshold. expr: | - increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60 + increase(airflow_dagrun_schedule_delay_sum{job="integrations/apache-airflow"}[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count{job="integrations/apache-airflow"}[5m]),1) > 60 for: 1m labels: severity: critical @@ -37,7 +37,7 @@ groups: The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 1m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0. summary: There have been DAG failures detected. 
expr: | - increase(airflow_dagrun_duration_failed_count[5m]) > 0 + increase(airflow_dagrun_duration_failed_count{job="integrations/apache-airflow"}[5m]) > 0 for: 1m labels: severity: critical diff --git a/apache-airflow-mixin/rows.libsonnet b/apache-airflow-mixin/rows.libsonnet new file mode 100644 index 000000000..dbabda986 --- /dev/null +++ b/apache-airflow-mixin/rows.libsonnet @@ -0,0 +1,37 @@ +local g = import './g.libsonnet'; + +{ + new(this): + { + // --- + // Overview Dashboard Rows + // --- + + // Main overview row + apacheAirflowOverview: + g.panel.row.new('Apache Airflow overview') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + this.grafana.panels.dagFileParsingErrorsPanel { gridPos+: { w: 8 } }, + this.grafana.panels.slaIssuesPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskFailuresPanel { gridPos+: { w: 8 } }, + this.grafana.panels.dagSuccessDurationPanel { gridPos+: { w: 12 } }, + this.grafana.panels.dagFailedDurationPanel { gridPos+: { w: 12 } }, + this.grafana.panels.taskDurationPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskCountSummaryPanel { gridPos+: { w: 8 } }, + this.grafana.panels.taskCountsPanel { gridPos+: { w: 8 } }, + ]), + + // Scheduler details row + apacheAirflowSchedulerDetails: + g.panel.row.new('Scheduler details') + + g.panel.row.withCollapsed(false) + + g.panel.row.withPanels([ + this.grafana.panels.dagScheduleDelayPanel { gridPos+: { w: 12 } }, + this.grafana.panels.schedulerTasksPanel { gridPos+: { w: 12 } }, + this.grafana.panels.executorTasksPanel { gridPos+: { w: 12 } }, + this.grafana.panels.poolTaskSlotsPanel { gridPos+: { w: 12 } }, + ]), + + }, +} diff --git a/apache-airflow-mixin/signals/overview.libsonnet b/apache-airflow-mixin/signals/overview.libsonnet new file mode 100644 index 000000000..e26e4ca2b --- /dev/null +++ b/apache-airflow-mixin/signals/overview.libsonnet @@ -0,0 +1,369 @@ +function(this) + local legendCustmoTemplate = std.join(' ', std.map(function(label) '{{' + label + 
'}}', this.instanceLabels)); + { + filteringSelector: this.filteringSelector, + groupLabels: this.groupLabels, + instanceLabels: this.instanceLabels, + enableLokiLogs: this.enableLokiLogs, + legendCustomTemplate: legendCustmoTemplate, + aggLevel: 'none', + aggFunction: 'avg', + signals: { + // DAG-related signals + dagParsingErrors: { + name: 'DAG file parsing errors', + type: 'gauge', + description: 'The number of errors from trying to parse DAG files in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dag_processing_import_errors{%(queriesSelector)s}', + }, + }, + }, + + dagSuccessDurationSum: { + name: 'Successful DAG run duration sum', + type: 'counter', + description: 'Sum of successful DAG run durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_success_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagSuccessDurationCount: { + name: 'Successful DAG run duration count', + type: 'counter', + description: 'Count of successful DAG runs.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_success_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagFailedDurationSum: { + name: 'Failed DAG run duration sum', + type: 'counter', + description: 'Sum of failed DAG run durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_failed_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagFailedDurationCount: { + name: 'Failed DAG run duration count', + type: 'counter', + description: 'Count of failed DAG runs.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_duration_failed_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagScheduleDelaySum: { + name: 'DAG schedule delay sum', + type: 'counter', + 
description: 'Sum of DAG schedule delays.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dagrun_schedule_delay_sum{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagScheduleDelayCount: { + name: 'DAG schedule delay count', + type: 'counter', + description: 'Count of DAG schedule delays.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dagrun_schedule_delay_count{%(queriesSelector)s, dag_id=~"$dag_id"}', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagAvgSuccessDuration: { + name: 'Average successful DAG run duration', + type: 'raw', + description: 'Average duration of successful DAG runs calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_duration_success_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_duration_success_count{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:] offset -$__interval),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagAvgFailedDuration: { + name: 'Average failed DAG run duration', + type: 'raw', + description: 'Average duration of failed DAG runs calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_duration_failed_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_duration_failed_count{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:] offset -$__interval),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + dagAvgScheduleDelay: { + name: 'Average DAG schedule delay', + type: 'raw', + description: 'Average DAG schedule delay calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dagrun_schedule_delay_sum{%(queriesSelector)s, dag_id=~"$dag_id"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dagrun_schedule_delay_count{%(queriesSelector)s, 
dag_id=~"$dag_id"}[$__interval:] offset -$__interval),1)', + legendCustomTemplate: '{{dag_id}}', + }, + }, + }, + + // Task-related signals + taskFailures: { + name: 'Task failures', + type: 'counter', + description: 'Overall task instance failures in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_ti_failures{%(queriesSelector)s}', + legendCustomTemplate: '{{instance}}', + rangeFunction: 'increase', + }, + }, + }, + + taskSlaMisses: { + name: 'SLA misses', + type: 'counter', + description: 'SLA misses in an Apache Airflow system.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_sla_missed{%(queriesSelector)s}', + rangeFunction: 'increase', + legendCustomTemplate: '{{instance}}', + }, + }, + }, + + taskDurationSum: { + name: 'Task duration sum', + type: 'counter', + description: 'Sum of task durations.', + unit: 's', + sources: { + prometheus: { + expr: 'airflow_dag_task_duration_sum{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + taskDurationCount: { + name: 'Task duration count', + type: 'counter', + description: 'Count of task durations.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_dag_task_duration_count{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + taskFinishTotalSum: { + name: 'Task finish total pie', + type: 'raw', + description: 'Total number of task finishes by state.', + unit: 'none', + sources: { + prometheus: { + expr: 'sum by(job, instance, dag_id, state) (increase(airflow_task_finish_total{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id", state=~"$state"}[$__interval:] offset -$__interval) > 0)', + legendCustomTemplate: '{{dag_id}} - {{state}}', + }, + }, + }, + + taskFinishTotal: { + name: 'Task finish total', + type: 'raw', + description: 'Total number of task finishes by state.', + unit: 
'none', + sources: { + prometheus: { + expr: 'increase(airflow_task_finish_total{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id", state=~"$state"}[$__interval:] offset -$__interval) > 0', + legendCustomTemplate: '{{dag_id}} - {{task_id}} - {{state}}', + }, + }, + }, + + /* Raw signal: increase of airflow_dag_task_duration_sum divided by increase of airflow_dag_task_duration_count over $__interval. clamp_min(..., 1) keeps the divisor at 1 or more, and the trailing != 0 filter drops zero-valued results. */ taskAvgDuration: { + name: 'Average task duration', + type: 'raw', + description: 'Average duration of tasks calculated as sum/count.', + unit: 's', + sources: { + prometheus: { + expr: 'increase(airflow_dag_task_duration_sum{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:] offset -$__interval)/clamp_min(increase(airflow_dag_task_duration_count{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}[$__interval:] offset -$__interval),1) != 0', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + /* Counter signal on airflow_task_start_total, narrowed by regex matches on the $dag_id and $task_id template variables. */ taskStartTotal: { + name: 'Task start total', + type: 'counter', + description: 'Total number of task starts.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_task_start_total{%(queriesSelector)s, dag_id=~"$dag_id", task_id=~"$task_id"}', + legendCustomTemplate: '{{dag_id}} - {{task_id}}', + }, + }, + }, + + // Scheduler-related signals + /* Gauge on airflow_scheduler_tasks_executable; the legend is legendCustmoTemplate with ' - executable' appended. NOTE(review): legendCustmoTemplate looks like a misspelling of legendCustomTemplate; it is defined outside this chunk, so a rename must change the definition and every use together. */ schedulerTasksExecutable: { + name: 'Scheduler executable tasks', + type: 'gauge', + description: 'Number of tasks that are ready for execution in the scheduler.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_scheduler_tasks_executable{%(queriesSelector)s}', + legendCustomTemplate: legendCustmoTemplate + ' - executable', + }, + }, + }, + + /* Gauge on airflow_scheduler_tasks_starving; legend suffix is ' - starving'. */ schedulerTasksStarving: { + name: 'Scheduler starving tasks', + type: 'gauge', + description: 'Number of tasks that are starving (waiting for resources) in the scheduler.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_scheduler_tasks_starving{%(queriesSelector)s}', + legendCustomTemplate: legendCustmoTemplate + ' - starving', + }, + }, + }, + + // Executor-related signals + /* Gauge on airflow_executor_running_tasks; legend suffix is ' - running'. */ executorRunningTasks: { + name: 'Executor running tasks', 
+ type: 'gauge', + description: 'Number of tasks currently running in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_running_tasks{%(queriesSelector)s}', + legendCustomTemplate: legendCustmoTemplate + ' - running', + }, + }, + }, + + /* Gauge on airflow_executor_queued_tasks; legend suffix is ' - queued'. */ executorQueuedTasks: { + name: 'Executor queued tasks', + type: 'gauge', + description: 'Number of tasks queued in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_queued_tasks{%(queriesSelector)s}', + legendCustomTemplate: legendCustmoTemplate + ' - queued', + }, + }, + }, + + /* Gauge on airflow_executor_open_slots; legend suffix is ' - open'. */ executorOpenSlots: { + name: 'Executor open slots', + type: 'gauge', + description: 'Number of open slots available in the executor.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_executor_open_slots{%(queriesSelector)s}', + legendCustomTemplate: legendCustmoTemplate + ' - open', + }, + }, + }, + + // Pool-related signals + /* Gauge on airflow_pool_running_slots, filtered by pool_name=~"$pool_name"; the legend adds the pool_name label and ' - running'. */ poolRunningSlots: { + name: 'Pool running slots', + type: 'gauge', + description: 'Number of slots currently running in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_running_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: legendCustmoTemplate + ' - {{pool_name}} - running', + }, + }, + }, + + /* Gauge on airflow_pool_queued_slots, filtered by pool_name=~"$pool_name"; the legend adds the pool_name label and ' - queued'. */ poolQueuedSlots: { + name: 'Pool queued slots', + type: 'gauge', + description: 'Number of slots queued in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_queued_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: legendCustmoTemplate + ' - {{pool_name}} - queued', + }, + }, + }, + + /* Gauge on airflow_pool_starving_tasks, filtered by pool_name=~"$pool_name"; the legend adds the pool_name label and ' - starving'. */ poolStarvingTasks: { + name: 'Pool starving tasks', + type: 'gauge', + description: 'Number of tasks starving (waiting for resources) in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_starving_tasks{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: legendCustmoTemplate + ' - {{pool_name}} - starving', + }, + }, + }, + + 
/* Gauge on airflow_pool_open_slots, filtered by pool_name=~"$pool_name"; the legend is legendCustmoTemplate followed by the pool_name label and ' - open'. NOTE(review): legendCustmoTemplate looks like a misspelling of legendCustomTemplate; it is defined outside this chunk, so a rename must change the definition and every use together. */ poolOpenSlots: { + name: 'Pool open slots', + type: 'gauge', + description: 'Number of open slots available in pools.', + unit: 'none', + sources: { + prometheus: { + expr: 'airflow_pool_open_slots{%(queriesSelector)s, pool_name=~"$pool_name"}', + legendCustomTemplate: legendCustmoTemplate + ' - {{pool_name}} - open', + }, + }, + }, + /* The two closers below end enclosing objects whose openings are outside this chunk. */ }, + }