|
5 | 5 | name: 'otelcol', |
6 | 6 | rules: [ |
7 | 7 | { |
8 | | - alert: 'OtelcolSendingQueueFull', |
| 8 | + alert: 'ReceiverDroppedSpans', |
9 | 9 | expr: ||| |
10 | | - otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity |
| 10 | + rate(otelcol_receiver_refused_spans_total[5m]) > 0 |
11 | 11 | |||, |
12 | | - 'for': '30m', |
| 12 | + 'for': '2m', |
| 13 | + labels: { |
| 14 | + severity: 'critical', |
| 15 | + }, |
| 16 | + annotations: { |
| 17 | + summary: 'Receiver is dropping spans.', |
| 18 | + description: 'The {{ $labels.receiver }} receiver is dropping spans at a rate of {{ humanize $value }} per second.', |
| 19 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures', |
| 20 | + }, |
| 21 | + }, |
| 22 | + { |
| 23 | + alert: 'ReceiverDroppedMetrics', |
| 24 | + expr: ||| |
| 25 | + rate(otelcol_receiver_refused_metric_points_total[5m]) > 0 |
| 26 | + |||, |
| 27 | + 'for': '2m', |
| 28 | + labels: { |
| 29 | + severity: 'critical', |
| 30 | + }, |
| 31 | + annotations: { |
| 32 | + summary: 'Receiver is dropping metrics.', |
| 33 | + description: 'The {{ $labels.receiver }} receiver is dropping metrics at a rate of {{ humanize $value }} per second.', |
| 34 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures', |
| 35 | + }, |
| 36 | + }, |
| 37 | + { |
| 38 | + alert: 'ReceiverDroppedLogs', |
| 39 | + expr: ||| |
| 40 | + rate(otelcol_receiver_refused_log_records_total[5m]) > 0 |
| 41 | + |||, |
| 42 | + 'for': '5m', |
| 43 | + labels: { |
| 44 | + severity: 'critical', |
| 45 | + }, |
| 46 | + annotations: { |
| 47 | + summary: 'Receiver is dropping logs.', |
| 48 | + description: 'The {{ $labels.receiver }} receiver is dropping logs at a rate of {{ humanize $value }} per second.', |
| 49 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#receive-failures', |
| 50 | + }, |
| 51 | + }, |
| 52 | + { |
| 53 | + alert: 'ExporterDroppedSpans', |
| 54 | + expr: ||| |
| 55 | + rate(otelcol_exporter_send_failed_spans_total[5m]) > 0 |
| 56 | + |||, |
| 57 | + 'for': '2m', |
| 58 | + labels: { |
| 59 | + severity: 'critical', |
| 60 | + }, |
| 61 | + annotations: { |
| 62 | + summary: 'Exporter is dropping spans.', |
| 63 | + description: 'The {{ $labels.exporter }} exporter is dropping spans at a rate of {{ humanize $value }} per second.', |
| 64 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures', |
| 65 | + }, |
| 66 | + }, |
| 67 | + { |
| 68 | + alert: 'ExporterDroppedMetrics', |
| 69 | + expr: ||| |
| 70 | + rate(otelcol_exporter_send_failed_metric_points_total[5m]) > 0 |
| 71 | + |||, |
| 72 | + 'for': '2m', |
| 73 | + labels: { |
| 74 | + severity: 'critical', |
| 75 | + }, |
| 76 | + annotations: { |
| 77 | + summary: 'Exporter is dropping metrics.', |
| 78 | + description: 'The {{ $labels.exporter }} exporter is dropping metrics at a rate of {{ humanize $value }} per second.', |
| 79 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures', |
| 80 | + }, |
| 81 | + }, |
| 82 | + { |
| 83 | + alert: 'ExporterDroppedLogs', |
| 84 | + expr: ||| |
| 85 | + rate(otelcol_exporter_send_failed_log_records_total[5m]) > 0 |
| 86 | + |||, |
| 87 | + 'for': '5m', |
| 88 | + labels: { |
| 89 | + severity: 'critical', |
| 90 | + }, |
| 91 | + annotations: { |
| 92 | + summary: 'Exporter is dropping logs.', |
| 93 | + description: 'The {{ $labels.exporter }} exporter is dropping logs at a rate of {{ humanize $value }} per second.', |
| 94 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#send-failures', |
| 95 | + }, |
| 96 | + }, |
| 97 | + { |
| 98 | + alert: 'ExporterQueueSize', |
| 99 | + expr: ||| |
| 100 | + otelcol_exporter_queue_size > otelcol_exporter_queue_capacity * 0.8 |
| 101 | + |||, |
| 102 | + 'for': '1m', |
| 103 | + labels: { |
| 104 | + severity: 'warning', |
| 105 | + }, |
| 106 | + annotations: { |
| 107 | + summary: 'Exporter queue is filling up.', |
| 108 | + description: 'The {{ $labels.exporter }} queue has reached a size of {{ $value }}.', |
| 109 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length', |
| 110 | + }, |
| 111 | + }, |
| 112 | + { |
| 113 | + alert: 'SendQueueFailedSpans', |
| 114 | + expr: ||| |
| 115 | + rate(otelcol_exporter_enqueue_failed_spans_total[5m]) > 0 |
| 116 | + |||, |
| 117 | + 'for': '1m', |
| 118 | + labels: { |
| 119 | + severity: 'warning', |
| 120 | + }, |
| 121 | + annotations: { |
| 122 | + summary: 'Exporter send queue failed to accept spans.', |
| 123 | + description: 'The {{ $labels.exporter }} sending queue is failing to accept spans at a rate of {{ humanize $value }} per second.', |
| 124 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length', |
| 125 | + }, |
| 126 | + }, |
| 127 | + { |
| 128 | + alert: 'SendQueueFailedMetricPoints', |
| 129 | + expr: ||| |
| 130 | + rate(otelcol_exporter_enqueue_failed_metric_points_total[5m]) > 0 |
| 131 | + |||, |
| 132 | + 'for': '1m', |
| 133 | + labels: { |
| 134 | + severity: 'warning', |
| 135 | + }, |
| 136 | + annotations: { |
| 137 | + summary: 'Exporter send queue failed to accept metric points.', |
| 138 | + description: 'The {{ $labels.exporter }} sending queue is failing to accept metric points at a rate of {{ humanize $value }} per second.', |
| 139 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length', |
| 140 | + }, |
| 141 | + }, |
| 142 | + { |
| 143 | + alert: 'SendQueueFailedLogRecords', |
| 144 | + expr: ||| |
| 145 | + rate(otelcol_exporter_enqueue_failed_log_records_total[5m]) > 0 |
| 146 | + |||, |
| 147 | + 'for': '1m', |
13 | 148 | labels: { |
14 | 149 | severity: 'warning', |
15 | 150 | }, |
16 | 151 | annotations: { |
17 | | - summary: 'The sending queue has filled up.', |
18 | | - description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data', |
| 152 | + summary: 'Exporter send queue failed to accept log records.', |
| 153 | + description: 'The {{ $labels.exporter }} sending queue is failing to accept log records at a rate of {{ humanize $value }} per second.', |
| 154 | + runbook_url: 'https://opentelemetry.io/docs/collector/internal-telemetry/#queue-length', |
19 | 155 | }, |
20 | 156 | }, |
21 | 157 | ], |
|
0 commit comments