
Commit 4fb7275

Merge branch 'main' into darrenjaneczek/dashboard-descriptions-reads-writes
2 parents: 5794607 + 344fce1

File tree

8 files changed: +127 -56 lines changed

CHANGELOG.md

Lines changed: 10 additions & 2 deletions
@@ -2,17 +2,25 @@
 
 ## master / unreleased
 
+* [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328
 * [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. #311
 * [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315
 * [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316
 * [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. #321
 * [CHANGE] Store-gateway: increased memory request and limit respectively from 6GB / 6GB to 12GB / 18GB. #322
 * [CHANGE] Store-gateway: increased `-blocks-storage.bucket-store.max-chunk-pool-bytes` from 2GB (default) to 12GB. #322
-* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
-* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
 * [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319
 * [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319
+* [CHANGE] Ingester/Ruler: set `-server.grpc-max-send-msg-size-bytes` and `-server.grpc-max-recv-msg-size-bytes` to sensible default values (10MB). #326
+* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` to `CortexCompactorHasNotUploadedBlocks`. #334
+* [CHANGE] Renamed `CortexCompactorRunFailed` to `CortexCompactorHasNotSuccessfullyRunCompaction`. #334
+* [CHANGE] Renamed `CortexInconsistentConfig` alert to `CortexInconsistentRuntimeConfig` and increased severity to `critical`. #335
+* [CHANGE] Increased `CortexBadRuntimeConfig` alert severity to `critical` and removed support for `cortex_overrides_last_reload_successful` metric (was removed in Cortex 1.3.0). #335
+* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
 * [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
+* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
+* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
+* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335
 
 ## 1.9.0 / 2021-05-18

cortex-mixin/alerts/alerts.libsonnet

Lines changed: 6 additions & 15 deletions
@@ -92,39 +92,30 @@
       },
     },
     {
-      alert: 'CortexInconsistentConfig',
+      alert: 'CortexInconsistentRuntimeConfig',
       expr: |||
-        count(count by(%s, job, sha256) (cortex_config_hash)) without(sha256) > 1
+        count(count by(%s, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
       ||| % $._config.alert_aggregation_labels,
       'for': '1h',
       labels: {
-        severity: 'warning',
+        severity: 'critical',
       },
       annotations: {
         message: |||
-          An inconsistent config file hash is used across cluster {{ $labels.job }}.
+          An inconsistent runtime config file is used across cluster {{ $labels.job }}.
         |||,
       },
     },
     {
-      // As of https://github.com/cortexproject/cortex/pull/2092, this metric is
-      // only exposed when it is supposed to be non-zero, so we don't need to do
-      // any special filtering on the job label.
-      // The metric itself was renamed in
-      // https://github.com/cortexproject/cortex/pull/2874
-      //
-      // TODO: Remove deprecated metric name of
-      // cortex_overrides_last_reload_successful in the future
       alert: 'CortexBadRuntimeConfig',
       expr: |||
+        # The metric value is reset to 0 on error while reloading the config at runtime.
         cortex_runtime_config_last_reload_successful == 0
-        or
-        cortex_overrides_last_reload_successful == 0
       |||,
       // Alert quicker for human errors.
       'for': '5m',
       labels: {
-        severity: 'warning',
+        severity: 'critical',
       },
       annotations: {
         message: |||
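For reference, a minimal standalone sketch of how the templated expression above renders once `%s` is substituted, assuming the default `alert_aggregation_labels` of `cluster, namespace` (your mixin configuration may differ):

```
// Standalone sketch, not part of the mixin. With the assumed default labels,
// `rendered` evaluates to:
//   count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
{
  rendered: |||
    count(count by(%s, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
  ||| % 'cluster, namespace',
}
```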

cortex-mixin/alerts/compactor.libsonnet

Lines changed: 14 additions & 16 deletions
@@ -47,6 +47,19 @@
           message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.',
         },
       },
+      {
+        // Alert if compactor failed to run 2 consecutive compactions.
+        alert: 'CortexCompactorHasNotSuccessfullyRunCompaction',
+        expr: |||
+          increase(cortex_compactor_runs_failed_total[2h]) >= 2
+        |||,
+        labels: {
+          severity: 'critical',
+        },
+        annotations: {
+          message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.',
+        },
+      },
       {
         // Alert if the compactor has not uploaded anything in the last 24h.
         alert: 'CortexCompactorHasNotUploadedBlocks',
@@ -65,7 +78,7 @@
       },
       {
         // Alert if the compactor has not uploaded anything since its start.
-        alert: 'CortexCompactorHasNotUploadedBlocksSinceStart',
+        alert: 'CortexCompactorHasNotUploadedBlocks',
         'for': '24h',
         expr: |||
           thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0
@@ -77,21 +90,6 @@
           message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.',
         },
       },
-      {
-        // Alert if compactor fails.
-        alert: 'CortexCompactorRunFailed',
-        expr: |||
-          increase(cortex_compactor_runs_failed_total[2h]) >= 2
-        |||,
-        labels: {
-          severity: 'critical',
-        },
-        annotations: {
-          message: |||
-            {{ $labels.job }}/{{ $labels.instance }} failed to run compaction.
-          |||,
-        },
-      },
     ],
   },
 ],

cortex-mixin/config.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
     query_scheduler: 'query-scheduler',  // Not part of single-binary.
     table_manager: '(table-manager|cortex$)',
     store_gateway: '(store-gateway|cortex$)',
-    gateway: '(gateway|cortex-gw)',
+    gateway: '(gateway|cortex-gw|cortex-gw-internal)',
     compactor: 'compactor.*',  // Match also custom compactor deployments.
   },
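If a deployment uses yet another gateway naming convention, the regex can be extended from downstream jsonnet instead of editing the mixin. A minimal sketch, assuming the standard `_config.job_names` override point; `my-gateway` is a made-up example name:

```
// Hypothetical downstream override (not part of this commit): extend the
// default gateway job-name regex with an extra naming variation.
{
  _config+:: {
    job_names+:: {
      gateway: '(gateway|cortex-gw|cortex-gw-internal|my-gateway)',
    },
  },
}
```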

cortex-mixin/docs/playbooks.md

Lines changed: 91 additions & 21 deletions
@@ -26,11 +26,63 @@ If nothing obvious from the above, check for increased load:
 
 ### CortexIngesterReachingSeriesLimit
 
-_TODO: this playbook has not been written yet._
+This alert fires when the `max_series` per ingester instance limit is enabled and the actual number of in-memory series in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed.
+
+In case of **emergency**:
+- If the actual number of series is very close to or has already hit the limit, then you can increase the limit via runtime config to gain some time
+- Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard
+
+How the limit is **configured**:
+- The limit can be configured either on CLI (`-ingester.instance-limits.max-series`) or in the runtime config:
+  ```
+  ingester_limits:
+    max_series: <int>
+  ```
+- The mixin configures the limit in the runtime config and can be fine-tuned via:
+  ```
+  _config+:: {
+    ingester_instance_limits+:: {
+      max_series: <int>
+    }
+  }
+  ```
+- When configured in the runtime config, changes are applied live without requiring an ingester restart
+- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}`
+
+How to **fix**:
+1. **Scale up ingesters**<br />
+   Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale-up we need to wait until all stale series are dropped from memory as an effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h).
+2. **Temporarily increase the limit**<br />
+   If the actual number of series is very close to or has already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as an effect of the scale-up, you should also temporarily increase the limit.
 
 ### CortexIngesterReachingTenantsLimit
 
-_TODO: this playbook has not been written yet._
+This alert fires when the `max_tenants` per ingester instance limit is enabled and the actual number of tenants in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones.
+
+In case of **emergency**:
+- If the actual number of tenants is very close to or has already hit the limit, then you can increase the limit via runtime config to gain some time
+- Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard
+
+How the limit is **configured**:
+- The limit can be configured either on CLI (`-ingester.instance-limits.max-tenants`) or in the runtime config:
+  ```
+  ingester_limits:
+    max_tenants: <int>
+  ```
+- The mixin configures the limit in the runtime config and can be fine-tuned via:
+  ```
+  _config+:: {
+    ingester_instance_limits+:: {
+      max_tenants: <int>
+    }
+  }
+  ```
+- When configured in the runtime config, changes are applied live without requiring an ingester restart
+- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_tenants"}`
+
+How to **fix**:
+1. Ensure shuffle-sharding is enabled in the Cortex cluster
+1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after the `-blocks-storage.tsdb.close-idle-tsdb-timeout` period, so you may have to temporarily increase the limit
 
 ### CortexRequestLatency
 First establish if the alert is for read or write latency. The alert should say.
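The `ingester_instance_limits` override shown in the two instance-limit playbooks above can raise both limits at once. A minimal sketch with placeholder values (illustrative only, not recommendations):

```
// Sketch only: temporarily raise both per-instance ingester limits via the
// mixin runtime-config override described in the playbooks. Placeholder values.
{
  _config+:: {
    ingester_instance_limits+:: {
      max_series: 3000000,
      max_tenants: 1500,
    },
  },
}
```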
@@ -220,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas
 This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time.
 
 How to **investigate**:
-- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first
+- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first
 - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first
 - Ensure ingesters are successfully shipping blocks to the storage
 - Look for any error in the compactor logs
 
+### CortexCompactorHasNotSuccessfullyRunCompaction
+
+This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants).
+
+When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, compaction of other blocks is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block.
+
+How to **investigate**:
+- Look for any error in the compactor logs
+- Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found)
+
 #### Compactor is failing because of `not healthy index found`
 
 The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks:
@@ -249,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI:
 gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK
 ```
 
-### CortexCompactorHasNotUploadedBlocksSinceStart
-
-Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks).
-
-### CortexCompactorHasNotSuccessfullyRunCompaction
-
-_TODO: this playbook has not been written yet._
-
-### CortexCompactorRunFailed
-
-_TODO: this playbook has not been written yet._
-
 ### CortexBucketIndexNotUpdated
 
 This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store.
@@ -317,13 +367,33 @@ _TODO: this playbook has not been written yet._
 
 _TODO: this playbook has not been written yet._
 
-### CortexInconsistentConfig
+### CortexInconsistentRuntimeConfig
 
-_TODO: this playbook has not been written yet._
+This alert fires if multiple replicas of the same Cortex service are using a different runtime config for a long period of time.
+
+The Cortex runtime config is a config file which gets live reloaded by Cortex at runtime. In order for Cortex to work properly, the loaded config is expected to be exactly the same across multiple replicas of the same Cortex service (e.g. distributors, ingesters, ...). When the config changes, there may be short periods of time during which some replicas have loaded the new config and others are still running on the previous one, but it shouldn't last for more than a few minutes.
+
+How to **investigate**:
+- Check how many different config file versions (hashes) are reported
+  ```
+  count by (sha256) (cortex_runtime_config_hash{namespace="<namespace>"})
+  ```
+- Check which replicas are running a different version
+  ```
+  cortex_runtime_config_hash{namespace="<namespace>",sha256="<unexpected>"}
+  ```
+- Check if the runtime config has been updated on the affected replicas' filesystem. Check the `-runtime-config.file` command line argument to find the location of the file.
+- Check the affected replicas' logs and look for any error loading the runtime config
 
 ### CortexBadRuntimeConfig
 
-_TODO: this playbook has not been written yet._
+This alert fires if Cortex is unable to reload the runtime config.
+
+This typically means an invalid runtime config was deployed. Cortex keeps running with the previous (valid) version of the runtime config; running Cortex replicas and the system availability shouldn't be affected, but new replicas won't be able to start up until the runtime config is fixed.
+
+How to **investigate**:
+- Check the latest runtime config update (it's likely to be broken)
+- Check Cortex logs to get more details about what's wrong with the config
 
 ### CortexQuerierCapacityFull
 
@@ -347,15 +417,15 @@ _TODO: this playbook has not been written yet._
 
 ### CortexCheckpointCreationFailed
 
-_TODO: this playbook has not been written yet._
+_This alert applies to Cortex chunks storage only._
 
 ### CortexCheckpointDeletionFailed
 
-_TODO: this playbook has not been written yet._
+_This alert applies to Cortex chunks storage only._
 
 ### CortexProvisioningMemcachedTooSmall
 
-_TODO: this playbook has not been written yet._
+_This alert applies to Cortex chunks storage only._
 
 ### CortexProvisioningTooManyActiveSeries
 
cortex/alertmanager.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@
       $.util.mapToFlags($.alertmanager_args) +
       if isHA then
         ['--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port] +
-        ['--alertmanager.cluster.peers=%s' % peer for peer in peers]
+        ['--alertmanager.cluster.peers=%s' % std.join(',', peers)]
       else [],
     ) +
     container.withVolumeMountsMixin(
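To make the fix concrete, a small self-contained sketch (with made-up peer addresses) of what each form evaluates to: the old comprehension produced one `--alertmanager.cluster.peers` flag per peer, while the new form produces a single flag whose value is the comma-separated peer list.

```
// Self-contained sketch with hypothetical peer addresses; not part of the library.
local peers = [
  'alertmanager-0.alertmanager.svc.cluster.local:9094',
  'alertmanager-1.alertmanager.svc.cluster.local:9094',
];
{
  // Old form: one flag per peer.
  before: ['--alertmanager.cluster.peers=%s' % peer for peer in peers],
  // New form: a single flag carrying the comma-separated peer list.
  after: ['--alertmanager.cluster.peers=%s' % std.join(',', peers)],
}
```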

cortex/ingester.libsonnet

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@
     'ingester.max-samples-per-query': $._config.limits.max_samples_per_query,
     'runtime-config.file': '/etc/cortex/overrides.yaml',
     'server.grpc-max-concurrent-streams': 100000,
+    'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024,
+    'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024,
   } + (
     if $._config.memcached_index_writes_enabled then
       {

cortex/ruler.libsonnet

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@
     // Limits
     'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group,
     'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant,
+    'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024,
+    'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024,
 
     // Storage
     'querier.second-store-engine': $._config.querier_second_storage_engine,
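Both files above set the same two gRPC flags. If an environment needs larger messages than the new 10MB defaults, the values can be overridden from downstream jsonnet; a minimal sketch, assuming the `ingester_args` and `ruler_args` maps these files define (sizes are illustrative):

```
// Hypothetical environment-level override of the new 10MB defaults.
{
  ingester_args+:: {
    'server.grpc-max-send-msg-size-bytes': 100 * 1024 * 1024,  // 100MB
    'server.grpc-max-recv-msg-size-bytes': 100 * 1024 * 1024,  // 100MB
  },
  ruler_args+:: {
    'server.grpc-max-send-msg-size-bytes': 100 * 1024 * 1024,  // 100MB
    'server.grpc-max-recv-msg-size-bytes': 100 * 1024 * 1024,  // 100MB
  },
}
```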
