Skip to content

Commit 257cda0

Browse files
authored
PGO updates pgnodemx/pg_stat_statements (#3400)
* PGO updates pgnodemx/pg_stat_statements Users reported that an updated image wouldn't trigger an update of monitoring extensions. This changes that behavior by * adding the monitor and pg image tags to the revision hash, * adding update lines to the pgmonitor enable action. Note: this _only_ targets these two extensions as updating other extensions should probably be under the user's power. Issue: [sc-14476] * add KUTTL test for exporter upgrade errors
1 parent be153b3 commit 257cda0

File tree

10 files changed

+187
-11
lines changed

10 files changed

+187
-11
lines changed

internal/controller/postgrescluster/pgmonitor.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,24 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
7878
writableInstance *Instance
7979
writablePod *corev1.Pod
8080
setup string
81+
pgImageSHA string
82+
exporterImageSHA string
8183
)
8284

8385
// Find the PostgreSQL instance that can execute SQL that writes to every
8486
// database. When there is none, return early.
85-
8687
writablePod, writableInstance = instances.writablePod(naming.ContainerDatabase)
8788
if writableInstance == nil || writablePod == nil {
8889
return nil
8990
}
9091

92+
// For the writableInstance found above
93+
// 1) make sure the `exporter` container is running
94+
// 2) get and save the imageIDs for the `exporter` and `database` containers, and
95+
// 3) exit early if we can't get the ImageID of either of those containers.
96+
// We use these ImageIDs in the hash we make to see if the operator needs to rerun
97+
// the `EnableExporterInPostgreSQL` funcs; that way we are always running
98+
// that function against an updated and running pod.
9199
if pgmonitor.ExporterEnabled(cluster) {
92100
running, known := writableInstance.IsRunning(naming.ContainerPGMonitorExporter)
93101
if !running || !known {
@@ -97,11 +105,15 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
97105

98106
for _, containerStatus := range writablePod.Status.ContainerStatuses {
99107
if containerStatus.Name == naming.ContainerPGMonitorExporter {
100-
setup = containerStatus.ImageID
108+
exporterImageSHA = containerStatus.ImageID
109+
}
110+
if containerStatus.Name == naming.ContainerDatabase {
111+
pgImageSHA = containerStatus.ImageID
101112
}
102113
}
103-
if setup == "" {
104-
// Could not get exporter container imageID
114+
115+
// Could not get container imageIDs
116+
if exporterImageSHA == "" || pgImageSHA == "" {
105117
return nil
106118
}
107119
}
@@ -127,11 +139,13 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
127139
) error {
128140
_, err := io.Copy(hasher, stdin)
129141
if err == nil {
130-
_, err = fmt.Fprint(hasher, command)
142+
// Use command and image tag in hash to execute hash on image update
143+
_, err = fmt.Fprint(hasher, command, pgImageSHA, exporterImageSHA)
131144
}
132145
return err
133146
})
134147
})
148+
135149
if err != nil {
136150
return err
137151
}
@@ -149,7 +163,6 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
149163
}
150164

151165
// Apply the necessary SQL and record its hash in cluster.Status
152-
153166
if err == nil {
154167
err = action(ctx, func(_ context.Context, stdin io.Reader,
155168
stdout, stderr io.Writer, command ...string) error {

internal/controller/postgrescluster/pgmonitor_test.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,9 @@ func TestReconcilePGMonitorExporterSetupErrors(t *testing.T) {
317317
},
318318
Status: corev1.PodStatus{
319319
ContainerStatuses: []corev1.ContainerStatus{{
320-
Name: naming.ContainerDatabase,
321-
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
320+
Name: naming.ContainerDatabase,
321+
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
322+
ImageID: "image@sha123",
322323
}, {
323324
Name: naming.ContainerPGMonitorExporter,
324325
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
@@ -437,7 +438,7 @@ func TestReconcilePGMonitorExporterStatus(t *testing.T) {
437438
podExecCalled: false,
438439
// Status was generated manually for this test case
439440
// TODO jmckulk: add code to generate status
440-
status: v1beta1.MonitoringStatus{ExporterConfiguration: "5f599686cf"},
441+
status: v1beta1.MonitoringStatus{ExporterConfiguration: "5dbc557689"},
441442
statusChangedAfterReconcile: false,
442443
}} {
443444
t.Run(test.name, func(t *testing.T) {
@@ -468,8 +469,9 @@ func TestReconcilePGMonitorExporterStatus(t *testing.T) {
468469
},
469470
Status: corev1.PodStatus{
470471
ContainerStatuses: []corev1.ContainerStatus{{
471-
Name: naming.ContainerDatabase,
472-
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
472+
Name: naming.ContainerDatabase,
473+
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
474+
ImageID: "image@sha123",
473475
}},
474476
},
475477
}},

internal/pgmonitor/postgres.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ func EnableExporterInPostgreSQL(ctx context.Context, exec postgres.Executor,
102102
// Exporter expects that extension(s) to be installed in all databases
103103
// pg_stat_statements: https://access.crunchydata.com/documentation/pgmonitor/latest/exporter/
104104
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;",
105+
106+
// Run idempotent update
107+
"ALTER EXTENSION pg_stat_statements UPDATE;",
105108
}, "\n"),
106109
map[string]string{
107110
"ON_ERROR_STOP": "on", // Abort when any one statement fails.
@@ -130,6 +133,9 @@ func EnableExporterInPostgreSQL(ctx context.Context, exec postgres.Executor,
130133
// https://github.com/CrunchyData/pgmonitor/blob/master/postgres_exporter/common/queries_nodemx.yml
131134
"CREATE EXTENSION IF NOT EXISTS pgnodemx WITH SCHEMA monitor;",
132135

136+
// Run idempotent update
137+
"ALTER EXTENSION pgnodemx UPDATE;",
138+
133139
// ccp_monitoring user is created in Setup.sql without a
134140
// password; update the password and ensure that the ROLE
135141
// can login to the database
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
apiVersion: postgres-operator.crunchydata.com/v1beta1
2+
kind: PostgresCluster
3+
metadata:
4+
name: exporter
5+
spec:
6+
postgresVersion: 14
7+
image: us.gcr.io/container-suite/crunchy-postgres:ubi8-14.0-5.0.3-0
8+
instances:
9+
- name: instance1
10+
dataVolumeClaimSpec:
11+
accessModes:
12+
- "ReadWriteOnce"
13+
resources:
14+
requests:
15+
storage: 1Gi
16+
backups:
17+
pgbackrest:
18+
repos:
19+
- name: repo1
20+
volume:
21+
volumeClaimSpec:
22+
accessModes:
23+
- "ReadWriteOnce"
24+
resources:
25+
requests:
26+
storage: 1Gi
27+
monitoring:
28+
pgmonitor:
29+
exporter:
30+
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres-exporter:ubi8-5.2.0-0
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: postgres-operator.crunchydata.com/v1beta1
2+
kind: PostgresCluster
3+
metadata:
4+
name: exporter
5+
status:
6+
instances:
7+
- name: instance1
8+
readyReplicas: 1
9+
replicas: 1
10+
updatedReplicas: 1
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
---
2+
apiVersion: kuttl.dev/v1beta1
3+
kind: TestStep
4+
commands:
5+
- script: |
6+
set -e
7+
PRIMARY=$(
8+
kubectl get pod --namespace "${NAMESPACE}" \
9+
--output name --selector '
10+
postgres-operator.crunchydata.com/cluster=exporter,
11+
postgres-operator.crunchydata.com/role=master'
12+
)
13+
14+
# Ensure that the metrics endpoint is available from inside the exporter container
15+
for i in {1..5}; do
16+
kubectl exec --namespace "${NAMESPACE}" "${PRIMARY}" -c exporter -- curl http://localhost:9187/metrics
17+
sleep 2
18+
done
19+
20+
# Ensure that the monitoring user exists and is configured.
21+
kubectl exec --stdin --namespace "${NAMESPACE}" "${PRIMARY}" \
22+
-- psql -qb --set ON_ERROR_STOP=1 --file=- <<'SQL'
23+
DO $$
24+
DECLARE
25+
result record;
26+
BEGIN
27+
SELECT * INTO result FROM pg_catalog.pg_roles WHERE rolname = 'ccp_monitoring';
28+
ASSERT FOUND, 'user not found';
29+
ASSERT result.rolconfig @> '{jit=off}', format('got config: %L', result.rolconfig);
30+
END $$
31+
SQL
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
apiVersion: postgres-operator.crunchydata.com/v1beta1
2+
kind: PostgresCluster
3+
metadata:
4+
name: exporter
5+
spec:
6+
postgresVersion: 14
7+
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-14.5-1
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
apiVersion: postgres-operator.crunchydata.com/v1beta1
2+
kind: PostgresCluster
3+
metadata:
4+
name: exporter
5+
status:
6+
instances:
7+
- name: instance1
8+
readyReplicas: 1
9+
replicas: 1
10+
updatedReplicas: 1
11+
---
12+
apiVersion: batch/v1
13+
kind: Job
14+
metadata:
15+
labels:
16+
postgres-operator.crunchydata.com/cluster: exporter
17+
postgres-operator.crunchydata.com/pgbackrest-backup: replica-create
18+
status:
19+
succeeded: 1
20+
---
21+
apiVersion: v1
22+
kind: Service
23+
metadata:
24+
name: exporter-primary
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
---
2+
apiVersion: kuttl.dev/v1beta1
3+
kind: TestStep
4+
commands:
5+
- script: |
6+
set -e
7+
PRIMARY=$(
8+
kubectl get pod --namespace "${NAMESPACE}" \
9+
--output name --selector '
10+
postgres-operator.crunchydata.com/cluster=exporter,
11+
postgres-operator.crunchydata.com/role=master'
12+
)
13+
14+
# Get errors from the exporter
15+
# See the README.md for a discussion of these errors
16+
ERR=$(kubectl logs --namespace "${NAMESPACE}" "${PRIMARY}" -c exporter | grep -e "Error running query on database")
17+
ERR_COUNT=$(echo "$ERR" | wc -l)
18+
19+
if [[ "$ERR_COUNT" -gt 2 ]]; then
20+
echo "Errors in log from exporter: ${ERR}"
21+
exit 1
22+
fi
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
The exporter-upgrade test makes sure that PGO updates an extension used for monitoring. This
2+
avoids an error where a user might update to a new PG image with a newer extension, but with an
3+
older extension operative.
4+
5+
Note: This test relies on two `crunchy-postgres` images with known, different `pgnodemx` extensions:
6+
the image created in 00--cluster.yaml has `pgnodemx` 1.1; the image we update the cluster to in
7+
02--update-cluster.yaml has `pgnodemx` 1.3.
8+
9+
00-01
10+
This starts up a cluster with a purposely outdated `pgnodemx` extension. Because we want a specific
11+
extension, the image used here is hard-coded (and so outdated it's not publicly available).
12+
13+
(This image is so outdated that it doesn't finish creating a backup with the current PGO, which is
14+
why the 00-assert.yaml only checks that the pod is ready; and why 01--check-exporter.yaml wraps the
15+
call in a retry loop.)
16+
17+
02-03
18+
The cluster is updated with a newer (and hardcoded) image with a newer version of `pgnodemx`. Due
19+
to the change made in https://github.com/CrunchyData/postgres-operator/pull/3400, this should no
20+
longer produce multiple errors.
21+
22+
Note: a few errors may be logged after the `exporter` container attempts to run the `pgnodemx`
23+
functions but before the extension is updated. So this checks that there are no more than 2 errors,
24+
since that was the observed maximum number of printed errors during manual tests of the check.
25+
26+
For instance, using these hardcoded images (with `pgnodemx` versions 1.1 and 1.3), those errors were:
27+
28+
```
29+
Error running query on database \"localhost:5432\": ccp_nodemx_disk_activity pq: query-specified return tuple and function return type are not compatible"
30+
Error running query on database \"localhost:5432\": ccp_nodemx_data_disk pq: query-specified return tuple and function return type are not compatible
31+
```

0 commit comments

Comments
 (0)