Skip to content

Commit eb7901c

Browse files
committed
feat: make probe_duration_seconds match probe_script_duration_seconds
For the vast majority of check types, `probe_duration_seconds` is reported as the wall time that took to run the prober. This is accurate for the vast majority of cases, but there's one in particular where it is not: k6-backed checks that use external runners. External runners can take a certain time to process a request, which prior to this commit was counted as part of `probe_duration_seconds` while in fact it was not. This commit adds code that parses the `probe_script_duration_seconds` metric returned by k6 runners, and uses that as the duration of the check for both logs and `probe_duration_seconds`.
1 parent 15e62bf commit eb7901c

File tree

8 files changed

+43
-28
lines changed

8 files changed

+43
-28
lines changed

internal/k6runner/http_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ func TestScriptHTTPRun(t *testing.T) {
320320
zlogger = testhelper.Logger(t)
321321
)
322322

323-
success, err := script.Run(ctx, registry, logger, zlogger, SecretStore{})
323+
success, _, err := script.Run(ctx, registry, logger, zlogger, SecretStore{})
324324
require.Equal(t, tc.expectSuccess, success)
325325
require.Equal(t, tc.expectLogs, logger.buf.String())
326326
if tc.expectErrorAs == nil {
@@ -453,7 +453,7 @@ func TestHTTPProcessorRetries(t *testing.T) {
453453
logger testLogger
454454
zlogger = zerolog.New(io.Discard)
455455
)
456-
success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
456+
success, _, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
457457
require.ErrorIs(t, err, tc.expectError)
458458
require.Equal(t, tc.expectError == nil, success)
459459
require.Equal(t, tc.expectRequests, requests.Load())
@@ -499,7 +499,7 @@ func TestHTTPProcessorRetries(t *testing.T) {
499499
logger testLogger
500500
zlogger = zerolog.New(io.Discard)
501501
)
502-
success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
502+
success, _, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
503503
require.NoError(t, err)
504504
require.True(t, success)
505505
})

internal/k6runner/k6runner.go

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"fmt"
88
"io"
99
"strings"
10+
"time"
1011

1112
"github.com/go-logfmt/logfmt"
1213
smmmodel "github.com/grafana/synthetic-monitoring-agent/internal/model"
@@ -147,7 +148,7 @@ var (
147148
ErrFromRunner = errors.New("runner reported an error")
148149
)
149150

150-
func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logger logger.Logger, internalLogger zerolog.Logger, secretStore SecretStore) (bool, error) {
151+
func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logger logger.Logger, internalLogger zerolog.Logger, secretStore SecretStore) (bool, time.Duration, error) {
151152
k6runner := r.runner.WithLogger(&internalLogger)
152153

153154
// TODO: This error message is okay to be Debug for local k6 execution, but should be Error for remote runners.
@@ -156,15 +157,15 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge
156157
internalLogger.Debug().
157158
Err(err).
158159
Msg("k6 script exited with error code")
159-
return false, err
160+
return false, 0, err
160161
}
161162

162163
// If only one of Error and ErrorCode are non-empty, the proxy is misbehaving.
163164
switch {
164165
case result.Error == "" && result.ErrorCode != "":
165166
fallthrough
166167
case result.Error != "" && result.ErrorCode == "":
167-
return false, fmt.Errorf(
168+
return false, 0, fmt.Errorf(
168169
"%w: only one of error (%q) and errorCode (%q) is non-empty",
169170
ErrBuggyRunner, result.Error, result.ErrorCode,
170171
)
@@ -194,40 +195,41 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge
194195
internalLogger.Debug().
195196
Err(err).
196197
Msg("cannot load logs to logger")
197-
return false, err
198+
return false, 0, err
198199
}
199200

200201
var (
201-
collector sampleCollector
202-
resultCollector checkResultCollector
202+
collector sampleCollector
203+
resultCollector checkResultCollector
204+
durationCollector probeDurationCollector
203205
)
204206

205-
if err := extractMetricSamples(result.Metrics, internalLogger, collector.process, resultCollector.process); err != nil {
207+
if err := extractMetricSamples(result.Metrics, internalLogger, collector.process, resultCollector.process, durationCollector.process); err != nil {
206208
internalLogger.Debug().
207209
Err(err).
208210
Msg("cannot extract metric samples")
209-
return false, err
211+
return false, 0, err
210212
}
211213

212214
if err := registry.Register(&collector.collector); err != nil {
213215
internalLogger.Error().
214216
Err(err).
215217
Msg("cannot register collector")
216-
return false, err
218+
return false, 0, err
217219
}
218220

219221
// https://github.com/grafana/sm-k6-runner/blob/b811839d444a7e69fd056b0a4e6ccf7e914197f3/internal/mq/runner.go#L51
220222
switch result.ErrorCode {
221223
case "":
222224
// No error, all good.
223-
return true, nil
225+
return true, durationCollector.duration, nil
224226
// TODO: Remove "user" from this list, which has been renamed to "aborted".
225227
case "timeout", "killed", "user", "failed", "aborted":
226228
// These are user errors. The probe failed, but we don't return an error.
227-
return false, nil
229+
return false, durationCollector.duration, nil
228230
default:
229231
// We got an "unknown" error, or some other code we do not recognize. Return it so we log it.
230-
return false, fmt.Errorf("%w: %s: %s", ErrFromRunner, result.ErrorCode, result.Error)
232+
return false, durationCollector.duration, fmt.Errorf("%w: %s: %s", ErrFromRunner, result.ErrorCode, result.Error)
231233
}
232234
}
233235

@@ -302,6 +304,21 @@ func (rc *checkResultCollector) process(mf *dto.MetricFamily, sample *model.Samp
302304
return nil
303305
}
304306

307+
type probeDurationCollector struct {
308+
duration time.Duration
309+
}
310+
311+
func (dc *probeDurationCollector) process(_ *dto.MetricFamily, sample *model.Sample) error {
312+
if sample.Metric[model.MetricNameLabel] != "probe_script_duration_seconds" {
313+
return nil
314+
}
315+
316+
// TODO: Is there a better way to convert from float seconds to time?
317+
dc.duration = time.Duration(sample.Value*1000) * time.Millisecond // This truncates to the nearest millisecond.
318+
319+
return nil
320+
}
321+
305322
func extractMetricSamples(metrics []byte, logger zerolog.Logger, processors ...sampleProcessorFunc) error {
306323
promDecoder := expfmt.NewDecoder(bytes.NewBuffer(metrics), expfmt.NewFormat(expfmt.TypeTextPlain))
307324
decoderOpts := expfmt.DecodeOptions{Timestamp: model.Now()}

internal/k6runner/k6runner_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"sort"
1010
"strings"
1111
"testing"
12+
"time"
1213

1314
"github.com/grafana/synthetic-monitoring-agent/internal/model"
1415
"github.com/grafana/synthetic-monitoring-agent/internal/prober/logger"
@@ -90,9 +91,10 @@ func TestScriptRun(t *testing.T) {
9091
// We already know tha parsing the metrics and the logs is working, so
9192
// we are only interested in verifying that the script runs without
9293
// errors.
93-
success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
94+
success, duration, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{})
9495
require.NoError(t, err)
9596
require.True(t, success)
97+
require.Equal(t, 500*time.Millisecond, duration)
9698
}
9799

98100
func TestCheckInfoFromSM(t *testing.T) {
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
probe_data_sent_bytes{scenario="default"} 0
22
probe_data_received_bytes{scenario="default"} 0
33
probe_iteration_duration_seconds{scenario="default"} 1.3029e-05
4+
probe_script_duration_seconds 0.500

internal/prober/browser/browser.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,20 +66,18 @@ func (p Prober) Name() string {
6666

6767
func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.Registry, logger logger.Logger) (bool, float64) {
6868
secretStore, err := p.secretsRetriever(ctx)
69-
7069
if err != nil {
7170
p.logger.Error().Err(err).Msg("failed to retrieve secret store")
7271
return false, 0
7372
}
7473

75-
success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
74+
success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
7675
if err != nil {
7776
p.logger.Error().Err(err).Msg("running probe")
7877
return false, 0
7978
}
8079

81-
// TODO(mem): implement custom duration extraction.
82-
return success, 0
80+
return success, duration.Seconds()
8381
}
8482

8583
// TODO(mem): This should probably be in the k6runner package.

internal/prober/multihttp/multihttp.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,20 +89,18 @@ func (p Prober) Name() string {
8989

9090
func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.Registry, logger logger.Logger) (bool, float64) {
9191
secretStore, err := p.secretsRetriever(ctx)
92-
9392
if err != nil {
9493
p.logger.Error().Err(err).Msg("running probe")
9594
return false, 0
9695
}
9796

98-
success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
97+
success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
9998
if err != nil {
10099
p.logger.Error().Err(err).Msg("running probe")
101100
return false, 0
102101
}
103102

104-
// TODO(mem): implement custom duration extraction.
105-
return success, 0
103+
return success, duration.Seconds()
106104
}
107105

108106
// Overrides any user-provided headers with our own augmented values

internal/prober/multihttp/script_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ func TestSettingsToScript(t *testing.T) {
869869
t.Log("Log entries:\n" + buf.String())
870870

871871
require.True(t, success)
872-
require.Equal(t, float64(0), duration)
872+
require.NotEqual(t, 0, duration)
873873
}
874874

875875
func TestReplaceVariablesInString(t *testing.T) {

internal/prober/scripted/scripted.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,13 @@ func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.R
7878
Bool("hasSecretStoreToken", secretStore.Token != "").
7979
Msg("secret store retrieved for scripted probe")
8080

81-
success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
81+
success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore)
8282
if err != nil {
8383
p.logger.Error().Err(err).Msg("running probe")
8484
return false, 0
8585
}
8686

87-
// TODO(mem): implement custom duration extraction.
88-
return success, 0
87+
return success, duration.Seconds()
8988
}
9089

9190
func newCredentialsRetriever(provider secrets.SecretProvider, tenantID model.GlobalID, logger zerolog.Logger) func(context.Context) (k6runner.SecretStore, error) {

0 commit comments

Comments
 (0)