From 94d80aa7dc774e27ce9a914b7022a4b428d5e396 Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 16 Oct 2025 12:39:22 +0200 Subject: [PATCH 1/2] feat: last copy pipe executions monitoring endpoint --- .../monitoring_copy_pipe_executions.pipe | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe diff --git a/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe b/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe new file mode 100644 index 0000000000..961da68397 --- /dev/null +++ b/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe @@ -0,0 +1,51 @@ +DESCRIPTION > + Monitors Tinybird copy job executions and exposes their status as Prometheus gauge metrics. Consumed by datadog scraper (`dd-tinybird-copy-pipe-executions-scraper`) + +Tracks the latest execution status for each copy pipe (starting today) and generates one-hot encoded metrics where each pipe gets a value of `1` for its current status (`'ok'`, `'error'`, `'cancelled'`, `'queued'`, `'working'`) and `0` for all other statuses. + +**Metric:** `copy_pipes_latest_execution_status` (gauge) +**Labels:** `pipe_name`, `status` + +**Note:** The `'queued'` status is virtual—it's derived by detecting the error message `"You have reached the maximum number of copy jobs"` rather than being a native Tinybird status. This is because tinybird retries these again when possible but returns an error status. Similarly, `'ok'` is mapped from the native `'done'` status for datadog color-scheme convention. + + +TAGS "Monitoring" + +NODE copy_pipes_latest_executions +SQL > + + SELECT + JSONExtract(job_metadata, 'pipe_name', 'String') AS pipe_name, + multiIf( + error LIKE 'You have reached the maximum number of copy jobs%', + 'queued', + status = 'done', + 'ok', + status + ) AS status, + error, + started_at + FROM tinybird.jobs_log + WHERE + job_type = 'copy' AND started_at > toStartOfDay(now()) and pipe_name <> 'members_with_location' + ORDER BY pipe_id, created_at DESC + LIMIT 1 BY pipe_id + + + +NODE errored_copy_pipes_latest_execution +SQL > + + WITH + possible_statuses AS (SELECT array('ok', 'error', 'cancelled', 'queued', 'working') AS statuses) + SELECT + 'copy_pipes_latest_execution_status' AS name, + IF(s = e.status, 1, 0) AS value, + 'Latest execution status per pipe (one-hot: 1 active, 0 otherwise)' AS help, + 'gauge' AS type, + map('pipe_name', e.pipe_name, 'status', s) AS labels + FROM copy_pipes_latest_executions AS e + CROSS JOIN possible_statuses ps ARRAY + JOIN ps.statuses AS s + + From dd83f625c6397625fc040286646776049477dc2c Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 23 Oct 2025 12:49:44 +0200 Subject: [PATCH 2/2] feat: improve monitoring endpoint for retried pipes bcs of limits --- .../monitoring_copy_pipe_executions.pipe | 61 ++++++++++++------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe b/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe index 961da68397..3c7446c70e 100644 --- a/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe +++ b/services/libs/tinybird/pipes/monitoring_copy_pipe_executions.pipe @@ -1,41 +1,62 @@ DESCRIPTION > - Monitors Tinybird copy job executions and exposes their status as Prometheus gauge metrics. Consumed by datadog scraper (`dd-tinybird-copy-pipe-executions-scraper`) - -Tracks the latest execution status for each copy pipe (starting today) and generates one-hot encoded metrics where each pipe gets a value of `1` for its current status (`'ok'`, `'error'`, `'cancelled'`, `'queued'`, `'working'`) and `0` for all other statuses. - -**Metric:** `copy_pipes_latest_execution_status` (gauge) -**Labels:** `pipe_name`, `status` - -**Note:** The `'queued'` status is virtual—it's derived by detecting the error message `"You have reached the maximum number of copy jobs"` rather than being a native Tinybird status. This is because tinybird retries these again when possible but returns an error status. Similarly, `'ok'` is mapped from the native `'done'` status for datadog color-scheme convention. - + Monitors Tinybird copy job executions and exposes their status as Prometheus gauge metrics. Consumed by datadog scraper (`dd-tinybird-copy-pipe-executions-scraper`) + Tracks the latest execution status for each copy pipe (starting today) and generates one-hot encoded metrics where each pipe gets a value of `1` for its current status (`'ok'`, `'error'`, `'cancelled'`, `'queued'`, `'working'`) and `0` for all other statuses. + **Metric:** `copy_pipes_latest_execution_status` (gauge) + **Labels:** `pipe_name`, `status` + **Note:** The `'queued'` status is virtual—it's derived by detecting the error message `"You have reached the maximum number of copy jobs"` rather than being a native Tinybird status. + This is because tinybird retries these again when possible but returns an error status. After 3 retries, if the copy pipe still gets the same limit error, we return an error state. + Similarly, `'ok'` is mapped from the native `'done'` status for datadog color-scheme convention. TAGS "Monitoring" NODE copy_pipes_latest_executions SQL > - + WITH 'You have reached the maximum number of copy jobs%' AS max_jobs_err SELECT JSONExtract(job_metadata, 'pipe_name', 'String') AS pipe_name, multiIf( - error LIKE 'You have reached the maximum number of copy jobs%', - 'queued', status = 'done', 'ok', + /* queued: current has max-jobs error AND the previous two do NOT */ + (error LIKE max_jobs_err) + AND (coalesce(prev_error1, '') NOT LIKE max_jobs_err) + AND (coalesce(prev_error2, '') NOT LIKE max_jobs_err), + 'queued', + /* hard error: last three consecutive runs have the same max-jobs error */ + (error LIKE max_jobs_err) + AND (coalesce(prev_error1, '') LIKE max_jobs_err) + AND (coalesce(prev_error2, '') LIKE max_jobs_err), + 'error', + /* any other error on the last execution -> error */ + (error != '' AND error NOT LIKE max_jobs_err), + 'error', + /* otherwise keep original status (e.g., running/queued by system) */ status ) AS status, error, started_at - FROM tinybird.jobs_log - WHERE - job_type = 'copy' AND started_at > toStartOfDay(now()) and pipe_name <> 'members_with_location' + FROM + ( + SELECT + *, + lagInFrame(error, 1, '') OVER ( + PARTITION BY pipe_id ORDER BY created_at DESC + ) AS prev_error1, + lagInFrame(error, 2, '') OVER ( + PARTITION BY pipe_id ORDER BY created_at DESC + ) AS prev_error2, + row_number() OVER (PARTITION BY pipe_id ORDER BY created_at DESC) AS rn + FROM tinybird.jobs_log + WHERE + job_type = 'copy' + AND started_at > toStartOfDay(now()) + AND JSONExtract(job_metadata, 'pipe_name', 'String') <> 'members_with_location' + ) + WHERE rn = 1 ORDER BY pipe_id, created_at DESC - LIMIT 1 BY pipe_id - - NODE errored_copy_pipes_latest_execution SQL > - WITH possible_statuses AS (SELECT array('ok', 'error', 'cancelled', 'queued', 'working') AS statuses) SELECT @@ -47,5 +68,3 @@ SQL > FROM copy_pipes_latest_executions AS e CROSS JOIN possible_statuses ps ARRAY JOIN ps.statuses AS s - -