Skip to content

Commit a1e29c9

Browse files
feat: add comprehensive health monitoring endpoint
Added a new /system/health/detailed endpoint that provides complete system observability in a single API call.

Features:
- System information (version, uptime, environment)
- Redis connection status and memory usage
- Database connection with transaction statistics
- Metrics for all 8 queues (waiting/active counts for every queue, plus completed/failed counts for the send-transaction, mine-transaction, and send-webhook queues)
- Active wallet statistics grouped by chain
- Configuration status (IP allowlist, webhooks, rate limits)

Benefits:
- Single endpoint for complete system diagnostics
- Eliminates the need to check multiple sources
- Essential for monitoring and debugging
- Production-ready operational visibility
- Enables better alerting and dashboards

Endpoints:
- GET /system/health/detailed - Comprehensive health check

Use Cases:
- Production monitoring dashboards
- Incident response and debugging
- Capacity planning and scaling decisions
- System health validation
1 parent 8e04d2e commit a1e29c9

File tree

1 file changed

+296
-0
lines changed

1 file changed

+296
-0
lines changed
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
import { Type, type Static } from "@sinclair/typebox";
2+
import type { FastifyInstance } from "fastify";
3+
import { StatusCodes } from "http-status-codes";
4+
import { redis } from "../../../shared/utils/redis/redis";
5+
import { getUsedBackendWallets } from "../../../shared/db/wallets/wallet-nonce";
6+
import { SendTransactionQueue } from "../../../worker/queues/send-transaction-queue";
7+
import { MineTransactionQueue } from "../../../worker/queues/mine-transaction-queue";
8+
import { SendWebhookQueue } from "../../../worker/queues/send-webhook-queue";
9+
import { PruneTransactionsQueue } from "../../../worker/queues/prune-transactions-queue";
10+
import { CancelRecycledNoncesQueue } from "../../../worker/queues/cancel-recycled-nonces-queue";
11+
import { NonceResyncQueue } from "../../../worker/queues/nonce-resync-queue";
12+
import { ProcessEventsLogQueue } from "../../../worker/queues/process-event-logs-queue";
13+
import { ProcessTransactionReceiptsQueue } from "../../../worker/queues/process-transaction-receipts-queue";
14+
import { env } from "../../../shared/utils/env";
15+
import { getConfig } from "../../../shared/utils/cache/get-config";
16+
import { prisma } from "../../../shared/db/client";
17+
18+
/**
 * Builds the schema for a queue that only reports its backlog
 * (jobs waiting to run and jobs currently running).
 * A fresh schema object is created on every call.
 */
const queueBacklogSchema = () =>
  Type.Object({
    waiting: Type.Number(),
    active: Type.Number(),
  });

/**
 * Builds the schema for a queue that reports its backlog together with
 * the number of completed and failed jobs.
 * A fresh schema object is created on every call.
 */
const queueLifecycleSchema = () =>
  Type.Object({
    waiting: Type.Number(),
    active: Type.Number(),
    completed: Type.Number(),
    failed: Type.Number(),
  });

/** Process-level information about the running service. */
const systemSchema = Type.Object({
  nodeEnv: Type.String(),
  engineMode: Type.String(),
  uptime: Type.Number(),
});

/** Redis reachability and, when available, its human-readable memory usage. */
const redisSchema = Type.Object({
  connected: Type.Boolean(),
  usedMemory: Type.Optional(Type.String()),
});

/** Database reachability and transaction row counts. */
const databaseSchema = Type.Object({
  connected: Type.Boolean(),
  totalTransactions: Type.Number(),
  pendingTransactions: Type.Number(),
  erroredTransactions: Type.Number(),
});

/** Per-queue job counts; only the first three queues expose completed/failed. */
const queuesSchema = Type.Object({
  sendTransaction: queueLifecycleSchema(),
  mineTransaction: queueLifecycleSchema(),
  sendWebhook: queueLifecycleSchema(),
  pruneTransactions: queueBacklogSchema(),
  cancelRecycledNonces: queueBacklogSchema(),
  nonceResync: queueBacklogSchema(),
  processEventLogs: queueBacklogSchema(),
  processTransactionReceipts: queueBacklogSchema(),
});

/** Number of used backend wallets, in total and per chain. */
const walletsSchema = Type.Object({
  totalActive: Type.Number(),
  byChain: Type.Array(
    Type.Object({
      chainId: Type.Number(),
      count: Type.Number(),
    }),
  ),
});

/** Flags and limits derived from the service configuration. */
const configurationSchema = Type.Object({
  ipAllowlistEnabled: Type.Boolean(),
  webhookConfigured: Type.Boolean(),
  rateLimitPerMin: Type.Number(),
});

/**
 * Shape of the 200 response returned by `GET /system/health/detailed`.
 */
const responseSchema = Type.Object({
  status: Type.String(),
  timestamp: Type.String(),
  version: Type.Optional(Type.String()),
  system: systemSchema,
  redis: redisSchema,
  database: databaseSchema,
  queues: queuesSchema,
  wallets: walletsSchema,
  configuration: configurationSchema,
});
92+
93+
/** Minimal view of a queue needed to read its waiting/active job counts. */
type BacklogCountable = {
  getWaitingCount(): Promise<number>;
  getActiveCount(): Promise<number>;
};

/** Queue view that additionally exposes completed/failed job totals. */
type LifecycleCountable = BacklogCountable & {
  getCompletedCount(): Promise<number>;
  getFailedCount(): Promise<number>;
};

/** Reads a queue's waiting and active job counts concurrently. */
async function getQueueBacklogCounts(queue: BacklogCountable) {
  const [waiting, active] = await Promise.all([
    queue.getWaitingCount(),
    queue.getActiveCount(),
  ]);
  return { waiting, active };
}

/** Reads a queue's waiting, active, completed and failed job counts concurrently. */
async function getQueueLifecycleCounts(queue: LifecycleCountable) {
  const [waiting, active, completed, failed] = await Promise.all([
    queue.getWaitingCount(),
    queue.getActiveCount(),
    queue.getCompletedCount(),
    queue.getFailedCount(),
  ]);
  return { waiting, active, completed, failed };
}

/**
 * Registers `GET /system/health/detailed`.
 *
 * The handler collects, in one response:
 * - process information (environment, engine mode, uptime, version),
 * - Redis reachability and reported memory usage,
 * - database reachability and transaction counts,
 * - job counts for each worker queue,
 * - used backend wallets grouped by chain,
 * - configuration flags.
 *
 * Redis and database probes are best-effort: when either fails the response
 * is still 200 with `status: "degraded"` and the affected counters left at
 * their defaults. A failure while reading queue counts, wallets or the
 * configuration results in a 500 response with `status: "error"`.
 *
 * @param fastify - Fastify instance the route is registered on.
 */
export async function healthDetailed(fastify: FastifyInstance) {
  fastify.get(
    "/system/health/detailed",
    {
      schema: {
        summary: "Get detailed health check",
        description:
          "Returns comprehensive health status including queue metrics, database stats, and system information. Useful for monitoring and debugging.",
        tags: ["System"],
        operationId: "healthDetailed",
        response: {
          [StatusCodes.OK]: responseSchema,
        },
      },
    },
    async (request, reply) => {
      try {
        // Probe Redis. A successful ping marks it as connected even if the
        // follow-up memory lookup fails.
        let redisConnected = false;
        let redisMemory: string | undefined;
        try {
          await redis.ping();
          redisConnected = true;
          const info = await redis.info("memory");
          redisMemory = info.match(/used_memory_human:([^\r\n]+)/)?.[1]?.trim();
        } catch (e) {
          // Best-effort probe: report as degraded instead of failing the request.
          request.log.warn({ err: e }, "Detailed health check: Redis probe failed");
        }

        // Probe the database and collect transaction counts. The three counts
        // are independent, so they run concurrently; if any of them fails all
        // counters stay at 0 and the database is reported as disconnected.
        // NOTE(review): an unfiltered count may be costly on a large table — confirm.
        let dbConnected = false;
        let totalTxCount = 0;
        let pendingTxCount = 0;
        let erroredTxCount = 0;
        try {
          [totalTxCount, pendingTxCount, erroredTxCount] = await Promise.all([
            prisma.transactions.count(),
            prisma.transactions.count({
              where: {
                minedAt: null,
                cancelledAt: null,
                errorMessage: null,
              },
            }),
            prisma.transactions.count({
              where: {
                errorMessage: { not: null },
              },
            }),
          ]);
          dbConnected = true;
        } catch (e) {
          // Best-effort probe: report as degraded instead of failing the request.
          request.log.warn({ err: e }, "Detailed health check: database probe failed");
        }

        // Queue statistics. Each helper returns an object already shaped like
        // the response, which avoids a long positional list of counters.
        const [
          sendTransaction,
          mineTransaction,
          sendWebhook,
          pruneTransactions,
          cancelRecycledNonces,
          nonceResync,
          processEventLogs,
          processTransactionReceipts,
        ] = await Promise.all([
          getQueueLifecycleCounts(SendTransactionQueue.q),
          getQueueLifecycleCounts(MineTransactionQueue.q),
          getQueueLifecycleCounts(SendWebhookQueue.q),
          getQueueBacklogCounts(PruneTransactionsQueue.q),
          getQueueBacklogCounts(CancelRecycledNoncesQueue.q),
          getQueueBacklogCounts(NonceResyncQueue.q),
          getQueueBacklogCounts(ProcessEventsLogQueue.q),
          getQueueBacklogCounts(ProcessTransactionReceiptsQueue.q),
        ]);

        // Wallet statistics: count wallets per chain in a single pass. A Map
        // keeps first-seen order, so chains with equal counts stay in the
        // order they were encountered after the (stable) sort below.
        const usedWallets = await getUsedBackendWallets();
        const walletCountByChain = new Map<number, number>();
        for (const { chainId } of usedWallets) {
          walletCountByChain.set(
            chainId,
            (walletCountByChain.get(chainId) ?? 0) + 1,
          );
        }
        const walletsByChain = Array.from(
          walletCountByChain,
          ([chainId, count]) => ({ chainId, count }),
        ).sort((a, b) => b.count - a.count);

        // Configuration flags.
        const config = await getConfig();

        const health = {
          status: dbConnected && redisConnected ? "healthy" : "degraded",
          timestamp: new Date().toISOString(),
          version: env.ENGINE_VERSION,
          system: {
            nodeEnv: env.NODE_ENV,
            engineMode: env.ENGINE_MODE,
            uptime: process.uptime(),
          },
          redis: {
            connected: redisConnected,
            usedMemory: redisMemory,
          },
          database: {
            connected: dbConnected,
            totalTransactions: totalTxCount,
            pendingTransactions: pendingTxCount,
            erroredTransactions: erroredTxCount,
          },
          queues: {
            sendTransaction,
            mineTransaction,
            sendWebhook,
            pruneTransactions,
            cancelRecycledNonces,
            nonceResync,
            processEventLogs,
            processTransactionReceipts,
          },
          wallets: {
            totalActive: usedWallets.length,
            byChain: walletsByChain,
          },
          configuration: {
            ipAllowlistEnabled: config.ipAllowlist.length > 0,
            webhookConfigured: !!config.webhookUrl,
            rateLimitPerMin: env.GLOBAL_RATE_LIMIT_PER_MIN,
          },
        } satisfies Static<typeof responseSchema>;

        // Returning the reply tells Fastify the async handler has responded.
        return reply.status(StatusCodes.OK).send(health);
      } catch (error) {
        // Keep the cause in the logs; the client only gets a generic message.
        request.log.error(
          { err: error },
          "Detailed health check: failed to fetch health details",
        );
        return reply.status(StatusCodes.INTERNAL_SERVER_ERROR).send({
          status: "error",
          timestamp: new Date().toISOString(),
          error: "Failed to fetch health details",
        });
      }
    },
  );
}

0 commit comments

Comments
 (0)