@@ -35,20 +35,16 @@ internal/alerting/
3535- 字段:
3636- state:问题状态(Open/Closed)
3737- level:告警等级(P0/P1/P2/Warning)
38- - alert_state :处理状态(InProcessing/AutoRestored/Restored)
38+ - alertState :处理状态(InProcessing/AutoRestored/Restored)
3939- title:标题
40- - labels:JSON([ {key,value}...] )
41- - alert_since:DATETIME(首次告警时间)
42- - resolved_at:DATETIME(恢复时间,可为空)
43- - source:来源(prometheus/es/...)
44- - fingerprint:去重指纹(同一问题归并)
45- - extra:JSON 扩展(原始维度/链接等)
40+ - label:JSON(单一标签对象 {key,value})
41+ - alertSince:DATETIME(首次告警时间)
42+ - json:JSON 扩展(原始维度/链接等)
4643
47442 ) alert_issue_comments(告警问题评论表)
48- - 主键:id(自增)
49- - issue_id:外键关联 alert_issues.id
50- - created_at:DATETIME
51- - author:字符串(如 system/ai/user@name)
45+ - 主键:无单独主键,按业务以 issueID+createAt 唯一(或可加自增列)
46+ - issueID:外键关联 alert_issues.id
47+ - createAt:DATETIME
5248- content:TEXT(Markdown,记录AI/系统/人工动作)
5349
5450建表示例
@@ -57,64 +53,61 @@ CREATE TABLE alert_issues (
5753 id VARCHAR (64 ) PRIMARY KEY ,
5854 state VARCHAR (16 ) NOT NULL ,
5955 level VARCHAR (16 ) NOT NULL ,
60- alert_state VARCHAR (32 ) NOT NULL ,
56+ alertState VARCHAR (32 ) NOT NULL ,
6157 title VARCHAR (255 ) NOT NULL ,
62- labels JSON NULL ,
63- alert_since DATETIME(3 ) NOT NULL ,
64- resolved_at DATETIME(3 ) NULL ,
65- source VARCHAR (32 ) NOT NULL ,
66- fingerprint VARCHAR (128 ) NOT NULL ,
67- extra JSON NULL ,
58+ label JSON NULL ,
59+ alertSince DATETIME(3 ) NOT NULL ,
60+ json JSON NULL ,
6861 KEY idx_state_level (state, level),
69- KEY idx_fingerprint (fingerprint),
70- KEY idx_alert_since (alert_since)
62+ KEY idx_alert_since (alertSince)
7163);
7264
7365CREATE TABLE alert_issue_comments (
74- id BIGINT PRIMARY KEY AUTO_INCREMENT,
75- issue_id VARCHAR (64 ) NOT NULL ,
76- created_at DATETIME(3 ) NOT NULL ,
77- author VARCHAR (64 ) NOT NULL ,
66+ issueID VARCHAR (64 ) NOT NULL ,
67+ createAt DATETIME(3 ) NOT NULL ,
7868 content MEDIUMTEXT NOT NULL ,
79- KEY idx_issue (issue_id ),
80- CONSTRAINT fk_issue FOREIGN KEY (issue_id ) REFERENCES alert_issues(id)
69+ KEY idx_issue (issueID ),
70+ CONSTRAINT fk_issue FOREIGN KEY (issueID ) REFERENCES alert_issues(id)
8171);
8272```
8373
8474状态机
8575- Issue.state:Open → Closed(单向闭环)
86- - Issue.alert_state :
87- - InProcessing(触发后处理中 )
88- - AutoRestored(系统自愈恢复 )
89- - Restored(人工或外部系统恢复 )
76+ - Issue.alertState :
77+ - InProcessing(处理中 )
78+ - AutoRestored(自然恢复 )
79+ - Restored(已恢复 )
9080
9181告警等级计算
9282- 输入:原始告警等级(来自源头)+ 服务影响面(流量、租户数、区域、核心度)
9383- 输出:最终 level(P0/P1/P2/Warning)
9484- 计算器放置于 ` rules/ ` ,通过接口可热插拔与单元测试
9585
96- 聚合与去重
97- - 指纹 fingerprint = hash(source, rule_id, resource, dimensions...)
98- - 指纹一致且时间窗口内归为同一 Issue,更新 ` alert_since ` /计数/最后出现时间
99-
10086API 接口
101871 ) 列表
10288GET /v1/issues?start=xxx&limit=10&state=Closed|Open
10389响应:
10490{
105- "items": [
106- {
107- "id": "xxx",
108- "state": "Closed",
109- "level": "P0",
110- "alertState": "Restored",
111- "title": "yzh S3APIV2s3apiv2.putobject 0_64K上传响应时间95值:50012ms > 450ms",
112- "labels": [ {"key":"api","value":"s3apiv2.putobject"},{"key":"idc","value":"yzh"}] ,
113- "alertSince": "2025-05-05T11:00:00Z",
114- "resolved_at": "2025-05-05T12:00:00Z"
115- }
116- ] ,
117- "next": "cursor-token"
91+ "items": [
92+ {
93+ "id": "xxx", // 告警 issue ID
94+ "state": "Closed", // 告警条目的状态。Closed处理完成、Open处理中
95+ "level": "P0", // 枚举值:P0严重、P1重要、P2、Warning需要关注但不是线上异常
96+ "alertState": "Restored", // 告警处理状态。Restored 已恢复、AutoRestored 系统自动恢复、InProcessing 处理中
97+ "title": "yzh S3APIV2s3apiv2.putobject 0_64K上传响应时间95值:50012ms > 450ms", // 告警标题
98+ "labels": [
99+ {
100+ "key": "api",
101+ "value": "s3apiv2.putobject"
102+ },
103+ {
104+ "key": "idc",
105+ "value": "yzh"
106+ }
107+ ] ,
108+ "alertSince": "2025-05-05T11:00:00Z"
109+ }
110+ ]
118111}
119112
1201132 ) 详情
@@ -125,34 +118,28 @@ GET /v1/issues/:issueID
125118 "state": "Closed",
126119 "level": "P0",
127120 "alertState": "Restored",
128- "title": "yzh S3APIV2s3apiv2.putobject 0_64K上传响应时间95值:50012ms > 450ms ",
129- "labels ": [ {"key":"api","value":"s3apiv2.putobject"},{"key":"idc","value":"yzh"} ] ,
121+ "title": "... ",
122+ "label ": {"key":"api","value":"s3apiv2.putobject"},
130123 "alertSince": "2025-05-05T11:00:00Z",
131- "resolved_at ": "2025-05-05T12:00:00Z" ,
124+ "json ": {"k":"v"} ,
132125 "comments": [
133- {"createdAt ": "2024-01-03T03:00:00Z", "author": "ai ", "content": "markdown content"}
126+ {"issueID": "xxx", "createAt": "2024-01-03T03:00:00Z", "content": "markdown content"}
134127 ]
135128}
136129
137- 3 ) 新增评论
138- POST /v1/issues/: issueID /comments
139- 请求:
140- { "author": "user@name", "content": "markdown" }
141- 响应:204
142-
143- 4 ) 手动关闭/恢复标记
130+ 3 ) 手动关闭(标记恢复正常)
144131POST /v1/issues/: issueID /close
145- POST /v1/issues/: issueID /reopen
146132响应:200
147133
134+
148135摄入(Ingress)
149136- Prometheus Webhook:/v1/ingest/prometheus
150137- Elastic/Logs:定制 handler 于 ` ingest/ `
151138- 每个接入负责标准化为内部 Event,交由 service 层聚合
152139
153140治愈(Healing)
154141- ` healing/ ` 定义动作(如重启、扩容、清缓存),由编排器串联
155- - 执行结果写入 ` alert_issue_comments ` ,并可更新 ` alert_state `
142+ - 执行结果写入 ` alert_issue_comments ` ,并可更新 ` alertState `
156143
157144通知(Notifier)
158145- 在 state 变化或等级升级时触发
0 commit comments