Skip to content

Commit 4e6f2ff

Browse files
feat(kinesisdataanalytics): Add alarms for Checkpoints (#661)
This PR adds checkpoint alarms for Kinesis Data Analytics monitoring.
1 parent 27ae408 commit 4e6f2ff

File tree

7 files changed

+327
-4
lines changed

7 files changed

+327
-4
lines changed

API.md

Lines changed: 134 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/common/monitoring/alarms/KinesisDataAnalyticsAlarmFactory.ts

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,12 @@ import {
33
TreatMissingData,
44
} from "aws-cdk-lib/aws-cloudwatch";
55

6+
import {
7+
ErrorAlarmFactory,
8+
ErrorCountThreshold,
9+
ErrorRateThreshold,
10+
ErrorType,
11+
} from "./ErrorAlarmFactory";
612
import { AlarmFactory, CustomAlarmThreshold } from "../../alarm";
713
import { MetricWithAlarmSupport } from "../../metric";
814

@@ -16,9 +22,11 @@ export interface FullRestartCountThreshold extends CustomAlarmThreshold {
1622

1723
export class KinesisDataAnalyticsAlarmFactory {
1824
protected readonly alarmFactory: AlarmFactory;
25+
protected readonly errorAlarmFactory: ErrorAlarmFactory;
1926

2027
/**
 * Creates the Kinesis Data Analytics alarm factory.
 *
 * @param alarmFactory generic alarm factory; it is stored on this instance
 *   and also handed to a newly constructed ErrorAlarmFactory.
 */
constructor(alarmFactory: AlarmFactory) {
  // Both members are built from the same underlying AlarmFactory instance.
  this.errorAlarmFactory = new ErrorAlarmFactory(alarmFactory);
  this.alarmFactory = alarmFactory;
}
2331

2432
addDowntimeAlarm(
@@ -61,4 +69,30 @@ export class KinesisDataAnalyticsAlarmFactory {
6169
alarmDedupeStringSuffix: "KDAFullRestartAlarm",
6270
});
6371
}
72+
73+
/**
 * Adds an alarm on the number of failed checkpoints.
 *
 * Delegates to the error alarm factory's `addErrorCountAlarm`, always
 * tagging the alarm with `ErrorType.FAILURE`.
 *
 * @param metric metric the alarm is evaluated against
 * @param props error-count threshold configuration, forwarded unchanged
 * @param disambiguator optional value forwarded unchanged to the delegate
 * @returns whatever `addErrorCountAlarm` returns
 */
addCheckpointFailureCountAlarm(
  metric: MetricWithAlarmSupport,
  props: ErrorCountThreshold,
  disambiguator?: string,
) {
  const { errorAlarmFactory } = this;
  const failureType = ErrorType.FAILURE;
  return errorAlarmFactory.addErrorCountAlarm(
    metric,
    failureType,
    props,
    disambiguator,
  );
}
85+
86+
/**
 * Adds an alarm on the checkpoint failure rate.
 *
 * Delegates to the error alarm factory's `addErrorRateAlarm`, always
 * tagging the alarm with `ErrorType.FAILURE`.
 *
 * @param metric metric the alarm is evaluated against
 * @param props error-rate threshold configuration, forwarded unchanged
 * @param disambiguator optional value forwarded unchanged to the delegate
 * @returns whatever `addErrorRateAlarm` returns
 */
addCheckpointFailureRateAlarm(
  metric: MetricWithAlarmSupport,
  props: ErrorRateThreshold,
  disambiguator?: string,
) {
  const { errorAlarmFactory } = this;
  const failureType = ErrorType.FAILURE;
  return errorAlarmFactory.addErrorRateAlarm(
    metric,
    failureType,
    props,
    disambiguator,
  );
}
6498
}

lib/monitoring/aws-kinesisanalytics/KinesisDataAnalyticsMetricFactory.ts

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import {
55
BaseMetricFactoryProps,
66
MetricFactory,
77
MetricStatistic,
8+
RateComputationMethod,
89
} from "../../common";
910

1011
export interface KinesisDataAnalyticsMetricFactoryProps
@@ -118,6 +119,15 @@ export class KinesisDataAnalyticsMetricFactory extends BaseMetricFactory<Kinesis
118119
});
119120
}
120121

122+
/**
 * Builds the checkpoint failure rate metric.
 *
 * Takes the failed-checkpoints count metric and converts it with
 * `metricFactory.toRate`, using `RateComputationMethod.PER_HOUR`.
 *
 * @returns the metric produced by `toRate`
 */
metricCheckpointFailureRate() {
  const failedCheckpointsCount = this.metricNumberOfFailedCheckpointsCount();
  // NOTE(review): the `false` flag and the "checkpoints" string are passed
  // through as-is; presumably they control label decoration and the unit
  // wording of the rate label — confirm against `toRate`'s signature.
  return this.metricFactory.toRate(
    failedCheckpointsCount,
    RateComputationMethod.PER_HOUR,
    false,
    "checkpoints",
  );
}
130+
121131
private metric(metricsSpec: MetricsSpec) {
122132
return this.metricFactory.createMetric(
123133
metricsSpec.name,

lib/monitoring/aws-kinesisanalytics/KinesisDataAnalyticsMonitoring.ts

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@ import {
1313
CountAxisFromZero,
1414
DefaultGraphWidgetHeight,
1515
DefaultSummaryWidgetHeight,
16+
ErrorCountThreshold,
17+
ErrorRateThreshold,
1618
FullRestartCountThreshold,
1719
KinesisDataAnalyticsAlarmFactory,
1820
MaxDowntimeThreshold,
@@ -21,6 +23,7 @@ import {
2123
MonitoringScope,
2224
PercentageAxisFromZeroToHundred,
2325
QuarterWidth,
26+
RateAxisFromZero,
2427
SizeAxisBytesFromZero,
2528
TimeAxisMillisFromZero,
2629
} from "../../common";
@@ -34,6 +37,10 @@ export interface KinesisDataAnalyticsMonitoringOptions
3437
readonly addDowntimeAlarm?: Record<string, MaxDowntimeThreshold>;
3538

3639
readonly addFullRestartCountAlarm?: Record<string, FullRestartCountThreshold>;
40+
41+
readonly addCheckpointFailureCountAlarm?: Record<string, ErrorCountThreshold>;
42+
43+
readonly addCheckpointFailureRateAlarm?: Record<string, ErrorRateThreshold>;
3744
}
3845

3946
export interface KinesisDataAnalyticsMonitoringProps
@@ -47,6 +54,8 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
4754
readonly kdaAlarmFactory: KinesisDataAnalyticsAlarmFactory;
4855
readonly downtimeAnnotations: HorizontalAnnotation[];
4956
readonly fullRestartAnnotations: HorizontalAnnotation[];
57+
readonly checkpointFailureCountAnnotations: HorizontalAnnotation[];
58+
readonly checkpointFailureRateAnnotations: HorizontalAnnotation[];
5059

5160
readonly cpuUtilizationPercentMetric: MetricWithAlarmSupport;
5261
readonly downtimeMsMetric: MetricWithAlarmSupport;
@@ -58,6 +67,7 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
5867
readonly numberOfFailedCheckpointsCountMetric: MetricWithAlarmSupport;
5968
readonly oldGenerationGCCountMetric: MetricWithAlarmSupport;
6069
readonly oldGenerationGCTimeMsMetric: MetricWithAlarmSupport;
70+
readonly checkpointFailureRateMetric: MetricWithAlarmSupport;
6171

6272
constructor(
6373
scope: MonitoringScope,
@@ -80,6 +90,8 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
8090
this.kdaAlarmFactory = new KinesisDataAnalyticsAlarmFactory(alarmFactory);
8191
this.downtimeAnnotations = [];
8292
this.fullRestartAnnotations = [];
93+
this.checkpointFailureCountAnnotations = [];
94+
this.checkpointFailureRateAnnotations = [];
8395

8496
const metricFactory = new KinesisDataAnalyticsMetricFactory(
8597
scope.createMetricFactory(),
@@ -103,6 +115,8 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
103115
metricFactory.metricOldGenerationGCCount();
104116
this.oldGenerationGCTimeMsMetric =
105117
metricFactory.metricOldGenerationGCTimeMs();
118+
this.checkpointFailureRateMetric =
119+
metricFactory.metricCheckpointFailureRate();
106120

107121
for (const disambiguator in props.addDowntimeAlarm) {
108122
const alarmProps = props.addDowntimeAlarm[disambiguator];
@@ -126,6 +140,28 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
126140
this.addAlarm(createdAlarm);
127141
}
128142

143+
for (const disambiguator in props.addCheckpointFailureCountAlarm) {
144+
const alarmProps = props.addCheckpointFailureCountAlarm[disambiguator];
145+
const createdAlarm = this.kdaAlarmFactory.addCheckpointFailureCountAlarm(
146+
this.numberOfFailedCheckpointsCountMetric,
147+
alarmProps,
148+
disambiguator,
149+
);
150+
this.checkpointFailureCountAnnotations.push(createdAlarm.annotation);
151+
this.addAlarm(createdAlarm);
152+
}
153+
154+
for (const disambiguator in props.addCheckpointFailureRateAlarm) {
155+
const alarmProps = props.addCheckpointFailureRateAlarm[disambiguator];
156+
const createdAlarm = this.kdaAlarmFactory.addCheckpointFailureRateAlarm(
157+
this.checkpointFailureRateMetric,
158+
alarmProps,
159+
disambiguator,
160+
);
161+
this.checkpointFailureRateAnnotations.push(createdAlarm.annotation);
162+
this.addAlarm(createdAlarm);
163+
}
164+
129165
props.useCreatedAlarms?.consume(this.createdAlarms());
130166
}
131167

@@ -204,6 +240,10 @@ export class KinesisDataAnalyticsMonitoring extends Monitoring {
204240
title: "Checkpoint Failures",
205241
left: [this.numberOfFailedCheckpointsCountMetric],
206242
leftYAxis: CountAxisFromZero,
243+
leftAnnotations: this.checkpointFailureCountAnnotations,
244+
right: [this.checkpointFailureRateMetric],
245+
rightYAxis: RateAxisFromZero,
246+
rightAnnotations: this.checkpointFailureRateAnnotations,
207247
});
208248
}
209249

0 commit comments

Comments
 (0)