@@ -670,20 +670,21 @@ Resources:
670670 OKActions: []
671671
672672<% JAVALAB_APP_TYPES . each do | name | -%>
673-
674- <%= name%> HighSevereErrorRateAlarm:
673+ <% {
674+ TenPercentSevereErrorRateAlarm : { Threshold : 10 , AlarmName : 'ten_percent_severe_error_rate' } ,
675+ NinetyPercentSevereErrorRateAlarm : { Threshold : 90 , AlarmName : 'ninety_percent_severe_error_rate' } ,
676+ } . each do |alarmTitle , config | -%>
677+ <%= name%> <%= alarmTitle%> :
675678 Type: AWS::CloudWatch::Alarm
676679 Properties:
677- AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _high_severe_error_rate"
678- AlarmDescription: Send page if Javabuilder severe error rate exceeds 10% for 20
679- minutes. Occasional spikes are expected, but a sustained high error rate
680- is an indication of an outage.
681- ActionsEnabled: true
682- AlarmActions:
683- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
680+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _<%= config [ :AlarmName ] %> "
681+ AlarmDescription: Severe error rate in Javabuilder's <%= name%> build and run lambda (the core of
682+ Javabuilder, which executes student <%= name%> code) exceeded <%= config [ :Threshold ] %> % for four
683+ consecutive 5 minute periods.
684+ ActionsEnabled: false
684685 EvaluationPeriods: 4
685686 DatapointsToAlarm: 4
686- Threshold: 10
687+ Threshold: <%= config [ :Threshold ] %>
687688 ComparisonOperator: GreaterThanThreshold
688689 TreatMissingData: notBreaching
689690 Metrics:
@@ -713,20 +714,24 @@ Resources:
713714 Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
714715 Period: 300
715716 Stat: Sum
717+ <% end%>
718+
716719
717- <%= name%> HighErrorRateAlarm:
720+ <% {
721+ TwentyFivePercentErrorRateAlarm : { Threshold : 25 , AlarmName : 'twenty_five_percent_error_rate' } ,
722+ NinetyPercentErrorRateAlarm : { Threshold : 90 , AlarmName : 'ninety_percent_error_rate' } ,
723+ } . each do |alarmTitle , config | -%>
724+ <%= name%> <%= alarmTitle%> :
718725 Type: AWS::CloudWatch::Alarm
719726 Properties:
720- AlarmName: !Sub "${SubDomainName}_build_and_run_ <%= name . downcase%> _lambda_error_rate "
727+ AlarmName: !Sub "${SubDomainName}_ <%= name . downcase%> _ <%= config [ :AlarmName ] %> "
721728 AlarmDescription: Error rate in Javabuilder's <%= name%> build and run lambda (the core of
722- Javabuilder, which executes student <%= name%> code) exceeded 10 % for four
729+ Javabuilder, which executes student <%= name%> code) exceeded <%= config [ :Threshold ] %> % for four
723730 consecutive 5 minute periods.
724- ActionsEnabled: true
725- AlarmActions:
726- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:javabuilder-build-and-run-lambda-error-rate"]
731+ ActionsEnabled: false
727732 EvaluationPeriods: 4
728733 DatapointsToAlarm: 4
729- Threshold: 25
734+ Threshold: <%= config [ :Threshold ] %>
730735 ComparisonOperator: GreaterThanThreshold
731736 TreatMissingData: notBreaching
732737 Metrics:
@@ -767,6 +772,7 @@ Resources:
767772 Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
768773 Period: 300
769774 Stat: TC(89000:)
775+ <% end%>
770776
771777 <%= name%> SlowCleanupTimeAlarm:
772778 Type: AWS::CloudWatch::Alarm
@@ -894,6 +900,111 @@ Resources:
894900 Expression: ANOMALY_DETECTION_BAND(m1, 8)
895901 ThresholdMetricId: ad1
896902
903+ <%= name%> MinimumUsageAlarm:
904+ Type: AWS::CloudWatch::Alarm
905+ Properties:
906+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _minimum_usage"
907+ AlarmDescription: This alarm is to be used as part of a composite alarm, not by itself.
908+ It triggers if the usage is at or above a minimum threshold, so we do not alarm on error
909+ rates if we have very low usage.
910+ ActionsEnabled: false
911+ MetricName: Invocations
912+ Namespace: AWS/Lambda
913+ Statistic: Sum
914+ Dimensions:
915+ - Name: FunctionName
916+ Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
917+ Period: 300
918+ EvaluationPeriods: 4
919+ DatapointsToAlarm: 4
920+ Threshold: 100
921+ ComparisonOperator: GreaterThanOrEqualToThreshold
922+ TreatMissingData: notBreaching
923+
924+ <%= name%> SevereErrorRateAlarm:
925+ Type: AWS::CloudWatch::CompositeAlarm
926+ DependsOn:
927+ - <%= name%> TenPercentSevereErrorRateAlarm
928+ - <%= name%> MinimumUsageAlarm
929+ - <%= name%> ElevatedSevereErrorRateAlarm
930+ Properties:
931+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _severe_error_rate"
932+ AlarmDescription: Alarm if Javabuilder severe error rate exceeds 10% every 5 minutes for 20
933+ minutes and there are at least 100 requests every 5 minutes.
934+ Occasional spikes are expected, but a sustained elevated severe error rate is an indication of an issue.
935+ Severe errors are generated and emitted by our code.
936+ ActionsEnabled: true
937+ AlarmActions:
938+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
939+ AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ten_percent_severe_error_rate) AND
940+ ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
941+ InsufficientDataActions: []
942+ OKActions: []
943+ ActionsSuppressor: !Sub "arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:alarm:${SubDomainName}_<%= name . downcase%> _elevated_severe_error_rate"
944+ ActionsSuppressorWaitPeriod: 120
945+ ActionsSuppressorExtensionPeriod: 120
946+
947+ <%= name%> ElevatedSevereErrorRateAlarm:
948+ Type: AWS::CloudWatch::CompositeAlarm
949+ DependsOn:
950+ - <%= name%> NinetyPercentSevereErrorRateAlarm
951+ - <%= name%> MinimumUsageAlarm
952+ Properties:
953+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _elevated_severe_error_rate"
954+ AlarmDescription: Alarm if Javabuilder severe error rate exceeds 90% every 5 minutes for 20
955+ minutes and there are at least 100 requests every 5 minutes.
956+ Occasional spikes are expected, but a sustained high severe error rate is an indication of an outage.
957+ Severe errors are generated and emitted by our code.
958+ ActionsEnabled: true
959+ AlarmActions:
960+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
961+ AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_severe_error_rate) AND
962+ ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
963+ InsufficientDataActions: []
964+ OKActions: []
965+
966+ <%= name%> ErrorRateAlarm:
967+ Type: AWS::CloudWatch::CompositeAlarm
968+ DependsOn:
969+ - <%= name%> TwentyFivePercentErrorRateAlarm
970+ - <%= name%> MinimumUsageAlarm
971+ - <%= name%> ElevatedErrorRateAlarm
972+ Properties:
973+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _error_rate"
974+ AlarmDescription: Alarm if Javabuilder error rate exceeds 25% every 5 minutes for 20
975+ minutes and there are at least 100 requests every 5 minutes.
976+ Occasional spikes are expected, but a sustained elevated error rate is an indication of an issue.
977+ Errors are generated by the Lambda system.
978+ ActionsEnabled: true
979+ AlarmActions:
980+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
981+ AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _twenty_five_percent_error_rate) AND
982+ ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
983+ InsufficientDataActions: []
984+ OKActions: []
985+ ActionsSuppressor: !Sub "arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:alarm:${SubDomainName}_<%= name . downcase%> _elevated_error_rate"
986+ ActionsSuppressorWaitPeriod: 120
987+ ActionsSuppressorExtensionPeriod: 120
988+
989+ <%= name%> ElevatedErrorRateAlarm:
990+ Type: AWS::CloudWatch::CompositeAlarm
991+ DependsOn:
992+ - <%= name%> NinetyPercentErrorRateAlarm
993+ - <%= name%> MinimumUsageAlarm
994+ Properties:
995+ AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _elevated_error_rate"
996+ AlarmDescription: Alarm if Javabuilder error rate exceeds 90% every 5 minutes for 20
997+ minutes and there are at least 100 requests every 5 minutes.
998+ Occasional spikes are expected, but a sustained high error rate is an indication of an outage.
999+ Errors are generated by the Lambda system.
1000+ ActionsEnabled: true
1001+ AlarmActions:
1002+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
1003+ AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_error_rate) AND
1004+ ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
1005+ InsufficientDataActions: []
1006+ OKActions: []
1007+
8971008<% end -%>
8981009
8991010# We use shortened versions of names for partition keys (eg, user_id),
0 commit comments