@@ -541,134 +541,6 @@ Resources:
541541 ForwardedValues: {QueryString: true}
542542 ViewerProtocolPolicy: redirect-to-https
543543
544- HighConcurrentExecutionsAlarm:
545- Type: AWS::CloudWatch::Alarm
546- Properties:
547- AlarmName: !Sub "${SubDomainName}_high_concurrent_executions"
548- AlarmDescription: !Sub |
549- This will page the DOTD if javabuilder usage exceeds 50 concurrent
550- executions for 10 minutes. Occasional spikes are expected, but
551- long-running high usage is an indication of an attack. Go to the
552- following URLs and set reserved concurrency to 10 immediately
553- <% JAVALAB_APP_TYPES . each do | name | -%>
554- https://console.aws.amazon.com/lambda/home?region=${AWS::Region}#/functions/${BuildAndRunJava<%= name%> ProjectFunction}/edit/concurrency?tab=configure
555- <% end -%>
556- Then post in #ap-csa-dev.
557- ActionsEnabled: true
558- AlarmActions:
559- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
560- EvaluationPeriods: 10
561- DatapointsToAlarm: 10
562- Threshold: 50
563- ComparisonOperator: GreaterThanThreshold
564- TreatMissingData: notBreaching
565- Metrics:
566- - Id: e1
567- Label: Concurrent Executions Across All Lambdas
568- ReturnData: true
569- Expression: SUM(METRICS())
570- <% { Theater : "m2" , Neighborhood : "m3" , Console : "m4" } . each do |name , id | -%>
571- - Id: <%= id%>
572- ReturnData: false
573- MetricStat:
574- Metric:
575- Namespace: AWS/Lambda
576- MetricName: ConcurrentExecutions
577- Dimensions:
578- - Name: FunctionName
579- Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
580- Period: 60
581- Stat: Maximum
582- <% end -%>
583-
584- HighWebsocketConnectionsAlarm:
585- Type: AWS::CloudWatch::Alarm
586- Properties:
587- AlarmName: !Sub "${SubDomainName}_high_websocket_connections"
588- AlarmDescription: Significantly higher websocket connections than normal detected. Investigate if there is a DDOS.
589- ActionsEnabled: false
590- EvaluationPeriods: 20
591- DatapointsToAlarm: 20
592- ComparisonOperator: GreaterThanUpperThreshold
593- TreatMissingData: notBreaching
594- Metrics:
595- - Id: m1
596- ReturnData: true
597- MetricStat:
598- Metric:
599- Namespace: AWS/ApiGateway
600- MetricName: ConnectCount
601- Dimensions:
602- - Name: Stage
603- Value: !Sub "${StageName}"
604- - Name: ApiId
605- Value: !Ref WebSocketApi
606- Period: 60
607- Stat: Sum
608- - Id: ad1
609- Label: ConnectCount (expected)
610- ReturnData: true
611- Expression: ANOMALY_DETECTION_BAND(m1, 8)
612- ThresholdMetricId: ad1
613-
614- HighHttpRequestsAlarm:
615- Type: AWS::CloudWatch::Alarm
616- Properties:
617- AlarmName: !Sub "${SubDomainName}_high_http_requests"
618- AlarmDescription: Significantly higher HTTP requests than normal detected.
619- Investigate if there is a DDOS.
620- ActionsEnabled: true
621- OKActions: []
622- AlarmActions: []
623- InsufficientDataActions: []
624- EvaluationPeriods: 20
625- DatapointsToAlarm: 20
626- ComparisonOperator: GreaterThanUpperThreshold
627- TreatMissingData: notBreaching
628- Metrics:
629- - Id: m1
630- ReturnData: true
631- MetricStat:
632- Metric:
633- Namespace: AWS/ApiGateway
634- MetricName: Count
635- Dimensions:
636- - Name: ApiId
637- Value: !Ref HttpApi
638- Period: 60
639- Stat: Sum
640- - Id: ad1
641- Label: Count (expected)
642- ReturnData: true
643- Expression: ANOMALY_DETECTION_BAND(m1, 8)
644- ThresholdMetricId: ad1
645-
646- HighUsageCompositeAlarm:
647- Type: AWS::CloudWatch::CompositeAlarm
648- DependsOn:
649- - ConsoleHighInvocationsAlarm
650- - HighHttpRequestsAlarm
651- - HighWebsocketConnectionsAlarm
652- - NeighborhoodHighInvocationsAlarm
653- - TheaterHighInvocationsAlarm
654- Properties:
655- ActionsEnabled: true
656- AlarmActions:
657- # TODO: after we have run at high usage for a while, consider re-enabling this alarm. Right now it is too noisy
658- # - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:javabuilder-high-usage"]
659- - !Ref AWS::NoValue
660- AlarmDescription: Send message if abnormally high Javabuilder usage detected.
661- Monitors usage across the HTTP API, WebSocket API, and all Build and Run
662- Lambdas.
663- AlarmName: !Sub "${SubDomainName}_high_usage_composite"
664- AlarmRule: !Sub "ALARM(${SubDomainName}_console_high_invocations) OR
665- ALARM(${SubDomainName}_high_http_requests) OR
666- ALARM(${SubDomainName}_high_websocket_connections) OR
667- ALARM(${SubDomainName}_neighborhood_high_invocations) OR
668- ALARM(${SubDomainName}_theater_high_invocations)"
669- InsufficientDataActions: []
670- OKActions: []
671-
672544<% JAVALAB_APP_TYPES . each do | name | -%>
673545<% {
674546 TenPercentSevereErrorRateAlarm : { Threshold : 10 , AlarmName : 'ten_percent_severe_error_rate' } ,
@@ -871,35 +743,6 @@ Resources:
871743 Threshold: 2500
872744 Period: 60
873745
874- <%= name%> HighInvocationsAlarm:
875- Type: AWS::CloudWatch::Alarm
876- Properties:
877- AlarmName: !Sub "${SubDomainName}_<%= name . downcase%> _high_invocations"
878- AlarmDescription: Significantly higher <%= name%> build and run invocations than
879- normal detected. Investigate if there is a DDOS.
880- ActionsEnabled: false
881- EvaluationPeriods: 20
882- DatapointsToAlarm: 20
883- ComparisonOperator: GreaterThanUpperThreshold
884- TreatMissingData: notBreaching
885- Metrics:
886- - Id: m1
887- ReturnData: true
888- MetricStat:
889- Metric:
890- Namespace: AWS/Lambda
891- MetricName: Invocations
892- Dimensions:
893- - Name: FunctionName
894- Value: !Ref BuildAndRunJava<%= name%> ProjectFunction
895- Period: 60
896- Stat: Sum
897- - Id: ad1
898- Label: Invocations (expected)
899- ReturnData: true
900- Expression: ANOMALY_DETECTION_BAND(m1, 8)
901- ThresholdMetricId: ad1
902-
903746 <%= name%> MinimumUsageAlarm:
904747 Type: AWS::CloudWatch::Alarm
905748 Properties:
@@ -932,7 +775,8 @@ Resources:
932775 AlarmDescription: Alarm if Javabuilder severe error rate exceeds 10% every 5 minutes for 20
933776 minutes and there are at least 100 requests every 5 minutes.
934777 Occasional spikes are expected, but a sustained elevated severe error rate is an indication of an issue.
935- Severe errors are generated and emitted by our code.
778+ Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate
779+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
936780 ActionsEnabled: true
937781 AlarmActions:
938782 - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -954,10 +798,11 @@ Resources:
954798 AlarmDescription: Alarm if Javabuilder severe error rate exceeds 90% every 5 minutes for 20
955799 minutes and there are at least 100 requests every 5 minutes.
956800 Occasional spikes are expected, but a sustained high severe error rate is an indication of an outage.
957- Severe errors are generated and emitted by our code.
801+ Severe errors are generated and emitted by our code. Please follow the instructions in this document to mitigate
802+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
958803 ActionsEnabled: true
959804 AlarmActions:
960- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate "]
805+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent "]
961806 AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_severe_error_rate) AND
962807 ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
963808 InsufficientDataActions: []
@@ -974,7 +819,8 @@ Resources:
974819 AlarmDescription: Alarm if Javabuilder severe error rate exceeds 25% every 5 minutes for 20
975820 minutes and there are at least 100 requests every 5 minutes.
976821 Occasional spikes are expected, but a sustained elevated error rate is an indication of an issue.
977- Errors are generated by the Lambda system.
822+ Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate
823+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
978824 ActionsEnabled: true
979825 AlarmActions:
980826 - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate"]
@@ -996,15 +842,41 @@ Resources:
996842 AlarmDescription: Alarm if Javabuilder error rate exceeds 90% every 5 minutes for 20
997843 minutes and there are at least 100 requests every 5 minutes.
998844 Occasional spikes are expected, but a sustained high error rate is an indication of an outage.
999- Errors are generated by the Lambda system.
845+ Errors are generated by the Lambda system. Please follow the instructions in this document to mitigate
846+ https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.2gh4dxmz643n
1000847 ActionsEnabled: true
1001848 AlarmActions:
1002- - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:Javabuilder-high-error-rate "]
849+ - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent "]
1003850 AlarmRule: !Sub "ALARM(${SubDomainName}_<%= name . downcase%> _ninety_percent_error_rate) AND
1004851 ALARM(${SubDomainName}_<%= name . downcase%> _minimum_usage)"
1005852 InsufficientDataActions: []
1006853 OKActions: []
1007-
854+
# Per-app-type CloudWatch alarm; `name` comes from the enclosing ERB loop, so one
# alarm resource is rendered for each app type's build-and-run Lambda.
# It watches the per-minute Maximum of the AWS/Lambda ConcurrentExecutions metric
# for that function and goes to ALARM only when all 10 of the last 10 one-minute
# datapoints exceed 400. Minutes with no data are treated as not breaching.
<%=name%>HighConcurrentExecutionsAlarm:
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: !Sub "${SubDomainName}_<%=name.downcase%>_high_concurrent_executions"
    # Plain literal block scalar: the text contains no ${...} placeholders, so
    # wrapping it in !Sub is unnecessary (and is flagged by cfn-lint).
    AlarmDescription: |
      Alarm if javabuilder usage exceeds 400 concurrent
      executions for 10 minutes. Occasional spikes are expected, but
      long-running high usage is an indication of an attack. Page the student learning
      team for further investigation. See this doc for investigation steps
      https://docs.google.com/document/d/1bHvV6pvUcwxgZpw0YWBmxFggQL5KqYx9zwolwkZhjU8/edit#bookmark=id.xs1gcuxrw6ze
    ActionsEnabled: true
    AlarmActions:
      # When SilenceAlertsCondition holds, the action is omitted (AWS::NoValue);
      # otherwise the alarm notifies the CDO-Urgent SNS topic in this account/region.
      - !If [SilenceAlertsCondition, !Ref AWS::NoValue, !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CDO-Urgent"]
    # 10 of 10 periods of 60 seconds each: a sustained 10-minute breach is required.
    EvaluationPeriods: 10
    DatapointsToAlarm: 10
    Period: 60
    # Keep in sync with the "400 concurrent executions" wording in AlarmDescription.
    Threshold: 400
    ComparisonOperator: GreaterThanThreshold
    TreatMissingData: notBreaching
    MetricName: ConcurrentExecutions
    Namespace: AWS/Lambda
    Statistic: Maximum
    Dimensions:
      - Name: FunctionName
        Value: !Ref BuildAndRunJava<%=name%>ProjectFunction
1008880<% end -%>
1009881
1010882# We use shortened versions of names for partition keys (eg, user_id),
0 commit comments