Skip to content

Commit 2849c58

Browse files
authored
Add cloudwatch dashboard (#1054)
1 parent 925e5f2 commit 2849c58

File tree

16 files changed

+466
-126
lines changed

16 files changed

+466
-126
lines changed

cli/cmd/cluster.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"github.com/cortexlabs/cortex/cli/cluster"
2626
"github.com/cortexlabs/cortex/cli/types/cliconfig"
27+
"github.com/cortexlabs/cortex/pkg/consts"
2728
"github.com/cortexlabs/cortex/pkg/lib/aws"
2829
cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
2930
"github.com/cortexlabs/cortex/pkg/lib/console"
@@ -155,6 +156,11 @@ var _upCmd = &cobra.Command{
155156
exit.Error(err)
156157
}
157158

159+
err = createDashboard(awsClient, clusterConfig.ClusterName)
160+
if err != nil {
161+
exit.Error(err)
162+
}
163+
158164
out, exitCode, err := runManagerUpdateCommand("/root/install.sh", clusterConfig, awsCreds, _flagClusterEnv)
159165
if err != nil {
160166
exit.Error(err)
@@ -313,6 +319,11 @@ var _downCmd = &cobra.Command{
313319
prompt.YesOrExit(fmt.Sprintf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted, are you sure you want to continue?", *accessConfig.ClusterName, *accessConfig.Region), "", "")
314320
}
315321

322+
err = awsClient.DeleteDashboard(*accessConfig.ClusterName)
323+
if err != nil {
324+
exit.Error(err)
325+
}
326+
316327
out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsCreds, _flagClusterEnv)
317328
if err != nil {
318329
exit.Error(err)
@@ -723,3 +734,26 @@ func CreateLogGroupIfNotFound(awsClient *aws.Client, logGroup string) error {
723734

724735
return nil
725736
}
737+
738+
// createDashboard creates a new dashboard (or clears an existing one if it already exists)
739+
func createDashboard(awsClient *aws.Client, dashboardName string) error {
740+
dashboardFound, err := awsClient.DoesDashboardExist(dashboardName)
741+
if err != nil {
742+
return err
743+
}
744+
745+
if dashboardFound {
746+
fmt.Print("○ using existing cloudwatch dashboard: ", dashboardName)
747+
} else {
748+
fmt.Print("○ creating cloudwatch dashboard: ", dashboardName)
749+
}
750+
751+
err = awsClient.CreateDashboard(dashboardName, consts.DashboardTitle)
752+
if err != nil {
753+
return err
754+
}
755+
756+
fmt.Println(" ✓")
757+
758+
return nil
759+
}

cli/cmd/get.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,11 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) {
323323
if env.Provider == types.AWSProviderType {
324324
apiEndpoint = strings.Replace(urls.Join(apiRes.BaseURL, *api.Endpoint), "https://", "http://", 1)
325325
}
326+
327+
if apiRes.DashboardURL != "" {
328+
out += "\n" + console.Bold("metrics dashboard: ") + apiRes.DashboardURL + "\n"
329+
}
330+
326331
out += "\n" + console.Bold("endpoint: ") + apiEndpoint
327332

328333
out += fmt.Sprintf("\n%s curl %s -X POST -H \"Content-Type: application/json\" -d @sample.json\n", console.Bold("curl:"), apiEndpoint)

docs/guides/metrics.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# View API metrics
2+
3+
_WARNING: you are on the master branch, please refer to the docs on the branch that matches your `cortex version`_
4+
5+
The `cortex get` and `cortex get API_NAME` commands display the request time (averaged over the past 2 weeks) and response code counts (summed over the past 2 weeks) for your API(s):
6+
7+
```text
8+
$ cx get
9+
10+
env api status up-to-date requested last update avg request 2XX
11+
aws iris-classifier live 1 1 17m 24ms 1223
12+
aws text-generator live 1 1 8m 180ms 433
13+
```
14+
15+
The `cortex get API_NAME` command also provides a link to a CloudWatch Metrics dashboard containing this information:
16+
17+
![dashboard](https://user-images.githubusercontent.com/808475/82497902-ed4fd400-9aa3-11ea-8280-20dc6430215f.png)
18+
19+
**responses per minute**
20+
21+
Shows the number of 2XX, 4XX, and 5XX responses per minute.
22+
23+
**total in-flight requests**
24+
25+
Shows the total number of in-flight requests in the cluster.
26+
27+
Note: This is a sum over 10 second intervals because each replica reports its in-flight requests once per 10 seconds. This plot is only available for the last 3 hours (because second-granular data is aggregated to minute-granular data after 3 hours). To plot data older than 3 hours, instead sum over 1 minute, and divide the y-axis by 6 to determine the number of in-flight requests (since the metrics are reported every 10 seconds).
28+
29+
**median response time**
30+
31+
Shows the median response time for requests, over 1-minute periods (measured in milliseconds).
32+
33+
**p99 response time**
34+
35+
Shows the p99 response time for requests, over 1-minute periods (measured in milliseconds).

docs/guides/plot-in-flight-requests.md

Lines changed: 0 additions & 37 deletions
This file was deleted.

docs/guides/plot-request-time.md

Lines changed: 0 additions & 29 deletions
This file was deleted.

docs/guides/plot-response-code-counts.md

Lines changed: 0 additions & 29 deletions
This file was deleted.

docs/summary.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,9 @@
4747

4848
## Guides
4949

50+
* [View API metrics](guides/metrics.md)
5051
* [Set up AWS API gateway](guides/api-gateway.md)
5152
* [Set up HTTPS on a subdomain](guides/subdomain-https-setup.md)
52-
* [Plot response code counts](guides/plot-response-code-counts.md)
53-
* [Plot API request time](guides/plot-request-time.md)
54-
* [Plot in-flight requests](guides/plot-in-flight-requests.md)
5553
* [Set up VPC peering](guides/vpc-peering.md)
5654
* [Add a batch runner API](guides/batch-runner.md)
5755
* [SSH into worker instance](guides/ssh-instance.md)

pkg/consts/consts.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ var (
4545
)
4646

4747
MaxClassesPerMonitoringRequest = 20 // cloudwatch.GetMetricData can get up to 100 metrics per request, avoid multiple requests and have room for other stats
48+
DashboardTitle = "# cortex monitoring dashboard"
4849
)
4950

5051
func defaultDockerImage(imageName string) string {

pkg/lib/aws/clients.go

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,19 @@ import (
3232
)
3333

3434
type clients struct {
35-
s3 *s3.S3
36-
s3Uploader *s3manager.Uploader
37-
s3Downloader *s3manager.Downloader
38-
sts *sts.STS
39-
ec2 *ec2.EC2
40-
ecr *ecr.ECR
41-
acm *acm.ACM
42-
autoscaling *autoscaling.AutoScaling
43-
cloudWatchLogs *cloudwatchlogs.CloudWatchLogs
44-
cloudWatchMetrics *cloudwatch.CloudWatch
45-
serviceQuotas *servicequotas.ServiceQuotas
46-
cloudFormation *cloudformation.CloudFormation
47-
iam *iam.IAM
35+
s3 *s3.S3
36+
s3Uploader *s3manager.Uploader
37+
s3Downloader *s3manager.Downloader
38+
sts *sts.STS
39+
ec2 *ec2.EC2
40+
ecr *ecr.ECR
41+
acm *acm.ACM
42+
autoscaling *autoscaling.AutoScaling
43+
cloudWatchLogs *cloudwatchlogs.CloudWatchLogs
44+
cloudWatch *cloudwatch.CloudWatch
45+
serviceQuotas *servicequotas.ServiceQuotas
46+
cloudFormation *cloudformation.CloudFormation
47+
iam *iam.IAM
4848
}
4949

5050
func (c *Client) S3() *s3.S3 {
@@ -117,11 +117,11 @@ func (c *Client) CloudWatchLogs() *cloudwatchlogs.CloudWatchLogs {
117117
return c.clients.cloudWatchLogs
118118
}
119119

120-
func (c *Client) CloudWatchMetrics() *cloudwatch.CloudWatch {
121-
if c.clients.cloudWatchMetrics == nil {
122-
c.clients.cloudWatchMetrics = cloudwatch.New(c.sess)
120+
func (c *Client) CloudWatch() *cloudwatch.CloudWatch {
121+
if c.clients.cloudWatch == nil {
122+
c.clients.cloudWatch = cloudwatch.New(c.sess)
123123
}
124-
return c.clients.cloudWatchMetrics
124+
return c.clients.cloudWatch
125125
}
126126

127127
func (c *Client) ServiceQuotas() *servicequotas.ServiceQuotas {

0 commit comments

Comments
 (0)