Skip to content

Commit e81ffb7

Browse files
committed
Update API statuses and replica counts
1 parent 06faa36 commit e81ffb7

File tree

7 files changed

+207
-148
lines changed

7 files changed

+207
-148
lines changed

cli/cmd/get.go

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -428,33 +428,56 @@ func describeAPI(name string, resourcesRes *schema.GetResourcesResponse) (string
428428
ctx := resourcesRes.Context
429429
api := ctx.APIs[name]
430430

431-
var staleReplicas int32
432431
var ctxAPIStatus *resource.APIStatus
432+
if api != nil {
433+
ctxAPIStatus = resourcesRes.APIStatuses[api.ID]
434+
}
435+
433436
var anyAPIStatus *resource.APIStatus
434437
for _, apiStatus := range resourcesRes.APIStatuses {
435-
if apiStatus.APIName != name {
436-
continue
437-
}
438-
anyAPIStatus = apiStatus
439-
if api != nil && apiStatus.ResourceID == api.ID {
440-
ctxAPIStatus = apiStatus
438+
if apiStatus.APIName == name {
439+
anyAPIStatus = apiStatus
440+
break
441441
}
442-
staleReplicas += apiStatus.TotalStaleReady()
443442
}
444443

445-
out := titleStr("Summary")
446-
out += "Status: " + groupStatus.Message() + "\n"
444+
var requestedReplicas int32
447445
if ctxAPIStatus != nil {
448-
out += fmt.Sprintf("Up-to-date replicas: %d ready\n", ctxAPIStatus.ReadyUpdated)
449-
}
450-
if staleReplicas != 0 {
451-
out += fmt.Sprintf("Stale replicas: %d ready\n", staleReplicas)
446+
requestedReplicas = api.Compute.InitReplicas
447+
if ctxAPIStatus.K8sRequested > 0 {
448+
requestedReplicas = ctxAPIStatus.K8sRequested
449+
}
450+
if requestedReplicas < api.Compute.MinReplicas {
451+
requestedReplicas = api.Compute.MinReplicas
452+
}
452453
}
453-
out += "Created at: " + libtime.LocalTimestamp(groupStatus.Start) + "\n"
454+
455+
refreshedAt := groupStatus.Start
454456
if groupStatus.ActiveStatus != nil && groupStatus.ActiveStatus.Start != nil {
455-
out += "Refreshed at: " + libtime.LocalTimestamp(groupStatus.ActiveStatus.Start) + "\n"
457+
refreshedAt = groupStatus.ActiveStatus.Start
456458
}
457459

460+
var staleComputeStr = ""
461+
if groupStatus.ReadyStaleCompute != 0 {
462+
hasStr := "has"
463+
if groupStatus.ReadyStaleCompute > 1 {
464+
hasStr = "have"
465+
}
466+
staleComputeStr = fmt.Sprintf(" (%s %s previous compute)", s.Int32(groupStatus.ReadyStaleCompute), hasStr)
467+
}
468+
469+
out := titleStr("Summary")
470+
out += fmt.Sprintf("Status: %s\n", groupStatus.Message())
471+
out += "\n"
472+
out += fmt.Sprintf("Available replicas: %s\n", s.Int32(groupStatus.Available()))
473+
out += fmt.Sprintf(" - Current model: %s%s\n", s.Int32(groupStatus.UpToDate()), staleComputeStr)
474+
out += fmt.Sprintf(" - Previous model: %s\n", s.Int32(groupStatus.ReadyStaleModel))
475+
out += fmt.Sprintf("Requested replicas: %s\n", s.Int32(requestedReplicas))
476+
out += fmt.Sprintf("Failed replicas: %s\n", s.Int32(groupStatus.FailedUpdated))
477+
out += "\n"
478+
out += fmt.Sprintf("Created at: %s\n", libtime.LocalTimestamp(groupStatus.Start))
479+
out += fmt.Sprintf("Refreshed at: %s\n", libtime.LocalTimestamp(refreshedAt))
480+
458481
out += titleStr("Endpoint")
459482
out += "URL: " + urls.Join(resourcesRes.APIsBaseURL, anyAPIStatus.Path) + "\n"
460483
out += "Method: POST\n"

cli/cmd/predict.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ var predictCmd = &cobra.Command{
8181
predictResponse, err := makePredictRequest(apiURL, samplesJSONPath)
8282
if err != nil {
8383
if strings.Contains(err.Error(), "503 Service Temporarily Unavailable") || strings.Contains(err.Error(), "502 Bad Gateway") {
84-
errors.Exit(ErrorAPINotReady(apiName, resource.StatusUpdating.Message()))
84+
errors.Exit(ErrorAPINotReady(apiName, resource.StatusCreating.Message()))
8585
}
8686
errors.Exit(err)
8787
}

pkg/operator/api/resource/status.go

Lines changed: 52 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type DataStatus struct {
2525
Code StatusCode `json:"status_code"`
2626
}
2727

28+
// There is one APIStatus per API resource ID (including stale/removed models). There is always an APIStatus for APIs currently in the context.
2829
type APIStatus struct {
2930
APISavedStatus
3031
Path string `json:"path"`
@@ -37,36 +38,51 @@ type APIStatus struct {
3738
}
3839

3940
type ReplicaCounts struct {
40-
ReadyUpdated int32 `json:"ready_updated"`
41-
ReadyStaleCompute int32 `json:"ready_stale_compute"`
42-
ReadyStaleResource int32 `json:"ready_stale_resource"`
43-
FailedUpdated int32 `json:"failed_updated"`
44-
FailedStaleCompute int32 `json:"failed_stale_compute"`
45-
FailedStaleResource int32 `json:"failed_stale_resource"`
41+
ReadyUpdatedCompute int32 `json:"ready_updated_compute"`
42+
ReadyStaleCompute int32 `json:"ready_stale_compute"`
43+
FailedUpdatedCompute int32 `json:"failed_updated_compute"`
44+
FailedStaleCompute int32 `json:"failed_stale_compute"`
45+
K8sRequested int32 `json:"k8s_requested"` // Number of requested replicas in an active k8s.deployment for this resource ID
4646
}
4747

48+
// There is one APIGroupStatus per API name/endpoint
4849
type APIGroupStatus struct {
49-
APIName string `json:"api_name"`
50-
Start *time.Time `json:"start"`
51-
ActiveStatus *APIStatus `json:"active_status"`
52-
Code StatusCode `json:"status_code"`
50+
APIName string `json:"api_name"`
51+
Start *time.Time `json:"start"`
52+
ActiveStatus *APIStatus `json:"active_status"` // The most recently ready API status, or the ctx API status if it's ready
53+
Code StatusCode `json:"status_code"`
54+
GroupedReplicaCounts `json:"grouped_replica_counts"`
55+
}
56+
57+
type GroupedReplicaCounts struct {
58+
ReadyUpdated int32 `json:"ready_updated"` // Updated means the replica is fully up-to-date (compute and model match the API's current resource ID in the context)
59+
ReadyStaleModel int32 `json:"ready_stale_model"` // Stale model means the replica is serving a model which is not currently in the context (either it was updated or removed)
60+
ReadyStaleCompute int32 `json:"ready_stale_compute"` // Stale compute means the replica is serving the correct model, but the compute request has changed
61+
FailedUpdated int32 `json:"failed_updated"`
62+
FailedStaleModel int32 `json:"failed_stale_model"`
63+
FailedStaleCompute int32 `json:"failed_stale_compute"`
5364
}
5465

5566
type Status interface {
5667
Message() string
5768
GetCode() StatusCode
5869
}
5970

60-
func (replicaCounts *ReplicaCounts) TotalReady() int32 {
61-
return replicaCounts.ReadyUpdated + replicaCounts.ReadyStaleCompute + replicaCounts.ReadyStaleResource
71+
func (rc *ReplicaCounts) TotalReady() int32 {
72+
return rc.ReadyUpdatedCompute + rc.ReadyStaleCompute
73+
}
74+
75+
func (rc *ReplicaCounts) TotalFailed() int32 {
76+
return rc.FailedUpdatedCompute + rc.FailedStaleCompute
6277
}
6378

64-
func (replicaCounts *ReplicaCounts) TotalStaleReady() int32 {
65-
return replicaCounts.ReadyStaleCompute + replicaCounts.ReadyStaleResource
79+
func (grc *GroupedReplicaCounts) Available() int32 {
80+
return grc.ReadyUpdated + grc.ReadyStaleModel + grc.ReadyStaleCompute
6681
}
6782

68-
func (replicaCounts *ReplicaCounts) TotalStale() int32 {
69-
return replicaCounts.ReadyStaleCompute + replicaCounts.ReadyStaleResource + replicaCounts.FailedStaleCompute + replicaCounts.FailedStaleResource
83+
// UpToDate returns the number of ready replicas serving the up-to-date model (includes those with stale compute)
84+
func (grc *GroupedReplicaCounts) UpToDate() int32 {
85+
return grc.ReadyUpdated + grc.ReadyStaleCompute
7086
}
7187

7288
func (status *DataStatus) GetCode() StatusCode {
@@ -91,26 +107,21 @@ const (
91107
StatusPendingCompute
92108
StatusWaiting // Resource can be created based on resource DAG, but hasn't started yet
93109
StatusSkipped
110+
StatusError
94111
StatusParentFailed
95112
StatusParentKilled
96113
StatusKilledOOM
97114

98115
// Data statuses
99116
StatusRunning
100117
StatusSucceeded
101-
StatusFailed
102118
StatusKilled
103119

104120
// API statuses
105-
StatusUpdating
106-
StatusReady
121+
StatusCreating
122+
StatusLive
107123
StatusStopping
108124
StatusStopped
109-
StatusError
110-
111-
// Additional API group statuses (i.e. aggregated API status)
112-
StatusPendingUpdate
113-
StatusUpdateSkipped
114125
)
115126

116127
var statusCodes = []string{
@@ -120,26 +131,22 @@ var statusCodes = []string{
120131
"status_pending_compute",
121132
"status_waiting",
122133
"status_skipped",
134+
"status_error",
123135
"status_parent_failed",
124136
"status_parent_killed",
125137
"status_killed_oom",
126138

127139
"status_running",
128140
"status_succeeded",
129-
"status_failed",
130141
"status_killed",
131142

132-
"status_updating",
133-
"status_ready",
143+
"status_creating",
144+
"status_live",
134145
"status_stopping",
135146
"status_stopped",
136-
"status_error",
137-
138-
"status_pending_update",
139-
"status_update_skipped",
140147
}
141148

142-
var _ = [1]int{}[int(StatusUpdateSkipped)-(len(statusCodes)-1)] // Ensure list length matches
149+
var _ = [1]int{}[int(StatusStopped)-(len(statusCodes)-1)] // Ensure list length matches
143150

144151
var statusCodeMessages = []string{
145152
"unknown", // StatusUnknown
@@ -148,27 +155,22 @@ var statusCodeMessages = []string{
148155
"compute unavailable", // StatusPendingCompute
149156
"pending", // StatusWaiting
150157
"skipped", // StatusSkipped
158+
"error", // StatusError
151159
"upstream error", // StatusParentFailed
152160
"upstream termination", // StatusParentKilled
153161
"terminated (out of mem)", // StatusKilledOOM
154162

155-
"running", // StatusDataRunning
156-
"ready", // StatusDataSucceeded
157-
"error", // StatusDataFailed
158-
"terminated", // StatusDataKilled
159-
160-
"updating", // StatusAPIUpdating
161-
"ready", // StatusAPIReady
162-
"stopping", // StatusAPIStopping
163-
"stopped", // StatusAPIStopped
164-
"error", // StatusAPIError
165-
166-
"update pending", // StatusAPIGroupPendingUpdate
167-
"update skipped", // StatusAPIGroupUpdateSkipped
163+
"running", // StatusRunning
164+
"ready", // StatusSucceeded
165+
"terminated", // StatusKilled
168166

167+
"creating", // StatusCreating
168+
"live", // StatusLive
169+
"stopping", // StatusStopping
170+
"stopped", // StatusStopped
169171
}
170172

171-
var _ = [1]int{}[int(StatusUpdateSkipped)-(len(statusCodeMessages)-1)] // Ensure list length matches
173+
var _ = [1]int{}[int(StatusStopped)-(len(statusCodeMessages)-1)] // Ensure list length matches
172174

173175
// StatusDataRunning aliases
174176
const (
@@ -186,26 +188,22 @@ var statusSortBuckets = []int{
186188
4, // StatusPendingCompute
187189
4, // StatusWaiting
188190
2, // StatusSkipped
191+
1, // StatusError
189192
2, // StatusParentFailed
190193
2, // StatusParentKilled
191194
1, // StatusKilledOOM
192195

193196
3, // StatusRunning
194197
0, // StatusSucceeded
195-
1, // StatusFailed
196198
1, // StatusKilled
197199

198-
3, // StatusUpdating
199-
0, // StatusReady
200+
3, // StatusCreating
201+
0, // StatusLive
200202
3, // StatusStopping
201203
1, // StatusStopped
202-
1, // StatusError
203-
204-
0, // StatusPendingUpdate
205-
2, // StatusUpdateSkipped
206204
}
207205

208-
var _ = [1]int{}[int(StatusUpdateSkipped)-(len(statusSortBuckets)-1)] // Ensure list length matches
206+
var _ = [1]int{}[int(StatusStopped)-(len(statusSortBuckets)-1)] // Ensure list length matches
209207

210208
func (code StatusCode) String() string {
211209
if int(code) < 0 || int(code) >= len(statusCodes) {

pkg/operator/endpoints/resources.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,17 @@ func GetResources(w http.ResponseWriter, r *http.Request) {
4040
return
4141
}
4242

43-
apiStatuses, err := workloads.GetCurrentAPIStatuses(ctx, dataStatuses)
43+
deployments, err := workloads.APIDeploymentMap(ctx.App.Name)
4444
if RespondIfError(w, err) {
4545
return
4646
}
4747

48-
apiGroupStatuses, err := workloads.GetAPIGroupStatuses(apiStatuses, ctx)
48+
apiStatuses, err := workloads.GetCurrentAPIStatuses(dataStatuses, deployments, ctx)
49+
if RespondIfError(w, err) {
50+
return
51+
}
52+
53+
apiGroupStatuses, err := workloads.GetAPIGroupStatuses(apiStatuses, deployments, ctx)
4954
if RespondIfError(w, err) {
5055
return
5156
}

pkg/operator/workloads/api.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ func hpaSpec(ctx *context.Context, api *context.API) *autoscaling.HorizontalPodA
219219
func apiWorkloadSpecs(ctx *context.Context) ([]*WorkloadSpec, error) {
220220
var workloadSpecs []*WorkloadSpec
221221

222-
deployments, err := apiDeploymentMap(ctx.App.Name)
222+
deployments, err := APIDeploymentMap(ctx.App.Name)
223223
if err != nil {
224224
return nil, err
225225
}
@@ -365,7 +365,7 @@ func createServicesAndIngresses(ctx *context.Context) error {
365365
}
366366

367367
// This returns map apiName -> deployment (not internalName -> deployment)
368-
func apiDeploymentMap(appName string) (map[string]*appsv1b1.Deployment, error) {
368+
func APIDeploymentMap(appName string) (map[string]*appsv1b1.Deployment, error) {
369369
deploymentList, err := config.Kubernetes.ListDeploymentsByLabels(map[string]string{
370370
"appName": appName,
371371
"workloadType": WorkloadTypeAPI,
@@ -428,6 +428,19 @@ func APIsBaseURL() (string, error) {
428428
return "https://" + service.Status.LoadBalancer.Ingress[0].Hostname, nil
429429
}
430430

431+
func APIPodComputeID(containers []corev1.Container) string {
432+
cpu, mem, gpu := APIPodCompute(containers)
433+
if cpu == nil {
434+
cpu = &userconfig.Quantity{} // unexpected, since the default is 200m and 0 is disallowed
435+
}
436+
podAPICompute := userconfig.APICompute{
437+
CPU: *cpu,
438+
Mem: mem,
439+
GPU: gpu,
440+
}
441+
return podAPICompute.IDWithoutReplicas()
442+
}
443+
431444
func APIPodCompute(containers []corev1.Container) (*userconfig.Quantity, *userconfig.Quantity, int64) {
432445
var totalCPU *userconfig.Quantity
433446
var totalMem *userconfig.Quantity

0 commit comments

Comments
 (0)