@@ -18,6 +18,7 @@ package clusterstate
1818
1919import (
2020 "fmt"
21+ "time"
2122
2223 "github.com/aws/aws-sdk-go/service/cloudformation"
2324 "github.com/cortexlabs/cortex/pkg/lib/aws"
@@ -42,8 +43,16 @@ type ClusterState struct {
4243 Status Status
4344}
4445
45- func any ( statuses [] string , allowedStatuses ... string ) bool {
// is reports whether status equals allowedStatus or any of allowedStatuses.
//
// The signature requires at least one allowed status, so a call with an
// empty allow-list does not compile. Membership is checked with direct
// comparisons: the allow-list is only a few values and is consulted once,
// so building a set for it would just add an allocation.
func is(status string, allowedStatus string, allowedStatuses ...string) bool {
	if status == allowedStatus {
		return true
	}

	for _, allowed := range allowedStatuses {
		if status == allowed {
			return true
		}
	}

	return false
}
52+
53+ func any (statuses []string , allowedStatus string , allowedStatuses ... string ) bool {
54+ statusSet := strset .New (allowedStatuses ... )
55+ statusSet .Add (allowedStatus )
4756 for _ , stackStatus := range statuses {
4857 if statusSet .Has (stackStatus ) {
4958 return true
@@ -53,8 +62,9 @@ func any(statuses []string, allowedStatuses ...string) bool {
5362 return false
5463}
5564
56- func all (statuses []string , allowedStatuses ... string ) bool {
65+ func all (statuses []string , allowedStatus string , allowedStatuses ... string ) bool {
5766 statusSet := strset .New (allowedStatuses ... )
67+ statusSet .Add (allowedStatus )
5868 for _ , stackStatus := range statuses {
5969 if ! statusSet .Has (stackStatus ) {
6070 return false
@@ -76,9 +86,8 @@ func (cs ClusterState) TableString() string {
7686
7787func getStatus (statusMap map [string ]string , controlPlane string ) (Status , error ) {
7888 // the order matters
79-
8089 allStatuses := []string {}
81- controlPlaneStatus := [] string { statusMap [controlPlane ]}
90+ controlPlaneStatus := statusMap [controlPlane ]
8291 nodeGroupStatuses := []string {}
8392
8493 for stackName , status := range statusMap {
@@ -88,6 +97,19 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
8897 }
8998 }
9099
100+ if any (allStatuses , string (StatusCreateFailedTimedOut )) {
101+ return StatusNotFound , ErrorUnexpectedCloudFormationStatus (s .ObjFlat (statusMap ))
102+ }
103+
104+ if len (nodeGroupStatuses ) == 0 && controlPlaneStatus == string (StatusNotFound ) {
105+ return StatusNotFound , nil
106+ }
107+
108+ // controlplane stack may be created while nodegroup stacks aren't listed in cloudformation stacks during cluster spin up
109+ if len (nodeGroupStatuses ) == 0 && is (controlPlaneStatus , cloudformation .StackStatusCreateComplete , cloudformation .StackStatusCreateInProgress ) {
110+ return StatusCreateInProgress , nil
111+ }
112+
91113 if any (allStatuses , cloudformation .StackStatusCreateFailed ) {
92114 return StatusCreateFailed , nil
93115 }
@@ -96,8 +118,8 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
96118 return StatusDeleteFailed , nil
97119 }
98120
99- if all (allStatuses , string ( StatusNotFound ) ) {
100- return StatusCreateComplete , nil
121+ if any (allStatuses , cloudformation . StackStatusDeleteInProgress ) {
122+ return StatusDeleteInProgress , nil
101123 }
102124
103125 if all (allStatuses , cloudformation .StackStatusCreateComplete ) {
@@ -108,45 +130,54 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
108130 return StatusDeleteComplete , nil
109131 }
110132
111- if any (allStatuses , cloudformation .StackStatusDeleteInProgress ) {
133+ // nodegroup stacks are deleted first while control plane stack is still in create complete state
134+ if controlPlaneStatus == cloudformation .StackStatusCreateComplete &&
135+ all (nodeGroupStatuses , cloudformation .StackStatusDeleteInProgress , cloudformation .StackStatusDeleteComplete ) {
112136 return StatusDeleteInProgress , nil
113137 }
114138
115- // controlplane stack may be in complete state while nodegroup stacks are still in status not found
116- if all ( controlPlaneStatus , cloudformation . StackStatusCreateComplete , cloudformation .StackStatusCreateInProgress ) &&
117- all (nodeGroupStatuses , cloudformation .StackStatusCreateInProgress , string ( StatusNotFound ), cloudformation .StackStatusCreateComplete ) {
139+ // the control plane stack may be in create complete state while nodegroup stacks are still being created, or one nodegroup stack may finish before the other
140+ if controlPlaneStatus == cloudformation .StackStatusCreateComplete &&
141+ all (nodeGroupStatuses , cloudformation .StackStatusCreateInProgress , cloudformation .StackStatusCreateComplete ) {
118142 return StatusCreateInProgress , nil
119143 }
120144
121145 return StatusNotFound , ErrorUnexpectedCloudFormationStatus (s .ObjFlat (statusMap ))
122146}
123147
124- func GetClusterState (awsClient * aws.Client , clusterConfig * clusterconfig.Config ) (* ClusterState , error ) {
125- controlPlaneStackName := fmt .Sprintf (controlPlaneTemplate , clusterConfig .ClusterName )
126- operatorStackName := fmt .Sprintf (operatorTemplate , clusterConfig .ClusterName )
127- spotStackName := fmt .Sprintf (spotTemplate , clusterConfig .ClusterName )
128- onDemandStackName := fmt .Sprintf (onDemandTemplate , clusterConfig .ClusterName )
129-
130- nodeGroupStackNames := []string {operatorStackName }
131- if clusterConfig .Spot != nil && * clusterConfig .Spot {
132- nodeGroupStackNames = append (nodeGroupStackNames , spotStackName )
133- if clusterConfig .SpotConfig != nil && clusterConfig .SpotConfig .OnDemandBackup != nil && * clusterConfig .SpotConfig .OnDemandBackup {
134- nodeGroupStackNames = append (nodeGroupStackNames , onDemandStackName )
135- }
136- } else {
137- nodeGroupStackNames = append (nodeGroupStackNames , onDemandStackName )
138- }
148+ func GetClusterState (awsClient * aws.Client , accessConfig * clusterconfig.AccessConfig ) (* ClusterState , error ) {
149+ controlPlaneStackName := fmt .Sprintf (controlPlaneTemplate , * accessConfig .ClusterName )
150+ operatorStackName := fmt .Sprintf (operatorTemplate , * accessConfig .ClusterName )
151+ spotStackName := fmt .Sprintf (spotTemplate , * accessConfig .ClusterName )
152+ onDemandStackName := fmt .Sprintf (onDemandTemplate , * accessConfig .ClusterName )
139153
140- stackSummaries , err := awsClient .ListEKSStacks (controlPlaneStackName , nodeGroupStackNames ... )
154+ nodeGroupStackNamesSet := strset .New (operatorStackName , spotStackName , onDemandStackName )
155+
156+ stackSummaries , err := awsClient .ListEKSStacks (controlPlaneStackName , nodeGroupStackNamesSet )
141157 if err != nil {
142158 return nil , errors .Wrap (err , "unable to get cluster state from cloudformation" )
143159 }
144160
145161 statusMap := map [string ]string {}
146- statusMap [controlPlaneStackName ] = getStatusFromSummaries (stackSummaries , controlPlaneStackName )
162+ nodeGroupStackNames := []string {}
163+ var controlPlaneCreationTime time.Time
164+
165+ for _ , stackSummary := range stackSummaries {
166+ statusMap [* stackSummary .StackName ] = * stackSummary .StackStatus
167+ if * stackSummary .StackName == controlPlaneStackName {
168+ controlPlaneCreationTime = * stackSummary .CreationTime
169+ } else {
170+ nodeGroupStackNames = append (nodeGroupStackNames , * stackSummary .StackName )
171+ }
172+ }
147173
148- for _ , nodeGroupName := range nodeGroupStackNames {
149- statusMap [nodeGroupName ] = getStatusFromSummaries (stackSummaries , nodeGroupName )
174+ if _ , ok := statusMap [controlPlaneStackName ]; ! ok {
175+ statusMap [controlPlaneStackName ] = string (StatusNotFound )
176+ }
177+
178+ // add a timeout for situations where the control plane is listed in the cloudformation stacks but not the nodegroup stacks
179+ if ! is (statusMap [controlPlaneStackName ], string (StatusNotFound ), cloudformation .StackStatusDeleteComplete ) && len (nodeGroupStackNames ) == 0 && time .Now ().After (controlPlaneCreationTime .Add (30 * time .Minute )) {
180+ statusMap [operatorStackName ] = string (StatusCreateFailedTimedOut )
150181 }
151182
152183 status , err := getStatus (statusMap , controlPlaneStackName )
@@ -161,13 +192,3 @@ func GetClusterState(awsClient *aws.Client, clusterConfig *clusterconfig.Config)
161192 Status : status ,
162193 }, nil
163194}
164-
165- func getStatusFromSummaries (stackSummaries []* cloudformation.StackSummary , stackName string ) string {
166- for _ , stackSummary := range stackSummaries {
167- if * stackSummary .StackName == stackName {
168- return * stackSummary .StackStatus
169- }
170- }
171-
172- return string (StatusNotFound )
173- }
0 commit comments