@@ -132,6 +132,7 @@ import { EnvVarService, ResolvedEnvVars } from "../user/env-var-service";
132132import { RedlockAbortSignal } from "redlock" ;
133133import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server" ;
134134import { ConfigProvider } from "./config-provider" ;
135+ import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc" ;
135136
136137export interface StartWorkspaceOptions extends GitpodServer . StartWorkspaceOptions {
137138 excludeFeatureFlags ?: NamedWorkspaceFeatureFlag [ ] ;
@@ -559,6 +560,7 @@ export class WorkspaceStarter {
559560 additionalAuth ,
560561 forceRebuild ,
561562 forceRebuild ,
563+ abortSignal ,
562564 region ,
563565 ) ;
564566
@@ -579,23 +581,23 @@ export class WorkspaceStarter {
579581 startRequest . setSpec ( spec ) ;
580582 startRequest . setServicePrefix ( workspace . id ) ;
581583
582- if ( instance . status . phase === "pending" ) {
583- // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
584- const workspaceAlreadyExists = await this . existsWithWsManager ( ctx , instance ) ;
585- if ( workspaceAlreadyExists ) {
586- log . debug (
587- { instanceId : instance . id , workspaceId : instance . workspaceId } ,
588- "workspace already exists, not starting again" ,
589- { phase : instance . status . phase } ,
590- ) ;
591- return ;
592- }
593- }
594-
595584 // choose a cluster and start the instance
596585 let resp : StartWorkspaceResponse . AsObject | undefined = undefined ;
597586 let retries = 0 ;
598587 try {
588+ if ( instance . status . phase === "pending" ) {
589+ // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
590+ const workspaceAlreadyExists = await this . existsWithWsManager ( ctx , instance ) ;
591+ if ( workspaceAlreadyExists ) {
592+ log . debug (
593+ { instanceId : instance . id , workspaceId : instance . workspaceId } ,
594+ "workspace already exists, not starting again" ,
595+ { phase : instance . status . phase } ,
596+ ) ;
597+ return ;
598+ }
599+ }
600+
599601 for ( ; retries < MAX_INSTANCE_START_RETRIES ; retries ++ ) {
600602 if ( abortSignal . aborted ) {
601603 return ;
@@ -659,6 +661,14 @@ export class WorkspaceStarter {
659661 } ) ;
660662 }
661663 } catch ( err ) {
664+ if ( isGrpcError ( err ) && ( err . code === grpc . status . UNAVAILABLE || err . code === grpc . status . ALREADY_EXISTS ) ) {
665+ // fall-through: we don't want to fail but retry/wait for future updates to resolve this
666+ } else if ( ! ( err instanceof StartInstanceError ) ) {
667+ // fallback in case we did not already handle this error
668+ await this . failInstanceStart ( { span } , err , workspace , instance , abortSignal ) ;
669+ err = new StartInstanceError ( "other" , err ) ; // don't throw because there's nobody catching it. We just want to log/trace it.
670+ }
671+
662672 this . logAndTraceStartWorkspaceError ( { span } , logCtx , err ) ;
663673 } finally {
664674 if ( abortSignal . aborted ) {
@@ -811,8 +821,9 @@ export class WorkspaceStarter {
811821 // We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
812822 // We have to set that status ourselves.
813823 instance . status . phase = "stopped" ;
814- instance . stoppingTime = new Date ( ) . toISOString ( ) ;
815- instance . stoppedTime = new Date ( ) . toISOString ( ) ;
824+ const now = new Date ( ) . toISOString ( ) ;
825+ instance . stoppingTime = now ;
826+ instance . stoppedTime = now ;
816827
817828 instance . status . conditions . failed = err . toString ( ) ;
818829 instance . status . message = `Workspace cannot be started: ${ err } ` ;
@@ -1201,6 +1212,7 @@ export class WorkspaceStarter {
12011212 additionalAuth : Map < string , string > ,
12021213 ignoreBaseImageresolvedAndRebuildBase : boolean = false ,
12031214 forceRebuild : boolean = false ,
1215+ abortSignal : RedlockAbortSignal ,
12041216 region ?: WorkspaceRegion ,
12051217 ) : Promise < WorkspaceInstance > {
12061218 const span = TraceContext . startSpan ( "buildWorkspaceImage" , ctx ) ;
@@ -1302,6 +1314,7 @@ export class WorkspaceStarter {
13021314 additionalAuth ,
13031315 true ,
13041316 forceRebuild ,
1317+ abortSignal ,
13051318 region ,
13061319 ) ;
13071320 } else {
@@ -1338,24 +1351,8 @@ export class WorkspaceStarter {
13381351 }
13391352
13401353 // This instance's image build "failed" as well, so mark it as such.
1341- const now = new Date ( ) . toISOString ( ) ;
1342- instance = await this . workspaceDb . trace ( { span } ) . updateInstancePartial ( instance . id , {
1343- status : { ...instance . status , phase : "stopped" , conditions : { failed : message } , message } ,
1344- stoppedTime : now ,
1345- stoppingTime : now ,
1346- } ) ;
1347-
1348- // Mark the PrebuildWorkspace as failed
1349- await this . failPrebuildWorkspace ( { span } , err , workspace ) ;
1354+ await this . failInstanceStart ( { span } , err , workspace , instance , abortSignal ) ;
13501355
1351- // Publish updated workspace instance
1352- await this . publisher . publishInstanceUpdate ( {
1353- workspaceID : workspace . ownerId ,
1354- instanceID : instance . id ,
1355- ownerID : workspace . ownerId ,
1356- } ) ;
1357-
1358- TraceContext . setError ( { span } , err ) ;
13591356 const looksLikeUserError = ( msg : string ) : boolean => {
13601357 return msg . startsWith ( "build failed:" ) || msg . includes ( "headless task failed:" ) ;
13611358 } ;
@@ -1365,6 +1362,8 @@ export class WorkspaceStarter {
13651362 `workspace image build failed: ${ message } ` ,
13661363 { looksLikeUserError : true } ,
13671364 ) ;
1365+ err = new StartInstanceError ( "imageBuildFailedUser" , err ) ;
1366+ // Don't report this as "failed" to our metrics as it would trigger an alert
13681367 } else {
13691368 log . error (
13701369 { instanceId : instance . id , userId : user . id , workspaceId : workspace . id } ,
@@ -1963,6 +1962,9 @@ export class WorkspaceStarter {
19631962 await client . describeWorkspace ( ctx , req ) ;
19641963 return true ;
19651964 } catch ( err ) {
1965+ if ( isClusterMaintenanceError ( err ) ) {
1966+ throw err ;
1967+ }
19661968 return false ;
19671969 }
19681970 }
0 commit comments