Skip to content

Commit 25a1144

Browse files
nybidari authored and gvisor-bot committed
Close TCP endpoints on restore error instead of panicking.
Restoring a TCP endpoint can fail for various reasons, including:
- the port used by the endpoint at checkpoint time is already in use by a host socket during restore;
- a new route cannot be found because the network configuration has changed.

On such failures, instead of panicking and aborting the restore, log a warning and close the failing connection.

PiperOrigin-RevId: 795118834
1 parent 14f317c commit 25a1144

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

pkg/tcpip/transport/tcp/endpoint_state.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ func (e *Endpoint) Restore(s *stack.Stack) {
157157
e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
158158
e.segmentQueue.thaw()
159159

160+
e.mu.Lock()
161+
id := e.ID
162+
e.mu.Unlock()
163+
160164
bind := func() {
161165
e.mu.Lock()
162166
defer e.mu.Unlock()
@@ -211,7 +215,11 @@ func (e *Endpoint) Restore(s *stack.Stack) {
211215
e.mu.Lock()
212216
err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.TransportEndpointInfo.ID.RemotePort}, false /* handshake */)
213217
if _, ok := err.(*tcpip.ErrConnectStarted); !ok {
214-
panic("endpoint connecting failed: " + err.String())
218+
log.Warningf("TCP endpoint connect failed for connected endpoint with ID: %+v err: %v", id, err)
219+
e.mu.Unlock()
220+
e.Close()
221+
connectedLoading.Done()
222+
return
215223
}
216224
e.state.Store(e.origEndpointState)
217225
// For FIN-WAIT-2 and TIME-WAIT we need to start the appropriate timers so
@@ -274,7 +282,8 @@ func (e *Endpoint) Restore(s *stack.Stack) {
274282
bind()
275283
err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.TransportEndpointInfo.ID.RemotePort})
276284
if _, ok := err.(*tcpip.ErrConnectStarted); !ok {
277-
panic("endpoint connecting failed: " + err.String())
285+
log.Warningf("TCP endpoint connect failed for connecting endpoint with ID: %+v err: %v", id, err)
286+
e.Close()
278287
}
279288
connectingLoading.Done()
280289
tcpip.AsyncLoading.Done()
@@ -289,11 +298,15 @@ func (e *Endpoint) Restore(s *stack.Stack) {
289298
// naturally complete the connection.
290299
bind()
291300
e.mu.Lock()
292-
defer e.mu.Unlock()
293301
e.setEndpointState(epState)
294302
r, err := e.stack.FindRoute(e.boundNICID, e.TransportEndpointInfo.ID.LocalAddress, e.TransportEndpointInfo.ID.RemoteAddress, e.effectiveNetProtos[0], false /* multicastLoop */)
295303
if err != nil {
296-
panic(fmt.Sprintf("FindRoute failed when restoring endpoint w/ ID: %+v", e.ID))
304+
e.mu.Unlock()
305+
log.Warningf("FindRoute failed when restoring endpoint w/ ID: %+v err: %v", id, err)
306+
e.Close()
307+
connectingLoading.Done()
308+
tcpip.AsyncLoading.Done()
309+
return
297310
}
298311
e.route = r
299312
timer, err := newBackoffTimer(e.stack.Clock(), InitialRTO, MaxRTO, timerHandler(e, e.h.retransmitHandlerLocked))
@@ -303,6 +316,7 @@ func (e *Endpoint) Restore(s *stack.Stack) {
303316
e.h.retransmitTimer = timer
304317
connectingLoading.Done()
305318
tcpip.AsyncLoading.Done()
319+
e.mu.Unlock()
306320
}()
307321
case epState == StateBound:
308322
tcpip.AsyncLoading.Add(1)

0 commit comments

Comments
 (0)