@@ -15,125 +15,157 @@ package vm
1515
1616import (
1717 "context"
18+ "fmt"
1819 "net"
19- "strings"
2020 "time"
2121
2222 "github.com/mdlayher/vsock"
23+ "github.com/pkg/errors"
2324 "github.com/sirupsen/logrus"
2425)
2526
2627const (
27- vsockConnectTimeout = 20 * time .Second
28+ vsockRetryTimeout = 20 * time .Second
29+ vsockRetryInterval = 100 * time .Millisecond
30+ unixDialTimeout = 100 * time .Millisecond
31+ vsockConnectMsgTimeout = 100 * time .Millisecond
32+ vsockAckMsgTimeout = 1 * time .Second
2833)
2934
30- // VSockDial attempts to connect to a vsock listener at the provided cid and port with a hardcoded number
31- // of retries.
32- func VSockDial (reqCtx context.Context , logger * logrus.Entry , contextID , port uint32 ) (net.Conn , error ) {
33- // Retries occur every 100ms up to vsockConnectTimeout
34- const retryInterval = 100 * time .Millisecond
35- ctx , cancel := context .WithTimeout (reqCtx , vsockConnectTimeout )
35+ // VSockDial attempts to connect to the Firecracker host-side vsock at the provided unix
36+ // path and port. It will retry connect attempts if a temporary error is encountered (up
37+ // to a fixed timeout) or the provided request is canceled.
38+ func VSockDial (reqCtx context.Context , logger * logrus.Entry , udsPath string , port uint32 ) (net.Conn , error ) {
39+ ctx , cancel := context .WithTimeout (reqCtx , vsockRetryTimeout )
3640 defer cancel ()
3741
42+ tickerCh := time .NewTicker (vsockRetryInterval ).C
3843 var attemptCount int
39- for range time . NewTicker ( retryInterval ). C {
44+ for {
4045 attemptCount ++
41- logger = logger .WithField ("attempt" , attemptCount )
46+ logger : = logger .WithField ("attempt" , attemptCount )
4247
4348 select {
4449 case <- ctx .Done ():
4550 return nil , ctx .Err ()
46- default :
47- conn , err := vsock .Dial (contextID , port )
48- if err == nil {
49- logger .WithField ("connection" , conn ).Debug ("vsock dial succeeded" )
50- return conn , nil
51- }
52-
53- // ENXIO and ECONNRESET can be returned while the VM+agent are still in the midst of booting
54- if isTemporaryNetErr (err ) || isENXIO (err ) || isECONNRESET (err ) {
55- logger .WithError (err ).Debug ("temporary vsock dial failure" )
51+ case <- tickerCh :
52+ conn , err := tryConnect (logger , udsPath , port )
53+ if isTemporaryNetErr (err ) {
54+ err = errors .Wrap (err , "temporary vsock dial failure" )
55+ logger .WithError (err ).Debug ()
5656 continue
57+ } else if err != nil {
58+ err = errors .Wrap (err , "non-temporary vsock dial failure" )
59+ logger .WithError (err ).Error ()
60+ return nil , err
5761 }
5862
59- logger .WithError (err ).Error ("non-temporary vsock dial failure" )
60- return nil , err
63+ return conn , nil
6164 }
6265 }
63-
64- panic ("unreachable code" ) // appeases the compiler, which doesn't know the for loop is infinite
6566}
6667
67- // VSockDialConnector provides an IOConnector interface to the VSockDial function.
68- func VSockDialConnector (contextID , port uint32 ) IOConnector {
69- return func (procCtx context.Context , logger * logrus.Entry ) <- chan IOConnectorResult {
70- returnCh := make (chan IOConnectorResult )
71-
72- go func () {
73- defer close (returnCh )
74-
75- conn , err := VSockDial (procCtx , logger , contextID , port )
76- returnCh <- IOConnectorResult {
77- ReadWriteCloser : conn ,
78- Err : err ,
79- }
80- }()
81-
82- return returnCh
83- }
68+ type vsockListener struct {
69+ listener net.Listener
70+ port uint32
71+ ctx context.Context
72+ logger * logrus.Entry
8473}
8574
86- func vsockAccept (reqCtx context.Context , logger * logrus.Entry , port uint32 ) (net.Conn , error ) {
75+ // VSockListener returns a net.Listener implementation for guest-side Firecracker vsock
76+ // connections.
77+ func VSockListener (ctx context.Context , logger * logrus.Entry , port uint32 ) (net.Listener , error ) {
8778 listener , err := vsock .Listen (port )
8879 if err != nil {
8980 return nil , err
9081 }
9182
92- defer listener .Close ()
83+ return vsockListener {
84+ listener : listener ,
85+ port : port ,
86+ ctx : ctx ,
87+ logger : logger ,
88+ }, nil
89+ }
9390
94- // Retries occur every 10ms up to vsockConnectTimeout
95- const retryInterval = 10 * time .Millisecond
96- ctx , cancel := context .WithTimeout (reqCtx , vsockConnectTimeout )
91+ func (l vsockListener ) Accept () (net.Conn , error ) {
92+ ctx , cancel := context .WithTimeout (l .ctx , vsockRetryTimeout )
9793 defer cancel ()
9894
9995 var attemptCount int
100- for range time .NewTicker (retryInterval ).C {
96+ tickerCh := time .NewTicker (vsockRetryInterval ).C
97+ for {
10198 attemptCount ++
102- logger = logger .WithField ("attempt" , attemptCount )
99+ logger := l . logger .WithField ("attempt" , attemptCount )
103100
104101 select {
105102 case <- ctx .Done ():
106103 return nil , ctx .Err ()
107- default :
108- // accept is non-blocking so try to accept until we get a connection
109- conn , err := listener .Accept ()
110- if err == nil {
111- return conn , nil
112- }
113-
104+ case <- tickerCh :
105+ conn , err := tryAccept (logger , l .listener , l .port )
114106 if isTemporaryNetErr (err ) {
115- logger .WithError (err ).Debug ("temporary stdio vsock accept failure" )
107+ err = errors .Wrap (err , "temporary vsock accept failure" )
108+ logger .WithError (err ).Debug ()
116109 continue
110+ } else if err != nil {
111+ err = errors .Wrap (err , "non-temporary vsock accept failure" )
112+ logger .WithError (err ).Error ()
113+ return nil , err
117114 }
118115
119- logger .WithError (err ).Error ("non-temporary stdio vsock accept failure" )
120- return nil , err
116+ return conn , nil
121117 }
122118 }
119+ }
120+
121+ func (l vsockListener ) Close () error {
122+ return l .listener .Close ()
123+ }
124+
125+ func (l vsockListener ) Addr () net.Addr {
126+ return l .listener .Addr ()
127+ }
123128
124- panic ("unreachable code" ) // appeases the compiler, which doesn't know the for loop is infinite
129+ // VSockDialConnector returns an IOConnector for establishing vsock connections
130+ // that are dialed from the host to a guest listener.
131+ func VSockDialConnector (udsPath string , port uint32 ) IOConnector {
132+ return func (procCtx context.Context , logger * logrus.Entry ) <- chan IOConnectorResult {
133+ returnCh := make (chan IOConnectorResult )
134+
135+ go func () {
136+ defer close (returnCh )
137+
138+ conn , err := VSockDial (procCtx , logger , udsPath , port )
139+ returnCh <- IOConnectorResult {
140+ ReadWriteCloser : conn ,
141+ Err : err ,
142+ }
143+ }()
144+
145+ return returnCh
146+ }
125147}
126148
127- // VSockAcceptConnector provides an IOConnector that establishes the connection by listening on the provided
128- // vsock port and accepting the first connection that comes in.
149+ // VSockAcceptConnector provides an IOConnector that establishes the connection by listening
150+ // on the provided guest-side vsock port and accepting the first connection that comes in.
129151func VSockAcceptConnector (port uint32 ) IOConnector {
130152 return func (procCtx context.Context , logger * logrus.Entry ) <- chan IOConnectorResult {
131153 returnCh := make (chan IOConnectorResult )
132154
155+ listener , err := VSockListener (procCtx , logger , port )
156+ if err != nil {
157+ returnCh <- IOConnectorResult {
158+ Err : err ,
159+ }
160+ close (returnCh )
161+ return returnCh
162+ }
163+
133164 go func () {
134165 defer close (returnCh )
166+ defer listener .Close ()
135167
136- conn , err := vsockAccept ( procCtx , logger , port )
168+ conn , err := listener . Accept ( )
137169 returnCh <- IOConnectorResult {
138170 ReadWriteCloser : conn ,
139171 Err : err ,
@@ -144,23 +176,153 @@ func VSockAcceptConnector(port uint32) IOConnector {
144176 }
145177}
146178
147- func isTemporaryNetErr (err error ) bool {
148- terr , ok := err .(interface {
149- Temporary () bool
150- })
179+ func vsockConnectMsg (port uint32 ) string {
180+ // The message a host-side connection must write after connecting to a firecracker
181+ // vsock unix socket in order to establish a connection with a guest-side listener
182+ // at the provided port number. This is specified in Firecracker documentation:
183+ // https://github.com/firecracker-microvm/firecracker/blob/master/docs/vsock.md#host-initiated-connections
184+ return fmt .Sprintf ("CONNECT %d\n " , port )
185+ }
151186
152- return err != nil && ok && terr .Temporary ()
187+ func vsockAckMsg (port uint32 ) string {
188+ // The message a guest-side connection will write after accepting a connection from
189+ // a host dial. This is not part of the official Firecracker vsock spec, but is
190+ // recommended in order to allow the host to verify connections were established
191+ // successfully: https://github.com/firecracker-microvm/firecracker/issues/1272#issuecomment-533004066
192+ return fmt .Sprintf ("IMALIVE %d\n " , port )
193+ }
194+
195+ // tryConnect attempts to dial a guest vsock listener at the provided host-side
196+ // unix socket and provided guest-listener port.
197+ func tryConnect (logger * logrus.Entry , udsPath string , port uint32 ) (net.Conn , error ) {
198+ conn , err := net .DialTimeout ("unix" , udsPath , unixDialTimeout )
199+ if err != nil {
200+ return nil , err
201+ }
202+
203+ defer func () {
204+ if err != nil {
205+ closeErr := conn .Close ()
206+ if closeErr != nil {
207+ logger .WithError (closeErr ).Error (
208+ "failed to close vsock socket after previous error" )
209+ }
210+ }
211+ }()
212+
213+ err = tryConnWrite (conn , vsockConnectMsg (port ), vsockConnectMsgTimeout )
214+ if err != nil {
215+ return nil , vsockConnectMsgError {cause : err }
216+ }
217+
218+ err = tryConnRead (conn , vsockAckMsg (port ), vsockAckMsgTimeout )
219+ if err != nil {
220+ return nil , vsockAckError {cause : err }
221+ }
222+ return conn , nil
223+ }
224+
225+ // tryAccept attempts to accept a single host-side connection from the provided
226+ // guest-side listener at the provided port.
227+ func tryAccept (logger * logrus.Entry , listener net.Listener , port uint32 ) (net.Conn , error ) {
228+ conn , err := listener .Accept ()
229+ if err != nil {
230+ return nil , err
231+ }
232+
233+ defer func () {
234+ if err != nil {
235+ closeErr := conn .Close ()
236+ if closeErr != nil {
237+ logger .WithError (closeErr ).Error (
238+ "failed to close vsock after previous error" )
239+ }
240+ }
241+ }()
242+
243+ err = tryConnWrite (conn , vsockAckMsg (port ), vsockAckMsgTimeout )
244+ if err != nil {
245+ return nil , vsockAckError {cause : err }
246+ }
247+
248+ return conn , nil
249+ }
250+
251+ // tryConnRead will try to do a read from the provided conn, returning an error if
252+ // the bytes read does not match what was provided or if the read does not complete
253+ // within the provided timeout. It will reset socket deadlines to none after returning.
254+ // It's only intended to be used for connect/ack messages, not general purpose reads
255+ // after the vsock connection is established fully.
256+ func tryConnRead (conn net.Conn , expectedRead string , timeout time.Duration ) error {
257+ conn .SetDeadline (time .Now ().Add (timeout ))
258+ defer conn .SetDeadline (time.Time {})
259+
260+ actualRead := make ([]byte , len (expectedRead ))
261+ _ , err := conn .Read (actualRead )
262+ if err != nil {
263+ return err
264+ }
265+
266+ if expectedRead != string (actualRead ) {
267+ return errors .Errorf ("expected to read %q, but instead read %q" ,
268+ expectedRead , string (actualRead ))
269+ }
270+
271+ return nil
272+ }
273+
274+ // tryConnWrite will try to do a write to the provided conn, returning an error if
275+ // the write fails, is partial or does not complete within the provided timeout. It
276+ // will reset socket deadlines to none after returning. It's only intended to be
277+ // used for connect/ack messages, not general purpose writes after the vsock
278+ // connection is established fully.
279+ func tryConnWrite (conn net.Conn , expectedWrite string , timeout time.Duration ) error {
280+ conn .SetDeadline (time .Now ().Add (timeout ))
281+ defer conn .SetDeadline (time.Time {})
282+
283+ bytesWritten , err := conn .Write ([]byte (expectedWrite ))
284+ if err != nil {
285+ return err
286+ }
287+ if bytesWritten != len (expectedWrite ) {
288+ return errors .Errorf ("incomplete write, expected %d bytes but wrote %d" ,
289+ len (expectedWrite ), bytesWritten )
290+ }
291+
292+ return nil
153293}
154294
155- // Unfortunately, as "documented" on various online forums, there's no ideal way to
156- // test for actual Linux error codes returned by the net library or wrappers
157- // around that library. The common approach is to fall back on string matching,
158- // which is done for the functions below
295+ type vsockConnectMsgError struct {
296+ cause error
297+ }
159298
160- func isENXIO ( err error ) bool {
161- return strings . HasSuffix ( err . Error () , "no such device" )
299+ func ( e vsockConnectMsgError ) Error () string {
300+ return errors . Wrap ( e . cause , "vsock connect message failure" ). Error ( )
162301}
163302
164- func isECONNRESET (err error ) bool {
165- return strings .HasSuffix (err .Error (), "connection reset by peer" )
303+ func (e vsockConnectMsgError ) Temporary () bool {
304+ return false
305+ }
306+
307+ type vsockAckError struct {
308+ cause error
309+ }
310+
311+ func (e vsockAckError ) Error () string {
312+ return errors .Wrap (e .cause , "vsock ack message failure" ).Error ()
313+ }
314+
315+ func (e vsockAckError ) Temporary () bool {
316+ return true
317+ }
318+
319+ // isTemporaryNetErr returns whether the provided error is a retriable
320+ // error, according to the interface defined here:
321+ // https://golang.org/pkg/net/#Error
322+ func isTemporaryNetErr (err error ) bool {
323+ terr , ok := err .(interface {
324+ Temporary () bool
325+ })
326+
327+ return err != nil && ok && terr .Temporary ()
166328}
0 commit comments