@@ -10,9 +10,6 @@ use core::ffi::NonZero_c_int;
1010#[ cfg( target_os = "linux" ) ]
1111use crate :: os:: linux:: process:: PidFd ;
1212
13- #[ cfg( target_os = "linux" ) ]
14- use crate :: sys:: weak:: raw_syscall;
15-
1613#[ cfg( any(
1714 target_os = "macos" ,
1815 target_os = "watchos" ,
@@ -91,6 +88,11 @@ impl Command {
9188 if let Some ( ret) = self . posix_spawn ( & theirs, envp. as_ref ( ) ) ? {
9289 return Ok ( ( ret, ours) ) ;
9390 }
91+
92+ #[ cfg( target_os = "linux" ) ]
93+ let ( input, output) = sys:: net:: Socket :: new_pair ( libc:: AF_UNIX , libc:: SOCK_SEQPACKET ) ?;
94+
95+ #[ cfg( not( target_os = "linux" ) ) ]
9496 let ( input, output) = sys:: pipe:: anon_pipe ( ) ?;
9597
9698 // Whatever happens after the fork is almost for sure going to touch or
@@ -104,12 +106,16 @@ impl Command {
104106 // The child calls `mem::forget` to leak the lock, which is crucial because
105107 // releasing a lock is not async-signal-safe.
106108 let env_lock = sys:: os:: env_read_lock ( ) ;
107- let ( pid, pidfd ) = unsafe { self . do_fork ( ) ? } ;
109+ let pid = unsafe { self . do_fork ( ) ? } ;
108110
109111 if pid == 0 {
110112 crate :: panic:: always_abort ( ) ;
111113 mem:: forget ( env_lock) ; // avoid non-async-signal-safe unlocking
112114 drop ( input) ;
115+ #[ cfg( target_os = "linux" ) ]
116+ if self . get_create_pidfd ( ) {
117+ self . send_pidfd ( & output) ;
118+ }
113119 let Err ( err) = unsafe { self . do_exec ( theirs, envp. as_ref ( ) ) } ;
114120 let errno = err. raw_os_error ( ) . unwrap_or ( libc:: EINVAL ) as u32 ;
115121 let errno = errno. to_be_bytes ( ) ;
@@ -133,6 +139,12 @@ impl Command {
133139 drop ( env_lock) ;
134140 drop ( output) ;
135141
142+ #[ cfg( target_os = "linux" ) ]
143+ let pidfd = if self . get_create_pidfd ( ) { self . recv_pidfd ( & input) } else { -1 } ;
144+
145+ #[ cfg( not( target_os = "linux" ) ) ]
146+ let pidfd = -1 ;
147+
136148 // Safety: On Linux the pidfd is either -1 or a descriptor received from the
137149 // child over the `SOCK_SEQPACKET` socket, so it's valid and otherwise unowned.
138150 let mut p = unsafe { Process :: new ( pid, pidfd) } ;
@@ -160,6 +172,7 @@ impl Command {
160172 }
161173 Ok ( ..) => {
162174 // pipe I/O up to PIPE_BUF bytes should be atomic
175+ // similarly SOCK_SEQPACKET messages should arrive whole
163176 assert ! ( p. wait( ) . is_ok( ) , "wait() should either return Ok or panic" ) ;
164177 panic ! ( "short read on the CLOEXEC pipe" )
165178 }
@@ -185,28 +198,27 @@ impl Command {
185198 ) ;
186199
// tvOS and watchOS variant: no fork is attempted on these targets; the call
// always reports `ERR_APPLE_TV_WATCH_NO_FORK_EXEC`.
#[cfg(any(target_os = "tvos", target_os = "watchos"))]
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
    Err(Self::ERR_APPLE_TV_WATCH_NO_FORK_EXEC)
}
191204
192205 // Attempts to fork the process. If successful, returns Ok((0, -1))
193206 // in the child, and Ok((child_pid, -1)) in the parent.
194207 #[ cfg( not( any(
195- target_os = "linux" ,
196208 target_os = "watchos" ,
197209 target_os = "tvos" ,
198210 all( target_os = "nto" , target_env = "nto71" ) ,
199211 ) ) ) ]
200- unsafe fn do_fork ( & mut self ) -> Result < ( pid_t , pid_t ) , io:: Error > {
201- cvt ( libc:: fork ( ) ) . map ( |res| ( res , - 1 ) )
212+ unsafe fn do_fork ( & mut self ) -> Result < pid_t , io:: Error > {
213+ cvt ( libc:: fork ( ) )
202214 }
203215
204216 // On QNX Neutrino, fork can fail with EBADF in case "another thread might have opened
205217 // or closed a file descriptor while the fork() was occurring".
206218 // Documentation says "... or try calling fork() again". This is what we do here.
207219 // See also https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/f/fork.html
208220 #[ cfg( all( target_os = "nto" , target_env = "nto71" ) ) ]
209- unsafe fn do_fork ( & mut self ) -> Result < ( pid_t , pid_t ) , io:: Error > {
221+ unsafe fn do_fork ( & mut self ) -> Result < pid_t , io:: Error > {
210222 use crate :: sys:: os:: errno;
211223
212224 let mut delay = MIN_FORKSPAWN_SLEEP ;
@@ -229,91 +241,11 @@ impl Command {
229241 delay *= 2 ;
230242 continue ;
231243 } else {
232- return cvt ( r) . map ( |res| ( res , - 1 ) ) ;
244+ return cvt ( r) ;
233245 }
234246 }
235247 }
236248
237- // Attempts to fork the process. If successful, returns Ok((0, -1))
238- // in the child, and Ok((child_pid, child_pidfd)) in the parent.
239- #[ cfg( target_os = "linux" ) ]
240- unsafe fn do_fork ( & mut self ) -> Result < ( pid_t , pid_t ) , io:: Error > {
241- use crate :: sync:: atomic:: { AtomicBool , Ordering } ;
242-
243- static HAS_CLONE3 : AtomicBool = AtomicBool :: new ( true ) ;
244- const CLONE_PIDFD : u64 = 0x00001000 ;
245-
246- #[ repr( C ) ]
247- struct clone_args {
248- flags : u64 ,
249- pidfd : u64 ,
250- child_tid : u64 ,
251- parent_tid : u64 ,
252- exit_signal : u64 ,
253- stack : u64 ,
254- stack_size : u64 ,
255- tls : u64 ,
256- set_tid : u64 ,
257- set_tid_size : u64 ,
258- cgroup : u64 ,
259- }
260-
261- raw_syscall ! {
262- fn clone3( cl_args: * mut clone_args, len: libc:: size_t) -> libc:: c_long
263- }
264-
265- // Bypassing libc for `clone3` can make further libc calls unsafe,
266- // so we use it sparingly for now. See #89522 for details.
267- // Some tools (e.g. sandboxing tools) may also expect `fork`
268- // rather than `clone3`.
269- let want_clone3_pidfd = self . get_create_pidfd ( ) ;
270-
271- // If we fail to create a pidfd for any reason, this will
272- // stay as -1, which indicates an error.
273- let mut pidfd: pid_t = -1 ;
274-
275- // Attempt to use the `clone3` syscall, which supports more arguments
276- // (in particular, the ability to create a pidfd). If this fails,
277- // we will fall through this block to a call to `fork()`
278- if want_clone3_pidfd && HAS_CLONE3 . load ( Ordering :: Relaxed ) {
279- let mut args = clone_args {
280- flags : CLONE_PIDFD ,
281- pidfd : & mut pidfd as * mut pid_t as u64 ,
282- child_tid : 0 ,
283- parent_tid : 0 ,
284- exit_signal : libc:: SIGCHLD as u64 ,
285- stack : 0 ,
286- stack_size : 0 ,
287- tls : 0 ,
288- set_tid : 0 ,
289- set_tid_size : 0 ,
290- cgroup : 0 ,
291- } ;
292-
293- let args_ptr = & mut args as * mut clone_args ;
294- let args_size = crate :: mem:: size_of :: < clone_args > ( ) ;
295-
296- let res = cvt ( clone3 ( args_ptr, args_size) ) ;
297- match res {
298- Ok ( n) => return Ok ( ( n as pid_t , pidfd) ) ,
299- Err ( e) => match e. raw_os_error ( ) {
300- // Multiple threads can race to execute this store,
301- // but that's fine - that just means that multiple threads
302- // will have tried and failed to execute the same syscall,
303- // with no other side effects.
304- Some ( libc:: ENOSYS ) => HAS_CLONE3 . store ( false , Ordering :: Relaxed ) ,
305- // Fallback to fork if `EPERM` is returned. (e.g. blocked by seccomp)
306- Some ( libc:: EPERM ) => { }
307- _ => return Err ( e) ,
308- } ,
309- }
310- }
311-
312- // Generally, we just call `fork`. If we get here after wanting `clone3`,
313- // then the syscall does not exist or we do not have permission to call it.
314- cvt ( libc:: fork ( ) ) . map ( |res| ( res, pidfd) )
315- }
316-
317249 pub fn exec ( & mut self , default : Stdio ) -> io:: Error {
318250 let envp = self . capture_env ( ) ;
319251
@@ -722,6 +654,115 @@ impl Command {
722654 Ok ( Some ( p) )
723655 }
724656 }
657+
658+ #[ cfg( target_os = "linux" ) ]
659+ fn send_pidfd ( & self , sock : & crate :: sys:: net:: Socket ) {
660+ use crate :: io:: IoSlice ;
661+ use crate :: os:: fd:: RawFd ;
662+ use crate :: sys:: cvt_r;
663+ use libc:: { CMSG_DATA , CMSG_FIRSTHDR , CMSG_LEN , CMSG_SPACE , SCM_RIGHTS , SOL_SOCKET } ;
664+
665+ unsafe {
666+ let child_pid = libc:: getpid ( ) ;
667+ // pidfd_open sets CLOEXEC by default
668+ let pidfd = libc:: syscall ( libc:: SYS_pidfd_open , child_pid, 0 ) ;
669+
670+ let fds: [ c_int ; 1 ] = [ pidfd as RawFd ] ;
671+
672+ const SCM_MSG_LEN : usize = mem:: size_of :: < [ c_int ; 1 ] > ( ) ;
673+
674+ #[ repr( C ) ]
675+ union Cmsg {
676+ buf : [ u8 ; unsafe { CMSG_SPACE ( SCM_MSG_LEN as u32 ) as usize } ] ,
677+ _align : libc:: cmsghdr ,
678+ }
679+
680+ let mut cmsg: Cmsg = mem:: zeroed ( ) ;
681+
682+ // 0-length message to send through the socket so we can pass along the fd
683+ let mut iov = [ IoSlice :: new ( b"" ) ] ;
684+ let mut msg: libc:: msghdr = mem:: zeroed ( ) ;
685+
686+ msg. msg_iov = & mut iov as * mut _ as * mut _ ;
687+ msg. msg_iovlen = 1 ;
688+ msg. msg_controllen = mem:: size_of_val ( & cmsg. buf ) as _ ;
689+ msg. msg_control = & mut cmsg. buf as * mut _ as * mut _ ;
690+
691+ // only attach cmsg if we successfully acquired the pidfd
692+ if pidfd >= 0 {
693+ let hdr = CMSG_FIRSTHDR ( & mut msg as * mut _ as * mut _ ) ;
694+ ( * hdr) . cmsg_level = SOL_SOCKET ;
695+ ( * hdr) . cmsg_type = SCM_RIGHTS ;
696+ ( * hdr) . cmsg_len = CMSG_LEN ( SCM_MSG_LEN as _ ) as _ ;
697+ let data = CMSG_DATA ( hdr) ;
698+ crate :: ptr:: copy_nonoverlapping (
699+ fds. as_ptr ( ) . cast :: < u8 > ( ) ,
700+ data as * mut _ ,
701+ SCM_MSG_LEN ,
702+ ) ;
703+ }
704+
705+ // we send the 0-length message even if we failed to acquire the pidfd
706+ // so we get a consistent SEQPACKET order
707+ match cvt_r ( || libc:: sendmsg ( sock. as_raw ( ) , & msg, 0 ) ) {
708+ Ok ( 0 ) => { }
709+ _ => rtabort ! ( "failed to communicate with parent process" ) ,
710+ }
711+ }
712+ }
713+
714+ #[ cfg( target_os = "linux" ) ]
715+ fn recv_pidfd ( & self , sock : & crate :: sys:: net:: Socket ) -> pid_t {
716+ use crate :: io:: IoSliceMut ;
717+ use crate :: sys:: cvt_r;
718+
719+ use libc:: { CMSG_DATA , CMSG_FIRSTHDR , CMSG_LEN , CMSG_SPACE , SCM_RIGHTS , SOL_SOCKET } ;
720+
721+ unsafe {
722+ const SCM_MSG_LEN : usize = mem:: size_of :: < [ c_int ; 1 ] > ( ) ;
723+
724+ #[ repr( C ) ]
725+ union Cmsg {
726+ _buf : [ u8 ; unsafe { CMSG_SPACE ( SCM_MSG_LEN as u32 ) as usize } ] ,
727+ _align : libc:: cmsghdr ,
728+ }
729+ let mut cmsg: Cmsg = mem:: zeroed ( ) ;
730+ // 0-length read to get the fd
731+ let mut iov = [ IoSliceMut :: new ( & mut [ ] ) ] ;
732+
733+ let mut msg: libc:: msghdr = mem:: zeroed ( ) ;
734+
735+ msg. msg_iov = & mut iov as * mut _ as * mut _ ;
736+ msg. msg_iovlen = 1 ;
737+ msg. msg_controllen = mem:: size_of :: < Cmsg > ( ) as _ ;
738+ msg. msg_control = & mut cmsg as * mut _ as * mut _ ;
739+
740+ match cvt_r ( || libc:: recvmsg ( sock. as_raw ( ) , & mut msg, 0 ) ) {
741+ Err ( _) => return -1 ,
742+ Ok ( _) => { }
743+ }
744+
745+ let hdr = CMSG_FIRSTHDR ( & mut msg as * mut _ as * mut _ ) ;
746+ if hdr. is_null ( )
747+ || ( * hdr) . cmsg_level != SOL_SOCKET
748+ || ( * hdr) . cmsg_type != SCM_RIGHTS
749+ || ( * hdr) . cmsg_len != CMSG_LEN ( SCM_MSG_LEN as _ ) as _
750+ {
751+ return -1 ;
752+ }
753+ let data = CMSG_DATA ( hdr) ;
754+
755+ let mut fds = [ -1 as c_int ] ;
756+
757+ crate :: ptr:: copy_nonoverlapping (
758+ data as * const _ ,
759+ fds. as_mut_ptr ( ) . cast :: < u8 > ( ) ,
760+ SCM_MSG_LEN ,
761+ ) ;
762+
763+ fds[ 0 ]
764+ }
765+ }
725766}
726767
727768////////////////////////////////////////////////////////////////////////////////
0 commit comments