@@ -53,6 +53,11 @@ func (cs *CapabilitySet) Clear(cp linux.Capability) {
5353 * cs &= ^ CapabilitySetOf (cp )
5454}
5555
56+ // IsSubsetOf reports whether every capability in cs is also present in "super".
57+ func (cs * CapabilitySet ) IsSubsetOf (super CapabilitySet ) bool {
58+ return * cs & super == * cs
59+ }
60+
5661// VfsCapDataOf returns a VfsCapData containing the file capabilities for the given slice of bytes.
5762// For each field of the cap data, which are in the structure of either vfs_cap_data or vfs_ns_cap_data,
5863// the bytes are ordered in little endian.
@@ -78,30 +83,6 @@ func VfsCapDataOf(data []byte) (linux.VfsNsCapData, error) {
7883 return capData , nil
7984}
8085
81- // HandleVfsCaps updates creds based on the given vfsCaps. It returns two
82- // booleans; the first indicates whether the effective flag is set, and the second
83- // indicates whether the file capability is applied.
84- func HandleVfsCaps (vfsCaps linux.VfsNsCapData , creds * Credentials ) (bool , bool , error ) {
85- // gVisor does not support ID-mapped mounts and all filesystems are owned by
86- // the initial user namespace. So we can directly cast the root ID to KUID.
87- rootID := KUID (vfsCaps .RootID )
88- if ! rootIDOwnsCurrentUserns (creds , rootID ) {
89- // Linux skips vfs caps in this situation.
90- return false , false , nil
91- }
92- // Note that ambient capabilities are not yet supported in gVisor.
93- // P'(permitted) = (P(inheritable) & F(inheritable)) | (F(permitted) & P(bounding)) | P'(ambient)
94- creds .PermittedCaps = (CapabilitySet (vfsCaps .Permitted ()) & creds .BoundingCaps ) |
95- (CapabilitySet (vfsCaps .Inheritable ()) & creds .InheritableCaps )
96- effective := (vfsCaps .MagicEtc & linux .VFS_CAP_FLAGS_EFFECTIVE ) > 0
97- // Insufficient to execute correctly. Linux only returns EPERM when effective
98- // flag is set.
99- if effective && (CapabilitySet (vfsCaps .Permitted ()) & ^ creds .PermittedCaps ) != 0 {
100- return effective , true , linuxerr .EPERM
101- }
102- return effective , true , nil
103- }
104-
10586// FixupVfsCapDataOnSet may convert the given value to v3 file capabilities. It
10687// is analogous to security/commoncap.c:cap_convert_nscap().
10788func FixupVfsCapDataOnSet (creds * Credentials , value string , kuid KUID , kgid KGID ) (string , error ) {
@@ -173,49 +154,157 @@ func rootIDOwnsCurrentUserns(creds *Credentials, rootID KUID) bool {
173154 return false
174155}
175156
176- // HandlePrivilegedRoot updates creds for a privileged root user as per
177- // `Capabilities and execution of programs by root` in capabilities(7).
178- // It returns true if the file effective bit should be considered set.
179- func HandlePrivilegedRoot (creds * Credentials , hasVFSCaps bool , filename string ) bool {
157+ // FilePrivileges describes the set-user-ID/set-group-ID bits and the file capabilities of a file.
158+ type FilePrivileges struct {
159+ // SetUserID, when not NoID, indicates that the file has the setuid bit set. It is the KUID of the
160+ // owner of the file.
161+ SetUserID KUID
162+
163+ // SetGroupID, when not NoID, indicates that the file has the setgid bit set. It is the KGID of
164+ // the owning group of the file.
165+ SetGroupID KGID
166+
167+ // HasCaps indicates whether the file has capabilities attached.
168+ HasCaps bool
169+
170+ // CapRootID is the KUID of the namespace root of the Task that created the file caps.
171+ CapRootID KUID
172+
173+ // "These capabilities are automatically permitted to the thread, regardless of the thread's
174+ // inheritable capabilities." - capabilities(7).
175+ PermittedCaps CapabilitySet
176+
177+ // "This set is ANDed with the thread's inheritable set to determine which inheritable capabilities
178+ // are enabled in the permitted set of the thread after the execve(2)." - capabilities(7).
179+ InheritableCaps CapabilitySet
180+
181+ // "Determines if all of the new permitted capabilities for the thread are also raised in the
182+ // effective set." - capabilities(7).
183+ Effective bool
184+ }
185+
186+ // handlePrivilegedRoot adjusts c.PermittedCaps and f.Effective for a privileged root user as per
187+ // "Capabilities and execution of programs by root" in capabilities(7). filename is only used for logging.
188+ func handlePrivilegedRoot (c * Credentials , f * FilePrivileges , filename string ) {
180189 // gVisor currently does not support SECURE_NOROOT secure bit since
181190 // PR_SET_SECUREBITS is not supported. So no need to check here.
182- root := creds .UserNamespace .MapToKUID (RootUID )
183- if hasVFSCaps && creds .RealKUID != root && creds .EffectiveKUID == root {
191+ root := c .UserNamespace .MapToKUID (RootUID )
192+
193+ // "If (a) the binary that is being executed has capabilities attached and (b) the real user ID of
194+ // the process is not 0 (root) and (c) the effective user ID of the process is 0 (root), then the
195+ // file capability bits are honored. (i.e., they are not notionally considered to be all ones)."
196+ // - capabilities(7)
197+ if f .HasCaps && c .RealKUID != root && c .EffectiveKUID == root {
184198 log .Warningf ("File %q has both SUID bit and file capabilities set, not raising all capabilities." , filename )
185- return false
199+ return
186200 }
187- if creds .RealKUID == root || creds .EffectiveKUID == root {
201+
202+ // "If the real or effective user ID of the process is 0 (root), then the file inheritable and
203+ // permitted sets are ignored; instead they are notionally considered to be all ones (i.e., all
204+ // capabilities enabled)." - capabilities(7)
205+ if c .RealKUID == root || c .EffectiveKUID == root {
188206 // P'(permitted) = P(inheritable) | P(bounding)
189- creds .PermittedCaps = creds .BoundingCaps | creds .InheritableCaps
207+ c .PermittedCaps = c .BoundingCaps | c .InheritableCaps
190208 }
191- // Linux only sets the effective bit if the effective KUID is root.
192- return creds .EffectiveKUID == root
209+
210+ // "If the effective user ID of the process is 0 (root) or the file effective bit is in fact
211+ // enabled, then the file effective bit is notionally defined to be one (enabled)." - capabilities(7)
212+ f .Effective = c .EffectiveKUID == root || f .Effective
193213}
194214
195- // UpdateCredsForNewTask updates creds for a new task as per capabilities(7).
196- func UpdateCredsForNewTask (creds * Credentials , fileCaps string , filename string ) error {
197- // Clear the permitted capability set. It is initialized below via
198- // HandleVfsCaps() and HandlePrivilegedRoot().
199- creds .PermittedCaps = 0
200- hasVFSCaps := false
201- setEffective := false
202- if len (fileCaps ) != 0 {
203- vfsCaps , err := VfsCapDataOf ([]byte (fileCaps ))
204- if err != nil {
205- return err
215+ // ComputeCredsForExec computes the new credentials given the file privileges.
216+ // It returns the new creds and a bool indicating if the task is executing with
217+ // elevated privileges. It returns nil creds and EPERM if the file effective bit is set but the new
217+ // permitted set lacks some of the file permitted capabilities. A few words about the arguments:
218+ // - c: The current credentials of the task.
219+ // - f: The file privileges of the executable.
220+ // - filename: The name of the executable, used for logging.
221+ // - noNewPrivs: The current state of the prctl NO_NEW_PRIVS.
222+ // - stopPrivGain: Determines if privilege gain should be stopped for reasons beyond NO_NEW_PRIVS.
223+ // Both noNewPrivs and stopPrivGain prevent cap gain, but stopPrivGain does not by itself
224+ // prevent ID gain.
225+ // - allowSUID: If true, the task will be allowed to setuid.
226+ // Both noNewPrivs and !allowSUID prevent ID gain, but !allowSUID does not by itself prevent cap
227+ // gain. Note also that while noNewPrivs brings the effective IDs down to the real IDs,
228+ // !allowSUID at most prevents further ID gain due to the SUID/GID bits.
229+ //
230+ // Note that gVisor does not support Ambient capabilities.
231+ func ComputeCredsForExec (c * Credentials , f FilePrivileges , filename string ,
232+ noNewPrivs bool , stopPrivGain bool , allowSUID bool ) (* Credentials , bool , error ) {
233+ if noNewPrivs || ! allowSUID {
234+ f .SetUserID = NoID
235+ f .SetGroupID = NoID
236+ }
237+ // "...if either the user or the group ID of the file has no mapping inside the namespace, the
238+ // set-user-ID (set-group-ID) bit is silently ignored: the new program is executed, but the
239+ // process's effective user (group) ID is left unchanged." - user_namespaces(7).
240+ if ! f .SetUserID .In (c .UserNamespace ).Ok () {
241+ f .SetUserID = NoID
242+ }
243+ if ! f .SetGroupID .In (c .UserNamespace ).Ok () {
244+ f .SetGroupID = NoID
245+ }
246+ // "...capabilities are conferred only if the binary is executed by a process that resides in a
247+ // user namespace whose UID 0 maps to the root user ID that is saved in the extended attribute,
248+ // or when executed by a process that resides in a descendant of such a namespace."
249+ // - capabilities(7).
250+ if ! rootIDOwnsCurrentUserns (c , f .CapRootID ) {
251+ f .HasCaps = false
252+ f .Effective = false
253+ }
254+
255+ newC := c .Fork ()
256+ if f .SetUserID .Ok () {
257+ newC .EffectiveKUID = f .SetUserID
258+ }
259+ if f .SetGroupID .Ok () {
260+ newC .EffectiveKGID = f .SetGroupID
261+ }
262+
263+ newC .PermittedCaps = CapabilitySet (0 )
264+ if f .HasCaps {
265+ // P'(permitted) = (P(inheritable) & F(inheritable)) | (F(permitted) & P(bounding))
266+ newC .PermittedCaps = (c .InheritableCaps & f .InheritableCaps ) | (f .PermittedCaps & c .BoundingCaps )
267+
268+ // The "Safety checking for capability-dumb binaries" section of capabilities(7) says:
269+ // "...For such applications, the effective capability bit is set on the file...
270+ // ...If the process did not obtain the full set of file permitted capabilities,
271+ // then execve(2) fails with the error EPERM."
272+ if f .Effective && (newC .PermittedCaps & f .PermittedCaps != f .PermittedCaps ) {
273+ return nil , false , linuxerr .EPERM
206274 }
207- setEffective , hasVFSCaps , err = HandleVfsCaps (vfsCaps , creds )
208- if err != nil {
209- return err
275+ }
276+ // newC.PermittedCaps and f.Effective are set differently for namespace root.
277+ handlePrivilegedRoot (newC , & f , filename )
278+
279+ // Deny privilege elevation if we have to, see commoncap.c:cap_bprm_creds_from_file().
280+ gainedID := (newC .EffectiveKUID != c .RealKUID ) || (newC .EffectiveKGID != c .RealKGID )
281+ gainedCaps := ! newC .PermittedCaps .IsSubsetOf (c .PermittedCaps )
282+ if (gainedID || gainedCaps ) && (noNewPrivs || stopPrivGain ) {
283+ if noNewPrivs || ! c .HasCapability (linux .CAP_SETUID ) {
284+ newC .EffectiveKUID = c .RealKUID
285+ newC .EffectiveKGID = c .RealKGID
210286 }
287+ newC .PermittedCaps &= c .PermittedCaps
211288 }
212- setEffective = HandlePrivilegedRoot (creds , hasVFSCaps , filename ) || setEffective
289+ newC .SavedKUID = newC .EffectiveKUID
290+ newC .SavedKGID = newC .EffectiveKGID
291+
213292 // P'(effective) = effective ? P'(permitted) : P'(ambient).
214- creds .EffectiveCaps = 0
215- if setEffective {
216- creds .EffectiveCaps = creds .PermittedCaps
293+ newC .EffectiveCaps = 0
294+ if f .Effective {
295+ newC .EffectiveCaps = newC .PermittedCaps
296+ }
297+
298+ // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent calls to execve(2).
299+ newC .KeepCaps = false
300+
301+ root := c .UserNamespace .MapToKUID (RootUID )
302+ secureExec := false
303+ // See commoncap.c:cap_bprm_secureexec() in Linux 4.2 (before the introduction of ambient caps).
304+ if gainedID || (newC .RealKUID != root && (f .Effective || newC .PermittedCaps != CapabilitySet (0 ))) {
305+ secureExec = true
217306 }
218- return nil
307+ return newC , secureExec , nil
219308}
220309
221310// TaskCapabilities represents all the capability sets for a task. Each of these
0 commit comments