Skip to content

Commit 6c3396a

Browse files
author
Ian Kent
committed
kernfs: Introduce separate rwsem to protect inode attributes
JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
1 parent 7d980e2 commit 6c3396a

File tree

3 files changed

+16
-8
lines changed

3 files changed

+16
-8
lines changed

fs/kernfs/dir.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -759,12 +759,15 @@ int kernfs_add_one(struct kernfs_node *kn)
759759
goto out_unlock;
760760

761761
/* Update timestamps on the parent */
762+
down_write(&root->kernfs_iattr_rwsem);
763+
762764
ps_iattr = parent->iattr;
763765
if (ps_iattr) {
764766
ktime_get_real_ts64(&ps_iattr->ia_ctime);
765767
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
766768
}
767769

770+
up_write(&root->kernfs_iattr_rwsem);
768771
up_write(&root->kernfs_rwsem);
769772

770773
/*
@@ -927,6 +930,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
927930

928931
idr_init(&root->ino_idr);
929932
init_rwsem(&root->kernfs_rwsem);
933+
init_rwsem(&root->kernfs_iattr_rwsem);
930934
INIT_LIST_HEAD(&root->supers);
931935

932936
/*
@@ -1445,11 +1449,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
14451449
pos->parent ? pos->parent->iattr : NULL;
14461450

14471451
/* update timestamps on the parent */
1452+
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
1453+
14481454
if (ps_iattr) {
14491455
ktime_get_real_ts64(&ps_iattr->ia_ctime);
14501456
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
14511457
}
14521458

1459+
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
14531460
kernfs_put(pos);
14541461
}
14551462

fs/kernfs/inode.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
101101
int ret;
102102
struct kernfs_root *root = kernfs_root(kn);
103103

104-
down_write(&root->kernfs_rwsem);
104+
down_write(&root->kernfs_iattr_rwsem);
105105
ret = __kernfs_setattr(kn, iattr);
106-
up_write(&root->kernfs_rwsem);
106+
up_write(&root->kernfs_iattr_rwsem);
107107
return ret;
108108
}
109109

@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
119119
return -EINVAL;
120120

121121
root = kernfs_root(kn);
122-
down_write(&root->kernfs_rwsem);
122+
down_write(&root->kernfs_iattr_rwsem);
123123
error = setattr_prepare(&init_user_ns, dentry, iattr);
124124
if (error)
125125
goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
132132
setattr_copy(&init_user_ns, inode, iattr);
133133

134134
out:
135-
up_write(&root->kernfs_rwsem);
135+
up_write(&root->kernfs_iattr_rwsem);
136136
return error;
137137
}
138138

@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
189189
struct kernfs_node *kn = inode->i_private;
190190
struct kernfs_root *root = kernfs_root(kn);
191191

192-
down_read(&root->kernfs_rwsem);
192+
down_read(&root->kernfs_iattr_rwsem);
193193
kernfs_refresh_inode(kn, inode);
194194
generic_fillattr(&init_user_ns, inode, stat);
195-
up_read(&root->kernfs_rwsem);
195+
up_read(&root->kernfs_iattr_rwsem);
196196

197197
return 0;
198198
}
@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
285285
kn = inode->i_private;
286286
root = kernfs_root(kn);
287287

288-
down_read(&root->kernfs_rwsem);
288+
down_read(&root->kernfs_iattr_rwsem);
289289
kernfs_refresh_inode(kn, inode);
290290
ret = generic_permission(&init_user_ns, inode, mask);
291-
up_read(&root->kernfs_rwsem);
291+
up_read(&root->kernfs_iattr_rwsem);
292292

293293
return ret;
294294
}

fs/kernfs/kernfs-internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct kernfs_root {
4747

4848
wait_queue_head_t deactivate_waitq;
4949
struct rw_semaphore kernfs_rwsem;
50+
RH_KABI_EXTEND(struct rw_semaphore kernfs_iattr_rwsem)
5051
};
5152

5253
/* +1 to avoid triggering overflow warning when negating it */

0 commit comments

Comments
 (0)