Skip to content

Commit 791e5f5

Browse files
author
Alex Markuze
committed
ceph: fix client race condition where r_parent becomes stale before sending message
JIRA: https://issues.redhat.com/browse/RHEL-117609 Conflicts: Missing 5995d90 some surrounding APIs are different commit bec324f Author: Alex Markuze <amarkuze@redhat.com> Date: Tue Aug 12 09:57:39 2025 +0000 ceph: fix race condition where r_parent becomes stale before sending message When the parent directory's i_rwsem is not locked, req->r_parent may become stale due to concurrent operations (e.g. rename) between dentry lookup and message creation. Validate that r_parent matches the encoded parent inode and update to the correct inode if a mismatch is detected. [ idryomov: folded a follow-up fix from Alex to drop extra reference from ceph_get_reply_dir() in ceph_fill_trace(): ceph_get_reply_dir() may return a different, referenced inode when r_parent is stale and the parent directory lock is not held. ceph_fill_trace() used that inode but failed to drop the reference when it differed from req->r_parent, leaking an inode reference. Keep the directory inode in a local variable and iput() it at function end if it does not match req->r_parent. ] Cc: stable@vger.kernel.org Signed-off-by: Alex Markuze <amarkuze@redhat.com> Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Signed-off-by: Alex Markuze <amarkuze@redhat.com>
1 parent 72a36ae commit 791e5f5

File tree

1 file changed

+73
-15
lines changed

1 file changed

+73
-15
lines changed

fs/ceph/inode.c

Lines changed: 73 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
5555
return 0;
5656
}
5757

58+
/*
59+
* Check if the parent inode matches the vino from directory reply info.
60+
*
* Returns true only when both the inode number and the snapshot id of
* @parent equal the corresponding fields of @vino. @parent is passed
* straight to ceph_ino()/ceph_snap(), so the caller must supply a
* non-NULL inode.
*/
61+
static inline bool ceph_vino_matches_parent(struct inode *parent,
62+
struct ceph_vino vino)
63+
{
64+
return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
65+
}
66+
67+
/*
68+
* Validate that the directory inode referenced by @req->r_parent matches the
69+
* inode number and snapshot id contained in the reply's directory record. If
70+
* they do not match – which can theoretically happen if the parent dentry was
71+
* moved between the time the request was issued and the reply arrived – fall
72+
* back to looking up the correct inode in the inode cache.
73+
*
74+
* Return value: @parent itself, unchanged and with no new reference taken, when
75+
* the reply has no directory record or the record matches @parent; NULL when a
76+
* directory record is present but @parent is NULL; otherwise the result of
* ceph_get_inode() for the vino in the reply, which the caller must test with
* IS_ERR(). Callers that receive a different inode than the original @parent
* are responsible for dropping the extra reference once the reply has been
* processed.
77+
*/
78+
static struct inode *ceph_get_reply_dir(struct super_block *sb,
79+
struct inode *parent,
80+
struct ceph_mds_reply_info_parsed *rinfo)
81+
{
82+
struct ceph_vino vino;
83+
84+
if (unlikely(!rinfo->diri.in))
85+
return parent; /* nothing to compare against */
86+
87+
/* If we didn't have a cached parent inode to begin with, just bail out. */
88+
if (!parent)
89+
return NULL;
90+
91+
/* The reply stores ino/snapid as little-endian 64-bit values. */
vino.ino = le64_to_cpu(rinfo->diri.in->ino);
92+
vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
93+
94+
if (likely(ceph_vino_matches_parent(parent, vino)))
95+
return parent; /* matches – use the original reference */
96+
97+
/* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
98+
WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
99+
ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
100+
101+
/*
* NOTE(review): the result is returned unchecked; ceph_fill_trace() tests it
* with IS_ERR() and iput()s it at its "done" label when it differs from
* req->r_parent.
*/
return ceph_get_inode(sb, vino, NULL);
102+
}
103+
58104
/**
59105
* ceph_new_inode - allocate a new inode in advance of an expected create
60106
* @dir: parent directory for new inode
@@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15231569
struct ceph_vino tvino, dvino;
15241570
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
15251571
struct ceph_client *cl = fsc->client;
1572+
struct inode *parent_dir = NULL;
15261573
int err = 0;
15271574

15281575
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1583,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15361583
}
15371584

15381585
if (rinfo->head->is_dentry) {
1539-
struct inode *dir = req->r_parent;
1586+
/*
1587+
* r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
1588+
* so we need to get the correct inode
1589+
*/
1590+
parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
1591+
if (unlikely(IS_ERR(parent_dir))) {
1592+
err = PTR_ERR(parent_dir);
1593+
goto done;
1594+
}
15401595

1541-
if (dir) {
1542-
err = ceph_fill_inode(dir, NULL, &rinfo->diri,
1596+
if (parent_dir) {
1597+
err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
15431598
rinfo->dirfrag, session, -1,
15441599
&req->r_caps_reservation);
15451600
if (err < 0)
@@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15481603
WARN_ON_ONCE(1);
15491604
}
15501605

1551-
if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1606+
if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
15521607
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
15531608
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
15541609
bool is_nokey = false;
15551610
struct qstr dname;
15561611
struct dentry *dn, *parent;
15571612
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
1558-
struct ceph_fname fname = { .dir = dir,
1613+
struct ceph_fname fname = { .dir = parent_dir,
15591614
.name = rinfo->dname,
15601615
.ctext = rinfo->altname,
15611616
.name_len = rinfo->dname_len,
@@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15641619
BUG_ON(!rinfo->head->is_target);
15651620
BUG_ON(req->r_dentry);
15661621

1567-
parent = d_find_any_alias(dir);
1622+
parent = d_find_any_alias(parent_dir);
15681623
BUG_ON(!parent);
15691624

1570-
err = ceph_fname_alloc_buffer(dir, &oname);
1625+
err = ceph_fname_alloc_buffer(parent_dir, &oname);
15711626
if (err < 0) {
15721627
dput(parent);
15731628
goto done;
@@ -1576,14 +1631,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15761631
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
15771632
if (err < 0) {
15781633
dput(parent);
1579-
ceph_fname_free_buffer(dir, &oname);
1634+
ceph_fname_free_buffer(parent_dir, &oname);
15801635
goto done;
15811636
}
15821637
dname.name = oname.name;
15831638
dname.len = oname.len;
15841639
dname.hash = full_name_hash(parent, dname.name, dname.len);
15851640
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
15861641
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1642+
15871643
retry_lookup:
15881644
dn = d_lookup(parent, &dname);
15891645
doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
@@ -1595,7 +1651,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
15951651
dname.len, dname.name, dn);
15961652
if (!dn) {
15971653
dput(parent);
1598-
ceph_fname_free_buffer(dir, &oname);
1654+
ceph_fname_free_buffer(parent_dir, &oname);
15991655
err = -ENOMEM;
16001656
goto done;
16011657
}
@@ -1610,12 +1666,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
16101666
ceph_snap(d_inode(dn)) != tvino.snap)) {
16111667
doutc(cl, " dn %p points to wrong inode %p\n",
16121668
dn, d_inode(dn));
1613-
ceph_dir_clear_ordered(dir);
1669+
ceph_dir_clear_ordered(parent_dir);
16141670
d_delete(dn);
16151671
dput(dn);
16161672
goto retry_lookup;
16171673
}
1618-
ceph_fname_free_buffer(dir, &oname);
1674+
ceph_fname_free_buffer(parent_dir, &oname);
16191675

16201676
req->r_dentry = dn;
16211677
dput(parent);
@@ -1794,6 +1850,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
17941850
&dvino, ptvino);
17951851
}
17961852
done:
1853+
/* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
1854+
if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
1855+
iput(parent_dir);
17971856
doutc(cl, "done err=%d\n", err);
17981857
return err;
17991858
}
@@ -2483,22 +2542,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
24832542
int truncate_retry = 20; /* The RMW will take around 50ms */
24842543
struct dentry *dentry;
24852544
char *path;
2486-
int pathlen;
2487-
u64 pathbase;
24882545
bool do_sync = false;
24892546

24902547
dentry = d_find_alias(inode);
24912548
if (!dentry) {
24922549
do_sync = true;
24932550
} else {
2494-
path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
2551+
struct ceph_path_info path_info;
2552+
path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
24952553
if (IS_ERR(path)) {
24962554
do_sync = true;
24972555
err = 0;
24982556
} else {
24992557
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
25002558
}
2501-
ceph_mdsc_free_path(path, pathlen);
2559+
ceph_mdsc_free_path_info(&path_info);
25022560
dput(dentry);
25032561

25042562
/* For non-EACCES cases, let the MDS do the mds auth check */

0 commit comments

Comments
 (0)