@@ -412,25 +412,29 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
+	struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
 	struct rb_root_cached *root;
 	struct btrfs_delayed_root *delayed_root;
 
 	/* Not inserted, ignore it. */
 	if (RB_EMPTY_NODE(&delayed_item->rb_node))
 		return;
 
-	delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+	/* If it's in a rbtree, then we need to have delayed node locked. */
+	lockdep_assert_held(&delayed_node->mutex);
+
+	delayed_root = delayed_node->root->fs_info->delayed_root;
 
 	BUG_ON(!delayed_root);
 
 	if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
-		root = &delayed_item->delayed_node->ins_root;
+		root = &delayed_node->ins_root;
 	else
-		root = &delayed_item->delayed_node->del_root;
+		root = &delayed_node->del_root;
 
 	rb_erase_cached(&delayed_item->rb_node, root);
 	RB_CLEAR_NODE(&delayed_item->rb_node);
-	delayed_item->delayed_node->count--;
+	delayed_node->count--;
 
 	finish_one_item(delayed_root);
 }
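
A minimal sketch of the locking contract the new lockdep_assert_held() documents. The wrapper example_remove_item() below is hypothetical (not part of the patch); it only illustrates that any caller removing a delayed item from the node's rbtrees is expected to hold delayed_node->mutex around the call.

/*
 * Illustrative only: the assertion added above is satisfied when the
 * caller holds the delayed node's mutex for the whole removal.
 */
static void example_remove_item(struct btrfs_delayed_node *node,
				struct btrfs_delayed_item *item)
{
	mutex_lock(&node->mutex);
	__btrfs_remove_delayed_item(item);	/* lockdep assertion holds */
	mutex_unlock(&node->mutex);
}
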
@@ -1153,20 +1157,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 		ret = __btrfs_commit_inode_delayed_items(trans, path,
 							 curr_node);
 		if (ret) {
-			btrfs_release_delayed_node(curr_node);
-			curr_node = NULL;
 			btrfs_abort_transaction(trans, ret);
 			break;
 		}
 
 		prev_node = curr_node;
 		curr_node = btrfs_next_delayed_node(curr_node);
+		/*
+		 * See the comment below about releasing path before releasing
+		 * node. If the commit of delayed items was successful the path
+		 * should always be released, but in case of an error, it may
+		 * point to locked extent buffers (a leaf at the very least).
+		 */
+		ASSERT(path->nodes[0] == NULL);
 		btrfs_release_delayed_node(prev_node);
 	}
 
+	/*
+	 * Release the path to avoid a potential deadlock and lockdep splat when
+	 * releasing the delayed node, as that requires taking the delayed node's
+	 * mutex. If another task starts running delayed items before we take
+	 * the mutex, it will first lock the mutex and then it may try to lock
+	 * the same btree path (leaf).
+	 */
+	btrfs_free_path(path);
+
 	if (curr_node)
 		btrfs_release_delayed_node(curr_node);
-	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
 
 	return ret;
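
A condensed, hypothetical sketch of the lock ordering the relocated btrfs_free_path() call enforces; run_one_node_sketch() is not the real loop body, it only shows that the btree path (and any leaves it keeps locked) is released before btrfs_release_delayed_node() takes the delayed node's mutex, so this task never holds leaf-then-mutex while another task holds mutex-then-leaf.

/* Illustrative ordering only; looping and error handling omitted. */
static int run_one_node_sketch(struct btrfs_trans_handle *trans,
			       struct btrfs_path *path,
			       struct btrfs_delayed_node *node)
{
	int ret;

	ret = __btrfs_commit_inode_delayed_items(trans, path, node);
	/* Drop any locked leaves held through the path first... */
	btrfs_free_path(path);
	/* ...then release the node, which takes node->mutex internally. */
	btrfs_release_delayed_node(node);
	return ret;
}
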
@@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
 	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
 }
 
-/* Will return 0 or -ENOMEM */
+static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+		return;
+
+	/*
+	 * Adding the new dir index item does not require touching another
+	 * leaf, so we can release 1 unit of metadata that was previously
+	 * reserved when starting the transaction. This applies only to
+	 * the case where we had a transaction start and excludes the
+	 * transaction join case (when replaying log trees).
+	 */
+	trace_btrfs_space_reservation(fs_info, "transaction",
+				      trans->transid, bytes, 0);
+	btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+	ASSERT(trans->bytes_reserved >= bytes);
+	trans->bytes_reserved -= bytes;
+}
+
+/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 				   const char *name, int name_len,
 				   struct btrfs_inode *dir,
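
A short hypothetical sketch of the accounting decision the new helper factors out; account_dir_index_space() and its needs_new_leaf flag are illustrative condensations of the batching check that appears in the hunks below, not code from the patch.

/* Hypothetical condensation of the keep-vs-release decision below. */
static void account_dir_index_space(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_node *node,
				    bool needs_new_leaf)
{
	if (needs_new_leaf) {
		/* The batch may touch a new leaf: keep the reserved unit. */
		node->index_item_leaves++;
	} else {
		/* The item fits in the current batch: return the unit. */
		btrfs_release_dir_index_item_space(trans);
	}
}
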
@@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&delayed_node->mutex);
 
+	/*
+	 * First attempt to insert the delayed item. This is to make the error
+	 * handling path simpler in case we fail (-EEXIST). There's no risk of
+	 * any other task coming in and running the delayed item before we do
+	 * the metadata space reservation below, because we are holding the
+	 * delayed node's mutex and that mutex must also be locked before the
+	 * node's delayed items can be run.
+	 */
+	ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
+	if (unlikely(ret)) {
+		btrfs_err(trans->fs_info,
+"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
+			  name_len, name, index, btrfs_root_id(delayed_node->root),
+			  delayed_node->inode_id, dir->index_cnt,
+			  delayed_node->index_cnt, ret);
+		btrfs_release_delayed_item(delayed_item);
+		btrfs_release_dir_index_item_space(trans);
+		mutex_unlock(&delayed_node->mutex);
+		goto release_node;
+	}
+
 	if (delayed_node->index_item_leaves == 0 ||
 	    delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
 		delayed_node->curr_index_batch_size = data_len;
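
A hypothetical condensation of the reordered flow in this function: the rbtree insertion now happens first, under the mutex, so an unexpected -EEXIST only needs to drop the item and hand back the metadata unit reserved at transaction start. add_dir_index_sketch() below is illustrative and omits the logging and batch-sizing details shown in the surrounding hunks.

/* Illustrative flow only; logging and the batch sizing logic omitted. */
static int add_dir_index_sketch(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_node *node,
				struct btrfs_delayed_item *item)
{
	int ret;

	mutex_lock(&node->mutex);
	/* Insert before touching reservations: failure needs little unwinding. */
	ret = __btrfs_add_delayed_item(node, item);
	if (unlikely(ret)) {
		btrfs_release_delayed_item(item);
		btrfs_release_dir_index_item_space(trans);
		mutex_unlock(&node->mutex);
		return ret;
	}
	/* ... per-batch metadata accounting continues here ... */
	mutex_unlock(&node->mutex);
	return 0;
}
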
@@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 		 * impossible.
 		 */
 		if (WARN_ON(ret)) {
-			mutex_unlock(&delayed_node->mutex);
 			btrfs_release_delayed_item(delayed_item);
+			mutex_unlock(&delayed_node->mutex);
 			goto release_node;
 		}
 
 		delayed_node->index_item_leaves++;
-	} else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
-		const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
-
-		/*
-		 * Adding the new dir index item does not require touching another
-		 * leaf, so we can release 1 unit of metadata that was previously
-		 * reserved when starting the transaction. This applies only to
-		 * the case where we had a transaction start and excludes the
-		 * transaction join case (when replaying log trees).
-		 */
-		trace_btrfs_space_reservation(fs_info, "transaction",
-					      trans->transid, bytes, 0);
-		btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
-		ASSERT(trans->bytes_reserved >= bytes);
-		trans->bytes_reserved -= bytes;
-	}
-
-	ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
-	if (unlikely(ret)) {
-		btrfs_err(trans->fs_info,
-			  "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
-			  name_len, name, delayed_node->root->root_key.objectid,
-			  delayed_node->inode_id, ret);
-		BUG();
+	} else {
+		btrfs_release_dir_index_item_space(trans);
 	}
 	mutex_unlock(&delayed_node->mutex);
 