Skip to content

Commit 9935919

Browse files
riptlripatel-fd
authored andcommitted
resolf: fix accdb races
The full Firedancer resolv tile was not synchronized with database admin operations performed by the replay tile. This resulted in assertion failures and crashes. This patch makes the resolv tile use a speculative query API (accdb_peek) and gracefully skip over such data races.
1 parent 630e2a4 commit 9935919

25 files changed

+691
-191
lines changed

book/api/metrics-generated.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,7 @@
10541054
| <span class="metrics-name">resolf_&#8203;lut_&#8203;resolved</span><br/>{lut_&#8203;resolve_&#8203;result="<span class="metrics-enum">success</span>"} | counter | Count of address lookup tables resolved (Resolved successfully) |
10551055
| <span class="metrics-name">resolf_&#8203;blockhash_&#8203;expired</span> | counter | Count of transactions that failed to resolve because the blockhash was expired |
10561056
| <span class="metrics-name">resolf_&#8203;transaction_&#8203;bundle_&#8203;peer_&#8203;failure</span> | counter | Count of transactions that failed to resolve because a peer transaction in the bundle failed |
1057+
| <span class="metrics-name">resolf_&#8203;db_&#8203;races</span> | counter | Number of database races encountered (diagnostic counter, not indicative of issues) |
10571058

10581059
</div>
10591060

src/disco/metrics/generated/fd_metrics_resolf.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ const fd_metrics_meta_t FD_METRICS_RESOLF[FD_METRICS_RESOLF_TOTAL] = {
1515
DECLARE_METRIC_ENUM( RESOLF_LUT_RESOLVED, COUNTER, LUT_RESOLVE_RESULT, SUCCESS ),
1616
DECLARE_METRIC( RESOLF_BLOCKHASH_EXPIRED, COUNTER ),
1717
DECLARE_METRIC( RESOLF_TRANSACTION_BUNDLE_PEER_FAILURE, COUNTER ),
18+
DECLARE_METRIC( RESOLF_DB_RACES, COUNTER ),
1819
};

src/disco/metrics/generated/fd_metrics_resolf.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@
5050
#define FD_METRICS_COUNTER_RESOLF_TRANSACTION_BUNDLE_PEER_FAILURE_DESC "Count of transactions that failed to resolve because a peer transaction in the bundle failed"
5151
#define FD_METRICS_COUNTER_RESOLF_TRANSACTION_BUNDLE_PEER_FAILURE_CVT (FD_METRICS_CONVERTER_NONE)
5252

53-
#define FD_METRICS_RESOLF_TOTAL (13UL)
53+
#define FD_METRICS_COUNTER_RESOLF_DB_RACES_OFF (29UL)
54+
#define FD_METRICS_COUNTER_RESOLF_DB_RACES_NAME "resolf_db_races"
55+
#define FD_METRICS_COUNTER_RESOLF_DB_RACES_TYPE (FD_METRICS_TYPE_COUNTER)
56+
#define FD_METRICS_COUNTER_RESOLF_DB_RACES_DESC "Number of database races encountered (diagnostic counter, not indicative of issues)"
57+
#define FD_METRICS_COUNTER_RESOLF_DB_RACES_CVT (FD_METRICS_CONVERTER_NONE)
58+
59+
#define FD_METRICS_RESOLF_TOTAL (14UL)
5460
extern const fd_metrics_meta_t FD_METRICS_RESOLF[FD_METRICS_RESOLF_TOTAL];
5561

5662
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_resolf_h */

src/disco/metrics/metrics.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ metric introduced.
432432
<counter name="LutResolved" enum="LutResolveResult" summary="Count of address lookup tables resolved" />
433433
<counter name="BlockhashExpired" summary="Count of transactions that failed to resolve because the blockhash was expired" />
434434
<counter name="TransactionBundlePeerFailure" summary="Count of transactions that failed to resolve because a peer transaction in the bundle failed" />
435+
<counter name="DbRaces" summary="Number of database races encountered (diagnostic counter, not indicative of issues)" />
435436
</tile>
436437

437438
<tile name="resolv">

src/discof/bank/fd_bank_err.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -59,24 +59,4 @@ fd_bank_err_from_runtime_err( int err ) {
5959
return 0;
6060
}
6161

62-
#define FD_BANK_LUT_ERR_SUCCESS ( 0)
63-
#define FD_BANK_LUT_ERR_ADDRESS_LOOKUP_TABLE_NOT_FOUND (-1)
64-
#define FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_OWNER (-2)
65-
#define FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_DATA (-3)
66-
#define FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_INDEX (-5)
67-
68-
#define FD_BANK_LUT_ERR_LAST (-5)
69-
70-
static inline int
71-
fd_bank_lut_err_from_runtime_err( int err ) {
72-
switch( err ) {
73-
case FD_RUNTIME_EXECUTE_SUCCESS: return FD_BANK_LUT_ERR_SUCCESS;
74-
case FD_RUNTIME_TXN_ERR_ADDRESS_LOOKUP_TABLE_NOT_FOUND: return FD_BANK_LUT_ERR_ADDRESS_LOOKUP_TABLE_NOT_FOUND;
75-
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_OWNER: return FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_OWNER;
76-
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_DATA: return FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_DATA;
77-
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_INDEX: return FD_BANK_LUT_ERR_INVALID_ADDRESS_LOOKUP_TABLE_INDEX;
78-
default: FD_LOG_ERR(( "Unknown runtime LUT error %d", err ));
79-
}
80-
}
81-
8262
#endif /* HEADER_fd_src_discof_bank_fd_bank_err_h */

src/discof/resolv/fd_resolv_tile.c

Lines changed: 103 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#include "fd_resolv_tile.h"
22
#include "../../disco/fd_txn_m.h"
33
#include "../../disco/topo/fd_topo.h"
4-
#include "../bank/fd_bank_err.h"
54
#include "../replay/fd_replay_tile.h"
65
#include "generated/fd_resolv_tile_seccomp.h"
76
#include "../../disco/metrics/fd_metrics.h"
7+
#include "../../flamenco/accdb/fd_accdb_sync.h"
8+
#include "../../flamenco/runtime/fd_alut_interp.h"
89
#include "../../flamenco/runtime/fd_system_ids_pp.h"
9-
#include "../../flamenco/runtime/fd_runtime.h"
1010
#include "../../flamenco/runtime/fd_bank.h"
1111
#include "../../util/pod/fd_pod_format.h"
1212

@@ -147,7 +147,8 @@ typedef struct {
147147
rooted bank (after exchanging it for a new rooted bank). */
148148
fd_banks_t * banks;
149149
fd_bank_t * bank;
150-
fd_funk_t funk[1];
150+
151+
fd_accdb_user_t accdb[1];
151152

152153
fd_stashed_txn_m_t * pool;
153154
map_chain_t * map_chain;
@@ -165,6 +166,7 @@ typedef struct {
165166
ulong blockhash_expired;
166167
ulong bundle_peer_failure;
167168
ulong stash[ FD_METRICS_COUNTER_RESOLV_STASH_OPERATION_CNT ];
169+
ulong db_race;
168170
} metrics;
169171

170172
fd_resolv_in_ctx_t in[ 64UL ];
@@ -195,6 +197,7 @@ metrics_write( fd_resolv_ctx_t * ctx ) {
195197
FD_MCNT_ENUM_COPY( RESOLF, LUT_RESOLVED, ctx->metrics.lut );
196198
FD_MCNT_ENUM_COPY( RESOLF, STASH_OPERATION, ctx->metrics.stash );
197199
FD_MCNT_SET( RESOLF, TRANSACTION_BUNDLE_PEER_FAILURE, ctx->metrics.bundle_peer_failure );
200+
FD_MCNT_SET( RESOLF, DB_RACES, ctx->metrics.db_race );
198201
}
199202

200203
static int
@@ -241,7 +244,99 @@ during_frag( fd_resolv_ctx_t * ctx,
241244
}
242245
}
243246

244-
static inline int
247+
/* peek_alut reads a single address lookup table from database cache. */
248+
249+
static int
250+
peek_alut( fd_resolv_ctx_t * ctx,
251+
fd_txn_m_t * txnm,
252+
fd_alut_interp_t * interp,
253+
ulong alut_idx ) {
254+
fd_funk_txn_xid_t const xid = { .ul = { fd_bank_slot_get( ctx->bank ), fd_bank_slot_get( ctx->bank ) } };
255+
256+
ulong ro_indir_cnt_old = interp->ro_indir_cnt;
257+
ulong rw_indir_cnt_old = interp->rw_indir_cnt;
258+
ulong alut_idx_old = interp->alut_idx;
259+
260+
fd_txn_t const * txn = fd_txn_m_txn_t_const ( txnm );
261+
uchar const * txn_payload = fd_txn_m_payload_const( txnm );
262+
fd_txn_acct_addr_lut_t const * addr_lut =
263+
&fd_txn_get_address_tables_const( txn )[ alut_idx ];
264+
fd_pubkey_t addr_lut_acc = FD_LOAD( fd_pubkey_t, txn_payload+addr_lut->addr_off );
265+
266+
int err = 0;
267+
for(;;) {
268+
fd_accdb_peek_t _peek[1];
269+
fd_accdb_peek_t * peek = fd_accdb_peek(
270+
ctx->accdb, _peek, &xid, &addr_lut_acc );
271+
if( FD_UNLIKELY( !peek ) ) {
272+
err = FD_RUNTIME_TXN_ERR_ADDRESS_LOOKUP_TABLE_NOT_FOUND;
273+
break;
274+
}
275+
276+
err = fd_alut_interp_next(
277+
interp,
278+
&addr_lut_acc,
279+
fd_accdb_ref_owner ( peek->acc ),
280+
fd_accdb_ref_data_const( peek->acc ),
281+
fd_accdb_ref_data_sz ( peek->acc ) );
282+
283+
int peek_ok = fd_accdb_peek_test( peek );
284+
fd_accdb_peek_drop( peek );
285+
if( FD_LIKELY( peek_ok ) ) break;
286+
287+
/* Restore old interp state and retry */
288+
FD_SPIN_PAUSE();
289+
ctx->metrics.db_race++;
290+
interp->ro_indir_cnt = ro_indir_cnt_old;
291+
interp->rw_indir_cnt = rw_indir_cnt_old;
292+
interp->alut_idx = alut_idx_old;
293+
}
294+
295+
return err;
296+
}
297+
298+
/* peek_aluts reads address lookup tables from database cache.
299+
Gracefully recovers from data races and missing accounts. */
300+
301+
static int
302+
peek_aluts( fd_resolv_ctx_t * ctx,
303+
fd_txn_m_t * txnm ) {
304+
305+
/* Unpack context */
306+
fd_txn_t const * txn = fd_txn_m_txn_t_const ( txnm );
307+
uchar const * txn_payload = fd_txn_m_payload_const( txnm );
308+
ulong const alut_cnt = txn->addr_table_lookup_cnt;
309+
ulong const slot = fd_bank_slot_get( ctx->bank );
310+
fd_sysvar_cache_t const * sysvar_cache = fd_bank_sysvar_cache_query( ctx->bank ); FD_TEST( sysvar_cache );
311+
fd_slot_hash_t const * slot_hashes = fd_sysvar_cache_slot_hashes_join_const( sysvar_cache );
312+
313+
/* Write indirect addrs into here */
314+
fd_acct_addr_t * indir_addrs = fd_txn_m_alut( txnm );
315+
316+
int err = FD_RUNTIME_EXECUTE_SUCCESS;
317+
fd_alut_interp_t interp[1];
318+
fd_alut_interp_new( interp, indir_addrs, txn, txn_payload, slot_hashes, slot );
319+
for( ulong i=0UL; i<alut_cnt; i++ ) {
320+
err = peek_alut( ctx, txnm, interp, i );
321+
if( FD_UNLIKELY( err ) ) break;
322+
}
323+
fd_alut_interp_delete( interp );
324+
fd_sysvar_cache_slot_hashes_leave_const( sysvar_cache, slot_hashes );
325+
326+
ulong ctr_idx;
327+
switch( err ) {
328+
case FD_RUNTIME_EXECUTE_SUCCESS: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_SUCCESS_IDX; break;
329+
case FD_RUNTIME_TXN_ERR_ADDRESS_LOOKUP_TABLE_NOT_FOUND: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_ACCOUNT_NOT_FOUND_IDX; break;
330+
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_OWNER: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_INVALID_ACCOUNT_OWNER_IDX; break;
331+
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_DATA: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_INVALID_ACCOUNT_DATA_IDX; break;
332+
case FD_RUNTIME_TXN_ERR_INVALID_ADDRESS_LOOKUP_TABLE_INDEX: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_INVALID_LOOKUP_INDEX_IDX; break;
333+
default: ctr_idx = FD_METRICS_ENUM_LUT_RESOLVE_RESULT_V_ACCOUNT_UNINITIALIZED_IDX; break;
334+
}
335+
ctx->metrics.lut[ ctr_idx ]++;
336+
return err;
337+
}
338+
339+
static int
245340
publish_txn( fd_resolv_ctx_t * ctx,
246341
fd_stem_context_t * stem,
247342
fd_stashed_txn_m_t const * stashed ) {
@@ -257,24 +352,8 @@ publish_txn( fd_resolv_ctx_t * ctx,
257352
FD_MCNT_INC( RESOLF, NO_BANK_DROP, 1 );
258353
return 0;
259354
}
260-
261-
fd_sysvar_cache_t const * sysvar_cache = fd_bank_sysvar_cache_query( ctx->bank );
262-
FD_TEST( sysvar_cache );
263-
264-
fd_funk_txn_xid_t xid = { .ul = { fd_bank_slot_get( ctx->bank ), fd_bank_slot_get( ctx->bank ) } };
265-
266-
fd_slot_hash_t const * slot_hashes = fd_sysvar_cache_slot_hashes_join_const( sysvar_cache );
267-
268-
int result = fd_runtime_load_txn_address_lookup_tables( txnt,
269-
fd_txn_m_payload( txnm ),
270-
ctx->funk,
271-
&xid,
272-
fd_bank_slot_get( ctx->bank ),
273-
slot_hashes,
274-
fd_txn_m_alut( txnm ) );
275-
fd_sysvar_cache_slot_hashes_leave_const( sysvar_cache, slot_hashes );
276-
ctx->metrics.lut[ result ]++;
277-
if( FD_UNLIKELY( result ) ) return 0;
355+
int err = peek_aluts( ctx, txnm );
356+
if( FD_UNLIKELY( err ) ) return 0;
278357
}
279358

280359
ulong realized_sz = fd_txn_m_realized_footprint( txnm, 1, 1 );
@@ -481,22 +560,7 @@ after_frag( fd_resolv_ctx_t * ctx,
481560
return;
482561
}
483562

484-
fd_sysvar_cache_t const * sysvar_cache = fd_bank_sysvar_cache_query( ctx->bank );
485-
FD_TEST( sysvar_cache );
486-
fd_slot_hash_t const * slot_hashes = fd_sysvar_cache_slot_hashes_join_const( sysvar_cache );
487-
FD_TEST( slot_hashes );
488-
489-
fd_funk_txn_xid_t xid = { .ul = { fd_bank_slot_get( ctx->bank ), fd_bank_slot_get( ctx->bank ) } };
490-
491-
int result = fd_runtime_load_txn_address_lookup_tables( txnt,
492-
fd_txn_m_payload( txnm ),
493-
ctx->funk,
494-
&xid,
495-
fd_bank_slot_get( ctx->bank ),
496-
slot_hashes,
497-
fd_txn_m_alut( txnm ) );
498-
fd_sysvar_cache_slot_hashes_leave_const( sysvar_cache, slot_hashes );
499-
ctx->metrics.lut[ -fd_bank_lut_err_from_runtime_err( result ) ]++;
563+
int result = peek_aluts( ctx, txnm );
500564
if( FD_UNLIKELY( result ) ) {
501565
if( FD_UNLIKELY( txnm->block_engine.bundle_id ) ) ctx->bundle_failed = 1;
502566
return;
@@ -567,7 +631,7 @@ unprivileged_init( fd_topo_t * topo,
567631
ctx->out_replay->wmark = fd_dcache_compact_wmark ( ctx->out_replay->mem, topo->links[ tile->out_link_id[ 1 ] ].dcache, topo->links[ tile->out_link_id[ 1 ] ].mtu );
568632
ctx->out_replay->chunk = ctx->out_replay->chunk0;
569633

570-
FD_TEST( fd_funk_join( ctx->funk, fd_topo_obj_laddr( topo, tile->resolv.funk_obj_id ) ) );
634+
FD_TEST( fd_accdb_user_join( ctx->accdb, fd_topo_obj_laddr( topo, tile->resolv.funk_obj_id ) ) );
571635

572636
ulong banks_obj_id = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "banks" );
573637
FD_TEST( banks_obj_id!=ULONG_MAX );

src/discof/restore/fd_snapin_tile.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,7 @@ handle_control_frag( fd_snapin_tile_t * ctx,
620620
ctx->state = FD_SNAPSHOT_STATE_IDLE;
621621

622622
if( ctx->full ) {
623-
fd_funk_txn_remove_published( funk );
623+
fd_accdb_clear( ctx->accdb_admin );
624624
} else {
625625
fd_accdb_cancel( ctx->accdb_admin, ctx->xid );
626626
fd_funk_txn_xid_copy( ctx->xid, fd_funk_last_publish( funk ) );
@@ -637,7 +637,7 @@ handle_control_frag( fd_snapin_tile_t * ctx,
637637
}
638638
ctx->state = FD_SNAPSHOT_STATE_IDLE;
639639

640-
fd_funk_txn_xid_t incremental_xid = fd_funk_generate_xid();
640+
fd_funk_txn_xid_t incremental_xid = { .ul={ LONG_MAX, LONG_MAX } };
641641
fd_accdb_attach_child( ctx->accdb_admin, ctx->xid, &incremental_xid );
642642
fd_funk_txn_xid_copy( ctx->xid, &incremental_xid );
643643
break;

src/flamenco/accdb/Local.mk

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@ $(call add-objs,fd_accdb_admin,fd_flamenco)
55
# User API
66
$(call add-hdrs,fd_accdb_user.h fd_accdb_sync.h)
77
$(call add-objs,fd_accdb_user,fd_flamenco)
8+
9+
$(call make-unit-test,test_accdb,test_accdb,fd_flamenco fd_funk fd_util)
10+
$(call run-unit-test,test_accdb)

src/flamenco/accdb/fd_accdb_admin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,5 +443,5 @@ fd_accdb_clear( fd_accdb_admin_t * cache ) {
443443

444444
void
445445
fd_accdb_verify( fd_accdb_admin_t * admin ) {
446-
fd_funk_verify( admin->funk );
446+
FD_TEST( fd_funk_verify( admin->funk )==FD_FUNK_SUCCESS );
447447
}

src/flamenco/accdb/fd_accdb_user.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ fd_accdb_peek( fd_accdb_user_t * accdb,
218218
FD_SPIN_PAUSE();
219219
/* FIXME backoff */
220220
}
221+
if( !rec ) return NULL;
221222

222223
*peek = (fd_accdb_peek_t) {
223224
.acc = {{

0 commit comments

Comments
 (0)