Skip to content

Commit 87afbc8

Browse files
snapshots: lthash tile
1 parent 918a06b commit 87afbc8

28 files changed

+995
-115
lines changed

book/api/metrics-generated.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,18 @@
887887

888888
</div>
889889

890+
## Snaplt Tile
891+
892+
<div class="metrics">
893+
894+
| Metric | Type | Description |
895+
|--------|------|-------------|
896+
| <span class="metrics-name">snaplt_&#8203;state</span> | gauge | State of the tile. 0=hashing, 1=done, 2=shutdown |
897+
| <span class="metrics-name">snaplt_&#8203;full_&#8203;accounts_&#8203;hashed</span> | gauge | Number of accounts hashed for the full snapshot during snapshot loading. May decrease if the snapshot load is aborted and restarted. |
898+
| <span class="metrics-name">snaplt_&#8203;incremental_&#8203;accounts_&#8203;hashed</span> | gauge | Number of accounts hashed for the incremental snapshot during snapshot loading. May decrease if the snapshot load is aborted and restarted. |
899+
900+
</div>
901+
890902
## Ipecho Tile
891903

892904
<div class="metrics">

src/app/firedancer-dev/commands/backtest.c

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "../../../disco/metrics/fd_metrics.h"
2424
#include "../../../util/pod/fd_pod_format.h"
2525
#include "../../../discof/restore/utils/fd_ssmsg.h"
26+
#include "../../../discof/restore/utils/fd_ssctrl.h"
2627
#include "../../../discof/tower/fd_tower_tile.h"
2728
#include "../../../discof/reasm/fd_reasm.h"
2829
#include "../../../discof/replay/fd_exec.h" /* FD_RUNTIME_PUBLIC_ACCOUNT_UPDATE_MSG_MTU */
@@ -38,6 +39,8 @@ static void
3839
backtest_topo( config_t * config ) {
3940
ulong exec_tile_cnt = config->firedancer.layout.exec_tile_count;
4041
ulong writer_tile_cnt = config->firedancer.layout.writer_tile_count;
42+
ulong snaplt_tile_cnt = config->firedancer.layout.snaplt_tile_count;
43+
int snaplt_disabled = config->development.snapshots.disable_lthash_verification;
4144

4245
int disable_snap_loader = !config->gossip.entrypoints_cnt;
4346
int solcap_enabled = strlen( config->capture.solcap_capture )>0;
@@ -100,6 +103,14 @@ backtest_topo( config_t * config ) {
100103
snaprd_tile->allow_shutdown = 1;
101104
snapdc_tile->allow_shutdown = 1;
102105
snapin_tile->allow_shutdown = 1;
106+
107+
if( FD_LIKELY( !snaplt_disabled ) ) {
108+
fd_topob_wksp( topo, "snaplt" );
109+
for( ulong i=0UL; i<snaplt_tile_cnt; i++ ) {
110+
fd_topo_tile_t * snaplt_tile = fd_topob_tile( topo, "snaplt", "snaplt", "metric_in", cpu_idx++, 0, 0 );
111+
snaplt_tile->allow_shutdown = 1;
112+
}
113+
}
103114
} else {
104115
fd_topob_wksp( topo, "genesi" );
105116
fd_topob_tile( topo, "genesi", "genesi", "metric_in", cpu_idx++, 0, 0 )->allow_shutdown = 1;
@@ -128,7 +139,12 @@ backtest_topo( config_t * config ) {
128139
fd_topob_wksp( topo, "snapdc_rd" );
129140
fd_topob_wksp( topo, "snapin_rd" );
130141
fd_topob_wksp( topo, "snap_out" );
131-
fd_topob_wksp( topo, "replay_manif" );
142+
143+
if( FD_LIKELY( !snaplt_disabled ) ) {
144+
fd_topob_wksp( topo, "snapin_lt" );
145+
fd_topob_wksp( topo, "snaplt_out" );
146+
fd_topob_wksp( topo, "snaplt_rd" );
147+
}
132148
/* TODO: Should be depth of 1 or 2, not 4, but it causes backpressure
133149
from the replay tile parsing the manifest, remove when this is
134150
fixed. */
@@ -139,6 +155,12 @@ backtest_topo( config_t * config ) {
139155
fd_topob_link( topo, "snapdc_rd", "snapdc_rd", 128UL, 0UL, 1UL );
140156
fd_topob_link( topo, "snapin_rd", "snapin_rd", 128UL, 0UL, 1UL );
141157

158+
if( FD_LIKELY( !snaplt_disabled ) ) {
159+
fd_topob_link( topo, "snapin_lt", "snapin_lt", 128UL, sizeof(fd_snapshot_existing_account_t), 1UL );
160+
FOR(snaplt_tile_cnt) fd_topob_link( topo, "snaplt_out", "snaplt_out", 128UL, 2048UL, 1UL );
161+
FOR(snaplt_tile_cnt) fd_topob_link( topo, "snaplt_rd", "snaplt_rd", 128UL, 0UL, 1UL );
162+
}
163+
142164
fd_topob_tile_out( topo, "snaprd", 0UL, "snap_zstd", 0UL );
143165
fd_topob_tile_in ( topo, "snapdc", 0UL, "metric_in", "snap_zstd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
144166
fd_topob_tile_out( topo, "snapdc", 0UL, "snap_stream", 0UL );
@@ -150,6 +172,15 @@ backtest_topo( config_t * config ) {
150172
fd_topob_tile_out( topo, "snapdc", 0UL, "snapdc_rd", 0UL );
151173
fd_topob_tile_in( topo, "snaprd", 0UL, "metric_in", "snapin_rd", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
152174
fd_topob_tile_out( topo, "snapin", 0UL, "snapin_rd", 0UL );
175+
176+
if( FD_LIKELY( !snaplt_disabled ) ) {
177+
fd_topob_tile_out( topo, "snapin", 0UL, "snapin_lt", 0UL );
178+
FOR(snaplt_tile_cnt) fd_topob_tile_in ( topo, "snapin", 0UL, "metric_in", "snaplt_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
179+
FOR(snaplt_tile_cnt) fd_topob_tile_in ( topo, "snaplt", i, "metric_in", "snapin_lt", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
180+
FOR(snaplt_tile_cnt) fd_topob_tile_in ( topo, "snaprd", 0UL, "metric_in", "snaplt_rd", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
181+
FOR(snaplt_tile_cnt) fd_topob_tile_out( topo, "snaplt", i, "snaplt_out", i );
182+
FOR(snaplt_tile_cnt) fd_topob_tile_out( topo, "snaplt", i, "snaplt_rd", i );
183+
}
153184
} else {
154185
fd_topob_wksp( topo, "genesi_out" );
155186
fd_topob_link( topo, "genesi_out", "genesi_out", 2UL, 10UL*1024UL*1024UL+32UL+sizeof(fd_lthash_value_t), 1UL );
@@ -302,14 +333,7 @@ backtest_topo( config_t * config ) {
302333
}
303334

304335
if( FD_LIKELY( !disable_snap_loader ) ) {
305-
/* Replay decoded manifest dcache topo obj */
306-
fd_topo_obj_t * replay_manifest_dcache = fd_topob_obj( topo, "dcache", "replay_manif" );
307-
fd_pod_insertf_ulong( topo->props, 2UL << 30UL, "obj.%lu.data_sz", replay_manifest_dcache->id );
308-
fd_pod_insert_ulong( topo->props, "manifest_dcache", replay_manifest_dcache->id );
309-
310336
fd_topob_tile_uses( topo, snapin_tile, funk_obj, FD_SHMEM_JOIN_MODE_READ_WRITE );
311-
fd_topob_tile_uses( topo, snapin_tile, replay_manifest_dcache, FD_SHMEM_JOIN_MODE_READ_WRITE );
312-
fd_topob_tile_uses( topo, replay_tile, replay_manifest_dcache, FD_SHMEM_JOIN_MODE_READ_ONLY );
313337
}
314338

315339
for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
@@ -394,23 +418,51 @@ backtest_cmd_fn( args_t * args FD_PARAM_UNUSED,
394418
ulong volatile * const snapdc_metrics = fd_metrics_tile( snapdc_tile->metrics );
395419
ulong volatile * const snapin_metrics = fd_metrics_tile( snapin_tile->metrics );
396420

421+
ulong volatile * snaplt_metrics[ FD_MAX_SNAPLT_TILES ];
422+
ulong snaplt_tile_cnt = fd_topo_tile_name_cnt( topo, "snaplt" );
423+
424+
for( ulong i=0UL; i<snaplt_tile_cnt; i++ ) {
425+
ulong snaplt_tile_idx = fd_topo_find_tile( topo, "snaplt", i );
426+
FD_TEST( snaplt_tile_idx!=ULONG_MAX );
427+
fd_topo_tile_t * snaplt_tile = &topo->tiles[ snaplt_tile_idx ];
428+
snaplt_metrics[ i ] = fd_metrics_tile( snaplt_tile->metrics );
429+
}
430+
397431
ulong total_off_old = 0UL;
398432
ulong snaprd_backp_old = 0UL;
399433
ulong snaprd_wait_old = 0UL;
400434
ulong snapdc_backp_old = 0UL;
401435
ulong snapdc_wait_old = 0UL;
402436
ulong snapin_backp_old = 0UL;
403437
ulong snapin_wait_old = 0UL;
438+
ulong snaplt_backp_old = 0UL;
439+
ulong snaplt_wait_old = 0UL;
404440
ulong acc_cnt_old = 0UL;
405441
sleep( 1 );
406-
puts( "-------------backp=(snaprd,snapdc,snapin) busy=(snaprd,snapdc,snapin)---------------" );
442+
puts( "" );
443+
puts( "Columns:" );
444+
puts( "- bw: Uncompressed bandwidth" );
445+
puts( "- backp: Backpressured by downstream tile" );
446+
puts( "- stall: Waiting on upstream tile" );
447+
puts( "- acc: Number of accounts" );
448+
puts( "" );
449+
puts( "-------------backp=(snaprd,snapdc,snapin,snaplt) busy=(snaprd,snapdc,snapin,snaplt)---------------" );
407450
long next = start+1000L*1000L*1000L;
408451
for(;;) {
409452
ulong snaprd_status = FD_VOLATILE_CONST( snaprd_metrics[ MIDX( GAUGE, TILE, STATUS ) ] );
410453
ulong snapdc_status = FD_VOLATILE_CONST( snapdc_metrics[ MIDX( GAUGE, TILE, STATUS ) ] );
411454
ulong snapin_status = FD_VOLATILE_CONST( snapin_metrics[ MIDX( GAUGE, TILE, STATUS ) ] );
455+
ulong snaplt_status = ULONG_MAX;
456+
457+
ulong snaplt_status_sum = 0UL;
458+
for( ulong i=0UL; i<snaplt_tile_cnt; i++ ) {
459+
ulong snaplt_status = FD_VOLATILE_CONST( snaplt_metrics[ i ][ MIDX( GAUGE, TILE, STATUS ) ] );
460+
snaplt_status_sum += snaplt_status;
461+
}
462+
if( FD_UNLIKELY( snaplt_status_sum==2UL*snaplt_tile_cnt ) ) snaplt_status = 2UL;
463+
else snaplt_status = snaplt_tile_cnt>0UL ? 1UL : 2UL;
412464

413-
if( FD_UNLIKELY( snaprd_status==2UL && snapdc_status==2UL && snapin_status == 2UL ) ) break;
465+
if( FD_UNLIKELY( snaprd_status==2UL && snapdc_status==2UL && snapin_status == 2UL && snaplt_status==2UL ) ) break;
414466

415467
long cur = fd_log_wallclock();
416468
if( FD_UNLIKELY( cur<next ) ) {
@@ -430,16 +482,31 @@ backtest_cmd_fn( args_t * args FD_PARAM_UNUSED,
430482
ulong snapin_backp = snapin_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ];
431483
ulong snapin_wait = snapin_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] +
432484
snapin_metrics[ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] + snapin_backp;
485+
ulong snaplt_backp = 0UL;
486+
ulong snaplt_wait = 0UL;
433487

434-
ulong acc_cnt = snapin_metrics[ MIDX( GAUGE, SNAPIN, ACCOUNTS_INSERTED ) ];
435-
printf( "bw=%4.0f MB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%) acc=%3.1f M/s\n",
488+
for( ulong i=0UL; i<snaplt_tile_cnt; i++ ) {
489+
snaplt_backp += snaplt_metrics[ i ][ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_BACKPRESSURE_PREFRAG ) ];
490+
}
491+
for( ulong i=0UL; i<snaplt_tile_cnt; i++ ) {
492+
snaplt_wait += snaplt_metrics[ i ][ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_PREFRAG ) ] +
493+
snaplt_metrics[ i ][ MIDX( COUNTER, TILE, REGIME_DURATION_NANOS_CAUGHT_UP_POSTFRAG ) ] + snaplt_backp;
494+
}
495+
496+
double snaplt_backp_val = snaplt_tile_cnt ? ((double)(snaplt_backp-snaplt_backp_old)*ns_per_tick )/1e7/(double)snaplt_tile_cnt : 0.0;
497+
double snaplt_busy_val = snaplt_tile_cnt ? 100-(((double)(snaplt_wait-snaplt_wait_old)*ns_per_tick)/1e7/(double)snaplt_tile_cnt) : 0.0;
498+
499+
ulong acc_cnt = snapin_metrics[ MIDX( GAUGE, SNAPIN, ACCOUNTS_INSERTED ) ];
500+
printf( "bw=%4.0f MB/s backp=(%3.0f%%,%3.0f%%,%3.0f%%,%3.0f%%) busy=(%3.0f%%,%3.0f%%,%3.0f%%,%3.0f%%) acc=%3.1f M/s\n",
436501
(double)( total_off-total_off_old )/1e6,
437502
( (double)( snaprd_backp-snaprd_backp_old )*ns_per_tick )/1e7,
438503
( (double)( snapdc_backp-snapdc_backp_old )*ns_per_tick )/1e7,
439504
( (double)( snapin_backp-snapin_backp_old )*ns_per_tick )/1e7,
505+
snaplt_backp_val,
440506
100-( ( (double)( snaprd_wait-snaprd_wait_old )*ns_per_tick )/1e7 ),
441507
100-( ( (double)( snapdc_wait-snapdc_wait_old )*ns_per_tick )/1e7 ),
442508
100-( ( (double)( snapin_wait-snapin_wait_old )*ns_per_tick )/1e7 ),
509+
snaplt_busy_val,
443510
(double)( acc_cnt-acc_cnt_old )/1e6 );
444511
fflush( stdout );
445512
total_off_old = total_off;
@@ -449,12 +516,13 @@ backtest_cmd_fn( args_t * args FD_PARAM_UNUSED,
449516
snapdc_wait_old = snapdc_wait;
450517
snapin_backp_old = snapin_backp;
451518
snapin_wait_old = snapin_wait;
519+
snaplt_backp_old = snaplt_backp;
520+
snaplt_wait_old = snaplt_wait;
452521
acc_cnt_old = acc_cnt;
453522

454523
next+=1000L*1000L*1000L;
455524
}
456525
}
457-
458526
for(;;) pause();
459527
}
460528

0 commit comments

Comments
 (0)