@@ -591,40 +591,71 @@ after_sign( ctx_t * ctx,
591591 }
592592 }
593593
594- sign_req_t * pending = fd_signs_map_query ( ctx -> signs_map , pending_key , NULL );
595- if ( FD_UNLIKELY ( !pending ) ) FD_LOG_CRIT (( "No pending request found for key %lu" , pending_key ));
594+ sign_req_t * pending_ = fd_signs_map_query ( ctx -> signs_map , pending_key , NULL );
595+ if ( FD_UNLIKELY ( !pending_ ) ) FD_LOG_CRIT (( "No pending request found for key %lu" , pending_key ));
596+
597+ sign_req_t pending [1 ] = { * pending_ }; /* Make a copy of the pending request so we can sign_map_remove immediately. */
598+ sign_map_remove ( ctx , pending_key );
596599
600+ /* This is a pong message */
597601 if ( FD_UNLIKELY ( pending -> msg .kind == FD_REPAIR_KIND_PONG ) ) {
598602 fd_memcpy ( pending -> msg .pong .sig , ctx -> buffer , 64UL );
599603 send_packet ( ctx , stem , 1 , pending -> pong_data .peer_addr .addr , pending -> pong_data .peer_addr .port , pending -> pong_data .daddr , pending -> buf , fd_repair_sz ( & pending -> msg ), fd_frag_meta_ts_comp ( fd_tickcount () ) );
600- sign_map_remove ( ctx , pending_key );
601604 return ;
602605 }
603606
604- /* else: regular repair shred request format */
605-
607+ /* Inject the signature into the pending request */
606608 fd_memcpy ( pending -> buf + 4 , ctx -> buffer , 64UL );
607609 uint src_ip4 = 0U ;
608- fd_policy_peer_t * active = fd_policy_peer_query ( ctx -> policy , & pending -> msg .shred .to );
610+
611+ /* This is a warmup message */
612+ if ( FD_UNLIKELY ( pending -> msg .kind == FD_REPAIR_KIND_SHRED && pending -> msg .shred .slot == 0 ) ) {
613+ fd_policy_peer_t * active = fd_policy_peer_query ( ctx -> policy , & pending -> msg .shred .to );
614+ if ( FD_UNLIKELY ( active ) ) send_packet ( ctx , stem , 1 , active -> ip4 , active -> port , src_ip4 , pending -> buf , pending -> buflen , fd_frag_meta_ts_comp ( fd_tickcount () ) );
615+ else { /* This is a warmup request for a peer that is no longer active. There's no reason to pick another peer for a warmup rq, so just drop it. */ }
616+ return ;
617+ }
618+
619+ /* This is a regular repair shred request
620+
621+ TODO: any way to make this less complicated? Essentially we need to
622+ ensure we always send out any shred requests we have, because policy_next
623+ has no way to revisit a shred. But the fact that peers can drop out
624+ of the active peer list makes this complicated.
625+
626+ 1. If the peer is still there (common), it's fine.
627+ 2. If the peer is not there, we can select another peer and send the request.
628+ 3. If the peer is not there, and we have no other peers, we can add
629+ this request to the inflights table, pretend we've sent it and
630+ let the inflight timeout request it down the line.
631+ */
632+ fd_policy_peer_t * active = fd_policy_peer_query ( ctx -> policy , & pending -> msg .shred .to );
633+ int is_regular_req = pending -> msg .kind == FD_REPAIR_KIND_SHRED && pending -> msg .shred .nonce > 0 ; /* not a highest/orphan request */
609634
610635 if ( FD_UNLIKELY ( !active ) ) {
611- FD_LOG_INFO (( "Signed a message for %s, but it is no longer in the active peer list" , FD_BASE58_ENC_32_ALLOCA ( & pending -> msg .shred .to ) ));
612- /* Happens extremely rarely, so we can just pick a new peer and
613- try to resign here. */
614636 fd_pubkey_t const * new_peer = fd_policy_peer_select ( ctx -> policy );
615- pending -> msg .shred .to = * new_peer ;
616- sign_map_remove ( ctx , pending_key );
617- fd_signs_queue_push ( ctx -> sign_queue , (sign_pending_t ){ .msg = pending -> msg } );
637+ if ( FD_LIKELY ( new_peer ) ) {
638+ /* We have a new peer, so we can send the request */
639+ pending -> msg .shred .to = * new_peer ;
640+ fd_signs_queue_push ( ctx -> sign_queue , (sign_pending_t ){ .msg = pending -> msg } );
641+ }
642+
643+ if ( FD_UNLIKELY ( !new_peer && is_regular_req ) ) {
644+ /* This is real devastation - we clearly had a peer at the time of
645+ making this request, but for some reason we now have ZERO
646+ peers. The only thing we can do is to add this artificially to
647+ the inflights table, pretend we've sent it and let the inflight
648+ timeout request it down the line. */
649+ fd_inflights_request_insert ( ctx -> inflight , pending -> msg .shred .nonce , & pending -> msg .shred .to , pending -> msg .shred .slot , pending -> msg .shred .shred_idx );
650+ }
618651 return ;
619652 }
620-
621- int is_regular_request = pending -> msg .kind != FD_REPAIR_KIND_PONG && pending -> msg .shred .nonce > 0 ;
622- if ( FD_LIKELY ( is_regular_request && pending -> msg .kind == FD_REPAIR_KIND_SHRED ) ) {
653+ /* Happy path - all is well, our peer didn't drop out from beneath us. */
654+ if ( FD_LIKELY ( is_regular_req ) ) {
623655 fd_inflights_request_insert ( ctx -> inflight , pending -> msg .shred .nonce , & pending -> msg .shred .to , pending -> msg .shred .slot , pending -> msg .shred .shred_idx );
624656 fd_policy_peer_request_update ( ctx -> policy , & pending -> msg .shred .to );
625657 }
626658 send_packet ( ctx , stem , 1 , active -> ip4 , active -> port , src_ip4 , pending -> buf , pending -> buflen , fd_frag_meta_ts_comp ( fd_tickcount () ) );
627- sign_map_remove ( ctx , pending_key );
628659}
629660
630661static inline void
@@ -850,6 +881,7 @@ after_frag( ctx_t * ctx,
850881 }
851882 }
852883 /* update metrics */
884+ ctx -> metrics -> repaired_slots = fd_forest_highest_repaired_slot ( ctx -> forest );
853885 return ;
854886 }
855887
@@ -898,9 +930,16 @@ after_credit( ctx_t * ctx,
898930 fd_forest_blk_t * blk = fd_forest_query ( ctx -> forest , slot );
899931 if ( FD_UNLIKELY ( !fd_forest_blk_idxs_test ( blk -> idxs , shred_idx ) ) ) {
900932 fd_pubkey_t const * peer = fd_policy_peer_select ( ctx -> policy );
901- fd_repair_msg_t * msg = fd_repair_shred ( ctx -> protocol , peer , (ulong )((ulong )now / 1e6L ), (uint )nonce , slot , shred_idx );
902- fd_repair_send_sign_request ( ctx , sign_out , msg , NULL );
903- return ;
933+ if ( FD_UNLIKELY ( !peer ) ) {
934+ /* No peers. But we CANNOT lose this request. */
935+ /* Add this request to the inflights table, pretend we've sent it and let the inflight timeout request it down the line. */
936+ fd_hash_t hash = { .ul [0 ] = 0 };
937+ fd_inflights_request_insert ( ctx -> inflight , nonce , & hash , slot , shred_idx );
938+ } else {
939+ fd_repair_msg_t * msg = fd_repair_shred ( ctx -> protocol , peer , (ulong )((ulong )now / 1e6L ), (uint )nonce , slot , shred_idx );
940+ fd_repair_send_sign_request ( ctx , sign_out , msg , NULL );
941+ return ;
942+ }
904943 }
905944 }
906945
@@ -1115,8 +1154,6 @@ populate_allowed_fds( fd_topo_t const * topo FD_PARAM_UNUSED,
11151154
11161155static inline void
11171156metrics_write ( ctx_t * ctx ) {
1118- ctx -> metrics -> repaired_slots = fd_forest_highest_repaired_slot ( ctx -> forest );
1119-
11201157 FD_MCNT_SET ( REPAIR , CURRENT_SLOT , ctx -> metrics -> current_slot );
11211158 FD_MCNT_SET ( REPAIR , REPAIRED_SLOTS , ctx -> metrics -> repaired_slots );
11221159 FD_MCNT_SET ( REPAIR , REQUEST_PEERS , fd_peer_pool_used ( ctx -> policy -> peers .pool ) );
0 commit comments