Skip to content

Commit a56c85c

Browse files
committed
repair: handling tricky edge cases with dropping peers, and being at head of turbine, invalidating iterator on publish
1 parent fbff822 commit a56c85c

File tree

6 files changed

+131
-54
lines changed

6 files changed

+131
-54
lines changed

src/discof/forest/fd_forest.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ requests_remove( fd_forest_t * forest, ulong pool_idx ) {
149149
fd_forest_ref_t * pool = fd_forest_reqspool( forest );
150150
fd_forest_ref_t * ele;
151151
if( FD_LIKELY( ele = fd_forest_requests_ele_remove( requests, &pool_idx, NULL, pool ) ) ) {
152+
/* invalidate the iterator if it is on the removed slot. */
153+
if( FD_UNLIKELY( forest->iter.ele_idx == pool_idx ) ) {
154+
forest->iter.ele_idx = ULONG_MAX;
155+
}
152156
fd_forest_reqslist_ele_remove( fd_forest_reqslist( forest ), ele, pool );
153157
fd_forest_reqspool_ele_release( pool, ele );
154158
}
@@ -632,8 +636,7 @@ fd_forest_blk_insert( fd_forest_t * forest, ulong slot, ulong parent_slot ) {
632636
fd_forest_frontier_ele_query( frontier, &ele->slot, NULL, pool ) ) ) {
633637
/* There is a chance that we connected this ele to the main tree. If
634638
this ele doesn't have a parent in the consumed/requests map, add it to the
635-
consumed/requests map. If there are no requests in the deque though
636-
(common case after catchup), don't even bother iterating. */
639+
consumed/requests map. */
637640
ulong ancestor = fd_forest_pool_idx( pool, ele );
638641
int has_requests_anc = 0;
639642
int has_consumed_anc = 0;
@@ -760,6 +763,7 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
760763
fd_forest_orphaned_t * orphaned = fd_forest_orphaned( forest );
761764
fd_forest_frontier_t * frontier = fd_forest_frontier( forest );
762765
fd_forest_subtrees_t * subtrees = fd_forest_subtrees( forest );
766+
fd_forest_ref_t * conspool = fd_forest_conspool( forest );
763767
fd_forest_blk_t * pool = fd_forest_pool( forest );
764768
ulong null = fd_forest_pool_idx_null( pool );
765769
ulong * queue = fd_forest_deque( forest );
@@ -849,7 +853,7 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
849853
In that case we need to continue repairing from the new root, so
850854
add it to the consumed map. */
851855

852-
if( FD_UNLIKELY( fd_forest_conspool_used( fd_forest_conspool( forest ) ) == 0 ) ) {
856+
if( FD_UNLIKELY( fd_forest_conslist_is_empty( fd_forest_conslist( forest ), conspool ) ) ) {
853857
consumed_insert( forest, fd_forest_pool_idx( pool, new_root_ele ) );
854858
requests_insert( forest, fd_forest_pool_idx( pool, new_root_ele ) );
855859
new_root_ele->complete_idx = 0;
@@ -956,14 +960,23 @@ fd_forest_iter_next( fd_forest_t * forest ) {
956960
requests_insert( forest, fd_forest_pool_idx( pool, child ) );
957961
child = fd_forest_pool_ele_const( pool, child->sibling );
958962
}
959-
requests_remove( forest, iter->ele_idx ); /* remove finished slot from head of requests deque */
960-
if( FD_UNLIKELY( ele->complete_idx == UINT_MAX ) ) {
961-
/* if we just made a highest_window_idx request, add this slot back to the requests deque at the end */
963+
/* So annoying: can't call requests_remove because it'll invalidate the current iter->ele_idx,
964+
so we explicitly pop the head and free the ele here. */
965+
fd_forest_ref_t * head = fd_forest_reqslist_ele_pop_head( fd_forest_reqslist( forest ), reqspool );
966+
fd_forest_requests_ele_remove ( fd_forest_requests( forest ), &head->idx, NULL, reqspool );
967+
fd_forest_reqspool_ele_release( reqspool, head );
968+
969+
if( FD_UNLIKELY( iter->shred_idx == UINT_MAX && ( ele->buffered_idx == UINT_MAX || ele->buffered_idx < ele->complete_idx ) ) ) {
970+
/* If we just made a highest_window_idx request, add this slot
971+
back to the requests deque at the end. Also condition on
972+
whether or not this slot is still incomplete. If the slot
973+
is complete and we add it back to the loop, we will end up
974+
infinite looping. */
962975
requests_insert( forest, iter->ele_idx );
963976
}
964977
}
965978

966-
/* move onto the next slot */
979+
/* Move onto the next slot */
967980
if( FD_UNLIKELY( fd_forest_reqslist_is_empty( reqslist, reqspool ) ) ) {
968981
iter->ele_idx = fd_forest_pool_idx_null( pool );
969982
iter->shred_idx = UINT_MAX;

src/discof/forest/test_forest.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,25 @@ test_iter_publish( fd_wksp_t * wksp ) {
862862

863863
}
864864

865+
void
866+
test_iter_caught_up( fd_wksp_t * wksp ) {
867+
ulong ele_max = 8;
868+
void * mem = fd_wksp_alloc_laddr( wksp, fd_forest_align(), fd_forest_footprint( ele_max ), 1UL );
869+
FD_TEST( mem );
870+
fd_forest_t * forest = fd_forest_join( fd_forest_new( mem, ele_max, 42UL /* seed */ ) );
871+
872+
fd_forest_init( forest, 0 );
873+
fd_forest_blk_fec_insert ( forest, 1, 0, 0, 0, 1 );
874+
fd_forest_blk_fec_insert ( forest, 2, 1, 0, 0, 1 ); /* fully caught up */
875+
fd_forest_blk_data_shred_insert( forest, 3, 2, 0, 0, 0, 0 );
876+
877+
for( int i = 0; i < 10; i++ ) {
878+
fd_forest_iter_next( forest );
879+
FD_LOG_NOTICE(("iter: slot %lu, idx %u", idx_slot( forest, forest->iter.ele_idx ), forest->iter.shred_idx));
880+
}
881+
882+
}
883+
865884

866885
int
867886
main( int argc, char ** argv ) {
@@ -873,18 +892,19 @@ main( int argc, char ** argv ) {
873892
fd_wksp_t * wksp = fd_wksp_new_anonymous( fd_cstr_to_shmem_page_sz( page_sz ), page_cnt, fd_shmem_cpu_idx( numa_idx ), "wksp", 0UL );
874893
FD_TEST( wksp );
875894

876-
test_invalid_frontier_insert( wksp );
877-
test_publish( wksp );
878-
test_publish_incremental( wksp );
879-
test_out_of_order( wksp );
880-
test_forks( wksp );
881-
test_print_tree( wksp );
882-
// test_large_print_tree( wksp);
883-
test_linear_forest_iterator( wksp );
884-
test_branched_forest_iterator( wksp );
885-
test_frontier( wksp );
886-
test_fec_clear( wksp );
887-
test_iter_publish( wksp );
895+
//test_invalid_frontier_insert( wksp );
896+
//test_publish( wksp );
897+
//test_publish_incremental( wksp );
898+
//test_out_of_order( wksp );
899+
//test_forks( wksp );
900+
//test_print_tree( wksp );
901+
//// test_large_print_tree( wksp);
902+
//test_linear_forest_iterator( wksp );
903+
//test_branched_forest_iterator( wksp );
904+
//test_frontier( wksp );
905+
//test_fec_clear( wksp );
906+
//test_iter_publish( wksp );
907+
test_iter_caught_up( wksp );
888908

889909
fd_halt();
890910
return 0;

src/discof/repair/fd_inflight.h

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
#include "../../flamenco/types/fd_types.h"
55

66
/* fd_inflights tracks repair requests that are inflight to other
7-
validators. This module is not necessary for the repair protocol and
8-
strategy, but is useful for metrics and reporting. Incorrect updates
9-
and removals from this module are non-critical. Requests are key-ed
10-
by nonce as in the current strategy (see fd_policy.h), all requests
11-
have a unique nonce. The chances that an inflight request does not
12-
get a response are non-negligible due to shred tile upstream deduping
13-
duplicates. */
7+
validators. This module is useful for metrics and reporting.
8+
Inexact updates of orphan requests and highest window requests from
9+
this module are non-critical, but exact updates of shred requests are
10+
critical. Repair tile relies on this module to be able to re-request
11+
any shreds that it has sent, because policy next does not request any
12+
shred twice.
13+
(TODO should this be rolled into policy.h?)
14+
15+
Requests are key-ed by nonce as in the current strategy (see
16+
fd_policy.h), all requests have a unique nonce. The chances that an
17+
inflight request does not get a response are non-negligible due to
18+
shred tile upstream deduping duplicates. */
1419

1520
/* Max number of pending requests */
1621
#define FD_INFLIGHT_REQ_MAX (1<<20)
@@ -96,7 +101,7 @@ fd_inflights_should_drain( fd_inflights_t * table, long now ) {
96101
if( FD_UNLIKELY( fd_inflight_dlist_is_empty( table->dlist, table->pool ) ) ) return 0;
97102

98103
fd_inflight_t * inflight_req = fd_inflight_dlist_ele_peek_head( table->dlist, table->pool );
99-
if( FD_UNLIKELY( inflight_req->timestamp_ns + 100e6L < now ) ) return 1;
104+
if( FD_UNLIKELY( inflight_req->timestamp_ns + 90e6L < now ) ) return 1;
100105
return 0;
101106
}
102107

src/discof/repair/fd_policy.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ fd_policy_peer_select( fd_policy_t * policy ) {
164164
fd_peer_dlist_t * worst_dlist = policy->peers.slow;
165165
fd_peer_t * pool = policy->peers.pool;
166166

167+
if( FD_UNLIKELY( fd_peer_pool_used( policy->peers.pool ) == 0 ) ) return NULL;
168+
167169
fd_peer_dlist_t * dlist = bucket_stages[policy->peers.select.stage] == FD_POLICY_LATENCY_FAST ? best_dlist : worst_dlist;
168170

169171
while( FD_UNLIKELY( fd_peer_dlist_iter_done( policy->peers.select.iter, dlist, pool ) ) ) {
@@ -235,10 +237,10 @@ fd_policy_next( fd_policy_t * policy, fd_forest_t * forest, fd_repair_t * repair
235237
means that the shred_idx of the iterf is likely to be UINT_MAX,
236238
which means calling fd_forest_iter_next will advance the iterf
237239
to the next slot. */
238-
//forest->iter.shred_idx = UINT_MAX; // heinous... i'm sorry
239-
//fd_forest_iter_next( forest );
240-
//if( FD_UNLIKELY( fd_forest_iter_done( forest->iter, forest ) ) ) break;
241-
//continue;
240+
forest->iter.shred_idx = UINT_MAX;
241+
/* TODO: Heinous... I'm sorry. Easiest way to ensure this slot gets added back to the requests deque.
242+
but maybe there should be an explicit API for it. */
243+
return NULL;
242244
}
243245

244246
if( FD_UNLIKELY( forest->iter.shred_idx == UINT_MAX ) ) {
@@ -300,7 +302,7 @@ fd_policy_peer_remove( fd_policy_t * policy, fd_pubkey_t const * key ) {
300302

301303
if( FD_UNLIKELY( policy->peers.select.iter == fd_peer_pool_idx( policy->peers.pool, peer_ele ) ) ) {
302304
/* In general removal during iteration is safe, except when the iterator is on the peer to be removed. */
303-
fd_peer_dlist_t * dlist = policy->peers.select.stage == FD_POLICY_LATENCY_FAST ? policy->peers.fast : policy->peers.slow;
305+
fd_peer_dlist_t * dlist = bucket_stages[policy->peers.select.stage] == FD_POLICY_LATENCY_FAST ? policy->peers.fast : policy->peers.slow;
304306
policy->peers.select.iter = fd_peer_dlist_iter_fwd_next( policy->peers.select.iter, dlist, policy->peers.pool );
305307
}
306308

src/discof/repair/fd_repair_metrics.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ struct fd_slot_metrics {
1818
};
1919
typedef struct fd_slot_metrics fd_slot_metrics_t;
2020

21-
#define FD_CATCHUP_METRICS_MAX 256
21+
#define FD_CATCHUP_METRICS_MAX 16384
2222

2323
struct fd_repair_metrics_t {
2424
fd_slot_metrics_t slots[ FD_CATCHUP_METRICS_MAX ];

src/discof/repair/fd_repair_tile.c

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -591,40 +591,71 @@ after_sign( ctx_t * ctx,
591591
}
592592
}
593593

594-
sign_req_t * pending = fd_signs_map_query( ctx->signs_map, pending_key, NULL );
595-
if( FD_UNLIKELY( !pending ) ) FD_LOG_CRIT(( "No pending request found for key %lu", pending_key ));
594+
sign_req_t * pending_ = fd_signs_map_query( ctx->signs_map, pending_key, NULL );
595+
sign_req_t pending[1] = { *pending_ }; /* Make a copy of the pending request so we can sign_map_remove immediately. */
596+
sign_map_remove( ctx, pending_key );
597+
598+
if( FD_UNLIKELY( !pending_ ) ) FD_LOG_CRIT(( "No pending request found for key %lu", pending_key ));
596599

600+
/* This is a pong message */
597601
if( FD_UNLIKELY( pending->msg.kind == FD_REPAIR_KIND_PONG ) ) {
598602
fd_memcpy( pending->msg.pong.sig, ctx->buffer, 64UL );
599603
send_packet( ctx, stem, 1, pending->pong_data.peer_addr.addr, pending->pong_data.peer_addr.port, pending->pong_data.daddr, pending->buf, fd_repair_sz( &pending->msg ), fd_frag_meta_ts_comp( fd_tickcount() ) );
600-
sign_map_remove( ctx, pending_key );
601604
return;
602605
}
603606

604-
/* else: regular repair shred request format */
605-
607+
/* Inject the signature into the pending request */
606608
fd_memcpy( pending->buf + 4, ctx->buffer, 64UL );
607609
uint src_ip4 = 0U;
608-
fd_policy_peer_t * active = fd_policy_peer_query( ctx->policy, &pending->msg.shred.to );
610+
611+
/* This is a warmup message */
612+
if( FD_UNLIKELY( pending->msg.kind == FD_REPAIR_KIND_SHRED && pending->msg.shred.slot == 0 ) ) {
613+
fd_policy_peer_t * active = fd_policy_peer_query( ctx->policy, &pending->msg.shred.to );
614+
if( FD_UNLIKELY( active ) ) send_packet( ctx, stem, 1, active->ip4, active->port, src_ip4, pending->buf, pending->buflen, fd_frag_meta_ts_comp( fd_tickcount() ) );
615+
else { /* This is a warmup request for a peer that is no longer active. There's no reason to pick another peer for a warmup rq, so just drop it. */ }
616+
return;
617+
}
618+
619+
/* This is a regular repair shred request
620+
621+
TODO: anyways to make this less complicated? Essentially we need to
622+
ensure we always send out any shred requests we have, because policy_next
623+
has no way to revisit a shred. But the fact that peers can drop out
624+
of the active peer list makes this complicated.
625+
626+
1. If the peer is still there (common), it's fine.
627+
2. If the peer is not there, we can select another peer and send the request.
628+
3. If the peer is not there, and we have no other peers, we can add
629+
this request to the inflights table, pretend we've sent it and
630+
let the inflight timeout request it down the line.
631+
*/
632+
fd_policy_peer_t * active = fd_policy_peer_query( ctx->policy, &pending->msg.shred.to );
633+
int is_regular_req = pending->msg.kind == FD_REPAIR_KIND_SHRED && pending->msg.shred.nonce > 0; // not a highest/orphan request
609634

610635
if( FD_UNLIKELY( !active ) ) {
611-
FD_LOG_INFO(( "Signed a message for %s, but it is no longer in the active peer list", FD_BASE58_ENC_32_ALLOCA( &pending->msg.shred.to ) ));
612-
/* Happens extremely rarely, so we can just pick a new peer and
613-
try to resign here. */
614636
fd_pubkey_t const * new_peer = fd_policy_peer_select( ctx->policy );
615-
pending->msg.shred.to = *new_peer;
616-
sign_map_remove( ctx, pending_key );
617-
fd_signs_queue_push( ctx->sign_queue, (sign_pending_t){ .msg = pending->msg } );
637+
if( FD_LIKELY( new_peer ) ) {
638+
/* We have a new peer, so we can send the request */
639+
pending->msg.shred.to = *new_peer;
640+
fd_signs_queue_push( ctx->sign_queue, (sign_pending_t){ .msg = pending->msg } );
641+
}
642+
643+
if( FD_UNLIKELY( !new_peer && is_regular_req ) ) {
644+
/* This is real devastation - we clearly had a peer at the time of
645+
making this request, but for some reason we now have ZERO
646+
peers. The only thing we can do is to add this artificially to
647+
the inflights table, pretend we've sent it and let the inflight
648+
timeout request it down the line. */
649+
fd_inflights_request_insert( ctx->inflight, pending->msg.shred.nonce, &pending->msg.shred.to, pending->msg.shred.slot, pending->msg.shred.shred_idx );
650+
}
618651
return;
619652
}
620-
621-
int is_regular_request = pending->msg.kind != FD_REPAIR_KIND_PONG && pending->msg.shred.nonce > 0;
622-
if( FD_LIKELY( is_regular_request && pending->msg.kind == FD_REPAIR_KIND_SHRED ) ) {
653+
/* Happy path - all is well, our peer didn't drop out from beneath us. */
654+
if( FD_LIKELY( is_regular_req ) ) {
623655
fd_inflights_request_insert( ctx->inflight, pending->msg.shred.nonce, &pending->msg.shred.to, pending->msg.shred.slot, pending->msg.shred.shred_idx );
624656
fd_policy_peer_request_update( ctx->policy, &pending->msg.shred.to );
625657
}
626658
send_packet( ctx, stem, 1, active->ip4, active->port, src_ip4, pending->buf, pending->buflen, fd_frag_meta_ts_comp( fd_tickcount() ) );
627-
sign_map_remove( ctx, pending_key );
628659
}
629660

630661
static inline void
@@ -850,6 +881,7 @@ after_frag( ctx_t * ctx,
850881
}
851882
}
852883
/* update metrics */
884+
ctx->metrics->repaired_slots = fd_forest_highest_repaired_slot( ctx->forest );
853885
return;
854886
}
855887

@@ -898,9 +930,16 @@ after_credit( ctx_t * ctx,
898930
fd_forest_blk_t * blk = fd_forest_query( ctx->forest, slot );
899931
if( FD_UNLIKELY( !fd_forest_blk_idxs_test( blk->idxs, shred_idx ) ) ) {
900932
fd_pubkey_t const * peer = fd_policy_peer_select( ctx->policy );
901-
fd_repair_msg_t * msg = fd_repair_shred( ctx->protocol, peer, (ulong)((ulong)now / 1e6L), (uint)nonce, slot, shred_idx );
902-
fd_repair_send_sign_request( ctx, sign_out, msg, NULL );
903-
return;
933+
if( FD_UNLIKELY( !peer ) ) {
934+
/* No peers. But we CANNOT lose this request. */
935+
/* Add this request to the inflights table, pretend we've sent it and let the inflight timeout request it down the line. */
936+
fd_hash_t hash = { .ul[0] = 0 };
937+
fd_inflights_request_insert( ctx->inflight, nonce, &hash, slot, shred_idx );
938+
} else {
939+
fd_repair_msg_t * msg = fd_repair_shred( ctx->protocol, peer, (ulong)((ulong)now / 1e6L), (uint)nonce, slot, shred_idx );
940+
fd_repair_send_sign_request( ctx, sign_out, msg, NULL );
941+
return;
942+
}
904943
}
905944
}
906945

@@ -1115,8 +1154,6 @@ populate_allowed_fds( fd_topo_t const * topo FD_PARAM_UNUSED,
11151154

11161155
static inline void
11171156
metrics_write( ctx_t * ctx ) {
1118-
ctx->metrics->repaired_slots = fd_forest_highest_repaired_slot( ctx->forest );
1119-
11201157
FD_MCNT_SET( REPAIR, CURRENT_SLOT, ctx->metrics->current_slot );
11211158
FD_MCNT_SET( REPAIR, REPAIRED_SLOTS, ctx->metrics->repaired_slots );
11221159
FD_MCNT_SET( REPAIR, REQUEST_PEERS, fd_peer_pool_used( ctx->policy->peers.pool ) );

0 commit comments

Comments
 (0)