From 41f94d7a7382b04acb96254cc6babde3fb2dddc5 Mon Sep 17 00:00:00 2001 From: gmega Date: Thu, 5 Jun 2025 15:30:08 -0300 Subject: [PATCH 01/20] feat: add dataset request batching --- codex/blockexchange/engine/discovery.nim | 2 + codex/blockexchange/engine/engine.nim | 64 +++++++++++++++++--- codex/blockexchange/engine/pendingblocks.nim | 5 +- codex/blockexchange/peers/peercontext.nim | 7 +++ codex/node.nim | 19 +++--- codex/stores/networkstore.nim | 20 ++++++ 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/codex/blockexchange/engine/discovery.nim b/codex/blockexchange/engine/discovery.nim index b32b855574..6288ceae06 100644 --- a/codex/blockexchange/engine/discovery.nim +++ b/codex/blockexchange/engine/discovery.nim @@ -78,6 +78,8 @@ proc discoveryTaskLoop(b: DiscoveryEngine) {.async: (raises: []).} = trace "Discovery request already in progress", cid continue + trace "Running discovery task for cid", cid + let haves = b.peers.peersHave(cid) if haves.len < b.minPeersPerBlock: diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 36d00cf0be..a988d9eda0 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -154,6 +154,28 @@ proc sendWantBlock( ) # we want this remote to send us a block codex_block_exchange_want_block_lists_sent.inc() +proc refreshBlockKnowledge(self: BlockExcEngine, peer: BlockExcPeerCtx) {.async: (raises: [CancelledError]).} = + # broadcast our want list, the other peer will do the same + if self.pendingBlocks.wantListLen > 0: + let cids = toSeq(self.pendingBlocks.wantList) + trace "Sending our want list to a peer", peer = peer.id, length = cids.len + await self.network.request.sendWantList(peer.id, cids, full = true) + +proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledError]).} = + for peer in self.peers.peers.values: + # We refresh block knowledge if: + # 1. the peer hasn't been refreshed in a while; + # 2. the list of blocks we care about has actually changed. + # + # Note that because of (2), it is important that we update our + # want list in the coarsest way possible instead of over many + # small updates. + # + # In dynamic swarms, staleness will dominate latency. + if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale: + await self.refreshBlockKnowledge(peer) + peer.refreshed() + proc randomPeer(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = Rng.instance.sample(peers) @@ -189,7 +211,7 @@ proc downloadInternal( else: self.pendingBlocks.setInFlight(address, false) if peers.without.len > 0: - await self.sendWantHave(@[address], peers.without) + await self.refreshBlockKnowledge() self.discovery.queueFindBlocksReq(@[address.cidOrTreeCid]) await (handle or sleepAsync(self.pendingBlocks.retryInterval)) @@ -209,6 +231,32 @@ proc downloadInternal( finally: self.pendingBlocks.setInFlight(address, false) +proc requestBlocks*(self: BlockExcEngine, addresses: seq[BlockAddress]): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = + var handles: seq[BlockHandle] + # Adds all blocks to pendingBlocks before calling the first downloadInternal. This will + # ensure that we don't send incomplete want lists. 
+ for address in addresses: + if address notin self.pendingBlocks: + handles.add(self.pendingBlocks.getWantHandle(address)) + + for address in addresses: + self.trackedFutures.track(self.downloadInternal(address)) + + # TODO: we can reduce latency and improve download times + # by returning blocks out of order as futures complete. + var blocks: seq[?!Block] + for handle in handles: + try: + blocks.add(success await handle) + except CancelledError as err: + warn "Block request cancelled", addresses, err = err.msg + raise err + except CatchableError as err: + error "Error getting blocks from exchange engine", addresses, err = err.msg + blocks.add(Block.failure err) + + return blocks + proc requestBlock*( self: BlockExcEngine, address: BlockAddress ): Future[?!Block] {.async: (raises: [CancelledError]).} = @@ -233,7 +281,7 @@ proc requestBlock*( proc blockPresenceHandler*( self: BlockExcEngine, peer: PeerId, blocks: seq[BlockPresence] ) {.async: (raises: []).} = - trace "Received block presence from peer", peer, blocks = blocks.mapIt($it) + trace "Received block presence from peer", peer, len = blocks.len let peerCtx = self.peers.get(peer) ourWantList = toSeq(self.pendingBlocks.wantList) @@ -476,12 +524,14 @@ proc wantListHandler*( case e.wantType of WantType.WantHave: if have: + trace "We HAVE the block", address = e.address presence.add( BlockPresence( address: e.address, `type`: BlockPresenceType.Have, price: price ) ) else: + trace "We DON'T HAVE the block", address = e.address if e.sendDontHave: presence.add( BlockPresence( @@ -554,15 +604,11 @@ proc setupPeer*( trace "Setting up peer", peer if peer notin self.peers: + let peerCtx = BlockExcPeerCtx(id: peer) trace "Setting up new peer", peer - self.peers.add(BlockExcPeerCtx(id: peer)) + self.peers.add(peerCtx) trace "Added peer", peers = self.peers.len - - # broadcast our want list, the other peer will do the same - if self.pendingBlocks.wantListLen > 0: - trace "Sending our want list to a peer", peer - let cids = toSeq(self.pendingBlocks.wantList) - await self.network.request.sendWantList(peer, cids, full = true) + await self.refreshBlockKnowledge(peerCtx) if address =? self.pricing .? 
address: trace "Sending account to peer", peer diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim index f169f744c9..bb4ac2d1e2 100644 --- a/codex/blockexchange/engine/pendingblocks.nim +++ b/codex/blockexchange/engine/pendingblocks.nim @@ -34,7 +34,7 @@ declareGauge( const DefaultBlockRetries* = 3000 - DefaultRetryInterval* = 500.millis + DefaultRetryInterval* = 10.seconds type RetriesExhaustedError* = object of CatchableError @@ -50,6 +50,7 @@ type blockRetries*: int = DefaultBlockRetries retryInterval*: Duration = DefaultRetryInterval blocks*: Table[BlockAddress, BlockReq] # pending Block requests + lastInclusion*: Moment # time at which we last included a block into our wantlist proc updatePendingBlockGauge(p: PendingBlocksManager) = codex_block_exchange_pending_block_requests.set(p.blocks.len.int64) @@ -70,6 +71,8 @@ proc getWantHandle*( startTime: getMonoTime().ticks, ) self.blocks[address] = blk + self.lastInclusion = Moment.now() + let handle = blk.handle proc cleanUpBlock(data: pointer) {.raises: [].} = diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index 7a299b6b32..aeb83de7af 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -31,9 +31,16 @@ type BlockExcPeerCtx* = ref object of RootObj peerWants*: seq[WantListEntry] # remote peers want lists exchanged*: int # times peer has exchanged with us lastExchange*: Moment # last time peer has exchanged with us + lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has account*: ?Account # ethereum account of this peer paymentChannel*: ?ChannelId # payment channel id +proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = + self.lastRefresh + 15.seconds < Moment.now() + +proc refreshed*(self: BlockExcPeerCtx) = + self.lastRefresh = Moment.now() + proc peerHave*(self: BlockExcPeerCtx): seq[BlockAddress] = toSeq(self.blocks.keys) diff --git a/codex/node.nim b/codex/node.nim index b742df2cc3..ea173ad1e6 100644 --- a/codex/node.nim +++ b/codex/node.nim @@ -52,7 +52,7 @@ export logutils logScope: topics = "codex node" -const DefaultFetchBatch = 10 +const DefaultFetchBatch = 1_000_000 type Contracts* = @@ -187,23 +187,18 @@ proc fetchBatched*( # ) while not iter.finished: - let blockFutures = collect: + let addresses = collect: for i in 0 ..< batchSize: if not iter.finished: let address = BlockAddress.init(cid, iter.next()) if not (await address in self.networkStore) or fetchLocal: - self.networkStore.getBlock(address) + address - if blockFutures.len == 0: - continue + let + blockResults = await self.networkStore.getBlocks(addresses) + blocks = blockResults.filterIt(it.isSuccess()).mapIt(it.value) + numOfFailedBlocks = blockResults.len - blocks.len - without blockResults =? 
await allFinishedValues[?!bt.Block](blockFutures), err: - trace "Some blocks failed to fetch", err = err.msg - return failure(err) - - let blocks = blockResults.filterIt(it.isSuccess()).mapIt(it.value) - - let numOfFailedBlocks = blockResults.len - blocks.len if numOfFailedBlocks > 0: return failure("Some blocks failed (Result) to fetch (" & $numOfFailedBlocks & ")") diff --git a/codex/stores/networkstore.nim b/codex/stores/networkstore.nim index 64410ce0ba..829197f8bf 100644 --- a/codex/stores/networkstore.nim +++ b/codex/stores/networkstore.nim @@ -31,6 +31,26 @@ type NetworkStore* = ref object of BlockStore engine*: BlockExcEngine # blockexc decision engine localStore*: BlockStore # local block store +proc getBlocks*( + self: NetworkStore, + addresses: seq[BlockAddress] +): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = + var + localBlocks: seq[?!Block] + remoteAddresses: seq[BlockAddress] + + # We can resolve local blocks sequentially as for now those are blocking anyway. Still: + # TODO: implement getBlocks for local store so we can delegate it here. + for address in addresses: + if not (await address in self.localStore): + remoteAddresses.add(address) + else: + localBlocks.add(await self.localStore.getBlock(address)) + + let remoteBlocks = await self.engine.requestBlocks(remoteAddresses) + + return localBlocks.concat(remoteBlocks) + method getBlock*( self: NetworkStore, address: BlockAddress ): Future[?!Block] {.async: (raises: [CancelledError]).} = From d80b7ddbd0244093d16612acb50f356746260332 Mon Sep 17 00:00:00 2001 From: gmega Date: Thu, 5 Jun 2025 17:45:38 -0300 Subject: [PATCH 02/20] switch nph branch to provisional which compiles in 2.2.x --- vendor/nph | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/nph b/vendor/nph index f1f047760c..3191cc71f4 160000 --- a/vendor/nph +++ b/vendor/nph @@ -1 +1 @@ -Subproject commit f1f047760c6cb38d5c55d0ddb29b57a9c008a976 +Subproject commit 3191cc71f4d49473de6cf73a2680009a92419407 From 1dcb998be6f7372a9180bdb987914810324f0cd5 Mon Sep 17 00:00:00 2001 From: gmega Date: Fri, 6 Jun 2025 12:56:56 -0300 Subject: [PATCH 03/20] feat: cap how many blocks we can pack in a single message --- codex/blockexchange/engine/engine.nim | 130 +++++++++++-------- codex/blockexchange/engine/pendingblocks.nim | 5 +- codex/blockexchange/peers/peercontext.nim | 13 ++ codex/blockexchange/protobuf/message.nim | 1 - 4 files changed, 91 insertions(+), 58 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index a988d9eda0..270f668866 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -66,6 +66,10 @@ declareCounter( const DefaultMaxPeersPerRequest* = 10 + # The default max message length of nim-libp2p is 100 megabytes, meaning we can + # in principle fit up to 1600 64k blocks per message, so 500 is well under + # that number. 
+ DefaultMaxBlocksPerMessage = 500 DefaultTaskQueueSize = 100 DefaultConcurrentTasks = 10 @@ -82,6 +86,8 @@ type concurrentTasks: int # Number of concurrent peers we're serving at any given time trackedFutures: TrackedFutures # Tracks futures of blockexc tasks blockexcRunning: bool # Indicates if the blockexc task is running + maxBlocksPerMessage: int + # Maximum number of blocks we can squeeze in a single message pendingBlocks*: PendingBlocksManager # Blocks we're awaiting to be resolved wallet*: WalletRef # Nitro wallet for micropayments pricing*: ?Pricing # Optional bandwidth pricing @@ -154,7 +160,9 @@ proc sendWantBlock( ) # we want this remote to send us a block codex_block_exchange_want_block_lists_sent.inc() -proc refreshBlockKnowledge(self: BlockExcEngine, peer: BlockExcPeerCtx) {.async: (raises: [CancelledError]).} = +proc refreshBlockKnowledge( + self: BlockExcEngine, peer: BlockExcPeerCtx +) {.async: (raises: [CancelledError]).} = # broadcast our want list, the other peer will do the same if self.pendingBlocks.wantListLen > 0: let cids = toSeq(self.pendingBlocks.wantList) @@ -214,6 +222,9 @@ proc downloadInternal( await self.refreshBlockKnowledge() self.discovery.queueFindBlocksReq(@[address.cidOrTreeCid]) + # FIXME: blocks should not blindly reschedule themselves. Instead, + # we should only reschedule a block if the peer drops, or we are + # in endgame mode. await (handle or sleepAsync(self.pendingBlocks.retryInterval)) self.pendingBlocks.decRetries(address) @@ -231,7 +242,9 @@ proc downloadInternal( finally: self.pendingBlocks.setInFlight(address, false) -proc requestBlocks*(self: BlockExcEngine, addresses: seq[BlockAddress]): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = +proc requestBlocks*( + self: BlockExcEngine, addresses: seq[BlockAddress] +): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = var handles: seq[BlockHandle] # Adds all blocks to pendingBlocks before calling the first downloadInternal. This will # ensure that we don't send incomplete want lists. @@ -311,6 +324,7 @@ proc blockPresenceHandler*( if ourWantCids.len > 0: trace "Peer has blocks in our wantList", peer, wants = ourWantCids + # FIXME: this will result in duplicate requests for blocks if err =? 
catch(await self.sendWantBlock(ourWantCids, peerCtx)).errorOption: warn "Failed to send wantBlock to peer", peer, err = err.msg @@ -623,65 +637,73 @@ proc dropPeer*(self: BlockExcEngine, peer: PeerId) {.raises: [].} = # drop the peer from the peers table self.peers.remove(peer) +proc localLookup( + self: BlockExcEngine, e: WantListEntry +): Future[?!BlockDelivery] {.async: (raises: [CancelledError]).} = + if e.address.leaf: + (await self.localStore.getBlockAndProof(e.address.treeCid, e.address.index)).map( + (blkAndProof: (Block, CodexProof)) => + BlockDelivery( + address: e.address, blk: blkAndProof[0], proof: blkAndProof[1].some + ) + ) + else: + (await self.localStore.getBlock(e.address)).map( + (blk: Block) => BlockDelivery( + address: e.address, blk: blk, proof: CodexProof.none + ) + ) + +iterator splitBatches[T](sequence: seq[T], batchSize: int): seq[T] = + var batch: seq[T] + for element in sequence: + if batch.len == batchSize: + yield batch + batch = @[] + batch.add(element) + + if batch.len > 0: + yield batch + proc taskHandler*( - self: BlockExcEngine, task: BlockExcPeerCtx + self: BlockExcEngine, peerCtx: BlockExcPeerCtx ) {.gcsafe, async: (raises: [CancelledError, RetriesExhaustedError]).} = # Send to the peer blocks he wants to get, # if they present in our local store - # TODO: There should be all sorts of accounting of - # bytes sent/received here - - var wantsBlocks = - task.peerWants.filterIt(it.wantType == WantType.WantBlock and not it.inFlight) - - proc updateInFlight(addresses: seq[BlockAddress], inFlight: bool) = - for peerWant in task.peerWants.mitems: - if peerWant.address in addresses: - peerWant.inFlight = inFlight - - if wantsBlocks.len > 0: - # Mark wants as in-flight. - let wantAddresses = wantsBlocks.mapIt(it.address) - updateInFlight(wantAddresses, true) - wantsBlocks.sort(SortOrder.Descending) - - proc localLookup(e: WantListEntry): Future[?!BlockDelivery] {.async.} = - if e.address.leaf: - (await self.localStore.getBlockAndProof(e.address.treeCid, e.address.index)).map( - (blkAndProof: (Block, CodexProof)) => - BlockDelivery( - address: e.address, blk: blkAndProof[0], proof: blkAndProof[1].some - ) - ) - else: - (await self.localStore.getBlock(e.address)).map( - (blk: Block) => - BlockDelivery(address: e.address, blk: blk, proof: CodexProof.none) - ) - - let - blocksDeliveryFut = await allFinished(wantsBlocks.map(localLookup)) - blocksDelivery = blocksDeliveryFut.filterIt(it.completed and it.value.isOk).mapIt: - if bd =? it.value: - bd - else: - raiseAssert "Unexpected error in local lookup" - - # All the wants that failed local lookup must be set to not-in-flight again. - let - successAddresses = blocksDelivery.mapIt(it.address) - failedAddresses = wantAddresses.filterIt(it notin successAddresses) - updateInFlight(failedAddresses, false) + # Blocks that are in flight have already been picked up by other tasks and + # should not be re-sent. 
+ var wantedBlocks = peerCtx.peerWants.filterIt( + it.wantType == WantType.WantBlock and not peerCtx.isInFlight(it.address) + ) - if blocksDelivery.len > 0: - trace "Sending blocks to peer", - peer = task.id, blocks = (blocksDelivery.mapIt(it.address)) - await self.network.request.sendBlocksDelivery(task.id, blocksDelivery) + wantedBlocks.sort(SortOrder.Descending) - codex_block_exchange_blocks_sent.inc(blocksDelivery.len.int64) + for wantedBlock in wantedBlocks: + peerCtx.addInFlight(wantedBlock.address) - task.peerWants.keepItIf(it.address notin successAddresses) + try: + for batch in wantedBlocks.splitBatches(self.maxBlocksPerMessage): + var blockDeliveries: seq[BlockDelivery] + for wantedBlock in batch: + # I/O is blocking so looking up blocks sequentially is fine. + without blockDelivery =? await self.localLookup(wantedBlock), err: + error "Error getting block from local store", + err = err.msg, address = wantedBlock.address + peerCtx.removeInFlight(wantedBlock.address) + continue + blockDeliveries.add(blockDelivery) + + await self.network.request.sendBlocksDelivery(peerCtx.id, blockDeliveries) + codex_block_exchange_blocks_sent.inc(blockDeliveries.len.int64) + # Drops the batch from want list. Note that the send might still fail down the line + # and we will have removed them anyway, at which point we rely on the requester + # performing a retry for the request to succeed. + peerCtx.peerWants.keepItIf(it.address notin blockDeliveries.mapIt(it.address)) + finally: + # Better safe than sorry: if an exception does happen, we don't want to keep + # those in flight as it'll effectively prevent the blocks from ever being sent. + peerCtx.blocksInFlight.keepItIf(it notin wantedBlocks.mapIt(it.address)) proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} = ## process tasks @@ -706,6 +728,7 @@ proc new*( advertiser: Advertiser, peerStore: PeerCtxStore, pendingBlocks: PendingBlocksManager, + maxBlocksPerMessage = DefaultMaxBlocksPerMessage, concurrentTasks = DefaultConcurrentTasks, ): BlockExcEngine = ## Create new block exchange engine instance @@ -719,6 +742,7 @@ proc new*( wallet: wallet, concurrentTasks: concurrentTasks, trackedFutures: TrackedFutures(), + maxBlocksPerMessage: maxBlocksPerMessage, taskQueue: newAsyncHeapQueue[BlockExcPeerCtx](DefaultTaskQueueSize), discovery: discovery, advertiser: advertiser, diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim index bb4ac2d1e2..6c45d6f52d 100644 --- a/codex/blockexchange/engine/pendingblocks.nim +++ b/codex/blockexchange/engine/pendingblocks.nim @@ -34,7 +34,7 @@ declareGauge( const DefaultBlockRetries* = 3000 - DefaultRetryInterval* = 10.seconds + DefaultRetryInterval* = 180.seconds type RetriesExhaustedError* = object of CatchableError @@ -111,9 +111,6 @@ proc resolve*( blockReq.handle.complete(bd.blk) codex_block_exchange_retrieval_time_us.set(retrievalDurationUs) - - if retrievalDurationUs > 500000: - warn "High block retrieval time", retrievalDurationUs, address = bd.address else: trace "Block handle already finished", address = bd.address diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index aeb83de7af..a4b0329aba 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -34,10 +34,23 @@ type BlockExcPeerCtx* = ref object of RootObj lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has account*: ?Account # ethereum account of this 
peer paymentChannel*: ?ChannelId # payment channel id + blocksInFlight*: seq[BlockAddress] # blocks in flight towards peer proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = self.lastRefresh + 15.seconds < Moment.now() +proc isInFlight*(self: BlockExcPeerCtx, address: BlockAddress): bool = + address in self.blocksInFlight + +proc addInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = + if not self.isInFlight(address): + self.blocksInFlight.add(address) + +proc removeInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = + let index = self.blocksInFlight.find(address) + if index != -1: + self.blocksInFlight.delete(index) + proc refreshed*(self: BlockExcPeerCtx) = self.lastRefresh = Moment.now() diff --git a/codex/blockexchange/protobuf/message.nim b/codex/blockexchange/protobuf/message.nim index 4db8972936..a338441696 100644 --- a/codex/blockexchange/protobuf/message.nim +++ b/codex/blockexchange/protobuf/message.nim @@ -29,7 +29,6 @@ type cancel*: bool # Whether this revokes an entry wantType*: WantType # Note: defaults to enum 0, ie Block sendDontHave*: bool # Note: defaults to false - inFlight*: bool # Whether block sending is in progress. Not serialized. WantList* = object entries*: seq[WantListEntry] # A list of wantList entries From b6e120394ae119adc6663534ebcc630797e8846c Mon Sep 17 00:00:00 2001 From: gmega Date: Fri, 6 Jun 2025 14:01:51 -0300 Subject: [PATCH 04/20] update engine tests; add BlockAddress hashing tests --- codex/blockexchange/engine/engine.nim | 19 ++++++++ codex/blockexchange/protobuf/blockexc.nim | 8 ---- codex/blocktype.nim | 10 ++++- .../codex/blockexchange/engine/testengine.nim | 8 ++-- tests/codex/examples.nim | 3 +- tests/codex/testblocktype.nim | 44 +++++++++++++++++++ 6 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 tests/codex/testblocktype.nim diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 270f668866..73b5442c2c 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -302,26 +302,42 @@ proc blockPresenceHandler*( if peerCtx.isNil: return + trace "Build presence list" + for blk in blocks: if presence =? Presence.init(blk): peerCtx.setPresence(presence) + trace "Built presence list" + + trace "Remove dont want cids" + let peerHave = peerCtx.peerHave dontWantCids = peerHave.filterIt(it notin ourWantList) + trace "Removed dont want cids" + if dontWantCids.len > 0: peerCtx.cleanPresence(dontWantCids) + trace "Remove want cids" + let ourWantCids = ourWantList.filterIt( it in peerHave and not self.pendingBlocks.retriesExhausted(it) and not self.pendingBlocks.isInFlight(it) ) + trace "Removed want cids" + + trace "Update pending blocks" + for address in ourWantCids: self.pendingBlocks.setInFlight(address, true) self.pendingBlocks.decRetries(address) + trace "Updated pending blocks" + if ourWantCids.len > 0: trace "Peer has blocks in our wantList", peer, wants = ourWantCids # FIXME: this will result in duplicate requests for blocks @@ -694,6 +710,9 @@ proc taskHandler*( continue blockDeliveries.add(blockDelivery) + if blockDeliveries.len == 0: + continue + await self.network.request.sendBlocksDelivery(peerCtx.id, blockDeliveries) codex_block_exchange_blocks_sent.inc(blockDeliveries.len.int64) # Drops the batch from want list. 
Note that the send might still fail down the line diff --git a/codex/blockexchange/protobuf/blockexc.nim b/codex/blockexchange/protobuf/blockexc.nim index 698686810d..a0512cd332 100644 --- a/codex/blockexchange/protobuf/blockexc.nim +++ b/codex/blockexchange/protobuf/blockexc.nim @@ -9,7 +9,6 @@ import std/hashes import std/sequtils -import pkg/stew/endians2 import message @@ -20,13 +19,6 @@ export Wantlist, WantType, WantListEntry export BlockDelivery, BlockPresenceType, BlockPresence export AccountMessage, StateChannelUpdate -proc hash*(a: BlockAddress): Hash = - if a.leaf: - let data = a.treeCid.data.buffer & @(a.index.uint64.toBytesBE) - hash(data) - else: - hash(a.cid.data.buffer) - proc hash*(e: WantListEntry): Hash = hash(e.address) diff --git a/codex/blocktype.nim b/codex/blocktype.nim index 7e13493d84..233d4e8f4c 100644 --- a/codex/blocktype.nim +++ b/codex/blocktype.nim @@ -9,6 +9,7 @@ import std/tables import std/sugar +import std/hashes export tables @@ -18,7 +19,7 @@ push: {.upraises: [].} import pkg/libp2p/[cid, multicodec, multihash] -import pkg/stew/byteutils +import pkg/stew/[byteutils, endians2] import pkg/questionable import pkg/questionable/results @@ -67,6 +68,13 @@ proc `$`*(a: BlockAddress): string = else: "cid: " & $a.cid +proc hash*(a: BlockAddress): Hash = + if a.leaf: + let data = a.treeCid.data.buffer & @(a.index.uint64.toBytesBE) + hash(data) + else: + hash(a.cid.data.buffer) + proc cidOrTreeCid*(a: BlockAddress): Cid = if a.leaf: a.treeCid else: a.cid diff --git a/tests/codex/blockexchange/engine/testengine.nim b/tests/codex/blockexchange/engine/testengine.nim index 0541c119fd..685507356e 100644 --- a/tests/codex/blockexchange/engine/testengine.nim +++ b/tests/codex/blockexchange/engine/testengine.nim @@ -620,7 +620,8 @@ asyncchecksuite "Task Handler": proc sendBlocksDelivery( id: PeerId, blocksDelivery: seq[BlockDelivery] ) {.async: (raises: [CancelledError]).} = - check peersCtx[0].peerWants[0].inFlight + let blockAddress = peersCtx[0].peerWants[0].address + check peersCtx[0].isInFlight(blockAddress) for blk in blocks: (await engine.localStore.putBlock(blk)).tryGet() @@ -633,7 +634,6 @@ asyncchecksuite "Task Handler": cancel: false, wantType: WantType.WantBlock, sendDontHave: false, - inFlight: false, ) ) await engine.taskHandler(peersCtx[0]) @@ -646,12 +646,12 @@ asyncchecksuite "Task Handler": cancel: false, wantType: WantType.WantBlock, sendDontHave: false, - inFlight: false, ) ) await engine.taskHandler(peersCtx[0]) - check not peersCtx[0].peerWants[0].inFlight + let blockAddress = peersCtx[0].peerWants[0].address + check not peersCtx[0].isInFlight(blockAddress) test "Should send presence": let present = blocks diff --git a/tests/codex/examples.nim b/tests/codex/examples.nim index 52b8a0b8a3..260adbfc0f 100644 --- a/tests/codex/examples.nim +++ b/tests/codex/examples.nim @@ -38,8 +38,7 @@ proc example*(_: type Pricing): Pricing = Pricing(address: EthAddress.example, price: uint32.rand.u256) proc example*(_: type bt.Block, size: int = 4096): bt.Block = - let length = rand(size) - let bytes = newSeqWith(length, rand(uint8)) + let bytes = newSeqWith(size, rand(uint8)) bt.Block.new(bytes).tryGet() proc example*(_: type PeerId): PeerId = diff --git a/tests/codex/testblocktype.nim b/tests/codex/testblocktype.nim new file mode 100644 index 0000000000..b0ea2732f9 --- /dev/null +++ b/tests/codex/testblocktype.nim @@ -0,0 +1,44 @@ +import pkg/unittest2 +import pkg/libp2p/cid + +import pkg/codex/blocktype + +import ./examples + +suite "blocktype": + test "should 
hash equal non-leaf block addresses onto the same hash": + let + cid1 = Cid.example + nonLeaf1 = BlockAddress.init(cid1) + nonLeaf2 = BlockAddress.init(cid1) + + check nonLeaf1 == nonLeaf2 + check nonLeaf1.hash == nonLeaf2.hash + + test "should hash equal leaf block addresses onto the same hash": + let + cid1 = Cid.example + leaf1 = BlockAddress.init(cid1, 0) + leaf2 = BlockAddress.init(cid1, 0) + + check leaf1 == leaf2 + check leaf1.hash == leaf2.hash + + test "should hash different non-leaf block addresses onto different hashes": + let + cid1 = Cid.example + cid2 = Cid.example + nonLeaf1 = BlockAddress.init(cid1) + nonLeaf2 = BlockAddress.init(cid2) + + check nonLeaf1 != nonLeaf2 + check nonLeaf1.hash != nonLeaf2.hash + + test "should hash different leaf block addresses onto different hashes": + let + cid1 = Cid.example + leaf1 = BlockAddress.init(cid1, 0) + leaf2 = BlockAddress.init(cid1, 1) + + check leaf1 != leaf2 + check leaf1.hash != leaf2.hash From a53f8da8550fe4975ce8362caee972410c6913a0 Mon Sep 17 00:00:00 2001 From: gmega Date: Fri, 6 Jun 2025 15:09:41 -0300 Subject: [PATCH 05/20] replace list operations with sets --- codex/blockexchange/engine/engine.nim | 27 +++++------------------ codex/blockexchange/peers/peercontext.nim | 9 +++----- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 73b5442c2c..80986e70b1 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -18,6 +18,7 @@ import pkg/libp2p/[cid, switch, multihash, multicodec] import pkg/metrics import pkg/stint import pkg/questionable +import pkg/stew/shims/sets import ../../rng import ../../stores/blockstore @@ -297,47 +298,31 @@ proc blockPresenceHandler*( trace "Received block presence from peer", peer, len = blocks.len let peerCtx = self.peers.get(peer) - ourWantList = toSeq(self.pendingBlocks.wantList) + ourWantList = toHashSet(self.pendingBlocks.wantList.toSeq) if peerCtx.isNil: return - trace "Build presence list" - for blk in blocks: if presence =? 
Presence.init(blk): peerCtx.setPresence(presence) - trace "Built presence list" - - trace "Remove dont want cids" - let - peerHave = peerCtx.peerHave - dontWantCids = peerHave.filterIt(it notin ourWantList) - - trace "Removed dont want cids" + peerHave = peerCtx.peerHave.toHashSet + dontWantCids = peerHave - ourWantList if dontWantCids.len > 0: - peerCtx.cleanPresence(dontWantCids) - - trace "Remove want cids" + peerCtx.cleanPresence(dontWantCids.toSeq) let ourWantCids = ourWantList.filterIt( it in peerHave and not self.pendingBlocks.retriesExhausted(it) and not self.pendingBlocks.isInFlight(it) - ) - - trace "Removed want cids" - - trace "Update pending blocks" + ).toSeq for address in ourWantCids: self.pendingBlocks.setInFlight(address, true) self.pendingBlocks.decRetries(address) - trace "Updated pending blocks" - if ourWantCids.len > 0: trace "Peer has blocks in our wantList", peer, wants = ourWantCids # FIXME: this will result in duplicate requests for blocks diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index a4b0329aba..efac8b4329 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -34,7 +34,7 @@ type BlockExcPeerCtx* = ref object of RootObj lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has account*: ?Account # ethereum account of this peer paymentChannel*: ?ChannelId # payment channel id - blocksInFlight*: seq[BlockAddress] # blocks in flight towards peer + blocksInFlight*: HashSet[BlockAddress] # blocks in flight towards peer proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = self.lastRefresh + 15.seconds < Moment.now() @@ -43,13 +43,10 @@ proc isInFlight*(self: BlockExcPeerCtx, address: BlockAddress): bool = address in self.blocksInFlight proc addInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = - if not self.isInFlight(address): - self.blocksInFlight.add(address) + self.blocksInFlight.incl(address) proc removeInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = - let index = self.blocksInFlight.find(address) - if index != -1: - self.blocksInFlight.delete(index) + self.blocksInFlight.excl(address) proc refreshed*(self: BlockExcPeerCtx) = self.lastRefresh = Moment.now() From e18996eed110ff49fde112554060fca2c08f4a72 Mon Sep 17 00:00:00 2001 From: gmega Date: Mon, 9 Jun 2025 11:26:16 -0300 Subject: [PATCH 06/20] optimize remaining list joins so they're not quadratic --- codex/blockexchange/engine/engine.nim | 77 ++++++++++------------ codex/blockexchange/peers/peercontext.nim | 17 ++--- codex/blockexchange/peers/peerctxstore.nim | 10 +-- codex/blockexchange/protobuf/message.nim | 5 ++ 4 files changed, 51 insertions(+), 58 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 80986e70b1..25eabb715d 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -308,7 +308,7 @@ proc blockPresenceHandler*( peerCtx.setPresence(presence) let - peerHave = peerCtx.peerHave.toHashSet + peerHave = peerCtx.peerHave dontWantCids = peerHave - ourWantList if dontWantCids.len > 0: @@ -332,24 +332,23 @@ proc blockPresenceHandler*( proc scheduleTasks( self: BlockExcEngine, blocksDelivery: seq[BlockDelivery] ) {.async: (raises: [CancelledError]).} = - let cids = blocksDelivery.mapIt(it.blk.cid) - # schedule any new peers to provide blocks to for p in self.peers: - for c in cids: # for each cid + for blockDelivery in blocksDelivery: # for each cid # 
schedule a peer if it wants at least one cid # and we have it in our local store - if c in p.peerWantsCids: + if blockDelivery.address in p.wantedBlocks: + let cid = blockDelivery.blk.cid try: - if await (c in self.localStore): + if await (cid in self.localStore): # TODO: the try/except should go away once blockstore tracks exceptions self.scheduleTask(p) break except CancelledError as exc: - warn "Checking local store canceled", cid = c, err = exc.msg + warn "Checking local store canceled", cid = cid, err = exc.msg return except CatchableError as exc: - error "Error checking local store for cid", cid = c, err = exc.msg + error "Error checking local store for cid", cid = cid, err = exc.msg raiseAssert "Unexpected error checking local store for cid" proc cancelBlocks( @@ -513,14 +512,12 @@ proc wantListHandler*( try: for e in wantList.entries: - let idx = peerCtx.peerWants.findIt(it.address == e.address) - logScope: peer = peerCtx.id address = e.address wantType = $e.wantType - if idx < 0: # Adding new entry to peer wants + if e.address notin peerCtx.wantedBlocks: # Adding new entry to peer wants let have = try: @@ -556,25 +553,20 @@ proc wantListHandler*( codex_block_exchange_want_have_lists_received.inc() of WantType.WantBlock: - peerCtx.peerWants.add(e) + peerCtx.wantedBlocks.incl(e.address) schedulePeer = true codex_block_exchange_want_block_lists_received.inc() else: # Updating existing entry in peer wants # peer doesn't want this block anymore if e.cancel: trace "Canceling want for block", address = e.address - peerCtx.peerWants.del(idx) + peerCtx.wantedBlocks.excl(e.address) trace "Canceled block request", - address = e.address, len = peerCtx.peerWants.len + address = e.address, len = peerCtx.wantedBlocks.len else: + trace "Peer has requested a block more than once", address = e.address if e.wantType == WantType.WantBlock: schedulePeer = true - # peer might want to ask for the same cid with - # different want params - trace "Updating want for block", address = e.address - peerCtx.peerWants[idx] = e # update entry - trace "Updated block request", - address = e.address, len = peerCtx.peerWants.len if presence.len > 0: trace "Sending presence to remote", items = presence.mapIt($it).join(",") @@ -639,20 +631,16 @@ proc dropPeer*(self: BlockExcEngine, peer: PeerId) {.raises: [].} = self.peers.remove(peer) proc localLookup( - self: BlockExcEngine, e: WantListEntry + self: BlockExcEngine, address: BlockAddress ): Future[?!BlockDelivery] {.async: (raises: [CancelledError]).} = - if e.address.leaf: - (await self.localStore.getBlockAndProof(e.address.treeCid, e.address.index)).map( + if address.leaf: + (await self.localStore.getBlockAndProof(address.treeCid, address.index)).map( (blkAndProof: (Block, CodexProof)) => - BlockDelivery( - address: e.address, blk: blkAndProof[0], proof: blkAndProof[1].some - ) + BlockDelivery(address: address, blk: blkAndProof[0], proof: blkAndProof[1].some) ) else: - (await self.localStore.getBlock(e.address)).map( - (blk: Block) => BlockDelivery( - address: e.address, blk: blk, proof: CodexProof.none - ) + (await self.localStore.getBlock(address)).map( + (blk: Block) => BlockDelivery(address: address, blk: blk, proof: CodexProof.none) ) iterator splitBatches[T](sequence: seq[T], batchSize: int): seq[T] = @@ -674,40 +662,41 @@ proc taskHandler*( # Blocks that are in flight have already been picked up by other tasks and # should not be re-sent. 
- var wantedBlocks = peerCtx.peerWants.filterIt( - it.wantType == WantType.WantBlock and not peerCtx.isInFlight(it.address) - ) - - wantedBlocks.sort(SortOrder.Descending) + var + wantedBlocks = peerCtx.wantedBlocks.filterIt(not peerCtx.isInFlight(it)) + sent: HashSet[BlockAddress] for wantedBlock in wantedBlocks: - peerCtx.addInFlight(wantedBlock.address) + peerCtx.addInFlight(wantedBlock) try: - for batch in wantedBlocks.splitBatches(self.maxBlocksPerMessage): + for batch in wantedBlocks.toSeq.splitBatches(self.maxBlocksPerMessage): var blockDeliveries: seq[BlockDelivery] for wantedBlock in batch: # I/O is blocking so looking up blocks sequentially is fine. without blockDelivery =? await self.localLookup(wantedBlock), err: error "Error getting block from local store", - err = err.msg, address = wantedBlock.address - peerCtx.removeInFlight(wantedBlock.address) + err = err.msg, address = wantedBlock + peerCtx.removeInFlight(wantedBlock) continue blockDeliveries.add(blockDelivery) + sent.incl(wantedBlock) if blockDeliveries.len == 0: continue await self.network.request.sendBlocksDelivery(peerCtx.id, blockDeliveries) codex_block_exchange_blocks_sent.inc(blockDeliveries.len.int64) - # Drops the batch from want list. Note that the send might still fail down the line - # and we will have removed them anyway, at which point we rely on the requester - # performing a retry for the request to succeed. - peerCtx.peerWants.keepItIf(it.address notin blockDeliveries.mapIt(it.address)) + # Drops the batch from the peer's set of wanted blocks; i.e. assumes that after + # we send the blocks, then the peer no longer wants them, so we don't need to + # re-send them. Note that the send might still fail down the line and we will + # have removed those anyway. At that point, we rely on the requester performing + # a retry for the request to succeed. + peerCtx.wantedBlocks.keepItIf(it notin sent) finally: # Better safe than sorry: if an exception does happen, we don't want to keep # those in flight as it'll effectively prevent the blocks from ever being sent. 
- peerCtx.blocksInFlight.keepItIf(it notin wantedBlocks.mapIt(it.address)) + peerCtx.blocksInFlight.keepItIf(it notin wantedBlocks) proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} = ## process tasks diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index efac8b4329..857a8fa902 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -28,7 +28,7 @@ export payments, nitro type BlockExcPeerCtx* = ref object of RootObj id*: PeerId blocks*: Table[BlockAddress, Presence] # remote peer have list including price - peerWants*: seq[WantListEntry] # remote peers want lists + wantedBlocks*: HashSet[BlockAddress] # blocks that the peer wants exchanged*: int # times peer has exchanged with us lastExchange*: Moment # last time peer has exchanged with us lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has @@ -37,7 +37,7 @@ type BlockExcPeerCtx* = ref object of RootObj blocksInFlight*: HashSet[BlockAddress] # blocks in flight towards peer proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = - self.lastRefresh + 15.seconds < Moment.now() + self.lastRefresh + 5.minutes < Moment.now() proc isInFlight*(self: BlockExcPeerCtx, address: BlockAddress): bool = address in self.blocksInFlight @@ -51,14 +51,11 @@ proc removeInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = proc refreshed*(self: BlockExcPeerCtx) = self.lastRefresh = Moment.now() -proc peerHave*(self: BlockExcPeerCtx): seq[BlockAddress] = - toSeq(self.blocks.keys) - -proc peerHaveCids*(self: BlockExcPeerCtx): HashSet[Cid] = - self.blocks.keys.toSeq.mapIt(it.cidOrTreeCid).toHashSet - -proc peerWantsCids*(self: BlockExcPeerCtx): HashSet[Cid] = - self.peerWants.mapIt(it.address.cidOrTreeCid).toHashSet +proc peerHave*(self: BlockExcPeerCtx): HashSet[BlockAddress] = + # XXX: this is ugly an inefficient, but since those will typically + # be used in "joins", it's better to pay the price here and have + # a linear join than to not do it and have a quadratic join. + toHashSet(self.blocks.keys.toSeq) proc contains*(self: BlockExcPeerCtx, address: BlockAddress): bool = address in self.blocks diff --git a/codex/blockexchange/peers/peerctxstore.nim b/codex/blockexchange/peers/peerctxstore.nim index ce2506a82f..171206ba1c 100644 --- a/codex/blockexchange/peers/peerctxstore.nim +++ b/codex/blockexchange/peers/peerctxstore.nim @@ -62,21 +62,23 @@ func len*(self: PeerCtxStore): int = self.peers.len func peersHave*(self: PeerCtxStore, address: BlockAddress): seq[BlockExcPeerCtx] = - toSeq(self.peers.values).filterIt(it.peerHave.anyIt(it == address)) + toSeq(self.peers.values).filterIt(address in it.peerHave) func peersHave*(self: PeerCtxStore, cid: Cid): seq[BlockExcPeerCtx] = + # FIXME: this is way slower and can end up leading to unexpected performance loss. toSeq(self.peers.values).filterIt(it.peerHave.anyIt(it.cidOrTreeCid == cid)) func peersWant*(self: PeerCtxStore, address: BlockAddress): seq[BlockExcPeerCtx] = - toSeq(self.peers.values).filterIt(it.peerWants.anyIt(it == address)) + toSeq(self.peers.values).filterIt(address in it.wantedBlocks) func peersWant*(self: PeerCtxStore, cid: Cid): seq[BlockExcPeerCtx] = - toSeq(self.peers.values).filterIt(it.peerWants.anyIt(it.address.cidOrTreeCid == cid)) + # FIXME: this is way slower and can end up leading to unexpected performance loss. 
+ toSeq(self.peers.values).filterIt(it.wantedBlocks.anyIt(it.cidOrTreeCid == cid)) proc getPeersForBlock*(self: PeerCtxStore, address: BlockAddress): PeersForBlock = var res: PeersForBlock = (@[], @[]) for peer in self: - if peer.peerHave.anyIt(it == address): + if address in peer.peerHave: res.with.add(peer) else: res.without.add(peer) diff --git a/codex/blockexchange/protobuf/message.nim b/codex/blockexchange/protobuf/message.nim index a338441696..00dbc57b1e 100644 --- a/codex/blockexchange/protobuf/message.nim +++ b/codex/blockexchange/protobuf/message.nim @@ -25,6 +25,11 @@ type WantListEntry* = object address*: BlockAddress + # XXX: I think explicit priority is pointless as the peer will request + # the blocks in the order it wants to receive them, and all we have to + # do is process those in the same order as we send them back. It also + # complicates things for no reason at the moment, as the priority is + # always set to 0. priority*: int32 # The priority (normalized). default to 1 cancel*: bool # Whether this revokes an entry wantType*: WantType # Note: defaults to enum 0, ie Block From fbd378ec18422da11be3efde3ba146d3373143b7 Mon Sep 17 00:00:00 2001 From: gmega Date: Mon, 9 Jun 2025 15:15:08 -0300 Subject: [PATCH 07/20] adapt existing tests to new data structures, remove vestigial tests --- .../blockexchange/engine/testblockexc.nim | 10 +- .../codex/blockexchange/engine/testengine.nim | 158 ++++++------------ .../codex/blockexchange/testpeerctxstore.nim | 5 +- 3 files changed, 51 insertions(+), 122 deletions(-) diff --git a/tests/codex/blockexchange/engine/testblockexc.nim b/tests/codex/blockexchange/engine/testblockexc.nim index 0c250231a0..108ca539d2 100644 --- a/tests/codex/blockexchange/engine/testblockexc.nim +++ b/tests/codex/blockexchange/engine/testblockexc.nim @@ -98,15 +98,7 @@ asyncchecksuite "NetworkStore engine - 2 nodes": let blkFut = nodeCmps1.pendingBlocks.getWantHandle(blk.cid) (await nodeCmps2.localStore.putBlock(blk)).tryGet() - let entry = WantListEntry( - address: blk.address, - priority: 1, - cancel: false, - wantType: WantType.WantBlock, - sendDontHave: false, - ) - - peerCtx1.peerWants.add(entry) + peerCtx1.wantedBlocks.incl(blk.address) check nodeCmps2.engine.taskQueue.pushOrUpdateNoWait(peerCtx1).isOk check eventually (await nodeCmps1.localStore.hasBlock(blk.cid)).tryGet() diff --git a/tests/codex/blockexchange/engine/testengine.nim b/tests/codex/blockexchange/engine/testengine.nim index 685507356e..c0360fed29 100644 --- a/tests/codex/blockexchange/engine/testengine.nim +++ b/tests/codex/blockexchange/engine/testengine.nim @@ -174,7 +174,7 @@ asyncchecksuite "NetworkStore engine handlers": let ctx = await engine.taskQueue.pop() check ctx.id == peerId # only `wantBlock` scheduled - check ctx.peerWants.mapIt(it.address.cidOrTreeCid) == blocks.mapIt(it.cid) + check ctx.wantedBlocks == blocks.mapIt(it.address).toHashSet let done = handler() await engine.wantListHandler(peerId, wantList) @@ -579,130 +579,66 @@ asyncchecksuite "Task Handler": engine.pricing = Pricing.example.some - test "Should send want-blocks in priority order": - proc sendBlocksDelivery( - id: PeerId, blocksDelivery: seq[BlockDelivery] - ) {.async: (raises: [CancelledError]).} = - check blocksDelivery.len == 2 - check: - blocksDelivery[1].address == blocks[0].address - blocksDelivery[0].address == blocks[1].address - - for blk in blocks: - (await engine.localStore.putBlock(blk)).tryGet() - engine.network.request.sendBlocksDelivery = sendBlocksDelivery - - # second block to send by 
priority - peersCtx[0].peerWants.add( - WantListEntry( - address: blocks[0].address, - priority: 49, - cancel: false, - wantType: WantType.WantBlock, - sendDontHave: false, - ) - ) - - # first block to send by priority - peersCtx[0].peerWants.add( - WantListEntry( - address: blocks[1].address, - priority: 50, - cancel: false, - wantType: WantType.WantBlock, - sendDontHave: false, - ) - ) - - await engine.taskHandler(peersCtx[0]) + # FIXME: this is disabled for now: I've dropped block priorities to make + # my life easier as I try to optimize the protocol, and also because + # they were not being used anywhere. + # + # test "Should send want-blocks in priority order": + # proc sendBlocksDelivery( + # id: PeerId, blocksDelivery: seq[BlockDelivery] + # ) {.async: (raises: [CancelledError]).} = + # check blocksDelivery.len == 2 + # check: + # blocksDelivery[1].address == blocks[0].address + # blocksDelivery[0].address == blocks[1].address + + # for blk in blocks: + # (await engine.localStore.putBlock(blk)).tryGet() + # engine.network.request.sendBlocksDelivery = sendBlocksDelivery + + # # second block to send by priority + # peersCtx[0].peerWants.add( + # WantListEntry( + # address: blocks[0].address, + # priority: 49, + # cancel: false, + # wantType: WantType.WantBlock, + # sendDontHave: false, + # ) + # ) + + # # first block to send by priority + # peersCtx[0].peerWants.add( + # WantListEntry( + # address: blocks[1].address, + # priority: 50, + # cancel: false, + # wantType: WantType.WantBlock, + # sendDontHave: false, + # ) + # ) + + # await engine.taskHandler(peersCtx[0]) test "Should set in-flight for outgoing blocks": proc sendBlocksDelivery( id: PeerId, blocksDelivery: seq[BlockDelivery] ) {.async: (raises: [CancelledError]).} = - let blockAddress = peersCtx[0].peerWants[0].address + let blockAddress = peersCtx[0].wantedBlocks.toSeq[0] check peersCtx[0].isInFlight(blockAddress) for blk in blocks: (await engine.localStore.putBlock(blk)).tryGet() engine.network.request.sendBlocksDelivery = sendBlocksDelivery - peersCtx[0].peerWants.add( - WantListEntry( - address: blocks[0].address, - priority: 50, - cancel: false, - wantType: WantType.WantBlock, - sendDontHave: false, - ) - ) + peersCtx[0].wantedBlocks.incl(blocks[0].address) + await engine.taskHandler(peersCtx[0]) test "Should clear in-flight when local lookup fails": - peersCtx[0].peerWants.add( - WantListEntry( - address: blocks[0].address, - priority: 50, - cancel: false, - wantType: WantType.WantBlock, - sendDontHave: false, - ) - ) + peersCtx[0].wantedBlocks.incl(blocks[0].address) + await engine.taskHandler(peersCtx[0]) - let blockAddress = peersCtx[0].peerWants[0].address + let blockAddress = peersCtx[0].wantedBlocks.toSeq[0] check not peersCtx[0].isInFlight(blockAddress) - - test "Should send presence": - let present = blocks - let missing = @[Block.new("missing".toBytes).tryGet()] - let price = (!engine.pricing).price - - proc sendPresence( - id: PeerId, presence: seq[BlockPresence] - ) {.async: (raises: [CancelledError]).} = - check presence.mapIt(!Presence.init(it)) == - @[ - Presence(address: present[0].address, have: true, price: price), - Presence(address: present[1].address, have: true, price: price), - Presence(address: missing[0].address, have: false), - ] - - for blk in blocks: - (await engine.localStore.putBlock(blk)).tryGet() - engine.network.request.sendPresence = sendPresence - - # have block - peersCtx[0].peerWants.add( - WantListEntry( - address: present[0].address, - priority: 1, - cancel: false, - wantType: 
WantType.WantHave, - sendDontHave: false, - ) - ) - - # have block - peersCtx[0].peerWants.add( - WantListEntry( - address: present[1].address, - priority: 1, - cancel: false, - wantType: WantType.WantHave, - sendDontHave: false, - ) - ) - - # don't have block - peersCtx[0].peerWants.add( - WantListEntry( - address: missing[0].address, - priority: 1, - cancel: false, - wantType: WantType.WantHave, - sendDontHave: false, - ) - ) - - await engine.taskHandler(peersCtx[0]) diff --git a/tests/codex/blockexchange/testpeerctxstore.nim b/tests/codex/blockexchange/testpeerctxstore.nim index e2983d101d..f348c1d59e 100644 --- a/tests/codex/blockexchange/testpeerctxstore.nim +++ b/tests/codex/blockexchange/testpeerctxstore.nim @@ -81,8 +81,9 @@ suite "Peer Context Store Peer Selection": ) ) - peerCtxs[0].peerWants = entries - peerCtxs[5].peerWants = entries + for address in addresses: + peerCtxs[0].wantedBlocks.incl(address) + peerCtxs[5].wantedBlocks.incl(address) let peers = store.peersWant(addresses[4]) From 03a1cc70eb0a48f0a1294f4b1b6ffdbf43019d67 Mon Sep 17 00:00:00 2001 From: Giuliano Mega Date: Mon, 9 Jun 2025 17:13:10 -0300 Subject: [PATCH 08/20] Update codex/stores/networkstore.nim Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Giuliano Mega --- codex/stores/networkstore.nim | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/codex/stores/networkstore.nim b/codex/stores/networkstore.nim index 829197f8bf..b20e878cd5 100644 --- a/codex/stores/networkstore.nim +++ b/codex/stores/networkstore.nim @@ -32,8 +32,7 @@ type NetworkStore* = ref object of BlockStore localStore*: BlockStore # local block store proc getBlocks*( - self: NetworkStore, - addresses: seq[BlockAddress] + self: NetworkStore, addresses: seq[BlockAddress] ): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = var localBlocks: seq[?!Block] From 313d6bac1f884e6037dc8afa7a56d45f0dc36bb4 Mon Sep 17 00:00:00 2001 From: gmega Date: Tue, 10 Jun 2025 19:55:51 -0300 Subject: [PATCH 09/20] fix: refresh timestamp before issuing request to prevent flood of knowledge updates --- codex/blockexchange/engine/engine.nim | 19 ++- tests/codex/blockexchange/engine/fakepeer.nim | 155 ++++++++++++++++++ .../codex/blockexchange/engine/testengine.nim | 5 + 3 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 tests/codex/blockexchange/engine/fakepeer.nim diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 25eabb715d..2550c36444 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -164,17 +164,16 @@ proc sendWantBlock( proc refreshBlockKnowledge( self: BlockExcEngine, peer: BlockExcPeerCtx ) {.async: (raises: [CancelledError]).} = - # broadcast our want list, the other peer will do the same if self.pendingBlocks.wantListLen > 0: let cids = toSeq(self.pendingBlocks.wantList) trace "Sending our want list to a peer", peer = peer.id, length = cids.len await self.network.request.sendWantList(peer.id, cids, full = true) proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledError]).} = - for peer in self.peers.peers.values: + for peer in self.peers.peers.values.toSeq: # We refresh block knowledge if: # 1. the peer hasn't been refreshed in a while; - # 2. the list of blocks we care about has actually changed. + # 2. the list of blocks we care about has changed. 
# # Note that because of (2), it is important that we update our # want list in the coarsest way possible instead of over many @@ -182,8 +181,17 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr # # In dynamic swarms, staleness will dominate latency. if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale: - await self.refreshBlockKnowledge(peer) + # FIXME: we update the lastRefresh before actually refreshing because otherwise + # a slow peer will be bombarded with requests. If the request does fail or the + # peer does not reply, a retrying block will eventually issue this again. This + # is a complex and convoluted flow - ideally we should simply be tracking this + # request and retrying it on the absence of a response, eventually disconnecting + # the peer if it consistently fails to respond. peer.refreshed() + # TODO: optimize this by keeping track of what was sent and sending deltas. + # This should allow us to run much more frequent refreshes, and be way more + # efficient about it. + await self.refreshBlockKnowledge(peer) proc randomPeer(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = Rng.instance.sample(peers) @@ -220,6 +228,9 @@ proc downloadInternal( else: self.pendingBlocks.setInFlight(address, false) if peers.without.len > 0: + # We have peers connected, but none of them have the block. This + # could be because our knowledge about what they have has run stale. + # Tries to refresh it. await self.refreshBlockKnowledge() self.discovery.queueFindBlocksReq(@[address.cidOrTreeCid]) diff --git a/tests/codex/blockexchange/engine/fakepeer.nim b/tests/codex/blockexchange/engine/fakepeer.nim new file mode 100644 index 0000000000..b3fcb48b36 --- /dev/null +++ b/tests/codex/blockexchange/engine/fakepeer.nim @@ -0,0 +1,155 @@ +import std/assertions +import std/enumerate +import std/sugar + +import pkg/chronos +import pkg/libp2p + +import pkg/codex/manifest +import pkg/codex/merkletree +import pkg/codex/blockexchange +import pkg/codex/blockexchange/network/network {.all.} +import pkg/codex/blockexchange/protobuf/[message, blockexc] +import pkg/codex/blocktype +import pkg/codex/rng + +import ../../helpers + +type + ## Fake network in which one real peer can talk to + ## k fake peers. + FakeNetwork* = ref object + fakePeers*: Table[PeerId, FakePeer] + sender*: BlockExcNetwork + + FakePeer* = ref object + id*: PeerId + fakeNetwork*: FakeNetwork + pendingRequests*: seq[BlockAddress] + blocks*: Table[BlockAddress, Block] + proofs*: Table[BlockAddress, CodexProof] + + Dataset* = object + blocks*: seq[Block] + proofs*: seq[CodexProof] + manifest*: Manifest + +proc makePeerId(): PeerId = + let + gen = Rng.instance() + secKey = PrivateKey.random(gen[]).tryGet() + + return PeerId.init(secKey.getPublicKey().tryGet()).tryGet() + +proc newDataset*( + nBlocks: int = 5, blockSize: NBytes = 1024.NBytes +): Future[Dataset] {.async.} = + let + blocks = await makeRandomBlocks(blockSize.int * nBlocks, blockSize) + (manifest, tree) = makeManifestAndTree(blocks).tryGet() + treeCid = tree.rootCid.tryGet() + + return Dataset( + blocks: blocks, + proofs: (0 ..< blocks.len).mapIt(tree.getProof(it).tryGet()).toSeq, + manifest: manifest, + ) + +proc storeDataset*(self: FakePeer, dataset: Dataset, slice: HSlice[int, int] = 1 .. 
0) = + let actualSlice = + if slice.len == 0: + 0 ..< dataset.blocks.len + else: + slice + + for index in actualSlice: + let address = BlockAddress.init(dataset.manifest.treeCid, index.Natural) + self.proofs[address] = dataset.proofs[index] + self.blocks[address] = dataset.blocks[index] + +proc blockPresences(self: FakePeer, addresses: seq[BlockAddress]): seq[BlockPresence] = + collect: + for address in addresses: + if self.blocks.hasKey(address): + BlockPresence(address: address, `type`: BlockPresenceType.Have) + +proc getPeer(self: FakeNetwork, id: PeerId): FakePeer = + try: + return self.fakePeers[id] + except KeyError as exc: + raise newException(Defect, "peer not found") + +proc newInstrumentedNetwork(self: FakeNetwork): BlockExcNetwork = + var sender = BlockExcNetwork() + + proc sendWantList( + id: PeerId, + addresses: seq[BlockAddress], + priority: int32 = 0, + cancel: bool = false, + wantType: WantType = WantType.WantHave, + full: bool = false, + sendDontHave: bool = false, + ) {.async: (raises: [CancelledError]).} = + var peer = self.getPeer(id) + case wantType + # WantHaves are replied to immediately. + of WantType.WantHave: + let haves = peer.blockPresences(addresses) + if haves.len > 0: + await sender.handlers.onPresence(id, haves) + + # WantBlocks are deferred till `sendPendingBlocks` is called. + of WantType.WantBlock: + let blockAddresses = addresses.filterIt(peer.blocks.hasKey(it)).toSeq + if blockAddresses.len > 0: + for blockAddress in blockAddresses: + peer.pendingRequests.add(blockAddress) + + proc sendBlocksDelivery( + id: PeerId, blocksDelivery: seq[BlockDelivery] + ) {.async: (raises: [CancelledError]).} = + var peer = self.getPeer(id) + for delivery in blocksDelivery: + peer.blocks[delivery.address] = delivery.blk + if delivery.proof.isSome: + peer.proofs[delivery.address] = delivery.proof.get + + sender.request = BlockExcRequest( + sendWantList: sendWantList, + sendBlocksDelivery: sendBlocksDelivery, + sendWantCancellations: proc( + id: PeerId, addresses: seq[BlockAddress] + ) {.async: (raises: [CancelledError]).} = + discard, + ) + + return sender + +proc sendPendingBlocks*(self: FakePeer) {.async.} = + ## Replies to any pending block requests. + let blocks = collect: + for blockAddress in self.pendingRequests: + if not self.blocks.hasKey(blockAddress): + continue + + let proof = + if blockAddress in self.proofs: + self.proofs[blockAddress].some + else: + CodexProof.none + + BlockDelivery(address: blockAddress, blk: self.blocks[blockAddress], proof: proof) + + await self.fakeNetwork.sender.handlers.onBlocksDelivery(self.id, blocks) + +proc newPeer*(self: FakeNetwork): FakePeer = + ## Adds a new `FakePeer` to a `FakeNetwork`. 
+ let peer = FakePeer(id: makePeerId(), fakeNetwork: self) + self.fakePeers[peer.id] = peer + return peer + +proc new*(_: type FakeNetwork): FakeNetwork = + let fakeNetwork = FakeNetwork() + fakeNetwork.sender = fakeNetwork.newInstrumentedNetwork() + return fakeNetwork diff --git a/tests/codex/blockexchange/engine/testengine.nim b/tests/codex/blockexchange/engine/testengine.nim index c0360fed29..6688b41712 100644 --- a/tests/codex/blockexchange/engine/testengine.nim +++ b/tests/codex/blockexchange/engine/testengine.nim @@ -520,6 +520,11 @@ asyncchecksuite "Block Download": expect CancelledError: discard (await pending).tryGet() + # test "Should not keep looking up providers for the same dataset repeatedly": + # let + # blocks = await makeRandomBlocks(datasetSize = 4096, blockSize = 128'nb) + # manifest = await storeDataGetManifest(store, blocks) + asyncchecksuite "Task Handler": var rng: Rng From d94bfe60f6f68d5e4b44cdd2e4d148c0b953265f Mon Sep 17 00:00:00 2001 From: gmega Date: Fri, 27 Jun 2025 16:04:49 -0300 Subject: [PATCH 10/20] feat: add SafeAsyncIter chaining --- codex/utils/safeasynciter.nim | 25 +++++++++++++++++++++++++ tests/codex/utils/testsafeasynciter.nim | 23 +++++++++++++++++++---- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/codex/utils/safeasynciter.nim b/codex/utils/safeasynciter.nim index d582fec3b5..be46cc0f16 100644 --- a/codex/utils/safeasynciter.nim +++ b/codex/utils/safeasynciter.nim @@ -232,3 +232,28 @@ proc empty*[T](_: type SafeAsyncIter[T]): SafeAsyncIter[T] = true SafeAsyncIter[T].new(genNext, isFinished) + +proc chain*[T](iters: seq[SafeAsyncIter[T]]): SafeAsyncIter[T] = + if iters.len == 0: + return SafeAsyncIter[T].empty + + var curIdx = 0 + + proc ensureNext(): void = + while curIdx < iters.len and iters[curIdx].finished: + inc(curIdx) + + proc isFinished(): bool = + curIdx == iters.len + + proc genNext(): Future[?!T] {.async: (raises: [CancelledError]).} = + let item = await iters[curIdx].next() + ensureNext() + return item + + ensureNext() + + return SafeAsyncIter[T].new(genNext, isFinished) + +proc chain*[T](iters: varargs[SafeAsyncIter[T]]): SafeAsyncIter[T] = + chain(iters.toSeq) diff --git a/tests/codex/utils/testsafeasynciter.nim b/tests/codex/utils/testsafeasynciter.nim index 1aeba4d2e5..7acd3640ff 100644 --- a/tests/codex/utils/testsafeasynciter.nim +++ b/tests/codex/utils/testsafeasynciter.nim @@ -373,7 +373,7 @@ asyncchecksuite "Test SafeAsyncIter": # Now, to make sure that this mechanism works, and to document its # cancellation semantics, this test shows that when the async predicate # function is cancelled, this cancellation has immediate effect, which means - # that `next()` (or more precisely `getNext()` in `mapFilter` function), is + # that `next()` (or more precisely `getNext()` in `mapFilter` function), is # interrupted immediately. If this is the case, the the iterator be interrupted # before `next()` returns this locally captured value from the previous # iteration and this is exactly the reason why at the end of the test @@ -404,10 +404,8 @@ asyncchecksuite "Test SafeAsyncIter": expect CancelledError: for fut in iter2: - if i =? (await fut): + without i =? (await fut), err: collected.add(i) - else: - fail() check: # We expect only values "0" and "1" to be collected @@ -415,3 +413,20 @@ asyncchecksuite "Test SafeAsyncIter": # will not be returned because of the cancellation. 
collected == @["0", "1"] iter2.finished + + test "should allow chaining": + let + iter1 = SafeAsyncIter[int].new(0 ..< 5) + iter2 = SafeAsyncIter[int].new(5 ..< 10) + iter3 = chain[int](iter1, SafeAsyncIter[int].empty, iter2) + + var collected: seq[int] + + for fut in iter3: + without i =? (await fut), err: + fail() + collected.add(i) + + check: + iter3.finished + collected == @[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] From db279d8fa9c67297e729189e9d13d8bf70e0a196 Mon Sep 17 00:00:00 2001 From: gmega Date: Mon, 30 Jun 2025 17:30:47 -0300 Subject: [PATCH 11/20] feat: remove quadratic joins in cancelBlocks; use SafeAsyncIterator for getBlocks; limit memory usage for fetchBatched when used as prefetcher --- codex/blockexchange/engine/engine.nim | 69 ++++++++++++++++--------- codex/node.nim | 38 ++++++++++---- codex/stores/blockstore.nim | 8 +++ codex/stores/networkstore.nim | 17 +++--- codex/stores/repostore/store.nim | 15 ++++++ tests/codex/utils/testsafeasynciter.nim | 4 +- 6 files changed, 108 insertions(+), 43 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 2550c36444..bb46b62fb9 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -256,7 +256,7 @@ proc downloadInternal( proc requestBlocks*( self: BlockExcEngine, addresses: seq[BlockAddress] -): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = +): SafeAsyncIter[Block] = var handles: seq[BlockHandle] # Adds all blocks to pendingBlocks before calling the first downloadInternal. This will # ensure that we don't send incomplete want lists. @@ -267,20 +267,27 @@ proc requestBlocks*( for address in addresses: self.trackedFutures.track(self.downloadInternal(address)) - # TODO: we can reduce latency and improve download times - # by returning blocks out of order as futures complete. - var blocks: seq[?!Block] - for handle in handles: - try: - blocks.add(success await handle) - except CancelledError as err: - warn "Block request cancelled", addresses, err = err.msg - raise err - except CatchableError as err: - error "Error getting blocks from exchange engine", addresses, err = err.msg - blocks.add(Block.failure err) + var completed: int = 0 + + proc isFinished(): bool = + completed == handles.len + + proc genNext(): Future[?!Block] {.async: (raises: [CancelledError]).} = + # Be it success or failure, we're completing this future. + let value = + try: + success await handles[completed] + except CancelledError as err: + warn "Block request cancelled", addresses, err = err.msg + raise err + except CatchableError as err: + error "Error getting blocks from exchange engine", addresses, err = err.msg + failure err - return blocks + inc(completed) + return value + + return SafeAsyncIter[Block].new(genNext, isFinished) proc requestBlock*( self: BlockExcEngine, address: BlockAddress @@ -368,28 +375,42 @@ proc cancelBlocks( ## Tells neighboring peers that we're no longer interested in a block. 
## + let addrSet = toHashSet(addrs) + var pendingCancellations: Table[PeerId, HashSet[BlockAddress]] + if self.peers.len == 0: return trace "Sending block request cancellations to peers", addrs, peers = self.peers.peerIds - proc processPeer(peerCtx: BlockExcPeerCtx): Future[BlockExcPeerCtx] {.async.} = + proc processPeer( + entry: tuple[peerId: PeerId, addresses: HashSet[BlockAddress]] + ): Future[PeerId] {.async: (raises: [CancelledError]).} = await self.network.request.sendWantCancellations( - peer = peerCtx.id, addresses = addrs.filterIt(it in peerCtx) + peer = entry.peerId, addresses = entry.addresses.toSeq ) - return peerCtx + return entry.peerId try: - let (succeededFuts, failedFuts) = await allFinishedFailed[BlockExcPeerCtx]( - toSeq(self.peers.peers.values).filterIt(it.peerHave.anyIt(it in addrs)).map( - processPeer - ) + # Does the peer have any of the blocks we're canceling? + for peerCtx in self.peers.peers.values: + let intersection = peerCtx.peerHave.intersection(addrSet) + if intersection.len > 0: + pendingCancellations[peerCtx.id] = intersection + + # If so, dispatches cancellations. + # FIXME: we're still spamming peers - the fact that the peer has the block does + # not mean we've requested it. + let (succeededFuts, failedFuts) = await allFinishedFailed[PeerId]( + toSeq(pendingCancellations.pairs).map(processPeer) ) - (await allFinished(succeededFuts)).mapIt(it.read).apply do(peerCtx: BlockExcPeerCtx): - peerCtx.cleanPresence(addrs) + (await allFinished(succeededFuts)).mapIt(it.read).apply do(peerId: PeerId): + let ctx = self.peers.get(peerId) + if not ctx.isNil: + ctx.cleanPresence(addrs) if failedFuts.len > 0: warn "Failed to send block request cancellations to peers", peers = failedFuts.len @@ -539,6 +560,8 @@ proc wantListHandler*( price = @(self.pricing.get(Pricing(price: 0.u256)).price.toBytesBE) if e.cancel: + # This is sort of expected if we sent the block to the peer, as we have removed + # it from the peer's wantlist ourselves. trace "Received cancelation for untracked block, skipping", address = e.address continue diff --git a/codex/node.nim b/codex/node.nim index ea173ad1e6..7602903291 100644 --- a/codex/node.nim +++ b/codex/node.nim @@ -44,7 +44,7 @@ import ./indexingstrategy import ./utils import ./errors import ./logutils -import ./utils/asynciter +import ./utils/safeasynciter import ./utils/trackedfutures export logutils @@ -194,20 +194,38 @@ proc fetchBatched*( if not (await address in self.networkStore) or fetchLocal: address - let - blockResults = await self.networkStore.getBlocks(addresses) - blocks = blockResults.filterIt(it.isSuccess()).mapIt(it.value) - numOfFailedBlocks = blockResults.len - blocks.len + proc successful( + blk: ?!bt.Block + ): Future[bool] {.async: (raises: [CancelledError]).} = + return blk.isSuccess() - if numOfFailedBlocks > 0: - return - failure("Some blocks failed (Result) to fetch (" & $numOfFailedBlocks & ")") + let blockResults = await self.networkStore.getBlocks(addresses) - if not onBatch.isNil and batchErr =? (await onBatch(blocks)).errorOption: + var + successfulBlocks = 0 + failedBlocks = 0 + blockData: seq[bt.Block] + + for res in blockResults: + without blk =? await res: + inc(failedBlocks) + continue + + inc(successfulBlocks) + + # Only retains block data in memory if there's + # a callback. + if not onBatch.isNil: + blockData.add(blk) + + if failedBlocks > 0: + return failure("Some blocks failed (Result) to fetch (" & $failedBlocks & ")") + + if not onBatch.isNil and batchErr =? 
(await onBatch(blockData)).errorOption: return failure(batchErr) if not iter.finished: - await sleepAsync(1.millis) + await idleAsync() success() diff --git a/codex/stores/blockstore.nim b/codex/stores/blockstore.nim index bbe0bef8ff..6c49a859df 100644 --- a/codex/stores/blockstore.nim +++ b/codex/stores/blockstore.nim @@ -65,6 +65,14 @@ method getBlock*( raiseAssert("getBlock by addr not implemented!") +method getBlocks*( + self: BlockStore, addresses: seq[BlockAddress] +): Future[SafeAsyncIter[Block]] {.async: (raises: [CancelledError]).} = + ## Gets a set of blocks from the blockstore. Blocks might + ## be returned in any order. + + raiseAssert("getBlocks not implemented!") + method getBlockAndProof*( self: BlockStore, treeCid: Cid, index: Natural ): Future[?!(Block, CodexProof)] {.base, async: (raises: [CancelledError]), gcsafe.} = diff --git a/codex/stores/networkstore.nim b/codex/stores/networkstore.nim index b20e878cd5..976a53cb32 100644 --- a/codex/stores/networkstore.nim +++ b/codex/stores/networkstore.nim @@ -31,24 +31,23 @@ type NetworkStore* = ref object of BlockStore engine*: BlockExcEngine # blockexc decision engine localStore*: BlockStore # local block store -proc getBlocks*( +method getBlocks*( self: NetworkStore, addresses: seq[BlockAddress] -): Future[seq[?!Block]] {.async: (raises: [CancelledError]).} = +): Future[SafeAsyncIter[Block]] {.async: (raises: [CancelledError]).} = var - localBlocks: seq[?!Block] + localAddresses: seq[BlockAddress] remoteAddresses: seq[BlockAddress] - # We can resolve local blocks sequentially as for now those are blocking anyway. Still: - # TODO: implement getBlocks for local store so we can delegate it here. for address in addresses: if not (await address in self.localStore): remoteAddresses.add(address) else: - localBlocks.add(await self.localStore.getBlock(address)) + localAddresses.add(address) - let remoteBlocks = await self.engine.requestBlocks(remoteAddresses) - - return localBlocks.concat(remoteBlocks) + return chain( + await self.localStore.getBlocks(localAddresses), + self.engine.requestBlocks(remoteAddresses), + ) method getBlock*( self: NetworkStore, address: BlockAddress diff --git a/codex/stores/repostore/store.nim b/codex/stores/repostore/store.nim index bea2971c71..ad6f03fc70 100644 --- a/codex/stores/repostore/store.nim +++ b/codex/stores/repostore/store.nim @@ -38,6 +38,21 @@ logScope: # BlockStore API ########################################################### +method getBlocks*( + self: RepoStore, addresses: seq[BlockAddress] +): Future[SafeAsyncIter[Block]] {.async: (raises: [CancelledError]).} = + var i = 0 + + proc isFinished(): bool = + i == addresses.len + + proc genNext(): Future[?!Block] {.async: (raises: [CancelledError]).} = + let value = await self.getBlock(addresses[i]) + inc(i) + return value + + return SafeAsyncIter[Block].new(genNext, isFinished) + method getBlock*( self: RepoStore, cid: Cid ): Future[?!Block] {.async: (raises: [CancelledError]).} = diff --git a/tests/codex/utils/testsafeasynciter.nim b/tests/codex/utils/testsafeasynciter.nim index 7acd3640ff..87b0d84ad9 100644 --- a/tests/codex/utils/testsafeasynciter.nim +++ b/tests/codex/utils/testsafeasynciter.nim @@ -404,8 +404,10 @@ asyncchecksuite "Test SafeAsyncIter": expect CancelledError: for fut in iter2: - without i =? (await fut), err: + if i =? 
(await fut): collected.add(i) + else: + fail() check: # We expect only values "0" and "1" to be collected From 83fd83ffa5cbc74d28e6d13de1713de8c1c700c3 Mon Sep 17 00:00:00 2001 From: gmega Date: Mon, 30 Jun 2025 20:10:56 -0300 Subject: [PATCH 12/20] feat: allow futures to be returned out-of-order to decrease memory consumption --- codex/blockexchange/engine/engine.nim | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index bb46b62fb9..e3af0593f1 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -258,6 +258,7 @@ proc requestBlocks*( self: BlockExcEngine, addresses: seq[BlockAddress] ): SafeAsyncIter[Block] = var handles: seq[BlockHandle] + # Adds all blocks to pendingBlocks before calling the first downloadInternal. This will # ensure that we don't send incomplete want lists. for address in addresses: @@ -276,7 +277,13 @@ proc requestBlocks*( # Be it success or failure, we're completing this future. let value = try: - success await handles[completed] + # FIXME: this is super expensive. We're doing several linear scans, + # not to mention all the copying and callback fumbling in `one`. + let + handle = await one(handles) + i = handles.find(handle) + handles.del(i) + success await handle except CancelledError as err: warn "Block request cancelled", addresses, err = err.msg raise err From cdab172bae96efa4ec09ad794c1efc954e7ce342 Mon Sep 17 00:00:00 2001 From: gmega Date: Mon, 30 Jun 2025 20:19:32 -0300 Subject: [PATCH 13/20] chore: remove file committed by accident --- tests/codex/blockexchange/engine/fakepeer.nim | 155 ------------------ 1 file changed, 155 deletions(-) delete mode 100644 tests/codex/blockexchange/engine/fakepeer.nim diff --git a/tests/codex/blockexchange/engine/fakepeer.nim b/tests/codex/blockexchange/engine/fakepeer.nim deleted file mode 100644 index b3fcb48b36..0000000000 --- a/tests/codex/blockexchange/engine/fakepeer.nim +++ /dev/null @@ -1,155 +0,0 @@ -import std/assertions -import std/enumerate -import std/sugar - -import pkg/chronos -import pkg/libp2p - -import pkg/codex/manifest -import pkg/codex/merkletree -import pkg/codex/blockexchange -import pkg/codex/blockexchange/network/network {.all.} -import pkg/codex/blockexchange/protobuf/[message, blockexc] -import pkg/codex/blocktype -import pkg/codex/rng - -import ../../helpers - -type - ## Fake network in which one real peer can talk to - ## k fake peers. 
- FakeNetwork* = ref object - fakePeers*: Table[PeerId, FakePeer] - sender*: BlockExcNetwork - - FakePeer* = ref object - id*: PeerId - fakeNetwork*: FakeNetwork - pendingRequests*: seq[BlockAddress] - blocks*: Table[BlockAddress, Block] - proofs*: Table[BlockAddress, CodexProof] - - Dataset* = object - blocks*: seq[Block] - proofs*: seq[CodexProof] - manifest*: Manifest - -proc makePeerId(): PeerId = - let - gen = Rng.instance() - secKey = PrivateKey.random(gen[]).tryGet() - - return PeerId.init(secKey.getPublicKey().tryGet()).tryGet() - -proc newDataset*( - nBlocks: int = 5, blockSize: NBytes = 1024.NBytes -): Future[Dataset] {.async.} = - let - blocks = await makeRandomBlocks(blockSize.int * nBlocks, blockSize) - (manifest, tree) = makeManifestAndTree(blocks).tryGet() - treeCid = tree.rootCid.tryGet() - - return Dataset( - blocks: blocks, - proofs: (0 ..< blocks.len).mapIt(tree.getProof(it).tryGet()).toSeq, - manifest: manifest, - ) - -proc storeDataset*(self: FakePeer, dataset: Dataset, slice: HSlice[int, int] = 1 .. 0) = - let actualSlice = - if slice.len == 0: - 0 ..< dataset.blocks.len - else: - slice - - for index in actualSlice: - let address = BlockAddress.init(dataset.manifest.treeCid, index.Natural) - self.proofs[address] = dataset.proofs[index] - self.blocks[address] = dataset.blocks[index] - -proc blockPresences(self: FakePeer, addresses: seq[BlockAddress]): seq[BlockPresence] = - collect: - for address in addresses: - if self.blocks.hasKey(address): - BlockPresence(address: address, `type`: BlockPresenceType.Have) - -proc getPeer(self: FakeNetwork, id: PeerId): FakePeer = - try: - return self.fakePeers[id] - except KeyError as exc: - raise newException(Defect, "peer not found") - -proc newInstrumentedNetwork(self: FakeNetwork): BlockExcNetwork = - var sender = BlockExcNetwork() - - proc sendWantList( - id: PeerId, - addresses: seq[BlockAddress], - priority: int32 = 0, - cancel: bool = false, - wantType: WantType = WantType.WantHave, - full: bool = false, - sendDontHave: bool = false, - ) {.async: (raises: [CancelledError]).} = - var peer = self.getPeer(id) - case wantType - # WantHaves are replied to immediately. - of WantType.WantHave: - let haves = peer.blockPresences(addresses) - if haves.len > 0: - await sender.handlers.onPresence(id, haves) - - # WantBlocks are deferred till `sendPendingBlocks` is called. - of WantType.WantBlock: - let blockAddresses = addresses.filterIt(peer.blocks.hasKey(it)).toSeq - if blockAddresses.len > 0: - for blockAddress in blockAddresses: - peer.pendingRequests.add(blockAddress) - - proc sendBlocksDelivery( - id: PeerId, blocksDelivery: seq[BlockDelivery] - ) {.async: (raises: [CancelledError]).} = - var peer = self.getPeer(id) - for delivery in blocksDelivery: - peer.blocks[delivery.address] = delivery.blk - if delivery.proof.isSome: - peer.proofs[delivery.address] = delivery.proof.get - - sender.request = BlockExcRequest( - sendWantList: sendWantList, - sendBlocksDelivery: sendBlocksDelivery, - sendWantCancellations: proc( - id: PeerId, addresses: seq[BlockAddress] - ) {.async: (raises: [CancelledError]).} = - discard, - ) - - return sender - -proc sendPendingBlocks*(self: FakePeer) {.async.} = - ## Replies to any pending block requests. 
- let blocks = collect: - for blockAddress in self.pendingRequests: - if not self.blocks.hasKey(blockAddress): - continue - - let proof = - if blockAddress in self.proofs: - self.proofs[blockAddress].some - else: - CodexProof.none - - BlockDelivery(address: blockAddress, blk: self.blocks[blockAddress], proof: proof) - - await self.fakeNetwork.sender.handlers.onBlocksDelivery(self.id, blocks) - -proc newPeer*(self: FakeNetwork): FakePeer = - ## Adds a new `FakePeer` to a `FakeNetwork`. - let peer = FakePeer(id: makePeerId(), fakeNetwork: self) - self.fakePeers[peer.id] = peer - return peer - -proc new*(_: type FakeNetwork): FakeNetwork = - let fakeNetwork = FakeNetwork() - fakeNetwork.sender = fakeNetwork.newInstrumentedNetwork() - return fakeNetwork From ad8e25060525a75a9986a55a66d164745308b597 Mon Sep 17 00:00:00 2001 From: gmega Date: Tue, 1 Jul 2025 21:20:14 -0300 Subject: [PATCH 14/20] feat: modify retry mechanism; add DHT guard rails; improve block cancellation handling --- codex/blockexchange/engine/engine.nim | 87 ++++++++++++++------ codex/blockexchange/engine/pendingblocks.nim | 2 +- codex/blockexchange/peers/peercontext.nim | 46 +++++++++-- 3 files changed, 102 insertions(+), 33 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index e3af0593f1..4acc87ab17 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -73,6 +73,9 @@ const DefaultMaxBlocksPerMessage = 500 DefaultTaskQueueSize = 100 DefaultConcurrentTasks = 10 + # Don't do more than one discovery request per `DiscoveryRateLimit` seconds. + DiscoveryRateLimit = 1.seconds + DefaultPeerActivityTimeout = 1.minutes type TaskHandler* = proc(task: BlockExcPeerCtx): Future[void] {.gcsafe.} @@ -94,6 +97,7 @@ type pricing*: ?Pricing # Optional bandwidth pricing discovery*: DiscoveryEngine advertiser*: Advertiser + lastDiscRequest: Moment # time of last discovery request Pricing* = object address*: EthAddress @@ -193,6 +197,14 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr # efficient about it. await self.refreshBlockKnowledge(peer) +proc searchForNewPeers(self: BlockExcEngine, cid: Cid) = + if self.lastDiscRequest + DiscoveryRateLimit < Moment.now(): + trace "Searching for new peers for", cid = cid + self.lastDiscRequest = Moment.now() # always refresh before calling await! + self.discovery.queueFindBlocksReq(@[cid]) + else: + trace "Not searching for new peers, rate limit not expired", cid = cid + proc randomPeer(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = Rng.instance.sample(peers) @@ -215,34 +227,56 @@ proc downloadInternal( handle.fail(newException(RetriesExhaustedError, "Error retries exhausted")) break - trace "Running retry handle" let peers = self.peers.getPeersForBlock(address) logScope: peersWith = peers.with.len peersWithout = peers.without.len - trace "Peers for block" - if peers.with.len > 0: - self.pendingBlocks.setInFlight(address, true) - await self.sendWantBlock(@[address], peers.with.randomPeer) - else: - self.pendingBlocks.setInFlight(address, false) + if peers.with.len == 0: + # We know of no peers that have the block. if peers.without.len > 0: - # We have peers connected, but none of them have the block. This + # If we have peers connected but none of them have the block, this # could be because our knowledge about what they have has run stale. # Tries to refresh it. 
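#
# Both time-based guards in this commit -- the per-peer knowledge staleness
# check and the `DiscoveryRateLimit` used by `searchForNewPeers` above --
# boil down to the same chronos `Moment` comparison. A minimal illustrative
# sketch of that pattern (standalone; `TimeGuard` and `tryFire` are invented
# names, not part of this patch):

import pkg/chronos

type TimeGuard = object
  last: Moment # when the guarded action last fired
  minInterval: Duration

proc tryFire(guard: var TimeGuard): bool =
  ## Returns true (and re-arms the guard) at most once per `minInterval`.
  if guard.last + guard.minInterval < Moment.now():
    guard.last = Moment.now() # refresh before any await, as the patch notes
    return true
  false

var discoveryGuard = TimeGuard(minInterval: 1.seconds)
assert discoveryGuard.tryFire()     # first attempt goes through
assert not discoveryGuard.tryFire() # an immediate retry is suppressed
#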
await self.refreshBlockKnowledge() - self.discovery.queueFindBlocksReq(@[address.cidOrTreeCid]) + # Also tries to look for new peers for good measure. + # TODO: in the future, peer search and knowledge maintenance should + # be completely decoupled from one another. It is very hard to + # control what happens and how many neighbors we get like this. + self.searchForNewPeers(address.cidOrTreeCid) + + # We wait for a bit and then retry. Since we've shipped our wantlists to + # connected peers, they might reply and we might request the block in the + # meantime as part of `blockPresenceHandler`. + await handle or sleepAsync(self.pendingBlocks.retryInterval) + # If we already got the block, we're done. Otherwise, we'll go for another + # cycle, potentially refreshing knowledge on peers again, and looking up + # the DHT again. + if handle.finished: + break + trace "No peers for block, will retry shortly" + continue + + let scheduledPeer = peers.with.randomPeer + self.pendingBlocks.setInFlight(address, true) + scheduledPeer.blockRequested(address) + await self.sendWantBlock(@[address], scheduledPeer) + + let activityTimer = scheduledPeer.activityTimer() + await handle or activityTimer # TODO: or peerDropped + activityTimer.cancel() - # FIXME: blocks should not blindly reschedule themselves. Instead, - # we should only reschedule a block if the peer drops, or we are - # in endgame mode. - await (handle or sleepAsync(self.pendingBlocks.retryInterval)) + # XXX: we should probably not have this. Blocks should be retried + # to infinity unless cancelled by the client. self.pendingBlocks.decRetries(address) if handle.finished: trace "Handle for block finished", failed = handle.failed break + else: + # If the peer timed out, retries immediately. + trace "Dropping timed out peer.", peer = scheduledPeer.id + # TODO: disconnect peer except CancelledError as exc: trace "Block download cancelled" if not handle.finished: @@ -347,6 +381,7 @@ proc blockPresenceHandler*( for address in ourWantCids: self.pendingBlocks.setInFlight(address, true) self.pendingBlocks.decRetries(address) + peerCtx.blockRequested(address) if ourWantCids.len > 0: trace "Peer has blocks in our wantList", peer, wants = ourWantCids @@ -401,15 +436,13 @@ proc cancelBlocks( return entry.peerId try: - # Does the peer have any of the blocks we're canceling? for peerCtx in self.peers.peers.values: - let intersection = peerCtx.peerHave.intersection(addrSet) + # Have we requested any of the blocks we're cancelling to this peer? + let intersection = peerCtx.blocksRequested.intersection(addrSet) if intersection.len > 0: pendingCancellations[peerCtx.id] = intersection # If so, dispatches cancellations. - # FIXME: we're still spamming peers - the fact that the peer has the block does - # not mean we've requested it. 
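#
# The cancellation change above swaps "peer advertises the block" for
# "we actually requested the block from that peer" as the criterion, so the
# core of `cancelBlocks` is now a per-peer set intersection. A minimal
# illustrative sketch of that bookkeeping (standalone; plain strings stand
# in for PeerId/BlockAddress and the data is invented):

import std/[sets, tables]

let
  delivered = toHashSet(["blk-1", "blk-2"]) # blocks that just arrived
  requested = {
    "peer-a": toHashSet(["blk-1", "blk-3"]),
    "peer-b": toHashSet(["blk-4"])
  }.toTable # outstanding requests per peer

var cancellations: Table[string, HashSet[string]]
for peer, blocks in requested.pairs:
  let overlap = blocks.intersection(delivered)
  if overlap.len > 0:
    cancellations[peer] = overlap

assert cancellations["peer-a"] == toHashSet(["blk-1"])
assert "peer-b" notin cancellations # never asked peer-b, so nothing to cancel
#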
let (succeededFuts, failedFuts) = await allFinishedFailed[PeerId]( toSeq(pendingCancellations.pairs).map(processPeer) ) @@ -418,6 +451,8 @@ proc cancelBlocks( let ctx = self.peers.get(peerId) if not ctx.isNil: ctx.cleanPresence(addrs) + for address in pendingCancellations[peerId]: + ctx.blockRequestCancelled(address) if failedFuts.len > 0: warn "Failed to send block request cancellations to peers", peers = failedFuts.len @@ -492,6 +527,8 @@ proc blocksDeliveryHandler*( trace "Received blocks from peer", peer, blocks = (blocksDelivery.mapIt(it.address)) var validatedBlocksDelivery: seq[BlockDelivery] + let peerCtx = self.peers.get(peer) + for bd in blocksDelivery: logScope: peer = peer @@ -517,6 +554,9 @@ proc blocksDeliveryHandler*( ).errorOption: warn "Unable to store proof and cid for a block" continue + + if peerCtx != nil: + peerCtx.blockReceived(bd.address) except CatchableError as exc: warn "Error handling block delivery", error = exc.msg continue @@ -525,7 +565,6 @@ proc blocksDeliveryHandler*( codex_block_exchange_blocks_received.inc(validatedBlocksDelivery.len.int64) - let peerCtx = self.peers.get(peer) if peerCtx != nil: if err =? catch(await self.payForBlocks(peerCtx, blocksDelivery)).errorOption: warn "Error paying for blocks", err = err.msg @@ -652,7 +691,7 @@ proc setupPeer*( trace "Setting up peer", peer if peer notin self.peers: - let peerCtx = BlockExcPeerCtx(id: peer) + let peerCtx = BlockExcPeerCtx(id: peer, activityTimeout: DefaultPeerActivityTimeout) trace "Setting up new peer", peer self.peers.add(peerCtx) trace "Added peer", peers = self.peers.len @@ -701,14 +740,14 @@ proc taskHandler*( # Send to the peer blocks he wants to get, # if they present in our local store - # Blocks that are in flight have already been picked up by other tasks and + # Blocks that have been sent have already been picked up by other tasks and # should not be re-sent. var - wantedBlocks = peerCtx.wantedBlocks.filterIt(not peerCtx.isInFlight(it)) + wantedBlocks = peerCtx.wantedBlocks.filterIt(not peerCtx.isBlockSent(it)) sent: HashSet[BlockAddress] for wantedBlock in wantedBlocks: - peerCtx.addInFlight(wantedBlock) + peerCtx.markBlockAsSent(wantedBlock) try: for batch in wantedBlocks.toSeq.splitBatches(self.maxBlocksPerMessage): @@ -718,7 +757,7 @@ proc taskHandler*( without blockDelivery =? await self.localLookup(wantedBlock), err: error "Error getting block from local store", err = err.msg, address = wantedBlock - peerCtx.removeInFlight(wantedBlock) + peerCtx.markBlockAsNotSent(wantedBlock) continue blockDeliveries.add(blockDelivery) sent.incl(wantedBlock) @@ -737,7 +776,7 @@ proc taskHandler*( finally: # Better safe than sorry: if an exception does happen, we don't want to keep # those in flight as it'll effectively prevent the blocks from ever being sent. 
- peerCtx.blocksInFlight.keepItIf(it notin wantedBlocks) + peerCtx.blocksSent.keepItIf(it notin wantedBlocks) proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} = ## process tasks diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim index 6c45d6f52d..d78dda5a6b 100644 --- a/codex/blockexchange/engine/pendingblocks.nim +++ b/codex/blockexchange/engine/pendingblocks.nim @@ -34,7 +34,7 @@ declareGauge( const DefaultBlockRetries* = 3000 - DefaultRetryInterval* = 180.seconds + DefaultRetryInterval* = 5.seconds type RetriesExhaustedError* = object of CatchableError diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index 857a8fa902..cca6d3876e 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -30,23 +30,25 @@ type BlockExcPeerCtx* = ref object of RootObj blocks*: Table[BlockAddress, Presence] # remote peer have list including price wantedBlocks*: HashSet[BlockAddress] # blocks that the peer wants exchanged*: int # times peer has exchanged with us - lastExchange*: Moment # last time peer has exchanged with us lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has account*: ?Account # ethereum account of this peer paymentChannel*: ?ChannelId # payment channel id - blocksInFlight*: HashSet[BlockAddress] # blocks in flight towards peer + blocksSent*: HashSet[BlockAddress] # blocks sent to peer + blocksRequested*: HashSet[BlockAddress] # pending block requests to this peer + lastExchange*: Moment # last time peer has sent us a block + activityTimeout*: Duration proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = self.lastRefresh + 5.minutes < Moment.now() -proc isInFlight*(self: BlockExcPeerCtx, address: BlockAddress): bool = - address in self.blocksInFlight +proc isBlockSent*(self: BlockExcPeerCtx, address: BlockAddress): bool = + address in self.blocksSent -proc addInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = - self.blocksInFlight.incl(address) +proc markBlockAsSent*(self: BlockExcPeerCtx, address: BlockAddress) = + self.blocksSent.incl(address) -proc removeInFlight*(self: BlockExcPeerCtx, address: BlockAddress) = - self.blocksInFlight.excl(address) +proc markBlockAsNotSent*(self: BlockExcPeerCtx, address: BlockAddress) = + self.blocksSent.excl(address) proc refreshed*(self: BlockExcPeerCtx) = self.lastRefresh = Moment.now() @@ -77,3 +79,31 @@ func price*(self: BlockExcPeerCtx, addresses: seq[BlockAddress]): UInt256 = price += precense[].price price + +proc blockRequested*(self: BlockExcPeerCtx, address: BlockAddress) = + # We start counting the timeout from the first block requested. + if self.blocksRequested.len == 0: + self.lastExchange = Moment.now() + self.blocksRequested.incl(address) + +proc blockRequestCancelled*(self: BlockExcPeerCtx, address: BlockAddress) = + self.blocksRequested.excl(address) + +proc blockReceived*(self: BlockExcPeerCtx, address: BlockAddress) = + self.blocksRequested.excl(address) + self.lastExchange = Moment.now() + +proc activityTimer*( + self: BlockExcPeerCtx +): Future[void] {.async: (raises: [CancelledError]).} = + ## This is called by the block exchange when a block is scheduled for this peer. + ## If the peer sends no blocks for a while, it is considered inactive/uncooperative + ## and the peer is dropped. Note that ANY block that the peer sends will reset this + ## timer for all blocks. 
+ ## + while true: + let idleTime = Moment.now() - self.lastExchange + if idleTime > self.activityTimeout: + return + + await sleepAsync(self.activityTimeout - idleTime) From 97fd68e4a393f1d9b78ef991456d00d7b684d3d3 Mon Sep 17 00:00:00 2001 From: gmega Date: Wed, 2 Jul 2025 20:17:06 -0300 Subject: [PATCH 15/20] feat: drop peer on activity timeout --- codex/blockexchange/engine/engine.nim | 145 +++++++----- codex/blockexchange/engine/pendingblocks.nim | 40 +++- codex/blockexchange/network/network.nim | 84 ++++--- codex/blockexchange/peers/peercontext.nim | 4 +- .../codex/blockexchange/engine/testengine.nim | 207 +++++++++--------- 5 files changed, 280 insertions(+), 200 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 4acc87ab17..453dea4f87 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -64,6 +64,10 @@ declareCounter(codex_block_exchange_blocks_sent, "codex blockexchange blocks sen declareCounter( codex_block_exchange_blocks_received, "codex blockexchange blocks received" ) +declareCounter( + codex_block_exchange_spurious_blocks_received, + "codex blockexchange unrequested/duplicate blocks received", +) const DefaultMaxPeersPerRequest* = 10 @@ -80,13 +84,15 @@ const type TaskHandler* = proc(task: BlockExcPeerCtx): Future[void] {.gcsafe.} TaskScheduler* = proc(task: BlockExcPeerCtx): bool {.gcsafe.} + PeerSelector* = + proc(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx {.gcsafe, raises: [].} BlockExcEngine* = ref object of RootObj localStore*: BlockStore # Local block store for this instance - network*: BlockExcNetwork # Petwork interface + network*: BlockExcNetwork # Network interface peers*: PeerCtxStore # Peers we're currently actively exchanging with taskQueue*: AsyncHeapQueue[BlockExcPeerCtx] - # Peers we're currently processing tasks for + selectPeer*: PeerSelector # Peers we're currently processing tasks for concurrentTasks: int # Number of concurrent peers we're serving at any given time trackedFutures: TrackedFutures # Tracks futures of blockexc tasks blockexcRunning: bool # Indicates if the blockexc task is running @@ -205,8 +211,19 @@ proc searchForNewPeers(self: BlockExcEngine, cid: Cid) = else: trace "Not searching for new peers, rate limit not expired", cid = cid -proc randomPeer(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = - Rng.instance.sample(peers) +proc evictPeer(self: BlockExcEngine, peer: PeerId) = + ## Cleanup disconnected peer + ## + + trace "Evicting disconnected/departed peer", peer + + let peerCtx = self.peers.get(peer) + if not peerCtx.isNil: + for address in peerCtx.blocksRequested: + self.pendingBlocks.clearRequest(address, peer.some) + + # drop the peer from the peers table + self.peers.remove(peer) proc downloadInternal( self: BlockExcEngine, address: BlockAddress @@ -245,23 +262,34 @@ proc downloadInternal( # control what happens and how many neighbors we get like this. self.searchForNewPeers(address.cidOrTreeCid) - # We wait for a bit and then retry. Since we've shipped our wantlists to - # connected peers, they might reply and we might request the block in the - # meantime as part of `blockPresenceHandler`. + # We now wait for a bit and then retry. If the handle gets completed in the + # meantime (cause the presence handler might have requested the block and + # received it in the meantime), we are done. await handle or sleepAsync(self.pendingBlocks.retryInterval) - # If we already got the block, we're done. 
Otherwise, we'll go for another - # cycle, potentially refreshing knowledge on peers again, and looking up - # the DHT again. if handle.finished: break + # If we still don't have the block, we'll go for another cycle. trace "No peers for block, will retry shortly" continue - let scheduledPeer = peers.with.randomPeer - self.pendingBlocks.setInFlight(address, true) - scheduledPeer.blockRequested(address) - await self.sendWantBlock(@[address], scheduledPeer) + # Once again, it might happen that the block was requested to a peer + # in the meantime. If so, we don't need to do anything. Otherwise, + # we'll be the ones placing the request. + let scheduledPeer = + if not self.pendingBlocks.isRequested(address): + let peer = self.selectPeer(peers.with) + self.pendingBlocks.markRequested(address, peer.id) + peer.blockRequested(address) + trace "Request block from block retry loop" + await self.sendWantBlock(@[address], peer) + peer + else: + let peerId = self.pendingBlocks.getRequestPeer(address).get() + self.peers.get(peerId) + + assert not scheduledPeer.isNil + # Parks until either the block is received, or the peer times out. let activityTimer = scheduledPeer.activityTimer() await handle or activityTimer # TODO: or peerDropped activityTimer.cancel() @@ -275,8 +303,11 @@ proc downloadInternal( break else: # If the peer timed out, retries immediately. - trace "Dropping timed out peer.", peer = scheduledPeer.id - # TODO: disconnect peer + trace "Peer timed out during block request", peer = scheduledPeer.id + await self.network.dropPeer(scheduledPeer.id) + # Evicts peer immediately or we may end up picking it again in the + # next retry. + self.evictPeer(scheduledPeer.id) except CancelledError as exc: trace "Block download cancelled" if not handle.finished: @@ -286,7 +317,7 @@ proc downloadInternal( if not handle.finished: handle.fail(exc) finally: - self.pendingBlocks.setInFlight(address, false) + self.pendingBlocks.clearRequest(address) proc requestBlocks*( self: BlockExcEngine, addresses: seq[BlockAddress] @@ -375,12 +406,12 @@ proc blockPresenceHandler*( let ourWantCids = ourWantList.filterIt( it in peerHave and not self.pendingBlocks.retriesExhausted(it) and - not self.pendingBlocks.isInFlight(it) + not self.pendingBlocks.isRequested(it) ).toSeq for address in ourWantCids: - self.pendingBlocks.setInFlight(address, true) self.pendingBlocks.decRetries(address) + self.pendingBlocks.markRequested(address, peer) peerCtx.blockRequested(address) if ourWantCids.len > 0: @@ -388,6 +419,8 @@ proc blockPresenceHandler*( # FIXME: this will result in duplicate requests for blocks if err =? catch(await self.sendWantBlock(ourWantCids, peerCtx)).errorOption: warn "Failed to send wantBlock to peer", peer, err = err.msg + for address in ourWantCids: + self.pendingBlocks.clearRequest(address, peer.some) proc scheduleTasks( self: BlockExcEngine, blocksDelivery: seq[BlockDelivery] @@ -417,18 +450,17 @@ proc cancelBlocks( ## Tells neighboring peers that we're no longer interested in a block. 
## - let addrSet = toHashSet(addrs) - var pendingCancellations: Table[PeerId, HashSet[BlockAddress]] + let blocksDelivered = toHashSet(addrs) + var scheduledCancellations: Table[PeerId, HashSet[BlockAddress]] if self.peers.len == 0: return - trace "Sending block request cancellations to peers", - addrs, peers = self.peers.peerIds - - proc processPeer( + proc dispatchCancellations( entry: tuple[peerId: PeerId, addresses: HashSet[BlockAddress]] ): Future[PeerId] {.async: (raises: [CancelledError]).} = + trace "Sending block request cancellations to peer", + peer = entry.peerId, addresses = entry.addresses.len await self.network.request.sendWantCancellations( peer = entry.peerId, addresses = entry.addresses.toSeq ) @@ -437,21 +469,22 @@ proc cancelBlocks( try: for peerCtx in self.peers.peers.values: - # Have we requested any of the blocks we're cancelling to this peer? - let intersection = peerCtx.blocksRequested.intersection(addrSet) + # Do we have pending requests, towards this peer, for any of the blocks + # that were just delivered? + let intersection = peerCtx.blocksRequested.intersection(blocksDelivered) if intersection.len > 0: - pendingCancellations[peerCtx.id] = intersection + # If so, schedules a cancellation. + scheduledCancellations[peerCtx.id] = intersection - # If so, dispatches cancellations. let (succeededFuts, failedFuts) = await allFinishedFailed[PeerId]( - toSeq(pendingCancellations.pairs).map(processPeer) + toSeq(scheduledCancellations.pairs).map(dispatchCancellations) ) (await allFinished(succeededFuts)).mapIt(it.read).apply do(peerId: PeerId): let ctx = self.peers.get(peerId) if not ctx.isNil: ctx.cleanPresence(addrs) - for address in pendingCancellations[peerId]: + for address in scheduledCancellations[peerId]: ctx.blockRequestCancelled(address) if failedFuts.len > 0: @@ -535,6 +568,12 @@ proc blocksDeliveryHandler*( address = bd.address try: + # Unknown peers and unrequested blocks are dropped with a warning. + if peerCtx == nil or not peerCtx.blockReceived(bd.address): + warn "Dropping unrequested or duplicate block received from peer" + codex_block_exchange_spurious_blocks_received.inc() + continue + if err =? self.validateBlockDelivery(bd).errorOption: warn "Block validation failed", msg = err.msg continue @@ -554,9 +593,6 @@ proc blocksDeliveryHandler*( ).errorOption: warn "Unable to store proof and cid for a block" continue - - if peerCtx != nil: - peerCtx.blockReceived(bd.address) except CatchableError as exc: warn "Error handling block delivery", error = exc.msg continue @@ -681,7 +717,7 @@ proc paymentHandler*( else: context.paymentChannel = self.wallet.acceptChannel(payment).option -proc setupPeer*( +proc peerAddedHandler*( self: BlockExcEngine, peer: PeerId ) {.async: (raises: [CancelledError]).} = ## Perform initial setup, such as want @@ -701,15 +737,6 @@ proc setupPeer*( trace "Sending account to peer", peer await self.network.request.sendAccount(peer, Account(address: address)) -proc dropPeer*(self: BlockExcEngine, peer: PeerId) {.raises: [].} = - ## Cleanup disconnected peer - ## - - trace "Dropping peer", peer - - # drop the peer from the peers table - self.peers.remove(peer) - proc localLookup( self: BlockExcEngine, address: BlockAddress ): Future[?!BlockDelivery] {.async: (raises: [CancelledError]).} = @@ -775,7 +802,7 @@ proc taskHandler*( peerCtx.wantedBlocks.keepItIf(it notin sent) finally: # Better safe than sorry: if an exception does happen, we don't want to keep - # those in flight as it'll effectively prevent the blocks from ever being sent. 
+ # those as sent, as it'll effectively prevent the blocks from ever being sent again. peerCtx.blocksSent.keepItIf(it notin wantedBlocks) proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} = @@ -792,6 +819,9 @@ proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} = info "Exiting blockexc task runner" +proc selectRandom*(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = + Rng.instance.sample(peers) + proc new*( T: type BlockExcEngine, localStore: BlockStore, @@ -803,6 +833,7 @@ proc new*( pendingBlocks: PendingBlocksManager, maxBlocksPerMessage = DefaultMaxBlocksPerMessage, concurrentTasks = DefaultConcurrentTasks, + selectPeer: PeerSelector = selectRandom, ): BlockExcEngine = ## Create new block exchange engine instance ## @@ -821,18 +852,6 @@ proc new*( advertiser: advertiser, ) - proc peerEventHandler( - peerId: PeerId, event: PeerEvent - ): Future[void] {.gcsafe, async: (raises: [CancelledError]).} = - if event.kind == PeerEventKind.Joined: - await self.setupPeer(peerId) - else: - self.dropPeer(peerId) - - if not isNil(network.switch): - network.switch.addPeerEventHandler(peerEventHandler, PeerEventKind.Joined) - network.switch.addPeerEventHandler(peerEventHandler, PeerEventKind.Left) - proc blockWantListHandler( peer: PeerId, wantList: WantList ): Future[void] {.async: (raises: []).} = @@ -858,12 +877,24 @@ proc new*( ): Future[void] {.async: (raises: []).} = self.paymentHandler(peer, payment) + proc peerAddedHandler( + peer: PeerId + ): Future[void] {.async: (raises: [CancelledError]).} = + await self.peerAddedHandler(peer) + + proc peerDepartedHandler( + peer: PeerId + ): Future[void] {.async: (raises: [CancelledError]).} = + self.evictPeer(peer) + network.handlers = BlockExcHandlers( onWantList: blockWantListHandler, onBlocksDelivery: blocksDeliveryHandler, onPresence: blockPresenceHandler, onAccount: accountHandler, onPayment: paymentHandler, + onPeerJoined: peerAddedHandler, + onPeerDeparted: peerDepartedHandler, ) return self diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim index d78dda5a6b..5579a1faca 100644 --- a/codex/blockexchange/engine/pendingblocks.nim +++ b/codex/blockexchange/engine/pendingblocks.nim @@ -42,7 +42,7 @@ type BlockReq* = object handle*: BlockHandle - inFlight*: bool + requested*: ?PeerId blockRetries*: int startTime*: int64 @@ -56,7 +56,7 @@ proc updatePendingBlockGauge(p: PendingBlocksManager) = codex_block_exchange_pending_block_requests.set(p.blocks.len.int64) proc getWantHandle*( - self: PendingBlocksManager, address: BlockAddress, inFlight = false + self: PendingBlocksManager, address: BlockAddress, requested: ?PeerId = PeerId.none ): Future[Block] {.async: (raw: true, raises: [CancelledError, RetriesExhaustedError]).} = ## Add an event for a block ## @@ -66,7 +66,7 @@ proc getWantHandle*( do: let blk = BlockReq( handle: newFuture[Block]("pendingBlocks.getWantHandle"), - inFlight: inFlight, + requested: requested, blockRetries: self.blockRetries, startTime: getMonoTime().ticks, ) @@ -89,9 +89,9 @@ proc getWantHandle*( return handle proc getWantHandle*( - self: PendingBlocksManager, cid: Cid, inFlight = false + self: PendingBlocksManager, cid: Cid, requested: ?PeerId = PeerId.none ): Future[Block] {.async: (raw: true, raises: [CancelledError, RetriesExhaustedError]).} = - self.getWantHandle(BlockAddress.init(cid), inFlight) + self.getWantHandle(BlockAddress.init(cid), requested) proc resolve*( self: PendingBlocksManager, blocksDelivery: seq[BlockDelivery] @@ 
-128,19 +128,37 @@ func retriesExhausted*(self: PendingBlocksManager, address: BlockAddress): bool self.blocks.withValue(address, pending): result = pending[].blockRetries <= 0 -func setInFlight*(self: PendingBlocksManager, address: BlockAddress, inFlight = true) = - ## Set inflight status for a block +func isRequested*(self: PendingBlocksManager, address: BlockAddress): bool = + ## Check if a block has been requested to a peer ## + result = false + self.blocks.withValue(address, pending): + result = pending[].requested.isSome +func getRequestPeer*(self: PendingBlocksManager, address: BlockAddress): ?PeerId = + ## Returns the peer that requested this block + ## + result = PeerId.none self.blocks.withValue(address, pending): - pending[].inFlight = inFlight + result = pending[].requested -func isInFlight*(self: PendingBlocksManager, address: BlockAddress): bool = - ## Check if a block is in flight +proc markRequested*(self: PendingBlocksManager, address: BlockAddress, peer: PeerId) = + ## Marks this block as having been requested to a peer ## + if self.isRequested(address): + error "Attempt to request block twice", address = address, peer = peer + + self.blocks.withValue(address, pending): + pending[].requested = peer.some + +proc clearRequest*( + self: PendingBlocksManager, address: BlockAddress, peer: ?PeerId = PeerId.none +) = self.blocks.withValue(address, pending): - result = pending[].inFlight + if peer.isSome: + assert peer == pending[].requested + pending[].requested = PeerId.none func contains*(self: PendingBlocksManager, cid: Cid): bool = BlockAddress.init(cid) in self.blocks diff --git a/codex/blockexchange/network/network.nim b/codex/blockexchange/network/network.nim index d47541104a..fb6ff2d51b 100644 --- a/codex/blockexchange/network/network.nim +++ b/codex/blockexchange/network/network.nim @@ -44,6 +44,7 @@ type AccountHandler* = proc(peer: PeerId, account: Account) {.gcsafe, async: (raises: []).} PaymentHandler* = proc(peer: PeerId, payment: SignedState) {.gcsafe, async: (raises: []).} + PeerEventHandler* = proc(peer: PeerId) {.gcsafe, async: (raises: [CancelledError]).} BlockExcHandlers* = object onWantList*: WantListHandler @@ -51,6 +52,9 @@ type onPresence*: BlockPresenceHandler onAccount*: AccountHandler onPayment*: PaymentHandler + onPeerJoined*: PeerEventHandler + onPeerDeparted*: PeerEventHandler + onPeerDropped*: PeerEventHandler WantListSender* = proc( id: PeerId, @@ -240,84 +244,102 @@ proc handlePayment( await network.handlers.onPayment(peer.id, payment) proc rpcHandler( - b: BlockExcNetwork, peer: NetworkPeer, msg: Message + self: BlockExcNetwork, peer: NetworkPeer, msg: Message ) {.async: (raises: []).} = ## handle rpc messages ## if msg.wantList.entries.len > 0: - b.trackedFutures.track(b.handleWantList(peer, msg.wantList)) + self.trackedFutures.track(self.handleWantList(peer, msg.wantList)) if msg.payload.len > 0: - b.trackedFutures.track(b.handleBlocksDelivery(peer, msg.payload)) + self.trackedFutures.track(self.handleBlocksDelivery(peer, msg.payload)) if msg.blockPresences.len > 0: - b.trackedFutures.track(b.handleBlockPresence(peer, msg.blockPresences)) + self.trackedFutures.track(self.handleBlockPresence(peer, msg.blockPresences)) if account =? Account.init(msg.account): - b.trackedFutures.track(b.handleAccount(peer, account)) + self.trackedFutures.track(self.handleAccount(peer, account)) if payment =? 
SignedState.init(msg.payment): - b.trackedFutures.track(b.handlePayment(peer, payment)) + self.trackedFutures.track(self.handlePayment(peer, payment)) -proc getOrCreatePeer(b: BlockExcNetwork, peer: PeerId): NetworkPeer = +proc getOrCreatePeer(self: BlockExcNetwork, peer: PeerId): NetworkPeer = ## Creates or retrieves a BlockExcNetwork Peer ## - if peer in b.peers: - return b.peers.getOrDefault(peer, nil) + if peer in self.peers: + return self.peers.getOrDefault(peer, nil) var getConn: ConnProvider = proc(): Future[Connection] {. async: (raises: [CancelledError]) .} = try: trace "Getting new connection stream", peer - return await b.switch.dial(peer, Codec) + return await self.switch.dial(peer, Codec) except CancelledError as error: raise error except CatchableError as exc: trace "Unable to connect to blockexc peer", exc = exc.msg - if not isNil(b.getConn): - getConn = b.getConn + if not isNil(self.getConn): + getConn = self.getConn let rpcHandler = proc(p: NetworkPeer, msg: Message) {.async: (raises: []).} = - await b.rpcHandler(p, msg) + await self.rpcHandler(p, msg) # create new pubsub peer let blockExcPeer = NetworkPeer.new(peer, getConn, rpcHandler) debug "Created new blockexc peer", peer - b.peers[peer] = blockExcPeer + self.peers[peer] = blockExcPeer return blockExcPeer -proc setupPeer*(b: BlockExcNetwork, peer: PeerId) = - ## Perform initial setup, such as want - ## list exchange - ## - - discard b.getOrCreatePeer(peer) - -proc dialPeer*(b: BlockExcNetwork, peer: PeerRecord) {.async.} = +proc dialPeer*(self: BlockExcNetwork, peer: PeerRecord) {.async.} = ## Dial a peer ## - if b.isSelf(peer.peerId): + if self.isSelf(peer.peerId): trace "Skipping dialing self", peer = peer.peerId return - if peer.peerId in b.peers: + if peer.peerId in self.peers: trace "Already connected to peer", peer = peer.peerId return - await b.switch.connect(peer.peerId, peer.addresses.mapIt(it.address)) + await self.switch.connect(peer.peerId, peer.addresses.mapIt(it.address)) -proc dropPeer*(b: BlockExcNetwork, peer: PeerId) = +proc dropPeer*( + self: BlockExcNetwork, peer: PeerId +) {.async: (raises: [CancelledError]).} = + trace "Dropping peer", peer + + try: + if not self.switch.isNil: + await self.switch.disconnect(peer) + except CatchableError as error: + warn "Error attempting to disconnect from peer", peer = peer, error = error.msg + + if not self.handlers.onPeerDropped.isNil: + await self.handlers.onPeerDropped(peer) + +proc handlePeerJoined*( + self: BlockExcNetwork, peer: PeerId +) {.async: (raises: [CancelledError]).} = + discard self.getOrCreatePeer(peer) + if not self.handlers.onPeerJoined.isNil: + await self.handlers.onPeerJoined(peer) + +proc handlePeerDeparted*( + self: BlockExcNetwork, peer: PeerId +) {.async: (raises: [CancelledError]).} = ## Cleanup disconnected peer ## - trace "Dropping peer", peer - b.peers.del(peer) + trace "Cleaning up departed peer", peer + self.peers.del(peer) + if not self.handlers.onPeerDeparted.isNil: + await self.handlers.onPeerDeparted(peer) method init*(self: BlockExcNetwork) = ## Perform protocol initialization @@ -327,9 +349,11 @@ method init*(self: BlockExcNetwork) = peerId: PeerId, event: PeerEvent ): Future[void] {.gcsafe, async: (raises: [CancelledError]).} = if event.kind == PeerEventKind.Joined: - self.setupPeer(peerId) + await self.handlePeerJoined(peerId) + elif event.kind == PeerEventKind.Left: + await self.handlePeerDeparted(peerId) else: - self.dropPeer(peerId) + warn "Unknown peer event", event self.switch.addPeerEventHandler(peerEventHandler, 
PeerEventKind.Joined) self.switch.addPeerEventHandler(peerEventHandler, PeerEventKind.Left) diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index cca6d3876e..5d805023a8 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -89,9 +89,11 @@ proc blockRequested*(self: BlockExcPeerCtx, address: BlockAddress) = proc blockRequestCancelled*(self: BlockExcPeerCtx, address: BlockAddress) = self.blocksRequested.excl(address) -proc blockReceived*(self: BlockExcPeerCtx, address: BlockAddress) = +proc blockReceived*(self: BlockExcPeerCtx, address: BlockAddress): bool = + let wasRequested = address in self.blocksRequested self.blocksRequested.excl(address) self.lastExchange = Moment.now() + wasRequested proc activityTimer*( self: BlockExcPeerCtx diff --git a/tests/codex/blockexchange/engine/testengine.nim b/tests/codex/blockexchange/engine/testengine.nim index 6688b41712..7c3933c581 100644 --- a/tests/codex/blockexchange/engine/testengine.nim +++ b/tests/codex/blockexchange/engine/testengine.nim @@ -27,8 +27,6 @@ const NopSendWantCancellationsProc = proc( asyncchecksuite "NetworkStore engine basic": var - rng: Rng - seckey: PrivateKey peerId: PeerId chunker: Chunker wallet: WalletRef @@ -39,9 +37,7 @@ asyncchecksuite "NetworkStore engine basic": done: Future[void] setup: - rng = Rng.instance() - seckey = PrivateKey.random(rng[]).tryGet() - peerId = PeerId.init(seckey.getPublicKey().tryGet()).tryGet() + peerId = PeerId.example chunker = RandomChunker.new(Rng.instance(), size = 1024'nb, chunkSize = 256'nb) wallet = WalletRef.example blockDiscovery = Discovery.new() @@ -83,7 +79,7 @@ asyncchecksuite "NetworkStore engine basic": for b in blocks: discard engine.pendingBlocks.getWantHandle(b.cid) - await engine.setupPeer(peerId) + await engine.peerAddedHandler(peerId) await done.wait(100.millis) @@ -111,14 +107,12 @@ asyncchecksuite "NetworkStore engine basic": ) engine.pricing = pricing.some - await engine.setupPeer(peerId) + await engine.peerAddedHandler(peerId) await done.wait(100.millis) asyncchecksuite "NetworkStore engine handlers": var - rng: Rng - seckey: PrivateKey peerId: PeerId chunker: Chunker wallet: WalletRef @@ -134,8 +128,7 @@ asyncchecksuite "NetworkStore engine handlers": blocks: seq[Block] setup: - rng = Rng.instance() - chunker = RandomChunker.new(rng, size = 1024'nb, chunkSize = 256'nb) + chunker = RandomChunker.new(Rng.instance(), size = 1024'nb, chunkSize = 256'nb) while true: let chunk = await chunker.getBytes() @@ -144,8 +137,7 @@ asyncchecksuite "NetworkStore engine handlers": blocks.add(Block.new(chunk).tryGet()) - seckey = PrivateKey.random(rng[]).tryGet() - peerId = PeerId.init(seckey.getPublicKey().tryGet()).tryGet() + peerId = PeerId.example wallet = WalletRef.example blockDiscovery = Discovery.new() peerStore = PeerCtxStore.new() @@ -249,6 +241,9 @@ asyncchecksuite "NetworkStore engine handlers": test "Should store blocks in local store": let pending = blocks.mapIt(engine.pendingBlocks.getWantHandle(it.cid)) + for blk in blocks: + peerCtx.blockRequested(blk.address) + let blocksDelivery = blocks.mapIt(BlockDelivery(blk: it, address: it.address)) # Install NOP for want list cancellations so they don't cause a crash @@ -274,6 +269,9 @@ asyncchecksuite "NetworkStore engine handlers": (it.address, Presence(address: it.address, price: rand(uint16).u256, have: true)) ).toTable + for blk in blocks: + peerContext.blockRequested(blk.address) + engine.network = BlockExcNetwork( request: 
BlockExcRequest( sendPayment: proc( @@ -337,33 +335,44 @@ asyncchecksuite "NetworkStore engine handlers": check a in peerCtx.peerHave check peerCtx.blocks[a].price == price - test "Should send cancellations for received blocks": + test "Should send cancellations for requested blocks only": let - pending = blocks.mapIt(engine.pendingBlocks.getWantHandle(it.cid)) - blocksDelivery = blocks.mapIt(BlockDelivery(blk: it, address: it.address)) - cancellations = newTable(blocks.mapIt((it.address, newFuture[void]())).toSeq) + pendingPeer = peerId # peer towards which we have pending block requests + pendingPeerCtx = peerCtx + senderPeer = PeerId.example # peer that will actually send the blocks + senderPeerCtx = BlockExcPeerCtx(id: senderPeer) + reqBlocks = @[blocks[0], blocks[4]] # blocks that we requested to pendingPeer + reqBlockAddrs = reqBlocks.mapIt(it.address) + blockHandles = blocks.mapIt(engine.pendingBlocks.getWantHandle(it.cid)) - peerCtx.blocks = blocks.mapIt( - (it.address, Presence(address: it.address, have: true, price: UInt256.example)) - ).toTable + var cancelled: HashSet[BlockAddress] + + engine.peers.add(senderPeerCtx) + for address in reqBlockAddrs: + pendingPeerCtx.blockRequested(address) + + for address in blocks.mapIt(it.address): + senderPeerCtx.blockRequested(address) proc sendWantCancellations( id: PeerId, addresses: seq[BlockAddress] ) {.async: (raises: [CancelledError]).} = + assert id == pendingPeer for address in addresses: - cancellations[address].catch.expect("address should exist").complete() + cancelled.incl(address) engine.network = BlockExcNetwork( request: BlockExcRequest(sendWantCancellations: sendWantCancellations) ) - await engine.blocksDeliveryHandler(peerId, blocksDelivery) - discard await allFinished(pending).wait(100.millis) - await allFuturesThrowing(cancellations.values().toSeq) + let blocksDelivery = blocks.mapIt(BlockDelivery(blk: it, address: it.address)) + await engine.blocksDeliveryHandler(senderPeer, blocksDelivery) + discard await allFinished(blockHandles).wait(100.millis) + + check cancelled == reqBlockAddrs.toHashSet() asyncchecksuite "Block Download": var - rng: Rng seckey: PrivateKey peerId: PeerId chunker: Chunker @@ -380,8 +389,7 @@ asyncchecksuite "Block Download": blocks: seq[Block] setup: - rng = Rng.instance() - chunker = RandomChunker.new(rng, size = 1024'nb, chunkSize = 256'nb) + chunker = RandomChunker.new(Rng.instance(), size = 1024'nb, chunkSize = 256'nb) while true: let chunk = await chunker.getBytes() @@ -390,8 +398,7 @@ asyncchecksuite "Block Download": blocks.add(Block.new(chunk).tryGet()) - seckey = PrivateKey.random(rng[]).tryGet() - peerId = PeerId.init(seckey.getPublicKey().tryGet()).tryGet() + peerId = PeerId.example wallet = WalletRef.example blockDiscovery = Discovery.new() peerStore = PeerCtxStore.new() @@ -409,13 +416,27 @@ asyncchecksuite "Block Download": localStore, wallet, network, discovery, advertiser, peerStore, pendingBlocks ) - peerCtx = BlockExcPeerCtx(id: peerId) + peerCtx = BlockExcPeerCtx(id: peerId, activityTimeout: 100.milliseconds) engine.peers.add(peerCtx) - test "Should exhaust retries": + test "Should reschedule blocks on peer timeout": + let + slowPeer = peerId + fastPeer = PeerId.example + slowPeerCtx = peerCtx + # "Fast" peer has in fact a generous timeout. This should avoid timing issues + # in the test. 
+ fastPeerCtx = BlockExcPeerCtx(id: fastPeer, activityTimeout: 60.seconds) + requestedBlock = blocks[0] + var - retries = 2 - address = BlockAddress.init(blocks[0].cid) + slowPeerWantList = newFuture[void]("slowPeerWantList") + fastPeerWantList = newFuture[void]("fastPeerWantList") + slowPeerDropped = newFuture[void]("slowPeerDropped") + slowPeerBlockRequest = newFuture[void]("slowPeerBlockRequest") + fastPeerBlockRequest = newFuture[void]("fastPeerBlockRequest") + + engine.peers.add(fastPeerCtx) proc sendWantList( id: PeerId, @@ -426,68 +447,63 @@ asyncchecksuite "Block Download": full: bool = false, sendDontHave: bool = false, ) {.async: (raises: [CancelledError]).} = - check wantType == WantHave - check not engine.pendingBlocks.isInFlight(address) - check engine.pendingBlocks.retries(address) == retries - retries -= 1 + check addresses == @[requestedBlock.address] - engine.pendingBlocks.blockRetries = 2 - engine.pendingBlocks.retryInterval = 10.millis - engine.network = - BlockExcNetwork(request: BlockExcRequest(sendWantList: sendWantList)) + if wantType == WantBlock: + if id == slowPeer: + slowPeerBlockRequest.complete() + else: + fastPeerBlockRequest.complete() - let pending = engine.requestBlock(address) + if wantType == WantHave: + if id == slowPeer: + slowPeerWantList.complete() + else: + fastPeerWantList.complete() - expect RetriesExhaustedError: - discard (await pending).tryGet() + proc onPeerDropped( + peer: PeerId + ): Future[void] {.async: (raises: [CancelledError]).} = + assert peer == slowPeer + slowPeerDropped.complete() - test "Should retry block request": - var - address = BlockAddress.init(blocks[0].cid) - steps = newAsyncEvent() + proc selectPeer(peers: seq[BlockExcPeerCtx]): BlockExcPeerCtx = + # Looks for the slow peer. + for peer in peers: + if peer.id == slowPeer: + return peer - proc sendWantList( - id: PeerId, - addresses: seq[BlockAddress], - priority: int32 = 0, - cancel: bool = false, - wantType: WantType = WantType.WantHave, - full: bool = false, - sendDontHave: bool = false, - ) {.async: (raises: [CancelledError]).} = - case wantType - of WantHave: - check engine.pendingBlocks.isInFlight(address) == false - check engine.pendingBlocks.retriesExhausted(address) == false - steps.fire() - of WantBlock: - check engine.pendingBlocks.isInFlight(address) == true - check engine.pendingBlocks.retriesExhausted(address) == false - steps.fire() + return peers[0] - engine.pendingBlocks.blockRetries = 10 - engine.pendingBlocks.retryInterval = 10.millis - engine.network = BlockExcNetwork( - request: BlockExcRequest( - sendWantList: sendWantList, sendWantCancellations: NopSendWantCancellationsProc - ) - ) + engine.selectPeer = selectPeer + engine.pendingBlocks.retryInterval = 200.milliseconds + engine.network = + BlockExcNetwork(request: BlockExcRequest(sendWantList: sendWantList)) + engine.network.handlers.onPeerDropped = onPeerDropped - let pending = engine.requestBlock(address) - await steps.wait() + let blockHandle = engine.requestBlock(requestedBlock.address) - # add blocks precense - peerCtx.blocks = blocks.mapIt( - (it.address, Presence(address: it.address, have: true, price: UInt256.example)) - ).toTable + # Waits for the peer to send its want list to both peers. 
+ await slowPeerWantList.wait(5.seconds) + await fastPeerWantList.wait(5.seconds) + + let blockPresence = + @[BlockPresence(address: requestedBlock.address, type: BlockPresenceType.Have)] - steps.clear() - await steps.wait() + await engine.blockPresenceHandler(slowPeer, blockPresence) + await engine.blockPresenceHandler(fastPeer, blockPresence) + # Waits for the peer to ask for the block. + await slowPeerBlockRequest.wait(5.seconds) + # Don't reply and wait for the peer to be dropped by timeout. + await slowPeerDropped.wait(5.seconds) + # The engine should retry and ask the fast peer for the block. + await fastPeerBlockRequest.wait(5.seconds) await engine.blocksDeliveryHandler( - peerId, @[BlockDelivery(blk: blocks[0], address: address)] + fastPeer, @[BlockDelivery(blk: requestedBlock, address: requestedBlock.address)] ) - check (await pending).tryGet() == blocks[0] + + discard await blockHandle.wait(5.seconds) test "Should cancel block request": var @@ -520,15 +536,8 @@ asyncchecksuite "Block Download": expect CancelledError: discard (await pending).tryGet() - # test "Should not keep looking up providers for the same dataset repeatedly": - # let - # blocks = await makeRandomBlocks(datasetSize = 4096, blockSize = 128'nb) - # manifest = await storeDataGetManifest(store, blocks) - asyncchecksuite "Task Handler": var - rng: Rng - seckey: PrivateKey peerId: PeerId chunker: Chunker wallet: WalletRef @@ -546,8 +555,7 @@ asyncchecksuite "Task Handler": blocks: seq[Block] setup: - rng = Rng.instance() - chunker = RandomChunker.new(rng, size = 1024, chunkSize = 256'nb) + chunker = RandomChunker.new(Rng.instance(), size = 1024, chunkSize = 256'nb) while true: let chunk = await chunker.getBytes() if chunk.len <= 0: @@ -555,8 +563,7 @@ asyncchecksuite "Task Handler": blocks.add(Block.new(chunk).tryGet()) - seckey = PrivateKey.random(rng[]).tryGet() - peerId = PeerId.init(seckey.getPublicKey().tryGet()).tryGet() + peerId = PeerId.example wallet = WalletRef.example blockDiscovery = Discovery.new() peerStore = PeerCtxStore.new() @@ -576,9 +583,7 @@ asyncchecksuite "Task Handler": peersCtx = @[] for i in 0 .. 
3: - let seckey = PrivateKey.random(rng[]).tryGet() - peers.add(PeerId.init(seckey.getPublicKey().tryGet()).tryGet()) - + peers.add(PeerId.example) peersCtx.add(BlockExcPeerCtx(id: peers[i])) peerStore.add(peersCtx[i]) @@ -625,12 +630,12 @@ asyncchecksuite "Task Handler": # await engine.taskHandler(peersCtx[0]) - test "Should set in-flight for outgoing blocks": + test "Should mark outgoing blocks as sent": proc sendBlocksDelivery( id: PeerId, blocksDelivery: seq[BlockDelivery] ) {.async: (raises: [CancelledError]).} = let blockAddress = peersCtx[0].wantedBlocks.toSeq[0] - check peersCtx[0].isInFlight(blockAddress) + check peersCtx[0].isBlockSent(blockAddress) for blk in blocks: (await engine.localStore.putBlock(blk)).tryGet() @@ -640,10 +645,10 @@ asyncchecksuite "Task Handler": await engine.taskHandler(peersCtx[0]) - test "Should clear in-flight when local lookup fails": + test "Should not mark blocks for which local look fails as sent": peersCtx[0].wantedBlocks.incl(blocks[0].address) await engine.taskHandler(peersCtx[0]) let blockAddress = peersCtx[0].wantedBlocks.toSeq[0] - check not peersCtx[0].isInFlight(blockAddress) + check not peersCtx[0].isBlockSent(blockAddress) From 572b44856bd68418c37d2e0d028cf57656e79fa2 Mon Sep 17 00:00:00 2001 From: gmega Date: Thu, 3 Jul 2025 10:18:32 -0300 Subject: [PATCH 16/20] fix: fix block exchange test to stricter protocol; minor refactor --- codex/blockexchange/engine/engine.nim | 6 ++++-- codex/blockexchange/peers/peercontext.nim | 7 +++++-- tests/codex/blockexchange/engine/testblockexc.nim | 2 ++ tests/codex/blockexchange/engine/testengine.nim | 8 ++++---- tests/codex/blockexchange/testnetwork.nim | 2 +- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 453dea4f87..ee706e3c9d 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -279,7 +279,7 @@ proc downloadInternal( if not self.pendingBlocks.isRequested(address): let peer = self.selectPeer(peers.with) self.pendingBlocks.markRequested(address, peer.id) - peer.blockRequested(address) + peer.blockRequestScheduled(address) trace "Request block from block retry loop" await self.sendWantBlock(@[address], peer) peer @@ -412,7 +412,7 @@ proc blockPresenceHandler*( for address in ourWantCids: self.pendingBlocks.decRetries(address) self.pendingBlocks.markRequested(address, peer) - peerCtx.blockRequested(address) + peerCtx.blockRequestScheduled(address) if ourWantCids.len > 0: trace "Peer has blocks in our wantList", peer, wants = ourWantCids @@ -773,6 +773,8 @@ proc taskHandler*( wantedBlocks = peerCtx.wantedBlocks.filterIt(not peerCtx.isBlockSent(it)) sent: HashSet[BlockAddress] + trace "Running task for peer", peer = peerCtx.id + for wantedBlock in wantedBlocks: peerCtx.markBlockAsSent(wantedBlock) diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index 5d805023a8..c917b7ee30 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -80,13 +80,16 @@ func price*(self: BlockExcPeerCtx, addresses: seq[BlockAddress]): UInt256 = price -proc blockRequested*(self: BlockExcPeerCtx, address: BlockAddress) = - # We start counting the timeout from the first block requested. +proc blockRequestScheduled*(self: BlockExcPeerCtx, address: BlockAddress) = + ## Adds a block the set of blocks that have been requested to this peer + ## (its request schedule). 
if self.blocksRequested.len == 0: self.lastExchange = Moment.now() self.blocksRequested.incl(address) proc blockRequestCancelled*(self: BlockExcPeerCtx, address: BlockAddress) = + ## Removes a block from the set of blocks that have been requested to this peer + ## (its request schedule). self.blocksRequested.excl(address) proc blockReceived*(self: BlockExcPeerCtx, address: BlockAddress): bool = diff --git a/tests/codex/blockexchange/engine/testblockexc.nim b/tests/codex/blockexchange/engine/testblockexc.nim index 108ca539d2..e4bcf764cd 100644 --- a/tests/codex/blockexchange/engine/testblockexc.nim +++ b/tests/codex/blockexchange/engine/testblockexc.nim @@ -96,6 +96,8 @@ asyncchecksuite "NetworkStore engine - 2 nodes": test "Should send want-have for block": let blk = bt.Block.new("Block 1".toBytes).tryGet() let blkFut = nodeCmps1.pendingBlocks.getWantHandle(blk.cid) + peerCtx2.blockRequestScheduled(blk.address) + (await nodeCmps2.localStore.putBlock(blk)).tryGet() peerCtx1.wantedBlocks.incl(blk.address) diff --git a/tests/codex/blockexchange/engine/testengine.nim b/tests/codex/blockexchange/engine/testengine.nim index 7c3933c581..1afe214738 100644 --- a/tests/codex/blockexchange/engine/testengine.nim +++ b/tests/codex/blockexchange/engine/testengine.nim @@ -242,7 +242,7 @@ asyncchecksuite "NetworkStore engine handlers": let pending = blocks.mapIt(engine.pendingBlocks.getWantHandle(it.cid)) for blk in blocks: - peerCtx.blockRequested(blk.address) + peerCtx.blockRequestScheduled(blk.address) let blocksDelivery = blocks.mapIt(BlockDelivery(blk: it, address: it.address)) @@ -270,7 +270,7 @@ asyncchecksuite "NetworkStore engine handlers": ).toTable for blk in blocks: - peerContext.blockRequested(blk.address) + peerContext.blockRequestScheduled(blk.address) engine.network = BlockExcNetwork( request: BlockExcRequest( @@ -349,10 +349,10 @@ asyncchecksuite "NetworkStore engine handlers": engine.peers.add(senderPeerCtx) for address in reqBlockAddrs: - pendingPeerCtx.blockRequested(address) + pendingPeerCtx.blockRequestScheduled(address) for address in blocks.mapIt(it.address): - senderPeerCtx.blockRequested(address) + senderPeerCtx.blockRequestScheduled(address) proc sendWantCancellations( id: PeerId, addresses: seq[BlockAddress] diff --git a/tests/codex/blockexchange/testnetwork.nim b/tests/codex/blockexchange/testnetwork.nim index b9a51c9d62..ab19a6ae2c 100644 --- a/tests/codex/blockexchange/testnetwork.nim +++ b/tests/codex/blockexchange/testnetwork.nim @@ -40,7 +40,7 @@ asyncchecksuite "Network - Handlers": done = newFuture[void]() buffer = BufferStream.new() network = BlockExcNetwork.new(switch = newStandardSwitch(), connProvider = getConn) - network.setupPeer(peerId) + await network.handlePeerJoined(peerId) networkPeer = network.peers[peerId] discard await networkPeer.connect() From b274b29b834a6c58f2855b588d0d950c21822baa Mon Sep 17 00:00:00 2001 From: gmega Date: Thu, 3 Jul 2025 12:06:02 -0300 Subject: [PATCH 17/20] fix: fix testdiscovery so it works with stricter block protocol --- codex/blockexchange/engine/engine.nim | 7 +++++-- tests/codex/blockexchange/discovery/testdiscovery.nim | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index ee706e3c9d..efe6bcc442 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -555,7 +555,10 @@ proc validateBlockDelivery(self: BlockExcEngine, bd: BlockDelivery): ?!void = return success() proc 
blocksDeliveryHandler*( - self: BlockExcEngine, peer: PeerId, blocksDelivery: seq[BlockDelivery] + self: BlockExcEngine, + peer: PeerId, + blocksDelivery: seq[BlockDelivery], + allowSpurious: bool = false, ) {.async: (raises: []).} = trace "Received blocks from peer", peer, blocks = (blocksDelivery.mapIt(it.address)) @@ -569,7 +572,7 @@ proc blocksDeliveryHandler*( try: # Unknown peers and unrequested blocks are dropped with a warning. - if peerCtx == nil or not peerCtx.blockReceived(bd.address): + if not allowSpurious and (peerCtx == nil or not peerCtx.blockReceived(bd.address)): warn "Dropping unrequested or duplicate block received from peer" codex_block_exchange_spurious_blocks_received.inc() continue diff --git a/tests/codex/blockexchange/discovery/testdiscovery.nim b/tests/codex/blockexchange/discovery/testdiscovery.nim index c54a1ffffe..a8d678a033 100644 --- a/tests/codex/blockexchange/discovery/testdiscovery.nim +++ b/tests/codex/blockexchange/discovery/testdiscovery.nim @@ -216,7 +216,6 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": test "E2E - Should advertise and discover blocks": # Distribute the manifests and trees amongst 1..3 # Ask 0 to download everything without connecting him beforehand - var advertised: Table[Cid, SignedPeerRecord] MockDiscovery(blockexc[1].engine.discovery.discovery).publishBlockProvideHandler = proc( @@ -242,6 +241,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[0], address: BlockAddress(leaf: false, cid: mBlocks[0].cid) ) ], + allowSpurious = true, ) discard blockexc[2].engine.pendingBlocks.getWantHandle(mBlocks[1].cid) @@ -252,6 +252,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[1], address: BlockAddress(leaf: false, cid: mBlocks[1].cid) ) ], + allowSpurious = true, ) discard blockexc[3].engine.pendingBlocks.getWantHandle(mBlocks[2].cid) @@ -262,6 +263,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[2], address: BlockAddress(leaf: false, cid: mBlocks[2].cid) ) ], + allowSpurious = true, ) MockDiscovery(blockexc[0].engine.discovery.discovery).findBlockProvidersHandler = proc( @@ -311,6 +313,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[0], address: BlockAddress(leaf: false, cid: mBlocks[0].cid) ) ], + allowSpurious = true, ) discard blockexc[2].engine.pendingBlocks.getWantHandle(mBlocks[1].cid) @@ -321,6 +324,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[1], address: BlockAddress(leaf: false, cid: mBlocks[1].cid) ) ], + allowSpurious = true, ) discard blockexc[3].engine.pendingBlocks.getWantHandle(mBlocks[2].cid) @@ -331,6 +335,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": blk: mBlocks[2], address: BlockAddress(leaf: false, cid: mBlocks[2].cid) ) ], + allowSpurious = true, ) MockDiscovery(blockexc[0].engine.discovery.discovery).findBlockProvidersHandler = proc( From d7c403edfea48a6ef119041f73f0a27200a4219b Mon Sep 17 00:00:00 2001 From: gmega Date: Thu, 3 Jul 2025 18:43:11 -0300 Subject: [PATCH 18/20] feat: add stopgap "adaptive" refresh --- codex/blockexchange/engine/engine.nim | 25 ++++++++++++++--------- codex/blockexchange/peers/peercontext.nim | 23 +++++++++++++++++++-- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index efe6bcc442..d6e641e012 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -175,9 +175,14 @@ proc refreshBlockKnowledge( self: BlockExcEngine, peer: BlockExcPeerCtx ) 
{.async: (raises: [CancelledError]).} = if self.pendingBlocks.wantListLen > 0: - let cids = toSeq(self.pendingBlocks.wantList) - trace "Sending our want list to a peer", peer = peer.id, length = cids.len - await self.network.request.sendWantList(peer.id, cids, full = true) + # We send only blocks that the peer hasn't already told us that they already have. + let + peerHave = peer.peerHave + toAsk = self.pendingBlocks.wantList.toSeq.filterIt(it notin peerHave) + + if toAsk.len > 0: + trace "Sending want list to a peer", peer = peer.id, length = toAsk.len + await self.network.request.sendWantList(peer.id, toAsk, full = true) proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledError]).} = for peer in self.peers.peers.values.toSeq: @@ -189,15 +194,13 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr # want list in the coarsest way possible instead of over many # small updates. # + if peer.refreshInProgress: + trace "Peer refresh in progress", peer = peer.id + continue + # In dynamic swarms, staleness will dominate latency. if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale: - # FIXME: we update the lastRefresh before actually refreshing because otherwise - # a slow peer will be bombarded with requests. If the request does fail or the - # peer does not reply, a retrying block will eventually issue this again. This - # is a complex and convoluted flow - ideally we should simply be tracking this - # request and retrying it on the absence of a response, eventually disconnecting - # the peer if it consistently fails to respond. - peer.refreshed() + peer.refreshRequested() # TODO: optimize this by keeping track of what was sent and sending deltas. # This should allow us to run much more frequent refreshes, and be way more # efficient about it. @@ -393,6 +396,8 @@ proc blockPresenceHandler*( if peerCtx.isNil: return + peerCtx.refreshReplied() + for blk in blocks: if presence =? 
Presence.init(blk): peerCtx.setPresence(presence) diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index c917b7ee30..0e0d1060c4 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -25,12 +25,18 @@ import ../../logutils export payments, nitro +const + MinRefreshInterval = 5.seconds + MaxRefreshBackoff = 36 # 3 minutes + type BlockExcPeerCtx* = ref object of RootObj id*: PeerId blocks*: Table[BlockAddress, Presence] # remote peer have list including price wantedBlocks*: HashSet[BlockAddress] # blocks that the peer wants exchanged*: int # times peer has exchanged with us + refreshInProgress*: bool # indicates if a refresh is in progress lastRefresh*: Moment # last time we refreshed our knowledge of the blocks this peer has + refreshBackoff*: int = 1 # backoff factor for refresh requests account*: ?Account # ethereum account of this peer paymentChannel*: ?ChannelId # payment channel id blocksSent*: HashSet[BlockAddress] # blocks sent to peer @@ -39,7 +45,7 @@ type BlockExcPeerCtx* = ref object of RootObj activityTimeout*: Duration proc isKnowledgeStale*(self: BlockExcPeerCtx): bool = - self.lastRefresh + 5.minutes < Moment.now() + self.lastRefresh + self.refreshBackoff * MinRefreshInterval < Moment.now() proc isBlockSent*(self: BlockExcPeerCtx, address: BlockAddress): bool = address in self.blocksSent @@ -50,8 +56,18 @@ proc markBlockAsSent*(self: BlockExcPeerCtx, address: BlockAddress) = proc markBlockAsNotSent*(self: BlockExcPeerCtx, address: BlockAddress) = self.blocksSent.excl(address) -proc refreshed*(self: BlockExcPeerCtx) = +proc refreshRequested*(self: BlockExcPeerCtx) = + trace "Refresh requested for peer", peer = self.id, backoff = self.refreshBackoff + self.refreshInProgress = true + self.lastRefresh = Moment.now() + +proc refreshReplied*(self: BlockExcPeerCtx) = + self.refreshInProgress = false self.lastRefresh = Moment.now() + self.refreshBackoff = min(self.refreshBackoff * 2, MaxRefreshBackoff) + +proc havesUpdated(self: BlockExcPeerCtx) = + self.refreshBackoff = 1 proc peerHave*(self: BlockExcPeerCtx): HashSet[BlockAddress] = # XXX: this is ugly an inefficient, but since those will typically @@ -63,6 +79,9 @@ proc contains*(self: BlockExcPeerCtx, address: BlockAddress): bool = address in self.blocks func setPresence*(self: BlockExcPeerCtx, presence: Presence) = + if presence.address notin self.blocks: + self.havesUpdated() + self.blocks[presence.address] = presence func cleanPresence*(self: BlockExcPeerCtx, addresses: seq[BlockAddress]) = From 2ada4b5eb35c141b2eb1a6558d46f71876ee8ada Mon Sep 17 00:00:00 2001 From: gmega Date: Tue, 8 Jul 2025 15:23:54 -0300 Subject: [PATCH 19/20] feat: add block knowledge request mechanism, implement tests --- codex/blockexchange/engine/engine.nim | 9 +- codex/blockexchange/engine/pendingblocks.nim | 2 +- codex/blockexchange/network/network.nim | 2 +- codex/blockexchange/peers/peercontext.nim | 2 +- codex/stores/cachestore.nim | 15 +++ .../blockexchange/discovery/testdiscovery.nim | 4 +- .../discovery/testdiscoveryengine.nim | 2 +- .../blockexchange/engine/testblockexc.nim | 49 +++++--- tests/codex/helpers.nim | 38 +----- tests/codex/helpers/datasetutils.nim | 35 ++++++ tests/codex/helpers/mockdiscovery.nim | 28 +++++ tests/codex/helpers/nodeutils.nim | 117 ++++++++++++++++-- tests/codex/node/testnode.nim | 4 +- tests/codex/stores/commonstoretests.nim | 4 +- tests/codex/stores/testrepostore.nim | 30 ++--- tests/helpers.nim | 6 +- 16 files 
changed, 255 insertions(+), 92 deletions(-) create mode 100644 tests/codex/helpers/datasetutils.nim diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index d6e641e012..3511f9124d 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -121,7 +121,6 @@ proc blockexcTaskRunner(self: BlockExcEngine) {.async: (raises: []).} proc start*(self: BlockExcEngine) {.async: (raises: []).} = ## Start the blockexc task ## - await self.discovery.start() await self.advertiser.start() @@ -200,11 +199,14 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr # In dynamic swarms, staleness will dominate latency. if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale: + trace "Refreshing block knowledge for peer", peer = peer.id peer.refreshRequested() # TODO: optimize this by keeping track of what was sent and sending deltas. # This should allow us to run much more frequent refreshes, and be way more # efficient about it. await self.refreshBlockKnowledge(peer) + else: + trace "Not refreshing: peer is up to date", peer = peer.id proc searchForNewPeers(self: BlockExcEngine, cid: Cid) = if self.lastDiscRequest + DiscoveryRateLimit < Moment.now(): @@ -336,10 +338,11 @@ proc requestBlocks*( for address in addresses: self.trackedFutures.track(self.downloadInternal(address)) - var completed: int = 0 + let totalHandles = handles.len + var completed = 0 proc isFinished(): bool = - completed == handles.len + completed == totalHandles proc genNext(): Future[?!Block] {.async: (raises: [CancelledError]).} = # Be it success or failure, we're completing this future. diff --git a/codex/blockexchange/engine/pendingblocks.nim b/codex/blockexchange/engine/pendingblocks.nim index 5579a1faca..48233bbab3 100644 --- a/codex/blockexchange/engine/pendingblocks.nim +++ b/codex/blockexchange/engine/pendingblocks.nim @@ -34,7 +34,7 @@ declareGauge( const DefaultBlockRetries* = 3000 - DefaultRetryInterval* = 5.seconds + DefaultRetryInterval* = 1.seconds type RetriesExhaustedError* = object of CatchableError diff --git a/codex/blockexchange/network/network.nim b/codex/blockexchange/network/network.nim index fb6ff2d51b..f503a15c42 100644 --- a/codex/blockexchange/network/network.nim +++ b/codex/blockexchange/network/network.nim @@ -341,7 +341,7 @@ proc handlePeerDeparted*( if not self.handlers.onPeerDeparted.isNil: await self.handlers.onPeerDeparted(peer) -method init*(self: BlockExcNetwork) = +method init*(self: BlockExcNetwork) {.raises: [].} = ## Perform protocol initialization ## diff --git a/codex/blockexchange/peers/peercontext.nim b/codex/blockexchange/peers/peercontext.nim index 0e0d1060c4..f6eeea4677 100644 --- a/codex/blockexchange/peers/peercontext.nim +++ b/codex/blockexchange/peers/peercontext.nim @@ -26,7 +26,7 @@ import ../../logutils export payments, nitro const - MinRefreshInterval = 5.seconds + MinRefreshInterval = 1.seconds MaxRefreshBackoff = 36 # 3 minutes type BlockExcPeerCtx* = ref object of RootObj diff --git a/codex/stores/cachestore.nim b/codex/stores/cachestore.nim index 54bed1b8c7..6f2c6f105d 100644 --- a/codex/stores/cachestore.nim +++ b/codex/stores/cachestore.nim @@ -66,6 +66,21 @@ method getBlock*( trace "Error requesting block from cache", cid, error = exc.msg return failure exc +method getBlocks*( + self: CacheStore, addresses: seq[BlockAddress] +): Future[SafeAsyncIter[Block]] {.async: (raises: [CancelledError]).} = + var i = 0 + + proc isFinished(): bool = + i == 
addresses.len + + proc genNext(): Future[?!Block] {.async: (raises: [CancelledError]).} = + let value = await self.getBlock(addresses[i]) + inc(i) + return value + + return SafeAsyncIter[Block].new(genNext, isFinished) + method getCidAndProof*( self: CacheStore, treeCid: Cid, index: Natural ): Future[?!(Cid, CodexProof)] {.async: (raises: [CancelledError]).} = diff --git a/tests/codex/blockexchange/discovery/testdiscovery.nim b/tests/codex/blockexchange/discovery/testdiscovery.nim index a8d678a033..6b36a63044 100644 --- a/tests/codex/blockexchange/discovery/testdiscovery.nim +++ b/tests/codex/blockexchange/discovery/testdiscovery.nim @@ -54,7 +54,7 @@ asyncchecksuite "Block Advertising and Discovery": peerStore = PeerCtxStore.new() pendingBlocks = PendingBlocksManager.new() - (manifest, tree) = makeManifestAndTree(blocks).tryGet() + (_, tree, manifest) = makeDataset(blocks).tryGet() manifestBlock = bt.Block.new(manifest.encode().tryGet(), codec = ManifestCodec).tryGet() @@ -172,7 +172,7 @@ asyncchecksuite "E2E - Multiple Nodes Discovery": break blocks.add(bt.Block.new(chunk).tryGet()) - let (manifest, tree) = makeManifestAndTree(blocks).tryGet() + let (_, tree, manifest) = makeDataset(blocks).tryGet() manifests.add(manifest) mBlocks.add(manifest.asBlock()) trees.add(tree) diff --git a/tests/codex/blockexchange/discovery/testdiscoveryengine.nim b/tests/codex/blockexchange/discovery/testdiscoveryengine.nim index 9efab1a648..df3f8c80e7 100644 --- a/tests/codex/blockexchange/discovery/testdiscoveryengine.nim +++ b/tests/codex/blockexchange/discovery/testdiscoveryengine.nim @@ -43,7 +43,7 @@ asyncchecksuite "Test Discovery Engine": blocks.add(bt.Block.new(chunk).tryGet()) - (manifest, tree) = makeManifestAndTree(blocks).tryGet() + (_, tree, manifest) = makeDataset(blocks).tryGet() manifestBlock = manifest.asBlock() blocks.add(manifestBlock) diff --git a/tests/codex/blockexchange/engine/testblockexc.nim b/tests/codex/blockexchange/engine/testblockexc.nim index e4bcf764cd..68aec5fee9 100644 --- a/tests/codex/blockexchange/engine/testblockexc.nim +++ b/tests/codex/blockexchange/engine/testblockexc.nim @@ -24,19 +24,12 @@ asyncchecksuite "NetworkStore engine - 2 nodes": pendingBlocks1, pendingBlocks2: seq[BlockHandle] setup: - blocks1 = await makeRandomBlocks(datasetSize = 2048, blockSize = 256'nb) - blocks2 = await makeRandomBlocks(datasetSize = 2048, blockSize = 256'nb) + blocks1 = makeRandomBlocks(nBlocks = 8, blockSize = 256'nb) + blocks2 = makeRandomBlocks(nBlocks = 8, blockSize = 256'nb) nodeCmps1 = generateNodes(1, blocks1)[0] nodeCmps2 = generateNodes(1, blocks2)[0] - await allFuturesThrowing( - nodeCmps1.switch.start(), - nodeCmps1.blockDiscovery.start(), - nodeCmps1.engine.start(), - nodeCmps2.switch.start(), - nodeCmps2.blockDiscovery.start(), - nodeCmps2.engine.start(), - ) + await allFuturesThrowing(nodeCmps1.start(), nodeCmps2.start()) # initialize our want lists pendingBlocks1 = @@ -65,14 +58,7 @@ asyncchecksuite "NetworkStore engine - 2 nodes": check isNil(peerCtx2).not teardown: - await allFuturesThrowing( - nodeCmps1.blockDiscovery.stop(), - nodeCmps1.engine.stop(), - nodeCmps1.switch.stop(), - nodeCmps2.blockDiscovery.stop(), - nodeCmps2.engine.stop(), - nodeCmps2.switch.stop(), - ) + await allFuturesThrowing(nodeCmps1.stop(), nodeCmps2.stop()) test "Should exchange blocks on connect": await allFuturesThrowing(allFinished(pendingBlocks1)).wait(10.seconds) @@ -145,7 +131,7 @@ asyncchecksuite "NetworkStore - multiple nodes": blocks: seq[bt.Block] setup: - blocks = await 
makeRandomBlocks(datasetSize = 4096, blockSize = 256'nb) + blocks = makeRandomBlocks(nBlocks = 16, blockSize = 256'nb) nodes = generateNodes(5) for e in nodes: await e.engine.start() @@ -203,3 +189,28 @@ asyncchecksuite "NetworkStore - multiple nodes": check pendingBlocks1.mapIt(it.read) == blocks[0 .. 3] check pendingBlocks2.mapIt(it.read) == blocks[12 .. 15] + +asyncchecksuite "NetworkStore - dissemination": + var nodes: seq[NodesComponents] + + teardown: + if nodes.len > 0: + await nodes.stop() + + test "Should disseminate blocks across large diameter swarm": + let dataset = makeRandomDataset(nBlocks = 60, blockSize = 256'nb).tryGet() + + nodes = generateNodes(6, enableDiscovery = false) + + await assignBlocks(nodes[0], dataset, 0 .. 9) + await assignBlocks(nodes[1], dataset, 10 .. 19) + await assignBlocks(nodes[2], dataset, 20 .. 29) + await assignBlocks(nodes[3], dataset, 30 .. 39) + await assignBlocks(nodes[4], dataset, 40 .. 49) + await assignBlocks(nodes[5], dataset, 50 .. 59) + + await nodes.start() + await nodes.linearTopology() + + let downloads = nodes.mapIt(downloadDataset(it, dataset)) + await allFuturesThrowing(downloads).wait(20.seconds) diff --git a/tests/codex/helpers.nim b/tests/codex/helpers.nim index 898dd16e10..b855f412d9 100644 --- a/tests/codex/helpers.nim +++ b/tests/codex/helpers.nim @@ -12,13 +12,16 @@ import pkg/codex/rng import pkg/codex/utils import ./helpers/nodeutils +import ./helpers/datasetutils import ./helpers/randomchunker import ./helpers/mockchunker import ./helpers/mockdiscovery import ./helpers/always import ../checktest -export randomchunker, nodeutils, mockdiscovery, mockchunker, always, checktest, manifest +export + randomchunker, nodeutils, datasetutils, mockdiscovery, mockchunker, always, checktest, + manifest export libp2p except setup, eventually @@ -46,23 +49,6 @@ proc lenPrefix*(msg: openArray[byte]): seq[byte] = return buf -proc makeManifestAndTree*(blocks: seq[Block]): ?!(Manifest, CodexTree) = - if blocks.len == 0: - return failure("Blocks list was empty") - - let - datasetSize = blocks.mapIt(it.data.len).foldl(a + b) - blockSize = blocks.mapIt(it.data.len).foldl(max(a, b)) - tree = ?CodexTree.init(blocks.mapIt(it.cid)) - treeCid = ?tree.rootCid - manifest = Manifest.new( - treeCid = treeCid, - blockSize = NBytes(blockSize), - datasetSize = NBytes(datasetSize), - ) - - return success((manifest, tree)) - proc makeWantList*( cids: seq[Cid], priority: int = 0, @@ -91,7 +77,7 @@ proc storeDataGetManifest*( (await store.putBlock(blk)).tryGet() let - (manifest, tree) = makeManifestAndTree(blocks).tryGet() + (_, tree, manifest) = makeDataset(blocks).tryGet() treeCid = tree.rootCid.tryGet() for i in 0 ..< tree.leavesCount: @@ -110,19 +96,6 @@ proc storeDataGetManifest*( return await storeDataGetManifest(store, blocks) -proc makeRandomBlocks*( - datasetSize: int, blockSize: NBytes -): Future[seq[Block]] {.async.} = - var chunker = - RandomChunker.new(Rng.instance(), size = datasetSize, chunkSize = blockSize) - - while true: - let chunk = await chunker.getBytes() - if chunk.len <= 0: - break - - result.add(Block.new(chunk).tryGet()) - proc corruptBlocks*( store: BlockStore, manifest: Manifest, blks, bytes: int ): Future[seq[int]] {.async.} = @@ -147,4 +120,5 @@ proc corruptBlocks*( bytePos.add(ii) blk.data[ii] = byte 0 + return pos diff --git a/tests/codex/helpers/datasetutils.nim b/tests/codex/helpers/datasetutils.nim new file mode 100644 index 0000000000..d9ed2abc3d --- /dev/null +++ b/tests/codex/helpers/datasetutils.nim @@ -0,0 +1,35 @@ 
+import std/random + +import pkg/codex/blocktype as bt +import pkg/codex/merkletree +import pkg/codex/manifest + +type TestDataset* = tuple[blocks: seq[Block], tree: CodexTree, manifest: Manifest] + +proc makeRandomBlock*(size: NBytes): Block = + let bytes = newSeqWith(size.int, rand(uint8)) + Block.new(bytes).tryGet() + +proc makeRandomBlocks*(nBlocks: int, blockSize: NBytes): seq[Block] = + for i in 0 ..< nBlocks: + result.add(makeRandomBlock(blockSize)) + +proc makeDataset*(blocks: seq[Block]): ?!TestDataset = + if blocks.len == 0: + return failure("Blocks list was empty") + + let + datasetSize = blocks.mapIt(it.data.len).foldl(a + b) + blockSize = blocks.mapIt(it.data.len).foldl(max(a, b)) + tree = ?CodexTree.init(blocks.mapIt(it.cid)) + treeCid = ?tree.rootCid + manifest = Manifest.new( + treeCid = treeCid, + blockSize = NBytes(blockSize), + datasetSize = NBytes(datasetSize), + ) + + return success((blocks, tree, manifest)) + +proc makeRandomDataset*(nBlocks: int, blockSize: NBytes): ?!TestDataset = + makeDataset(makeRandomBlocks(nBlocks, blockSize)) diff --git a/tests/codex/helpers/mockdiscovery.nim b/tests/codex/helpers/mockdiscovery.nim index 4110c57781..69b61d4911 100644 --- a/tests/codex/helpers/mockdiscovery.nim +++ b/tests/codex/helpers/mockdiscovery.nim @@ -70,3 +70,31 @@ method provide*( return await d.publishHostProvideHandler(d, host) + +proc nullDiscovery*(): MockDiscovery = + proc findBlockProvidersHandler( + d: MockDiscovery, cid: Cid + ): Future[seq[SignedPeerRecord]] {.async: (raises: [CancelledError]).} = + return @[] + + proc publishBlockProvideHandler( + d: MockDiscovery, cid: Cid + ): Future[void] {.async: (raises: [CancelledError]).} = + return + + proc findHostProvidersHandler( + d: MockDiscovery, host: ca.Address + ): Future[seq[SignedPeerRecord]] {.async: (raises: [CancelledError]).} = + return @[] + + proc publishHostProvideHandler( + d: MockDiscovery, host: ca.Address + ): Future[void] {.async: (raises: [CancelledError]).} = + return + + return MockDiscovery( + findBlockProvidersHandler: findBlockProvidersHandler, + publishBlockProvideHandler: publishBlockProvideHandler, + findHostProvidersHandler: findHostProvidersHandler, + publishHostProvideHandler: publishHostProvideHandler, + ) diff --git a/tests/codex/helpers/nodeutils.nim b/tests/codex/helpers/nodeutils.nim index 057e349d1d..928e927d3a 100644 --- a/tests/codex/helpers/nodeutils.nim +++ b/tests/codex/helpers/nodeutils.nim @@ -1,4 +1,5 @@ import std/sequtils +import std/sets import pkg/chronos import pkg/libp2p @@ -8,8 +9,14 @@ import pkg/codex/discovery import pkg/codex/stores import pkg/codex/blocktype as bt import pkg/codex/blockexchange +import pkg/codex/merkletree +import pkg/codex/manifest +import pkg/codex/utils/safeasynciter +import ./datasetutils +import ./mockdiscovery import ../examples +import ../../helpers type NodesComponents* = tuple[ @@ -25,24 +32,60 @@ type NodesComponents* = networkStore: NetworkStore, ] +proc assignBlocks*( + node: NodesComponents, + dataset: TestDataset, + indices: seq[int], + putMerkleProofs = true, +): Future[void] {.async: (raises: [CatchableError]).} = + let rootCid = dataset.tree.rootCid.tryGet() + + for i in indices: + assert (await node.networkStore.putBlock(dataset.blocks[i])).isOk + if putMerkleProofs: + assert ( + await node.networkStore.putCidAndProof( + rootCid, i, dataset.blocks[i].cid, dataset.tree.getProof(i).tryGet() + ) + ).isOk + +proc assignBlocks*( + node: NodesComponents, + dataset: TestDataset, + indices: HSlice[int, int], + putMerkleProofs = 
true, +): Future[void] {.async: (raises: [CatchableError]).} = + await assignBlocks(node, dataset, indices.toSeq, putMerkleProofs) + +proc assignBlocks*( + node: NodesComponents, dataset: TestDataset, putMerkleProofs = true +): Future[void] {.async: (raises: [CatchableError]).} = + await assignBlocks(node, dataset, 0 ..< dataset.blocks.len, putMerkleProofs) + proc generateNodes*( - num: Natural, blocks: openArray[bt.Block] = [] + num: Natural, blocks: openArray[bt.Block] = [], enableDiscovery = true ): seq[NodesComponents] = for i in 0 ..< num: let switch = newStandardSwitch(transportFlags = {ServerFlags.ReuseAddr}) - discovery = Discovery.new( - switch.peerInfo.privateKey, - announceAddrs = - @[ - MultiAddress.init("/ip4/127.0.0.1/tcp/0").expect( - "Should return multiaddress" - ) - ], - ) + discovery = + if enableDiscovery: + Discovery.new( + switch.peerInfo.privateKey, + announceAddrs = + @[ + MultiAddress.init("/ip4/127.0.0.1/tcp/0").expect( + "Should return multiaddress" + ) + ], + ) + else: + nullDiscovery() + + let wallet = WalletRef.example network = BlockExcNetwork.new(switch) - localStore = CacheStore.new(blocks.mapIt(it)) + localStore = CacheStore.new(blocks) peerStore = PeerCtxStore.new() pendingBlocks = PendingBlocksManager.new() advertiser = Advertiser.new(localStore, discovery) @@ -63,6 +106,26 @@ proc generateNodes*( result.add(nc) +proc start*(nodes: NodesComponents) {.async: (raises: [CatchableError]).} = + await allFuturesThrowing( + nodes.switch.start(), + #nodes.blockDiscovery.start(), + nodes.engine.start(), + ) + +proc stop*(nodes: NodesComponents) {.async: (raises: [CatchableError]).} = + await allFuturesThrowing( + nodes.switch.stop(), + # nodes.blockDiscovery.stop(), + nodes.engine.stop(), + ) + +proc start*(nodes: seq[NodesComponents]) {.async: (raises: [CatchableError]).} = + await allFuturesThrowing(nodes.mapIt(it.start()).toSeq) + +proc stop*(nodes: seq[NodesComponents]) {.async: (raises: [CatchableError]).} = + await allFuturesThrowing(nodes.mapIt(it.stop()).toSeq) + proc connectNodes*(nodes: seq[Switch]) {.async.} = for dialer in nodes: for node in nodes: @@ -71,3 +134,35 @@ proc connectNodes*(nodes: seq[Switch]) {.async.} = proc connectNodes*(nodes: seq[NodesComponents]) {.async.} = await connectNodes(nodes.mapIt(it.switch)) + +proc connectNodes(nodes: varargs[NodesComponents]): Future[void] = + # varargs can't be captured on closures, and async procs are closures, + # so we have to do this mess + let copy = nodes.toSeq + ( + proc() {.async.} = + await connectNodes(copy.mapIt(it.switch)) + )() + +proc linearTopology*(nodes: seq[NodesComponents]) {.async.} = + for i in 0 .. nodes.len - 2: + await connectNodes(nodes[i], nodes[i + 1]) + +proc downloadDataset*( + node: NodesComponents, dataset: TestDataset +): Future[void] {.async.} = + # This is the same as fetchBatched, but we don't construct CodexNodes so I can't use + # it here. 
+ let requestAddresses = collect: + for i in 0 ..< dataset.manifest.blocksCount: + BlockAddress.init(dataset.manifest.treeCid, i) + + let blockCids = dataset.blocks.mapIt(it.cid).toHashSet() + + var count = 0 + for blockFut in (await node.networkStore.getBlocks(requestAddresses)): + let blk = (await blockFut).tryGet() + assert blk.cid in blockCids, "Unknown block CID: " & $blk.cid + count += 1 + + assert count == dataset.blocks.len, "Incorrect number of blocks downloaded" diff --git a/tests/codex/node/testnode.nim b/tests/codex/node/testnode.nim index 78298ad758..83b82dfd26 100644 --- a/tests/codex/node/testnode.nim +++ b/tests/codex/node/testnode.nim @@ -82,7 +82,7 @@ asyncchecksuite "Test Node - Basic": ).tryGet() test "Block Batching with corrupted blocks": - let blocks = await makeRandomBlocks(datasetSize = 64.KiBs.int, blockSize = 64.KiBs) + let blocks = makeRandomBlocks(nBlocks = 1, blockSize = 64.KiBs) assert blocks.len == 1 let blk = blocks[0] @@ -215,7 +215,7 @@ asyncchecksuite "Test Node - Basic": test "Should delete an entire dataset": let - blocks = await makeRandomBlocks(datasetSize = 2048, blockSize = 256'nb) + blocks = makeRandomBlocks(nBlocks = 8, blockSize = 256'nb) manifest = await storeDataGetManifest(localStore, blocks) manifestBlock = (await store.storeManifest(manifest)).tryGet() manifestCid = manifestBlock.cid diff --git a/tests/codex/stores/commonstoretests.nim b/tests/codex/stores/commonstoretests.nim index e4287dd23f..d313277384 100644 --- a/tests/codex/stores/commonstoretests.nim +++ b/tests/codex/stores/commonstoretests.nim @@ -38,8 +38,8 @@ proc commonBlockStoreTests*( newBlock2 = Block.new("2".repeat(100).toBytes()).tryGet() newBlock3 = Block.new("3".repeat(100).toBytes()).tryGet() - (manifest, tree) = - makeManifestAndTree(@[newBlock, newBlock1, newBlock2, newBlock3]).tryGet() + (_, tree, manifest) = + makeDataset(@[newBlock, newBlock1, newBlock2, newBlock3]).tryGet() if not isNil(before): await before() diff --git a/tests/codex/stores/testrepostore.nim b/tests/codex/stores/testrepostore.nim index 69f38711fa..72d860829d 100644 --- a/tests/codex/stores/testrepostore.nim +++ b/tests/codex/stores/testrepostore.nim @@ -364,9 +364,9 @@ asyncchecksuite "RepoStore": let repo = RepoStore.new(repoDs, metaDs, clock = mockClock, quotaMaxBytes = 1000'nb) - dataset = await makeRandomBlocks(datasetSize = 512, blockSize = 256'nb) - blk = dataset[0] - (manifest, tree) = makeManifestAndTree(dataset).tryGet() + (blocks, tree, manifest) = + makeRandomDataset(nBlocks = 2, blockSize = 256'nb).tryGet() + blk = blocks[0] treeCid = tree.rootCid.tryGet() proof = tree.getProof(0).tryGet() @@ -381,9 +381,9 @@ asyncchecksuite "RepoStore": let repo = RepoStore.new(repoDs, metaDs, clock = mockClock, quotaMaxBytes = 1000'nb) - dataset = await makeRandomBlocks(datasetSize = 512, blockSize = 256'nb) - blk = dataset[0] - (manifest, tree) = makeManifestAndTree(dataset).tryGet() + (blocks, tree, manifest) = + makeRandomDataset(nBlocks = 2, blockSize = 256'nb).tryGet() + blk = blocks[0] treeCid = tree.rootCid.tryGet() proof = tree.getProof(0).tryGet() @@ -397,7 +397,7 @@ asyncchecksuite "RepoStore": let repo = RepoStore.new(repoDs, metaDs, clock = mockClock, quotaMaxBytes = 1000'nb) - blockPool = await makeRandomBlocks(datasetSize = 768, blockSize = 256'nb) + blockPool = makeRandomBlocks(nBlocks = 3, blockSize = 256'nb) let dataset1 = @[blockPool[0], blockPool[1]] @@ -406,9 +406,9 @@ asyncchecksuite "RepoStore": let sharedBlock = blockPool[1] let - (manifest1, tree1) = 
makeManifestAndTree(dataset1).tryGet() + (_, tree1, manifest1) = makeDataset(dataset1).tryGet() treeCid1 = tree1.rootCid.tryGet() - (manifest2, tree2) = makeManifestAndTree(dataset2).tryGet() + (_, tree2, manifest2) = makeDataset(dataset2).tryGet() treeCid2 = tree2.rootCid.tryGet() (await repo.putBlock(sharedBlock)).tryGet() @@ -435,9 +435,9 @@ asyncchecksuite "RepoStore": let repo = RepoStore.new(repoDs, metaDs, clock = mockClock, quotaMaxBytes = 1000'nb) - dataset = await makeRandomBlocks(datasetSize = 512, blockSize = 256'nb) - blk = dataset[0] - (manifest, tree) = makeManifestAndTree(dataset).tryGet() + blocks = makeRandomBlocks(nBlocks = 2, blockSize = 256'nb) + blk = blocks[0] + (_, tree, manifest) = makeDataset(blocks).tryGet() treeCid = tree.rootCid.tryGet() proof = tree.getProof(1).tryGet() @@ -455,9 +455,9 @@ asyncchecksuite "RepoStore": let repo = RepoStore.new(repoDs, metaDs, clock = mockClock, quotaMaxBytes = 1000'nb) - dataset = await makeRandomBlocks(datasetSize = 512, blockSize = 256'nb) - blk = dataset[0] - (manifest, tree) = makeManifestAndTree(dataset).tryGet() + blocks = makeRandomBlocks(nBlocks = 2, blockSize = 256'nb) + blk = blocks[0] + (_, tree, manifest) = makeDataset(blocks).tryGet() treeCid = tree.rootCid.tryGet() proof = tree.getProof(1).tryGet() diff --git a/tests/helpers.nim b/tests/helpers.nim index b48b787edd..742bc10d34 100644 --- a/tests/helpers.nim +++ b/tests/helpers.nim @@ -7,12 +7,11 @@ import std/sequtils, chronos export multisetup, trackers, templeveldb ### taken from libp2p errorhelpers.nim -proc allFuturesThrowing*(args: varargs[FutureBase]): Future[void] = +proc allFuturesThrowing(futs: seq[FutureBase]): Future[void] = # This proc is only meant for use in tests / not suitable for general use. # - Swallowing errors arbitrarily instead of aggregating them is bad design # - It raises `CatchableError` instead of the union of the `futs` errors, # inflating the caller's `raises` list unnecessarily. `macro` could fix it - let futs = @args ( proc() {.async: (raises: [CatchableError]).} = await allFutures(futs) @@ -28,6 +27,9 @@ proc allFuturesThrowing*(args: varargs[FutureBase]): Future[void] = raise firstErr )() +proc allFuturesThrowing*(args: varargs[FutureBase]): Future[void] = + allFuturesThrowing(@args) + proc allFuturesThrowing*[T](futs: varargs[Future[T]]): Future[void] = allFuturesThrowing(futs.mapIt(FutureBase(it))) From 68918242268675cb989b234281053523d8eac631 Mon Sep 17 00:00:00 2001 From: gmega Date: Wed, 9 Jul 2025 14:44:14 -0300 Subject: [PATCH 20/20] fix: randomize block refresh time, optimize context store checks --- codex/blockexchange/engine/engine.nim | 10 +++++++--- codex/blockexchange/peers/peerctxstore.nim | 2 +- tests/codex/blockexchange/engine/testblockexc.nim | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/codex/blockexchange/engine/engine.nim b/codex/blockexchange/engine/engine.nim index 3511f9124d..4cf84454a2 100644 --- a/codex/blockexchange/engine/engine.nim +++ b/codex/blockexchange/engine/engine.nim @@ -12,6 +12,7 @@ import std/sets import std/options import std/algorithm import std/sugar +import std/random import pkg/chronos import pkg/libp2p/[cid, switch, multihash, multicodec] @@ -199,7 +200,6 @@ proc refreshBlockKnowledge(self: BlockExcEngine) {.async: (raises: [CancelledErr # In dynamic swarms, staleness will dominate latency. 
if peer.lastRefresh < self.pendingBlocks.lastInclusion or peer.isKnowledgeStale: - trace "Refreshing block knowledge for peer", peer = peer.id peer.refreshRequested() # TODO: optimize this by keeping track of what was sent and sending deltas. # This should allow us to run much more frequent refreshes, and be way more @@ -269,8 +269,9 @@ proc downloadInternal( # We now wait for a bit and then retry. If the handle gets completed in the # meantime (cause the presence handler might have requested the block and - # received it in the meantime), we are done. - await handle or sleepAsync(self.pendingBlocks.retryInterval) + # received it in the meantime), we are done. Retry delays are randomized + # so we don't get all block loops spinning at the same time. + await handle or sleepAsync(secs(rand(self.pendingBlocks.retryInterval.secs))) if handle.finished: break # If we still don't have the block, we'll go for another cycle. @@ -484,6 +485,9 @@ proc cancelBlocks( # If so, schedules a cancellation. scheduledCancellations[peerCtx.id] = intersection + if scheduledCancellations.len == 0: + return + let (succeededFuts, failedFuts) = await allFinishedFailed[PeerId]( toSeq(scheduledCancellations.pairs).map(dispatchCancellations) ) diff --git a/codex/blockexchange/peers/peerctxstore.nim b/codex/blockexchange/peers/peerctxstore.nim index 171206ba1c..d2762fc800 100644 --- a/codex/blockexchange/peers/peerctxstore.nim +++ b/codex/blockexchange/peers/peerctxstore.nim @@ -78,7 +78,7 @@ func peersWant*(self: PeerCtxStore, cid: Cid): seq[BlockExcPeerCtx] = proc getPeersForBlock*(self: PeerCtxStore, address: BlockAddress): PeersForBlock = var res: PeersForBlock = (@[], @[]) for peer in self: - if address in peer.peerHave: + if address in peer: res.with.add(peer) else: res.without.add(peer) diff --git a/tests/codex/blockexchange/engine/testblockexc.nim b/tests/codex/blockexchange/engine/testblockexc.nim index 68aec5fee9..ca37bc8a48 100644 --- a/tests/codex/blockexchange/engine/testblockexc.nim +++ b/tests/codex/blockexchange/engine/testblockexc.nim @@ -213,4 +213,4 @@ asyncchecksuite "NetworkStore - dissemination": await nodes.linearTopology() let downloads = nodes.mapIt(downloadDataset(it, dataset)) - await allFuturesThrowing(downloads).wait(20.seconds) + await allFuturesThrowing(downloads).wait(30.seconds)
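
A minimal standalone sketch of the adaptive refresh policy introduced in this series (MinRefreshInterval, refreshBackoff and MaxRefreshBackoff in peercontext.nim). It is not part of the patches above: time is reduced to plain integer seconds instead of chronos Moment, the PeerModel type and the demo loop are invented for illustration, and refreshInProgress/refreshRequested bookkeeping is left out so only the backoff arithmetic is shown.

# Illustrative model only -- mirrors the backoff rules from peercontext.nim,
# but uses plain integers for time and a made-up PeerModel type.
const
  MinRefreshIntervalSec = 1   # matches MinRefreshInterval = 1.seconds
  MaxRefreshBackoff = 36      # cap on the backoff factor

type PeerModel = object
  lastRefresh: int            # seconds since an arbitrary epoch
  backoff: int                # current backoff factor

proc isKnowledgeStale(p: PeerModel, now: int): bool =
  ## Knowledge is stale once the backed-off interval has elapsed.
  p.lastRefresh + p.backoff * MinRefreshIntervalSec < now

proc refreshReplied(p: var PeerModel, now: int) =
  ## Each completed refresh doubles the backoff, capped at MaxRefreshBackoff.
  p.lastRefresh = now
  p.backoff = min(p.backoff * 2, MaxRefreshBackoff)

proc havesUpdated(p: var PeerModel) =
  ## New presence information resets the backoff to its minimum.
  p.backoff = 1

when isMainModule:
  var p = PeerModel(lastRefresh: 0, backoff: 1)
  var now = 0
  for i in 1 .. 8:
    # Advance time until the peer is considered stale, then "refresh" it.
    while not p.isKnowledgeStale(now):
      inc now
    echo "refresh #", i, " at t=", now, "s (backoff was ", p.backoff, ")"
    p.refreshReplied(now)
    if i == 5:
      # Simulate a new block presence arriving: the cadence speeds up again.
      p.havesUpdated()

Run as-is, the refresh instants spread out geometrically until the factor saturates at MaxRefreshBackoff and tighten again after the simulated presence update, which appears to be the intent of the backoff: bounding want-list refresh traffic towards peers whose knowledge is not changing, without permanently slowing down peers that start reporting new blocks.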