use core::{time::Duration, fmt};
use std::{
  sync::Arc,
  io::{self, Read},
  collections::{HashSet, HashMap},
  time::{SystemTime, Instant},
};

use async_trait::async_trait;
use rand_core::{RngCore, OsRng};

use scale::{Decode, Encode};
use borsh::{BorshSerialize, BorshDeserialize};
use serai_client::{primitives::NetworkId, validator_sets::primitives::ValidatorSet, Serai};

use serai_db::Db;

use futures_util::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, StreamExt};
use tokio::{
  sync::{Mutex, RwLock, mpsc, broadcast},
  time::sleep,
};

use libp2p::{
  core::multiaddr::{Protocol, Multiaddr},
  identity::Keypair,
  PeerId,
  tcp::Config as TcpConfig,
  noise, yamux,
  request_response::{
    Codec as RrCodecTrait, Message as RrMessage, Event as RrEvent, Config as RrConfig,
    Behaviour as RrBehavior, ProtocolSupport,
  },
  gossipsub::{
    IdentTopic, FastMessageId, MessageId, MessageAuthenticity, ValidationMode, ConfigBuilder,
    IdentityTransform, AllowAllSubscriptionFilter, Event as GsEvent, PublishError,
    Behaviour as GsBehavior,
  },
  swarm::{NetworkBehaviour, SwarmEvent},
  SwarmBuilder,
};

pub(crate) use tributary::{ReadWrite, P2p as TributaryP2p};

use crate::{Transaction, Block, Tributary, ActiveTributary, TributaryEvent};

// Block size limit + 1 KB of space for signatures/metadata
const MAX_LIBP2P_GOSSIP_MESSAGE_SIZE: usize = tributary::BLOCK_SIZE_LIMIT + 1024;

const MAX_LIBP2P_REQRES_MESSAGE_SIZE: usize =
  (tributary::BLOCK_SIZE_LIMIT * BLOCKS_PER_BATCH) + 1024;

const MAX_LIBP2P_MESSAGE_SIZE: usize = {
  // Manual `max` since `max` isn't a const fn
  if MAX_LIBP2P_GOSSIP_MESSAGE_SIZE > MAX_LIBP2P_REQRES_MESSAGE_SIZE {
    MAX_LIBP2P_GOSSIP_MESSAGE_SIZE
  } else {
    MAX_LIBP2P_REQRES_MESSAGE_SIZE
  }
};

const LIBP2P_TOPIC: &str = "serai-coordinator";

// Amount of blocks in a minute
const BLOCKS_PER_MINUTE: usize = (60 / (tributary::tendermint::TARGET_BLOCK_TIME / 1000)) as usize;

// Maximum amount of blocks to send in a batch
const BLOCKS_PER_BATCH: usize = BLOCKS_PER_MINUTE + 1;
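
#[cfg(test)]
mod message_size_consts_test {
  use super::*;

  // A minimal sanity-check sketch (not part of the original source) of the relations between the
  // size constants defined above, written as a plain test since `max` isn't a const fn.
  #[test]
  fn message_size_constants_are_consistent() {
    // The overall limit is the larger of the gossip and request/response limits
    assert_eq!(
      MAX_LIBP2P_MESSAGE_SIZE,
      MAX_LIBP2P_GOSSIP_MESSAGE_SIZE.max(MAX_LIBP2P_REQRES_MESSAGE_SIZE)
    );
    // A batch is one minute of blocks plus one, and the req/res limit covers a full batch
    assert_eq!(BLOCKS_PER_BATCH, BLOCKS_PER_MINUTE + 1);
    assert_eq!(
      MAX_LIBP2P_REQRES_MESSAGE_SIZE,
      (tributary::BLOCK_SIZE_LIMIT * BLOCKS_PER_BATCH) + 1024
    );
  }
}
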
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, BorshSerialize, BorshDeserialize)]
pub struct CosignedBlock {
  pub network: NetworkId,
  pub block_number: u64,
  pub block: [u8; 32],
  pub signature: [u8; 64],
}
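
#[cfg(test)]
mod cosigned_block_serialization_test {
  use super::*;

  // A minimal round-trip sketch (not part of the original source) of CosignedBlock's Borsh
  // serialization, using only the trait methods imported above. The field values are arbitrary.
  #[test]
  fn cosigned_block_borsh_round_trip() {
    let cosign = CosignedBlock {
      network: NetworkId::Serai,
      block_number: 1,
      block: [2; 32],
      signature: [3; 64],
    };

    let mut buf = vec![];
    cosign.serialize(&mut buf).unwrap();
    assert_eq!(CosignedBlock::try_from_slice(&buf).unwrap(), cosign);
  }
}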

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ReqResMessageKind {
  KeepAlive,
  Heartbeat([u8; 32]),
  Block([u8; 32]),
}

impl ReqResMessageKind {
  pub fn read<R: Read>(reader: &mut R) -> Option<ReqResMessageKind> {
    let mut kind = [0; 1];
    reader.read_exact(&mut kind).ok()?;
    match kind[0] {
      0 => Some(ReqResMessageKind::KeepAlive),
      1 => Some({
        let mut genesis = [0; 32];
        reader.read_exact(&mut genesis).ok()?;
        ReqResMessageKind::Heartbeat(genesis)
      }),
      2 => Some({
        let mut genesis = [0; 32];
        reader.read_exact(&mut genesis).ok()?;
        ReqResMessageKind::Block(genesis)
      }),
      _ => None,
    }
  }

  pub fn serialize(&self) -> Vec<u8> {
    match self {
      ReqResMessageKind::KeepAlive => vec![0],
      ReqResMessageKind::Heartbeat(genesis) => {
        let mut res = vec![1];
        res.extend(genesis);
        res
      }
      ReqResMessageKind::Block(genesis) => {
        let mut res = vec![2];
        res.extend(genesis);
        res
      }
    }
  }
}

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum GossipMessageKind {
  Tributary([u8; 32]),
  CosignedBlock,
}

impl GossipMessageKind {
  pub fn read<R: Read>(reader: &mut R) -> Option<GossipMessageKind> {
    let mut kind = [0; 1];
    reader.read_exact(&mut kind).ok()?;
    match kind[0] {
      0 => Some({
        let mut genesis = [0; 32];
        reader.read_exact(&mut genesis).ok()?;
        GossipMessageKind::Tributary(genesis)
      }),
      1 => Some(GossipMessageKind::CosignedBlock),
      _ => None,
    }
  }

  pub fn serialize(&self) -> Vec<u8> {
    match self {
      GossipMessageKind::Tributary(genesis) => {
        let mut res = vec![0];
        res.extend(genesis);
        res
      }
      GossipMessageKind::CosignedBlock => {
        vec![1]
      }
    }
  }
}

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum P2pMessageKind {
  ReqRes(ReqResMessageKind),
  Gossip(GossipMessageKind),
}

impl P2pMessageKind {
  fn genesis(&self) -> Option<[u8; 32]> {
    match self {
      P2pMessageKind::ReqRes(ReqResMessageKind::KeepAlive) |
      P2pMessageKind::Gossip(GossipMessageKind::CosignedBlock) => None,
      P2pMessageKind::ReqRes(
        ReqResMessageKind::Heartbeat(genesis) | ReqResMessageKind::Block(genesis),
      ) |
      P2pMessageKind::Gossip(GossipMessageKind::Tributary(genesis)) => Some(*genesis),
    }
  }
}

impl From<ReqResMessageKind> for P2pMessageKind {
  fn from(kind: ReqResMessageKind) -> P2pMessageKind {
    P2pMessageKind::ReqRes(kind)
  }
}

impl From<GossipMessageKind> for P2pMessageKind {
  fn from(kind: GossipMessageKind) -> P2pMessageKind {
    P2pMessageKind::Gossip(kind)
  }
}
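
#[cfg(test)]
mod message_kind_wire_format_tests {
  use super::*;

  // Minimal round-trip sketches (not part of the original source) of the one-byte-tag wire
  // format above: `serialize` then `read` should yield the original kind, for every variant.
  #[test]
  fn req_res_message_kind_round_trips() {
    let genesis = [0xee; 32];
    for kind in [
      ReqResMessageKind::KeepAlive,
      ReqResMessageKind::Heartbeat(genesis),
      ReqResMessageKind::Block(genesis),
    ] {
      assert_eq!(ReqResMessageKind::read(&mut kind.serialize().as_slice()), Some(kind));
    }
  }

  #[test]
  fn gossip_message_kind_round_trips() {
    for kind in [GossipMessageKind::Tributary([1; 32]), GossipMessageKind::CosignedBlock] {
      assert_eq!(GossipMessageKind::read(&mut kind.serialize().as_slice()), Some(kind));
    }
  }
}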

#[derive(Clone, Debug)]
pub struct Message<P: P2p> {
  pub sender: P::Id,
  pub kind: P2pMessageKind,
  pub msg: Vec<u8>,
}

#[derive(Clone, Debug, Encode, Decode)]
pub struct BlockCommit {
  pub block: Vec<u8>,
  pub commit: Vec<u8>,
}

#[derive(Clone, Debug, Encode, Decode)]
pub struct HeartbeatBatch {
  pub blocks: Vec<BlockCommit>,
  pub timestamp: u64,
}
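
#[cfg(test)]
mod heartbeat_batch_encoding_test {
  use super::*;

  // A minimal SCALE round-trip sketch (not part of the original source) for the heartbeat
  // response types, using the `Encode`/`Decode` derives above. The field values are arbitrary.
  #[test]
  fn heartbeat_batch_scale_round_trip() {
    let batch = HeartbeatBatch {
      blocks: vec![BlockCommit { block: vec![1, 2, 3], commit: vec![4, 5, 6] }],
      timestamp: 1_700_000_000,
    };

    let encoded = batch.encode();
    let decoded = HeartbeatBatch::decode(&mut encoded.as_slice()).unwrap();
    // These types don't derive PartialEq, so compare re-encodings
    assert_eq!(decoded.encode(), encoded);
  }
}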

#[async_trait]
pub trait P2p: Send + Sync + Clone + fmt::Debug + TributaryP2p {
  type Id: Send + Sync + Clone + Copy + fmt::Debug;

  async fn subscribe(&self, set: ValidatorSet, genesis: [u8; 32]);
  async fn unsubscribe(&self, set: ValidatorSet, genesis: [u8; 32]);

  async fn send_raw(&self, to: Self::Id, msg: Vec<u8>);
  async fn broadcast_raw(&self, kind: P2pMessageKind, msg: Vec<u8>);
  async fn receive(&self) -> Message<Self>;

  async fn send(&self, to: Self::Id, kind: ReqResMessageKind, msg: Vec<u8>) {
    let mut actual_msg = kind.serialize();
    actual_msg.extend(msg);
    self.send_raw(to, actual_msg).await;
  }
  async fn broadcast(&self, kind: impl Send + Into<P2pMessageKind>, msg: Vec<u8>) {
    let kind = kind.into();
    let mut actual_msg = match kind {
      P2pMessageKind::ReqRes(kind) => kind.serialize(),
      P2pMessageKind::Gossip(kind) => kind.serialize(),
    };
    actual_msg.extend(msg);
    /*
    log::trace!(
      "broadcasting p2p message (kind {})",
      match kind {
        P2pMessageKind::KeepAlive => "KeepAlive".to_string(),
        P2pMessageKind::Tributary(genesis) => format!("Tributary({})", hex::encode(genesis)),
        P2pMessageKind::Heartbeat(genesis) => format!("Heartbeat({})", hex::encode(genesis)),
        P2pMessageKind::Block(genesis) => format!("Block({})", hex::encode(genesis)),
        P2pMessageKind::CosignedBlock => "CosignedBlock".to_string(),
      }
    );
    */
    self.broadcast_raw(kind, actual_msg).await;
  }
}
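
#[cfg(test)]
mod p2p_framing_test {
  use super::*;

  // A minimal sketch (not part of the original source) of the framing `P2p::send`/`broadcast`
  // produce and the swarm event handlers reverse: the serialized kind, then the payload.
  #[test]
  fn kind_prefix_then_payload() {
    let genesis = [9; 32];
    let payload = b"tributary gossip".to_vec();

    let mut framed = GossipMessageKind::Tributary(genesis).serialize();
    framed.extend(&payload);

    let mut msg_ref = framed.as_slice();
    assert_eq!(
      GossipMessageKind::read(&mut msg_ref),
      Some(GossipMessageKind::Tributary(genesis))
    );
    // Whatever remains after the kind prefix is the payload handed to the message handlers
    assert_eq!(msg_ref, payload.as_slice());
  }
}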

#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
struct RrCodec;
#[async_trait]
impl RrCodecTrait for RrCodec {
  type Protocol = &'static str;
  type Request = Vec<u8>;
  type Response = Vec<u8>;

  async fn read_request<R: Send + Unpin + AsyncRead>(
    &mut self,
    _: &Self::Protocol,
    io: &mut R,
  ) -> io::Result<Vec<u8>> {
    let mut len = [0; 4];
    io.read_exact(&mut len).await?;
    let len = usize::try_from(u32::from_le_bytes(len)).expect("not at least a 32-bit platform?");
    if len > MAX_LIBP2P_REQRES_MESSAGE_SIZE {
      Err(io::Error::other("request length exceeded MAX_LIBP2P_REQRES_MESSAGE_SIZE"))?;
    }
    // This may be a non-trivial allocation a remote peer can easily cause
    // While we could chunk the read, meaning we only perform the allocation as bandwidth is used,
    // the max message size should be sufficiently sane
    let mut buf = vec![0; len];
    io.read_exact(&mut buf).await?;
    Ok(buf)
  }
  async fn read_response<R: Send + Unpin + AsyncRead>(
    &mut self,
    proto: &Self::Protocol,
    io: &mut R,
  ) -> io::Result<Vec<u8>> {
    self.read_request(proto, io).await
  }
  async fn write_request<W: Send + Unpin + AsyncWrite>(
    &mut self,
    _: &Self::Protocol,
    io: &mut W,
    req: Vec<u8>,
  ) -> io::Result<()> {
    io.write_all(
      &u32::try_from(req.len())
        .map_err(|_| io::Error::other("request length exceeded 2**32"))?
        .to_le_bytes(),
    )
    .await?;
    io.write_all(&req).await
  }
  async fn write_response<W: Send + Unpin + AsyncWrite>(
    &mut self,
    proto: &Self::Protocol,
    io: &mut W,
    res: Vec<u8>,
  ) -> io::Result<()> {
    self.write_request(proto, io, res).await
  }
}
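
#[cfg(test)]
mod rr_codec_framing_test {
  use super::*;

  // A minimal round-trip sketch (not part of the original source) of RrCodec's length-prefixed
  // framing: a 4-byte little-endian length, then the payload. It assumes tokio's test macro and
  // futures-util's AsyncWrite/AsyncRead impls for `Vec<u8>`/`&[u8]` are available to this crate.
  #[tokio::test]
  async fn rr_codec_round_trip() {
    let mut codec = RrCodec;
    let payload = b"hello coordinator".to_vec();

    // write_request prepends the payload's length as 4 little-endian bytes
    let mut framed = Vec::new();
    codec.write_request(&"/coordinator", &mut framed, payload.clone()).await.unwrap();
    let len_prefix = u32::try_from(payload.len()).unwrap().to_le_bytes();
    assert_eq!(framed[.. 4], len_prefix);
    assert_eq!(framed[4 ..], payload[..]);

    // read_request strips the length prefix and returns the payload
    let mut reader = framed.as_slice();
    assert_eq!(codec.read_request(&"/coordinator", &mut reader).await.unwrap(), payload);
  }
}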

#[derive(NetworkBehaviour)]
struct Behavior {
  reqres: RrBehavior<RrCodec>,
  gossipsub: GsBehavior,
}

#[allow(clippy::type_complexity)]
#[derive(Clone)]
pub struct LibP2p {
  subscribe: Arc<Mutex<mpsc::UnboundedSender<(bool, ValidatorSet, [u8; 32])>>>,
  send: Arc<Mutex<mpsc::UnboundedSender<(PeerId, Vec<u8>)>>>,
  broadcast: Arc<Mutex<mpsc::UnboundedSender<(P2pMessageKind, Vec<u8>)>>>,
  receive: Arc<Mutex<mpsc::UnboundedReceiver<Message<Self>>>>,
}
impl fmt::Debug for LibP2p {
  fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
    fmt.debug_struct("LibP2p").finish_non_exhaustive()
  }
}

impl LibP2p {
  #[allow(clippy::new_without_default)]
  pub fn new(serai: Arc<Serai>) -> Self {
    log::info!("creating a libp2p instance");

    let throwaway_key_pair = Keypair::generate_ed25519();

    let behavior = Behavior {
      reqres: { RrBehavior::new([("/coordinator", ProtocolSupport::Full)], RrConfig::default()) },
      gossipsub: {
        let heartbeat_interval = tributary::tendermint::LATENCY_TIME / 2;
        let heartbeats_per_block =
          usize::try_from(tributary::tendermint::TARGET_BLOCK_TIME / heartbeat_interval).unwrap();

        use blake2::{Digest, Blake2s256};
        let config = ConfigBuilder::default()
          .heartbeat_interval(Duration::from_millis(heartbeat_interval.into()))
          .history_length(heartbeats_per_block * 2)
          .history_gossip(heartbeats_per_block)
          .max_transmit_size(MAX_LIBP2P_GOSSIP_MESSAGE_SIZE)
          // We send KeepAlive after 80s
          .idle_timeout(Duration::from_secs(85))
          .validation_mode(ValidationMode::Strict)
          // Uses a content based message ID to avoid duplicates as much as possible
          .message_id_fn(|msg| {
            MessageId::new(&Blake2s256::digest([msg.topic.as_str().as_bytes(), &msg.data].concat()))
          })
          // Re-defines for fast ID to prevent needing to convert into a Message to run
          // message_id_fn
          // This function is valid for both
          .fast_message_id_fn(|msg| {
            FastMessageId::new(&Blake2s256::digest(
              [msg.topic.as_str().as_bytes(), &msg.data].concat(),
            ))
          })
          .build();
        let mut gossipsub = GsBehavior::<IdentityTransform, AllowAllSubscriptionFilter>::new(
          MessageAuthenticity::Signed(throwaway_key_pair.clone()),
          config.unwrap(),
        )
        .unwrap();

        // Subscribe to the base topic
        let topic = IdentTopic::new(LIBP2P_TOPIC);
        gossipsub.subscribe(&topic).unwrap();

        gossipsub
      },
    };

    // Uses noise for authentication, yamux for multiplexing
    // TODO: Do we want to add a custom authentication protocol to only accept connections from
    // fellow validators? Doing so would reduce the potential for spam
    // TODO: Relay client?
    let mut swarm = SwarmBuilder::with_existing_identity(throwaway_key_pair)
      .with_tokio()
      .with_tcp(TcpConfig::default().nodelay(true), noise::Config::new, || {
        let mut config = yamux::Config::default();
        // 1 MiB default + max message size
        config.set_max_buffer_size((1024 * 1024) + MAX_LIBP2P_MESSAGE_SIZE);
        // 256 KiB default + max message size
        config
          .set_receive_window_size(((256 * 1024) + MAX_LIBP2P_MESSAGE_SIZE).try_into().unwrap());
        config
      })
      .unwrap()
      .with_behaviour(|_| behavior)
      .unwrap()
      .build();
    const PORT: u16 = 30563; // 5132 ^ (('c' << 8) | 'o')
    swarm.listen_on(format!("/ip4/0.0.0.0/tcp/{PORT}").parse().unwrap()).unwrap();

    let (send_send, mut send_recv) = mpsc::unbounded_channel();
    let (broadcast_send, mut broadcast_recv) = mpsc::unbounded_channel();
    let (receive_send, receive_recv) = mpsc::unbounded_channel();
    let (subscribe_send, mut subscribe_recv) = mpsc::unbounded_channel();

    fn topic_for_set(set: ValidatorSet) -> IdentTopic {
      IdentTopic::new(format!("{LIBP2P_TOPIC}-{}", hex::encode(set.encode())))
    }

    // TODO: If a network has less than TARGET_PEERS, this will cause retries ad infinitum
    const TARGET_PEERS: usize = 5;

    // The addrs we're currently dialing, and the networks associated with them
    let dialing_peers = Arc::new(RwLock::new(HashMap::new()));
    // The peers we're currently connected to, and the networks associated with them
    let connected_peers = Arc::new(RwLock::new(HashMap::<Multiaddr, HashSet<NetworkId>>::new()));

    // Find and connect to peers
    let (connect_to_network_send, mut connect_to_network_recv) =
      tokio::sync::mpsc::unbounded_channel();
    let (to_dial_send, mut to_dial_recv) = tokio::sync::mpsc::unbounded_channel();
    tokio::spawn({
      let dialing_peers = dialing_peers.clone();
      let connected_peers = connected_peers.clone();

      let connect_to_network_send = connect_to_network_send.clone();
      async move {
        loop {
          let connect = |network: NetworkId, addr: Multiaddr| {
            let dialing_peers = dialing_peers.clone();
            let connected_peers = connected_peers.clone();
            let to_dial_send = to_dial_send.clone();
            let connect_to_network_send = connect_to_network_send.clone();
            async move {
              log::info!("found peer from substrate: {addr}");

              let protocols = addr.iter().filter_map(|piece| match piece {
                // Drop PeerIds from the Substrate P2p network
                Protocol::P2p(_) => None,
                // Use our own TCP port
                Protocol::Tcp(_) => Some(Protocol::Tcp(PORT)),
                other => Some(other),
              });

              let mut new_addr = Multiaddr::empty();
              for protocol in protocols {
                new_addr.push(protocol);
              }
              let addr = new_addr;
              log::debug!("transformed found peer: {addr}");

              let (is_fresh_dial, nets) = {
                let mut dialing_peers = dialing_peers.write().await;
                let is_fresh_dial = !dialing_peers.contains_key(&addr);
                if is_fresh_dial {
                  dialing_peers.insert(addr.clone(), HashSet::new());
                }
                // Associate this network with this peer
                dialing_peers.get_mut(&addr).unwrap().insert(network);

                let nets = dialing_peers.get(&addr).unwrap().clone();
                (is_fresh_dial, nets)
              };

              // Spawn a task to remove this peer from 'dialing' in sixty seconds, in case dialing
              // fails
              // This performs cleanup and bounds the size of the map to whatever growth occurs
              // within a temporal window
              tokio::spawn({
                let dialing_peers = dialing_peers.clone();
                let connected_peers = connected_peers.clone();
                let connect_to_network_send = connect_to_network_send.clone();
                let addr = addr.clone();
                async move {
                  tokio::time::sleep(core::time::Duration::from_secs(60)).await;
                  let mut dialing_peers = dialing_peers.write().await;
                  if let Some(expected_nets) = dialing_peers.remove(&addr) {
                    log::debug!("removed addr from dialing upon timeout: {addr}");

                    // TODO: De-duplicate this below instance
                    // If we failed to dial and haven't gotten enough actual connections, retry
                    let connected_peers = connected_peers.read().await;
                    for net in expected_nets {
                      let mut remaining_peers = 0;
                      for nets in connected_peers.values() {
                        if nets.contains(&net) {
                          remaining_peers += 1;
                        }
                      }
                      // If we do not, start connecting to this network again
                      if remaining_peers < TARGET_PEERS {
                        connect_to_network_send.send(net).expect(
                          "couldn't send net to connect to due to disconnects (receiver dropped?)",
                        );
                      }
                    }
                  }
                }
              });

              if is_fresh_dial {
                to_dial_send.send((addr, nets)).unwrap();
              }
            }
          };

          // TODO: We should also connect to random peers from random nets as needed for
          // cosigning

          // Drain the channel, de-duplicating any networks in it
          let mut connect_to_network_networks = HashSet::new();
          while let Ok(network) = connect_to_network_recv.try_recv() {
            connect_to_network_networks.insert(network);
          }
          for network in connect_to_network_networks {
            if let Ok(mut nodes) = serai.p2p_validators(network).await {
              // If there's an insufficient amount of nodes known, connect to all of them, add
              // the network back for a later retry, and continue
              if nodes.len() < TARGET_PEERS {
                log::warn!(
                  "insufficient amount of P2P nodes known for {:?}: {}",
                  network,
                  nodes.len()
                );
                // Retry this later
                connect_to_network_send.send(network).unwrap();
                for node in nodes {
                  connect(network, node).await;
                }
                continue;
              }

              // Randomly select up to 150% of the TARGET_PEERS
              for _ in 0 .. ((3 * TARGET_PEERS) / 2) {
                if !nodes.is_empty() {
                  let to_connect = nodes.swap_remove(
                    usize::try_from(OsRng.next_u64() % u64::try_from(nodes.len()).unwrap())
                      .unwrap(),
                  );
                  connect(network, to_connect).await;
                }
              }
            }
          }
          // Sleep 60 seconds before moving to the next iteration
          tokio::time::sleep(core::time::Duration::from_secs(60)).await;
        }
      }
    });

    // Manage the actual swarm
    tokio::spawn({
      let mut time_of_last_p2p_message = Instant::now();

      async move {
        let connected_peers = connected_peers.clone();

        let mut set_for_genesis = HashMap::new();
        loop {
          let time_since_last = Instant::now().duration_since(time_of_last_p2p_message);
          tokio::select! {
            biased;

            // Subscribe to any new topics
            set = subscribe_recv.recv() => {
              let (subscribe, set, genesis): (_, ValidatorSet, [u8; 32]) =
                set.expect("subscribe_recv closed. are we shutting down?");
              let topic = topic_for_set(set);
              if subscribe {
                log::info!("subscribing to p2p messages for {set:?}");
                connect_to_network_send.send(set.network).unwrap();
                set_for_genesis.insert(genesis, set);
                swarm.behaviour_mut().gossipsub.subscribe(&topic).unwrap();
              } else {
                log::info!("unsubscribing to p2p messages for {set:?}");
                set_for_genesis.remove(&genesis);
                swarm.behaviour_mut().gossipsub.unsubscribe(&topic).unwrap();
              }
            }

            msg = send_recv.recv() => {
              let (peer, msg): (PeerId, Vec<u8>) =
                msg.expect("send_recv closed. are we shutting down?");
              swarm.behaviour_mut().reqres.send_request(&peer, msg);
            },

            // Handle any queued outbound messages
            msg = broadcast_recv.recv() => {
              // Update the time of last message
              time_of_last_p2p_message = Instant::now();

              let (kind, msg): (P2pMessageKind, Vec<u8>) =
                msg.expect("broadcast_recv closed. are we shutting down?");

              if matches!(kind, P2pMessageKind::ReqRes(_)) {
                // Use request/response, yet send to all connected peers
                for peer_id in swarm.connected_peers().copied().collect::<Vec<_>>() {
                  swarm.behaviour_mut().reqres.send_request(&peer_id, msg.clone());
                }
              } else {
                // Use gossipsub

                let set =
                  kind.genesis().and_then(|genesis| set_for_genesis.get(&genesis).copied());
                let topic = if let Some(set) = set {
                  topic_for_set(set)
                } else {
                  IdentTopic::new(LIBP2P_TOPIC)
                };

                match swarm.behaviour_mut().gossipsub.publish(topic, msg.clone()) {
                  Err(PublishError::SigningError(e)) => {
                    panic!("signing error when broadcasting: {e}")
                  },
                  Err(PublishError::InsufficientPeers) => {
                    log::warn!("failed to send p2p message due to insufficient peers")
                  }
                  Err(PublishError::MessageTooLarge) => {
                    panic!("tried to send a too large message: {}", hex::encode(msg))
                  }
                  Err(PublishError::TransformFailed(e)) => panic!("IdentityTransform failed: {e}"),
                  Err(PublishError::Duplicate) | Ok(_) => {}
                }
              }
            }

            // Handle new incoming messages
            event = swarm.next() => {
              match event {
                Some(SwarmEvent::Dialing { connection_id, .. }) => {
                  log::debug!("dialing to peer in connection ID {}", &connection_id);
                }
                Some(SwarmEvent::ConnectionEstablished {
                  peer_id,
                  connection_id,
                  endpoint,
                  ..
                }) => {
                  if &peer_id == swarm.local_peer_id() {
                    log::warn!("established a libp2p connection to ourselves");
                    swarm.close_connection(connection_id);
                    continue;
                  }

                  let addr = endpoint.get_remote_address();
                  let nets = {
                    let mut dialing_peers = dialing_peers.write().await;
                    if let Some(nets) = dialing_peers.remove(addr) {
                      nets
                    } else {
                      log::debug!("connected to a peer who we didn't have within dialing");
                      HashSet::new()
                    }
                  };
                  {
                    let mut connected_peers = connected_peers.write().await;
                    connected_peers.insert(addr.clone(), nets);

                    log::debug!(
                      "connection established to peer {} in connection ID {}, connected peers: {}",
                      &peer_id,
                      &connection_id,
                      connected_peers.len(),
                    );
                  }
                }
                Some(SwarmEvent::ConnectionClosed { peer_id, endpoint, .. }) => {
                  let mut connected_peers = connected_peers.write().await;
                  let Some(nets) = connected_peers.remove(endpoint.get_remote_address()) else {
                    log::debug!("closed connection to peer which wasn't in connected_peers");
                    continue;
                  };
                  // Downgrade to a read lock
                  let connected_peers = connected_peers.downgrade();

                  // For each net we lost a peer for, check if we still have sufficient peers
                  // overall
                  for net in nets {
                    let mut remaining_peers = 0;
                    for nets in connected_peers.values() {
                      if nets.contains(&net) {
                        remaining_peers += 1;
                      }
                    }
                    // If we do not, start connecting to this network again
                    if remaining_peers < TARGET_PEERS {
                      connect_to_network_send
                        .send(net)
                        .expect(
                          "couldn't send net to connect to due to disconnects (receiver dropped?)"
                        );
                    }
                  }

                  log::debug!(
                    "connection with peer {peer_id} closed, connected peers: {}",
                    connected_peers.len(),
                  );
                }
                Some(SwarmEvent::Behaviour(BehaviorEvent::Reqres(
                  RrEvent::Message { peer, message },
                ))) => {
                  let message = match message {
                    RrMessage::Request { request, .. } => request,
                    RrMessage::Response { response, .. } => response,
                  };

                  let mut msg_ref = message.as_slice();
                  let Some(kind) = ReqResMessageKind::read(&mut msg_ref) else { continue };
                  let message = Message {
                    sender: peer,
                    kind: P2pMessageKind::ReqRes(kind),
                    msg: msg_ref.to_vec(),
                  };
                  receive_send.send(message).expect("receive_send closed. are we shutting down?");
                }
                Some(SwarmEvent::Behaviour(BehaviorEvent::Gossipsub(
                  GsEvent::Message { propagation_source, message, .. },
                ))) => {
                  let mut msg_ref = message.data.as_slice();
                  let Some(kind) = GossipMessageKind::read(&mut msg_ref) else { continue };
                  let message = Message {
                    sender: propagation_source,
                    kind: P2pMessageKind::Gossip(kind),
                    msg: msg_ref.to_vec(),
                  };
                  receive_send.send(message).expect("receive_send closed. are we shutting down?");
                }
                _ => {}
              }
            }

            // Handle peers to dial
            addr_and_nets = to_dial_recv.recv() => {
              let (addr, nets) =
                addr_and_nets.expect("received address was None (sender dropped?)");
              // If we've already dialed and connected to this address, don't further dial them
              // Just associate these networks with them
              if let Some(existing_nets) = connected_peers.write().await.get_mut(&addr) {
                for net in nets {
                  existing_nets.insert(net);
                }
                continue;
              }

              if let Err(e) = swarm.dial(addr) {
                log::warn!("dialing peer failed: {e:?}");
              }
            }

            // If it's been >80s since we've published a message, publish a KeepAlive since we're
            // still an active service
            // This is useful when we have no active tributaries and accordingly aren't sending
            // heartbeats
            // If we are sending heartbeats, we should've sent one after 60s of no finalized blocks
            // (where a finalized block only occurs due to network activity), meaning this won't be
            // run
            () = tokio::time::sleep(Duration::from_secs(80).saturating_sub(time_since_last)) => {
              time_of_last_p2p_message = Instant::now();
              for peer_id in swarm.connected_peers().copied().collect::<Vec<_>>() {
                swarm
                  .behaviour_mut()
                  .reqres
                  .send_request(&peer_id, ReqResMessageKind::KeepAlive.serialize());
              }
            }
          }
        }
      }
    });

    LibP2p {
      subscribe: Arc::new(Mutex::new(subscribe_send)),
      send: Arc::new(Mutex::new(send_send)),
      broadcast: Arc::new(Mutex::new(broadcast_send)),
      receive: Arc::new(Mutex::new(receive_recv)),
    }
  }
}

#[async_trait]
impl P2p for LibP2p {
  type Id = PeerId;

  async fn subscribe(&self, set: ValidatorSet, genesis: [u8; 32]) {
    self
      .subscribe
      .lock()
      .await
      .send((true, set, genesis))
      .expect("subscribe_send closed. are we shutting down?");
  }

  async fn unsubscribe(&self, set: ValidatorSet, genesis: [u8; 32]) {
    self
      .subscribe
      .lock()
      .await
      .send((false, set, genesis))
      .expect("subscribe_send closed. are we shutting down?");
  }

  async fn send_raw(&self, peer: Self::Id, msg: Vec<u8>) {
    self.send.lock().await.send((peer, msg)).expect("send_send closed. are we shutting down?");
  }

  async fn broadcast_raw(&self, kind: P2pMessageKind, msg: Vec<u8>) {
    self
      .broadcast
      .lock()
      .await
      .send((kind, msg))
      .expect("broadcast_send closed. are we shutting down?");
  }

  // TODO: We only have a single handle call this. Differentiate Send/Recv to remove this constant
  // lock acquisition?
  async fn receive(&self) -> Message<Self> {
    self.receive.lock().await.recv().await.expect("receive_recv closed. are we shutting down?")
  }
}

#[async_trait]
impl TributaryP2p for LibP2p {
  async fn broadcast(&self, genesis: [u8; 32], msg: Vec<u8>) {
    <Self as P2p>::broadcast(self, GossipMessageKind::Tributary(genesis), msg).await
  }
}

pub async fn heartbeat_tributaries_task<D: Db, P: P2p>(
  p2p: P,
  mut tributary_event: broadcast::Receiver<TributaryEvent<D, P>>,
) {
  let ten_blocks_of_time =
    Duration::from_secs((10 * Tributary::<D, Transaction, P>::block_time()).into());

  let mut readers = HashMap::new();
  loop {
    loop {
      match tributary_event.try_recv() {
        Ok(TributaryEvent::NewTributary(ActiveTributary { spec, tributary })) => {
          readers.insert(spec.set(), tributary.reader());
        }
        Ok(TributaryEvent::TributaryRetired(set)) => {
          readers.remove(&set);
        }
        Err(broadcast::error::TryRecvError::Empty) => break,
        Err(broadcast::error::TryRecvError::Lagged(_)) => {
          panic!("heartbeat_tributaries lagged to handle tributary_event")
        }
        Err(broadcast::error::TryRecvError::Closed) => panic!("tributary_event sender closed"),
      }
    }

    for tributary in readers.values() {
      let tip = tributary.tip();
      let block_time =
        SystemTime::UNIX_EPOCH + Duration::from_secs(tributary.time_of_block(&tip).unwrap_or(0));

      // Only trigger syncing if the block is more than a minute behind
      if SystemTime::now() > (block_time + Duration::from_secs(60)) {
        log::warn!("last known tributary block was over a minute ago");
        let mut msg = tip.to_vec();
        let time: u64 = SystemTime::now()
          .duration_since(SystemTime::UNIX_EPOCH)
          .expect("system clock is wrong")
          .as_secs();
        msg.extend(time.to_le_bytes());
        P2p::broadcast(&p2p, ReqResMessageKind::Heartbeat(tributary.genesis()), msg).await;
      }
    }

    // Only check once every 10 blocks of time
    sleep(ten_blocks_of_time).await;
  }
}
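
#[cfg(test)]
mod heartbeat_payload_test {
  // A minimal sketch (not part of the original source) of the heartbeat payload built above and
  // parsed by handle_p2p_task: a 32-byte tributary tip, then an 8-byte little-endian timestamp.
  #[test]
  fn heartbeat_payload_is_tip_then_le_timestamp() {
    let tip = [7u8; 32];
    let time: u64 = 1_700_000_000;

    let mut msg = tip.to_vec();
    msg.extend(time.to_le_bytes());

    // The handler rejects any heartbeat which isn't exactly 40 bytes
    assert_eq!(msg.len(), 40);
    assert_eq!(msg[.. 32], tip);
    assert_eq!(u64::from_le_bytes(msg[32 .. 40].try_into().unwrap()), time);
  }
}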
|
|
|
|
|
|
|
|
pub async fn handle_p2p_task<D: Db, P: P2p>(
|
|
|
|
p2p: P,
|
Add a cosigning protocol to ensure finalizations are unique (#433)
* Add a function to deterministically decide which Serai blocks should be co-signed
Has a 5 minute latency between co-signs, also used as the maximal latency
before a co-sign is started.
* Get all active tributaries we're in at a specific block
* Add and route CosignSubstrateBlock, a new provided TX
* Split queued cosigns per network
* Rename BatchSignId to SubstrateSignId
* Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it
* Handle the CosignSubstrateBlock provided TX
* Revert substrate_signer.rs to develop (and patch to still work)
Due to SubstrateSigner moving when the prior multisig closes, yet cosigning
occurring with the most recent key, a single SubstrateSigner can be reused.
We could manage multiple SubstrateSigners, yet considering the much lower
specifications for cosigning, I'd rather treat it distinctly.
* Route cosigning through the processor
* Add note to rename SubstrateSigner post-PR
I don't want to do so now in order to preserve the diff's clarity.
* Implement cosign evaluation into the coordinator
* Get tests to compile
* Bug fixes, mark blocks without cosigners available as cosigned
* Correct the ID Batch preprocesses are saved under, add log statements
* Create a dedicated function to handle cosigns
* Correct the flow around Batch verification/queueing
Verifying `Batch`s could stall when a `Batch` was signed before its
predecessors/before the block it's contained in was cosigned (the latter being
inevitable as we can't sign a block containing a signed batch before signing
the batch).
Now, Batch verification happens on a distinct async task in order to not block
the handling of processor messages. This task is the sole caller of verify in
order to ensure last_verified_batch isn't unexpectedly mutated.
When the processor message handler needs to access it, or needs to queue a
Batch, it associates the DB TXN with a lock preventing the other task from
doing so.
This lock, as currently implemented, is a poor and inefficient design. It
should be modified to the pattern used for cosign management. Additionally, a
new primitive of a DB-backed channel may be immensely valuable.
Fixes a standing potential deadlock and a deadlock introduced with the
cosigning protocol.
* Working full-stack tests
After the last commit, this only required extending a timeout.
* Replace "co-sign" with "cosign" to make finding text easier
* Update the coordinator tests to support cosigning
* Inline prior_batch calculation to prevent panic on rotation
Noticed when doing a final review of the branch.
2023-11-15 21:57:21 +00:00
|
|
|
cosign_channel: mpsc::UnboundedSender<CosignedBlock>,
|
2023-10-14 18:56:02 +00:00
|
|
|
mut tributary_event: broadcast::Receiver<TributaryEvent<D, P>>,
|
2023-10-14 02:40:11 +00:00
|
|
|
) {
|
2023-10-14 20:47:25 +00:00
|
|
|
let channels = Arc::new(RwLock::new(HashMap::<_, mpsc::UnboundedSender<Message<P>>>::new()));
|
2023-10-14 02:40:11 +00:00
|
|
|
tokio::spawn({
|
|
|
|
let p2p = p2p.clone();
|
|
|
|
let channels = channels.clone();
|
2023-10-14 20:47:25 +00:00
|
|
|
let mut set_to_genesis = HashMap::new();
|
2023-10-14 02:40:11 +00:00
|
|
|
async move {
|
|
|
|
loop {
|
2023-10-14 18:56:02 +00:00
|
|
|
match tributary_event.recv().await.unwrap() {
|
|
|
|
TributaryEvent::NewTributary(tributary) => {
|
|
|
|
let genesis = tributary.spec.genesis();
|
2023-10-14 20:47:25 +00:00
|
|
|
set_to_genesis.insert(tributary.spec.set(), genesis);
|
2023-10-14 18:56:02 +00:00
|
|
|
|
|
|
|
let (send, mut recv) = mpsc::unbounded_channel();
|
|
|
|
channels.write().await.insert(genesis, send);
|
|
|
|
|
2023-11-19 01:37:53 +00:00
|
|
|
// Subscribe to the topic for this tributary
|
2023-12-23 02:09:18 +00:00
|
|
|
p2p.subscribe(tributary.spec.set(), genesis).await;
|
2023-11-19 01:37:53 +00:00
|
|
|
|
2024-04-18 01:54:10 +00:00
|
|
|
let spec_set = tributary.spec.set();
|
|
|
|
|
2023-10-14 23:55:14 +00:00
|
|
|
// Per-Tributary P2P message handler
|
2023-10-14 18:56:02 +00:00
|
|
|
tokio::spawn({
|
|
|
|
let p2p = p2p.clone();
|
|
|
|
async move {
|
|
|
|
loop {
|
2024-07-16 23:42:15 +00:00
|
|
|
let Some(msg) = recv.recv().await else {
|
2023-10-14 20:47:25 +00:00
|
|
|
// Channel closure happens when the tributary retires
|
|
|
|
break;
|
|
|
|
};
|
2023-10-14 18:56:02 +00:00
|
|
|
match msg.kind {
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::ReqRes(ReqResMessageKind::KeepAlive) => {}
|
2023-10-14 02:40:11 +00:00
|
|
|
|
2024-04-23 09:44:58 +00:00
|
|
|
// TODO: Slash on Heartbeat which justifies a response, since the node
|
2023-10-14 18:56:02 +00:00
|
|
|
// obviously was offline and we must now use our bandwidth to compensate for
|
|
|
|
// them?
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::ReqRes(ReqResMessageKind::Heartbeat(msg_genesis)) => {
|
2023-10-14 18:56:02 +00:00
|
|
|
assert_eq!(msg_genesis, genesis);
|
|
|
|
if msg.msg.len() != 40 {
|
|
|
|
log::error!("validator sent invalid heartbeat");
|
|
|
|
continue;
|
|
|
|
}
|
2024-04-18 05:24:38 +00:00
|
|
|
// Only respond to recent heartbeats
|
2024-04-23 09:44:58 +00:00
|
|
|
let msg_time = u64::from_le_bytes(msg.msg[32 .. 40].try_into().expect(
|
2024-04-18 05:24:38 +00:00
|
|
|
"length-checked heartbeat message didn't have 8 bytes for the u64",
|
|
|
|
));
|
2024-04-23 09:44:58 +00:00
|
|
|
if SystemTime::now()
|
|
|
|
.duration_since(SystemTime::UNIX_EPOCH)
|
|
|
|
.expect("system clock is wrong")
|
|
|
|
.as_secs()
|
|
|
|
.saturating_sub(msg_time) >
|
|
|
|
10
|
|
|
|
{
|
2024-04-18 05:24:38 +00:00
|
|
|
continue;
|
|
|
|
}
|
2023-10-14 02:40:11 +00:00
|
|
|
|
2024-04-23 09:44:58 +00:00
|
|
|
log::debug!("received heartbeat with a recent timestamp");
|
2024-04-18 05:39:34 +00:00
|
|
|
|
2023-10-14 18:56:02 +00:00
|
|
|
let reader = tributary.tributary.reader();
|
|
|
|
|
2024-04-18 05:39:34 +00:00
|
|
|
let p2p = p2p.clone();
|
|
|
|
// Spawn a dedicated task as this may require loading large amounts of data
|
|
|
|
// from disk and take a notable amount of time
|
|
|
|
tokio::spawn(async move {
|
2023-10-14 18:56:02 +00:00
|
|
|
let mut latest = msg.msg[.. 32].try_into().unwrap();
|
2024-04-18 01:54:10 +00:00
|
|
|
let mut to_send = vec![];
|
2023-10-14 18:56:02 +00:00
|
|
|
while let Some(next) = reader.block_after(&latest) {
|
2024-04-18 01:54:10 +00:00
|
|
|
to_send.push(next);
|
2023-10-14 18:56:02 +00:00
|
|
|
latest = next;
|
|
|
|
}
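// Only respond if the sender is more than a few blocks behind us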
|
2024-04-18 05:24:38 +00:00
|
|
|
if to_send.len() > 3 {
|
2024-07-16 23:42:15 +00:00
|
|
|
// Prepare the batch to send
|
|
|
|
let mut blocks = vec![];
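// Take at most BLOCKS_PER_BATCH blocks, pairing each serialized block with its commit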
|
|
|
|
for (i, next) in to_send.iter().enumerate() {
|
|
|
|
if i >= BLOCKS_PER_BATCH {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
blocks.push(BlockCommit {
|
|
|
|
block: reader.block(next).unwrap().serialize(),
|
|
|
|
commit: reader.commit(next).unwrap(),
|
|
|
|
});
|
2024-04-18 01:54:10 +00:00
|
|
|
}
|
2024-07-16 23:42:15 +00:00
|
|
|
let batch = HeartbeatBatch { blocks, timestamp: msg_time };
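// Send the SCALE-encoded batch back to the requester as a Block response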
|
|
|
|
|
|
|
|
p2p
|
|
|
|
.send(msg.sender, ReqResMessageKind::Block(genesis), batch.encode())
|
|
|
|
.await;
|
2024-04-18 01:54:10 +00:00
|
|
|
}
|
2023-10-14 18:56:02 +00:00
|
|
|
});
|
2023-10-14 02:40:11 +00:00
|
|
|
}
|
|
|
|
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::ReqRes(ReqResMessageKind::Block(msg_genesis)) => {
|
2023-10-14 18:56:02 +00:00
|
|
|
assert_eq!(msg_genesis, genesis);
|
2024-07-16 23:42:15 +00:00
|
|
|
// Decode the batch of blocks sent in response to our heartbeat
|
|
|
|
let Ok(batch) = HeartbeatBatch::decode(&mut msg.msg.as_ref()) else {
|
|
|
|
log::error!(
|
|
|
|
"received HeartBeatBatch message with an invalidly serialized batch"
|
|
|
|
);
|
2023-10-14 18:56:02 +00:00
|
|
|
continue;
|
|
|
|
};
|
2024-07-16 23:42:15 +00:00
|
|
|
|
|
|
|
// Sync each block in the batch
|
|
|
|
for bc in batch.blocks {
|
|
|
|
// TODO: why do we use ReadWrite instead of Encode/Decode for blocks?
|
|
|
|
// Should we use the same for batches so we can read both at the same time?
|
|
|
|
let Ok(block) = Block::<Transaction>::read(&mut bc.block.as_slice()) else {
|
|
|
|
log::error!("received block message with an invalidly serialized block");
|
|
|
|
continue;
|
|
|
|
};
|
|
|
|
|
|
|
|
let res = tributary.tributary.sync_block(block, bc.commit).await;
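// sync_block returns whether the block was successfully added to our Tributary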
|
|
|
|
log::debug!(
|
|
|
|
"received block from {:?}, sync_block returned {}",
|
|
|
|
msg.sender,
|
|
|
|
res
|
|
|
|
);
|
|
|
|
}
|
2023-10-14 18:56:02 +00:00
|
|
|
}
|
2023-11-15 21:57:21 +00:00
|
|
|
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::Gossip(GossipMessageKind::Tributary(msg_genesis)) => {
|
|
|
|
assert_eq!(msg_genesis, genesis);
|
|
|
|
log::trace!("handling message for tributary {:?}", spec_set);
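// handle_message returns true when the message merits rebroadcasting to our peers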
|
|
|
|
if tributary.tributary.handle_message(&msg.msg).await {
|
|
|
|
P2p::broadcast(&p2p, msg.kind, msg.msg).await;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
P2pMessageKind::Gossip(GossipMessageKind::CosignedBlock) => unreachable!(),
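// Cosigned blocks are handled by the top-level receive loop and never forwarded to
// per-Tributary channels, hence unreachable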
|
2023-10-14 18:56:02 +00:00
|
|
|
}
|
2023-10-14 02:40:11 +00:00
|
|
|
}
|
|
|
|
}
|
2023-10-14 18:56:02 +00:00
|
|
|
});
|
2023-10-14 02:40:11 +00:00
|
|
|
}
|
2023-10-14 20:47:25 +00:00
|
|
|
TributaryEvent::TributaryRetired(set) => {
|
|
|
|
if let Some(genesis) = set_to_genesis.remove(&set) {
|
2023-12-23 02:09:18 +00:00
|
|
|
p2p.unsubscribe(set, genesis).await;
|
2023-10-14 20:47:25 +00:00
|
|
|
channels.write().await.remove(&genesis);
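// Dropping the sender closes the channel, letting the per-Tributary handler observe the
// closure and break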
|
|
|
|
}
|
|
|
|
}
|
2023-10-14 18:56:02 +00:00
|
|
|
}
|
2023-10-14 02:40:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
});
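// Route messages received off the P2P network: Tributary-scoped messages go to their
// per-Tributary handler, while cosigned blocks are forwarded over the cosign channel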
|
|
|
|
|
|
|
|
loop {
|
|
|
|
let msg = p2p.receive().await;
|
|
|
|
match msg.kind {
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::ReqRes(ReqResMessageKind::KeepAlive) => {}
|
|
|
|
P2pMessageKind::Gossip(GossipMessageKind::Tributary(genesis)) |
|
|
|
|
P2pMessageKind::ReqRes(
|
|
|
|
ReqResMessageKind::Heartbeat(genesis) | ReqResMessageKind::Block(genesis),
|
|
|
|
) => {
|
2023-10-14 02:40:11 +00:00
|
|
|
if let Some(channel) = channels.read().await.get(&genesis) {
|
|
|
|
channel.send(msg).unwrap();
|
|
|
|
}
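// (Messages for Tributaries we aren't tracking, or which have already retired, are dropped)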
|
|
|
|
}
|
2024-04-23 10:37:41 +00:00
|
|
|
P2pMessageKind::Gossip(GossipMessageKind::CosignedBlock) => {
|
2023-12-12 17:28:53 +00:00
|
|
|
let Ok(msg) = CosignedBlock::deserialize_reader(&mut msg.msg.as_slice()) else {
|
2023-11-15 21:57:21 +00:00
|
|
|
log::error!("received CosignedBlock message with invalidly serialized contents");
|
|
|
|
continue;
|
|
|
|
};
|
|
|
|
cosign_channel.send(msg).unwrap();
|
|
|
|
}
|
2023-10-14 02:40:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|