use core::ops::Deref;
use std::{
  sync::{OnceLock, Arc},
  time::Duration,
  collections::{VecDeque, HashSet, HashMap},
};

use zeroize::{Zeroize, Zeroizing};
use rand_core::OsRng;

use ciphersuite::{
  group::ff::{Field, PrimeField},
  Ciphersuite, Ristretto,
};
use schnorr::SchnorrSignature;
use frost::Participant;

use serai_db::{DbTxn, Db};
use serai_env as env;

use scale::Encode;
use serai_client::{
  primitives::NetworkId,
  validator_sets::primitives::{Session, ValidatorSet, KeyPair},
  Public, Serai, SeraiInInstructions,
};

use message_queue::{Service, client::MessageQueue};

use tokio::{
|
Add a cosigning protocol to ensure finalizations are unique (#433)
* Add a function to deterministically decide which Serai blocks should be co-signed
Has a 5 minute latency between co-signs, also used as the maximal latency
before a co-sign is started.
* Get all active tributaries we're in at a specific block
* Add and route CosignSubstrateBlock, a new provided TX
* Split queued cosigns per network
* Rename BatchSignId to SubstrateSignId
* Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it
* Handle the CosignSubstrateBlock provided TX
* Revert substrate_signer.rs to develop (and patch to still work)
Due to SubstrateSigner moving when the prior multisig closes, yet cosigning
occurring with the most recent key, a single SubstrateSigner can be reused.
We could manage multiple SubstrateSigners, yet considering the much lower
specifications for cosigning, I'd rather treat it distinctly.
* Route cosigning through the processor
* Add note to rename SubstrateSigner post-PR
I don't want to do so now in order to preserve the diff's clarity.
* Implement cosign evaluation into the coordinator
* Get tests to compile
* Bug fixes, mark blocks without cosigners available as cosigned
* Correct the ID Batch preprocesses are saved under, add log statements
* Create a dedicated function to handle cosigns
* Correct the flow around Batch verification/queueing
Verifying `Batch`s could stall when a `Batch` was signed before its
predecessors/before the block it's contained in was cosigned (the latter being
inevitable as we can't sign a block containing a signed batch before signing
the batch).
Now, Batch verification happens on a distinct async task in order to not block
the handling of processor messages. This task is the sole caller of verify in
order to ensure last_verified_batch isn't unexpectedly mutated.
When the processor message handler needs to access it, or needs to queue a
Batch, it associates the DB TXN with a lock preventing the other task from
doing so.
This lock, as currently implemented, is a poor and inefficient design. It
should be modified to the pattern used for cosign management. Additionally, a
new primitive of a DB-backed channel may be immensely valuable.
Fixes a standing potential deadlock and a deadlock introduced with the
cosigning protocol.
* Working full-stack tests
After the last commit, this only required extending a timeout.
* Replace "co-sign" with "cosign" to make finding text easier
* Update the coordinator tests to support cosigning
* Inline prior_batch calculation to prevent panic on rotation
Noticed when doing a final review of the branch.
2023-11-15 21:57:21 +00:00
|
|
|
sync::{Mutex, RwLock, mpsc, broadcast},
|
2023-09-25 22:23:39 +00:00
|
|
|
time::sleep,
|
|
|
|
};

use ::tributary::{
  ProvidedError, TransactionKind, TransactionError, TransactionTrait, Block, Tributary,
};

mod tributary;
use crate::tributary::{
  TributarySpec, SignData, Transaction, NonceDecider, scanner::RecognizedIdType, PlanIds,
};

mod db;
use db::MainDb;

mod p2p;
pub use p2p::*;

use processor_messages::{
  key_gen, sign,
  coordinator::{self, SubstrateSignableId},
  ProcessorMessage,
};

pub mod processors;
use processors::Processors;

mod substrate;
use substrate::{CosignTransactions, SubstrateDb};

mod cosign_evaluator;
use cosign_evaluator::CosignEvaluator;

#[cfg(test)]
pub mod tests;

#[derive(Clone)]
pub struct ActiveTributary<D: Db, P: P2p> {
  pub spec: TributarySpec,
  pub tributary: Arc<Tributary<D, Transaction, P>>,
}

#[derive(Clone)]
pub enum TributaryEvent<D: Db, P: P2p> {
  NewTributary(ActiveTributary<D, P>),
  TributaryRetired(ValidatorSet),
}

// Creates a new tributary and sends it to all listeners.
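// Specifically: spawns the Tributary, has the processor begin the DKG for this set, and
// broadcasts the new ActiveTributary as a TributaryEvent to every registered listener.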
async fn add_tributary<D: Db, Pro: Processors, P: P2p>(
  db: D,
  key: Zeroizing<<Ristretto as Ciphersuite>::F>,
  processors: &Pro,
  p2p: P,
  tributaries: &broadcast::Sender<TributaryEvent<D, P>>,
  spec: TributarySpec,
) {
  if MainDb::<D>::is_tributary_retired(&db, spec.set()) {
    log::info!("not adding tributary {:?} since it's been retired", spec.set());
    return;
  }

  log::info!("adding tributary {:?}", spec.set());

  let tributary = Tributary::<_, Transaction, _>::new(
    // TODO2: Use a db on a distinct volume to protect against DoS attacks
    // TODO2: Delete said db once the Tributary is dropped
    db,
    spec.genesis(),
    spec.start_time(),
    key.clone(),
    spec.validators(),
    p2p,
  )
  .await
  .unwrap();

  // Trigger a DKG for the newly added Tributary
  // If we're rebooting, we'll re-fire this message
  // This is safe due to the message-queue deduplicating based off the intent system
  let set = spec.set();
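  // our_i is a range, not a single index, as a validator holds one participant index per key
  // share under the multiple-key-shares-per-validator model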
  let our_i = spec
    .i(Ristretto::generator() * key.deref())
    .expect("adding a tributary for a set we aren't in");
  processors
    .send(
      set.network,
      processor_messages::key_gen::CoordinatorMessage::GenerateKey {
        id: processor_messages::key_gen::KeyGenId { set, attempt: 0 },
        params: frost::ThresholdParams::new(spec.t(), spec.n(), our_i.start).unwrap(),
        shares: u16::from(our_i.end) - u16::from(our_i.start),
      },
    )
    .await;

  tributaries
    .send(TributaryEvent::NewTributary(ActiveTributary { spec, tributary: Arc::new(tributary) }))
    .map_err(|_| "all ActiveTributary recipients closed")
    .unwrap();
}
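
// Publishes a signed Tributary transaction, along with any queued transactions by the same
// signer which become publishable in nonce order.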
async fn publish_signed_transaction<D: Db, P: P2p>(
  txn: &mut D::Transaction<'_>,
  tributary: &Tributary<D, Transaction, P>,
  tx: Transaction,
) {
  log::debug!("publishing transaction {}", hex::encode(tx.hash()));

  let signer = if let TransactionKind::Signed(signed) = tx.kind() {
    let signer = signed.signer;

    // Safe as we should deterministically create transactions, meaning if this is already
    // on-disk, it's what we're saving now
    MainDb::<D>::save_signed_transaction(txn, signed.nonce, tx);

    signer
  } else {
    panic!("non-signed transaction passed to publish_signed_transaction");
  };

  // If we're trying to publish transaction 5 when the last published was 3, this delays
  // publication until transaction 4 has been published
  while let Some(tx) = MainDb::<D>::take_signed_transaction(
    txn,
    tributary
      .next_nonce(signer)
      .await
      .expect("we don't have a nonce, meaning we aren't a participant on this tributary"),
  ) {
    // We need add_transaction to return a proper error to cleanly handle this, due to a race
    // condition around multiple publications
    match tributary.add_transaction(tx.clone()).await {
      Ok(_) => {}
      // Some asynchronicity is tolerable if this is InvalidNonce, assumed safe due to
      // deterministic nonces
      Err(TransactionError::InvalidNonce) => {
        log::warn!("publishing TX {tx:?} returned InvalidNonce. was it already added?")
      }
      Err(e) => panic!("created an invalid transaction: {e:?}"),
    }
  }
}

// TODO: Find a better pattern for this
static HANDOVER_VERIFY_QUEUE_LOCK: OnceLock<Mutex<()>> = OnceLock::new();
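// This lock is taken by both the processor-message handler below and the task verifying
// handed-over Batches, so only one of them holds an open DB TXN into that queue at a time.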

#[allow(clippy::too_many_arguments)]
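// Returns true if the message was handled (or was previously handled), and false if handling
// must be retried later, e.g. because a needed Tributary hasn't been added yet.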
async fn handle_processor_message<D: Db, P: P2p>(
  db: &mut D,
  key: &Zeroizing<<Ristretto as Ciphersuite>::F>,
  serai: &Serai,
  p2p: &P,
  cosign_channel: &mpsc::UnboundedSender<CosignedBlock>,
  tributaries: &HashMap<Session, ActiveTributary<D, P>>,
  network: NetworkId,
  msg: &processors::Message,
) -> bool {
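  // Don't re-handle a message we've already handled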
  if MainDb::<D>::handled_message(db, msg.network, msg.id) {
    return true;
  }

  let _hvq_lock = HANDOVER_VERIFY_QUEUE_LOCK.get_or_init(|| Mutex::new(())).lock().await;
  let mut txn = db.txn();
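
  // Determine which Tributary, if any, this message is relevant to. Some(session) means that
  // Tributary must be present before the message is handled; None means no Tributary has
  // pending action (any required work is performed inline below)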
  let mut relevant_tributary = match &msg.msg {
    // We'll only receive these if we fired GenerateKey, which we'll only do if we're in-set,
    // making the Tributary relevant
    ProcessorMessage::KeyGen(inner_msg) => match inner_msg {
      key_gen::ProcessorMessage::Commitments { id, .. } => Some(id.set.session),
      key_gen::ProcessorMessage::InvalidCommitments { id, .. } => Some(id.set.session),
      key_gen::ProcessorMessage::Shares { id, .. } => Some(id.set.session),
      key_gen::ProcessorMessage::InvalidShare { id, .. } => Some(id.set.session),
      key_gen::ProcessorMessage::GeneratedKeyPair { id, .. } => Some(id.set.session),
      key_gen::ProcessorMessage::Blame { id, .. } => Some(id.set.session),
    },
    // TODO: Review replacing key with Session in messages?
    ProcessorMessage::Sign(inner_msg) => match inner_msg {
      // We'll only receive InvalidParticipant/Preprocess/Share if we're actively signing
      sign::ProcessorMessage::InvalidParticipant { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      sign::ProcessorMessage::Preprocess { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      sign::ProcessorMessage::Share { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      // While the Processor's Scanner will always emit Completed, that's routed through the
      // Signer and only becomes a ProcessorMessage::Completed if the Signer is present and
      // confirms it
      sign::ProcessorMessage::Completed { key, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, key).unwrap())
      }
    },
    ProcessorMessage::Coordinator(inner_msg) => match inner_msg {
      // This is a special case as it's relevant to *all* Tributaries for this network
      // It doesn't return a Tributary to become `relevant_tributary` though
      coordinator::ProcessorMessage::SubstrateBlockAck { network, block, plans } => {
        assert_eq!(
          *network, msg.network,
          "processor claimed to be a different network than it was for SubstrateBlockAck",
        );

        // Get the sessions for these keys
        let keys = plans.iter().map(|plan| plan.key.clone()).collect::<HashSet<_>>();
        let mut sessions = vec![];
        for key in keys {
          let session = SubstrateDb::<D>::session_for_key(&txn, &key).unwrap();
          // Only keep them if we're in the Tributary AND they haven't been retired
          let set = ValidatorSet { network: *network, session };
          if MainDb::<D>::in_tributary(&txn, set) && (!MainDb::<D>::is_tributary_retired(&txn, set))
          {
            sessions.push((session, key));
          }
        }

        // Ensure we have the Tributaries
        for (session, _) in &sessions {
          if !tributaries.contains_key(session) {
            return false;
          }
        }
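
        // We now have every needed Tributary, so we can provide the SubstrateBlock transactions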
        for (session, key) in sessions {
          let tributary = &tributaries[&session];
          let plans = plans
            .iter()
            .filter_map(|plan| Some(plan.id).filter(|_| plan.key == key))
            .collect::<Vec<_>>();
          PlanIds::set(&mut txn, &tributary.spec.genesis(), *block, &plans);

          let tx = Transaction::SubstrateBlock(*block);
          log::trace!(
            "processor message effected transaction {} {:?}",
            hex::encode(tx.hash()),
            &tx
          );
          log::trace!("providing transaction {}", hex::encode(tx.hash()));
          let res = tributary.tributary.provide_transaction(tx).await;
          if !(res.is_ok() || (res == Err(ProvidedError::AlreadyProvided))) {
            if res == Err(ProvidedError::LocalMismatchesOnChain) {
              // Spin, since this is a crit for this Tributary
              loop {
                log::error!(
                  "{}. tributary: {}, provided: SubstrateBlock({})",
                  "tributary added distinct provided to delayed locally provided TX",
                  hex::encode(tributary.spec.genesis()),
                  block,
                );
                sleep(Duration::from_secs(60)).await;
              }
            }
            panic!("provided an invalid transaction: {res:?}");
          }
        }

        None
      }
      // We'll only fire these if we are the Substrate signer, making the Tributary relevant
      coordinator::ProcessorMessage::InvalidParticipant { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      coordinator::ProcessorMessage::CosignPreprocess { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      coordinator::ProcessorMessage::BatchPreprocess { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      coordinator::ProcessorMessage::SubstrateShare { id, .. } => {
        Some(SubstrateDb::<D>::session_for_key(&txn, &id.key).unwrap())
      }
      coordinator::ProcessorMessage::CosignedBlock { block_number, block, signature } => {
        let cosigned_block = CosignedBlock {
          network,
          block_number: *block_number,
          block: *block,
          signature: {
            let mut arr = [0; 64];
            arr.copy_from_slice(signature);
            arr
          },
        };
        cosign_channel.send(cosigned_block).unwrap();
        P2p::broadcast(p2p, P2pMessageKind::CosignedBlock, cosigned_block.encode()).await;
        None
      }
    },
    // These don't return a relevant Tributary as there's no Tributary with action expected
    ProcessorMessage::Substrate(inner_msg) => match inner_msg {
      processor_messages::substrate::ProcessorMessage::Batch { batch } => {
        assert_eq!(
          batch.network, msg.network,
          "processor sent us a batch for a different network than it was for",
        );
        MainDb::<D>::save_expected_batch(&mut txn, batch);
        None
      }
      // If this is a new Batch, immediately publish it (if we can)
      processor_messages::substrate::ProcessorMessage::SignedBatch { batch } => {
        assert_eq!(
          batch.batch.network, msg.network,
          "processor sent us a signed batch for a different network than it was for",
        );

        log::debug!("received batch {:?} {}", batch.batch.network, batch.batch.id);

        // Save this batch to the disk
        MainDb::<D>::save_batch(&mut txn, batch.clone());

        // Get the next-to-execute batch ID
        let mut next = substrate::get_expected_next_batch(serai, network).await;
|
Start moving Coordinator to a multi-Tributary model
Prior, we only supported a single Tributary per network, and spawned a task to
handled Processor messages per Tributary. Now, we handle Processor messages per
network, yet we still only supported a single Tributary in that handling
function.
Now, when we handle a message, we load the Tributary which is relevant. Once we
know it, we ensure we have it (preventing race conditions), and then proceed.
We do need work to check if we should have a Tributary, or if we're not
participating. We also need to check if a Tributary has been retired, meaning
we shouldn't handle any transactions related to them, and to clean up retired
Tributaries.
2023-09-27 22:20:36 +00:00
|
|
|
|
2023-10-14 03:36:07 +00:00
|
|
|
// Since we have a new batch, publish all batches yet to be published to Serai
|
|
|
|
// This handles the edge-case where batch n+1 is signed before batch n is
|
|
|
|
let mut batches = VecDeque::new();
|
|
|
|
while let Some(batch) = MainDb::<D>::batch(&txn, network, next) {
|
|
|
|
batches.push_back(batch);
|
|
|
|
next += 1;
|
|
|
|
}
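        // e.g. if batch n+1 finishes signing while batch n is still being signed, n+1 simply
        // sits in the DB; once n arrives, the collection above picks up both, and the loop
        // below publishes them in order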

        while let Some(batch) = batches.pop_front() {
          // If this Batch should no longer be published, continue
          if substrate::get_expected_next_batch(serai, network).await > batch.batch.id {
            continue;
          }

          let tx = SeraiInInstructions::execute_batch(batch.clone());
          log::debug!("attempting to publish batch {:?} {}", batch.batch.network, batch.batch.id);
          // This publish may fail if this transaction already exists in the mempool, which is
          // possible, or if this batch was already executed on-chain
          // Either case will have eventual resolution and be handled by the above check on if
          // this batch should execute
          let res = serai.publish(&tx).await;
          if res.is_ok() {
            log::info!(
              "published batch {network:?} {} (block {})",
              batch.batch.id,
              hex::encode(batch.batch.block),
            );
          } else {
            log::debug!(
              "couldn't publish batch {:?} {}: {:?}",
              batch.batch.network,
              batch.batch.id,
              res,
            );
            // If we failed to publish it, restore it
            batches.push_front(batch);
            // Sleep for a few seconds before retrying to prevent hammering the node
            sleep(Duration::from_secs(5)).await;
          }
        }
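        // Re-fetching the expected next batch at the top of each iteration also covers a batch
        // being executed on-chain while we sleep: such a batch is skipped on the next pass
        // rather than re-published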

        None
      }
    },
  };

  // If we have a relevant Tributary, check it's actually still relevant and has yet to be retired
  if let Some(relevant_tributary_value) = relevant_tributary {
    if MainDb::<D>::is_tributary_retired(
      &txn,
      ValidatorSet { network: msg.network, session: relevant_tributary_value },
    ) {
      relevant_tributary = None;
    }
  }
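  // Retired Tributaries shouldn't have any further transactions handled for them; clearing the
  // relevance here ensures such late messages are simply acknowledged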

  // If there's a relevant Tributary...
  if let Some(relevant_tributary) = relevant_tributary {
    // Make sure we have it
    // Per the reasoning above, we only return a Tributary as relevant if we're a participant
    // Accordingly, we do *need* to have this Tributary now to handle it UNLESS the Tributary has
    // already completed and this is simply an old message (which we prior checked)
    let Some(ActiveTributary { spec, tributary }) = tributaries.get(&relevant_tributary) else {
      // Since we don't, sleep for a fraction of a second and return false, signaling we didn't
      // handle this message
      // At the start of the loop which calls this function, we'll check for new tributaries,
      // making this eventually resolve
      sleep(Duration::from_millis(100)).await;
      return false;
    };

    let genesis = spec.genesis();
    let pub_key = Ristretto::generator() * key.deref();
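    // `pub_key` is this validator's Ristretto public key; the DKG handling below uses it to
    // resolve our participant indices via `spec.i(pub_key)`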

    let txs = match msg.msg.clone() {
      ProcessorMessage::KeyGen(inner_msg) => match inner_msg {
        key_gen::ProcessorMessage::Commitments { id, commitments } => {
          vec![Transaction::DkgCommitments(id.attempt, commitments, Transaction::empty_signed())]
        }

        key_gen::ProcessorMessage::InvalidCommitments { id: _, faulty } => {
          // This doesn't need the ID since it's a Provided transaction which everyone will provide
          // With this provision comes explicit ordering (with regards to other RemoveParticipant
          // transactions) and group consensus
          // Accordingly, this can't be replayed
          // It could be included on-chain early/late with regards to the chain's active attempt,
          // which attempt scheduling is written to avoid
          vec![Transaction::RemoveParticipant(faulty)]
        }

        key_gen::ProcessorMessage::Shares { id, mut shares } => {
          // Create a MuSig-based machine to inform Substrate of this key generation
          let nonces = crate::tributary::dkg_confirmation_nonces(key, spec, id.attempt);

          let our_i = spec
            .i(pub_key)
            .expect("processor message to DKG for a session we aren't a validator in");

          // `tx_shares` needs to be done here as, while it can be serialized from the HashMap
          // without further context, it can't be deserialized without context
          let mut tx_shares = Vec::with_capacity(shares.len());
          for shares in &mut shares {
            tx_shares.push(vec![]);
            for i in 1 ..= spec.n() {
              let i = Participant::new(i).unwrap();
              if our_i.contains(&i) {
                if shares.contains_key(&i) {
                  panic!("processor sent us our own shares");
                }
                continue;
              }
              tx_shares.last_mut().unwrap().push(
                shares.remove(&i).expect("processor didn't send share for another validator"),
              );
            }
          }
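          // Resulting shape (as implied by the loop above): one inner Vec per key share we
          // generated, holding the other validators' shares ordered by participant index with
          // our own indices skipped; that implicit ordering is the context required to
          // deserialize `tx_shares` later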

          vec![Transaction::DkgShares {
            attempt: id.attempt,
            shares: tx_shares,
            confirmation_nonces: nonces,
            signed: Transaction::empty_signed(),
          }]
        }

        key_gen::ProcessorMessage::InvalidShare { id, accuser, faulty, blame } => {
          assert_eq!(
            id.set.network, msg.network,
            "processor claimed to be a different network than it was for in InvalidShare",
          );

          // Check if the MuSig signature had any errors; if so, we need to provide
          // RemoveParticipant
          // As for the safety of calling error_generating_key_pair, the processor is presumed
          // to only send InvalidShare or GeneratedKeyPair for a given attempt
          let mut txs = if let Some(faulty) =
            crate::tributary::error_generating_key_pair::<_>(&txn, key, spec, id.attempt)
          {
            vec![Transaction::RemoveParticipant(faulty)]
          } else {
            vec![]
          };

          txs.push(Transaction::InvalidDkgShare {
            attempt: id.attempt,
            accuser,
            faulty,
            blame,
            signed: Transaction::empty_signed(),
          });

          txs
        }

        key_gen::ProcessorMessage::GeneratedKeyPair { id, substrate_key, network_key } => {
          assert_eq!(
            id.set.network, msg.network,
            "processor claimed to be a different network than it was for in GeneratedKeyPair",
          );
          // TODO2: Also check the other KeyGenId fields

          // Tell the Tributary the key pair, get back the share for the MuSig signature
          let share = crate::tributary::generated_key_pair::<D>(
            &mut txn,
            key,
            spec,
            &KeyPair(Public(substrate_key), network_key.try_into().unwrap()),
            id.attempt,
          );

          match share {
            Ok(share) => {
              vec![Transaction::DkgConfirmed(id.attempt, share, Transaction::empty_signed())]
            }
            Err(p) => {
              vec![Transaction::RemoveParticipant(p)]
            }
          }
        }

        key_gen::ProcessorMessage::Blame { id, participant } => {
          assert_eq!(
            id.set.network, msg.network,
            "processor claimed to be a different network than it was for in Blame",
          );
          vec![Transaction::RemoveParticipant(participant)]
        }
      },

      ProcessorMessage::Sign(msg) => match msg {
        sign::ProcessorMessage::InvalidParticipant { .. } => {
          // TODO: Locally increase slash points to maximum (distinct from an explicitly fatal
          // slash) and censor transactions (yet don't explicitly ban)
          vec![]
        }

        sign::ProcessorMessage::Preprocess { id, preprocesses } => {
          if id.attempt == 0 {
            MainDb::<D>::save_first_preprocess(
              &mut txn,
              network,
              RecognizedIdType::Plan,
              &id.id,
              preprocesses,
            );

            vec![]
          } else {
            vec![Transaction::SignPreprocess(SignData {
              plan: id.id,
              attempt: id.attempt,
              data: preprocesses,
              signed: Transaction::empty_signed(),
            })]
          }
        }
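        // For attempt 0 above, the first preprocess is saved rather than immediately turned
        // into a transaction: the Tributary is expected to first recognize the plan's ID
        // on-chain, with the saved preprocess then used for the resulting SignPreprocess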

        sign::ProcessorMessage::Share { id, shares } => {
          vec![Transaction::SignShare(SignData {
            plan: id.id,
            attempt: id.attempt,
            data: shares,
            signed: Transaction::empty_signed(),
          })]
        }

        sign::ProcessorMessage::Completed { key: _, id, tx } => {
          let r = Zeroizing::new(<Ristretto as Ciphersuite>::F::random(&mut OsRng));
          #[allow(non_snake_case)]
          let R = <Ristretto as Ciphersuite>::generator() * r.deref();
          let mut tx = Transaction::SignCompleted {
            plan: id,
            tx_hash: tx,
            first_signer: pub_key,
            signature: SchnorrSignature { R, s: <Ristretto as Ciphersuite>::F::ZERO },
          };
          let signed = SchnorrSignature::sign(key, r, tx.sign_completed_challenge());
          match &mut tx {
            Transaction::SignCompleted { signature, .. } => {
              *signature = signed;
            }
            _ => unreachable!(),
          }
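          // Two-phase construction: the TX is first built with the real nonce commitment `R`
          // and a zero `s`, the challenge is derived from that TX via
          // sign_completed_challenge(), and the completed signature is then patched in over
          // the placeholder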
          vec![tx]
        }
      },

      ProcessorMessage::Coordinator(inner_msg) => match inner_msg {
        coordinator::ProcessorMessage::SubstrateBlockAck { .. } => unreachable!(),
        coordinator::ProcessorMessage::InvalidParticipant { .. } => {
          // TODO: Locally increase slash points to maximum (distinct from an explicitly fatal
          // slash) and censor transactions (yet don't explicitly ban)
          vec![]
        }

        coordinator::ProcessorMessage::CosignPreprocess { id, preprocesses } => {
          vec![Transaction::SubstratePreprocess(SignData {
            plan: id.id,
            attempt: id.attempt,
            data: preprocesses.into_iter().map(Into::into).collect(),
            signed: Transaction::empty_signed(),
          })]
        }

        coordinator::ProcessorMessage::BatchPreprocess { id, block, preprocesses } => {
          log::info!(
            "informed of batch (sign ID {}, attempt {}) for block {}",
            hex::encode(id.id.encode()),
            id.attempt,
            hex::encode(block),
          );

          // If this is the first attempt instance, wait until we synchronize around the batch
          // first
          if id.attempt == 0 {
            MainDb::<D>::save_first_preprocess(
              &mut txn,
              spec.set().network,
              RecognizedIdType::Batch,
              &{
                let SubstrateSignableId::Batch(id) = id.id else {
                  panic!("BatchPreprocess SubstrateSignableId wasn't Batch")
                };
                id.encode()
              },
              preprocesses.into_iter().map(Into::into).collect(),
            );

            let intended = Transaction::Batch(
              block.0,
              match id.id {
                SubstrateSignableId::Batch(id) => id,
                _ => panic!("BatchPreprocess did not contain Batch ID"),
              },
            );
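            // `intended` is the Batch transaction we'd provide to the Tributary now; the
            // handover check below decides whether it's provided immediately or queued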

            // If this is the new key's first Batch, only create this TX once we verify all
            // prior published `Batch`s
            // TODO: This assumes BatchPreprocess is immediately after Batch
            // Ensure that assumption
            let last_received = MainDb::<D>::last_received_batch(&txn, msg.network).unwrap();
            let handover_batch = MainDb::<D>::handover_batch(&txn, spec.set());
            let mut queue = false;
            if let Some(handover_batch) = handover_batch {
              // There is a race condition here. We may verify all `Batch`s from the prior set,
              // start signing the handover `Batch` `n`, start signing `n+1`, have `n+1` signed
              // before `n` (or at the same time), yet then the prior set forges a malicious
              // `Batch` `n`.
              //
              // The malicious `Batch` `n` would be publishable to Serai, as Serai can't
              // distinguish what's intended to be a handover `Batch`, yet then anyone could
              // publish the new set's `n+1`, causing their acceptance of the handover.
              //
              // To fix this, if this is after the handover `Batch` and we have yet to verify
              // publication of the handover `Batch`, don't yet yield the provided.
              if last_received > handover_batch {
                if let Some(last_verified) = MainDb::<D>::last_verified_batch(&txn, msg.network) {
                  if last_verified < handover_batch {
                    queue = true;
                  }
|
Add a cosigning protocol to ensure finalizations are unique (#433)
* Add a function to deterministically decide which Serai blocks should be co-signed
Has a 5 minute latency between co-signs, also used as the maximal latency
before a co-sign is started.
* Get all active tributaries we're in at a specific block
* Add and route CosignSubstrateBlock, a new provided TX
* Split queued cosigns per network
* Rename BatchSignId to SubstrateSignId
* Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it
* Handle the CosignSubstrateBlock provided TX
* Revert substrate_signer.rs to develop (and patch to still work)
Due to SubstrateSigner moving when the prior multisig closes, yet cosigning
occurring with the most recent key, a single SubstrateSigner can be reused.
We could manage multiple SubstrateSigners, yet considering the much lower
specifications for cosigning, I'd rather treat it distinctly.
* Route cosigning through the processor
* Add note to rename SubstrateSigner post-PR
I don't want to do so now in order to preserve the diff's clarity.
* Implement cosign evaluation into the coordinator
* Get tests to compile
* Bug fixes, mark blocks without cosigners available as cosigned
* Correct the ID Batch preprocesses are saved under, add log statements
* Create a dedicated function to handle cosigns
* Correct the flow around Batch verification/queueing
Verifying `Batch`s could stall when a `Batch` was signed before its
predecessors/before the block it's contained in was cosigned (the latter being
inevitable as we can't sign a block containing a signed batch before signing
the batch).
Now, Batch verification happens on a distinct async task in order to not block
the handling of processor messages. This task is the sole caller of verify in
order to ensure last_verified_batch isn't unexpectedly mutated.
When the processor message handler needs to access it, or needs to queue a
Batch, it associates the DB TXN with a lock preventing the other task from
doing so.
This lock, as currently implemented, is a poor and inefficient design. It
should be modified to the pattern used for cosign management. Additionally, a
new primitive of a DB-backed channel may be immensely valuable.
Fixes a standing potential deadlock and a deadlock introduced with the
cosigning protocol.
* Working full-stack tests
After the last commit, this only required extending a timeout.
* Replace "co-sign" with "cosign" to make finding text easier
* Update the coordinator tests to support cosigning
* Inline prior_batch calculation to prevent panic on rotation
Noticed when doing a final review of the branch.
2023-11-15 21:57:21 +00:00
|
|
|
} else {
|
|
|
|
queue = true;
|
2023-10-13 16:14:59 +00:00
|
|
|
}
|
2023-10-14 03:36:07 +00:00
|
|
|
}
|
Add a cosigning protocol to ensure finalizations are unique (#433)
* Add a function to deterministically decide which Serai blocks should be co-signed
Has a 5 minute latency between co-signs, also used as the maximal latency
before a co-sign is started.
* Get all active tributaries we're in at a specific block
* Add and route CosignSubstrateBlock, a new provided TX
* Split queued cosigns per network
* Rename BatchSignId to SubstrateSignId
* Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it
* Handle the CosignSubstrateBlock provided TX
* Revert substrate_signer.rs to develop (and patch to still work)
Due to SubstrateSigner moving when the prior multisig closes, yet cosigning
occurring with the most recent key, a single SubstrateSigner can be reused.
We could manage multiple SubstrateSigners, yet considering the much lower
specifications for cosigning, I'd rather treat it distinctly.
* Route cosigning through the processor
* Add note to rename SubstrateSigner post-PR
I don't want to do so now in order to preserve the diff's clarity.
* Implement cosign evaluation into the coordinator
* Get tests to compile
* Bug fixes, mark blocks without cosigners available as cosigned
* Correct the ID Batch preprocesses are saved under, add log statements
* Create a dedicated function to handle cosigns
* Correct the flow around Batch verification/queueing
Verifying `Batch`s could stall when a `Batch` was signed before its
predecessors/before the block it's contained in was cosigned (the latter being
inevitable as we can't sign a block containing a signed batch before signing
the batch).
Now, Batch verification happens on a distinct async task in order to not block
the handling of processor messages. This task is the sole caller of verify in
order to ensure last_verified_batch isn't unexpectedly mutated.
When the processor message handler needs to access it, or needs to queue a
Batch, it associates the DB TXN with a lock preventing the other task from
doing so.
This lock, as currently implemented, is a poor and inefficient design. It
should be modified to the pattern used for cosign management. Additionally, a
new primitive of a DB-backed channel may be immensely valuable.
Fixes a standing potential deadlock and a deadlock introduced with the
cosigning protocol.
* Working full-stack tests
After the last commit, this only required extending a timeout.
* Replace "co-sign" with "cosign" to make finding text easier
* Update the coordinator tests to support cosigning
* Inline prior_batch calculation to prevent panic on rotation
Noticed when doing a final review of the branch.
2023-11-15 21:57:21 +00:00
|
|
|
} else {
|
|
|
|
MainDb::<D>::set_handover_batch(&mut txn, spec.set(), last_received);
|
|
|
|
// If this isn't the first batch, meaning we do have to verify all prior batches, and
|
|
|
|
// the prior Batch hasn't been verified yet...
|
|
|
|
if (last_received != 0) &&
|
|
|
|
MainDb::<D>::last_verified_batch(&txn, msg.network)
|
|
|
|
.map(|last_verified| last_verified < (last_received - 1))
|
|
|
|
.unwrap_or(true)
|
|
|
|
{
|
|
|
|
// Withhold this TX until we verify all prior `Batch`s
|
|
|
|
queue = true;
|
Start moving Coordinator to a multi-Tributary model
Prior, we only supported a single Tributary per network, and spawned a task to
handled Processor messages per Tributary. Now, we handle Processor messages per
network, yet we still only supported a single Tributary in that handling
function.
Now, when we handle a message, we load the Tributary which is relevant. Once we
know it, we ensure we have it (preventing race conditions), and then proceed.
We do need work to check if we should have a Tributary, or if we're not
participating. We also need to check if a Tributary has been retired, meaning
we shouldn't handle any transactions related to them, and to clean up retired
Tributaries.
2023-09-27 22:20:36 +00:00
|
|
|
}
|
|
|
|
}
|
2023-10-14 03:36:07 +00:00
|
|
|
|
Add a cosigning protocol to ensure finalizations are unique (#433)
* Add a function to deterministically decide which Serai blocks should be co-signed
Has a 5 minute latency between co-signs, also used as the maximal latency
before a co-sign is started.
* Get all active tributaries we're in at a specific block
* Add and route CosignSubstrateBlock, a new provided TX
* Split queued cosigns per network
* Rename BatchSignId to SubstrateSignId
* Add SubstrateSignableId, a meta-type for either Batch or Block, and modularize around it
* Handle the CosignSubstrateBlock provided TX
* Revert substrate_signer.rs to develop (and patch to still work)
Due to SubstrateSigner moving when the prior multisig closes, yet cosigning
occurring with the most recent key, a single SubstrateSigner can be reused.
We could manage multiple SubstrateSigners, yet considering the much lower
specifications for cosigning, I'd rather treat it distinctly.
* Route cosigning through the processor
* Add note to rename SubstrateSigner post-PR
I don't want to do so now in order to preserve the diff's clarity.
* Implement cosign evaluation into the coordinator
* Get tests to compile
* Bug fixes, mark blocks without cosigners available as cosigned
* Correct the ID Batch preprocesses are saved under, add log statements
* Create a dedicated function to handle cosigns
* Correct the flow around Batch verification/queueing
Verifying `Batch`s could stall when a `Batch` was signed before its
predecessors/before the block it's contained in was cosigned (the latter being
inevitable as we can't sign a block containing a signed batch before signing
the batch).
Now, Batch verification happens on a distinct async task in order to not block
the handling of processor messages. This task is the sole caller of verify in
order to ensure last_verified_batch isn't unexpectedly mutated.
When the processor message handler needs to access it, or needs to queue a
Batch, it associates the DB TXN with a lock preventing the other task from
doing so.
This lock, as currently implemented, is a poor and inefficient design. It
should be modified to the pattern used for cosign management. Additionally, a
new primitive of a DB-backed channel may be immensely valuable.
Fixes a standing potential deadlock and a deadlock introduced with the
cosigning protocol.
* Working full-stack tests
After the last commit, this only required extending a timeout.
* Replace "co-sign" with "cosign" to make finding text easier
* Update the coordinator tests to support cosigning
* Inline prior_batch calculation to prevent panic on rotation
Noticed when doing a final review of the branch.
2023-11-15 21:57:21 +00:00
|
|
|
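// Either queue the intended TX for later or provide it now, preceded by anything
// already queued so ordering is preserved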
if queue {
MainDb::<D>::queue_batch(&mut txn, spec.set(), intended);
vec![]
} else {
// Because this is post-verification of the handover batch, take all queued `Batch`s
// now to ensure we don't provide this before an already queued Batch
// This *may* be an unreachable case due to how last_verified_batch is set, yet it
// doesn't hurt to have as a defensive pattern
let mut res = MainDb::<D>::take_queued_batches(&mut txn, spec.set());
res.push(intended);
res
}
} else {
vec![Transaction::SubstratePreprocess(SignData {
plan: id.id,
attempt: id.attempt,
data: preprocesses.into_iter().map(Into::into).collect(),
signed: Transaction::empty_signed(),
})]
}
}
coordinator::ProcessorMessage::SubstrateShare { id, shares } => {
vec![Transaction::SubstrateShare(SignData {
plan: id.id,
attempt: id.attempt,
data: shares.into_iter().map(|share| share.to_vec()).collect(),
signed: Transaction::empty_signed(),
})]
}
coordinator::ProcessorMessage::CosignedBlock { .. } => unreachable!(),
},
ProcessorMessage::Substrate(inner_msg) => match inner_msg {
processor_messages::substrate::ProcessorMessage::Batch { .. } => unreachable!(),
processor_messages::substrate::ProcessorMessage::SignedBatch { .. } => unreachable!(),
},
};

// If this created transactions, publish them
for mut tx in txs {
log::trace!("processor message effected transaction {} {:?}", hex::encode(tx.hash()), &tx);

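// Provided TXs are given directly to the Tributary, unsigned TXs are added as-is, and
// signed TXs are assigned their next nonce and signed before publication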
match tx.kind() {
TransactionKind::Provided(_) => {
log::trace!("providing transaction {}", hex::encode(tx.hash()));
let res = tributary.provide_transaction(tx.clone()).await;
if !(res.is_ok() || (res == Err(ProvidedError::AlreadyProvided))) {
if res == Err(ProvidedError::LocalMismatchesOnChain) {
// Spin, since this is a crit for this Tributary
loop {
log::error!(
"{}. tributary: {}, provided: {:?}",
"tributary added distinct provided to delayed locally provided TX",
hex::encode(spec.genesis()),
&tx,
);
sleep(Duration::from_secs(60)).await;
}
}
panic!("provided an invalid transaction: {res:?}");
}
}
TransactionKind::Unsigned => {
log::trace!("publishing unsigned transaction {}", hex::encode(tx.hash()));
match tributary.add_transaction(tx.clone()).await {
Ok(_) => {}
Err(e) => panic!("created an invalid unsigned transaction: {e:?}"),
}
}
TransactionKind::Signed(_) => {
log::trace!("getting next nonce for Tributary TX in response to processor message");

let nonce = loop {
let Some(nonce) =
NonceDecider::nonce(&txn, genesis, &tx).expect("signed TX didn't have nonce")
else {
// This can be None if the following events occur, in order:
// 1) We scanned the relevant transaction(s) in a Tributary block
// 2) The processor was sent a message and responded
// 3) The Tributary TXN has yet to be committed
log::warn!("nonce has yet to be saved for processor-instigated transaction");
sleep(Duration::from_millis(100)).await;
continue;
};
break nonce;
};
tx.sign(&mut OsRng, genesis, key, nonce);

publish_signed_transaction(&mut txn, tributary, tx).await;
}
}
}
}

MainDb::<D>::save_handled_message(&mut txn, msg.network, msg.id);
txn.commit();

true
}

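// Dispatch this network's processor messages, tracking its active Tributaries (via
// tributary_event) and only acking a message once it's been successfully handled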
#[allow(clippy::too_many_arguments)]
async fn handle_processor_messages<D: Db, Pro: Processors, P: P2p>(
mut db: D,
key: Zeroizing<<Ristretto as Ciphersuite>::F>,
serai: Arc<Serai>,
mut processors: Pro,
p2p: P,
cosign_channel: mpsc::UnboundedSender<CosignedBlock>,
network: NetworkId,
mut tributary_event: mpsc::UnboundedReceiver<TributaryEvent<D, P>>,
) {
let mut tributaries = HashMap::new();
loop {
match tributary_event.try_recv() {
Ok(event) => match event {
TributaryEvent::NewTributary(tributary) => {
let set = tributary.spec.set();
assert_eq!(set.network, network);
tributaries.insert(set.session, tributary);
}
TributaryEvent::TributaryRetired(set) => {
tributaries.remove(&set.session);
}
},
Err(mpsc::error::TryRecvError::Empty) => {}
Err(mpsc::error::TryRecvError::Disconnected) => {
panic!("handle_processor_messages tributary_event sender closed")
}
}

// TODO: Check this ID is sane (last handled ID or expected next ID)
let Ok(msg) = tokio::time::timeout(Duration::from_secs(1), processors.recv(network)).await
else {
continue;
};
log::trace!("entering handle_processor_message for {:?}", network);
if handle_processor_message(
&mut db,
&key,
&serai,
&p2p,
&cosign_channel,
&tributaries,
network,
&msg,
)
.await
{
processors.ack(msg).await;
}
log::trace!("exited handle_processor_message for {:?}", network);
}
}

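// Provide queued CosignSubstrateBlock and Batch transactions to their Tributaries, once
// the relevant Tributary is active and any prerequisite `Batch`s have been verified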
#[allow(clippy::too_many_arguments)]
async fn handle_cosigns_and_batch_publication<D: Db, P: P2p>(
mut db: D,
network: NetworkId,
mut tributary_event: mpsc::UnboundedReceiver<TributaryEvent<D, P>>,
) {
let mut tributaries = HashMap::new();
'outer: loop {
// TODO: Create a better async flow for this, as this does still hammer this task
tokio::task::yield_now().await;

match tributary_event.try_recv() {
Ok(event) => match event {
TributaryEvent::NewTributary(tributary) => {
let set = tributary.spec.set();
assert_eq!(set.network, network);
tributaries.insert(set.session, tributary);
}
TributaryEvent::TributaryRetired(set) => {
tributaries.remove(&set.session);
}
},
Err(mpsc::error::TryRecvError::Empty) => {}
Err(mpsc::error::TryRecvError::Disconnected) => {
panic!("handle_cosigns_and_batch_publication tributary_event sender closed")
}
}

// Handle pending cosigns
{
let mut txn = db.txn();
while let Some((session, block, hash)) = CosignTransactions::try_recv(&mut txn, network) {
let Some(ActiveTributary { spec, tributary }) = tributaries.get(&session) else {
log::warn!("didn't yet have tributary we're supposed to cosign with");
break;
};
log::info!(
"{network:?} {session:?} cosigning block #{block} (hash {}...)",
hex::encode(&hash[.. 8])
);
let tx = Transaction::CosignSubstrateBlock(hash);
let res = tributary.provide_transaction(tx.clone()).await;
if !(res.is_ok() || (res == Err(ProvidedError::AlreadyProvided))) {
if res == Err(ProvidedError::LocalMismatchesOnChain) {
// Spin, since this is a crit for this Tributary
loop {
log::error!(
"{}. tributary: {}, provided: {:?}",
"tributary added distinct CosignSubstrateBlock",
hex::encode(spec.genesis()),
&tx,
);
sleep(Duration::from_secs(60)).await;
}
}
panic!("provided an invalid CosignSubstrateBlock: {res:?}");
}
}
txn.commit();
}

// Verify any published `Batch`s
{
let _hvq_lock = HANDOVER_VERIFY_QUEUE_LOCK.get_or_init(|| Mutex::new(())).lock().await;
let mut txn = db.txn();
let mut to_publish = vec![];
let start_id = MainDb::<D>::last_verified_batch(&txn, network)
.map(|already_verified| already_verified + 1)
.unwrap_or(0);
if let Some(last_id) =
substrate::verify_published_batches::<D>(&mut txn, network, u32::MAX).await
{
// Check if any of these `Batch`s were a handover `Batch` or the `Batch` before a handover
// `Batch`. If so, we need to publish queued provided `Batch` transactions.
for batch in start_id ..= last_id {
let is_pre_handover = MainDb::<D>::is_handover_batch(&txn, network, batch + 1);
if let Some(set) = is_pre_handover {
let mut queued = MainDb::<D>::take_queued_batches(&mut txn, set);
// is_handover_batch is only set for handover `Batch`s we're participating in, making
// this safe
if queued.is_empty() {
panic!("knew the next Batch was a handover yet didn't queue it");
}

// Only publish the handover Batch
to_publish.push((set.session, queued.remove(0)));
// Re-queue the remaining batches
for remaining in queued {
MainDb::<D>::queue_batch(&mut txn, set, remaining);
}
}

let is_handover = MainDb::<D>::is_handover_batch(&txn, network, batch);
if let Some(set) = is_handover {
for queued in MainDb::<D>::take_queued_batches(&mut txn, set) {
to_publish.push((set.session, queued));
}
}
}
}

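// Provide the now-unblocked Batch transactions to their Tributaries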
for (session, tx) in to_publish {
let Some(ActiveTributary { spec, tributary }) = tributaries.get(&session) else {
log::warn!("didn't yet have tributary we're supposed to provide a queued Batch for");
// Safe since this will drop the txn updating the most recently queued batch
continue 'outer;
};
log::debug!("providing Batch transaction {:?}", &tx);
let res = tributary.provide_transaction(tx.clone()).await;
|
|
|
|
if !(res.is_ok() || (res == Err(ProvidedError::AlreadyProvided))) {
|
|
|
|
if res == Err(ProvidedError::LocalMismatchesOnChain) {
|
|
|
|
// Spin, since this is a crit for this Tributary
|
|
|
|
loop {
|
|
|
|
log::error!(
|
|
|
|
"{}. tributary: {}, provided: {:?}",
|
|
|
|
"tributary added distinct Batch",
|
|
|
|
hex::encode(spec.genesis()),
|
|
|
|
&tx,
|
|
|
|
);
|
|
|
|
sleep(Duration::from_secs(60)).await;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
panic!("provided an invalid Batch: {res:?}");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
txn.commit();
|
|
|
|
}
|
2023-09-27 04:00:31 +00:00
|
|
|
}
|
|
|
|
}
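
// Worked example of the handover flow above (illustrative, not quoted from this
// file): if Batch 10 is the handover Batch for the next validator set, verifying
// Batch 9 (the pre-handover Batch) publishes only queued Batch 10 and re-queues
// 11 onward, while verifying Batch 10 itself releases the rest of that set's
// queue. A handover Batch is thus never provided before its predecessor verifies.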

pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
  db: D,
  key: Zeroizing<<Ristretto as Ciphersuite>::F>,
  serai: Arc<Serai>,
  processors: Pro,
  p2p: P,
  cosign_channel: mpsc::UnboundedSender<CosignedBlock>,
  mut tributary_event: broadcast::Receiver<TributaryEvent<D, P>>,
) {
  let mut channels = HashMap::new();
  for network in serai_client::primitives::NETWORKS {
    if network == NetworkId::Serai {
      continue;
    }
    let (processor_send, processor_recv) = mpsc::unbounded_channel();
    tokio::spawn(handle_processor_messages(
      db.clone(),
      key.clone(),
      serai.clone(),
      processors.clone(),
      p2p.clone(),
      cosign_channel.clone(),
      network,
      processor_recv,
    ));
    let (cosign_send, cosign_recv) = mpsc::unbounded_channel();
    tokio::spawn(handle_cosigns_and_batch_publication(db.clone(), network, cosign_recv));
    channels.insert(network, (processor_send, cosign_send));
  }
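
  // (At this point `channels` maps each external NetworkId to a pair of unbounded
  // senders: one feeding that network's processor-message task, one feeding its
  // cosign/batch-publication task. The loop below forwards every TributaryEvent to
  // both, so each task tracks the network's Tributaries independently.)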

  // Listen to new tributary events
  loop {
    match tributary_event.recv().await.unwrap() {
      TributaryEvent::NewTributary(tributary) => {
        let (c1, c2) = &channels[&tributary.spec.set().network];
        c1.send(TributaryEvent::NewTributary(tributary.clone())).unwrap();
        c2.send(TributaryEvent::NewTributary(tributary)).unwrap();
      }
      TributaryEvent::TributaryRetired(set) => {
        let (c1, c2) = &channels[&set.network];
        c1.send(TributaryEvent::TributaryRetired(set)).unwrap();
        c2.send(TributaryEvent::TributaryRetired(set)).unwrap();
      }
    };
  }
}
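
// A minimal, self-contained sketch of the tokio broadcast fan-out which `run`
// relies on below (an illustration assuming a plain tokio runtime, not code from
// this file): every receiver subscribed before a send observes that send.
//
//   #[tokio::main]
//   async fn main() {
//     let (tx, mut rx1) = tokio::sync::broadcast::channel::<u8>(32);
//     let mut rx2 = tx.subscribe();
//     tx.send(5).unwrap();
//     assert_eq!(rx1.recv().await.unwrap(), 5);
//     assert_eq!(rx2.recv().await.unwrap(), 5);
//   }
//
// A receiver which falls further behind than the channel's capacity (32 here, as
// below) gets `RecvError::Lagged` on its next recv, which the tasks below treat
// as fatal.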

pub async fn run<D: Db, Pro: Processors, P: P2p>(
  raw_db: D,
  key: Zeroizing<<Ristretto as Ciphersuite>::F>,
  p2p: P,
  processors: Pro,
  serai: Serai,
) {
  let serai = Arc::new(serai);

  let (new_tributary_spec_send, mut new_tributary_spec_recv) = mpsc::unbounded_channel();
  // Reload active tributaries from the database
  for spec in MainDb::<D>::active_tributaries(&raw_db).1 {
    new_tributary_spec_send.send(spec).unwrap();
  }

  let (tributary_retired_send, mut tributary_retired_recv) = mpsc::unbounded_channel();

  // Handle new Substrate blocks
  tokio::spawn(crate::substrate::scan_task(
    raw_db.clone(),
    key.clone(),
    processors.clone(),
    serai.clone(),
    new_tributary_spec_send,
    tributary_retired_send,
  ));

  // Handle the Tributaries

  // This should be large enough for an entire rotation of all tributaries
  // If it's too small, the coordinator will fail to boot, which is a decent sanity check
  let (tributary_event, mut tributary_event_listener_1) = broadcast::channel(32);
  let tributary_event_listener_2 = tributary_event.subscribe();
  let tributary_event_listener_3 = tributary_event.subscribe();
  let tributary_event_listener_4 = tributary_event.subscribe();
  let tributary_event_listener_5 = tributary_event.subscribe();
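
  // (Five listeners: one per task spawned below which needs to track Tributaries.
  // A broadcast receiver only observes events sent after it was created, which is
  // why every listener is subscribed here, before any Tributary can be added.)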

  // Emit TributaryEvent::TributaryRetired
  tokio::spawn({
    let tributary_event = tributary_event.clone();
    async move {
      loop {
        let retired = tributary_retired_recv.recv().await.unwrap();
        tributary_event.send(TributaryEvent::TributaryRetired(retired)).map_err(|_| ()).unwrap();
      }
    }
  });

  // Spawn a task to further add Tributaries as needed
  tokio::spawn({
    let raw_db = raw_db.clone();
    let key = key.clone();
    let processors = processors.clone();
    let p2p = p2p.clone();
    async move {
      loop {
        let spec = new_tributary_spec_recv.recv().await.unwrap();
        // Uses an inner task as Tributary::new may take several seconds
        tokio::spawn({
          let raw_db = raw_db.clone();
          let key = key.clone();
          let processors = processors.clone();
          let p2p = p2p.clone();
          let tributary_event = tributary_event.clone();
          async move {
            add_tributary(raw_db, key, &processors, p2p, &tributary_event, spec).await;
          }
        });
      }
    }
  });

  // When we reach synchrony on an event requiring signing, send our preprocess for it
  // TODO: Properly place this into the Tributary scanner, as it's a mess out here
  let recognized_id = {
    let raw_db = raw_db.clone();
    let key = key.clone();

    let tributaries = Arc::new(RwLock::new(HashMap::new()));
    // Spawn a task to maintain a local view of the tributaries for whenever recognized_id is
    // called
    tokio::spawn({
      let tributaries = tributaries.clone();
      let mut set_to_genesis = HashMap::new();
      async move {
        loop {
          match tributary_event_listener_1.recv().await {
            Ok(TributaryEvent::NewTributary(tributary)) => {
              set_to_genesis.insert(tributary.spec.set(), tributary.spec.genesis());
              tributaries.write().await.insert(tributary.spec.genesis(), tributary.tributary);
            }
            Ok(TributaryEvent::TributaryRetired(set)) => {
              if let Some(genesis) = set_to_genesis.remove(&set) {
                tributaries.write().await.remove(&genesis);
              }
            }
            Err(broadcast::error::RecvError::Lagged(_)) => {
              panic!("recognized_id lagged to handle tributary_event")
            }
            Err(broadcast::error::RecvError::Closed) => panic!("tributary_event sender closed"),
          }
        }
      }
    });
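
    // (`set_to_genesis` exists because retirement is keyed by ValidatorSet while
    // the local view is keyed by genesis hash; it translates one to the other so a
    // retired Tributary can be evicted from `tributaries`.)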

    move |set: ValidatorSet, genesis, id_type, id: Vec<u8>, nonce| {
      log::debug!("recognized ID {:?} {}", id_type, hex::encode(&id));
      let mut raw_db = raw_db.clone();
      let key = key.clone();
      let tributaries = tributaries.clone();
      async move {
        // The transactions for these are fired before the preprocesses are actually
        // received/saved, creating a race between Tributary ack and the availability of all
        // Preprocesses
        // This waits until the necessary preprocess is available
        let get_preprocess = |raw_db, id_type, id| async move {
          loop {
            let Some(preprocess) = MainDb::<D>::first_preprocess(raw_db, set.network, id_type, id)
            else {
              log::warn!("waiting for preprocess for recognized ID");
              sleep(Duration::from_millis(100)).await;
              continue;
            };
            return preprocess;
          }
        };
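
        // (A simple 100ms poll: the preprocess is saved by another task once the
        // processor reports it, so, per the race described above, this loop
        // resolves as soon as that write lands.)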

        let mut tx = match id_type {
          RecognizedIdType::Batch => Transaction::SubstratePreprocess(SignData {
            data: get_preprocess(&raw_db, id_type, &id).await,
            plan: SubstrateSignableId::Batch(id.as_slice().try_into().unwrap()),
            attempt: 0,
            signed: Transaction::empty_signed(),
          }),
          RecognizedIdType::Plan => Transaction::SignPreprocess(SignData {
            data: get_preprocess(&raw_db, id_type, &id).await,
            plan: id.try_into().unwrap(),
            attempt: 0,
            signed: Transaction::empty_signed(),
          }),
        };

        tx.sign(&mut OsRng, genesis, &key, nonce);

        let mut first = true;
        loop {
          if !first {
            sleep(Duration::from_millis(100)).await;
          }
          first = false;

          let tributaries = tributaries.read().await;
          let Some(tributary) = tributaries.get(&genesis) else {
            // If we don't have this Tributary because it's retired, break and move on
            if MainDb::<D>::is_tributary_retired(&raw_db, set) {
              break;
            }

            // This may happen if the task above is simply slow
            log::warn!("tributary we don't have yet came to consensus on a Batch");
            continue;
          };
          // This is safe to perform multiple times and solely needs atomicity with regards to
          // itself
          // TODO: Should this not take a txn accordingly? It's best practice to take a txn, yet
          // taking a txn fails to declare its achieved independence
          let mut txn = raw_db.txn();
          publish_signed_transaction(&mut txn, tributary, tx).await;
          txn.commit();
          break;
        }
      }
    }
  };
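
  // (The publish loop above retries every 100ms until the Tributary appears in the
  // local view or is known to be retired, so a signed transaction isn't silently
  // dropped just because its Tributary is still booting.)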

  // Handle new blocks for each Tributary
  {
    let raw_db = raw_db.clone();
    tokio::spawn(tributary::scanner::scan_tributaries_task(
      raw_db,
      key.clone(),
      recognized_id,
      processors.clone(),
      serai.clone(),
      tributary_event_listener_2,
    ));
  }

  // Spawn the heartbeat task, which will trigger syncing if there hasn't been a Tributary block
  // in a while (presumably because we're behind)
  tokio::spawn(p2p::heartbeat_tributaries_task(p2p.clone(), tributary_event_listener_3));

  // Create the Cosign evaluator
  let cosign_channel = CosignEvaluator::new(raw_db.clone(), p2p.clone(), serai.clone());

  // Handle P2P messages
  tokio::spawn(p2p::handle_p2p_task(
    p2p.clone(),
    cosign_channel.clone(),
    tributary_event_listener_4,
  ));
|
2023-04-15 21:38:47 +00:00
|
|
|
|
2023-04-25 07:14:42 +00:00
|
|
|
// Handle all messages from processors
|
  handle_processors(
    raw_db,
    key,
    serai,
    processors,
    p2p,
    cosign_channel,
    tributary_event_listener_5,
  )
  .await;
}

#[tokio::main]
async fn main() {
  // Override the panic handler with one which exits the process if any tokio task panics
  {
    let existing = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |panic| {
      existing(panic);
      const MSG: &str = "exiting the process due to a task panicking";
      println!("{MSG}");
      log::error!("{MSG}");
      std::process::exit(1);
    }));
  }
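
  // Default the log level to info if RUST_LOG wasn't set in the process's environment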
  if std::env::var("RUST_LOG").is_err() {
    std::env::set_var("RUST_LOG", serai_env::var("RUST_LOG").unwrap_or_else(|| "info".to_string()));
  }
  env_logger::init();

  log::info!("starting coordinator service...");
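
  // Open the RocksDB database at the path specified by the DB_PATH environment variable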
  let db = serai_db::new_rocksdb(&env::var("DB_PATH").expect("path to DB wasn't specified"));
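
  // Read the Serai key from the environment, zeroizing each intermediate buffer it passes through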
  let key = {
    let mut key_hex = serai_env::var("SERAI_KEY").expect("Serai key wasn't provided");
    let mut key_vec = hex::decode(&key_hex).map_err(|_| ()).expect("Serai key wasn't hex-encoded");
    key_hex.zeroize();
    if key_vec.len() != 32 {
      key_vec.zeroize();
      panic!("Serai key had an invalid length");
    }
    let mut key_bytes = [0; 32];
    key_bytes.copy_from_slice(&key_vec);
    key_vec.zeroize();
    let key = Zeroizing::new(<Ristretto as Ciphersuite>::F::from_repr(key_bytes).unwrap());
    key_bytes.zeroize();
    key
  };
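
  // Instantiate the libp2p-backed P2P network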
  let p2p = LibP2p::new();
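
  // Connect to the message queue used to communicate with the processors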
  let processors = Arc::new(MessageQueue::from_env(Service::Coordinator));
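
  // Repeatedly attempt to connect to the Serai node, retrying every five seconds on failure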
  let serai = || async {
    loop {
      let Ok(serai) = Serai::new(&format!(
        "ws://{}:9944",
        serai_env::var("SERAI_HOSTNAME").expect("Serai hostname wasn't provided")
      ))
      .await
      else {
        log::error!("couldn't connect to the Serai node");
        sleep(Duration::from_secs(5)).await;
        continue;
      };
      log::info!("made initial connection to Serai node");
      return serai;
    }
  };
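
  // Run the coordinator with the freshly opened DB, key, P2P network, processor connection, and
  // Serai client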
  run(db, key, p2p, processors, serai().await).await
}