Dedicated tasks per-Processor in coordinator

This isn't meaningful yet, as we still have serialized reading messages from Processors, yet is a step closer.
2025-03-25 00:28:52 +00:00 · 2023-09-25 21:54:52 -04:00 · 2023-09-25 21:54:52 -04:00 · e1801b57c9
commit e1801b57c9
parent 60491a091f
1 changed files with 360 additions and 314 deletions
--- a/coordinator/src/main.rs
+++ b/coordinator/src/main.rs
@ -175,9 +175,8 @@ pub async fn scan_substrate<D: Db, Pro: Processors>(
        // Save it to the database
        MainDb::new(db).add_active_tributary(&spec);
-        // Add it to the queue
+        // If we reboot before this is read, the fact it was saved to the database means it'll be
-        // If we reboot before this is read from the queue, the fact it was saved to the database
+        // handled on reboot
        // means it'll be handled on reboot
        new_tributary_spec.send(spec).unwrap();
      },
      &processors,
@ -416,8 +415,8 @@ pub async fn handle_p2p<D: Db, P: P2p>(
            // Have up to three nodes respond
            let responders = u64::from(tributary.spec.n().min(3));
-            // Decide which nodes will respond by using the latest block's hash as a mutually agreed
+            // Decide which nodes will respond by using the latest block's hash as a mutually
-            // upon entropy source
+            // agreed upon entropy source
            // This isn't a secure source of entropy, yet it's fine for this
            let entropy = u64::from_le_bytes(tributary_read.tip().await[.. 8].try_into().unwrap());
            // If n = 10, responders = 3, we want start to be 0 ..= 7 (so the highest is 7, 8, 9)
@ -502,7 +501,7 @@ pub async fn publish_signed_transaction<D: Db, P: P2p>(
 }
 pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
-  mut db: D,
+  db: D,
  key: Zeroizing<<Ristretto as Ciphersuite>::F>,
  serai: Arc<Serai>,
  mut processors: Pro,
@ -510,49 +509,41 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
 ) {
  let pub_key = Ristretto::generator() * key.deref();
-  let mut tributaries = HashMap::new();
+  let channels = Arc::new(RwLock::new(HashMap::new()));
  tokio::spawn({
    let processors = processors.clone();
    let channels = channels.clone();
    async move {
      loop {
-    while let Ok(tributary) = {
+        let channels = channels.clone();
-      match new_tributary.try_recv() {
+        let ActiveTributary { spec, tributary } = new_tributary.recv().await.unwrap();
        Ok(tributary) => Ok(tributary),
        Err(broadcast::error::TryRecvError::Empty) => Err(()),
        Err(broadcast::error::TryRecvError::Lagged(_)) => {
          panic!("handle_processors lagged to handle new_tributary")
        }
        Err(broadcast::error::TryRecvError::Closed) => panic!("new_tributary sender closed"),
      }
    } {
      tributaries.insert(tributary.spec.genesis(), tributary);
    }
    // TODO: Dispatch this message to a task dedicated to handling this processor, preventing one
    // processor from holding up all the others. This would require a peek method be added to the
    // message-queue (to view multiple future messages at once)
    // TODO: Do we handle having handled a message, by DB, yet having rebooted before `ack`ing it?
    // Does the processor?
    let msg = processors.recv().await;
    // TODO2: This is slow, and only works as long as a network only has a single Tributary
    // (which means there's a lack of multisig rotation)
    let spec = {
      let mut spec = None;
      for tributary in tributaries.values() {
        if tributary.spec.set().network == msg.network {
          spec = Some(tributary.spec.clone());
          break;
        }
      }
      spec.expect("received message from processor we don't have a tributary for")
    };
        let genesis = spec.genesis();
        tokio::spawn({
          let mut db = db.clone();
          let key = key.clone();
          let serai = serai.clone();
          let mut processors = processors.clone();
          async move {
            let (send, mut recv) = mpsc::unbounded_channel();
            // TODO: Support multisig rotation (not per-Tributary yet per-network?)
            channels.write().await.insert(spec.set().network, send);
            loop {
              let msg: processors::Message = recv.recv().await.unwrap();
              // TODO: We probably want to NOP here, not panic?
-    let my_i = spec.i(pub_key).expect("processor message for network we aren't a validator in");
+              // TODO: We do have to track produced Batches in order to ensure their integrity
              let my_i =
                spec.i(pub_key).expect("processor message for network we aren't a validator in");
              let tx = match msg.msg.clone() {
                ProcessorMessage::KeyGen(inner_msg) => match inner_msg {
                  key_gen::ProcessorMessage::Commitments { id, commitments } => {
-          Some(Transaction::DkgCommitments(id.attempt, commitments, Transaction::empty_signed()))
+                    Some(Transaction::DkgCommitments(
                      id.attempt,
                      commitments,
                      Transaction::empty_signed(),
                    ))
                  }
                  key_gen::ProcessorMessage::Shares { id, mut shares } => {
                    // Create a MuSig-based machine to inform Substrate of this key generation
@ -564,8 +555,11 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                      if i == my_i {
                        continue;
                      }
-            tx_shares
+                      tx_shares.push(
-              .push(shares.remove(&i).expect("processor didn't send share for another validator"));
+                        shares
                          .remove(&i)
                          .expect("processor didn't send share for another validator"),
                      );
                    }
                    Some(Transaction::DkgShares {
@ -575,7 +569,11 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                      signed: Transaction::empty_signed(),
                    })
                  }
-        key_gen::ProcessorMessage::GeneratedKeyPair { id, substrate_key, network_key } => {
+                  key_gen::ProcessorMessage::GeneratedKeyPair {
                    id,
                    substrate_key,
                    network_key,
                  } => {
                    assert_eq!(
                    id.set.network, msg.network,
                    "processor claimed to be a different network than it was for GeneratedKeyPair",
@ -594,10 +592,14 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                    txn.commit();
                    match share {
-            Ok(share) => {
+                      Ok(share) => Some(Transaction::DkgConfirmed(
-              Some(Transaction::DkgConfirmed(id.attempt, share, Transaction::empty_signed()))
+                        id.attempt,
                        share,
                        Transaction::empty_signed(),
                      )),
                      Err(p) => {
                        todo!("participant {p:?} sent invalid DKG confirmation preprocesses")
                      }
            Err(p) => todo!("participant {p:?} sent invalid DKG confirmation preprocesses"),
                    }
                  }
                },
@ -618,12 +620,14 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                      }))
                    }
                  }
-        sign::ProcessorMessage::Share { id, share } => Some(Transaction::SignShare(SignData {
+                  sign::ProcessorMessage::Share { id, share } => {
                    Some(Transaction::SignShare(SignData {
                      plan: id.id,
                      attempt: id.attempt,
                      data: share,
                      signed: Transaction::empty_signed(),
-        })),
+                    }))
                  }
                  sign::ProcessorMessage::Completed { key: _, id, tx } => {
                    let r = Zeroizing::new(<Ristretto as Ciphersuite>::F::random(&mut OsRng));
                    #[allow(non_snake_case)]
@ -651,8 +655,8 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                    "processor claimed to be a different network than it was for SubstrateBlockAck",
                  );
-          // Safe to use its own txn since this is static and just needs to be written before we
+                    // Safe to use its own txn since this is static and just needs to be written
-          // provide SubstrateBlock
+                    // before we provide SubstrateBlock
                    let mut txn = db.txn();
                    // TODO: This needs to be scoped per multisig
                    TributaryDb::<D>::set_plan_ids(&mut txn, genesis, block, &plans);
@ -667,12 +671,13 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                      id.attempt,
                      hex::encode(block),
                    );
-          // If this is the first attempt instance, wait until we synchronize around the batch
+                    // If this is the first attempt instance, wait until we synchronize around the
-          // first
+                    // batch first
                    if id.attempt == 0 {
                      // Save the preprocess to disk so we can publish it later
-            // This is fine to use its own TX since it's static and just needs to be written
+                      // This is fine to use its own TX since it's static and just needs to be
-            // before this message finishes it handling (or with this message's finished handling)
+                      // written before this message finishes it handling (or with this message's
                      // finished handling)
                      let mut txn = db.txn();
                      MainDb::<D>::save_first_preprocess(&mut txn, id.id, preprocess);
                      txn.commit();
@ -702,28 +707,33 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                      batch.batch.network, msg.network,
                      "processor sent us a batch for a different network than it was for",
                    );
-          // TODO: Check this key's key pair's substrate key is authorized to publish batches
+                    // TODO: Check this key's key pair's substrate key is authorized to publish
                    // batches
                    // Save this batch to the disk
                    MainDb::new(&mut db).save_batch(batch);
                    /*
-            Use a dedicated task to publish batches due to the latency potentially incurred.
+                      Use a dedicated task to publish batches due to the latency potentially
                      incurred.
-            This does not guarantee the batch has actually been published when the message is
+                      This does not guarantee the batch has actually been published when the
-            `ack`ed to message-queue. Accordingly, if we reboot, these batches would be dropped
+                      message is `ack`ed to message-queue. Accordingly, if we reboot, these batches
-            (as we wouldn't see the `Update` again, triggering our re-attempt to publish).
+                      would be dropped (as we wouldn't see the `Update` again, triggering our
                      re-attempt to publish).
-            The solution to this is to have the task try not to publish the batch which caused it
+                      The solution to this is to have the task try not to publish the batch which
-            to be spawned, yet all saved batches which have yet to published. This does risk having
+                      caused it to be spawned, yet all saved batches which have yet to published.
-            multiple tasks trying to publish all pending batches, yet these aren't notably complex.
+                      This does risk having multiple tasks trying to publish all pending batches,
                      yet these aren't notably complex.
                    */
                    tokio::spawn({
                      let mut db = db.clone();
                      let serai = serai.clone();
                      let network = msg.network;
                      async move {
-              // Since we have a new batch, publish all batches yet to be published to Serai
+                        // Since we have a new batch, publish all batches yet to be published to
                        // Serai
                        // This handles the edge-case where batch n+1 is signed before batch n is
                        while let Some(batch) = {
                          // Get the next-to-execute batch ID
@ -732,15 +742,19 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                            loop {
                              if !first {
                                log::error!(
-                        "couldn't connect to Serai node to get the next batch ID for {network:?}",
+                                  "{} {network:?}",
                                  "couldn't connect to Serai node to get the next batch ID for",
                                );
                                tokio::time::sleep(Duration::from_secs(5)).await;
                              }
                              first = false;
-                    let Ok(latest_block) = serai.get_latest_block().await else { continue };
+                              let Ok(latest_block) = serai.get_latest_block().await else {
-                    let Ok(last) =
+                                continue;
-                      serai.get_last_batch_for_network(latest_block.hash(), network).await
+                              };
                              let Ok(last) = serai
                                .get_last_batch_for_network(latest_block.hash(), network)
                                .await
                              else {
                                continue;
                              };
@ -755,12 +769,16 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                          let block = batch.batch.block;
                          let tx = Serai::execute_batch(batch);
-                // This publish may fail if this transactions already exists in the mempool, which
+                          // This publish may fail if this transactions already exists in the
-                // is possible, or if this batch was already executed on-chain
+                          // mempool, which is possible, or if this batch was already executed
-                // Either case will have eventual resolution and be handled by the above check on
+                          // on-chain
-                // if this block should execute
+                          // Either case will have eventual resolution and be handled by the above
                          // check on if this batch should execute
                          if serai.publish(&tx).await.is_ok() {
-                  log::info!("published batch {network:?} {id} (block {})", hex::encode(block));
+                            log::info!(
                              "published batch {network:?} {id} (block {})",
                              hex::encode(block)
                            );
                          }
                        }
                      }
@ -774,13 +792,6 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
              // If this created a transaction, publish it
              if let Some(mut tx) = tx {
                log::trace!("processor message effected transaction {}", hex::encode(tx.hash()));
      let Some(tributary) = tributaries.get(&genesis) else {
        // TODO: This can happen since Substrate tells the Processor to generate commitments
        // at the same time it tells the Tributary to be created
        // There's no guarantee the Tributary will have been created though
        panic!("processor is operating on tributary we don't have");
      };
      let tributary = &tributary.tributary;
                match tx.kind() {
                  TransactionKind::Provided(_) => {
@ -792,23 +803,27 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                  }
                  TransactionKind::Unsigned => {
                    log::trace!("publishing unsigned transaction {}", hex::encode(tx.hash()));
-          // Ignores the result since we can't differentiate already in-mempool from already
+                    // Ignores the result since we can't differentiate already in-mempool from
-          // on-chain from invalid
+                    // already on-chain from invalid
                    // TODO: Don't ignore the result
                    tributary.add_transaction(tx).await;
                  }
                  TransactionKind::Signed(_) => {
-          log::trace!("getting next nonce for Tributary TX in response to processor message");
+                    log::trace!(
                      "getting next nonce for Tributary TX in response to processor message"
                    );
                    let nonce = loop {
-            let Some(nonce) =
+                      let Some(nonce) = NonceDecider::<D>::nonce(&db, genesis, &tx)
-              NonceDecider::<D>::nonce(&db, genesis, &tx).expect("signed TX didn't have nonce")
+                        .expect("signed TX didn't have nonce")
                      else {
                        // This can be None if:
                        // 1) We scanned the relevant transaction(s) in a Tributary block
                        // 2) The processor was sent a message and responded
                        // 3) The Tributary TXN has yet to be committed
-              log::warn!("nonce has yet to be saved for processor-instigated transaction");
+                        log::warn!(
                          "nonce has yet to be saved for processor-instigated transaction"
                        );
                        sleep(Duration::from_millis(100)).await;
                        continue;
                      };
@ -816,13 +831,43 @@ pub async fn handle_processors<D: Db, Pro: Processors, P: P2p>(
                    };
                    tx.sign(&mut OsRng, genesis, &key, nonce);
-          publish_signed_transaction(tributary, tx).await;
+                    publish_signed_transaction(&tributary, tx).await;
                  }
                }
              }
              processors.ack(msg).await;
            }
          }
        });
      }
    }
  });
  let mut last_msg = None;
  loop {
    // TODO: We dispatch this to an async task per-processor, yet we don't move to the next message
    // yet as all processor messages are shoved into a global queue.
    // Modify message-queue to offer per-sender queues, not per-receiver.
    // Alternatively, a peek method with local delineation of handled messages would work.
    // TODO: Do we handle having handled a message, by DB, yet having rebooted before `ack`ing it?
    // Does the processor?
    let msg = processors.recv().await;
    if last_msg == Some(msg.id) {
      sleep(Duration::from_secs(1)).await;
      continue;
    }
    last_msg = Some(msg.id);
    // TODO: Race conditions with above tributary availability?
    // TODO: How does this hold up to multisig rotation?
    if let Some(channel) = channels.read().await.get(&msg.network) {
      channel.send(msg).unwrap();
    } else {
      log::warn!("received processor message for network we don't have a channel for");
    }
  }
 }
 pub async fn run<D: Db, Pro: Processors, P: P2p>(
@ -893,7 +938,7 @@ pub async fn run<D: Db, Pro: Processors, P: P2p>(
        loop {
          match new_tributary_listener_1.recv().await {
            Ok(tributary) => {
-              tributaries.write().await.insert(tributary.spec.genesis(), tributary);
+              tributaries.write().await.insert(tributary.spec.genesis(), tributary.tributary);
            }
            Err(broadcast::error::RecvError::Lagged(_)) => {
              panic!("recognized_id lagged to handle new_tributary")
@ -943,9 +988,10 @@ pub async fn run<D: Db, Pro: Processors, P: P2p>(
        let tributaries = tributaries.read().await;
        let Some(tributary) = tributaries.get(&genesis) else {
          // TODO: This may happen if the task above is simply slow
          panic!("tributary we don't have came to consensus on an Batch");
        };
-        publish_signed_transaction(&tributary.tributary, tx).await;
+        publish_signed_transaction(tributary, tx).await;
      }
    }
  };