242
crates/sq-cluster/src/replication.rs
Normal file
242
crates/sq-cluster/src/replication.rs
Normal file
@@ -0,0 +1,242 @@
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use sq_grpc_interface::{
|
||||
cluster_service_client::ClusterServiceClient, ReplicateEntriesRequest,
|
||||
};
|
||||
|
||||
use crate::membership::{Membership, MemberInfo};
|
||||
|
||||
/// Configuration for write replication.
#[derive(Debug, Clone)]
pub struct ReplicationConfig {
    /// Replication factor (how many copies including local).
    /// Quorum is derived from this as `replication_factor / 2 + 1`.
    pub replication_factor: u32,
    /// Timeout for waiting for peer acks before giving up on quorum.
    pub timeout: Duration,
}
|
||||
|
||||
impl Default for ReplicationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
replication_factor: 3,
|
||||
timeout: Duration::from_secs(5),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of a replication attempt.
#[derive(Debug)]
pub struct ReplicationResult {
    /// Number of successful acks (including local).
    pub ack_count: u32,
    /// Whether quorum was reached.
    pub quorum_reached: bool,
    /// Errors from failed peers, as `(node_id, error message)` pairs.
    /// May be incomplete: errors arriving after quorum is reached or the
    /// timeout fires are not collected.
    pub errors: Vec<(String, String)>,
}
|
||||
|
||||
/// Handles replicating WAL entries to peer nodes.
///
/// Fans a batch out to all alive peers over gRPC and waits for a
/// majority quorum of acks (see [`Replicator::replicate`]).
pub struct Replicator {
    // Cluster view used to discover currently-alive peers.
    membership: Arc<Membership>,
    // Quorum/timeout knobs applied to every replicate() call.
    config: ReplicationConfig,
}
|
||||
|
||||
impl Replicator {
    /// Create a replicator over the given membership view and config.
    pub fn new(membership: Arc<Membership>, config: ReplicationConfig) -> Self {
        Self {
            membership,
            config,
        }
    }

    /// Replicate entries to peers. Returns after quorum is reached or timeout.
    /// The local write is assumed to already be done (counts as 1 ack).
    ///
    /// Quorum is a strict majority of `replication_factor`
    /// (`rf / 2 + 1`), e.g. 2 acks for rf = 3.
    ///
    /// NOTE(review): the batch is fanned out to *all* alive peers, not just
    /// `replication_factor - 1` of them, and any peer's ack counts toward
    /// quorum — confirm this over-replication is intended.
    pub async fn replicate(
        &self,
        topic: &str,
        partition: u32,
        entries: Vec<Vec<u8>>,
    ) -> ReplicationResult {
        // Snapshot of currently-alive peers; later membership changes are
        // not observed by this call.
        let peers = self.membership.alive_peers().await;
        // Strict majority of the configured replication factor.
        let quorum = (self.config.replication_factor / 2) + 1;

        // If no peers or single-node, local write alone is sufficient.
        if peers.is_empty() || self.config.replication_factor <= 1 {
            return ReplicationResult {
                ack_count: 1,
                // With rf > 1 and no peers this is false: quorum is unreachable.
                quorum_reached: quorum <= 1,
                errors: vec![],
            };
        }

        // Send to all alive peers in parallel. Capacity = peers.len() so each
        // task can deliver its single result without blocking on send.
        let (tx, mut rx) = tokio::sync::mpsc::channel::<Result<String, (String, String)>>(
            peers.len(),
        );

        for peer in &peers {
            let tx = tx.clone();
            let peer = peer.clone();
            let topic = topic.to_string();
            // Each task gets its own copy of the batch (one clone per peer).
            let entries = entries.clone();
            // Detached task: it keeps running even if we return early below.
            tokio::spawn(async move {
                match replicate_to_peer(&peer, &topic, partition, entries).await {
                    Ok(()) => {
                        // A send failure only means the receiver was dropped
                        // (quorum/timeout already decided); safe to ignore.
                        let _ = tx.send(Ok(peer.node_id.clone())).await;
                    }
                    Err(e) => {
                        let _ = tx
                            .send(Err((peer.node_id.clone(), e.to_string())))
                            .await;
                    }
                }
            });
        }

        // Drop our sender so `rx.recv()` returns `None` once every spawned
        // task has reported (their clones are the only remaining senders).
        drop(tx);

        // Wait for acks with timeout.
        let mut ack_count: u32 = 1; // Count local write.
        let mut errors = Vec::new();

        // Absolute deadline shared by every iteration of the loop below.
        let deadline = tokio::time::Instant::now() + self.config.timeout;

        loop {
            if ack_count >= quorum {
                break;
            }

            tokio::select! {
                result = rx.recv() => {
                    match result {
                        Some(Ok(_node_id)) => {
                            ack_count += 1;
                        }
                        Some(Err((node_id, err))) => {
                            // A single peer failure does not abort the wait;
                            // keep collecting acks from the others.
                            errors.push((node_id, err));
                        }
                        None => {
                            // Channel closed, all peers responded.
                            break;
                        }
                    }
                }
                // Recreating the sleep each iteration is fine because the
                // deadline is absolute — the remaining wait only shrinks.
                _ = tokio::time::sleep_until(deadline) => {
                    tracing::warn!(
                        acks = ack_count,
                        quorum = quorum,
                        "replication timeout waiting for quorum"
                    );
                    break;
                }
            }
        }

        // Note: acks/errors that arrive after this point are discarded.
        ReplicationResult {
            ack_count,
            quorum_reached: ack_count >= quorum,
            errors,
        }
    }
}
|
||||
|
||||
async fn replicate_to_peer(
|
||||
peer: &MemberInfo,
|
||||
topic: &str,
|
||||
partition: u32,
|
||||
entries: Vec<Vec<u8>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let endpoint = format!("http://{}", peer.address);
|
||||
let mut client = ClusterServiceClient::connect(endpoint).await?;
|
||||
|
||||
client
|
||||
.replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
|
||||
topic: topic.to_string(),
|
||||
partition,
|
||||
entries,
|
||||
}))
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::membership::MembershipConfig;

    /// Build a membership containing only the local node (no peers).
    fn single_node_membership() -> Arc<Membership> {
        Arc::new(Membership::new(MembershipConfig {
            node_id: "node-1".to_string(),
            address: "127.0.0.1:6060".to_string(),
            ..Default::default()
        }))
    }

    #[tokio::test]
    async fn test_single_node_replication() {
        // replication_factor = 1 => quorum = 1, so the local write alone
        // satisfies quorum without contacting any peer.
        let membership = single_node_membership();
        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 1,
                ..Default::default()
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        assert_eq!(result.ack_count, 1);
        assert!(result.quorum_reached);
        assert!(result.errors.is_empty());
    }

    #[tokio::test]
    async fn test_no_peers_available() {
        // replication_factor = 3 => quorum = 2, but there are no peers to ack.
        let membership = single_node_membership();
        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 3,
                ..Default::default()
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Only local ack (1 out of 2 needed for quorum).
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
        // The early-return path reports no per-peer errors.
        assert!(result.errors.is_empty());
    }

    #[tokio::test]
    async fn test_unreachable_peers_timeout() {
        let membership = single_node_membership();

        // Add a peer that doesn't exist - it'll fail to connect.
        membership
            .record_heartbeat("node-2", "127.0.0.1:19999")
            .await;

        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 3,
                timeout: Duration::from_millis(500),
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Only the local ack; quorum (2) is never reached.
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
        // The refused connection must surface as a per-peer error entry —
        // on localhost the connect is rejected well within the 500ms timeout.
        assert!(!result.errors.is_empty());
    }
}
|
||||
Reference in New Issue
Block a user