//! Write replication: fans WAL entries out to peer nodes and waits for a
//! majority quorum of acknowledgements.
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
use sq_grpc_interface::{
|
|
cluster_service_client::ClusterServiceClient, ReplicateEntriesRequest,
|
|
};
|
|
|
|
use crate::membership::{Membership, MemberInfo};
|
|
|
|
/// Configuration for write replication.
///
/// Controls how many copies of each write are kept and how long the
/// replicator waits for peer acknowledgements before giving up.
#[derive(Debug, Clone)]
pub struct ReplicationConfig {
    /// Replication factor (how many copies including local).
    /// A simple majority of `(replication_factor / 2) + 1` acks is
    /// required for a write to count as quorum-replicated.
    pub replication_factor: u32,
    /// Timeout for waiting for peer acks before reporting quorum failure.
    pub timeout: Duration,
}
|
|
|
|
impl Default for ReplicationConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
replication_factor: 3,
|
|
timeout: Duration::from_secs(5),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Result of a replication attempt.
#[derive(Debug)]
pub struct ReplicationResult {
    /// Number of successful acks; the local write always counts as one,
    /// so this is at least 1.
    pub ack_count: u32,
    /// Whether a majority quorum of acks was reached before the timeout.
    pub quorum_reached: bool,
    /// Errors from failed peers as `(node_id, error message)` pairs.
    pub errors: Vec<(String, String)>,
}
|
|
|
|
/// Handles replicating WAL entries to peer nodes.
///
/// Fans a batch of entries out to all alive peers in parallel and waits
/// until a majority quorum acknowledges or the configured timeout fires.
pub struct Replicator {
    // Cluster membership view used to discover alive peers.
    membership: Arc<Membership>,
    // Quorum/timeout settings applied to each replication round.
    config: ReplicationConfig,
}
|
|
|
|
impl Replicator {
|
|
pub fn new(membership: Arc<Membership>, config: ReplicationConfig) -> Self {
|
|
Self {
|
|
membership,
|
|
config,
|
|
}
|
|
}
|
|
|
|
/// Replicate entries to peers. Returns after quorum is reached or timeout.
|
|
/// The local write is assumed to already be done (counts as 1 ack).
|
|
pub async fn replicate(
|
|
&self,
|
|
topic: &str,
|
|
partition: u32,
|
|
entries: Vec<Vec<u8>>,
|
|
) -> ReplicationResult {
|
|
let peers = self.membership.alive_peers().await;
|
|
let quorum = (self.config.replication_factor / 2) + 1;
|
|
|
|
// If no peers or single-node, local write alone is sufficient.
|
|
if peers.is_empty() || self.config.replication_factor <= 1 {
|
|
return ReplicationResult {
|
|
ack_count: 1,
|
|
quorum_reached: quorum <= 1,
|
|
errors: vec![],
|
|
};
|
|
}
|
|
|
|
// Send to all alive peers in parallel.
|
|
let (tx, mut rx) = tokio::sync::mpsc::channel::<Result<String, (String, String)>>(
|
|
peers.len(),
|
|
);
|
|
|
|
for peer in &peers {
|
|
let tx = tx.clone();
|
|
let peer = peer.clone();
|
|
let topic = topic.to_string();
|
|
let entries = entries.clone();
|
|
tokio::spawn(async move {
|
|
match replicate_to_peer(&peer, &topic, partition, entries).await {
|
|
Ok(()) => {
|
|
let _ = tx.send(Ok(peer.node_id.clone())).await;
|
|
}
|
|
Err(e) => {
|
|
let _ = tx
|
|
.send(Err((peer.node_id.clone(), e.to_string())))
|
|
.await;
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
drop(tx);
|
|
|
|
// Wait for acks with timeout.
|
|
let mut ack_count: u32 = 1; // Count local write.
|
|
let mut errors = Vec::new();
|
|
|
|
let deadline = tokio::time::Instant::now() + self.config.timeout;
|
|
|
|
loop {
|
|
if ack_count >= quorum {
|
|
break;
|
|
}
|
|
|
|
tokio::select! {
|
|
result = rx.recv() => {
|
|
match result {
|
|
Some(Ok(_node_id)) => {
|
|
ack_count += 1;
|
|
}
|
|
Some(Err((node_id, err))) => {
|
|
errors.push((node_id, err));
|
|
}
|
|
None => {
|
|
// Channel closed, all peers responded.
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
_ = tokio::time::sleep_until(deadline) => {
|
|
tracing::warn!(
|
|
acks = ack_count,
|
|
quorum = quorum,
|
|
"replication timeout waiting for quorum"
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
ReplicationResult {
|
|
ack_count,
|
|
quorum_reached: ack_count >= quorum,
|
|
errors,
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn replicate_to_peer(
|
|
peer: &MemberInfo,
|
|
topic: &str,
|
|
partition: u32,
|
|
entries: Vec<Vec<u8>>,
|
|
) -> anyhow::Result<()> {
|
|
let endpoint = format!("http://{}", peer.address);
|
|
let mut client = ClusterServiceClient::connect(endpoint).await?;
|
|
|
|
client
|
|
.replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
|
|
topic: topic.to_string(),
|
|
partition,
|
|
entries,
|
|
}))
|
|
.await?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::membership::MembershipConfig;

    /// Membership view containing only this node — no alive peers.
    fn single_node_membership() -> Arc<Membership> {
        let config = MembershipConfig {
            node_id: "node-1".to_string(),
            address: "127.0.0.1:6060".to_string(),
            ..Default::default()
        };
        Arc::new(Membership::new(config))
    }

    #[tokio::test]
    async fn test_single_node_replication() {
        let config = ReplicationConfig {
            replication_factor: 1,
            ..Default::default()
        };
        let replicator = Replicator::new(single_node_membership(), config);

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // With factor 1, the local write alone satisfies quorum.
        assert_eq!(result.ack_count, 1);
        assert!(result.quorum_reached);
        assert!(result.errors.is_empty());
    }

    #[tokio::test]
    async fn test_no_peers_available() {
        let config = ReplicationConfig {
            replication_factor: 3,
            ..Default::default()
        };
        let replicator = Replicator::new(single_node_membership(), config);

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Only local ack (1 out of 2 needed for quorum).
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
    }

    #[tokio::test]
    async fn test_unreachable_peers_timeout() {
        let membership = single_node_membership();

        // Add peers that don't exist - they'll fail to connect.
        membership
            .record_heartbeat("node-2", "127.0.0.1:19999")
            .await;

        let config = ReplicationConfig {
            replication_factor: 3,
            timeout: Duration::from_millis(500),
        };
        let replicator = Replicator::new(membership, config);

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Should have errors from unreachable peer.
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
    }
}
|