feat: add capnp

Signed-off-by: kjuulh <contact@kjuulh.io>
2026-02-27 12:15:35 +01:00
parent 3162971c89
commit 749ae245c7
115 changed files with 16596 additions and 31 deletions


@@ -0,0 +1,3 @@
pub mod membership;
pub mod recovery;
pub mod replication;


@@ -0,0 +1,340 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Mutex;
/// Status of a node in the cluster.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NodeStatus {
Alive,
Suspected,
Dead,
}
impl std::fmt::Display for NodeStatus {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NodeStatus::Alive => write!(f, "alive"),
NodeStatus::Suspected => write!(f, "suspected"),
NodeStatus::Dead => write!(f, "dead"),
}
}
}
/// Information about a member node.
#[derive(Debug, Clone)]
pub struct MemberInfo {
pub node_id: String,
pub address: String,
pub status: NodeStatus,
pub last_heartbeat: Instant,
}
/// Configuration for membership management.
#[derive(Debug, Clone)]
pub struct MembershipConfig {
/// This node's ID.
pub node_id: String,
/// This node's gRPC address.
pub address: String,
/// Seed node addresses for initial discovery.
pub seeds: Vec<String>,
/// How many missed heartbeats before a node is suspected.
pub failure_threshold: u32,
/// Heartbeat interval.
pub heartbeat_interval: Duration,
/// Time a node stays in Suspected before being declared Dead.
pub suspect_timeout: Duration,
}
impl Default for MembershipConfig {
fn default() -> Self {
Self {
node_id: "node-1".to_string(),
address: "127.0.0.1:6060".to_string(),
seeds: Vec::new(),
failure_threshold: 3,
heartbeat_interval: Duration::from_secs(5),
suspect_timeout: Duration::from_secs(30),
}
}
}
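// Illustrative sketch (not part of this commit): a non-default configuration for
// a node that discovers the cluster through a seed. The node ID and addresses
// here are hypothetical.
#[allow(dead_code)]
fn example_config() -> MembershipConfig {
    MembershipConfig {
        node_id: "node-2".to_string(),
        address: "10.0.0.2:6060".to_string(),
        seeds: vec!["10.0.0.1:6060".to_string()],
        ..Default::default()
    }
}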
/// Manages cluster membership state.
pub struct Membership {
config: MembershipConfig,
members: Arc<Mutex<HashMap<String, MemberInfo>>>,
}
impl Membership {
pub fn new(config: MembershipConfig) -> Self {
let mut members = HashMap::new();
// Add self as alive.
members.insert(
config.node_id.clone(),
MemberInfo {
node_id: config.node_id.clone(),
address: config.address.clone(),
status: NodeStatus::Alive,
last_heartbeat: Instant::now(),
},
);
Self {
config,
members: Arc::new(Mutex::new(members)),
}
}
/// Get the shared members handle (for use in gRPC handlers).
pub fn members(&self) -> Arc<Mutex<HashMap<String, MemberInfo>>> {
self.members.clone()
}
/// Get the node ID.
pub fn node_id(&self) -> &str {
&self.config.node_id
}
/// Get the node address.
pub fn address(&self) -> &str {
&self.config.address
}
/// Get seed addresses.
pub fn seeds(&self) -> &[String] {
&self.config.seeds
}
/// Record a heartbeat from a node. Creates the member entry if new.
pub async fn record_heartbeat(&self, node_id: &str, address: &str) {
let mut members = self.members.lock().await;
let entry = members
.entry(node_id.to_string())
.or_insert_with(|| MemberInfo {
node_id: node_id.to_string(),
address: address.to_string(),
status: NodeStatus::Alive,
last_heartbeat: Instant::now(),
});
entry.status = NodeStatus::Alive;
entry.last_heartbeat = Instant::now();
entry.address = address.to_string();
}
/// Record members discovered from a Join/Heartbeat response.
pub async fn merge_members(&self, discovered: Vec<(String, String)>) {
let mut members = self.members.lock().await;
for (node_id, address) in discovered {
if node_id == self.config.node_id {
continue; // Skip self.
}
members
.entry(node_id.clone())
.or_insert_with(|| MemberInfo {
node_id,
address,
status: NodeStatus::Alive,
last_heartbeat: Instant::now(),
});
}
}
/// Check for failed nodes based on heartbeat timeouts.
/// Updates node status from Alive -> Suspected -> Dead.
pub async fn check_failures(&self) {
let now = Instant::now();
let heartbeat_timeout =
self.config.heartbeat_interval * self.config.failure_threshold;
let mut members = self.members.lock().await;
for (id, member) in members.iter_mut() {
if *id == self.config.node_id {
// Don't suspect self.
member.last_heartbeat = now;
continue;
}
let elapsed = now.duration_since(member.last_heartbeat);
match member.status {
NodeStatus::Alive => {
if elapsed > heartbeat_timeout {
tracing::warn!(
node_id = %id,
elapsed_secs = elapsed.as_secs(),
"node suspected: missed heartbeats"
);
member.status = NodeStatus::Suspected;
}
}
NodeStatus::Suspected => {
if elapsed > heartbeat_timeout + self.config.suspect_timeout {
tracing::warn!(node_id = %id, "node declared dead");
member.status = NodeStatus::Dead;
}
}
NodeStatus::Dead => {
// Dead nodes stay dead until they re-join.
}
}
}
}
/// Get all alive peers (excluding self).
pub async fn alive_peers(&self) -> Vec<MemberInfo> {
let members = self.members.lock().await;
members
.values()
.filter(|m| m.node_id != self.config.node_id && m.status == NodeStatus::Alive)
.cloned()
.collect()
}
/// Get all known members (including self).
pub async fn all_members(&self) -> Vec<MemberInfo> {
let members = self.members.lock().await;
members.values().cloned().collect()
}
}
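// Illustrative sketch (not part of this commit): how a node might drive the
// membership loop. Assumes a tokio runtime and that peer heartbeats are recorded
// elsewhere (e.g. a gRPC handler calling `record_heartbeat`). The interval below
// is hypothetical; in practice it would come from `MembershipConfig`.
#[allow(dead_code)]
async fn example_membership_loop(membership: Arc<Membership>) {
    let interval = Duration::from_secs(5);
    loop {
        // Re-evaluate liveness: Alive -> Suspected -> Dead.
        membership.check_failures().await;
        // Only alive peers are used as replication targets.
        let peers = membership.alive_peers().await;
        tracing::debug!(peer_count = peers.len(), "alive peers");
        tokio::time::sleep(interval).await;
    }
}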
#[cfg(test)]
mod tests {
use super::*;
fn test_config(node_id: &str) -> MembershipConfig {
MembershipConfig {
node_id: node_id.to_string(),
address: format!("127.0.0.1:606{}", node_id.chars().last().unwrap()),
heartbeat_interval: Duration::from_millis(100),
failure_threshold: 3,
suspect_timeout: Duration::from_millis(300),
..Default::default()
}
}
#[tokio::test]
async fn test_new_membership_has_self() {
let m = Membership::new(test_config("node-1"));
let members = m.all_members().await;
assert_eq!(members.len(), 1);
assert_eq!(members[0].node_id, "node-1");
assert_eq!(members[0].status, NodeStatus::Alive);
}
#[tokio::test]
async fn test_record_heartbeat_adds_new_member() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "127.0.0.1:6062").await;
let members = m.all_members().await;
assert_eq!(members.len(), 2);
}
#[tokio::test]
async fn test_record_heartbeat_updates_existing() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "127.0.0.1:6062").await;
// Update address.
m.record_heartbeat("node-2", "127.0.0.1:6063").await;
let members = m.all_members().await;
let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
assert_eq!(node2.address, "127.0.0.1:6063");
}
#[tokio::test]
async fn test_merge_members() {
let m = Membership::new(test_config("node-1"));
m.merge_members(vec![
("node-2".to_string(), "addr-2".to_string()),
("node-3".to_string(), "addr-3".to_string()),
])
.await;
let members = m.all_members().await;
assert_eq!(members.len(), 3);
}
#[tokio::test]
async fn test_merge_skips_self() {
let m = Membership::new(test_config("node-1"));
m.merge_members(vec![("node-1".to_string(), "other-addr".to_string())])
.await;
let members = m.all_members().await;
assert_eq!(members.len(), 1);
}
#[tokio::test]
async fn test_alive_peers_excludes_self() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "addr-2").await;
let peers = m.alive_peers().await;
assert_eq!(peers.len(), 1);
assert_eq!(peers[0].node_id, "node-2");
}
#[tokio::test]
async fn test_check_failures_suspects_after_timeout() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "addr-2").await;
// Simulate time passing by directly modifying last_heartbeat.
{
let mut members = m.members.lock().await;
let node2 = members.get_mut("node-2").unwrap();
node2.last_heartbeat = Instant::now() - Duration::from_millis(500);
}
m.check_failures().await;
let members = m.all_members().await;
let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
assert_eq!(node2.status, NodeStatus::Suspected);
}
#[tokio::test]
async fn test_heartbeat_revives_suspected_node() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "addr-2").await;
// Make node-2 suspected.
{
let mut members = m.members.lock().await;
let node2 = members.get_mut("node-2").unwrap();
node2.status = NodeStatus::Suspected;
}
// Heartbeat revives it.
m.record_heartbeat("node-2", "addr-2").await;
let members = m.all_members().await;
let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
assert_eq!(node2.status, NodeStatus::Alive);
}
#[tokio::test]
async fn test_dead_after_suspect_timeout() {
let m = Membership::new(test_config("node-1"));
m.record_heartbeat("node-2", "addr-2").await;
// Simulate a heartbeat gap well past all timeouts.
{
let mut members = m.members.lock().await;
let node2 = members.get_mut("node-2").unwrap();
node2.status = NodeStatus::Suspected;
node2.last_heartbeat = Instant::now() - Duration::from_secs(10);
}
m.check_failures().await;
let members = m.all_members().await;
let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
assert_eq!(node2.status, NodeStatus::Dead);
}
}


@@ -0,0 +1,74 @@
use std::sync::Arc;
use sq_grpc_interface::cluster_service_client::ClusterServiceClient;
use sq_grpc_interface::JoinRequest;
use crate::membership::Membership;
/// Handles node recovery and catch-up when joining/rejoining the cluster.
pub struct Recovery {
membership: Arc<Membership>,
}
impl Recovery {
pub fn new(membership: Arc<Membership>) -> Self {
Self { membership }
}
/// Join the cluster by contacting seed nodes.
/// Returns the number of seeds successfully contacted.
pub async fn join_cluster(&self) -> anyhow::Result<usize> {
let seeds = self.membership.seeds().to_vec();
let mut contacted = 0;
for seed_addr in &seeds {
let endpoint = format!("http://{}", seed_addr);
match ClusterServiceClient::connect(endpoint).await {
Ok(mut client) => {
let response = client
.join(tonic::Request::new(JoinRequest {
node_id: self.membership.node_id().to_string(),
address: self.membership.address().to_string(),
}))
.await;
match response {
Ok(resp) => {
let members: Vec<(String, String)> = resp
.into_inner()
.members
.into_iter()
.map(|m| (m.node_id, m.address))
.collect();
self.membership.merge_members(members).await;
contacted += 1;
tracing::info!(
seed = %seed_addr,
"successfully joined cluster via seed"
);
}
Err(e) => {
tracing::warn!(
seed = %seed_addr,
error = %e,
"failed to join via seed"
);
}
}
}
Err(e) => {
tracing::warn!(
seed = %seed_addr,
error = %e,
"failed to connect to seed"
);
}
}
}
Ok(contacted)
}
}
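// Illustrative sketch (not part of this commit): calling `join_cluster` at startup
// and retrying until at least one seed answers. Assumes this node has at least one
// seed configured; the retry interval is hypothetical.
#[allow(dead_code)]
async fn example_join_on_startup(recovery: Recovery) -> anyhow::Result<()> {
    loop {
        match recovery.join_cluster().await {
            Ok(contacted) if contacted > 0 => {
                tracing::info!(contacted, "joined cluster");
                return Ok(());
            }
            Ok(_) => tracing::warn!("no seeds reachable yet, retrying"),
            Err(e) => tracing::warn!(error = %e, "join attempt failed, retrying"),
        }
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
    }
}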


@@ -0,0 +1,242 @@
use std::sync::Arc;
use std::time::Duration;
use sq_grpc_interface::{
cluster_service_client::ClusterServiceClient, ReplicateEntriesRequest,
};
use crate::membership::{Membership, MemberInfo};
/// Configuration for write replication.
#[derive(Debug, Clone)]
pub struct ReplicationConfig {
/// Replication factor (how many copies including local).
pub replication_factor: u32,
/// Timeout for waiting for peer acks.
pub timeout: Duration,
}
impl Default for ReplicationConfig {
fn default() -> Self {
Self {
replication_factor: 3,
timeout: Duration::from_secs(5),
}
}
}
/// Result of a replication attempt.
#[derive(Debug)]
pub struct ReplicationResult {
/// Number of successful acks (including local).
pub ack_count: u32,
/// Whether quorum was reached.
pub quorum_reached: bool,
/// Errors from failed peers.
pub errors: Vec<(String, String)>,
}
/// Handles replicating WAL entries to peer nodes.
pub struct Replicator {
membership: Arc<Membership>,
config: ReplicationConfig,
}
impl Replicator {
pub fn new(membership: Arc<Membership>, config: ReplicationConfig) -> Self {
Self {
membership,
config,
}
}
/// Replicate entries to peers. Returns once quorum is reached, every peer has
/// responded, or the timeout elapses.
/// The local write is assumed to have already happened (it counts as 1 ack).
pub async fn replicate(
&self,
topic: &str,
partition: u32,
entries: Vec<Vec<u8>>,
) -> ReplicationResult {
let peers = self.membership.alive_peers().await;
let quorum = (self.config.replication_factor / 2) + 1;
// With no peers or a replication factor of 1, only the local write can count;
// quorum is reached only when a single ack is enough.
if peers.is_empty() || self.config.replication_factor <= 1 {
return ReplicationResult {
ack_count: 1,
quorum_reached: quorum <= 1,
errors: vec![],
};
}
// Send to all alive peers in parallel.
let (tx, mut rx) = tokio::sync::mpsc::channel::<Result<String, (String, String)>>(
peers.len(),
);
for peer in &peers {
let tx = tx.clone();
let peer = peer.clone();
let topic = topic.to_string();
let entries = entries.clone();
tokio::spawn(async move {
match replicate_to_peer(&peer, &topic, partition, entries).await {
Ok(()) => {
let _ = tx.send(Ok(peer.node_id.clone())).await;
}
Err(e) => {
let _ = tx
.send(Err((peer.node_id.clone(), e.to_string())))
.await;
}
}
});
}
drop(tx);
// Wait for acks with timeout.
let mut ack_count: u32 = 1; // Count local write.
let mut errors = Vec::new();
let deadline = tokio::time::Instant::now() + self.config.timeout;
loop {
if ack_count >= quorum {
break;
}
tokio::select! {
result = rx.recv() => {
match result {
Some(Ok(_node_id)) => {
ack_count += 1;
}
Some(Err((node_id, err))) => {
errors.push((node_id, err));
}
None => {
// Channel closed, all peers responded.
break;
}
}
}
_ = tokio::time::sleep_until(deadline) => {
tracing::warn!(
acks = ack_count,
quorum = quorum,
"replication timeout waiting for quorum"
);
break;
}
}
}
ReplicationResult {
ack_count,
quorum_reached: ack_count >= quorum,
errors,
}
}
}
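// Illustrative sketch (not part of this commit): a write path calling the replicator
// after the local WAL append. With the default replication factor of 3 the quorum is
// (3 / 2) + 1 = 2, so the local write plus a single peer ack suffices. The topic and
// partition values are hypothetical.
#[allow(dead_code)]
async fn example_write_path(replicator: &Replicator, entry: Vec<u8>) -> anyhow::Result<()> {
    // The local WAL append is assumed to have happened already; it is the first ack.
    let result = replicator.replicate("orders", 0, vec![entry]).await;
    if !result.quorum_reached {
        anyhow::bail!(
            "quorum not reached: {} acks, peer errors: {:?}",
            result.ack_count,
            result.errors
        );
    }
    Ok(())
}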
async fn replicate_to_peer(
peer: &MemberInfo,
topic: &str,
partition: u32,
entries: Vec<Vec<u8>>,
) -> anyhow::Result<()> {
let endpoint = format!("http://{}", peer.address);
let mut client = ClusterServiceClient::connect(endpoint).await?;
client
.replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
topic: topic.to_string(),
partition,
entries,
}))
.await?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::membership::MembershipConfig;
fn single_node_membership() -> Arc<Membership> {
Arc::new(Membership::new(MembershipConfig {
node_id: "node-1".to_string(),
address: "127.0.0.1:6060".to_string(),
..Default::default()
}))
}
#[tokio::test]
async fn test_single_node_replication() {
let membership = single_node_membership();
let replicator = Replicator::new(
membership,
ReplicationConfig {
replication_factor: 1,
..Default::default()
},
);
let result = replicator
.replicate("orders", 0, vec![b"entry-1".to_vec()])
.await;
assert_eq!(result.ack_count, 1);
assert!(result.quorum_reached);
assert!(result.errors.is_empty());
}
#[tokio::test]
async fn test_no_peers_available() {
let membership = single_node_membership();
let replicator = Replicator::new(
membership,
ReplicationConfig {
replication_factor: 3,
..Default::default()
},
);
let result = replicator
.replicate("orders", 0, vec![b"entry-1".to_vec()])
.await;
// Only local ack (1 out of 2 needed for quorum).
assert_eq!(result.ack_count, 1);
assert!(!result.quorum_reached);
}
#[tokio::test]
async fn test_unreachable_peers_timeout() {
let membership = single_node_membership();
// Add a peer that doesn't exist - it will fail to connect.
membership
.record_heartbeat("node-2", "127.0.0.1:19999")
.await;
let replicator = Replicator::new(
membership,
ReplicationConfig {
replication_factor: 3,
timeout: Duration::from_millis(500),
},
);
let result = replicator
.replicate("orders", 0, vec![b"entry-1".to_vec()])
.await;
// Should have errors from unreachable peer.
assert_eq!(result.ack_count, 1);
assert!(!result.quorum_reached);
}
}