crates/sq-server/tests/capnp_stress_test.rs (new file, 462 lines)
@@ -0,0 +1,462 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::{Duration, Instant};

use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
    cluster_service_server::ClusterServiceServer,
    control_plane_service_server::ControlPlaneServiceServer,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    GetStatusRequest, SubscribeRequest,
};
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_sdk::{
    Consumer, ConsumerConfig, GrpcProducer, GrpcProducerConfig, Producer, ProducerConfig,
    ProducerMessage,
};
use sq_server::capnp::CapnpServer;
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;

// ---------------------------------------------------------------------------
// Test harness: extends TestCluster to include a capnp server alongside gRPC
// ---------------------------------------------------------------------------

struct TestNode {
    grpc_addr: SocketAddr,
    capnp_addr: SocketAddr,
    cancel: CancellationToken,
    pipeline_cancel: CancellationToken,
    _temp_dir: TempDir,
    _server_handle: tokio::task::JoinHandle<()>,
    _capnp_handle: tokio::task::JoinHandle<()>,
}

impl TestNode {
    fn grpc_endpoint(&self) -> String {
        format!("http://{}", self.grpc_addr)
    }

    fn capnp_endpoint(&self) -> String {
        self.capnp_addr.to_string()
    }
}

struct TestCluster {
    nodes: Vec<TestNode>,
}

impl TestCluster {
    async fn start(n: usize) -> Self {
        let mut grpc_listeners = Vec::new();
        let mut capnp_listeners = Vec::new();
        let mut grpc_addrs = Vec::new();
        let mut capnp_addrs = Vec::new();

        for _ in 0..n {
            let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            grpc_addrs.push(grpc_listener.local_addr().unwrap());
            capnp_addrs.push(capnp_listener.local_addr().unwrap());
            grpc_listeners.push(grpc_listener);
            capnp_listeners.push(capnp_listener);
        }

        let mut nodes = Vec::new();
        for (i, (grpc_listener, capnp_listener)) in
            grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
        {
            let grpc_addr = grpc_addrs[i];
            let capnp_addr = capnp_addrs[i];
            let node_id = format!("capnp-stress-node-{}", i + 1);
            let temp_dir = TempDir::new().unwrap();

            let seeds: Vec<String> = grpc_addrs
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != i)
                .map(|(_, a)| a.to_string())
                .collect();

            let config = Config {
                node_id: node_id.clone(),
                data_dir: temp_dir.path().to_path_buf(),
                seeds: seeds.clone(),
                grpc_address: grpc_addr.to_string(),
                cluster_id: "test-cluster".to_string(),
                s3_bucket: None,
                s3_endpoint: None,
                s3_region: None,
                sync_policy: sq_models::SyncPolicy::EveryBatch,
            };

            let (state, mut pipeline) = State::new(config).unwrap();

            let pipeline_cancel = CancellationToken::new();
            let pipeline_cancel_clone = pipeline_cancel.clone();
            tokio::spawn(async move {
                tokio::select! {
                    () = pipeline.run() => {}
                    () = pipeline_cancel_clone.cancelled() => {}
                }
            });

            let membership = Arc::new(Membership::new(MembershipConfig {
                node_id: node_id.clone(),
                address: grpc_addr.to_string(),
                seeds,
                ..Default::default()
            }));

            let cancel = CancellationToken::new();

            // Spawn gRPC server.
            let cancel_clone = cancel.clone();
            let state_clone = state.clone();
            let membership_clone = membership.clone();
            let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
            let server_handle = tokio::spawn(async move {
                tonic::transport::Server::builder()
                    .add_service(StatusServiceServer::new(health::HealthServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(ControlPlaneServiceServer::new(
                        control_plane::ControlPlaneServer {
                            state: state_clone.clone(),
                        },
                    ))
                    .add_service(ClusterServiceServer::new(cluster::ClusterServer {
                        state: state_clone,
                        membership: membership_clone,
                    }))
                    .serve_with_incoming_shutdown(incoming, async move {
                        cancel_clone.cancelled().await;
                    })
                    .await
                    .unwrap();
            });

            // Spawn capnp server: use the CapnpServer component's run method directly.
            let cancel_clone = cancel.clone();
            let capnp_state = state.clone();
            let capnp_handle = tokio::spawn(async move {
                let server = CapnpServer {
                    host: capnp_addr,
                    state: capnp_state,
                };
                // We can't use the TcpListener we already bound because CapnpServer binds its own.
                // Instead, drop the listener and let CapnpServer rebind the port.
                drop(capnp_listener);
                let _ = notmad::Component::run(&server, cancel_clone).await;
            });

            nodes.push(TestNode {
                grpc_addr,
                capnp_addr,
                cancel,
                pipeline_cancel,
                _temp_dir: temp_dir,
                _server_handle: server_handle,
                _capnp_handle: capnp_handle,
            });
        }

        // Wait for gRPC to be ready.
        for node in &nodes {
            wait_for_ready(&node.grpc_endpoint()).await;
        }

        // Give the capnp server a moment to bind.
        tokio::time::sleep(Duration::from_millis(50)).await;

        TestCluster { nodes }
    }

    fn node(&self, index: usize) -> &TestNode {
        &self.nodes[index]
    }
}

impl Drop for TestCluster {
    fn drop(&mut self) {
        for node in &self.nodes {
            node.pipeline_cancel.cancel();
            node.cancel.cancel();
        }
    }
}
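
// Dropping the cluster cancels each node's write pipeline and servers. The
// spawned JoinHandles are never awaited, so teardown is best-effort.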

async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
            if client
                .status(tonic::Request::new(GetStatusRequest {}))
                .await
                .is_ok()
            {
                return;
            }
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}
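
// A sketch of an alternative readiness probe for the capnp port (an
// assumption, not what the harness above does; it sleeps 50ms instead):
// poll with a plain TCP connect until the listener is bound.
#[allow(dead_code)]
async fn wait_for_capnp_ready(addr: SocketAddr) {
    let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
    while tokio::net::TcpStream::connect(addr).await.is_err() {
        assert!(
            tokio::time::Instant::now() < deadline,
            "capnp server at {addr} did not become ready in time"
        );
        tokio::time::sleep(Duration::from_millis(10)).await;
    }
}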

// ---------------------------------------------------------------------------
// Capnp stress test 1: Single producer, 100K messages via capnp
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_single_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();

    let mut producer = Producer::connect(ProducerConfig {
        address: capnp_ep,
        ..Default::default()
    })
    .await
    .unwrap();

    let total = 100_000u64;
    let batch_size = 500;
    let payload = vec![0u8; 128];

    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("capnp-stress-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "capnp_stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back via gRPC subscribe (capnp subscribe is streaming-only).
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "capnp-stress-topic".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;
    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    assert_eq!(consumed, total, "expected all messages to be consumed");
}
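
// A minimal sketch of a shared batching helper (hypothetical, not part of
// sq_sdk) that could replace the repeated step_by loops in this file. It only
// assumes Producer::send_batch and ProducerMessage::new as used above.
#[allow(dead_code)]
async fn publish_in_batches(
    producer: &mut Producer,
    topic: &str,
    payload: &[u8],
    total: u64,
    batch_size: usize,
) {
    for batch_start in (0..total).step_by(batch_size) {
        // Clamp the final batch so exactly `total` messages are sent.
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new(topic, payload.to_vec()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }
}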

// ---------------------------------------------------------------------------
// Capnp stress test 2: Concurrent producers, 10 producers x 10K messages each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = capnp_ep.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let mut producer = Producer::connect(ProducerConfig {
                address: ep,
                producer_id: format!("capnp-producer-{p}"),
                ..Default::default()
            })
            .await
            .unwrap();

            let topic = format!("capnp-concurrent-{p}");
            for batch_start in (0..msgs_per_producer).step_by(100) {
                let batch_end = (batch_start + 100).min(msgs_per_producer);
                let batch: Vec<ProducerMessage> = (batch_start..batch_end)
                    .map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
                    .collect();
                producer.send_batch(batch).await.unwrap();
            }
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let duration = start.elapsed();
    let total = num_producers as u64 * msgs_per_producer;
    let msgs_per_sec = total as f64 / duration.as_secs_f64();

    eprintln!(
        "capnp_stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_producers,
        msgs_per_producer,
        total,
        duration.as_secs_f64(),
        msgs_per_sec
    );
}

// ---------------------------------------------------------------------------
// Capnp stress test 3: Subscribe via capnp, publish then consume
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_subscribe() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();
    let total = 10_000u64;
    let payload = vec![0u8; 64];

    // Publish via capnp.
    let mut producer = Producer::connect(ProducerConfig {
        address: capnp_ep.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("capnp-sub-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume via capnp.
    let mut consumer = Consumer::connect(ConsumerConfig {
        address: capnp_ep,
        topic: "capnp-sub-topic".to_string(),
        consumer_group: String::new(),
        auto_commit: false,
        start_offset: Some(0),
        max_poll_records: 1000,
        ..Default::default()
    })
    .await
    .unwrap();

    let mut consumed = 0u64;
    let start = Instant::now();

    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), consumer.poll()).await {
            Ok(Ok(msgs)) => consumed += msgs.len() as u64,
            _ => break,
        }
    }

    let consume_duration = start.elapsed();
    eprintln!(
        "capnp_stress_subscribe: consumed {} messages in {:.2}s ({:.0} msg/s)",
        consumed,
        consume_duration.as_secs_f64(),
        consumed as f64 / consume_duration.as_secs_f64()
    );

    assert_eq!(consumed, total, "expected all messages to be consumed");
}
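
// A consumer-side counterpart to the loop above (again a sketch, not an
// sq_sdk API): drain up to `total` messages, stopping once a poll sits idle
// for `idle`. The caller asserts on the returned count.
#[allow(dead_code)]
async fn drain_messages(consumer: &mut Consumer, total: u64, idle: Duration) -> u64 {
    let mut consumed = 0u64;
    while consumed < total {
        match tokio::time::timeout(idle, consumer.poll()).await {
            Ok(Ok(msgs)) => consumed += msgs.len() as u64,
            // Timeout or poll error: give up and report what we got.
            _ => break,
        }
    }
    consumed
}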

// ---------------------------------------------------------------------------
// Throughput comparison: gRPC vs capnp
// ---------------------------------------------------------------------------

async fn bench_grpc_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
    let endpoint = cluster.node(0).grpc_endpoint();
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint,
        ..Default::default()
    })
    .await
    .unwrap();

    let payload = vec![0u8; 128];
    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("bench-grpc", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    total as f64 / start.elapsed().as_secs_f64()
}

async fn bench_capnp_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
    let endpoint = cluster.node(0).capnp_endpoint();
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint,
        ..Default::default()
    })
    .await
    .unwrap();

    let payload = vec![0u8; 128];
    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("bench-capnp", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    total as f64 / start.elapsed().as_secs_f64()
}

#[tokio::test]
async fn capnp_vs_grpc_throughput() {
    let cluster = TestCluster::start(1).await;

    let grpc_rate = bench_grpc_publish(&cluster, 100_000, 500).await;
    let capnp_rate = bench_capnp_publish(&cluster, 100_000, 500).await;

    eprintln!("=== THROUGHPUT COMPARISON (single producer, 100K msgs x 128B) ===");
    eprintln!("gRPC: {:.0} msg/s", grpc_rate);
    eprintln!("capnp: {:.0} msg/s", capnp_rate);
    eprintln!("ratio: {:.2}x", capnp_rate / grpc_rate);
}
crates/sq-server/tests/cluster_test.rs (new file, 763 lines)
@@ -0,0 +1,763 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
    cluster_service_client::ClusterServiceClient,
    cluster_service_server::ClusterServiceServer,
    control_plane_service_client::ControlPlaneServiceClient,
    control_plane_service_server::ControlPlaneServiceServer,
    data_plane_service_client::DataPlaneServiceClient,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    ClusterNodeInfo, CreateTopicRequest, DeleteTopicRequest, DescribeTopicRequest,
    FetchSegmentRequest, GetStatusRequest, HeartbeatRequest, JoinRequest, ListTopicsRequest,
    ReplicateEntriesRequest, SubscribeRequest,
};
use sq_sdk::{GrpcConsumer, GrpcConsumerConfig, GrpcProducer, GrpcProducerConfig};
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;

// ---------------------------------------------------------------------------
// Test harness
// ---------------------------------------------------------------------------

struct TestNode {
    addr: SocketAddr,
    #[allow(dead_code)]
    node_id: String,
    #[allow(dead_code)]
    state: State,
    membership: Arc<Membership>,
    cancel: CancellationToken,
    pipeline_cancel: CancellationToken,
    _temp_dir: TempDir,
    _server_handle: tokio::task::JoinHandle<()>,
}

impl TestNode {
    fn endpoint(&self) -> String {
        format!("http://{}", self.addr)
    }
}

struct TestCluster {
    nodes: Vec<TestNode>,
}

impl TestCluster {
    /// Start a cluster of `n` real SQ server nodes on random ports.
    async fn start(n: usize) -> Self {
        // Phase 1: Bind all listeners to get ports before starting servers.
        let mut listeners = Vec::new();
        let mut addrs = Vec::new();

        for _ in 0..n {
            let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            let addr = listener.local_addr().unwrap();
            addrs.push(addr);
            listeners.push(listener);
        }

        // Phase 2: Start each node.
        let mut nodes = Vec::new();
        for (i, listener) in listeners.into_iter().enumerate() {
            let addr = addrs[i];
            let node_id = format!("node-{}", i + 1);
            let temp_dir = TempDir::new().unwrap();

            // Seeds: all addresses except our own.
            let seeds: Vec<String> = addrs
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != i)
                .map(|(_, a)| a.to_string())
                .collect();

            let config = Config {
                node_id: node_id.clone(),
                data_dir: temp_dir.path().to_path_buf(),
                seeds: seeds.clone(),
                grpc_address: addr.to_string(),
                cluster_id: "test-cluster".to_string(),
                s3_bucket: None,
                s3_endpoint: None,
                s3_region: None,
                sync_policy: sq_models::SyncPolicy::EveryBatch,
            };

            let (state, mut pipeline) = State::new(config).unwrap();

            // Spawn the write pipeline for this node.
            let pipeline_cancel = CancellationToken::new();
            let pipeline_cancel_clone = pipeline_cancel.clone();
            tokio::spawn(async move {
                tokio::select! {
                    () = pipeline.run() => {}
                    () = pipeline_cancel_clone.cancelled() => {}
                }
            });

            let membership = Arc::new(Membership::new(MembershipConfig {
                node_id: node_id.clone(),
                address: addr.to_string(),
                seeds,
                ..Default::default()
            }));

            let cancel = CancellationToken::new();
            let cancel_clone = cancel.clone();
            let state_clone = state.clone();
            let membership_clone = membership.clone();

            let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);

            let server_handle = tokio::spawn(async move {
                tonic::transport::Server::builder()
                    .add_service(StatusServiceServer::new(health::HealthServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(ControlPlaneServiceServer::new(
                        control_plane::ControlPlaneServer {
                            state: state_clone.clone(),
                        },
                    ))
                    .add_service(ClusterServiceServer::new(cluster::ClusterServer {
                        state: state_clone,
                        membership: membership_clone,
                    }))
                    .serve_with_incoming_shutdown(incoming, async move {
                        cancel_clone.cancelled().await;
                    })
                    .await
                    .unwrap();
            });

            nodes.push(TestNode {
                addr,
                node_id,
                state,
                membership,
                cancel,
                pipeline_cancel,
                _temp_dir: temp_dir,
                _server_handle: server_handle,
            });
        }

        // Phase 3: Wait for all servers to be ready.
        for node in &nodes {
            wait_for_ready(&node.endpoint()).await;
        }

        TestCluster { nodes }
    }

    fn node(&self, index: usize) -> &TestNode {
        &self.nodes[index]
    }
}

impl Drop for TestCluster {
    fn drop(&mut self) {
        for node in &self.nodes {
            node.pipeline_cancel.cancel();
            node.cancel.cancel();
        }
    }
}

/// Poll the Status RPC until the server responds, with a timeout.
async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
            if client
                .status(tonic::Request::new(GetStatusRequest {}))
                .await
                .is_ok()
            {
                return;
            }
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}

/// Collect messages from a subscribe stream with a timeout.
async fn collect_messages(
    endpoint: &str,
    topic: &str,
    start_offset: u64,
    expected_count: usize,
) -> Vec<sq_grpc_interface::ConsumedMessage> {
    let mut client = DataPlaneServiceClient::connect(endpoint.to_string())
        .await
        .unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: topic.to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(start_offset),
            max_batch_size: 200,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut messages = Vec::new();

    while messages.len() < expected_count {
        match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
            Ok(Some(Ok(batch))) => messages.extend(batch.messages),
            _ => break,
        }
    }

    messages
}
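
// Note: if the stream goes quiet for 5 seconds, collect_messages returns
// whatever has arrived so far; the tests below assert exact counts on the result.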

// ---------------------------------------------------------------------------
// Test 1: Single node, 1000 messages via SDK
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_single_node_publish_consume_1000() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Publish 1000 messages via SDK Producer.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..1000u64 {
        let result = producer
            .send("orders", None, format!("msg-{i}").as_bytes())
            .await
            .unwrap();
        assert_eq!(result.offset, i);
        assert_eq!(result.topic, "orders");
    }

    // Consume all 1000 via raw subscribe.
    let messages = collect_messages(&endpoint, "orders", 0, 1000).await;

    assert_eq!(messages.len(), 1000);
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("msg-{i}").as_bytes());
    }
}

// ---------------------------------------------------------------------------
// Test 2: Multi-topic isolation
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_multi_topic_isolation() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let topics = ["alpha", "beta", "gamma"];
    let counts: [usize; 3] = [50, 100, 25];

    // Publish to each topic.
    for (topic, count) in topics.iter().zip(counts.iter()) {
        for i in 0..*count {
            producer
                .send(topic, None, format!("{topic}-{i}").as_bytes())
                .await
                .unwrap();
        }
    }

    // Consume from each topic and verify isolation.
    for (topic, expected_count) in topics.iter().zip(counts.iter()) {
        let messages = collect_messages(&endpoint, topic, 0, *expected_count).await;

        assert_eq!(
            messages.len(),
            *expected_count,
            "topic {topic} expected {expected_count} messages, got {}",
            messages.len()
        );

        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("{topic}-{i}").as_bytes());
        }
    }
}

// ---------------------------------------------------------------------------
// Test 3: Consumer group offset resume
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_consumer_group_offset_resume() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Publish 20 messages.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..20u64 {
        producer
            .send("events", None, format!("msg-{i}").as_bytes())
            .await
            .unwrap();
    }

    // Consumer 1: consume with auto_commit, collecting at least 10 messages.
    {
        let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "test-group".to_string(),
            topic: "events".to_string(),
            auto_commit: true,
            ..Default::default()
        })
        .await
        .unwrap();

        let mut received = Vec::new();
        let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
        while received.len() < 10 && tokio::time::Instant::now() < deadline {
            let msgs = consumer.poll().await.unwrap();
            if msgs.is_empty() {
                tokio::time::sleep(Duration::from_millis(50)).await;
                continue;
            }
            received.extend(msgs);
        }
        assert!(
            received.len() >= 10,
            "expected at least 10 messages, got {}",
            received.len()
        );
    }

    // Consumer 2: reconnect with the same group; it should resume from the committed offset.
    {
        let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "test-group".to_string(),
            topic: "events".to_string(),
            auto_commit: false,
            ..Default::default()
        })
        .await
        .unwrap();

        let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
        let mut msgs = Vec::new();
        while msgs.is_empty() && tokio::time::Instant::now() < deadline {
            msgs = consumer.poll().await.unwrap();
            if msgs.is_empty() {
                tokio::time::sleep(Duration::from_millis(50)).await;
            }
        }
        assert!(!msgs.is_empty(), "expected messages from resumed consumer");
        // Should start from at least offset 9 (last committed by auto_commit).
        assert!(
            msgs[0].offset >= 9,
            "expected resume from offset >= 9, got {}",
            msgs[0].offset
        );
    }
}

// ---------------------------------------------------------------------------
// Test 4: Topic management CRUD
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_topic_management_crud() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    let mut client = ControlPlaneServiceClient::connect(endpoint.clone())
        .await
        .unwrap();

    // Create topic.
    let resp = client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "orders".to_string(),
            partitions: 4,
            replication_factor: 3,
        }))
        .await
        .unwrap();
    assert_eq!(resp.into_inner().name, "orders");

    // Duplicate should fail.
    let err = client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "orders".to_string(),
            partitions: 4,
            replication_factor: 3,
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::AlreadyExists);

    // Create another.
    client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "events".to_string(),
            partitions: 1,
            replication_factor: 1,
        }))
        .await
        .unwrap();

    // List topics.
    let resp = client
        .list_topics(tonic::Request::new(ListTopicsRequest {}))
        .await
        .unwrap();
    let topics = resp.into_inner().topics;
    assert_eq!(topics.len(), 2);
    let names: Vec<&str> = topics.iter().map(|t| t.name.as_str()).collect();
    assert!(names.contains(&"orders"));
    assert!(names.contains(&"events"));

    // Describe topic.
    let resp = client
        .describe_topic(tonic::Request::new(DescribeTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap()
        .into_inner();
    let topic = resp.topic.unwrap();
    assert_eq!(topic.name, "orders");
    assert_eq!(topic.partitions, 4);
    assert_eq!(topic.replication_factor, 3);
    assert_eq!(resp.partition_info.len(), 4);

    // Describe non-existent topic.
    let err = client
        .describe_topic(tonic::Request::new(DescribeTopicRequest {
            name: "nonexistent".to_string(),
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::NotFound);

    // Delete topic.
    client
        .delete_topic(tonic::Request::new(DeleteTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap();

    // Verify deleted.
    let resp = client
        .list_topics(tonic::Request::new(ListTopicsRequest {}))
        .await
        .unwrap();
    assert_eq!(resp.into_inner().topics.len(), 1);

    // Delete non-existent should fail.
    let err = client
        .delete_topic(tonic::Request::new(DeleteTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::NotFound);
}

// ---------------------------------------------------------------------------
// Test 5: Three-node join discovery
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_three_node_join_discovery() {
    let cluster = TestCluster::start(3).await;

    // Node-2 joins node-1.
    let mut client = ClusterServiceClient::connect(cluster.node(0).endpoint())
        .await
        .unwrap();
    let resp = client
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-2".to_string(),
            address: cluster.nodes[1].addr.to_string(),
        }))
        .await
        .unwrap();

    let members = resp.into_inner().members;
    assert!(
        members.len() >= 2,
        "after node-2 join, node-1 should know >= 2 members, got {}",
        members.len()
    );
    let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
    assert!(ids.contains(&"node-1"));
    assert!(ids.contains(&"node-2"));

    // Node-3 joins node-1.
    let resp = client
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-3".to_string(),
            address: cluster.nodes[2].addr.to_string(),
        }))
        .await
        .unwrap();

    let members = resp.into_inner().members;
    assert!(
        members.len() >= 3,
        "after node-3 join, node-1 should know >= 3 members, got {}",
        members.len()
    );
    let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
    assert!(ids.contains(&"node-1"));
    assert!(ids.contains(&"node-2"));
    assert!(ids.contains(&"node-3"));

    // Verify via membership handle.
    let all = cluster.node(0).membership.all_members().await;
    assert_eq!(all.len(), 3);
}

// ---------------------------------------------------------------------------
// Test 6: Cross-node heartbeat gossip
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_cross_node_heartbeat_gossip() {
    let cluster = TestCluster::start(3).await;

    // Node-2 and node-3 join node-1.
    let mut client1 = ClusterServiceClient::connect(cluster.node(0).endpoint())
        .await
        .unwrap();
    client1
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-2".to_string(),
            address: cluster.nodes[1].addr.to_string(),
        }))
        .await
        .unwrap();
    client1
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-3".to_string(),
            address: cluster.nodes[2].addr.to_string(),
        }))
        .await
        .unwrap();

    // Node-1 now knows about all 3. Send a heartbeat to node-2 carrying this info.
    let all_members = cluster.node(0).membership.all_members().await;
    let known: Vec<ClusterNodeInfo> = all_members
        .iter()
        .map(|m| ClusterNodeInfo {
            node_id: m.node_id.clone(),
            address: m.address.clone(),
            status: m.status.to_string(),
        })
        .collect();

    let mut client2 = ClusterServiceClient::connect(cluster.node(1).endpoint())
        .await
        .unwrap();
    let resp = client2
        .heartbeat(tonic::Request::new(HeartbeatRequest {
            node_id: "node-1".to_string(),
            known_members: known,
        }))
        .await
        .unwrap();

    // Node-2 should now know about all 3 nodes via gossip.
    let node2_members = resp.into_inner().members;
    assert!(
        node2_members.len() >= 3,
        "node-2 should know >= 3 members after gossip, got {}",
        node2_members.len()
    );
}

// ---------------------------------------------------------------------------
// Test 7: Cross-node replication via RPC
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_cross_node_replication_via_rpc() {
    let cluster = TestCluster::start(2).await;

    // Publish 10 messages to node-1 via SDK.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: cluster.node(0).endpoint(),
        ..Default::default()
    })
    .await
    .unwrap();

    let mut entry_data = Vec::new();
    for i in 0..10u64 {
        let value = format!("replicated-{i}");
        producer
            .send("repl-topic", None, value.as_bytes())
            .await
            .unwrap();
        entry_data.push(value.into_bytes());
    }

    // Replicate the same data to node-2 via ClusterService RPC.
    let mut cluster_client = ClusterServiceClient::connect(cluster.node(1).endpoint())
        .await
        .unwrap();
    let resp = cluster_client
        .replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
            topic: "repl-topic".to_string(),
            partition: 0,
            entries: entry_data,
        }))
        .await
        .unwrap();

    let last_offset = resp.into_inner().last_replicated_offset;
    assert_eq!(last_offset, 9);

    // Read from node-2 to verify the data is there.
    let messages = collect_messages(&cluster.node(1).endpoint(), "repl-topic", 0, 10).await;

    assert_eq!(messages.len(), 10);
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("replicated-{i}").as_bytes());
    }
}

// ---------------------------------------------------------------------------
// Test 8: FetchSegment recovery
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_fetch_segment_recovery() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Write 50 messages.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..50u64 {
        producer
            .send("recovery-topic", None, format!("data-{i}").as_bytes())
            .await
            .unwrap();
    }

    // Fetch via the FetchSegment stream.
    let mut client = ClusterServiceClient::connect(endpoint).await.unwrap();
    let response = client
        .fetch_segment(tonic::Request::new(FetchSegmentRequest {
            topic: "recovery-topic".to_string(),
            partition: 0,
            from_offset: 0,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_chunks = Vec::new();

    while let Ok(Some(Ok(resp))) =
        tokio::time::timeout(Duration::from_secs(5), stream.next()).await
    {
        all_chunks.extend(resp.chunk);
    }

    // Decode the wire format: offset(8 LE) + value_len(4 LE) + value.
    let mut cursor = 0;
    let mut decoded = Vec::new();
    while cursor + 12 <= all_chunks.len() {
        let offset = u64::from_le_bytes(all_chunks[cursor..cursor + 8].try_into().unwrap());
        let value_len =
            u32::from_le_bytes(all_chunks[cursor + 8..cursor + 12].try_into().unwrap()) as usize;
        cursor += 12;
        assert!(cursor + value_len <= all_chunks.len());
        let value = all_chunks[cursor..cursor + value_len].to_vec();
        cursor += value_len;
        decoded.push((offset, value));
    }

    assert_eq!(decoded.len(), 50);
    for (i, (offset, value)) in decoded.iter().enumerate() {
        assert_eq!(*offset, i as u64);
        assert_eq!(value, format!("data-{i}").as_bytes());
    }
}
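
// For illustration, the inverse of the decode loop above (a hypothetical
// helper, not part of the server): frame a single entry in the FetchSegment
// wire format, offset(8 LE) + value_len(4 LE) + value.
#[allow(dead_code)]
fn encode_segment_entry(offset: u64, value: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(12 + value.len());
    buf.extend_from_slice(&offset.to_le_bytes());
    buf.extend_from_slice(&(value.len() as u32).to_le_bytes());
    buf.extend_from_slice(value);
    buf
}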

// ---------------------------------------------------------------------------
// Test 9: Node status returns correct id
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_node_status_returns_correct_id() {
    let cluster = TestCluster::start(3).await;

    for (i, node) in cluster.nodes.iter().enumerate() {
        let mut client = StatusServiceClient::connect(node.endpoint()).await.unwrap();
        let resp = client
            .status(tonic::Request::new(GetStatusRequest {}))
            .await
            .unwrap();
        let expected = format!("node-{}", i + 1);
        assert_eq!(
            resp.into_inner().node_id,
            expected,
            "node at index {} should have id '{}'",
            i,
            expected
        );
    }
}
crates/sq-server/tests/data_plane_test.rs (new file, 496 lines)
@@ -0,0 +1,496 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;

use sq_grpc_interface::{
    data_plane_service_client::DataPlaneServiceClient,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    AckMode, GetStatusRequest, MessageHeader, PublishMessage, PublishRequest, PublishSettings,
    SubscribeRequest,
};
use sq_sim::fs::InMemoryFileSystem;
use sq_sim::SimClock;
use sq_storage::engine::StorageEngine;
use tokio::sync::Mutex;
use tokio_stream::StreamExt;

/// A lightweight test harness that starts a gRPC server on a random port
/// and exposes its address plus a shutdown guard.
struct TestServer {
    addr: SocketAddr,
    _shutdown: tokio::sync::oneshot::Sender<()>,
}
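
// Dropping TestServer drops `_shutdown`; the closed oneshot channel resolves
// the future passed to serve_with_incoming_shutdown, stopping the server.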

impl TestServer {
    async fn start() -> Self {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = sq_models::WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        let engine = StorageEngine::new(fs, clock, config).unwrap();
        engine.recover().unwrap();

        let engine = Arc::new(Mutex::new(engine));

        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();

        let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();

        // Build minimal service implementations inline for tests.
        let node_id = "test-node".to_string();

        struct TestHealthServer {
            node_id: String,
        }

        #[tonic::async_trait]
        impl sq_grpc_interface::status_service_server::StatusService for TestHealthServer {
            async fn status(
                &self,
                _request: tonic::Request<GetStatusRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::GetStatusResponse>, tonic::Status> {
                Ok(tonic::Response::new(sq_grpc_interface::GetStatusResponse {
                    node_id: self.node_id.clone(),
                    cluster: None,
                }))
            }
        }

        struct TestDataPlaneServer {
            engine: Arc<Mutex<StorageEngine<InMemoryFileSystem, SimClock>>>,
        }

        #[tonic::async_trait]
        impl sq_grpc_interface::data_plane_service_server::DataPlaneService
            for TestDataPlaneServer
        {
            async fn publish(
                &self,
                request: tonic::Request<PublishRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::PublishResponse>, tonic::Status> {
                let req = request.into_inner();

                if req.messages.is_empty() {
                    return Err(tonic::Status::invalid_argument(
                        "messages must not be empty",
                    ));
                }

                let mut results = Vec::new();
                let engine = self.engine.lock().await;

                for msg in &req.messages {
                    if msg.topic.is_empty() {
                        return Err(tonic::Status::invalid_argument("topic must not be empty"));
                    }

                    let headers: Vec<sq_models::Header> = msg
                        .headers
                        .iter()
                        .map(|h| sq_models::Header {
                            key: h.key.clone(),
                            value: h.value.clone(),
                        })
                        .collect();

                    let key = if msg.key.is_empty() {
                        None
                    } else {
                        Some(msg.key.as_slice())
                    };

                    let offset = engine
                        .append(&msg.topic, 0, key, &msg.value, &headers, 0)
                        .map_err(|e| tonic::Status::internal(e.to_string()))?;

                    results.push(sq_grpc_interface::PublishResult {
                        topic: msg.topic.clone(),
                        partition: 0,
                        offset,
                    });
                }

                Ok(tonic::Response::new(sq_grpc_interface::PublishResponse {
                    results,
                }))
            }

            type SubscribeStream = std::pin::Pin<
                Box<
                    dyn tokio_stream::Stream<
                            Item = Result<sq_grpc_interface::SubscribeResponse, tonic::Status>,
                        > + Send
                        + 'static,
                >,
            >;

            async fn subscribe(
                &self,
                request: tonic::Request<SubscribeRequest>,
            ) -> Result<tonic::Response<Self::SubscribeStream>, tonic::Status> {
                let req = request.into_inner();
                let batch_size = if req.max_batch_size == 0 {
                    100
                } else {
                    req.max_batch_size as usize
                };
                let start_offset = req.start_offset.unwrap_or(0);
                let topic = req.topic.clone();
                let partition = req.partition;
                let engine = self.engine.clone();

                let stream = async_stream::try_stream! {
                    let mut current_offset = start_offset;
                    let mut empty_polls = 0u32;

                    loop {
                        let messages = {
                            let eng = engine.lock().await;
                            eng.read(&topic, partition, current_offset, batch_size)
                                .map_err(|e| tonic::Status::internal(e.to_string()))?
                        };

                        if messages.is_empty() {
                            empty_polls += 1;
                            // In tests, stop after a few empty polls to avoid hanging.
                            if empty_polls > 3 {
                                break;
                            }
                            tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
                            continue;
                        }

                        empty_polls = 0;

                        let consumed: Vec<sq_grpc_interface::ConsumedMessage> = messages
                            .iter()
                            .map(|m| {
                                current_offset = m.offset + 1;
                                sq_grpc_interface::ConsumedMessage {
                                    offset: m.offset,
                                    topic: m.topic.to_string(),
                                    partition: m.partition,
                                    key: m.key.clone().unwrap_or_default(),
                                    value: m.value.clone(),
                                    headers: m
                                        .headers
                                        .iter()
                                        .map(|h| MessageHeader {
                                            key: h.key.clone(),
                                            value: h.value.clone(),
                                        })
                                        .collect(),
                                    timestamp_ms: m.timestamp_ms,
                                }
                            })
                            .collect();

                        yield sq_grpc_interface::SubscribeResponse { messages: consumed };
                    }
                };

                Ok(tonic::Response::new(Box::pin(stream)))
            }

            async fn ack(
                &self,
                _request: tonic::Request<sq_grpc_interface::AckRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::AckResponse>, tonic::Status> {
                Ok(tonic::Response::new(sq_grpc_interface::AckResponse {}))
            }
        }

        let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);

        tokio::spawn(async move {
            tonic::transport::Server::builder()
                .add_service(StatusServiceServer::new(TestHealthServer {
                    node_id: node_id.clone(),
                }))
                .add_service(DataPlaneServiceServer::new(TestDataPlaneServer {
                    engine,
                }))
                .serve_with_incoming_shutdown(incoming, async {
                    let _ = shutdown_rx.await;
                })
                .await
                .unwrap();
        });

        // Give the server a moment to start.
        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;

        TestServer {
            addr,
            _shutdown: shutdown_tx,
        }
    }

    fn endpoint(&self) -> String {
        format!("http://{}", self.addr)
    }
}

#[tokio::test]
async fn test_status_rpc() {
    let server = TestServer::start().await;
    let mut client = StatusServiceClient::connect(server.endpoint()).await.unwrap();

    let response = client
        .status(tonic::Request::new(GetStatusRequest {}))
        .await
        .unwrap();

    assert_eq!(response.into_inner().node_id, "test-node");
}

#[tokio::test]
async fn test_publish_single_message() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "orders".to_string(),
                key: vec![],
                value: b"hello world".to_vec(),
                headers: vec![],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].topic, "orders");
    assert_eq!(results[0].offset, 0);
}

#[tokio::test]
async fn test_publish_batch_sequential_offsets() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let messages: Vec<PublishMessage> = (0..100)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: Some(PublishSettings {
                ack_mode: AckMode::All.into(),
            }),
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 100);
    for (i, r) in results.iter().enumerate() {
        assert_eq!(r.offset, i as u64);
        assert_eq!(r.topic, "events");
    }
}

#[tokio::test]
async fn test_publish_empty_topic_returns_error() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let err = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "".to_string(),
                key: vec![],
                value: b"data".to_vec(),
                headers: vec![],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap_err();

    assert_eq!(err.code(), tonic::Code::InvalidArgument);
}

#[tokio::test]
async fn test_publish_empty_messages_returns_error() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let err = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap_err();

    assert_eq!(err.code(), tonic::Code::InvalidArgument);
}

#[tokio::test]
async fn test_publish_with_key_and_headers() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "orders".to_string(),
                key: b"order-123".to_vec(),
                value: b"payload".to_vec(),
                headers: vec![MessageHeader {
                    key: "trace-id".to_string(),
                    value: b"abc-123".to_vec(),
                }],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].offset, 0);
}

#[tokio::test]
async fn test_subscribe_from_beginning() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    // Publish 10 messages first.
    let messages: Vec<PublishMessage> = (0..10)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    // Subscribe from offset 0.
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "events".to_string(),
            partition: 0,
            consumer_group: "".to_string(),
            start_offset: Some(0),
            max_batch_size: 100,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_messages = Vec::new();

    while let Some(Ok(batch)) = stream.next().await {
        all_messages.extend(batch.messages);
        if all_messages.len() >= 10 {
            break;
        }
    }

    assert_eq!(all_messages.len(), 10);
    for (i, msg) in all_messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("msg-{i}").as_bytes());
        assert_eq!(msg.topic, "events");
    }
}

#[tokio::test]
async fn test_subscribe_from_middle() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    // Publish 10 messages.
    let messages: Vec<PublishMessage> = (0..10)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    // Subscribe from offset 5.
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "events".to_string(),
            partition: 0,
            consumer_group: "".to_string(),
            start_offset: Some(5),
            max_batch_size: 100,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_messages = Vec::new();

    while let Some(Ok(batch)) = stream.next().await {
        all_messages.extend(batch.messages);
        if all_messages.len() >= 5 {
            break;
        }
    }

    assert_eq!(all_messages.len(), 5);
    assert_eq!(all_messages[0].offset, 5);
    assert_eq!(all_messages[4].offset, 9);
}
crates/sq-server/tests/stress_test.rs (new file, 965 lines)
@@ -0,0 +1,965 @@
|
||||
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::{Duration, Instant};

use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
    cluster_service_server::ClusterServiceServer,
    control_plane_service_server::ControlPlaneServiceServer,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    GetStatusRequest, SubscribeRequest,
};
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_sdk::{
    BatchProducer, BatchProducerConfig, Consumer, ConsumerConfig, Producer, ProducerConfig,
    ProducerMessage,
};
use sq_server::capnp::CapnpServer;
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;

// ---------------------------------------------------------------------------
// Test harness (shared with cluster_test.rs, inlined here for simplicity)
// ---------------------------------------------------------------------------

struct TestNode {
    grpc_addr: SocketAddr,
    capnp_addr: SocketAddr,
    cancel: CancellationToken,
    pipeline_cancel: CancellationToken,
    _temp_dir: TempDir,
    _server_handle: tokio::task::JoinHandle<()>,
    _capnp_handle: tokio::task::JoinHandle<()>,
}

impl TestNode {
    /// Cap'n Proto endpoint (default data plane).
    fn endpoint(&self) -> String {
        self.capnp_addr.to_string()
    }

    /// gRPC endpoint (health checks, subscribe verification).
    fn grpc_endpoint(&self) -> String {
        format!("http://{}", self.grpc_addr)
    }
}

struct TestCluster {
    nodes: Vec<TestNode>,
}

impl TestCluster {
    async fn start(n: usize) -> Self {
        let mut grpc_listeners = Vec::new();
        let mut capnp_listeners = Vec::new();
        let mut grpc_addrs = Vec::new();
        let mut capnp_addrs = Vec::new();

        for _ in 0..n {
            let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            grpc_addrs.push(grpc_listener.local_addr().unwrap());
            capnp_addrs.push(capnp_listener.local_addr().unwrap());
            grpc_listeners.push(grpc_listener);
            capnp_listeners.push(capnp_listener);
        }

        let mut nodes = Vec::new();
        for (i, (grpc_listener, capnp_listener)) in
            grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
        {
            let grpc_addr = grpc_addrs[i];
            let capnp_addr = capnp_addrs[i];
            let node_id = format!("stress-node-{}", i + 1);
            let temp_dir = TempDir::new().unwrap();

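            // Seed each node with every other node's gRPC address (full mesh).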
            let seeds: Vec<String> = grpc_addrs
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != i)
                .map(|(_, a)| a.to_string())
                .collect();

            let config = Config {
                node_id: node_id.clone(),
                data_dir: temp_dir.path().to_path_buf(),
                seeds: seeds.clone(),
                grpc_address: grpc_addr.to_string(),
                cluster_id: "test-cluster".to_string(),
                s3_bucket: None,
                s3_endpoint: None,
                s3_region: None,
                sync_policy: sq_models::SyncPolicy::EveryBatch,
            };

            let (state, mut pipeline) = State::new(config).unwrap();

            let pipeline_cancel = CancellationToken::new();
            let pipeline_cancel_clone = pipeline_cancel.clone();
            tokio::spawn(async move {
                tokio::select! {
                    () = pipeline.run() => {}
                    () = pipeline_cancel_clone.cancelled() => {}
                }
            });

            let membership = Arc::new(Membership::new(MembershipConfig {
                node_id: node_id.clone(),
                address: grpc_addr.to_string(),
                seeds,
                ..Default::default()
            }));

            let cancel = CancellationToken::new();

            // Spawn gRPC server.
            let cancel_clone = cancel.clone();
            let state_clone = state.clone();
            let membership_clone = membership.clone();
            let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
            let server_handle = tokio::spawn(async move {
                tonic::transport::Server::builder()
                    .add_service(StatusServiceServer::new(health::HealthServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(ControlPlaneServiceServer::new(
                        control_plane::ControlPlaneServer {
                            state: state_clone.clone(),
                        },
                    ))
                    .add_service(ClusterServiceServer::new(cluster::ClusterServer {
                        state: state_clone,
                        membership: membership_clone,
                    }))
                    .serve_with_incoming_shutdown(incoming, async move {
                        cancel_clone.cancelled().await;
                    })
                    .await
                    .unwrap();
            });

            // Spawn capnp server.
            let cancel_clone = cancel.clone();
            let capnp_state = state.clone();
            let capnp_handle = tokio::spawn(async move {
                let server = CapnpServer {
                    host: capnp_addr,
                    state: capnp_state,
                };
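                // The listener only reserved this ephemeral port; drop it so
                // the CapnpServer can bind the same address itself.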
                drop(capnp_listener);
                let _ = notmad::Component::run(&server, cancel_clone).await;
            });

            nodes.push(TestNode {
                grpc_addr,
                capnp_addr,
                cancel,
                pipeline_cancel,
                _temp_dir: temp_dir,
                _server_handle: server_handle,
                _capnp_handle: capnp_handle,
            });
        }

        for node in &nodes {
            wait_for_ready(&node.grpc_endpoint()).await;
        }
        // Give capnp server a moment to bind.
        tokio::time::sleep(Duration::from_millis(50)).await;

        TestCluster { nodes }
    }

    fn node(&self, index: usize) -> &TestNode {
        &self.nodes[index]
    }
}

impl Drop for TestCluster {
    fn drop(&mut self) {
        for node in &self.nodes {
            node.pipeline_cancel.cancel();
            node.cancel.cancel();
        }
    }
}

async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
            if client
                .status(tonic::Request::new(GetStatusRequest {}))
                .await
                .is_ok()
            {
                return;
            }
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}

// ---------------------------------------------------------------------------
// Stress test 1: High-volume publish — 100K messages from a single producer
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_single_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let total = 100_000u64;
    let batch_size = 500;
    let payload = vec![0u8; 128]; // 128-byte messages

    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("stress-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back all messages via gRPC subscribe.
    let mut client = DataPlaneServiceClient::connect(grpc_ep)
        .await
        .unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "stress-topic".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;
    let consume_start = Instant::now();

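    // A timeout, stream end, or stream error all terminate the read loop;
    // the assertion below then reports any shortfall.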
    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    let consume_duration = consume_start.elapsed();
    let consume_per_sec = consumed as f64 / consume_duration.as_secs_f64();

    eprintln!(
        "stress_single_producer_100k: consumed {} messages in {:.2}s ({:.0} msg/s)",
        consumed,
        consume_duration.as_secs_f64(),
        consume_per_sec
    );

    assert_eq!(consumed, total, "expected all messages to be consumed");
}

// ---------------------------------------------------------------------------
// Stress test 2: Concurrent producers — 10 producers, 10K messages each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = endpoint.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let mut producer = Producer::connect(ProducerConfig {
                address: ep,
                producer_id: format!("producer-{p}"),
                ..Default::default()
            })
            .await
            .unwrap();

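            // Each producer writes to its own topic, so per-producer counts
            // can be verified independently afterwards.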
let topic = format!("concurrent-topic-{p}");
|
||||
for batch_start in (0..msgs_per_producer).step_by(100) {
|
||||
let batch_end = (batch_start + 100).min(msgs_per_producer);
|
||||
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
|
||||
.map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
|
||||
.collect();
|
||||
producer.send_batch(batch).await.unwrap();
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
for handle in handles {
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
let total = num_producers as u64 * msgs_per_producer;
|
||||
let msgs_per_sec = total as f64 / duration.as_secs_f64();
|
||||
|
||||
eprintln!(
|
||||
"stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
|
||||
num_producers,
|
||||
msgs_per_producer,
|
||||
total,
|
||||
duration.as_secs_f64(),
|
||||
msgs_per_sec
|
||||
);
|
||||
|
||||
// Verify each topic has the right count via gRPC.
|
||||
for p in 0..num_producers {
|
||||
let topic = format!("concurrent-topic-{p}");
|
||||
let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let response = client
|
||||
.subscribe(tonic::Request::new(SubscribeRequest {
|
||||
topic: topic.clone(),
|
||||
partition: 0,
|
||||
consumer_group: String::new(),
|
||||
start_offset: Some(0),
|
||||
max_batch_size: 1000,
|
||||
}))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut stream = response.into_inner();
|
||||
let mut count = 0u64;
|
||||
while count < msgs_per_producer {
|
||||
match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
|
||||
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
|
||||
_ => break,
|
||||
}
|
||||
}
|
||||
assert_eq!(
|
||||
count, msgs_per_producer,
|
||||
"topic {topic} expected {msgs_per_producer} messages, got {count}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Stress test 3: Concurrent consumers — publish then read in parallel
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_concurrent_consumers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let total = 50_000u64;
    let payload = vec![0u8; 64];

    // Pre-publish messages.
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("consume-stress", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume in parallel from 5 independent consumers via gRPC (no consumer
    // group — each reads all).
    let num_consumers = 5;
    let start = Instant::now();

    let mut handles = Vec::new();
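    // Each consumer task opens its own gRPC connection and subscription.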
    for _ in 0..num_consumers {
        let ep = grpc_ep.clone();
        handles.push(tokio::spawn(async move {
            let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
            let response = client
                .subscribe(tonic::Request::new(SubscribeRequest {
                    topic: "consume-stress".to_string(),
                    partition: 0,
                    consumer_group: String::new(),
                    start_offset: Some(0),
                    max_batch_size: 1000,
                }))
                .await
                .unwrap();

            let mut stream = response.into_inner();
            let mut count = 0u64;
            while count < total {
                match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
                    Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                    _ => break,
                }
            }
            count
        }));
    }

    for handle in handles {
        let count = handle.await.unwrap();
        assert_eq!(count, total, "each consumer should read all {total} messages");
    }

    let duration = start.elapsed();
    eprintln!(
        "stress_concurrent_consumers: {} consumers each read {} msgs in {:.2}s",
        num_consumers,
        total,
        duration.as_secs_f64()
    );
}

// ---------------------------------------------------------------------------
// Stress test 4: Sustained load — publish+consume simultaneously over time
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_sustained_load() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let sustain_duration = Duration::from_secs(3);
    let payload = vec![0u8; 256];

    let ep = endpoint.clone();
    let pl = payload.clone();

    // Producer: publish as fast as possible for the sustained duration.
    let producer_handle = tokio::spawn(async move {
        let mut producer = Producer::connect(ProducerConfig {
            address: ep,
            ..Default::default()
        })
        .await
        .unwrap();

        let start = Instant::now();
        let mut total = 0u64;
        while start.elapsed() < sustain_duration {
            let batch: Vec<ProducerMessage> = (0..100)
                .map(|_| ProducerMessage::new("sustained-topic", pl.clone()))
                .collect();
            producer.send_batch(batch).await.unwrap();
            total += 100;
        }
        (total, start.elapsed())
    });

    // Give producer a head start.
    tokio::time::sleep(Duration::from_millis(100)).await;

    // Consumer: read as fast as possible via gRPC subscribe.
    let ep = grpc_ep.clone();
    let consumer_handle = tokio::spawn(async move {
        let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: "sustained-topic".to_string(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        let start = Instant::now();

        // Read for longer than the producer runs to drain everything.
        let read_deadline = sustain_duration + Duration::from_secs(5);
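        // The 2s idle timeout bounds how long we keep waiting once the
        // producer stops: if nothing arrives for 2s, the log is assumed drained.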
        while start.elapsed() < read_deadline {
            match tokio::time::timeout(Duration::from_secs(2), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        count
    });

    let (published, pub_duration) = producer_handle.await.unwrap();
    let consumed = consumer_handle.await.unwrap();

    let pub_rate = published as f64 / pub_duration.as_secs_f64();
    let throughput_mb =
        (published as f64 * 256.0) / (1024.0 * 1024.0) / pub_duration.as_secs_f64();

    eprintln!(
        "stress_sustained_load: published {} in {:.2}s ({:.0} msg/s, {:.1} MB/s), consumed {}",
        published,
        pub_duration.as_secs_f64(),
        pub_rate,
        throughput_mb,
        consumed
    );

    assert!(
        published > 0,
        "should have published messages during sustained load"
    );
    assert_eq!(
        consumed, published,
        "consumer should eventually read all published messages"
    );
}

// ---------------------------------------------------------------------------
// Stress test 5: Multi-topic fan-out — publish to many topics simultaneously
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_multi_topic_fanout() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let num_topics = 50;
    let msgs_per_topic = 1_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    // Publish to many topics in round-robin batches.
    for batch_start in (0..msgs_per_topic).step_by(100) {
        let batch_end = (batch_start + 100).min(msgs_per_topic);
        for t in 0..num_topics {
            let topic = format!("fanout-{t}");
            let batch: Vec<ProducerMessage> = (batch_start..batch_end)
                .map(|_| ProducerMessage::new(topic.clone(), payload.clone()))
                .collect();
            producer.send_batch(batch).await.unwrap();
        }
    }

    let duration = start.elapsed();
    let total = num_topics as u64 * msgs_per_topic;
    eprintln!(
        "stress_multi_topic_fanout: {} topics x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_topics,
        msgs_per_topic,
        total,
        duration.as_secs_f64(),
        total as f64 / duration.as_secs_f64()
    );

    // Spot-check a few topics via gRPC.
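    // First, middle, and last topic keep the check cheap but representative.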
    for t in [0, num_topics / 2, num_topics - 1] {
        let topic = format!("fanout-{t}");
        let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
            .await
            .unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: topic.clone(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        while count < msgs_per_topic {
            match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        assert_eq!(
            count, msgs_per_topic,
            "topic {topic} expected {msgs_per_topic} messages, got {count}"
        );
    }
}

// ---------------------------------------------------------------------------
// Stress test 6: Large message bodies — 10K messages with 4KB payloads
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_large_messages() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let total = 10_000u64;
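    // The constant 0xAB fill makes truncation or corruption easy to spot on
    // read-back.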
    let payload = vec![0xABu8; 4096]; // 4KB messages

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let start = Instant::now();

    for batch_start in (0..total).step_by(50) {
        let batch_end = (batch_start + 50).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("large-msgs", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let pub_duration = start.elapsed();
    let data_mb = (total as f64 * 4096.0) / (1024.0 * 1024.0);
    eprintln!(
        "stress_large_messages: published {} x 4KB = {:.1}MB in {:.2}s ({:.1} MB/s)",
        total,
        data_mb,
        pub_duration.as_secs_f64(),
        data_mb / pub_duration.as_secs_f64()
    );

    // Verify all data reads back correctly via gRPC.
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "large-msgs".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 200,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut count = 0u64;
    while count < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => {
                for msg in &batch.messages {
                    assert_eq!(msg.value.len(), 4096, "message body should be 4KB");
                    assert!(msg.value.iter().all(|&b| b == 0xAB), "data integrity check");
                }
                count += batch.messages.len() as u64;
            }
            _ => break,
        }
    }

    assert_eq!(count, total, "all large messages should be consumed");
}

// ---------------------------------------------------------------------------
// Stress test 7: Consumer group offset tracking under load
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_consumer_group_resume() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let total = 10_000u64;
    let payload = vec![0u8; 32];

    // Publish all messages.
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("cg-stress", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume first half with auto-commit.
    let half = total / 2;
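    // The consumer is dropped at the end of this block; its committed offsets
    // must survive server-side for the resume below to work.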
    {
        let mut consumer = Consumer::connect(ConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "stress-group".to_string(),
            topic: "cg-stress".to_string(),
            auto_commit: true,
            max_poll_records: 500,
            ..Default::default()
        })
        .await
        .unwrap();

        let mut consumed = 0u64;
        while consumed < half {
            let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
                .await
                .unwrap()
                .unwrap();
            consumed += msgs.len() as u64;
        }
        assert!(consumed >= half, "should have consumed at least half");
    }

    // Reconnect — should resume from the committed offset.
    {
        let mut consumer = Consumer::connect(ConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "stress-group".to_string(),
            topic: "cg-stress".to_string(),
            auto_commit: true,
            max_poll_records: 500,
            ..Default::default()
        })
        .await
        .unwrap();

        let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
            .await
            .unwrap()
            .unwrap();

        // The first message after reconnect should land near the halfway
        // point (batch commits may re-deliver a little).
        assert!(
            !msgs.is_empty(),
            "should receive messages after resume"
        );
        let first_offset = msgs[0].offset;
        assert!(
            first_offset >= half - 500, // Allow some re-delivery due to batch commit
            "first offset after resume should be near {half}, got {first_offset}"
        );
    }
}

// ---------------------------------------------------------------------------
// Stress test 8: BatchProducer — 100K messages from a single batching producer
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_batch_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let producer = BatchProducer::connect(BatchProducerConfig {
        address: endpoint.clone(),
        max_batch_size: 1000,
        flush_interval_ms: 5,
        channel_capacity: 20_000,
        ..Default::default()
    })
    .await
    .unwrap();

    let producer = Arc::new(producer);
    let total = 100_000u64;
    let payload = vec![0u8; 128];

    let start = Instant::now();

    // Spawn a task per message to fully saturate the batch pipeline.
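    // The bounded channel (`channel_capacity`) should apply backpressure to
    // the concurrent senders rather than buffering all 100K messages at once.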
    let mut handles = Vec::with_capacity(total as usize);
    for _ in 0..total {
        let p = producer.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            p.send(ProducerMessage::new("batch-stress", pl))
                .await
                .unwrap();
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "stress_batch_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back all messages via gRPC.
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "batch-stress".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;

    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    assert_eq!(consumed, total, "expected all messages to be consumed");

    // Close the producer (flushes remaining).
    Arc::try_unwrap(producer).ok().unwrap().close().await;
}

// ---------------------------------------------------------------------------
// Stress test 9: BatchProducer concurrent — 10 batching producers, 10K each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_batch_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = endpoint.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let producer = Arc::new(
                BatchProducer::connect(BatchProducerConfig {
                    address: ep,
                    producer_id: format!("batch-producer-{p}"),
                    max_batch_size: 500,
                    flush_interval_ms: 5,
                    ..Default::default()
                })
                .await
                .unwrap(),
            );

            let topic = format!("batch-concurrent-{p}");
            let mut send_handles = Vec::new();

            // Fire all sends concurrently within each producer.
            for _ in 0..msgs_per_producer {
                let p = producer.clone();
                let t = topic.clone();
                let pl = pl.clone();
                send_handles.push(tokio::spawn(async move {
                    p.send(ProducerMessage::new(t, pl)).await.unwrap();
                }));
            }

            // Await all acks.
            for handle in send_handles {
                handle.await.unwrap();
            }

            Arc::try_unwrap(producer).ok().unwrap().close().await;
        }));
    }

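    // Each task closes its producer before returning, so `duration` below
    // includes the final flushes.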
    for handle in handles {
        handle.await.unwrap();
    }

    let duration = start.elapsed();
    let total = num_producers as u64 * msgs_per_producer;
    let msgs_per_sec = total as f64 / duration.as_secs_f64();

    eprintln!(
        "stress_batch_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_producers,
        msgs_per_producer,
        total,
        duration.as_secs_f64(),
        msgs_per_sec
    );

    // Verify each topic has the right count via gRPC.
    for p in 0..num_producers {
        let topic = format!("batch-concurrent-{p}");
        let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
            .await
            .unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: topic.clone(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        while count < msgs_per_producer {
            match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        assert_eq!(
            count, msgs_per_producer,
            "topic {topic} expected {msgs_per_producer} messages, got {count}"
        );
    }
}