feat: add capnp

Signed-off-by: kjuulh <contact@kjuulh.io>
Date: 2026-02-27 12:15:35 +01:00
parent 3162971c89
commit 749ae245c7
115 changed files with 16596 additions and 31 deletions


@@ -4,12 +4,17 @@ version.workspace = true
edition.workspace = true
[dependencies]
sq-capnp-interface = { workspace = true }
sq-grpc-interface = { workspace = true }
sq-models = { workspace = true }
sq-storage = { workspace = true }
sq-cluster = { workspace = true }
sq-sim = { workspace = true }
capnp = { workspace = true }
bytes = { workspace = true }
futures = { workspace = true }
anyhow = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
@@ -24,3 +29,15 @@ axum = { workspace = true }
tower = { workspace = true }
tower-http = { workspace = true }
http = { workspace = true }
tokio-util = { workspace = true }
tokio-stream = { workspace = true }
async-stream = { workspace = true }
tracing-opentelemetry = { workspace = true }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true }
opentelemetry-otlp = { workspace = true }
opentelemetry-semantic-conventions = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }
sq-sdk = { workspace = true }


@@ -0,0 +1,50 @@
use sq_capnp_interface::codec::{self, Frame, OP_ACK_RES};
use sq_capnp_interface::data_plane_capnp;
use crate::metrics;
use crate::state::State;
pub async fn handle(state: &State, payload: &[u8]) -> Frame {
match handle_inner(state, payload) {
Ok(frame) => frame,
Err(e) => codec::error_frame(&e),
}
}
fn handle_inner(state: &State, payload: &[u8]) -> Result<Frame, String> {
let reader = codec::read_capnp(payload).map_err(|e| format!("decode error: {e}"))?;
let req = reader
.get_root::<data_plane_capnp::ack_request::Reader>()
.map_err(|e| format!("schema error: {e}"))?;
let consumer_group = req
.get_consumer_group()
.map_err(|e| format!("schema error: {e}"))?
.to_str()
.map_err(|e| format!("utf8 error: {e}"))?;
let topic = req
.get_topic()
.map_err(|e| format!("schema error: {e}"))?
.to_str()
.map_err(|e| format!("utf8 error: {e}"))?;
let partition = req.get_partition();
let offset = req.get_offset();
if consumer_group.is_empty() {
return Err("consumer_group must not be empty".to_string());
}
if topic.is_empty() {
return Err("topic must not be empty".to_string());
}
state
.engine
.commit_offset(consumer_group, topic, partition, offset)
.map_err(|e| format!("commit error: {e}"))?;
metrics::record_ack(topic);
let mut builder = capnp::message::Builder::new_default();
builder.init_root::<data_plane_capnp::ack_response::Builder>();
Ok(codec::build_frame(OP_ACK_RES, &builder))
}
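
For reference, the client side builds the matching ack_request with the generated builder API. A minimal sketch, assuming the generated setters mirror the getters used above (set_consumer_group, set_topic, ...) and u32/u64 partition/offset as in the gRPC types:

```rust
// Hypothetical client-side helper (not part of this commit): build an
// ack_request message and wrap it in an OP_ACK_REQ frame.
use sq_capnp_interface::codec::{self, Frame, OP_ACK_REQ};
use sq_capnp_interface::data_plane_capnp;

fn build_ack_frame(group: &str, topic: &str, partition: u32, offset: u64) -> Frame {
    let mut builder = capnp::message::Builder::new_default();
    {
        let mut req = builder.init_root::<data_plane_capnp::ack_request::Builder>();
        req.set_consumer_group(group); // assumed setter names
        req.set_topic(topic);
        req.set_partition(partition);
        req.set_offset(offset);
    }
    codec::build_frame(OP_ACK_REQ, &builder)
}
```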


@@ -0,0 +1,54 @@
use futures::SinkExt;
use sq_capnp_interface::codec::{SqCodec, OP_ACK_REQ, OP_PUBLISH_REQ, OP_SUBSCRIBE_REQ};
use tokio::net::TcpStream;
use tokio_stream::StreamExt;
use tokio_util::codec::Framed;
use tokio_util::sync::CancellationToken;
use crate::state::State;
use super::{ack, publish, subscribe};
pub async fn handle_connection(
stream: TcpStream,
state: State,
cancel: CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
stream.set_nodelay(true)?;
let mut framed = Framed::new(stream, SqCodec::new());
loop {
let frame = tokio::select! {
result = framed.next() => {
match result {
Some(Ok(frame)) => frame,
Some(Err(e)) => return Err(e.into()),
None => return Ok(()), // connection closed
}
}
() = cancel.cancelled() => return Ok(()),
};
match frame.opcode {
OP_PUBLISH_REQ => {
let response = publish::handle(&state, &frame.payload).await;
framed.send(response).await?;
}
OP_SUBSCRIBE_REQ => {
// Subscribe takes ownership of the framed stream for writing multiple responses.
subscribe::handle(&state, &frame.payload, &mut framed, &cancel).await?;
// After subscribe ends, the connection is done.
return Ok(());
}
OP_ACK_REQ => {
let response = ack::handle(&state, &frame.payload).await;
framed.send(response).await?;
}
other => {
let response =
sq_capnp_interface::codec::error_frame(&format!("unknown opcode: 0x{other:02x}"));
framed.send(response).await?;
}
}
}
}
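
The dispatch above makes the protocol strictly frame-based: publish and ack are single request/response exchanges, while subscribe converts the connection into a one-way stream. A hypothetical client-side sketch under that assumption, reusing the same codec exports seen above (and assuming the codec's error type converts into anyhow::Error):

```rust
// Hypothetical request/response helper (not part of this commit).
use futures::SinkExt;
use sq_capnp_interface::codec::{Frame, SqCodec};
use tokio::net::TcpStream;
use tokio_stream::StreamExt;
use tokio_util::codec::Framed;

async fn request(addr: &str, frame: Frame) -> anyhow::Result<Frame> {
    let stream = TcpStream::connect(addr).await?;
    stream.set_nodelay(true)?;
    let mut framed = Framed::new(stream, SqCodec::new());
    framed.send(frame).await?;
    framed
        .next()
        .await
        .transpose()? // codec error -> Err
        .ok_or_else(|| anyhow::anyhow!("connection closed before response"))
}
```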


@@ -0,0 +1,58 @@
mod ack;
mod handler;
mod publish;
mod subscribe;
use std::net::SocketAddr;
use notmad::{Component, ComponentInfo, MadError};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use crate::state::State;
pub struct CapnpServer {
pub host: SocketAddr,
pub state: State,
}
impl Component for CapnpServer {
fn info(&self) -> ComponentInfo {
"sq-server/capnp".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
let listener = TcpListener::bind(self.host)
.await
.map_err(|e| MadError::Inner(e.into()))?;
tracing::info!(addr = %self.host, "capnp data plane listening");
loop {
tokio::select! {
result = listener.accept() => {
match result {
Ok((stream, addr)) => {
let state = self.state.clone();
let cancel = cancellation_token.clone();
tokio::spawn(async move {
if let Err(e) = handler::handle_connection(stream, state, cancel).await {
tracing::debug!(peer = %addr, error = %e, "capnp connection ended");
}
});
}
Err(e) => {
tracing::warn!(error = %e, "capnp accept error");
}
}
}
() = cancellation_token.cancelled() => {
tracing::info!("capnp server shutting down");
break;
}
}
}
Ok(())
}
}


@@ -0,0 +1,138 @@
use sq_capnp_interface::codec::{self, Frame, OP_PUBLISH_RES};
use sq_capnp_interface::data_plane_capnp;
use crate::metrics;
use crate::pipeline::PipelineMessage;
use crate::state::State;
pub async fn handle(state: &State, payload: &[u8]) -> Frame {
match handle_inner(state, payload).await {
Ok(frame) => frame,
Err(e) => codec::error_frame(&e),
}
}
/// Decode the capnp payload into owned pipeline messages. This is sync (no .await)
/// so the capnp Reader (which is !Send) does not live across an await boundary.
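/// Illustration: if the Reader were still alive at `pipeline.submit(..).await`,
/// the whole future would become !Send and could not be driven from tokio::spawn.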
fn decode_request(payload: &[u8]) -> Result<(Vec<PipelineMessage>, u8), String> {
let reader = codec::read_capnp(payload).map_err(|e| format!("decode error: {e}"))?;
let req = reader
.get_root::<data_plane_capnp::publish_request::Reader>()
.map_err(|e| format!("schema error: {e}"))?;
let messages = req
.get_messages()
.map_err(|e| format!("schema error: {e}"))?;
if messages.len() == 0 {
return Err("messages must not be empty".to_string());
}
let ack_mode = req.get_ack_mode();
let timestamp_ms = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64;
let mut pipeline_msgs = Vec::with_capacity(messages.len() as usize);
for i in 0..messages.len() {
let msg = messages.get(i);
let topic = msg
.get_topic()
.map_err(|e| format!("schema error: {e}"))?
.to_string()
.map_err(|e| format!("utf8 error: {e}"))?;
if topic.is_empty() {
return Err("topic must not be empty".to_string());
}
let key = msg
.get_key()
.map_err(|e| format!("schema error: {e}"))?
.to_vec();
let value = msg
.get_value()
.map_err(|e| format!("schema error: {e}"))?
.to_vec();
let headers_reader = msg
.get_headers()
.map_err(|e| format!("schema error: {e}"))?;
let mut headers = Vec::with_capacity(headers_reader.len() as usize);
for j in 0..headers_reader.len() {
let h = headers_reader.get(j);
let hkey = h
.get_key()
.map_err(|e| format!("schema error: {e}"))?
.to_string()
.map_err(|e| format!("utf8 error: {e}"))?;
let hval = h
.get_value()
.map_err(|e| format!("schema error: {e}"))?
.to_vec();
headers.push(sq_models::Header {
key: hkey,
value: hval,
});
}
pipeline_msgs.push(PipelineMessage {
topic,
partition: 0,
key,
value,
headers,
timestamp_ms,
});
}
Ok((pipeline_msgs, ack_mode))
}
async fn handle_inner(state: &State, payload: &[u8]) -> Result<Frame, String> {
let (pipeline_msgs, ack_mode) = decode_request(payload)?;
let start = std::time::Instant::now();
let first_topic = pipeline_msgs
.first()
.map(|m| m.topic.clone())
.unwrap_or_default();
// ACK mode 3 = None (fire and forget)
if ack_mode == 3 {
let result_count = pipeline_msgs.len();
state
.pipeline
.submit_fire_and_forget(pipeline_msgs)
.await;
metrics::record_messages_published(&first_topic, result_count as u64);
metrics::record_publish_duration(&first_topic, start);
return Ok(build_publish_response(&[]));
}
// Standard ack mode — submit and wait.
let results = state
.pipeline
.submit(pipeline_msgs)
.await
.map_err(|e| format!("pipeline error: {e}"))?;
metrics::record_messages_published(&first_topic, results.len() as u64);
metrics::record_publish_duration(&first_topic, start);
Ok(build_publish_response(&results))
}
fn build_publish_response(results: &[crate::pipeline::PipelineResult]) -> Frame {
let mut builder = capnp::message::Builder::new_default();
{
let resp = builder.init_root::<data_plane_capnp::publish_response::Builder>();
let mut res_list = resp.init_results(results.len() as u32);
for (i, r) in results.iter().enumerate() {
let mut entry = res_list.reborrow().get(i as u32);
entry.set_topic(&r.topic[..]);
entry.set_partition(r.partition);
entry.set_offset(r.offset);
}
}
codec::build_frame(OP_PUBLISH_RES, &builder)
}


@@ -0,0 +1,113 @@
use bytes::Bytes;
use futures::SinkExt;
use sq_capnp_interface::codec::{self, Frame, SqCodec, OP_SUBSCRIBE_END, OP_SUBSCRIBE_RES};
use sq_capnp_interface::data_plane_capnp;
use tokio::net::TcpStream;
use tokio_util::codec::Framed;
use tokio_util::sync::CancellationToken;
use crate::metrics;
use crate::state::State;
pub async fn handle(
state: &State,
payload: &[u8],
framed: &mut Framed<TcpStream, SqCodec>,
cancel: &CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let reader = codec::read_capnp(payload)?;
let req = reader.get_root::<data_plane_capnp::subscribe_request::Reader>()?;
let topic = req.get_topic()?.to_string()?;
if topic.is_empty() {
let err = codec::error_frame("topic must not be empty");
framed.send(err).await?;
return Ok(());
}
let partition = req.get_partition();
let consumer_group = req.get_consumer_group()?.to_string()?;
let batch_size = if req.get_max_batch_size() == 0 {
100
} else {
req.get_max_batch_size() as usize
};
let start_offset = if req.get_has_start_offset() {
req.get_start_offset()
} else if !consumer_group.is_empty() {
state
.engine
.get_committed_offset(&consumer_group, &topic, partition)
.unwrap_or(0)
} else {
0
};
let mut current_offset = start_offset;
loop {
if cancel.is_cancelled() {
break;
}
let messages = state
.engine
.read(&topic, partition, current_offset, batch_size)
.map_err(|e| format!("read error: {e}"))?;
if messages.is_empty() {
tokio::select! {
() = tokio::time::sleep(tokio::time::Duration::from_millis(100)) => continue,
() = cancel.cancelled() => break,
}
}
let frame = build_subscribe_response(&messages, &mut current_offset);
metrics::record_messages_consumed(&topic, messages.len() as u64);
metrics::record_subscribe_batch(&topic);
if framed.send(frame).await.is_err() {
// Client disconnected.
return Ok(());
}
}
// Send end-of-stream sentinel.
let end = Frame {
opcode: OP_SUBSCRIBE_END,
payload: Bytes::new(),
};
let _ = framed.send(end).await;
Ok(())
}
fn build_subscribe_response(
messages: &[sq_models::Message],
current_offset: &mut u64,
) -> Frame {
let mut builder = capnp::message::Builder::new_default();
{
let resp = builder.init_root::<data_plane_capnp::subscribe_response::Builder>();
let mut msg_list = resp.init_messages(messages.len() as u32);
for (i, m) in messages.iter().enumerate() {
*current_offset = m.offset + 1;
let mut entry = msg_list.reborrow().get(i as u32);
entry.set_offset(m.offset);
entry.set_topic(m.topic.as_str());
entry.set_partition(m.partition);
entry.set_key(m.key.as_deref().unwrap_or(&[]));
entry.set_value(&m.value);
entry.set_timestamp_ms(m.timestamp_ms);
let mut headers = entry.init_headers(m.headers.len() as u32);
for (j, h) in m.headers.iter().enumerate() {
let mut hdr = headers.reborrow().get(j as u32);
hdr.set_key(&h.key[..]);
hdr.set_value(&h.value);
}
}
}
codec::build_frame(OP_SUBSCRIBE_RES, &builder)
}
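
On the wire, a subscriber simply reads OP_SUBSCRIBE_RES frames until the OP_SUBSCRIBE_END sentinel arrives. A hypothetical consumer loop (sketch, reusing a Framed connection as in the handler above):

```rust
// Hypothetical consumer-side loop (not part of this commit).
while let Some(frame) = framed.next().await.transpose()? {
    match frame.opcode {
        OP_SUBSCRIBE_RES => {
            // Decode the subscribe_response payload and hand messages to the app.
        }
        OP_SUBSCRIBE_END => break, // server finished the stream
        other => anyhow::bail!("unexpected opcode: 0x{other:02x}"),
    }
}
```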

crates/sq-server/src/cli.rs (new file, 129 lines)

@@ -0,0 +1,129 @@
use std::path::PathBuf;
use std::sync::Arc;
use clap::{Parser, Subcommand};
use sq_models::SyncPolicy;
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::{S3Config, S3ObjectStore};
use crate::pipeline::WritePipeline;
use crate::state::{Config, State};
mod serve;
use serve::*;
#[derive(Parser)]
#[command(author, version, about = "SQ - Stored Queue Server", long_about = None, subcommand_required = true)]
struct Command {
#[command(subcommand)]
command: Commands,
#[arg(long, env = "SQ_NODE_ID", default_value = "node-1")]
node_id: String,
#[arg(long, env = "SQ_DATA_DIR", default_value = "./data")]
data_dir: PathBuf,
#[arg(long, env = "SQ_SEEDS", value_delimiter = ',')]
seeds: Vec<String>,
#[arg(long, env = "SQ_CLUSTER_ID", default_value = "default")]
cluster_id: String,
#[arg(long, env = "SQ_S3_BUCKET")]
s3_bucket: Option<String>,
#[arg(long, env = "SQ_S3_ENDPOINT")]
s3_endpoint: Option<String>,
#[arg(long, env = "SQ_S3_REGION")]
s3_region: Option<String>,
/// Fsync policy: "every-batch" (default), "none", or interval in ms (e.g. "200").
#[arg(long, env = "SQ_SYNC_POLICY", default_value = "every-batch")]
sync_policy: String,
}
#[derive(Subcommand)]
enum Commands {
Serve(ServeCommand),
}
impl Commands {
fn grpc_address(&self) -> String {
match self {
Commands::Serve(cmd) => cmd.grpc_host.to_string(),
}
}
async fn execute(&self, state: &State, pipeline: WritePipeline) -> anyhow::Result<()> {
match self {
Commands::Serve(cmd) => cmd.execute(state, pipeline).await,
}
}
}
pub async fn execute() -> anyhow::Result<()> {
let cli = Command::parse();
tracing::debug!("starting sq-server");
let sync_policy = match cli.sync_policy.as_str() {
"every-batch" => SyncPolicy::EveryBatch,
"none" => SyncPolicy::None,
ms => {
let millis: u64 = ms
.parse()
.map_err(|_| anyhow::anyhow!("invalid sync_policy: expected 'every-batch', 'none', or interval in ms, got '{ms}'"))?;
SyncPolicy::Interval(std::time::Duration::from_millis(millis))
}
};
let config = Config {
node_id: cli.node_id,
data_dir: cli.data_dir,
seeds: cli.seeds,
grpc_address: cli.command.grpc_address(),
cluster_id: cli.cluster_id,
s3_bucket: cli.s3_bucket,
s3_endpoint: cli.s3_endpoint,
s3_region: cli.s3_region,
sync_policy,
};
let (mut state, pipeline) = State::new(config)?;
// Set up S3 reader if S3 is configured.
if let Some(bucket) = &state.config.s3_bucket {
let s3_config = S3Config {
bucket: bucket.clone(),
region: state
.config
.s3_region
.clone()
.unwrap_or_else(|| "us-east-1".to_string()),
endpoint: state.config.s3_endpoint.clone(),
access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
allow_http: state.config.s3_endpoint.is_some(),
};
match S3ObjectStore::new(s3_config) {
Ok(store) => {
let cache_dir = state.config.data_dir.join(".s3-cache");
let reader = ObjectStoreReader::new(
Arc::new(RealFileSystem),
Arc::new(store),
cache_dir,
);
state.s3_reader = Some(Arc::new(reader));
}
Err(e) => {
tracing::warn!(error = %e, "failed to initialize S3 reader");
}
}
}
cli.command.execute(&state, pipeline).await?;
Ok(())
}
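
For quick reference, the sync-policy flag maps to SyncPolicy as follows (mirroring the match in execute() above):

```rust
// --sync-policy every-batch -> SyncPolicy::EveryBatch (default)
// --sync-policy none        -> SyncPolicy::None
// --sync-policy 200         -> SyncPolicy::Interval(Duration::from_millis(200))
```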


@@ -0,0 +1,173 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use notmad::{Component, ComponentInfo, MadError};
use sq_cluster::membership::{Membership, MembershipConfig};
use sq_storage::object_store::s3::{S3Config, S3ObjectStore};
use tokio_util::sync::CancellationToken;
use sq_models::SyncPolicy;
use crate::pipeline::WritePipeline;
use crate::shipper::BackgroundShipper;
use crate::sync_task::BackgroundSync;
use crate::{capnp::CapnpServer, grpc, servehttp::ServeHttp, state::State};
/// Wraps the WritePipeline as a notmad Component.
struct PipelineComponent {
pipeline: std::sync::Mutex<Option<WritePipeline>>,
}
impl Component for PipelineComponent {
fn info(&self) -> ComponentInfo {
"sq-server/pipeline".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
let mut pipeline = self
.pipeline
.lock()
.unwrap()
.take()
.expect("pipeline already taken");
tokio::select! {
() = pipeline.run() => {}
() = cancellation_token.cancelled() => {}
}
Ok(())
}
}
#[derive(clap::Parser)]
pub struct ServeCommand {
#[arg(long, env = "SQ_GRPC_HOST", default_value = "127.0.0.1:6060")]
pub(crate) grpc_host: SocketAddr,
#[arg(long, env = "SQ_HTTP_HOST", default_value = "127.0.0.1:6062")]
http_host: SocketAddr,
#[arg(long, env = "SQ_CAPNP_HOST", default_value = "127.0.0.1:6064")]
capnp_host: SocketAddr,
}
impl ServeCommand {
pub async fn execute(&self, state: &State, pipeline: WritePipeline) -> anyhow::Result<()> {
tracing::info!(
node_id = %state.config.node_id,
grpc = %self.grpc_host,
http = %self.http_host,
capnp = %self.capnp_host,
seeds = ?state.config.seeds,
"starting sq-server"
);
let membership = Arc::new(Membership::new(MembershipConfig {
node_id: state.config.node_id.clone(),
address: state.config.grpc_address.clone(),
seeds: state.config.seeds.clone(),
..Default::default()
}));
// Optionally set up S3 background shipper.
let shipper = if let Some(bucket) = &state.config.s3_bucket {
let s3_config = S3Config {
bucket: bucket.clone(),
region: state
.config
.s3_region
.clone()
.unwrap_or_else(|| "us-east-1".to_string()),
endpoint: state.config.s3_endpoint.clone(),
access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
allow_http: state.config.s3_endpoint.is_some(),
};
match S3ObjectStore::new(s3_config) {
Ok(store) => {
tracing::info!(
bucket = %bucket,
cluster_id = %state.config.cluster_id,
"S3 background shipper enabled"
);
let store = Arc::new(store);
Some(BackgroundShipper::new(
state.clone(),
store,
state.config.cluster_id.clone(),
Duration::from_secs(30),
))
}
Err(e) => {
tracing::warn!(error = %e, "failed to initialize S3 object store, shipper disabled");
None
}
}
} else {
None
};
// Optionally set up background sync task for Interval sync policy.
let background_sync = if let SyncPolicy::Interval(interval) = &state.config.sync_policy {
tracing::info!(?interval, "background sync enabled");
Some(BackgroundSync::new(state.engine.clone(), *interval))
} else {
None
};
// Build the component set. Each optional-component combination gets its own
// match arm because Mad::builder() returns a temporary that cannot be stored
// and extended conditionally.
match (shipper, background_sync) {
(Some(shipper), Some(sync)) => {
notmad::Mad::builder()
.add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
.add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
.add(CapnpServer { host: self.capnp_host, state: state.clone() })
.add(ServeHttp { host: self.http_host })
.add(state.drop_queue.clone())
.add(shipper)
.add(sync)
.run()
.await?;
}
(Some(shipper), None) => {
notmad::Mad::builder()
.add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
.add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
.add(CapnpServer { host: self.capnp_host, state: state.clone() })
.add(ServeHttp { host: self.http_host })
.add(state.drop_queue.clone())
.add(shipper)
.run()
.await?;
}
(None, Some(sync)) => {
notmad::Mad::builder()
.add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
.add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
.add(CapnpServer { host: self.capnp_host, state: state.clone() })
.add(ServeHttp { host: self.http_host })
.add(state.drop_queue.clone())
.add(sync)
.run()
.await?;
}
(None, None) => {
notmad::Mad::builder()
.add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
.add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
.add(CapnpServer { host: self.capnp_host, state: state.clone() })
.add(ServeHttp { host: self.http_host })
.add(state.drop_queue.clone())
.run()
.await?;
}
}
Ok(())
}
}


@@ -0,0 +1,170 @@
use std::pin::Pin;
use std::sync::Arc;
use sq_cluster::membership::Membership;
use sq_grpc_interface::{
cluster_service_server::ClusterService, ClusterNodeInfo, FetchSegmentRequest,
FetchSegmentResponse, HeartbeatRequest, HeartbeatResponse, JoinRequest, JoinResponse,
ReplicateEntriesRequest, ReplicateEntriesResponse,
};
use tokio_stream::Stream;
use tonic::Status;
use crate::metrics;
use crate::state::State;
pub struct ClusterServer {
pub state: State,
pub membership: Arc<Membership>,
}
#[tonic::async_trait]
impl ClusterService for ClusterServer {
#[tracing::instrument(skip_all, fields(rpc.method = "ReplicateEntries", sq.topic, sq.partition, sq.entry_count))]
async fn replicate_entries(
&self,
request: tonic::Request<ReplicateEntriesRequest>,
) -> Result<tonic::Response<ReplicateEntriesResponse>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.topic", &req.topic);
span.record("sq.partition", req.partition);
span.record("sq.entry_count", req.entries.len());
let mut last_offset = 0u64;
for entry_bytes in &req.entries {
let offset = self
.state
.engine
.append(&req.topic, req.partition, None, entry_bytes, &[], 0)
.map_err(|e| Status::internal(e.to_string()))?;
last_offset = offset;
}
metrics::record_replicate_entries(req.entries.len() as u64);
Ok(tonic::Response::new(ReplicateEntriesResponse {
last_replicated_offset: last_offset,
}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "Join", sq.joining_node_id, sq.joining_address))]
async fn join(
&self,
request: tonic::Request<JoinRequest>,
) -> Result<tonic::Response<JoinResponse>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.joining_node_id", &req.node_id);
span.record("sq.joining_address", &req.address);
// Record the joining node.
self.membership
.record_heartbeat(&req.node_id, &req.address)
.await;
tracing::info!(
node_id = %req.node_id,
address = %req.address,
"node joined cluster"
);
// Return current membership list.
let members = self.membership.all_members().await;
let member_infos: Vec<ClusterNodeInfo> = members
.into_iter()
.map(|m| ClusterNodeInfo {
node_id: m.node_id,
address: m.address,
status: m.status.to_string(),
})
.collect();
Ok(tonic::Response::new(JoinResponse {
members: member_infos,
}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "Heartbeat", sq.from_node))]
async fn heartbeat(
&self,
request: tonic::Request<HeartbeatRequest>,
) -> Result<tonic::Response<HeartbeatResponse>, Status> {
let req = request.into_inner();
tracing::Span::current().record("sq.from_node", &req.node_id);
// Record heartbeat from the sender.
let sender_address = req
.known_members
.iter()
.find(|m| m.node_id == req.node_id)
.map(|m| m.address.clone())
.unwrap_or_default();
self.membership
.record_heartbeat(&req.node_id, &sender_address)
.await;
// Merge any members we don't know about.
let discovered: Vec<(String, String)> = req
.known_members
.iter()
.map(|m| (m.node_id.clone(), m.address.clone()))
.collect();
self.membership.merge_members(discovered).await;
// Return our view of the membership.
let members = self.membership.all_members().await;
let member_infos: Vec<ClusterNodeInfo> = members
.into_iter()
.map(|m| ClusterNodeInfo {
node_id: m.node_id,
address: m.address,
status: m.status.to_string(),
})
.collect();
Ok(tonic::Response::new(HeartbeatResponse {
members: member_infos,
}))
}
type FetchSegmentStream =
Pin<Box<dyn Stream<Item = Result<FetchSegmentResponse, Status>> + Send + 'static>>;
#[tracing::instrument(skip_all, fields(rpc.method = "FetchSegment", sq.topic, sq.partition, sq.from_offset))]
async fn fetch_segment(
&self,
request: tonic::Request<FetchSegmentRequest>,
) -> Result<tonic::Response<Self::FetchSegmentStream>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.topic", &req.topic);
span.record("sq.partition", req.partition);
span.record("sq.from_offset", req.from_offset);
// Read messages from the requested offset. No lock needed.
let messages = self
.state
.engine
.read(&req.topic, req.partition, req.from_offset, 10_000)
.map_err(|e| Status::internal(e.to_string()))?;
// Stream raw message data back in chunks.
let stream = async_stream::try_stream! {
const CHUNK_SIZE: usize = 100;
for batch in messages.chunks(CHUNK_SIZE) {
let mut chunk_data = Vec::new();
for msg in batch {
// Simple wire format: offset(8) + value_len(4) + value
chunk_data.extend_from_slice(&msg.offset.to_le_bytes());
chunk_data.extend_from_slice(&(msg.value.len() as u32).to_le_bytes());
chunk_data.extend_from_slice(&msg.value);
}
yield FetchSegmentResponse { chunk: chunk_data };
}
};
Ok(tonic::Response::new(Box::pin(stream) as Self::FetchSegmentStream))
}
}
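
The chunk layout in fetch_segment is self-delimiting, so a receiver can decode it without extra framing. A hypothetical decoder for that format (offset: 8-byte LE, value_len: 4-byte LE, then value bytes; not part of this commit):

```rust
fn decode_chunk(mut chunk: &[u8]) -> anyhow::Result<Vec<(u64, Vec<u8>)>> {
    let mut entries = Vec::new();
    while !chunk.is_empty() {
        anyhow::ensure!(chunk.len() >= 12, "truncated entry header");
        let offset = u64::from_le_bytes(chunk[..8].try_into()?);
        let len = u32::from_le_bytes(chunk[8..12].try_into()?) as usize;
        anyhow::ensure!(chunk.len() - 12 >= len, "truncated value");
        entries.push((offset, chunk[12..12 + len].to_vec()));
        chunk = &chunk[12 + len..];
    }
    Ok(entries)
}
```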


@@ -0,0 +1,146 @@
use sq_grpc_interface::{
control_plane_service_server::ControlPlaneService, CreateConsumerGroupRequest,
CreateConsumerGroupResponse, CreateTopicRequest, CreateTopicResponse, DeleteTopicRequest,
DeleteTopicResponse, DescribeTopicRequest, DescribeTopicResponse, ListTopicsRequest,
ListTopicsResponse, PartitionInfo, TopicInfo,
};
use tonic::Status;
use crate::grpc::error;
use crate::metrics;
use crate::state::State;
pub struct ControlPlaneServer {
pub state: State,
}
#[tonic::async_trait]
impl ControlPlaneService for ControlPlaneServer {
#[tracing::instrument(skip_all, fields(rpc.method = "CreateTopic", sq.topic))]
async fn create_topic(
&self,
request: tonic::Request<CreateTopicRequest>,
) -> Result<tonic::Response<CreateTopicResponse>, Status> {
let req = request.into_inner();
tracing::Span::current().record("sq.topic", &req.name);
if req.name.is_empty() {
return Err(Status::invalid_argument("topic name must not be empty"));
}
let partitions = if req.partitions == 0 { 1 } else { req.partitions };
let replication_factor = if req.replication_factor == 0 {
3
} else {
req.replication_factor
};
let config = sq_models::TopicConfig::new(req.name.as_str())
.with_partitions(partitions)
.with_replication_factor(replication_factor);
self.state.engine.create_topic(config).map_err(|e| {
if e.to_string().contains("already exists") {
Status::already_exists(e.to_string())
} else {
error::internal(e)
}
})?;
metrics::record_topic_created();
Ok(tonic::Response::new(CreateTopicResponse {
name: req.name,
}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "DeleteTopic", sq.topic))]
async fn delete_topic(
&self,
request: tonic::Request<DeleteTopicRequest>,
) -> Result<tonic::Response<DeleteTopicResponse>, Status> {
let req = request.into_inner();
tracing::Span::current().record("sq.topic", &req.name);
if req.name.is_empty() {
return Err(Status::invalid_argument("topic name must not be empty"));
}
self.state.engine.delete_topic(&req.name).map_err(|e| {
if e.to_string().contains("not found") {
Status::not_found(e.to_string())
} else {
error::internal(e)
}
})?;
Ok(tonic::Response::new(DeleteTopicResponse {}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "ListTopics"))]
async fn list_topics(
&self,
_request: tonic::Request<ListTopicsRequest>,
) -> Result<tonic::Response<ListTopicsResponse>, Status> {
let topics = self.state.engine.list_topics();
let topic_infos: Vec<TopicInfo> = topics
.into_iter()
.map(|t| TopicInfo {
name: t.name.to_string(),
partitions: t.partitions,
replication_factor: t.replication_factor,
})
.collect();
Ok(tonic::Response::new(ListTopicsResponse {
topics: topic_infos,
}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "DescribeTopic", sq.topic))]
async fn describe_topic(
&self,
request: tonic::Request<DescribeTopicRequest>,
) -> Result<tonic::Response<DescribeTopicResponse>, Status> {
let req = request.into_inner();
tracing::Span::current().record("sq.topic", &req.name);
let topic_config = self
.state
.engine
.get_topic(&req.name)
.ok_or_else(|| Status::not_found(format!("topic '{}' not found", req.name)))?;
let topic_info = TopicInfo {
name: topic_config.name.to_string(),
partitions: topic_config.partitions,
replication_factor: topic_config.replication_factor,
};
// Build partition info with offset ranges.
let mut partition_info = Vec::new();
for p in 0..topic_config.partitions {
let latest = self.state.engine.latest_offset(&req.name, p);
partition_info.push(PartitionInfo {
partition: p,
earliest_offset: 0,
latest_offset: latest,
});
}
Ok(tonic::Response::new(DescribeTopicResponse {
topic: Some(topic_info),
partition_info,
}))
}
#[tracing::instrument(skip_all, fields(rpc.method = "CreateConsumerGroup"))]
async fn create_consumer_group(
&self,
_request: tonic::Request<CreateConsumerGroupRequest>,
) -> Result<tonic::Response<CreateConsumerGroupResponse>, Status> {
// Consumer groups are implicit in our design - they exist as soon as someone uses them.
Ok(tonic::Response::new(CreateConsumerGroupResponse {}))
}
}


@@ -0,0 +1,334 @@
use std::pin::Pin;
use std::sync::Arc;
use sq_grpc_interface::{
data_plane_service_server::DataPlaneService, AckRequest, AckResponse, AckMode,
ConsumedMessage, MessageHeader, PublishRequest, PublishResponse, PublishResult,
SubscribeRequest, SubscribeResponse,
};
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::layout;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::S3ObjectStore;
use tokio_stream::Stream;
use tonic::Status;
use crate::grpc::error;
use crate::metrics;
use crate::pipeline::PipelineMessage;
use crate::state::State;
pub struct DataPlaneServer {
pub state: State,
}
fn to_pipeline_messages(
messages: Vec<sq_grpc_interface::PublishMessage>,
) -> Vec<PipelineMessage> {
let timestamp_ms = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis() as u64;
messages
.into_iter()
.map(|msg| PipelineMessage {
topic: msg.topic,
partition: 0,
key: msg.key,
value: msg.value,
headers: msg
.headers
.into_iter()
.map(|h| sq_models::Header {
key: h.key,
value: h.value,
})
.collect(),
timestamp_ms,
})
.collect()
}
#[tonic::async_trait]
impl DataPlaneService for DataPlaneServer {
#[tracing::instrument(
skip_all,
fields(
rpc.method = "Publish",
sq.message_count,
sq.ack_mode,
)
)]
async fn publish(
&self,
request: tonic::Request<PublishRequest>,
) -> Result<tonic::Response<PublishResponse>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.message_count", req.messages.len());
if req.messages.is_empty() {
return Err(Status::invalid_argument("messages must not be empty"));
}
let start = std::time::Instant::now();
let ack_mode = req
.settings
.as_ref()
.map(|s| AckMode::try_from(s.ack_mode).unwrap_or(AckMode::All))
.unwrap_or(AckMode::All);
// For ACK_MODE_NONE, fire-and-forget via the pipeline.
if ack_mode == AckMode::None {
let results: Vec<PublishResult> = req
.messages
.iter()
.map(|msg| PublishResult {
topic: msg.topic.clone(),
partition: 0,
offset: 0,
})
.collect();
let pipeline_msgs = to_pipeline_messages(req.messages);
self.state.pipeline.submit_fire_and_forget(pipeline_msgs).await;
let first_topic = results.first().map(|r| r.topic.as_str()).unwrap_or("");
metrics::record_messages_published(first_topic, results.len() as u64);
metrics::record_publish_duration(first_topic, start);
return Ok(tonic::Response::new(PublishResponse { results }));
}
// Validate topics before submitting.
for msg in &req.messages {
if msg.topic.is_empty() {
return Err(Status::invalid_argument("topic must not be empty"));
}
}
// Standard (ACK_MODE_ALL / ACK_MODE_LOCAL) - submit to pipeline and wait for ack.
let pipeline_msgs = to_pipeline_messages(req.messages);
let pipeline_results = self
.state
.pipeline
.submit(pipeline_msgs)
.await
.map_err(|e| error::internal(anyhow::anyhow!(e)))?;
let results: Vec<PublishResult> = pipeline_results
.into_iter()
.map(|r| PublishResult {
topic: r.topic,
partition: r.partition,
offset: r.offset,
})
.collect();
let first_topic = results.first().map(|r| r.topic.as_str()).unwrap_or("");
metrics::record_messages_published(first_topic, results.len() as u64);
metrics::record_publish_duration(first_topic, start);
Ok(tonic::Response::new(PublishResponse { results }))
}
type SubscribeStream =
Pin<Box<dyn Stream<Item = Result<SubscribeResponse, Status>> + Send + 'static>>;
#[tracing::instrument(
skip_all,
fields(
rpc.method = "Subscribe",
sq.topic,
sq.partition,
sq.consumer_group,
)
)]
async fn subscribe(
&self,
request: tonic::Request<SubscribeRequest>,
) -> Result<tonic::Response<Self::SubscribeStream>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.topic", &req.topic);
span.record("sq.partition", req.partition);
span.record("sq.consumer_group", &req.consumer_group);
if req.topic.is_empty() {
return Err(Status::invalid_argument("topic must not be empty"));
}
let batch_size = if req.max_batch_size == 0 {
100
} else {
req.max_batch_size as usize
};
// If no explicit start_offset, try using the committed offset for the consumer group.
let start_offset = match req.start_offset {
Some(offset) => offset,
None => {
if !req.consumer_group.is_empty() {
self.state
.engine
.get_committed_offset(&req.consumer_group, &req.topic, req.partition)
.unwrap_or(0)
} else {
0
}
}
};
let topic = req.topic.clone();
let partition = req.partition;
let state = self.state.clone();
let stream = async_stream::try_stream! {
let mut current_offset = start_offset;
loop {
let messages = state.engine
.read(&topic, partition, current_offset, batch_size)
.map_err(error::internal)?;
// If local WAL is empty and S3 reader is available, try S3 fallback.
let messages = if messages.is_empty() {
if let Some(ref s3_reader) = state.s3_reader {
read_from_s3(
s3_reader,
&state.config.cluster_id,
&topic,
partition,
current_offset,
batch_size,
)
.await
.unwrap_or_default()
} else {
messages
}
} else {
messages
};
if messages.is_empty() {
// Poll interval when caught up.
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
continue;
}
let consumed: Vec<ConsumedMessage> = messages
.iter()
.map(|m| {
current_offset = m.offset + 1;
ConsumedMessage {
offset: m.offset,
topic: m.topic.to_string(),
partition: m.partition,
key: m.key.clone().unwrap_or_default(),
value: m.value.clone(),
headers: m
.headers
.iter()
.map(|h| MessageHeader {
key: h.key.clone(),
value: h.value.clone(),
})
.collect(),
timestamp_ms: m.timestamp_ms,
}
})
.collect();
metrics::record_messages_consumed(&topic, consumed.len() as u64);
metrics::record_subscribe_batch(&topic);
yield SubscribeResponse { messages: consumed };
}
};
Ok(tonic::Response::new(Box::pin(stream)))
}
#[tracing::instrument(
skip_all,
fields(
rpc.method = "Ack",
sq.topic,
sq.partition,
sq.consumer_group,
sq.offset,
)
)]
async fn ack(
&self,
request: tonic::Request<AckRequest>,
) -> Result<tonic::Response<AckResponse>, Status> {
let req = request.into_inner();
let span = tracing::Span::current();
span.record("sq.topic", &req.topic);
span.record("sq.partition", req.partition);
span.record("sq.consumer_group", &req.consumer_group);
span.record("sq.offset", req.offset);
if req.consumer_group.is_empty() {
return Err(Status::invalid_argument("consumer_group must not be empty"));
}
if req.topic.is_empty() {
return Err(Status::invalid_argument("topic must not be empty"));
}
self.state
.engine
.commit_offset(&req.consumer_group, &req.topic, req.partition, req.offset)
.map_err(error::internal)?;
metrics::record_ack(&req.topic);
Ok(tonic::Response::new(AckResponse {}))
}
}
/// Try to read messages from S3 when local WAL is empty (segments have been shipped and trimmed).
async fn read_from_s3(
reader: &Arc<ObjectStoreReader<RealFileSystem, S3ObjectStore>>,
cluster_id: &str,
topic: &str,
partition: u32,
from_offset: u64,
limit: usize,
) -> anyhow::Result<Vec<sq_models::Message>> {
// List all segment keys for this topic-partition.
let prefix = layout::topic_partition_prefix(cluster_id, topic, partition);
// Scan the keys in order and read only segments that may contain the
// requested offset; segment keys sort lexicographically, so the scan
// visits them in ascending offset order.
let keys = reader.list_segment_keys(&prefix).await?;
let mut messages = Vec::new();
for key in &keys {
if messages.len() >= limit {
break;
}
// Parse the segment key to check offset range.
if let Some((_, _, _, _base_offset, end_offset)) = layout::parse_segment_key(key) {
// Skip segments that are entirely before our requested offset.
if end_offset < from_offset {
continue;
}
let segment_msgs = reader.read_segment(key, from_offset).await?;
for msg in segment_msgs {
if messages.len() >= limit {
break;
}
messages.push(msg);
}
}
}
Ok(messages)
}


@@ -0,0 +1,13 @@
use tonic::Status;
pub fn internal(err: impl std::fmt::Display) -> Status {
Status::internal(err.to_string())
}
pub fn not_found(msg: impl Into<String>) -> Status {
Status::not_found(msg)
}
pub fn invalid_argument(msg: impl Into<String>) -> Status {
Status::invalid_argument(msg)
}


@@ -0,0 +1,23 @@
use sq_grpc_interface::{
status_service_server::StatusService, GetStatusRequest, GetStatusResponse,
};
use crate::state::State;
pub struct HealthServer {
pub state: State,
}
#[tonic::async_trait]
impl StatusService for HealthServer {
#[tracing::instrument(skip_all, fields(rpc.method = "Status"))]
async fn status(
&self,
_request: tonic::Request<GetStatusRequest>,
) -> Result<tonic::Response<GetStatusResponse>, tonic::Status> {
Ok(tonic::Response::new(GetStatusResponse {
node_id: self.state.config.node_id.clone(),
cluster: None,
}))
}
}


@@ -0,0 +1,79 @@
use std::net::SocketAddr;
use std::sync::Arc;
use notmad::MadError;
use sq_cluster::membership::Membership;
use sq_grpc_interface::{
cluster_service_server::ClusterServiceServer,
control_plane_service_server::ControlPlaneServiceServer,
data_plane_service_server::DataPlaneServiceServer,
status_service_server::StatusServiceServer,
};
use tokio_util::sync::CancellationToken;
use crate::state::State;
pub mod cluster;
pub mod control_plane;
pub mod data_plane;
pub mod error;
pub mod health;
pub struct GrpcServer {
pub host: SocketAddr,
pub state: State,
pub membership: Arc<Membership>,
}
impl GrpcServer {
pub async fn serve(&self, cancellation_token: CancellationToken) -> anyhow::Result<()> {
tracing::info!("serving grpc on {}", self.host);
tonic::transport::Server::builder()
.trace_fn(|request| {
tracing::info_span!(
"grpc",
otel.kind = "server",
rpc.system = "grpc",
rpc.service = tracing::field::Empty,
rpc.method = %request.uri().path(),
)
})
.add_service(StatusServiceServer::new(health::HealthServer {
state: self.state.clone(),
}))
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
state: self.state.clone(),
}))
.add_service(ControlPlaneServiceServer::new(
control_plane::ControlPlaneServer {
state: self.state.clone(),
},
))
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
state: self.state.clone(),
membership: self.membership.clone(),
}))
.serve_with_shutdown(
self.host,
async move { cancellation_token.cancelled().await },
)
.await?;
Ok(())
}
}
impl notmad::Component for GrpcServer {
fn info(&self) -> notmad::ComponentInfo {
"sq-server/grpc".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
self.serve(cancellation_token)
.await
.map_err(MadError::Inner)?;
Ok(())
}
}


@@ -0,0 +1,10 @@
pub mod capnp;
pub mod cli;
pub mod grpc;
pub mod metrics;
pub mod otel;
pub mod pipeline;
pub mod servehttp;
pub mod shipper;
pub mod state;
pub mod sync_task;


@@ -1,3 +1,27 @@
-fn main() {
-    println!("sq-server");
-}
use sq_server::cli;
use sq_server::otel::{LogFormat, OtelConfig};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
dotenvy::dotenv().ok();
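// Pick the log output format from the LOG_LEVEL env var: "json", "short",
// or anything else (including unset) falls back to pretty output.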
let log_format = match std::env::var("LOG_LEVEL")
.as_ref()
.map(|r| r.as_str())
{
Ok("json") => LogFormat::Json,
Ok("short") => LogFormat::Short,
_ => LogFormat::Pretty,
};
let _otel_guard = sq_server::otel::init(OtelConfig {
service_name: "sq-server".to_string(),
node_id: std::env::var("SQ_NODE_ID").unwrap_or_else(|_| "node-1".to_string()),
otlp_endpoint: std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").ok(),
log_format,
})?;
cli::execute().await?;
Ok(())
}


@@ -0,0 +1,85 @@
use opentelemetry::metrics::{Counter, Histogram, Meter};
use opentelemetry::KeyValue;
use std::sync::LazyLock;
use std::time::Instant;
static METER: LazyLock<Meter> = LazyLock::new(|| opentelemetry::global::meter("sq-server"));
static MESSAGES_PUBLISHED: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.messages.published")
.with_description("Total messages published")
.build()
});
static MESSAGES_CONSUMED: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.messages.consumed")
.with_description("Total messages consumed via subscribe")
.build()
});
static PUBLISH_DURATION: LazyLock<Histogram<f64>> = LazyLock::new(|| {
METER
.f64_histogram("sq.publish.duration_ms")
.with_description("Publish RPC duration in milliseconds")
.with_unit("ms")
.build()
});
static SUBSCRIBE_BATCHES: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.subscribe.batches")
.with_description("Total subscribe batches sent")
.build()
});
static ACK_TOTAL: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.ack.total")
.with_description("Total ack (offset commit) operations")
.build()
});
static TOPICS_CREATED: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.topics.created")
.with_description("Total topics created")
.build()
});
static REPLICATE_ENTRIES: LazyLock<Counter<u64>> = LazyLock::new(|| {
METER
.u64_counter("sq.replicate.entries")
.with_description("Total entries replicated from other nodes")
.build()
});
pub fn record_messages_published(topic: &str, count: u64) {
MESSAGES_PUBLISHED.add(count, &[KeyValue::new("sq.topic", topic.to_string())]);
}
pub fn record_messages_consumed(topic: &str, count: u64) {
MESSAGES_CONSUMED.add(count, &[KeyValue::new("sq.topic", topic.to_string())]);
}
pub fn record_publish_duration(topic: &str, start: Instant) {
let duration_ms = start.elapsed().as_secs_f64() * 1000.0;
PUBLISH_DURATION.record(duration_ms, &[KeyValue::new("sq.topic", topic.to_string())]);
}
pub fn record_subscribe_batch(topic: &str) {
SUBSCRIBE_BATCHES.add(1, &[KeyValue::new("sq.topic", topic.to_string())]);
}
pub fn record_ack(topic: &str) {
ACK_TOTAL.add(1, &[KeyValue::new("sq.topic", topic.to_string())]);
}
pub fn record_topic_created() {
TOPICS_CREATED.add(1, &[]);
}
pub fn record_replicate_entries(count: u64) {
REPLICATE_ENTRIES.add(count, &[]);
}


@@ -0,0 +1,121 @@
use opentelemetry::trace::TracerProvider as _;
use opentelemetry::KeyValue;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::trace::SdkTracerProvider;
use opentelemetry_sdk::Resource;
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{EnvFilter, Layer};
/// Configuration for OpenTelemetry.
pub struct OtelConfig {
pub service_name: String,
pub node_id: String,
pub otlp_endpoint: Option<String>,
pub log_format: LogFormat,
}
pub enum LogFormat {
Pretty,
Json,
Short,
}
/// Initialized OTel guard. Drop to flush and shut down providers.
pub struct OtelGuard {
tracer_provider: Option<SdkTracerProvider>,
meter_provider: Option<SdkMeterProvider>,
}
impl Drop for OtelGuard {
fn drop(&mut self) {
if let Some(provider) = self.meter_provider.take()
&& let Err(e) = provider.shutdown()
{
eprintln!("failed to shut down OTel meter provider: {e}");
}
if let Some(provider) = self.tracer_provider.take()
&& let Err(e) = provider.shutdown()
{
eprintln!("failed to shut down OTel tracer provider: {e}");
}
}
}
/// Initialize tracing and metrics with optional OpenTelemetry export.
///
/// If `otlp_endpoint` is set, spans and metrics are exported via OTLP/gRPC.
/// Otherwise, only local logging is configured.
pub fn init(config: OtelConfig) -> anyhow::Result<OtelGuard> {
let env_filter = EnvFilter::from_default_env().add_directive("notmad=trace".parse()?);
let resource = Resource::builder()
.with_attributes([
KeyValue::new(
opentelemetry_semantic_conventions::attribute::SERVICE_NAME,
config.service_name.clone(),
),
KeyValue::new("sq.node_id", config.node_id.clone()),
])
.build();
let (tracer_provider, meter_provider, otel_layer) = match &config.otlp_endpoint {
Some(endpoint) => {
// Traces
let span_exporter = opentelemetry_otlp::SpanExporter::builder()
.with_tonic()
.with_endpoint(endpoint)
.build()?;
let tp = SdkTracerProvider::builder()
.with_resource(resource.clone())
.with_batch_exporter(span_exporter)
.build();
let tracer = tp.tracer("sq-server");
// Metrics
let metric_exporter = opentelemetry_otlp::MetricExporter::builder()
.with_tonic()
.with_endpoint(endpoint)
.build()?;
let mp = SdkMeterProvider::builder()
.with_resource(resource)
.with_periodic_exporter(metric_exporter)
.build();
// Register the global meter provider so opentelemetry::global::meter() works.
opentelemetry::global::set_meter_provider(mp.clone());
let layer = OpenTelemetryLayer::new(tracer);
(Some(tp), Some(mp), Some(layer))
}
None => (None, None, None),
};
let fmt_layer = match config.log_format {
LogFormat::Json => tracing_subscriber::fmt::layer().json().boxed(),
LogFormat::Short => tracing_subscriber::fmt::layer()
.with_line_number(false)
.with_target(false)
.with_file(false)
.with_level(true)
.boxed(),
LogFormat::Pretty => tracing_subscriber::fmt::layer().pretty().boxed(),
};
tracing_subscriber::registry()
.with(env_filter)
.with(fmt_layer)
.with(otel_layer)
.init();
Ok(OtelGuard {
tracer_provider,
meter_provider,
})
}


@@ -0,0 +1,211 @@
use std::collections::HashMap;
use std::sync::Arc;
use sq_models::Header;
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use tokio::sync::{mpsc, oneshot};
/// A single message submitted to the pipeline.
pub struct PipelineMessage {
pub topic: String,
pub partition: u32,
pub key: Vec<u8>,
pub value: Vec<u8>,
pub headers: Vec<Header>,
pub timestamp_ms: u64,
}
/// Result returned for each published message.
pub struct PipelineResult {
pub topic: String,
pub partition: u32,
pub offset: u64,
}
/// A request sent through the channel: a batch of messages + reply channel.
struct PipelineRequest {
messages: Vec<PipelineMessage>,
reply: oneshot::Sender<Result<Vec<PipelineResult>, String>>,
}
/// Send-side handle for submitting messages to the write pipeline.
#[derive(Clone)]
pub struct PipelineHandle {
tx: mpsc::Sender<PipelineRequest>,
}
impl PipelineHandle {
/// Submit messages to the pipeline and wait for durable ack.
/// Returns the assigned offsets once the batch has been fsync'd.
pub async fn submit(
&self,
messages: Vec<PipelineMessage>,
) -> Result<Vec<PipelineResult>, String> {
let (reply_tx, reply_rx) = oneshot::channel();
let req = PipelineRequest {
messages,
reply: reply_tx,
};
self.tx
.send(req)
.await
.map_err(|_| "pipeline closed".to_string())?;
reply_rx.await.map_err(|_| "pipeline dropped".to_string())?
}
/// Fire-and-forget submit (for ACK_MODE_NONE).
pub async fn submit_fire_and_forget(&self, messages: Vec<PipelineMessage>) {
let (reply_tx, _reply_rx) = oneshot::channel();
let req = PipelineRequest {
messages,
reply: reply_tx,
};
// Best-effort send, ignore errors.
let _ = self.tx.send(req).await;
}
}
/// Receive-side of the pipeline that batches and flushes writes.
pub struct WritePipeline {
rx: mpsc::Receiver<PipelineRequest>,
engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
}
/// Create a pipeline handle + runner pair.
pub fn create_pipeline(
engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
capacity: usize,
) -> (PipelineHandle, WritePipeline) {
let (tx, rx) = mpsc::channel(capacity);
(PipelineHandle { tx }, WritePipeline { rx, engine })
}
impl WritePipeline {
/// Run the pipeline loop. Exits when all senders are dropped or the
/// cancellation token is triggered (caller should select on both).
pub async fn run(&mut self) {
loop {
// Block until at least one request arrives.
let first = match self.rx.recv().await {
Some(req) => req,
None => return, // Channel closed.
};
// Drain any additional pending requests (group commit).
let mut batch = vec![first];
while let Ok(req) = self.rx.try_recv() {
batch.push(req);
}
self.flush_batch(batch).await;
}
}
async fn flush_batch(&self, mut requests: Vec<PipelineRequest>) {
// Group all messages by (topic, partition).
// We keep track of which request+index each message belongs to so we
// can route results back.
struct Tracking {
request_idx: usize,
message_idx: usize,
}
// Count messages per request before draining (for result slot allocation).
let msg_counts: Vec<usize> = requests.iter().map(|r| r.messages.len()).collect();
let mut grouped: HashMap<(String, u32), (Vec<(Option<Vec<u8>>, Vec<u8>, Vec<Header>, u64)>, Vec<Tracking>)> = HashMap::new();
for (req_idx, req) in requests.iter_mut().enumerate() {
for (msg_idx, msg) in req.messages.drain(..).enumerate() {
let key = (msg.topic, msg.partition);
let entry = grouped.entry(key).or_insert_with(|| (Vec::new(), Vec::new()));
entry.0.push((
if msg.key.is_empty() { None } else { Some(msg.key) },
msg.value,
msg.headers,
msg.timestamp_ms,
));
entry.1.push(Tracking { request_idx: req_idx, message_idx: msg_idx });
}
}
// Prepare result slots.
let mut results: Vec<Result<Vec<PipelineResult>, String>> = msg_counts
.iter()
.map(|&count| {
Ok((0..count)
.map(|_| PipelineResult {
topic: String::new(),
partition: 0,
offset: 0,
})
.collect())
})
.collect();
// Split grouped data into messages (moved into spawn_blocking) and tracking (kept here).
let mut partition_messages: Vec<(
String,
u32,
Vec<(Option<Vec<u8>>, Vec<u8>, Vec<Header>, u64)>,
)> = Vec::new();
let mut partition_tracking: Vec<Vec<Tracking>> = Vec::new();
for ((topic, partition), (messages, tracking)) in grouped {
partition_messages.push((topic, partition, messages));
partition_tracking.push(tracking);
}
// Flush each topic-partition batch concurrently via spawn_blocking.
// Each partition acquires only its own lock inside the engine.
let mut handles = Vec::with_capacity(partition_messages.len());
for (topic, partition, messages) in partition_messages {
let engine = self.engine.clone();
handles.push(tokio::task::spawn_blocking(move || {
let batch_refs: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = messages
.iter()
.map(|(k, v, h, ts)| (k.as_deref(), v.as_slice(), h.as_slice(), *ts))
.collect();
let result = engine.append_batch(&topic, partition, &batch_refs);
(topic, partition, result)
}));
}
// Await all writes and route results back.
for (handle, tracking) in handles.into_iter().zip(partition_tracking) {
match handle.await {
Ok((topic, partition, Ok(offsets))) => {
for (i, track) in tracking.iter().enumerate() {
if let Ok(ref mut res) = results[track.request_idx] {
res[track.message_idx] = PipelineResult {
topic: topic.clone(),
partition,
offset: offsets[i],
};
}
}
}
Ok((_topic, _partition, Err(e))) => {
let err_msg = e.to_string();
for track in &tracking {
results[track.request_idx] = Err(err_msg.clone());
}
}
Err(e) => {
// spawn_blocking panicked.
let err_msg = format!("write task panicked: {e}");
for track in &tracking {
results[track.request_idx] = Err(err_msg.clone());
}
}
}
}
// Reply to all waiters.
for (req, result) in requests.into_iter().zip(results) {
let _ = req.reply.send(result);
}
}
}
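
Putting the two halves together, a usage sketch (this mirrors the wiring State::new performs with a 10,000-slot channel; engine construction elided):

```rust
// Hypothetical wiring (sketch): create the handle/runner pair, drive the
// batching loop on its own task, then submit and await durable offsets.
let (handle, mut pipeline) = create_pipeline(engine.clone(), 10_000);
tokio::spawn(async move { pipeline.run().await });

let results = handle.submit(messages).await?;
for r in &results {
    println!("{}:{} -> offset {}", r.topic, r.partition, r.offset);
}
```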


@@ -0,0 +1,40 @@
use std::net::SocketAddr;
use anyhow::Context;
use axum::routing::get;
use notmad::{Component, ComponentInfo, MadError};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tower_http::trace::TraceLayer;
pub struct ServeHttp {
pub host: SocketAddr,
}
impl Component for ServeHttp {
fn info(&self) -> ComponentInfo {
"sq-server/http".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
tracing::info!("serving http on {}", self.host);
let router = axum::Router::new()
.route("/health", get(|| async { "ok" }))
.route("/ready", get(|| async { "ok" }))
.layer(TraceLayer::new_for_http());
let listener = TcpListener::bind(&self.host)
.await
.context("failed to bind http port")?;
axum::serve(listener, router.into_make_service())
.with_graceful_shutdown(async move {
cancellation_token.cancelled().await;
})
.await
.context("http server failed")?;
Ok(())
}
}


@@ -0,0 +1,101 @@
use std::sync::Arc;
use std::time::Duration;
use notmad::{Component, ComponentInfo, MadError};
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::s3::S3ObjectStore;
use sq_storage::object_store::shipper::{SegmentShipper, ShippedSegments};
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use crate::state::State;
/// Background component that periodically ships closed WAL segments to S3
/// and trims local files after successful upload.
pub struct BackgroundShipper {
state: State,
shipper: SegmentShipper<RealFileSystem, S3ObjectStore>,
interval: Duration,
}
impl BackgroundShipper {
pub fn new(
state: State,
object_store: Arc<S3ObjectStore>,
cluster_id: String,
interval: Duration,
) -> Self {
let fs = Arc::new(sq_sim::fs::RealFileSystem);
let shipped = Arc::new(Mutex::new(ShippedSegments::new()));
let shipper = SegmentShipper::new(fs, object_store, cluster_id, shipped);
Self {
state,
shipper,
interval,
}
}
async fn cycle(&self) {
let closed = match self.state.engine.close_all_segments() {
Ok(segments) => segments,
Err(e) => {
tracing::warn!(error = %e, "failed to close segments for shipping");
return;
}
};
if closed.is_empty() {
return;
}
let count = self.shipper.ship_all(&closed).await;
if count > 0 {
tracing::info!(shipped = count, total = closed.len(), "shipped segments to S3");
}
// Trim local WAL files for successfully shipped segments.
// The shipper tracks which segments were shipped; we delete local copies.
// For now, we only delete if all segments were shipped successfully.
if count == closed.len() {
let fs = sq_sim::fs::RealFileSystem;
for seg in &closed {
if let Err(e) = sq_sim::fs::FileSystem::remove_file(&fs, &seg.path) {
tracing::warn!(
path = %seg.path.display(),
error = %e,
"failed to trim shipped segment"
);
}
}
}
}
}
impl Component for BackgroundShipper {
fn info(&self) -> ComponentInfo {
"sq-server/shipper".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
tracing::info!(
interval_secs = self.interval.as_secs(),
"background shipper started"
);
loop {
tokio::select! {
() = cancellation_token.cancelled() => {
// Final flush on shutdown.
self.cycle().await;
break;
}
() = tokio::time::sleep(self.interval) => {
self.cycle().await;
}
}
}
Ok(())
}
}


@@ -0,0 +1,62 @@
use std::path::PathBuf;
use std::sync::Arc;
use drop_queue::DropQueue;
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::S3ObjectStore;
use crate::pipeline::{self, PipelineHandle, WritePipeline};
#[derive(Clone)]
pub struct State {
pub engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
pub pipeline: PipelineHandle,
pub s3_reader: Option<Arc<ObjectStoreReader<RealFileSystem, S3ObjectStore>>>,
pub drop_queue: DropQueue,
pub config: Config,
}
#[derive(Clone)]
pub struct Config {
pub node_id: String,
pub data_dir: PathBuf,
pub seeds: Vec<String>,
pub grpc_address: String,
pub cluster_id: String,
pub s3_bucket: Option<String>,
pub s3_endpoint: Option<String>,
pub s3_region: Option<String>,
pub sync_policy: sq_models::SyncPolicy,
}
impl State {
pub fn new(config: Config) -> anyhow::Result<(Self, WritePipeline)> {
let fs = Arc::new(RealFileSystem);
let clock = Arc::new(RealClock);
let wal_config = sq_models::WalConfig {
data_dir: config.data_dir.clone(),
sync_policy: config.sync_policy.clone(),
..Default::default()
};
let engine = StorageEngine::new(fs, clock, wal_config)?;
engine.recover()?;
let engine = Arc::new(engine);
let (handle, writer) = pipeline::create_pipeline(engine.clone(), 10_000);
Ok((
Self {
engine,
pipeline: handle,
s3_reader: None,
drop_queue: DropQueue::new(),
config,
},
writer,
))
}
}


@@ -0,0 +1,56 @@
use std::sync::Arc;
use std::time::Duration;
use notmad::{Component, ComponentInfo, MadError};
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use tokio_util::sync::CancellationToken;
/// Background task that periodically fsyncs all open WAL writers.
/// Used when SyncPolicy is Interval.
pub struct BackgroundSync {
engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
interval: Duration,
}
impl BackgroundSync {
pub fn new(
engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
interval: Duration,
) -> Self {
Self { engine, interval }
}
}
impl Component for BackgroundSync {
fn info(&self) -> ComponentInfo {
"sq-server/background-sync".into()
}
async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
let mut interval = tokio::time::interval(self.interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
// tokio intervals fire immediately; consume the first tick so the first
// fsync happens a full interval after startup.
interval.tick().await;
loop {
tokio::select! {
() = cancellation_token.cancelled() => {
// Final sync on shutdown.
if let Err(e) = self.engine.fsync_all_writers() {
tracing::warn!(error = %e, "final sync on shutdown failed");
}
break;
}
_ = interval.tick() => {
if let Err(e) = self.engine.fsync_all_writers() {
tracing::warn!(error = %e, "background sync failed");
}
}
}
}
Ok(())
}
}
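// A minimal sketch of wiring the periodic sync when the configured policy is
// `SyncPolicy::Interval`. The five-second interval is illustrative; this
// commit does not define a default.
#[allow(dead_code)]
async fn run_interval_sync(
engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
cancel: CancellationToken,
) {
let sync = BackgroundSync::new(engine, Duration::from_secs(5));
// Performs a final fsync of all open WAL writers on shutdown.
let _ = sync.run(cancel).await;
}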

View File

@@ -0,0 +1,462 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::{Duration, Instant};
use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
cluster_service_server::ClusterServiceServer,
control_plane_service_server::ControlPlaneServiceServer,
data_plane_service_server::DataPlaneServiceServer,
status_service_client::StatusServiceClient,
status_service_server::StatusServiceServer,
GetStatusRequest, SubscribeRequest,
};
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_sdk::{
Consumer, ConsumerConfig, Producer, ProducerConfig,
GrpcProducer, GrpcProducerConfig, ProducerMessage,
};
use sq_server::capnp::CapnpServer;
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
// ---------------------------------------------------------------------------
// Test harness — extends TestCluster to include capnp server alongside gRPC
// ---------------------------------------------------------------------------
struct TestNode {
grpc_addr: SocketAddr,
capnp_addr: SocketAddr,
cancel: CancellationToken,
pipeline_cancel: CancellationToken,
_temp_dir: TempDir,
_server_handle: tokio::task::JoinHandle<()>,
_capnp_handle: tokio::task::JoinHandle<()>,
}
impl TestNode {
fn grpc_endpoint(&self) -> String {
format!("http://{}", self.grpc_addr)
}
fn capnp_endpoint(&self) -> String {
self.capnp_addr.to_string()
}
}
struct TestCluster {
nodes: Vec<TestNode>,
}
impl TestCluster {
async fn start(n: usize) -> Self {
let mut grpc_listeners = Vec::new();
let mut capnp_listeners = Vec::new();
let mut grpc_addrs = Vec::new();
let mut capnp_addrs = Vec::new();
for _ in 0..n {
let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
grpc_addrs.push(grpc_listener.local_addr().unwrap());
capnp_addrs.push(capnp_listener.local_addr().unwrap());
grpc_listeners.push(grpc_listener);
capnp_listeners.push(capnp_listener);
}
let mut nodes = Vec::new();
for (i, (grpc_listener, capnp_listener)) in
grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
{
let grpc_addr = grpc_addrs[i];
let capnp_addr = capnp_addrs[i];
let node_id = format!("capnp-stress-node-{}", i + 1);
let temp_dir = TempDir::new().unwrap();
let seeds: Vec<String> = grpc_addrs
.iter()
.enumerate()
.filter(|(j, _)| *j != i)
.map(|(_, a)| a.to_string())
.collect();
let config = Config {
node_id: node_id.clone(),
data_dir: temp_dir.path().to_path_buf(),
seeds: seeds.clone(),
grpc_address: grpc_addr.to_string(),
cluster_id: "test-cluster".to_string(),
s3_bucket: None,
s3_endpoint: None,
s3_region: None,
sync_policy: sq_models::SyncPolicy::EveryBatch,
};
let (state, mut pipeline) = State::new(config).unwrap();
let pipeline_cancel = CancellationToken::new();
let pipeline_cancel_clone = pipeline_cancel.clone();
tokio::spawn(async move {
tokio::select! {
() = pipeline.run() => {}
() = pipeline_cancel_clone.cancelled() => {}
}
});
let membership = Arc::new(Membership::new(MembershipConfig {
node_id: node_id.clone(),
address: grpc_addr.to_string(),
seeds,
..Default::default()
}));
let cancel = CancellationToken::new();
// Spawn gRPC server.
let cancel_clone = cancel.clone();
let state_clone = state.clone();
let membership_clone = membership.clone();
let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
let server_handle = tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(StatusServiceServer::new(health::HealthServer {
state: state_clone.clone(),
}))
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
state: state_clone.clone(),
}))
.add_service(ControlPlaneServiceServer::new(
control_plane::ControlPlaneServer {
state: state_clone.clone(),
},
))
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
state: state_clone,
membership: membership_clone,
}))
.serve_with_incoming_shutdown(incoming, async move {
cancel_clone.cancelled().await;
})
.await
.unwrap();
});
// Spawn capnp server — use the CapnpServer Component's run method directly.
let cancel_clone = cancel.clone();
let capnp_state = state.clone();
let capnp_handle = tokio::spawn(async move {
let server = CapnpServer {
host: capnp_addr,
state: capnp_state,
};
// CapnpServer binds its own listener, so we can't hand it the one we already
// bound. Drop ours so the port is free for CapnpServer to rebind (a brief
// race window, acceptable in tests).
drop(capnp_listener);
let _ = notmad::Component::run(&server, cancel_clone).await;
});
nodes.push(TestNode {
grpc_addr,
capnp_addr,
cancel,
pipeline_cancel,
_temp_dir: temp_dir,
_server_handle: server_handle,
_capnp_handle: capnp_handle,
});
}
// Wait for gRPC to be ready.
for node in &nodes {
wait_for_ready(&node.grpc_endpoint()).await;
}
// Give the capnp server a moment to bind.
tokio::time::sleep(Duration::from_millis(50)).await;
TestCluster { nodes }
}
fn node(&self, index: usize) -> &TestNode {
&self.nodes[index]
}
}
impl Drop for TestCluster {
fn drop(&mut self) {
for node in &self.nodes {
node.pipeline_cancel.cancel();
node.cancel.cancel();
}
}
}
async fn wait_for_ready(endpoint: &str) {
let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
loop {
if tokio::time::Instant::now() > deadline {
panic!("Server at {} did not become ready in time", endpoint);
}
if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
if client
.status(tonic::Request::new(GetStatusRequest {}))
.await
.is_ok()
{
return;
}
}
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
}
}
// ---------------------------------------------------------------------------
// Capnp stress test 1: Single producer — 100K messages via capnp
// ---------------------------------------------------------------------------
#[tokio::test]
async fn capnp_stress_single_producer_100k() {
let cluster = TestCluster::start(1).await;
let capnp_ep = cluster.node(0).capnp_endpoint();
let mut producer = Producer::connect(ProducerConfig {
address: capnp_ep,
..Default::default()
})
.await
.unwrap();
let total = 100_000u64;
let batch_size = 500;
let payload = vec![0u8; 128];
let start = Instant::now();
for batch_start in (0..total).step_by(batch_size) {
let batch_end = (batch_start + batch_size as u64).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("capnp-stress-topic", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
let publish_duration = start.elapsed();
let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();
eprintln!(
"capnp_stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
total,
publish_duration.as_secs_f64(),
msgs_per_sec,
(total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
);
// Verify: read back via gRPC subscribe (capnp subscribe is streaming-only).
let grpc_ep = cluster.node(0).grpc_endpoint();
let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "capnp-stress-topic".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut consumed = 0u64;
while consumed < total {
match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
_ => break,
}
}
assert_eq!(consumed, total, "expected all messages to be consumed");
}
// ---------------------------------------------------------------------------
// Capnp stress test 2: Concurrent producers — 10 producers, 10K messages each
// ---------------------------------------------------------------------------
#[tokio::test]
async fn capnp_stress_concurrent_producers() {
let cluster = TestCluster::start(1).await;
let capnp_ep = cluster.node(0).capnp_endpoint();
let num_producers = 10;
let msgs_per_producer = 10_000u64;
let payload = vec![0u8; 64];
let start = Instant::now();
let mut handles = Vec::new();
for p in 0..num_producers {
let ep = capnp_ep.clone();
let pl = payload.clone();
handles.push(tokio::spawn(async move {
let mut producer = Producer::connect(ProducerConfig {
address: ep,
producer_id: format!("capnp-producer-{p}"),
..Default::default()
})
.await
.unwrap();
let topic = format!("capnp-concurrent-{p}");
for batch_start in (0..msgs_per_producer).step_by(100) {
let batch_end = (batch_start + 100).min(msgs_per_producer);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
}));
}
for handle in handles {
handle.await.unwrap();
}
let duration = start.elapsed();
let total = num_producers as u64 * msgs_per_producer;
let msgs_per_sec = total as f64 / duration.as_secs_f64();
eprintln!(
"capnp_stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
num_producers,
msgs_per_producer,
total,
duration.as_secs_f64(),
msgs_per_sec
);
}
// ---------------------------------------------------------------------------
// Capnp stress test 3: Subscribe via capnp — publish then consume
// ---------------------------------------------------------------------------
#[tokio::test]
async fn capnp_stress_subscribe() {
let cluster = TestCluster::start(1).await;
let capnp_ep = cluster.node(0).capnp_endpoint();
let total = 10_000u64;
let payload = vec![0u8; 64];
// Publish via capnp.
let mut producer = Producer::connect(ProducerConfig {
address: capnp_ep.clone(),
..Default::default()
})
.await
.unwrap();
for batch_start in (0..total).step_by(500) {
let batch_end = (batch_start + 500).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("capnp-sub-topic", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
// Consume via capnp.
let mut consumer = Consumer::connect(ConsumerConfig {
address: capnp_ep,
topic: "capnp-sub-topic".to_string(),
consumer_group: String::new(),
auto_commit: false,
start_offset: Some(0),
max_poll_records: 1000,
..Default::default()
})
.await
.unwrap();
let mut consumed = 0u64;
let start = Instant::now();
while consumed < total {
match tokio::time::timeout(Duration::from_secs(10), consumer.poll()).await {
Ok(Ok(msgs)) => consumed += msgs.len() as u64,
_ => break,
}
}
let consume_duration = start.elapsed();
eprintln!(
"capnp_stress_subscribe: consumed {} messages in {:.2}s ({:.0} msg/s)",
consumed,
consume_duration.as_secs_f64(),
consumed as f64 / consume_duration.as_secs_f64()
);
assert_eq!(consumed, total, "expected all messages to be consumed");
}
// ---------------------------------------------------------------------------
// Throughput comparison: gRPC vs capnp
// ---------------------------------------------------------------------------
async fn bench_grpc_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
let endpoint = cluster.node(0).grpc_endpoint();
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: endpoint,
..Default::default()
})
.await
.unwrap();
let payload = vec![0u8; 128];
let start = Instant::now();
for batch_start in (0..total).step_by(batch_size) {
let batch_end = (batch_start + batch_size as u64).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("bench-grpc", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
total as f64 / start.elapsed().as_secs_f64()
}
async fn bench_capnp_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
let endpoint = cluster.node(0).capnp_endpoint();
let mut producer = Producer::connect(ProducerConfig {
address: endpoint,
..Default::default()
})
.await
.unwrap();
let payload = vec![0u8; 128];
let start = Instant::now();
for batch_start in (0..total).step_by(batch_size) {
let batch_end = (batch_start + batch_size as u64).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("bench-capnp", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
total as f64 / start.elapsed().as_secs_f64()
}
#[tokio::test]
async fn capnp_vs_grpc_throughput() {
let cluster = TestCluster::start(1).await;
let grpc_rate = bench_grpc_publish(&cluster, 100_000, 500).await;
let capnp_rate = bench_capnp_publish(&cluster, 100_000, 500).await;
eprintln!("=== THROUGHPUT COMPARISON (single producer, 100K msgs x 128B) ===");
eprintln!("gRPC: {:.0} msg/s", grpc_rate);
eprintln!("capnp: {:.0} msg/s", capnp_rate);
eprintln!("ratio: {:.2}x", capnp_rate / grpc_rate);
}
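// A hedged sketch: the batched-publish loop recurs throughout these tests; a
// helper like this captures the pattern. The name is illustrative and not
// part of the SDK.
#[allow(dead_code)]
async fn publish_n(
producer: &mut Producer,
topic: &str,
total: u64,
batch_size: usize,
payload: &[u8],
) {
for batch_start in (0..total).step_by(batch_size) {
let batch_end = (batch_start + batch_size as u64).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new(topic.to_string(), payload.to_vec()))
.collect();
producer.send_batch(batch).await.unwrap();
}
}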

View File

@@ -0,0 +1,763 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
cluster_service_client::ClusterServiceClient,
cluster_service_server::ClusterServiceServer,
control_plane_service_client::ControlPlaneServiceClient,
control_plane_service_server::ControlPlaneServiceServer,
data_plane_service_client::DataPlaneServiceClient,
data_plane_service_server::DataPlaneServiceServer,
status_service_client::StatusServiceClient,
status_service_server::StatusServiceServer,
ClusterNodeInfo, CreateTopicRequest, DeleteTopicRequest, DescribeTopicRequest,
FetchSegmentRequest, GetStatusRequest, HeartbeatRequest, JoinRequest, ListTopicsRequest,
ReplicateEntriesRequest, SubscribeRequest,
};
use sq_sdk::{GrpcConsumer, GrpcConsumerConfig, GrpcProducer, GrpcProducerConfig};
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
// ---------------------------------------------------------------------------
// Test harness
// ---------------------------------------------------------------------------
struct TestNode {
addr: SocketAddr,
#[allow(dead_code)]
node_id: String,
#[allow(dead_code)]
state: State,
membership: Arc<Membership>,
cancel: CancellationToken,
pipeline_cancel: CancellationToken,
_temp_dir: TempDir,
_server_handle: tokio::task::JoinHandle<()>,
}
impl TestNode {
fn endpoint(&self) -> String {
format!("http://{}", self.addr)
}
}
struct TestCluster {
nodes: Vec<TestNode>,
}
impl TestCluster {
/// Start a cluster of `n` real SQ server nodes on random ports.
async fn start(n: usize) -> Self {
// Phase 1: Bind all listeners to get ports before starting servers.
let mut listeners = Vec::new();
let mut addrs = Vec::new();
for _ in 0..n {
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
addrs.push(addr);
listeners.push(listener);
}
// Phase 2: Start each node.
let mut nodes = Vec::new();
for (i, listener) in listeners.into_iter().enumerate() {
let addr = addrs[i];
let node_id = format!("node-{}", i + 1);
let temp_dir = TempDir::new().unwrap();
// Seeds: all addresses except our own.
let seeds: Vec<String> = addrs
.iter()
.enumerate()
.filter(|(j, _)| *j != i)
.map(|(_, a)| a.to_string())
.collect();
let config = Config {
node_id: node_id.clone(),
data_dir: temp_dir.path().to_path_buf(),
seeds: seeds.clone(),
grpc_address: addr.to_string(),
cluster_id: "test-cluster".to_string(),
s3_bucket: None,
s3_endpoint: None,
s3_region: None,
sync_policy: sq_models::SyncPolicy::EveryBatch,
};
let (state, mut pipeline) = State::new(config).unwrap();
// Spawn the write pipeline for this node.
let pipeline_cancel = CancellationToken::new();
let pipeline_cancel_clone = pipeline_cancel.clone();
tokio::spawn(async move {
tokio::select! {
() = pipeline.run() => {}
() = pipeline_cancel_clone.cancelled() => {}
}
});
let membership = Arc::new(Membership::new(MembershipConfig {
node_id: node_id.clone(),
address: addr.to_string(),
seeds,
..Default::default()
}));
let cancel = CancellationToken::new();
let cancel_clone = cancel.clone();
let state_clone = state.clone();
let membership_clone = membership.clone();
let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);
let server_handle = tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(StatusServiceServer::new(health::HealthServer {
state: state_clone.clone(),
}))
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
state: state_clone.clone(),
}))
.add_service(ControlPlaneServiceServer::new(
control_plane::ControlPlaneServer {
state: state_clone.clone(),
},
))
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
state: state_clone,
membership: membership_clone,
}))
.serve_with_incoming_shutdown(incoming, async move {
cancel_clone.cancelled().await;
})
.await
.unwrap();
});
nodes.push(TestNode {
addr,
node_id,
state,
membership,
cancel,
pipeline_cancel,
_temp_dir: temp_dir,
_server_handle: server_handle,
});
}
// Phase 3: Wait for all servers to be ready.
for node in &nodes {
wait_for_ready(&node.endpoint()).await;
}
TestCluster { nodes }
}
fn node(&self, index: usize) -> &TestNode {
&self.nodes[index]
}
}
impl Drop for TestCluster {
fn drop(&mut self) {
for node in &self.nodes {
node.pipeline_cancel.cancel();
node.cancel.cancel();
}
}
}
/// Poll the Status RPC until the server responds, with a timeout.
async fn wait_for_ready(endpoint: &str) {
let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
loop {
if tokio::time::Instant::now() > deadline {
panic!("Server at {} did not become ready in time", endpoint);
}
if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
if client
.status(tonic::Request::new(GetStatusRequest {}))
.await
.is_ok()
{
return;
}
}
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
}
}
/// Collect messages from a subscribe stream with a timeout.
async fn collect_messages(
endpoint: &str,
topic: &str,
start_offset: u64,
expected_count: usize,
) -> Vec<sq_grpc_interface::ConsumedMessage> {
let mut client = DataPlaneServiceClient::connect(endpoint.to_string())
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: topic.to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(start_offset),
max_batch_size: 200,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut messages = Vec::new();
while messages.len() < expected_count {
match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
Ok(Some(Ok(batch))) => messages.extend(batch.messages),
_ => break,
}
}
messages
}
// ---------------------------------------------------------------------------
// Test 1: Single node, 1000 messages via SDK
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_single_node_publish_consume_1000() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
// Publish 1000 messages via SDK Producer.
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
for i in 0..1000u64 {
let result = producer
.send("orders", None, format!("msg-{i}").as_bytes())
.await
.unwrap();
assert_eq!(result.offset, i);
assert_eq!(result.topic, "orders");
}
// Consume all 1000 via raw subscribe.
let messages = collect_messages(&endpoint, "orders", 0, 1000).await;
assert_eq!(messages.len(), 1000);
for (i, msg) in messages.iter().enumerate() {
assert_eq!(msg.offset, i as u64);
assert_eq!(msg.value, format!("msg-{i}").as_bytes());
}
}
// ---------------------------------------------------------------------------
// Test 2: Multi-topic isolation
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_multi_topic_isolation() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
let topics = ["alpha", "beta", "gamma"];
let counts: [usize; 3] = [50, 100, 25];
// Publish to each topic.
for (topic, count) in topics.iter().zip(counts.iter()) {
for i in 0..*count {
producer
.send(topic, None, format!("{topic}-{i}").as_bytes())
.await
.unwrap();
}
}
// Consume from each topic and verify isolation.
for (topic, expected_count) in topics.iter().zip(counts.iter()) {
let messages = collect_messages(&endpoint, topic, 0, *expected_count).await;
assert_eq!(
messages.len(),
*expected_count,
"topic {topic} expected {expected_count} messages, got {}",
messages.len()
);
for (i, msg) in messages.iter().enumerate() {
assert_eq!(msg.offset, i as u64);
assert_eq!(msg.value, format!("{topic}-{i}").as_bytes());
}
}
}
// ---------------------------------------------------------------------------
// Test 3: Consumer group offset resume
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_consumer_group_offset_resume() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
// Publish 20 messages.
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
for i in 0..20u64 {
producer
.send("events", None, format!("msg-{i}").as_bytes())
.await
.unwrap();
}
// Consumer 1: consume with auto_commit, collect at least 10 messages.
{
let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
address: endpoint.clone(),
consumer_group: "test-group".to_string(),
topic: "events".to_string(),
auto_commit: true,
..Default::default()
})
.await
.unwrap();
let mut received = Vec::new();
let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
while received.len() < 10 && tokio::time::Instant::now() < deadline {
let msgs = consumer.poll().await.unwrap();
if msgs.is_empty() {
tokio::time::sleep(Duration::from_millis(50)).await;
continue;
}
received.extend(msgs);
}
assert!(
received.len() >= 10,
"expected at least 10 messages, got {}",
received.len()
);
}
// Consumer 2: reconnect with same group, should resume from committed offset.
{
let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
address: endpoint.clone(),
consumer_group: "test-group".to_string(),
topic: "events".to_string(),
auto_commit: false,
..Default::default()
})
.await
.unwrap();
let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
let mut msgs = Vec::new();
while msgs.is_empty() && tokio::time::Instant::now() < deadline {
msgs = consumer.poll().await.unwrap();
if msgs.is_empty() {
tokio::time::sleep(Duration::from_millis(50)).await;
}
}
assert!(
!msgs.is_empty(),
"expected messages from resumed consumer"
);
// Should start from at least offset 9 (last committed by auto_commit).
assert!(
msgs[0].offset >= 9,
"expected resume from offset >= 9, got {}",
msgs[0].offset
);
}
}
// ---------------------------------------------------------------------------
// Test 4: Topic management CRUD
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_topic_management_crud() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let mut client = ControlPlaneServiceClient::connect(endpoint.clone())
.await
.unwrap();
// Create topic.
let resp = client
.create_topic(tonic::Request::new(CreateTopicRequest {
name: "orders".to_string(),
partitions: 4,
replication_factor: 3,
}))
.await
.unwrap();
assert_eq!(resp.into_inner().name, "orders");
// Duplicate should fail.
let err = client
.create_topic(tonic::Request::new(CreateTopicRequest {
name: "orders".to_string(),
partitions: 4,
replication_factor: 3,
}))
.await
.unwrap_err();
assert_eq!(err.code(), tonic::Code::AlreadyExists);
// Create another.
client
.create_topic(tonic::Request::new(CreateTopicRequest {
name: "events".to_string(),
partitions: 1,
replication_factor: 1,
}))
.await
.unwrap();
// List topics.
let resp = client
.list_topics(tonic::Request::new(ListTopicsRequest {}))
.await
.unwrap();
let topics = resp.into_inner().topics;
assert_eq!(topics.len(), 2);
let names: Vec<&str> = topics.iter().map(|t| t.name.as_str()).collect();
assert!(names.contains(&"orders"));
assert!(names.contains(&"events"));
// Describe topic.
let resp = client
.describe_topic(tonic::Request::new(DescribeTopicRequest {
name: "orders".to_string(),
}))
.await
.unwrap()
.into_inner();
let topic = resp.topic.unwrap();
assert_eq!(topic.name, "orders");
assert_eq!(topic.partitions, 4);
assert_eq!(topic.replication_factor, 3);
assert_eq!(resp.partition_info.len(), 4);
// Describe non-existent topic.
let err = client
.describe_topic(tonic::Request::new(DescribeTopicRequest {
name: "nonexistent".to_string(),
}))
.await
.unwrap_err();
assert_eq!(err.code(), tonic::Code::NotFound);
// Delete topic.
client
.delete_topic(tonic::Request::new(DeleteTopicRequest {
name: "orders".to_string(),
}))
.await
.unwrap();
// Verify deleted.
let resp = client
.list_topics(tonic::Request::new(ListTopicsRequest {}))
.await
.unwrap();
assert_eq!(resp.into_inner().topics.len(), 1);
// Delete non-existent should fail.
let err = client
.delete_topic(tonic::Request::new(DeleteTopicRequest {
name: "orders".to_string(),
}))
.await
.unwrap_err();
assert_eq!(err.code(), tonic::Code::NotFound);
}
// ---------------------------------------------------------------------------
// Test 5: Three-node join discovery
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_three_node_join_discovery() {
let cluster = TestCluster::start(3).await;
// Node-2 joins node-1.
let mut client = ClusterServiceClient::connect(cluster.node(0).endpoint())
.await
.unwrap();
let resp = client
.join(tonic::Request::new(JoinRequest {
node_id: "node-2".to_string(),
address: cluster.nodes[1].addr.to_string(),
}))
.await
.unwrap();
let members = resp.into_inner().members;
assert!(
members.len() >= 2,
"after node-2 join, node-1 should know >= 2 members, got {}",
members.len()
);
let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
assert!(ids.contains(&"node-1"));
assert!(ids.contains(&"node-2"));
// Node-3 joins node-1.
let resp = client
.join(tonic::Request::new(JoinRequest {
node_id: "node-3".to_string(),
address: cluster.nodes[2].addr.to_string(),
}))
.await
.unwrap();
let members = resp.into_inner().members;
assert!(
members.len() >= 3,
"after node-3 join, node-1 should know >= 3 members, got {}",
members.len()
);
let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
assert!(ids.contains(&"node-1"));
assert!(ids.contains(&"node-2"));
assert!(ids.contains(&"node-3"));
// Verify via membership handle.
let all = cluster.node(0).membership.all_members().await;
assert_eq!(all.len(), 3);
}
// ---------------------------------------------------------------------------
// Test 6: Cross-node heartbeat gossip
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_cross_node_heartbeat_gossip() {
let cluster = TestCluster::start(3).await;
// Node-2 and node-3 join node-1.
let mut client1 = ClusterServiceClient::connect(cluster.node(0).endpoint())
.await
.unwrap();
client1
.join(tonic::Request::new(JoinRequest {
node_id: "node-2".to_string(),
address: cluster.nodes[1].addr.to_string(),
}))
.await
.unwrap();
client1
.join(tonic::Request::new(JoinRequest {
node_id: "node-3".to_string(),
address: cluster.nodes[2].addr.to_string(),
}))
.await
.unwrap();
// Node-1 now knows about all 3. Send heartbeat to node-2 carrying this info.
let all_members = cluster.node(0).membership.all_members().await;
let known: Vec<ClusterNodeInfo> = all_members
.iter()
.map(|m| ClusterNodeInfo {
node_id: m.node_id.clone(),
address: m.address.clone(),
status: m.status.to_string(),
})
.collect();
let mut client2 = ClusterServiceClient::connect(cluster.node(1).endpoint())
.await
.unwrap();
let resp = client2
.heartbeat(tonic::Request::new(HeartbeatRequest {
node_id: "node-1".to_string(),
known_members: known,
}))
.await
.unwrap();
// Node-2 should now know about all 3 nodes via gossip.
let node2_members = resp.into_inner().members;
assert!(
node2_members.len() >= 3,
"node-2 should know >= 3 members after gossip, got {}",
node2_members.len()
);
}
// ---------------------------------------------------------------------------
// Test 7: Cross-node replication via RPC
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_cross_node_replication_via_rpc() {
let cluster = TestCluster::start(2).await;
// Publish 10 messages to node-1 via SDK.
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: cluster.node(0).endpoint(),
..Default::default()
})
.await
.unwrap();
let mut entry_data = Vec::new();
for i in 0..10u64 {
let value = format!("replicated-{i}");
producer
.send("repl-topic", None, value.as_bytes())
.await
.unwrap();
entry_data.push(value.into_bytes());
}
// Replicate the same data to node-2 via ClusterService RPC.
let mut cluster_client = ClusterServiceClient::connect(cluster.node(1).endpoint())
.await
.unwrap();
let resp = cluster_client
.replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
topic: "repl-topic".to_string(),
partition: 0,
entries: entry_data,
}))
.await
.unwrap();
let last_offset = resp.into_inner().last_replicated_offset;
assert_eq!(last_offset, 9);
// Read from node-2 to verify the data is there.
let messages = collect_messages(&cluster.node(1).endpoint(), "repl-topic", 0, 10).await;
assert_eq!(messages.len(), 10);
for (i, msg) in messages.iter().enumerate() {
assert_eq!(msg.offset, i as u64);
assert_eq!(msg.value, format!("replicated-{i}").as_bytes());
}
}
// ---------------------------------------------------------------------------
// Test 8: FetchSegment recovery
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_fetch_segment_recovery() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
// Write 50 messages.
let mut producer = GrpcProducer::connect(GrpcProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
for i in 0..50u64 {
producer
.send("recovery-topic", None, format!("data-{i}").as_bytes())
.await
.unwrap();
}
// Fetch via FetchSegment stream.
let mut client = ClusterServiceClient::connect(endpoint)
.await
.unwrap();
let response = client
.fetch_segment(tonic::Request::new(FetchSegmentRequest {
topic: "recovery-topic".to_string(),
partition: 0,
from_offset: 0,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut all_chunks = Vec::new();
while let Ok(Some(Ok(resp))) =
tokio::time::timeout(Duration::from_secs(5), stream.next()).await
{
all_chunks.extend(resp.chunk);
}
// Decode the wire format: offset(8 LE) + value_len(4 LE) + value
let mut cursor = 0;
let mut decoded = Vec::new();
while cursor + 12 <= all_chunks.len() {
let offset = u64::from_le_bytes(all_chunks[cursor..cursor + 8].try_into().unwrap());
let value_len =
u32::from_le_bytes(all_chunks[cursor + 8..cursor + 12].try_into().unwrap()) as usize;
cursor += 12;
assert!(cursor + value_len <= all_chunks.len());
let value = all_chunks[cursor..cursor + value_len].to_vec();
cursor += value_len;
decoded.push((offset, value));
}
assert_eq!(decoded.len(), 50);
for (i, (offset, value)) in decoded.iter().enumerate() {
assert_eq!(*offset, i as u64);
assert_eq!(value, format!("data-{i}").as_bytes());
}
}
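// A hedged sketch of the inverse of the decode loop above: one entry in the
// FetchSegment wire format is the offset (8 bytes LE), the value length
// (4 bytes LE), then the value itself. Illustrative only; the server-side
// encoder is not shown in this hunk.
#[allow(dead_code)]
fn encode_entry(offset: u64, value: &[u8]) -> Vec<u8> {
let mut buf = Vec::with_capacity(12 + value.len());
buf.extend_from_slice(&offset.to_le_bytes());
buf.extend_from_slice(&(value.len() as u32).to_le_bytes());
buf.extend_from_slice(value);
buf
}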
// ---------------------------------------------------------------------------
// Test 9: Node status returns correct id
// ---------------------------------------------------------------------------
#[tokio::test]
async fn test_node_status_returns_correct_id() {
let cluster = TestCluster::start(3).await;
for (i, node) in cluster.nodes.iter().enumerate() {
let mut client = StatusServiceClient::connect(node.endpoint()).await.unwrap();
let resp = client
.status(tonic::Request::new(GetStatusRequest {}))
.await
.unwrap();
let expected = format!("node-{}", i + 1);
assert_eq!(
resp.into_inner().node_id,
expected,
"node at index {} should have id '{}'",
i,
expected
);
}
}

View File

@@ -0,0 +1,496 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use sq_grpc_interface::{
data_plane_service_client::DataPlaneServiceClient,
data_plane_service_server::DataPlaneServiceServer,
status_service_client::StatusServiceClient,
status_service_server::StatusServiceServer,
AckMode, GetStatusRequest, MessageHeader, PublishMessage, PublishRequest, PublishSettings,
SubscribeRequest,
};
use sq_sim::fs::InMemoryFileSystem;
use sq_sim::SimClock;
use sq_storage::engine::StorageEngine;
use tokio::sync::Mutex;
use tokio_stream::StreamExt;
/// A lightweight test harness that starts a gRPC server on a random port
/// and returns its address plus a shutdown handle.
struct TestServer {
addr: SocketAddr,
_shutdown: tokio::sync::oneshot::Sender<()>,
}
impl TestServer {
async fn start() -> Self {
let fs = Arc::new(InMemoryFileSystem::new());
let clock = Arc::new(SimClock::new());
let config = sq_models::WalConfig {
max_segment_bytes: 1024 * 1024,
max_segment_age_secs: 3600,
data_dir: PathBuf::from("/data"),
..Default::default()
};
let engine = StorageEngine::new(fs, clock, config).unwrap();
engine.recover().unwrap();
let engine = Arc::new(Mutex::new(engine));
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
// Define minimal inline service implementations for tests, with no full server State.
let node_id = "test-node".to_string();
struct TestHealthServer {
node_id: String,
}
#[tonic::async_trait]
impl sq_grpc_interface::status_service_server::StatusService for TestHealthServer {
async fn status(
&self,
_request: tonic::Request<GetStatusRequest>,
) -> Result<tonic::Response<sq_grpc_interface::GetStatusResponse>, tonic::Status> {
Ok(tonic::Response::new(sq_grpc_interface::GetStatusResponse {
node_id: self.node_id.clone(),
cluster: None,
}))
}
}
struct TestDataPlaneServer {
engine: Arc<Mutex<StorageEngine<InMemoryFileSystem, SimClock>>>,
}
#[tonic::async_trait]
impl sq_grpc_interface::data_plane_service_server::DataPlaneService
for TestDataPlaneServer
{
async fn publish(
&self,
request: tonic::Request<PublishRequest>,
) -> Result<tonic::Response<sq_grpc_interface::PublishResponse>, tonic::Status> {
let req = request.into_inner();
if req.messages.is_empty() {
return Err(tonic::Status::invalid_argument(
"messages must not be empty",
));
}
let mut results = Vec::new();
let engine = self.engine.lock().await;
for msg in &req.messages {
if msg.topic.is_empty() {
return Err(tonic::Status::invalid_argument("topic must not be empty"));
}
let headers: Vec<sq_models::Header> = msg
.headers
.iter()
.map(|h| sq_models::Header {
key: h.key.clone(),
value: h.value.clone(),
})
.collect();
let key = if msg.key.is_empty() {
None
} else {
Some(msg.key.as_slice())
};
let offset = engine
.append(&msg.topic, 0, key, &msg.value, &headers, 0)
.map_err(|e| tonic::Status::internal(e.to_string()))?;
results.push(sq_grpc_interface::PublishResult {
topic: msg.topic.clone(),
partition: 0,
offset,
});
}
Ok(tonic::Response::new(sq_grpc_interface::PublishResponse {
results,
}))
}
type SubscribeStream = std::pin::Pin<
Box<
dyn tokio_stream::Stream<
Item = Result<sq_grpc_interface::SubscribeResponse, tonic::Status>,
> + Send
+ 'static,
>,
>;
async fn subscribe(
&self,
request: tonic::Request<SubscribeRequest>,
) -> Result<tonic::Response<Self::SubscribeStream>, tonic::Status> {
let req = request.into_inner();
let batch_size = if req.max_batch_size == 0 {
100
} else {
req.max_batch_size as usize
};
let start_offset = req.start_offset.unwrap_or(0);
let topic = req.topic.clone();
let partition = req.partition;
let engine = self.engine.clone();
let stream = async_stream::try_stream! {
let mut current_offset = start_offset;
let mut empty_polls = 0u32;
loop {
let messages = {
let eng = engine.lock().await;
eng.read(&topic, partition, current_offset, batch_size)
.map_err(|e| tonic::Status::internal(e.to_string()))?
};
if messages.is_empty() {
empty_polls += 1;
// In tests, stop after a few empty polls to avoid hanging.
if empty_polls > 3 {
break;
}
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
continue;
}
empty_polls = 0;
let consumed: Vec<sq_grpc_interface::ConsumedMessage> = messages
.iter()
.map(|m| {
current_offset = m.offset + 1;
sq_grpc_interface::ConsumedMessage {
offset: m.offset,
topic: m.topic.to_string(),
partition: m.partition,
key: m.key.clone().unwrap_or_default(),
value: m.value.clone(),
headers: m
.headers
.iter()
.map(|h| MessageHeader {
key: h.key.clone(),
value: h.value.clone(),
})
.collect(),
timestamp_ms: m.timestamp_ms,
}
})
.collect();
yield sq_grpc_interface::SubscribeResponse { messages: consumed };
}
};
Ok(tonic::Response::new(Box::pin(stream)))
}
async fn ack(
&self,
_request: tonic::Request<sq_grpc_interface::AckRequest>,
) -> Result<tonic::Response<sq_grpc_interface::AckResponse>, tonic::Status> {
Ok(tonic::Response::new(sq_grpc_interface::AckResponse {}))
}
}
let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);
tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(StatusServiceServer::new(TestHealthServer {
node_id: node_id.clone(),
}))
.add_service(DataPlaneServiceServer::new(TestDataPlaneServer {
engine,
}))
.serve_with_incoming_shutdown(incoming, async {
let _ = shutdown_rx.await;
})
.await
.unwrap();
});
// Give the server a moment to start.
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
TestServer {
addr,
_shutdown: shutdown_tx,
}
}
fn endpoint(&self) -> String {
format!("http://{}", self.addr)
}
}
#[tokio::test]
async fn test_status_rpc() {
let server = TestServer::start().await;
let mut client = StatusServiceClient::connect(server.endpoint()).await.unwrap();
let response = client
.status(tonic::Request::new(GetStatusRequest {}))
.await
.unwrap();
assert_eq!(response.into_inner().node_id, "test-node");
}
#[tokio::test]
async fn test_publish_single_message() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let response = client
.publish(tonic::Request::new(PublishRequest {
messages: vec![PublishMessage {
topic: "orders".to_string(),
key: vec![],
value: b"hello world".to_vec(),
headers: vec![],
}],
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap();
let results = response.into_inner().results;
assert_eq!(results.len(), 1);
assert_eq!(results[0].topic, "orders");
assert_eq!(results[0].offset, 0);
}
#[tokio::test]
async fn test_publish_batch_sequential_offsets() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let messages: Vec<PublishMessage> = (0..100)
.map(|i| PublishMessage {
topic: "events".to_string(),
key: vec![],
value: format!("msg-{i}").into_bytes(),
headers: vec![],
})
.collect();
let response = client
.publish(tonic::Request::new(PublishRequest {
messages,
settings: Some(PublishSettings {
ack_mode: AckMode::All.into(),
}),
producer_id: "test".to_string(),
}))
.await
.unwrap();
let results = response.into_inner().results;
assert_eq!(results.len(), 100);
for (i, r) in results.iter().enumerate() {
assert_eq!(r.offset, i as u64);
assert_eq!(r.topic, "events");
}
}
#[tokio::test]
async fn test_publish_empty_topic_returns_error() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let err = client
.publish(tonic::Request::new(PublishRequest {
messages: vec![PublishMessage {
topic: "".to_string(),
key: vec![],
value: b"data".to_vec(),
headers: vec![],
}],
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap_err();
assert_eq!(err.code(), tonic::Code::InvalidArgument);
}
#[tokio::test]
async fn test_publish_empty_messages_returns_error() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let err = client
.publish(tonic::Request::new(PublishRequest {
messages: vec![],
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap_err();
assert_eq!(err.code(), tonic::Code::InvalidArgument);
}
#[tokio::test]
async fn test_publish_with_key_and_headers() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let response = client
.publish(tonic::Request::new(PublishRequest {
messages: vec![PublishMessage {
topic: "orders".to_string(),
key: b"order-123".to_vec(),
value: b"payload".to_vec(),
headers: vec![MessageHeader {
key: "trace-id".to_string(),
value: b"abc-123".to_vec(),
}],
}],
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap();
let results = response.into_inner().results;
assert_eq!(results.len(), 1);
assert_eq!(results[0].offset, 0);
}
#[tokio::test]
async fn test_subscribe_from_beginning() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
// Publish 10 messages first.
let messages: Vec<PublishMessage> = (0..10)
.map(|i| PublishMessage {
topic: "events".to_string(),
key: vec![],
value: format!("msg-{i}").into_bytes(),
headers: vec![],
})
.collect();
client
.publish(tonic::Request::new(PublishRequest {
messages,
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap();
// Subscribe from offset 0.
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "events".to_string(),
partition: 0,
consumer_group: "".to_string(),
start_offset: Some(0),
max_batch_size: 100,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut all_messages = Vec::new();
while let Some(Ok(batch)) = stream.next().await {
all_messages.extend(batch.messages);
if all_messages.len() >= 10 {
break;
}
}
assert_eq!(all_messages.len(), 10);
for (i, msg) in all_messages.iter().enumerate() {
assert_eq!(msg.offset, i as u64);
assert_eq!(msg.value, format!("msg-{i}").as_bytes());
assert_eq!(msg.topic, "events");
}
}
#[tokio::test]
async fn test_subscribe_from_middle() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
// Publish 10 messages.
let messages: Vec<PublishMessage> = (0..10)
.map(|i| PublishMessage {
topic: "events".to_string(),
key: vec![],
value: format!("msg-{i}").into_bytes(),
headers: vec![],
})
.collect();
client
.publish(tonic::Request::new(PublishRequest {
messages,
settings: None,
producer_id: "test".to_string(),
}))
.await
.unwrap();
// Subscribe from offset 5.
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "events".to_string(),
partition: 0,
consumer_group: "".to_string(),
start_offset: Some(5),
max_batch_size: 100,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut all_messages = Vec::new();
while let Some(Ok(batch)) = stream.next().await {
all_messages.extend(batch.messages);
if all_messages.len() >= 5 {
break;
}
}
assert_eq!(all_messages.len(), 5);
assert_eq!(all_messages[0].offset, 5);
assert_eq!(all_messages[4].offset, 9);
}
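// A hedged sketch of the empty-poll cap in the test subscribe stream above:
// with nothing published, the stream gives up after a few empty polls, so
// draining it terminates instead of hanging. The test name is illustrative.
#[tokio::test]
async fn test_subscribe_empty_topic_terminates() {
let server = TestServer::start().await;
let mut client = DataPlaneServiceClient::connect(server.endpoint())
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "empty".to_string(),
partition: 0,
consumer_group: "".to_string(),
start_offset: Some(0),
max_batch_size: 100,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut total = 0usize;
while let Some(Ok(batch)) = stream.next().await {
total += batch.messages.len();
}
// Nothing was published, so no batches arrive before the stream ends.
assert_eq!(total, 0);
}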

View File

@@ -0,0 +1,965 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::{Duration, Instant};
use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
cluster_service_server::ClusterServiceServer,
control_plane_service_server::ControlPlaneServiceServer,
data_plane_service_server::DataPlaneServiceServer,
status_service_client::StatusServiceClient,
status_service_server::StatusServiceServer,
GetStatusRequest, SubscribeRequest,
};
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_sdk::{
BatchProducer, BatchProducerConfig, Consumer, ConsumerConfig, Producer, ProducerConfig,
ProducerMessage,
};
use sq_server::capnp::CapnpServer;
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
// ---------------------------------------------------------------------------
// Test harness (shared with cluster_test.rs, inlined here for simplicity)
// ---------------------------------------------------------------------------
struct TestNode {
grpc_addr: SocketAddr,
capnp_addr: SocketAddr,
cancel: CancellationToken,
pipeline_cancel: CancellationToken,
_temp_dir: TempDir,
_server_handle: tokio::task::JoinHandle<()>,
_capnp_handle: tokio::task::JoinHandle<()>,
}
impl TestNode {
/// Cap'n Proto endpoint (default data plane).
fn endpoint(&self) -> String {
self.capnp_addr.to_string()
}
/// gRPC endpoint (health checks, subscribe verification).
fn grpc_endpoint(&self) -> String {
format!("http://{}", self.grpc_addr)
}
}
struct TestCluster {
nodes: Vec<TestNode>,
}
impl TestCluster {
async fn start(n: usize) -> Self {
let mut grpc_listeners = Vec::new();
let mut capnp_listeners = Vec::new();
let mut grpc_addrs = Vec::new();
let mut capnp_addrs = Vec::new();
for _ in 0..n {
let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
grpc_addrs.push(grpc_listener.local_addr().unwrap());
capnp_addrs.push(capnp_listener.local_addr().unwrap());
grpc_listeners.push(grpc_listener);
capnp_listeners.push(capnp_listener);
}
let mut nodes = Vec::new();
for (i, (grpc_listener, capnp_listener)) in
grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
{
let grpc_addr = grpc_addrs[i];
let capnp_addr = capnp_addrs[i];
let node_id = format!("stress-node-{}", i + 1);
let temp_dir = TempDir::new().unwrap();
let seeds: Vec<String> = grpc_addrs
.iter()
.enumerate()
.filter(|(j, _)| *j != i)
.map(|(_, a)| a.to_string())
.collect();
let config = Config {
node_id: node_id.clone(),
data_dir: temp_dir.path().to_path_buf(),
seeds: seeds.clone(),
grpc_address: grpc_addr.to_string(),
cluster_id: "test-cluster".to_string(),
s3_bucket: None,
s3_endpoint: None,
s3_region: None,
sync_policy: sq_models::SyncPolicy::EveryBatch,
};
let (state, mut pipeline) = State::new(config).unwrap();
let pipeline_cancel = CancellationToken::new();
let pipeline_cancel_clone = pipeline_cancel.clone();
tokio::spawn(async move {
tokio::select! {
() = pipeline.run() => {}
() = pipeline_cancel_clone.cancelled() => {}
}
});
let membership = Arc::new(Membership::new(MembershipConfig {
node_id: node_id.clone(),
address: grpc_addr.to_string(),
seeds,
..Default::default()
}));
let cancel = CancellationToken::new();
// Spawn gRPC server.
let cancel_clone = cancel.clone();
let state_clone = state.clone();
let membership_clone = membership.clone();
let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
let server_handle = tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(StatusServiceServer::new(health::HealthServer {
state: state_clone.clone(),
}))
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
state: state_clone.clone(),
}))
.add_service(ControlPlaneServiceServer::new(
control_plane::ControlPlaneServer {
state: state_clone.clone(),
},
))
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
state: state_clone,
membership: membership_clone,
}))
.serve_with_incoming_shutdown(incoming, async move {
cancel_clone.cancelled().await;
})
.await
.unwrap();
});
// Spawn capnp server.
let cancel_clone = cancel.clone();
let capnp_state = state.clone();
let capnp_handle = tokio::spawn(async move {
let server = CapnpServer {
host: capnp_addr,
state: capnp_state,
};
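// Drop the listener we bound so the port is free for CapnpServer, which
// binds its own (a brief race window, acceptable in tests).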
drop(capnp_listener);
let _ = notmad::Component::run(&server, cancel_clone).await;
});
nodes.push(TestNode {
grpc_addr,
capnp_addr,
cancel,
pipeline_cancel,
_temp_dir: temp_dir,
_server_handle: server_handle,
_capnp_handle: capnp_handle,
});
}
for node in &nodes {
wait_for_ready(&node.grpc_endpoint()).await;
}
// Give the capnp server a moment to bind.
tokio::time::sleep(Duration::from_millis(50)).await;
TestCluster { nodes }
}
fn node(&self, index: usize) -> &TestNode {
&self.nodes[index]
}
}
impl Drop for TestCluster {
fn drop(&mut self) {
for node in &self.nodes {
node.pipeline_cancel.cancel();
node.cancel.cancel();
}
}
}
async fn wait_for_ready(endpoint: &str) {
let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
loop {
if tokio::time::Instant::now() > deadline {
panic!("Server at {} did not become ready in time", endpoint);
}
if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
if client
.status(tonic::Request::new(GetStatusRequest {}))
.await
.is_ok()
{
return;
}
}
tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
}
}
// ---------------------------------------------------------------------------
// Stress test 1: High-volume publish — 100K messages from a single producer
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_single_producer_100k() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let mut producer = Producer::connect(ProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
let total = 100_000u64;
let batch_size = 500;
let payload = vec![0u8; 128]; // 128-byte messages
let start = Instant::now();
for batch_start in (0..total).step_by(batch_size) {
let batch_end = (batch_start + batch_size as u64).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("stress-topic", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
let publish_duration = start.elapsed();
let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();
eprintln!(
"stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
total,
publish_duration.as_secs_f64(),
msgs_per_sec,
(total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
);
// Verify: read back all messages via gRPC subscribe.
let mut client = DataPlaneServiceClient::connect(grpc_ep)
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "stress-topic".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut consumed = 0u64;
let consume_start = Instant::now();
while consumed < total {
match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
_ => break,
}
}
let consume_duration = consume_start.elapsed();
let consume_per_sec = consumed as f64 / consume_duration.as_secs_f64();
eprintln!(
"stress_single_producer_100k: consumed {} messages in {:.2}s ({:.0} msg/s)",
consumed,
consume_duration.as_secs_f64(),
consume_per_sec
);
assert_eq!(consumed, total, "expected all messages to be consumed");
}
// ---------------------------------------------------------------------------
// Stress test 2: Concurrent producers — 10 producers, 10K messages each
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_concurrent_producers() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let num_producers = 10;
let msgs_per_producer = 10_000u64;
let payload = vec![0u8; 64];
let start = Instant::now();
let mut handles = Vec::new();
for p in 0..num_producers {
let ep = endpoint.clone();
let pl = payload.clone();
handles.push(tokio::spawn(async move {
let mut producer = Producer::connect(ProducerConfig {
address: ep,
producer_id: format!("producer-{p}"),
..Default::default()
})
.await
.unwrap();
let topic = format!("concurrent-topic-{p}");
for batch_start in (0..msgs_per_producer).step_by(100) {
let batch_end = (batch_start + 100).min(msgs_per_producer);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
}));
}
for handle in handles {
handle.await.unwrap();
}
let duration = start.elapsed();
let total = num_producers as u64 * msgs_per_producer;
let msgs_per_sec = total as f64 / duration.as_secs_f64();
eprintln!(
"stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
num_producers,
msgs_per_producer,
total,
duration.as_secs_f64(),
msgs_per_sec
);
// Verify each topic has the right count via gRPC.
for p in 0..num_producers {
let topic = format!("concurrent-topic-{p}");
let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: topic.clone(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
while count < msgs_per_producer {
match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
_ => break,
}
}
assert_eq!(
count, msgs_per_producer,
"topic {topic} expected {msgs_per_producer} messages, got {count}"
);
}
}
// ---------------------------------------------------------------------------
// Stress test 3: Concurrent consumers — publish then read in parallel
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_concurrent_consumers() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let total = 50_000u64;
let payload = vec![0u8; 64];
// Pre-publish messages.
let mut producer = Producer::connect(ProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
for batch_start in (0..total).step_by(500) {
let batch_end = (batch_start + 500).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("consume-stress", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
// Consume in parallel from 5 independent consumers via gRPC (no consumer group — each reads all).
let num_consumers = 5;
let start = Instant::now();
let mut handles = Vec::new();
for _ in 0..num_consumers {
let ep = grpc_ep.clone();
handles.push(tokio::spawn(async move {
let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "consume-stress".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
while count < total {
match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
_ => break,
}
}
count
}));
}
for handle in handles {
let count = handle.await.unwrap();
assert_eq!(count, total, "each consumer should read all {total} messages");
}
let duration = start.elapsed();
eprintln!(
"stress_concurrent_consumers: {} consumers each read {} msgs in {:.2}s",
num_consumers,
total,
duration.as_secs_f64()
);
}
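// A hedged sketch: the timeout-bounded consume loop recurs throughout these
// stress tests; a helper like this captures the pattern. The name is
// illustrative and not part of this test harness.
#[allow(dead_code)]
async fn consume_until(
stream: &mut tonic::Streaming<sq_grpc_interface::SubscribeResponse>,
expected: u64,
per_batch_timeout: Duration,
) -> u64 {
let mut count = 0u64;
while count < expected {
match tokio::time::timeout(per_batch_timeout, stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
// Stop on timeout, stream end, or error and let the caller assert.
_ => break,
}
}
count
}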
// ---------------------------------------------------------------------------
// Stress test 4: Sustained load — publish+consume simultaneously over time
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_sustained_load() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let sustain_duration = Duration::from_secs(3);
let payload = vec![0u8; 256];
let ep = endpoint.clone();
let pl = payload.clone();
// Producer: publish as fast as possible for the sustained duration.
let producer_handle = tokio::spawn(async move {
let mut producer = Producer::connect(ProducerConfig {
address: ep,
..Default::default()
})
.await
.unwrap();
let start = Instant::now();
let mut total = 0u64;
while start.elapsed() < sustain_duration {
let batch: Vec<ProducerMessage> = (0..100)
.map(|_| ProducerMessage::new("sustained-topic", pl.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
total += 100;
}
(total, start.elapsed())
});
// Give producer a head start.
tokio::time::sleep(Duration::from_millis(100)).await;
// Consumer: read as fast as possible via gRPC subscribe.
let ep = grpc_ep.clone();
let consumer_handle = tokio::spawn(async move {
let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "sustained-topic".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
let start = Instant::now();
// Read for longer than the producer runs to drain everything.
let read_deadline = sustain_duration + Duration::from_secs(5);
while start.elapsed() < read_deadline {
match tokio::time::timeout(Duration::from_secs(2), stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
_ => break,
}
}
count
});
let (published, pub_duration) = producer_handle.await.unwrap();
let consumed = consumer_handle.await.unwrap();
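    // Rate math counts payload bytes only (256 per message, matching the payload
    // above); wire framing and header overhead are not included.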
let pub_rate = published as f64 / pub_duration.as_secs_f64();
let throughput_mb =
(published as f64 * 256.0) / (1024.0 * 1024.0) / pub_duration.as_secs_f64();
eprintln!(
"stress_sustained_load: published {} in {:.2}s ({:.0} msg/s, {:.1} MB/s), consumed {}",
published,
pub_duration.as_secs_f64(),
pub_rate,
throughput_mb,
consumed
);
assert!(
published > 0,
"should have published messages during sustained load"
);
    assert_eq!(
        consumed, published,
        "consumer should eventually read all published messages"
    );
}
// ---------------------------------------------------------------------------
// Stress test 5: Multi-topic fan-out — publish to many topics simultaneously
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_multi_topic_fanout() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let num_topics = 50;
let msgs_per_topic = 1_000u64;
let payload = vec![0u8; 64];
let start = Instant::now();
let mut producer = Producer::connect(ProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
// Publish to many topics in round-robin batches.
for batch_start in (0..msgs_per_topic).step_by(100) {
let batch_end = (batch_start + 100).min(msgs_per_topic);
for t in 0..num_topics {
let topic = format!("fanout-{t}");
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new(topic.clone(), payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
}
let duration = start.elapsed();
let total = num_topics as u64 * msgs_per_topic;
eprintln!(
"stress_multi_topic_fanout: {} topics x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
num_topics,
msgs_per_topic,
total,
duration.as_secs_f64(),
total as f64 / duration.as_secs_f64()
);
// Spot-check a few topics via gRPC.
for t in [0, num_topics / 2, num_topics - 1] {
let topic = format!("fanout-{t}");
let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: topic.clone(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
while count < msgs_per_topic {
match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
_ => break,
}
}
assert_eq!(
count, msgs_per_topic,
"topic {topic} expected {msgs_per_topic} messages, got {count}"
);
}
}
// ---------------------------------------------------------------------------
// Stress test 6: Large message bodies — 10K messages with 4KB payloads
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_large_messages() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let total = 10_000u64;
let payload = vec![0xABu8; 4096]; // 4KB messages
let mut producer = Producer::connect(ProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
let start = Instant::now();
for batch_start in (0..total).step_by(50) {
let batch_end = (batch_start + 50).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("large-msgs", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
let pub_duration = start.elapsed();
let data_mb = (total as f64 * 4096.0) / (1024.0 * 1024.0);
eprintln!(
"stress_large_messages: published {} x 4KB = {:.1}MB in {:.2}s ({:.1} MB/s)",
total,
data_mb,
pub_duration.as_secs_f64(),
data_mb / pub_duration.as_secs_f64()
);
// Verify all data reads back correctly via gRPC.
let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "large-msgs".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
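            // Smaller batch cap: 200 x 4KB keeps each streamed response near 800KB.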
max_batch_size: 200,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
while count < total {
match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
Ok(Some(Ok(batch))) => {
for msg in &batch.messages {
assert_eq!(msg.value.len(), 4096, "message body should be 4KB");
assert!(msg.value.iter().all(|&b| b == 0xAB), "data integrity check");
}
count += batch.messages.len() as u64;
}
_ => break,
}
}
assert_eq!(count, total, "all large messages should be consumed");
}
// ---------------------------------------------------------------------------
// Stress test 7: Consumer group offset tracking under load
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_consumer_group_resume() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let total = 10_000u64;
let payload = vec![0u8; 32];
// Publish all messages.
let mut producer = Producer::connect(ProducerConfig {
address: endpoint.clone(),
..Default::default()
})
.await
.unwrap();
for batch_start in (0..total).step_by(500) {
let batch_end = (batch_start + 500).min(total);
let batch: Vec<ProducerMessage> = (batch_start..batch_end)
.map(|_| ProducerMessage::new("cg-stress", payload.clone()))
.collect();
producer.send_batch(batch).await.unwrap();
}
// Consume first half with auto-commit.
let half = total / 2;
{
let mut consumer = Consumer::connect(ConsumerConfig {
address: endpoint.clone(),
consumer_group: "stress-group".to_string(),
topic: "cg-stress".to_string(),
auto_commit: true,
max_poll_records: 500,
..Default::default()
})
.await
.unwrap();
let mut consumed = 0u64;
while consumed < half {
let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
.await
.unwrap()
.unwrap();
consumed += msgs.len() as u64;
}
assert!(consumed >= half, "should have consumed at least half");
}
// Reconnect — should resume from the committed offset.
{
let mut consumer = Consumer::connect(ConsumerConfig {
address: endpoint.clone(),
consumer_group: "stress-group".to_string(),
topic: "cg-stress".to_string(),
auto_commit: true,
max_poll_records: 500,
..Default::default()
})
.await
.unwrap();
let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
.await
.unwrap()
.unwrap();
// First message after reconnect should be at or after the halfway point.
assert!(
!msgs.is_empty(),
"should receive messages after resume"
);
let first_offset = msgs[0].offset;
assert!(
first_offset >= half - 500, // Allow some re-delivery due to batch commit
"first offset after resume should be near {half}, got {first_offset}"
);
}
}
// ---------------------------------------------------------------------------
// Stress test 8: BatchProducer — 100K messages from a single batching producer
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_batch_producer_100k() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
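    // Tuned for throughput: large batches, a 5ms flush interval, and a deep
    // channel to absorb bursts from the spawned send tasks.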
let producer = BatchProducer::connect(BatchProducerConfig {
address: endpoint.clone(),
max_batch_size: 1000,
flush_interval_ms: 5,
channel_capacity: 20_000,
..Default::default()
})
.await
.unwrap();
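    // Share the producer across tasks; each spawned send holds an Arc clone.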
let producer = Arc::new(producer);
let total = 100_000u64;
let payload = vec![0u8; 128];
let start = Instant::now();
// Spawn a task per message to fully saturate the batch pipeline.
let mut handles = Vec::with_capacity(total as usize);
for _ in 0..total {
let p = producer.clone();
let pl = payload.clone();
handles.push(tokio::spawn(async move {
p.send(ProducerMessage::new("batch-stress", pl))
.await
.unwrap();
}));
}
for handle in handles {
handle.await.unwrap();
}
let publish_duration = start.elapsed();
let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();
eprintln!(
"stress_batch_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
total,
publish_duration.as_secs_f64(),
msgs_per_sec,
(total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
);
// Verify: read back all messages via gRPC.
let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: "batch-stress".to_string(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut consumed = 0u64;
while consumed < total {
match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
_ => break,
}
}
assert_eq!(consumed, total, "expected all messages to be consumed");
    // Close the producer (flushes anything still buffered). try_unwrap succeeds
    // because every send task has finished and dropped its Arc clone.
    Arc::try_unwrap(producer).ok().unwrap().close().await;
}
// ---------------------------------------------------------------------------
// Stress test 9: BatchProducer concurrent — 10 batching producers, 10K each
// ---------------------------------------------------------------------------
#[tokio::test]
async fn stress_batch_concurrent_producers() {
let cluster = TestCluster::start(1).await;
let endpoint = cluster.node(0).endpoint();
let grpc_ep = cluster.node(0).grpc_endpoint();
let num_producers = 10;
let msgs_per_producer = 10_000u64;
let payload = vec![0u8; 64];
let start = Instant::now();
let mut handles = Vec::new();
for p in 0..num_producers {
let ep = endpoint.clone();
let pl = payload.clone();
handles.push(tokio::spawn(async move {
let producer = Arc::new(
BatchProducer::connect(BatchProducerConfig {
address: ep,
producer_id: format!("batch-producer-{p}"),
max_batch_size: 500,
flush_interval_ms: 5,
..Default::default()
})
.await
.unwrap(),
);
let topic = format!("batch-concurrent-{p}");
let mut send_handles = Vec::new();
// Fire all sends concurrently within each producer.
for _ in 0..msgs_per_producer {
let p = producer.clone();
let t = topic.clone();
let pl = pl.clone();
send_handles.push(tokio::spawn(async move {
p.send(ProducerMessage::new(t, pl)).await.unwrap();
}));
}
// Await all acks.
for handle in send_handles {
handle.await.unwrap();
}
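            // All send tasks have finished and dropped their clones, so this
            // is the last Arc and try_unwrap succeeds.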
Arc::try_unwrap(producer).ok().unwrap().close().await;
}));
}
for handle in handles {
handle.await.unwrap();
}
let duration = start.elapsed();
let total = num_producers as u64 * msgs_per_producer;
let msgs_per_sec = total as f64 / duration.as_secs_f64();
eprintln!(
"stress_batch_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
num_producers,
msgs_per_producer,
total,
duration.as_secs_f64(),
msgs_per_sec
);
// Verify each topic has the right count via gRPC.
for p in 0..num_producers {
let topic = format!("batch-concurrent-{p}");
let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
.await
.unwrap();
let response = client
.subscribe(tonic::Request::new(SubscribeRequest {
topic: topic.clone(),
partition: 0,
consumer_group: String::new(),
start_offset: Some(0),
max_batch_size: 1000,
}))
.await
.unwrap();
let mut stream = response.into_inner();
let mut count = 0u64;
while count < msgs_per_producer {
match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
_ => break,
}
}
assert_eq!(
count, msgs_per_producer,
"topic {topic} expected {msgs_per_producer} messages, got {count}"
);
}
}