crates/sq-capnp-interface/Cargo.toml (new file, +12)
@@ -0,0 +1,12 @@
[package]
name = "sq-capnp-interface"
version.workspace = true
edition.workspace = true

[dependencies]
capnp = { workspace = true }
bytes = { workspace = true }
tokio-util = { workspace = true, features = ["codec"] }

[build-dependencies]
capnpc = { workspace = true }
crates/sq-capnp-interface/build.rs (new file, +6)
@@ -0,0 +1,6 @@
fn main() {
    capnpc::CompilerCommand::new()
        .file("schema/data_plane.capnp")
        .run()
        .expect("capnp schema compilation failed");
}
crates/sq-capnp-interface/schema/data_plane.capnp (new file, +65)
@@ -0,0 +1,65 @@
@0xb8f6c1e2a3d4e5f6;

struct MessageHeader {
  key @0 :Text;
  value @1 :Data;
}

struct PublishMessage {
  topic @0 :Text;
  key @1 :Data;
  value @2 :Data;
  headers @3 :List(MessageHeader);
}

struct PublishRequest {
  messages @0 :List(PublishMessage);
  ackMode @1 :UInt8;
  producerId @2 :Text;
}

struct PublishResult {
  topic @0 :Text;
  partition @1 :UInt32;
  offset @2 :UInt64;
}

struct PublishResponse {
  results @0 :List(PublishResult);
}

struct SubscribeRequest {
  topic @0 :Text;
  partition @1 :UInt32;
  consumerGroup @2 :Text;
  startOffset @3 :UInt64;
  hasStartOffset @4 :Bool;
  maxBatchSize @5 :UInt32;
}

struct ConsumedMessage {
  offset @0 :UInt64;
  topic @1 :Text;
  partition @2 :UInt32;
  key @3 :Data;
  value @4 :Data;
  headers @5 :List(MessageHeader);
  timestampMs @6 :UInt64;
}

struct SubscribeResponse {
  messages @0 :List(ConsumedMessage);
}

struct AckRequest {
  consumerGroup @0 :Text;
  topic @1 :Text;
  partition @2 :UInt32;
  offset @3 :UInt64;
}

struct AckResponse {}

struct ErrorResponse {
  message @0 :Text;
}
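A quick sketch of how this schema is used from Rust once capnpc has generated the data_plane_capnp module (see src/lib.rs below). Cap'n Proto has no optional scalar fields, which is why startOffset is paired with an explicit hasStartOffset flag. The topic, group, and offset values here are illustrative, not part of the diff; the setter style follows the crate's own tests in codec.rs.

// Illustrative sketch: building a SubscribeRequest with the generated types.
fn build_subscribe_request() -> capnp::message::Builder<capnp::message::HeapAllocator> {
    let mut builder = capnp::message::Builder::new_default();
    {
        let mut req = builder
            .init_root::<sq_capnp_interface::data_plane_capnp::subscribe_request::Builder>();
        req.set_topic("orders");
        req.set_partition(0);
        req.set_consumer_group("group-a");
        // The presence flag stands in for an optional scalar.
        req.set_start_offset(42);
        req.set_has_start_offset(true);
        req.set_max_batch_size(100);
    }
    builder
}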
crates/sq-capnp-interface/src/codec.rs (new file, +185)
@@ -0,0 +1,185 @@
use bytes::{Buf, BufMut, Bytes, BytesMut};
use tokio_util::codec::{Decoder, Encoder, LengthDelimitedCodec};

// Opcodes
pub const OP_PUBLISH_REQ: u8 = 0x01;
pub const OP_PUBLISH_RES: u8 = 0x81;
pub const OP_SUBSCRIBE_REQ: u8 = 0x02;
pub const OP_SUBSCRIBE_RES: u8 = 0x82;
pub const OP_ACK_REQ: u8 = 0x03;
pub const OP_ACK_RES: u8 = 0x83;
pub const OP_SUBSCRIBE_END: u8 = 0x84;
pub const OP_ERROR: u8 = 0xFE;

/// A decoded frame: opcode + capnp payload bytes.
pub struct Frame {
    pub opcode: u8,
    pub payload: Bytes,
}

/// Codec that wraps `LengthDelimitedCodec` and prepends a 1-byte opcode.
///
/// Wire format: `[4-byte big-endian frame length][1-byte opcode][capnp payload]`
pub struct SqCodec {
    inner: LengthDelimitedCodec,
}

impl SqCodec {
    pub fn new() -> Self {
        Self {
            inner: LengthDelimitedCodec::builder()
                .max_frame_length(16 * 1024 * 1024) // 16 MB
                .new_codec(),
        }
    }
}

impl Default for SqCodec {
    fn default() -> Self {
        Self::new()
    }
}

impl Decoder for SqCodec {
    type Item = Frame;
    type Error = std::io::Error;

    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
        match self.inner.decode(src)? {
            Some(mut buf) => {
                if buf.is_empty() {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        "empty frame",
                    ));
                }
                let opcode = buf.get_u8();
                let payload = buf.freeze();
                Ok(Some(Frame { opcode, payload }))
            }
            None => Ok(None),
        }
    }
}

impl Encoder<Frame> for SqCodec {
    type Error = std::io::Error;

    fn encode(&mut self, item: Frame, dst: &mut BytesMut) -> Result<(), Self::Error> {
        let mut buf = BytesMut::with_capacity(1 + item.payload.len());
        buf.put_u8(item.opcode);
        buf.extend_from_slice(&item.payload);
        self.inner.encode(buf.freeze(), dst)
    }
}

/// Serialize a capnp message builder into bytes.
pub fn serialize_capnp(builder: &capnp::message::Builder<capnp::message::HeapAllocator>) -> Bytes {
    let mut buf = Vec::new();
    capnp::serialize::write_message(&mut buf, builder).expect("capnp serialize failed");
    Bytes::from(buf)
}

/// Build a Frame from an opcode and a capnp message builder.
pub fn build_frame(
    opcode: u8,
    builder: &capnp::message::Builder<capnp::message::HeapAllocator>,
) -> Frame {
    Frame {
        opcode,
        payload: serialize_capnp(builder),
    }
}

/// Build an error frame with a text message.
pub fn error_frame(msg: &str) -> Frame {
    let mut builder = capnp::message::Builder::new_default();
    {
        let mut err = builder.init_root::<crate::data_plane_capnp::error_response::Builder>();
        err.set_message(msg);
    }
    build_frame(OP_ERROR, &builder)
}

/// Deserialize a capnp message from a byte slice.
pub fn read_capnp(payload: &[u8]) -> capnp::Result<capnp::message::Reader<capnp::serialize::OwnedSegments>> {
    let mut cursor = std::io::Cursor::new(payload);
    capnp::serialize::read_message(
        &mut cursor,
        capnp::message::ReaderOptions::new(),
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use tokio_util::codec::{Decoder, Encoder};

    #[test]
    fn roundtrip_frame() {
        let mut codec = SqCodec::new();
        let original = Frame {
            opcode: OP_PUBLISH_REQ,
            payload: Bytes::from_static(b"hello"),
        };

        let mut buf = BytesMut::new();
        codec
            .encode(
                Frame {
                    opcode: original.opcode,
                    payload: original.payload.clone(),
                },
                &mut buf,
            )
            .unwrap();

        let decoded = codec.decode(&mut buf).unwrap().unwrap();
        assert_eq!(decoded.opcode, OP_PUBLISH_REQ);
        assert_eq!(decoded.payload, Bytes::from_static(b"hello"));
    }

    #[test]
    fn capnp_publish_roundtrip() {
        // Build a PublishRequest.
        let mut builder = capnp::message::Builder::new_default();
        {
            let mut req = builder.init_root::<crate::data_plane_capnp::publish_request::Builder>();
            req.set_ack_mode(1);
            req.set_producer_id("test");
            let mut msgs = req.init_messages(1);
            let mut msg = msgs.reborrow().get(0);
            msg.set_topic("orders");
            msg.set_key(b"key1");
            msg.set_value(b"value1");
        }

        let frame = build_frame(OP_PUBLISH_REQ, &builder);
        assert_eq!(frame.opcode, OP_PUBLISH_REQ);

        // Decode.
        let reader = read_capnp(&frame.payload).unwrap();
        let req = reader
            .get_root::<crate::data_plane_capnp::publish_request::Reader>()
            .unwrap();
        assert_eq!(req.get_ack_mode(), 1);
        assert_eq!(req.get_producer_id().unwrap(), "test");
        let msgs = req.get_messages().unwrap();
        assert_eq!(msgs.len(), 1);
        assert_eq!(msgs.get(0).get_topic().unwrap(), "orders");
        assert_eq!(msgs.get(0).get_key().unwrap(), b"key1");
        assert_eq!(msgs.get(0).get_value().unwrap(), b"value1");
    }

    #[test]
    fn error_frame_roundtrip() {
        let frame = error_frame("something went wrong");
        assert_eq!(frame.opcode, OP_ERROR);

        let reader = read_capnp(&frame.payload).unwrap();
        let err = reader
            .get_root::<crate::data_plane_capnp::error_response::Reader>()
            .unwrap();
        assert_eq!(err.get_message().unwrap(), "something went wrong");
    }
}
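For context, a minimal sketch of how SqCodec would sit on a socket in a consumer crate (one that also depends on tokio with the "net" feature and on futures): tokio_util's Framed turns the codec into a typed Sink/Stream of Frame values, with the inner LengthDelimitedCodec supplying the 4-byte length prefix. The address and the opcode handling are placeholders.

// Illustrative sketch, not part of the diff.
use futures::{SinkExt, StreamExt};
use tokio::net::TcpStream;
use tokio_util::codec::Framed;
use sq_capnp_interface::codec::{error_frame, SqCodec, OP_ERROR};

async fn demo() -> std::io::Result<()> {
    let stream = TcpStream::connect("127.0.0.1:6064").await?;
    let mut framed = Framed::new(stream, SqCodec::new());

    // Encoder::encode prepends the opcode; the inner codec adds the length.
    framed.send(error_frame("demo")).await?;

    // Decoder::decode strips the length and splits off the opcode.
    if let Some(frame) = framed.next().await {
        let frame = frame?;
        if frame.opcode == OP_ERROR {
            // frame.payload holds the capnp-encoded ErrorResponse.
        }
    }
    Ok(())
}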
crates/sq-capnp-interface/src/lib.rs (new file, +6)
@@ -0,0 +1,6 @@
pub mod codec;

#[allow(dead_code)]
pub mod data_plane_capnp {
    include!(concat!(env!("OUT_DIR"), "/schema/data_plane_capnp.rs"));
}
@@ -0,0 +1,3 @@
pub mod membership;
pub mod recovery;
pub mod replication;
crates/sq-cluster/src/membership.rs (new file, +340)
@@ -0,0 +1,340 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};

use tokio::sync::Mutex;

/// Status of a node in the cluster.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NodeStatus {
    Alive,
    Suspected,
    Dead,
}

impl std::fmt::Display for NodeStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            NodeStatus::Alive => write!(f, "alive"),
            NodeStatus::Suspected => write!(f, "suspected"),
            NodeStatus::Dead => write!(f, "dead"),
        }
    }
}

/// Information about a member node.
#[derive(Debug, Clone)]
pub struct MemberInfo {
    pub node_id: String,
    pub address: String,
    pub status: NodeStatus,
    pub last_heartbeat: Instant,
}

/// Configuration for membership management.
#[derive(Debug, Clone)]
pub struct MembershipConfig {
    /// This node's ID.
    pub node_id: String,
    /// This node's gRPC address.
    pub address: String,
    /// Seed node addresses for initial discovery.
    pub seeds: Vec<String>,
    /// How many missed heartbeats before a node is suspected.
    pub failure_threshold: u32,
    /// Heartbeat interval.
    pub heartbeat_interval: Duration,
    /// Time a node stays in Suspected before being declared Dead.
    pub suspect_timeout: Duration,
}

impl Default for MembershipConfig {
    fn default() -> Self {
        Self {
            node_id: "node-1".to_string(),
            address: "127.0.0.1:6060".to_string(),
            seeds: Vec::new(),
            failure_threshold: 3,
            heartbeat_interval: Duration::from_secs(5),
            suspect_timeout: Duration::from_secs(30),
        }
    }
}

/// Manages cluster membership state.
pub struct Membership {
    config: MembershipConfig,
    members: Arc<Mutex<HashMap<String, MemberInfo>>>,
}

impl Membership {
    pub fn new(config: MembershipConfig) -> Self {
        let mut members = HashMap::new();

        // Add self as alive.
        members.insert(
            config.node_id.clone(),
            MemberInfo {
                node_id: config.node_id.clone(),
                address: config.address.clone(),
                status: NodeStatus::Alive,
                last_heartbeat: Instant::now(),
            },
        );

        Self {
            config,
            members: Arc::new(Mutex::new(members)),
        }
    }

    /// Get the shared members handle (for use in gRPC handlers).
    pub fn members(&self) -> Arc<Mutex<HashMap<String, MemberInfo>>> {
        self.members.clone()
    }

    /// Get the node ID.
    pub fn node_id(&self) -> &str {
        &self.config.node_id
    }

    /// Get the node address.
    pub fn address(&self) -> &str {
        &self.config.address
    }

    /// Get seed addresses.
    pub fn seeds(&self) -> &[String] {
        &self.config.seeds
    }

    /// Record a heartbeat from a node. Creates the member entry if new.
    pub async fn record_heartbeat(&self, node_id: &str, address: &str) {
        let mut members = self.members.lock().await;
        let entry = members
            .entry(node_id.to_string())
            .or_insert_with(|| MemberInfo {
                node_id: node_id.to_string(),
                address: address.to_string(),
                status: NodeStatus::Alive,
                last_heartbeat: Instant::now(),
            });
        entry.status = NodeStatus::Alive;
        entry.last_heartbeat = Instant::now();
        entry.address = address.to_string();
    }

    /// Record members discovered from a Join/Heartbeat response.
    pub async fn merge_members(&self, discovered: Vec<(String, String)>) {
        let mut members = self.members.lock().await;
        for (node_id, address) in discovered {
            if node_id == self.config.node_id {
                continue; // Skip self.
            }
            members
                .entry(node_id.clone())
                .or_insert_with(|| MemberInfo {
                    node_id,
                    address,
                    status: NodeStatus::Alive,
                    last_heartbeat: Instant::now(),
                });
        }
    }

    /// Check for failed nodes based on heartbeat timeouts.
    /// Updates node status from Alive -> Suspected -> Dead.
    pub async fn check_failures(&self) {
        let now = Instant::now();
        let heartbeat_timeout =
            self.config.heartbeat_interval * self.config.failure_threshold;

        let mut members = self.members.lock().await;
        for (id, member) in members.iter_mut() {
            if *id == self.config.node_id {
                // Don't suspect self.
                member.last_heartbeat = now;
                continue;
            }

            let elapsed = now.duration_since(member.last_heartbeat);

            match member.status {
                NodeStatus::Alive => {
                    if elapsed > heartbeat_timeout {
                        tracing::warn!(
                            node_id = %id,
                            elapsed_secs = elapsed.as_secs(),
                            "node suspected: missed heartbeats"
                        );
                        member.status = NodeStatus::Suspected;
                    }
                }
                NodeStatus::Suspected => {
                    if elapsed > heartbeat_timeout + self.config.suspect_timeout {
                        tracing::warn!(node_id = %id, "node declared dead");
                        member.status = NodeStatus::Dead;
                    }
                }
                NodeStatus::Dead => {
                    // Dead nodes stay dead until they re-join.
                }
            }
        }
    }

    /// Get all alive peers (excluding self).
    pub async fn alive_peers(&self) -> Vec<MemberInfo> {
        let members = self.members.lock().await;
        members
            .values()
            .filter(|m| m.node_id != self.config.node_id && m.status == NodeStatus::Alive)
            .cloned()
            .collect()
    }

    /// Get all known members (including self).
    pub async fn all_members(&self) -> Vec<MemberInfo> {
        let members = self.members.lock().await;
        members.values().cloned().collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn test_config(node_id: &str) -> MembershipConfig {
        MembershipConfig {
            node_id: node_id.to_string(),
            address: format!("127.0.0.1:606{}", node_id.chars().last().unwrap()),
            heartbeat_interval: Duration::from_millis(100),
            failure_threshold: 3,
            suspect_timeout: Duration::from_millis(300),
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_new_membership_has_self() {
        let m = Membership::new(test_config("node-1"));
        let members = m.all_members().await;
        assert_eq!(members.len(), 1);
        assert_eq!(members[0].node_id, "node-1");
        assert_eq!(members[0].status, NodeStatus::Alive);
    }

    #[tokio::test]
    async fn test_record_heartbeat_adds_new_member() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "127.0.0.1:6062").await;

        let members = m.all_members().await;
        assert_eq!(members.len(), 2);
    }

    #[tokio::test]
    async fn test_record_heartbeat_updates_existing() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "127.0.0.1:6062").await;

        // Update address.
        m.record_heartbeat("node-2", "127.0.0.1:6063").await;

        let members = m.all_members().await;
        let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
        assert_eq!(node2.address, "127.0.0.1:6063");
    }

    #[tokio::test]
    async fn test_merge_members() {
        let m = Membership::new(test_config("node-1"));
        m.merge_members(vec![
            ("node-2".to_string(), "addr-2".to_string()),
            ("node-3".to_string(), "addr-3".to_string()),
        ])
        .await;

        let members = m.all_members().await;
        assert_eq!(members.len(), 3);
    }

    #[tokio::test]
    async fn test_merge_skips_self() {
        let m = Membership::new(test_config("node-1"));
        m.merge_members(vec![("node-1".to_string(), "other-addr".to_string())])
            .await;

        let members = m.all_members().await;
        assert_eq!(members.len(), 1);
    }

    #[tokio::test]
    async fn test_alive_peers_excludes_self() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "addr-2").await;

        let peers = m.alive_peers().await;
        assert_eq!(peers.len(), 1);
        assert_eq!(peers[0].node_id, "node-2");
    }

    #[tokio::test]
    async fn test_check_failures_suspects_after_timeout() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "addr-2").await;

        // Simulate time passing by directly modifying last_heartbeat.
        {
            let mut members = m.members.lock().await;
            let node2 = members.get_mut("node-2").unwrap();
            node2.last_heartbeat = Instant::now() - Duration::from_millis(500);
        }

        m.check_failures().await;

        let members = m.all_members().await;
        let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
        assert_eq!(node2.status, NodeStatus::Suspected);
    }

    #[tokio::test]
    async fn test_heartbeat_revives_suspected_node() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "addr-2").await;

        // Make node-2 suspected.
        {
            let mut members = m.members.lock().await;
            let node2 = members.get_mut("node-2").unwrap();
            node2.status = NodeStatus::Suspected;
        }

        // Heartbeat revives it.
        m.record_heartbeat("node-2", "addr-2").await;

        let members = m.all_members().await;
        let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
        assert_eq!(node2.status, NodeStatus::Alive);
    }

    #[tokio::test]
    async fn test_dead_after_suspect_timeout() {
        let m = Membership::new(test_config("node-1"));
        m.record_heartbeat("node-2", "addr-2").await;

        // Simulate way past all timeouts.
        {
            let mut members = m.members.lock().await;
            let node2 = members.get_mut("node-2").unwrap();
            node2.status = NodeStatus::Suspected;
            node2.last_heartbeat = Instant::now() - Duration::from_secs(10);
        }

        m.check_failures().await;

        let members = m.all_members().await;
        let node2 = members.iter().find(|m| m.node_id == "node-2").unwrap();
        assert_eq!(node2.status, NodeStatus::Dead);
    }
}
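Note that nothing in this file schedules the failure checks: the Alive -> Suspected -> Dead transitions only happen when something calls check_failures() periodically. A sketch of the driver loop this API implies, with an illustrative cadence (any period at or below the heartbeat interval keeps detection latency predictable):

// Illustrative sketch, not part of the diff.
use std::sync::Arc;
use std::time::Duration;

async fn run_failure_detector(membership: Arc<Membership>) {
    let mut ticker = tokio::time::interval(Duration::from_secs(5));
    loop {
        ticker.tick().await;
        membership.check_failures().await;
    }
}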
crates/sq-cluster/src/recovery.rs (new file, +74)
@@ -0,0 +1,74 @@
use std::sync::Arc;

use sq_grpc_interface::cluster_service_client::ClusterServiceClient;
use sq_grpc_interface::JoinRequest;

use crate::membership::Membership;

/// Handles node recovery and catch-up when joining/rejoining the cluster.
pub struct Recovery {
    membership: Arc<Membership>,
}

impl Recovery {
    pub fn new(membership: Arc<Membership>) -> Self {
        Self { membership }
    }

    /// Join the cluster by contacting seed nodes.
    /// Returns the number of seeds successfully contacted.
    pub async fn join_cluster(&self) -> anyhow::Result<usize> {
        let seeds = self.membership.seeds().to_vec();
        let mut contacted = 0;

        for seed_addr in &seeds {
            let endpoint = format!("http://{}", seed_addr);

            match ClusterServiceClient::connect(endpoint).await {
                Ok(mut client) => {
                    let response = client
                        .join(tonic::Request::new(JoinRequest {
                            node_id: self.membership.node_id().to_string(),
                            address: self.membership.address().to_string(),
                        }))
                        .await;

                    match response {
                        Ok(resp) => {
                            let members: Vec<(String, String)> = resp
                                .into_inner()
                                .members
                                .into_iter()
                                .map(|m| (m.node_id, m.address))
                                .collect();

                            self.membership.merge_members(members).await;
                            contacted += 1;

                            tracing::info!(
                                seed = %seed_addr,
                                "successfully joined cluster via seed"
                            );
                        }
                        Err(e) => {
                            tracing::warn!(
                                seed = %seed_addr,
                                error = %e,
                                "failed to join via seed"
                            );
                        }
                    }
                }
                Err(e) => {
                    tracing::warn!(
                        seed = %seed_addr,
                        error = %e,
                        "failed to connect to seed"
                    );
                }
            }
        }

        Ok(contacted)
    }
}
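Because join_cluster returns the count of reachable seeds rather than failing, a caller decides what "zero" means. A sketch of wiring this in at startup, under the assumption that an unreachable seed list should not abort boot:

// Illustrative sketch, not part of the diff.
use std::sync::Arc;

async fn bootstrap(membership: Arc<Membership>) -> anyhow::Result<()> {
    let recovery = Recovery::new(membership.clone());
    let contacted = recovery.join_cluster().await?;
    if contacted == 0 && !membership.seeds().is_empty() {
        // join_cluster only warns per seed; surface the aggregate case too.
        tracing::warn!("no seeds reachable; continuing as an isolated node");
    }
    Ok(())
}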
crates/sq-cluster/src/replication.rs (new file, +242)
@@ -0,0 +1,242 @@
use std::sync::Arc;
use std::time::Duration;

use sq_grpc_interface::{
    cluster_service_client::ClusterServiceClient, ReplicateEntriesRequest,
};

use crate::membership::{Membership, MemberInfo};

/// Configuration for write replication.
#[derive(Debug, Clone)]
pub struct ReplicationConfig {
    /// Replication factor (how many copies including local).
    pub replication_factor: u32,
    /// Timeout for waiting for peer acks.
    pub timeout: Duration,
}

impl Default for ReplicationConfig {
    fn default() -> Self {
        Self {
            replication_factor: 3,
            timeout: Duration::from_secs(5),
        }
    }
}

/// Result of a replication attempt.
#[derive(Debug)]
pub struct ReplicationResult {
    /// Number of successful acks (including local).
    pub ack_count: u32,
    /// Whether quorum was reached.
    pub quorum_reached: bool,
    /// Errors from failed peers.
    pub errors: Vec<(String, String)>,
}

/// Handles replicating WAL entries to peer nodes.
pub struct Replicator {
    membership: Arc<Membership>,
    config: ReplicationConfig,
}

impl Replicator {
    pub fn new(membership: Arc<Membership>, config: ReplicationConfig) -> Self {
        Self {
            membership,
            config,
        }
    }

    /// Replicate entries to peers. Returns after quorum is reached or timeout.
    /// The local write is assumed to already be done (counts as 1 ack).
    pub async fn replicate(
        &self,
        topic: &str,
        partition: u32,
        entries: Vec<Vec<u8>>,
    ) -> ReplicationResult {
        let peers = self.membership.alive_peers().await;
        let quorum = (self.config.replication_factor / 2) + 1;

        // With no peers or replication disabled, return immediately with just
        // the local ack; quorum is reached only if a single ack satisfies it.
        if peers.is_empty() || self.config.replication_factor <= 1 {
            return ReplicationResult {
                ack_count: 1,
                quorum_reached: quorum <= 1,
                errors: vec![],
            };
        }

        // Send to all alive peers in parallel.
        let (tx, mut rx) = tokio::sync::mpsc::channel::<Result<String, (String, String)>>(
            peers.len(),
        );

        for peer in &peers {
            let tx = tx.clone();
            let peer = peer.clone();
            let topic = topic.to_string();
            let entries = entries.clone();
            tokio::spawn(async move {
                match replicate_to_peer(&peer, &topic, partition, entries).await {
                    Ok(()) => {
                        let _ = tx.send(Ok(peer.node_id.clone())).await;
                    }
                    Err(e) => {
                        let _ = tx
                            .send(Err((peer.node_id.clone(), e.to_string())))
                            .await;
                    }
                }
            });
        }

        drop(tx);

        // Wait for acks with timeout.
        let mut ack_count: u32 = 1; // Count local write.
        let mut errors = Vec::new();

        let deadline = tokio::time::Instant::now() + self.config.timeout;

        loop {
            if ack_count >= quorum {
                break;
            }

            tokio::select! {
                result = rx.recv() => {
                    match result {
                        Some(Ok(_node_id)) => {
                            ack_count += 1;
                        }
                        Some(Err((node_id, err))) => {
                            errors.push((node_id, err));
                        }
                        None => {
                            // Channel closed; all peers responded.
                            break;
                        }
                    }
                }
                _ = tokio::time::sleep_until(deadline) => {
                    tracing::warn!(
                        acks = ack_count,
                        quorum = quorum,
                        "replication timeout waiting for quorum"
                    );
                    break;
                }
            }
        }

        ReplicationResult {
            ack_count,
            quorum_reached: ack_count >= quorum,
            errors,
        }
    }
}

async fn replicate_to_peer(
    peer: &MemberInfo,
    topic: &str,
    partition: u32,
    entries: Vec<Vec<u8>>,
) -> anyhow::Result<()> {
    let endpoint = format!("http://{}", peer.address);
    let mut client = ClusterServiceClient::connect(endpoint).await?;

    client
        .replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
            topic: topic.to_string(),
            partition,
            entries,
        }))
        .await?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::membership::MembershipConfig;

    fn single_node_membership() -> Arc<Membership> {
        Arc::new(Membership::new(MembershipConfig {
            node_id: "node-1".to_string(),
            address: "127.0.0.1:6060".to_string(),
            ..Default::default()
        }))
    }

    #[tokio::test]
    async fn test_single_node_replication() {
        let membership = single_node_membership();
        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 1,
                ..Default::default()
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        assert_eq!(result.ack_count, 1);
        assert!(result.quorum_reached);
        assert!(result.errors.is_empty());
    }

    #[tokio::test]
    async fn test_no_peers_available() {
        let membership = single_node_membership();
        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 3,
                ..Default::default()
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Only the local ack (1 of the 2 needed for quorum).
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
    }

    #[tokio::test]
    async fn test_unreachable_peers_timeout() {
        let membership = single_node_membership();

        // Add a peer that doesn't exist; the connection will fail.
        membership
            .record_heartbeat("node-2", "127.0.0.1:19999")
            .await;

        let replicator = Replicator::new(
            membership,
            ReplicationConfig {
                replication_factor: 3,
                timeout: Duration::from_millis(500),
            },
        );

        let result = replicator
            .replicate("orders", 0, vec![b"entry-1".to_vec()])
            .await;

        // Should have errors from the unreachable peer.
        assert_eq!(result.ack_count, 1);
        assert!(!result.quorum_reached);
    }
}
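The quorum here is a simple majority of the replication factor, counting the local write as the first ack: rf = 1 needs 1 ack, rf = 3 needs 2 (local plus one peer), rf = 5 needs 3. Restating the expression used above as a standalone check:

// Worked example of the quorum arithmetic from Replicator::replicate.
fn quorum(replication_factor: u32) -> u32 {
    (replication_factor / 2) + 1
}

#[test]
fn quorum_examples() {
    assert_eq!(quorum(1), 1); // local write alone suffices
    assert_eq!(quorum(3), 2); // local + one peer ack
    assert_eq!(quorum(5), 3); // local + two peer acks
}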
@@ -1,2 +1,290 @@
// This file will be generated by `buf generate`.
// Placeholder for initial workspace compilation.
// @generated
// This file is @generated by prost-build.
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct ReplicateEntriesRequest {
    #[prost(string, tag="1")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partition: u32,
    #[prost(bytes="vec", repeated, tag="3")]
    pub entries: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec<u8>>,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct ReplicateEntriesResponse {
    #[prost(uint64, tag="1")]
    pub last_replicated_offset: u64,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct JoinRequest {
    #[prost(string, tag="1")]
    pub node_id: ::prost::alloc::string::String,
    #[prost(string, tag="2")]
    pub address: ::prost::alloc::string::String,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct JoinResponse {
    #[prost(message, repeated, tag="1")]
    pub members: ::prost::alloc::vec::Vec<ClusterNodeInfo>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HeartbeatRequest {
    #[prost(string, tag="1")]
    pub node_id: ::prost::alloc::string::String,
    #[prost(message, repeated, tag="2")]
    pub known_members: ::prost::alloc::vec::Vec<ClusterNodeInfo>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HeartbeatResponse {
    #[prost(message, repeated, tag="1")]
    pub members: ::prost::alloc::vec::Vec<ClusterNodeInfo>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct ClusterNodeInfo {
    #[prost(string, tag="1")]
    pub node_id: ::prost::alloc::string::String,
    #[prost(string, tag="2")]
    pub address: ::prost::alloc::string::String,
    #[prost(string, tag="3")]
    pub status: ::prost::alloc::string::String,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct FetchSegmentRequest {
    #[prost(string, tag="1")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partition: u32,
    #[prost(uint64, tag="3")]
    pub from_offset: u64,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct FetchSegmentResponse {
    #[prost(bytes="vec", tag="1")]
    pub chunk: ::prost::alloc::vec::Vec<u8>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct CreateTopicRequest {
    #[prost(string, tag="1")]
    pub name: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partitions: u32,
    #[prost(uint32, tag="3")]
    pub replication_factor: u32,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct CreateTopicResponse {
    #[prost(string, tag="1")]
    pub name: ::prost::alloc::string::String,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct DeleteTopicRequest {
    #[prost(string, tag="1")]
    pub name: ::prost::alloc::string::String,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct DeleteTopicResponse {
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct ListTopicsRequest {
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ListTopicsResponse {
    #[prost(message, repeated, tag="1")]
    pub topics: ::prost::alloc::vec::Vec<TopicInfo>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct TopicInfo {
    #[prost(string, tag="1")]
    pub name: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partitions: u32,
    #[prost(uint32, tag="3")]
    pub replication_factor: u32,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct DescribeTopicRequest {
    #[prost(string, tag="1")]
    pub name: ::prost::alloc::string::String,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DescribeTopicResponse {
    #[prost(message, optional, tag="1")]
    pub topic: ::core::option::Option<TopicInfo>,
    #[prost(message, repeated, tag="2")]
    pub partition_info: ::prost::alloc::vec::Vec<PartitionInfo>,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct PartitionInfo {
    #[prost(uint32, tag="1")]
    pub partition: u32,
    #[prost(uint64, tag="2")]
    pub earliest_offset: u64,
    #[prost(uint64, tag="3")]
    pub latest_offset: u64,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct CreateConsumerGroupRequest {
    #[prost(string, tag="1")]
    pub group_name: ::prost::alloc::string::String,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct CreateConsumerGroupResponse {
}
// --- Publish ---

#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PublishRequest {
    #[prost(message, repeated, tag="1")]
    pub messages: ::prost::alloc::vec::Vec<PublishMessage>,
    #[prost(message, optional, tag="2")]
    pub settings: ::core::option::Option<PublishSettings>,
    #[prost(string, tag="3")]
    pub producer_id: ::prost::alloc::string::String,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PublishMessage {
    #[prost(string, tag="1")]
    pub topic: ::prost::alloc::string::String,
    #[prost(bytes="vec", tag="2")]
    pub key: ::prost::alloc::vec::Vec<u8>,
    #[prost(bytes="vec", tag="3")]
    pub value: ::prost::alloc::vec::Vec<u8>,
    #[prost(message, repeated, tag="4")]
    pub headers: ::prost::alloc::vec::Vec<MessageHeader>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct MessageHeader {
    #[prost(string, tag="1")]
    pub key: ::prost::alloc::string::String,
    #[prost(bytes="vec", tag="2")]
    pub value: ::prost::alloc::vec::Vec<u8>,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct PublishSettings {
    #[prost(enumeration="AckMode", tag="1")]
    pub ack_mode: i32,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PublishResponse {
    #[prost(message, repeated, tag="1")]
    pub results: ::prost::alloc::vec::Vec<PublishResult>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct PublishResult {
    #[prost(string, tag="1")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partition: u32,
    #[prost(uint64, tag="3")]
    pub offset: u64,
}
// --- Subscribe ---

#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct SubscribeRequest {
    #[prost(string, tag="1")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="2")]
    pub partition: u32,
    #[prost(string, tag="3")]
    pub consumer_group: ::prost::alloc::string::String,
    #[prost(uint64, optional, tag="4")]
    pub start_offset: ::core::option::Option<u64>,
    #[prost(uint32, tag="5")]
    pub max_batch_size: u32,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SubscribeResponse {
    #[prost(message, repeated, tag="1")]
    pub messages: ::prost::alloc::vec::Vec<ConsumedMessage>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ConsumedMessage {
    #[prost(uint64, tag="1")]
    pub offset: u64,
    #[prost(string, tag="2")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="3")]
    pub partition: u32,
    #[prost(bytes="vec", tag="4")]
    pub key: ::prost::alloc::vec::Vec<u8>,
    #[prost(bytes="vec", tag="5")]
    pub value: ::prost::alloc::vec::Vec<u8>,
    #[prost(message, repeated, tag="6")]
    pub headers: ::prost::alloc::vec::Vec<MessageHeader>,
    #[prost(uint64, tag="7")]
    pub timestamp_ms: u64,
}
// --- Ack/Commit ---

#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct AckRequest {
    #[prost(string, tag="1")]
    pub consumer_group: ::prost::alloc::string::String,
    #[prost(string, tag="2")]
    pub topic: ::prost::alloc::string::String,
    #[prost(uint32, tag="3")]
    pub partition: u32,
    #[prost(uint64, tag="4")]
    pub offset: u64,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct AckResponse {
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum AckMode {
    Unspecified = 0,
    All = 1,
    Local = 2,
    None = 3,
}
impl AckMode {
    /// String value of the enum field names used in the ProtoBuf definition.
    ///
    /// The values are not transformed in any way and thus are considered stable
    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
    pub fn as_str_name(&self) -> &'static str {
        match self {
            Self::Unspecified => "ACK_MODE_UNSPECIFIED",
            Self::All => "ACK_MODE_ALL",
            Self::Local => "ACK_MODE_LOCAL",
            Self::None => "ACK_MODE_NONE",
        }
    }
    /// Creates an enum from field names used in the ProtoBuf definition.
    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
        match value {
            "ACK_MODE_UNSPECIFIED" => Some(Self::Unspecified),
            "ACK_MODE_ALL" => Some(Self::All),
            "ACK_MODE_LOCAL" => Some(Self::Local),
            "ACK_MODE_NONE" => Some(Self::None),
            _ => None,
        }
    }
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)]
pub struct GetStatusRequest {
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GetStatusResponse {
    #[prost(string, tag="1")]
    pub node_id: ::prost::alloc::string::String,
    #[prost(message, optional, tag="2")]
    pub cluster: ::core::option::Option<ClusterStatus>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClusterStatus {
    #[prost(message, repeated, tag="1")]
    pub nodes: ::prost::alloc::vec::Vec<NodeInfo>,
}
#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
pub struct NodeInfo {
    #[prost(string, tag="1")]
    pub node_id: ::prost::alloc::string::String,
    #[prost(string, tag="2")]
    pub address: ::prost::alloc::string::String,
    #[prost(string, tag="3")]
    pub status: ::prost::alloc::string::String,
}
include!("sq.v1.tonic.rs");
// @@protoc_insertion_point(module)
crates/sq-grpc-interface/src/grpc/sq/v1/sq.v1.tonic.rs (new file, +1885)
File diff suppressed because it is too large.
crates/sq-models/src/config.rs (new file, +118)
@@ -0,0 +1,118 @@
use std::path::PathBuf;
use std::time::Duration;

use crate::message::TopicName;

/// Controls when fsync is called on WAL segment files.
#[derive(Clone, Debug, PartialEq)]
pub enum SyncPolicy {
    /// Fsync after every write batch (maximum durability, lower throughput).
    EveryBatch,
    /// Fsync at a fixed interval via a background task. Writes go to OS page
    /// cache immediately. Data written within the interval window is at risk
    /// if the machine crashes without replication.
    Interval(Duration),
    /// Never explicitly fsync. Rely on OS page cache flush + replication.
    /// Similar to Kafka's default.
    None,
}

impl Default for SyncPolicy {
    fn default() -> Self {
        SyncPolicy::EveryBatch
    }
}

/// Configuration for the Write-Ahead Log.
#[derive(Clone, Debug)]
pub struct WalConfig {
    /// Maximum segment file size in bytes before rotation (default: 64MB).
    pub max_segment_bytes: u64,
    /// Maximum segment age in seconds before rotation (default: 60s).
    pub max_segment_age_secs: u64,
    /// Root data directory for WAL files.
    pub data_dir: PathBuf,
    /// When to fsync WAL segments (default: EveryBatch).
    pub sync_policy: SyncPolicy,
}

impl Default for WalConfig {
    fn default() -> Self {
        Self {
            max_segment_bytes: 64 * 1024 * 1024, // 64MB
            max_segment_age_secs: 60,
            data_dir: PathBuf::from("./data"),
            sync_policy: SyncPolicy::default(),
        }
    }
}

/// Configuration for a topic.
#[derive(Clone, Debug)]
pub struct TopicConfig {
    pub name: TopicName,
    /// Number of partitions (default: 1).
    pub partitions: u32,
    /// Replication factor across cluster nodes (default: 3).
    pub replication_factor: u32,
}

impl TopicConfig {
    pub fn new(name: impl Into<TopicName>) -> Self {
        Self {
            name: name.into(),
            partitions: 1,
            replication_factor: 3,
        }
    }

    pub fn with_partitions(mut self, partitions: u32) -> Self {
        self.partitions = partitions;
        self
    }

    pub fn with_replication_factor(mut self, factor: u32) -> Self {
        self.replication_factor = factor;
        self
    }
}

/// Configuration for the cluster node.
#[derive(Clone, Debug)]
pub struct NodeConfig {
    pub node_id: String,
    pub grpc_host: std::net::SocketAddr,
    pub http_host: std::net::SocketAddr,
    pub seeds: Vec<String>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_wal_config_defaults() {
        let config = WalConfig::default();
        assert_eq!(config.max_segment_bytes, 64 * 1024 * 1024);
        assert_eq!(config.max_segment_age_secs, 60);
        assert_eq!(config.data_dir, PathBuf::from("./data"));
    }

    #[test]
    fn test_topic_config_builder() {
        let config = TopicConfig::new("orders")
            .with_partitions(4)
            .with_replication_factor(3);

        assert_eq!(config.name.as_str(), "orders");
        assert_eq!(config.partitions, 4);
        assert_eq!(config.replication_factor, 3);
    }

    #[test]
    fn test_topic_config_defaults() {
        let config = TopicConfig::new("events");
        assert_eq!(config.partitions, 1);
        assert_eq!(config.replication_factor, 3);
    }
}
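These knobs are how durability is traded for throughput. A sketch of a throughput-leaning WAL; the 50ms interval, 128MB segment size, and data directory are illustrative values, not recommendations from this diff:

// Illustrative sketch, not part of the diff.
use std::path::PathBuf;
use std::time::Duration;

fn throughput_oriented_wal() -> WalConfig {
    WalConfig {
        max_segment_bytes: 128 * 1024 * 1024, // rotate at 128MB instead of 64MB
        sync_policy: SyncPolicy::Interval(Duration::from_millis(50)),
        data_dir: PathBuf::from("/var/lib/sq"),
        ..WalConfig::default() // keep the 60s segment age cap
    }
}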
@@ -0,0 +1,5 @@
pub mod config;
pub mod message;

pub use config::*;
pub use message::*;
crates/sq-models/src/message.rs (new file, +195)
@@ -0,0 +1,195 @@
use std::fmt;

/// A single message in the queue.
#[derive(Clone, Debug, PartialEq)]
pub struct Message {
    /// Monotonically increasing within a topic-partition. Assigned by the server.
    pub offset: u64,
    /// Topic this message belongs to.
    pub topic: TopicName,
    /// Partition within the topic.
    pub partition: u32,
    /// Optional partitioning key.
    pub key: Option<Vec<u8>>,
    /// The payload.
    pub value: Vec<u8>,
    /// User-defined headers (metadata).
    pub headers: Vec<Header>,
    /// Server-assigned wall-clock timestamp (millis since epoch).
    pub timestamp_ms: u64,
}

/// A key-value header attached to a message.
#[derive(Clone, Debug, PartialEq)]
pub struct Header {
    pub key: String,
    pub value: Vec<u8>,
}

/// A topic name wrapper.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct TopicName(pub String);

impl TopicName {
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl fmt::Display for TopicName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl From<&str> for TopicName {
    fn from(s: &str) -> Self {
        Self(s.to_string())
    }
}

impl From<String> for TopicName {
    fn from(s: String) -> Self {
        Self(s)
    }
}

/// Information about a closed WAL segment ready for shipping.
#[derive(Clone, Debug)]
pub struct ClosedSegment {
    pub path: std::path::PathBuf,
    pub topic: TopicName,
    pub partition: u32,
    pub base_offset: u64,
    pub end_offset: u64,
    pub size_bytes: u64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_message_construction() {
        let msg = Message {
            offset: 42,
            topic: TopicName::from("orders"),
            partition: 0,
            key: Some(b"user-123".to_vec()),
            value: b"hello world".to_vec(),
            headers: vec![Header {
                key: "content-type".to_string(),
                value: b"text/plain".to_vec(),
            }],
            timestamp_ms: 1700000000000,
        };

        assert_eq!(msg.offset, 42);
        assert_eq!(msg.topic.as_str(), "orders");
        assert_eq!(msg.partition, 0);
        assert_eq!(msg.key.as_deref(), Some(b"user-123".as_slice()));
        assert_eq!(msg.value, b"hello world");
        assert_eq!(msg.headers.len(), 1);
        assert_eq!(msg.headers[0].key, "content-type");
    }

    #[test]
    fn test_message_no_key_no_headers() {
        let msg = Message {
            offset: 0,
            topic: TopicName::from("events"),
            partition: 1,
            key: None,
            value: b"payload".to_vec(),
            headers: vec![],
            timestamp_ms: 0,
        };

        assert!(msg.key.is_none());
        assert!(msg.headers.is_empty());
    }

    #[test]
    fn test_message_clone_eq() {
        let msg = Message {
            offset: 1,
            topic: TopicName::from("test"),
            partition: 0,
            key: None,
            value: b"data".to_vec(),
            headers: vec![],
            timestamp_ms: 100,
        };

        let cloned = msg.clone();
        assert_eq!(msg, cloned);
    }

    #[test]
    fn test_topic_name_ordering() {
        let a = TopicName::from("alpha");
        let b = TopicName::from("beta");
        assert!(a < b);
    }

    #[test]
    fn test_topic_name_display() {
        let t = TopicName::from("my-topic");
        assert_eq!(format!("{t}"), "my-topic");
    }

    #[test]
    fn test_message_empty_value() {
        let msg = Message {
            offset: 0,
            topic: TopicName::from("t"),
            partition: 0,
            key: None,
            value: vec![],
            headers: vec![],
            timestamp_ms: 0,
        };

        assert!(msg.value.is_empty());
    }

    #[test]
    fn test_message_large_value() {
        let large = vec![0xFFu8; 1024 * 1024]; // 1MB
        let msg = Message {
            offset: 0,
            topic: TopicName::from("t"),
            partition: 0,
            key: None,
            value: large.clone(),
            headers: vec![],
            timestamp_ms: 0,
        };

        assert_eq!(msg.value.len(), 1024 * 1024);
        assert_eq!(msg.value, large);
    }

    #[test]
    fn test_message_many_headers() {
        let headers: Vec<Header> = (0..100)
            .map(|i| Header {
                key: format!("header-{i}"),
                value: format!("value-{i}").into_bytes(),
            })
            .collect();

        let msg = Message {
            offset: 0,
            topic: TopicName::from("t"),
            partition: 0,
            key: None,
            value: vec![],
            headers,
            timestamp_ms: 0,
        };

        assert_eq!(msg.headers.len(), 100);
        assert_eq!(msg.headers[99].key, "header-99");
    }
}
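The `key` field exists so producers can pin related messages to one partition. The actual partitioner is not part of this diff; a conventional hash-based mapping looks like the sketch below (DefaultHasher is an illustrative stand-in and is not stable across Rust versions, so a real implementation would pick a fixed hash such as CRC32 or xxHash).

// Illustrative sketch, not part of the diff.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn partition_for_key(key: &[u8], partitions: u32) -> u32 {
    let mut hasher = DefaultHasher::new();
    key.hash(&mut hasher);
    (hasher.finish() % u64::from(partitions)) as u32
}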
@@ -4,11 +4,18 @@ version.workspace = true
edition.workspace = true

[dependencies]
sq-capnp-interface = { workspace = true }
sq-grpc-interface = { workspace = true }
sq-models = { workspace = true }

capnp = { workspace = true }
bytes = { workspace = true }

anyhow = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
tonic = { workspace = true }
thiserror = { workspace = true }
tokio-stream = { workspace = true }
tokio-util = { workspace = true, features = ["codec"] }
futures = { workspace = true }
crates/sq-sdk/src/batch_producer.rs (new file, +172)
@@ -0,0 +1,172 @@
use std::time::Duration;

use sq_grpc_interface::AckMode;
use tokio::sync::{mpsc, oneshot};
use tokio::time::MissedTickBehavior;

use crate::error::SqError;
use crate::producer::{GrpcProducer, GrpcProducerConfig, ProducerMessage, SendResult};

/// Configuration for a gRPC batching producer.
pub struct GrpcBatchProducerConfig {
    /// Server address (e.g., "http://127.0.0.1:6060").
    pub address: String,
    /// Default ack mode for publish requests.
    pub ack_mode: AckMode,
    /// Producer identifier.
    pub producer_id: String,
    /// Maximum messages to accumulate before flushing (default: 1000).
    pub max_batch_size: usize,
    /// Flush interval in milliseconds (default: 10).
    pub flush_interval_ms: u64,
    /// Backpressure channel capacity (default: 10_000).
    pub channel_capacity: usize,
}

impl Default for GrpcBatchProducerConfig {
    fn default() -> Self {
        Self {
            address: "http://127.0.0.1:6060".to_string(),
            ack_mode: AckMode::All,
            producer_id: "default".to_string(),
            max_batch_size: 1000,
            flush_interval_ms: 10,
            channel_capacity: 10_000,
        }
    }
}

struct BatchRequest {
    message: ProducerMessage,
    reply: oneshot::Sender<Result<SendResult, SqError>>,
}

/// A gRPC batching producer that accumulates messages and flushes them in batches.
///
/// Messages are enqueued via `send()` (which awaits only if the channel is at
/// capacity) and flushed to the server either when the batch reaches
/// `max_batch_size` or when the `flush_interval` timer fires, whichever comes
/// first.
///
/// `send()` takes `&self`, so `GrpcBatchProducer` can be shared via `Arc` across tasks.
pub struct GrpcBatchProducer {
    tx: mpsc::Sender<BatchRequest>,
    _flush_task: tokio::task::JoinHandle<()>,
}

impl GrpcBatchProducer {
    /// Connect to an SQ server and create a batching producer.
    pub async fn connect(config: GrpcBatchProducerConfig) -> Result<Self, SqError> {
        let producer = GrpcProducer::connect(GrpcProducerConfig {
            address: config.address,
            ack_mode: config.ack_mode,
            producer_id: config.producer_id,
        })
        .await?;

        let (tx, rx) = mpsc::channel(config.channel_capacity);

        let flush_task = tokio::spawn(flush_loop(
            rx,
            producer,
            config.max_batch_size,
            Duration::from_millis(config.flush_interval_ms),
        ));

        Ok(Self {
            tx,
            _flush_task: flush_task,
        })
    }

    /// Queue a message for batched sending. Returns the result once the batch
    /// containing this message has been flushed and acknowledged by the server.
    pub async fn send(&self, message: ProducerMessage) -> Result<SendResult, SqError> {
        let (reply_tx, reply_rx) = oneshot::channel();
        self.tx
            .send(BatchRequest {
                message,
                reply: reply_tx,
            })
            .await
            .map_err(|_| SqError::Connection("batch producer closed".to_string()))?;

        reply_rx
            .await
            .map_err(|_| SqError::Connection("batch producer flush task dropped".to_string()))?
    }

    /// Shut down the producer, flushing any remaining messages.
    ///
    /// Dropping the producer also triggers a flush of pending messages,
    /// but `close()` lets you await completion.
    pub async fn close(self) {
        drop(self.tx);
        let _ = self._flush_task.await;
    }
}

async fn flush_loop(
    mut rx: mpsc::Receiver<BatchRequest>,
    mut producer: GrpcProducer,
    max_batch_size: usize,
    flush_interval: Duration,
) {
    let mut pending: Vec<BatchRequest> = Vec::with_capacity(max_batch_size);
    let mut interval = tokio::time::interval(flush_interval);
    interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
    // Consume the first immediate tick.
    interval.tick().await;

    loop {
        let should_flush = tokio::select! {
            msg = rx.recv() => match msg {
                Some(req) => {
                    pending.push(req);
                    pending.len() >= max_batch_size
                }
                None => {
                    // Channel closed; flush remaining and exit.
                    if !pending.is_empty() {
                        flush(&mut producer, &mut pending).await;
                    }
                    return;
                }
            },
            _ = interval.tick() => !pending.is_empty(),
        };

        if should_flush {
            flush(&mut producer, &mut pending).await;
        }
    }
}

async fn flush(producer: &mut GrpcProducer, pending: &mut Vec<BatchRequest>) {
    let batch: Vec<BatchRequest> = std::mem::take(pending);

    let messages: Vec<ProducerMessage> = batch
        .iter()
        .map(|req| ProducerMessage {
            topic: req.message.topic.clone(),
            key: req.message.key.clone(),
            value: req.message.value.clone(),
            headers: req.message.headers.clone(),
        })
        .collect();

    match producer.send_batch(messages).await {
        Ok(results) => {
            for (req, result) in batch.into_iter().zip(results) {
                let _ = req.reply.send(Ok(result));
            }
        }
        Err(e) => {
            let msg = e.to_string();
            for req in batch {
                let _ = req
                    .reply
                    .send(Err(SqError::Server(msg.clone())));
            }
        }
    }
}
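Because `send()` takes `&self` and resolves per message, the batching stays transparent to callers. A sketch of fanning prepared messages across tasks through an Arc-shared producer; only the API shown above is used, the error handling is illustrative:

// Illustrative sketch, not part of the diff.
use std::sync::Arc;

async fn fan_out(producer: Arc<GrpcBatchProducer>, messages: Vec<ProducerMessage>) {
    let mut handles = Vec::new();
    for message in messages {
        let producer = producer.clone();
        // Each task enqueues one message; the flush task batches them.
        handles.push(tokio::spawn(async move { producer.send(message).await }));
    }
    for handle in handles {
        if let Ok(Err(e)) = handle.await {
            tracing::warn!(error = %e, "batched send failed");
        }
    }
}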
crates/sq-sdk/src/capnp_batch_producer.rs (new file, +169)
@@ -0,0 +1,169 @@
|
||||
use std::time::Duration;

use tokio::sync::{mpsc, oneshot};
use tokio::time::MissedTickBehavior;

use crate::capnp_producer::{Producer, ProducerConfig};
use crate::error::SqError;
use crate::producer::{ProducerMessage, SendResult};
use crate::types::AckMode;

/// Configuration for a batching producer (Cap'n Proto transport).
pub struct BatchProducerConfig {
    /// Server address (e.g., "127.0.0.1:6064").
    pub address: String,
    /// Default ack mode for publish requests.
    pub ack_mode: AckMode,
    /// Producer identifier.
    pub producer_id: String,
    /// Maximum messages to accumulate before flushing (default: 1000).
    pub max_batch_size: usize,
    /// Flush interval in milliseconds (default: 10).
    pub flush_interval_ms: u64,
    /// Backpressure channel capacity (default: 10_000).
    pub channel_capacity: usize,
}

impl Default for BatchProducerConfig {
    fn default() -> Self {
        Self {
            address: "127.0.0.1:6064".to_string(),
            ack_mode: AckMode::All,
            producer_id: "default".to_string(),
            max_batch_size: 1000,
            flush_interval_ms: 10,
            channel_capacity: 10_000,
        }
    }
}

struct BatchRequest {
    message: ProducerMessage,
    reply: oneshot::Sender<Result<SendResult, SqError>>,
}

/// A batching producer that accumulates messages and flushes them in batches
/// over the Cap'n Proto transport.
///
/// Messages are queued immediately via `send()` (non-blocking enqueue) and
/// flushed to the server either when the batch reaches `max_batch_size` or
/// when the `flush_interval` timer fires — whichever comes first.
///
/// `send()` takes `&self`, so `BatchProducer` can be shared via `Arc` across tasks.
pub struct BatchProducer {
    tx: mpsc::Sender<BatchRequest>,
    _flush_task: tokio::task::JoinHandle<()>,
}

impl BatchProducer {
    /// Connect to an SQ server and create a batching producer.
    pub async fn connect(config: BatchProducerConfig) -> Result<Self, SqError> {
        let producer = Producer::connect(ProducerConfig {
            address: config.address,
            ack_mode: config.ack_mode,
            producer_id: config.producer_id,
        })
        .await?;

        let (tx, rx) = mpsc::channel(config.channel_capacity);

        let flush_task = tokio::spawn(flush_loop(
            rx,
            producer,
            config.max_batch_size,
            Duration::from_millis(config.flush_interval_ms),
        ));

        Ok(Self {
            tx,
            _flush_task: flush_task,
        })
    }

    /// Queue a message for batched sending. Returns the result once the batch
    /// containing this message has been flushed and acknowledged by the server.
    pub async fn send(&self, message: ProducerMessage) -> Result<SendResult, SqError> {
        let (reply_tx, reply_rx) = oneshot::channel();
        self.tx
            .send(BatchRequest {
                message,
                reply: reply_tx,
            })
            .await
            .map_err(|_| SqError::Connection("batch producer closed".to_string()))?;

        reply_rx
            .await
            .map_err(|_| SqError::Connection("batch producer flush task dropped".to_string()))?
    }

    /// Shut down the producer, flushing any remaining messages.
    pub async fn close(self) {
        drop(self.tx);
        let _ = self._flush_task.await;
    }
}

async fn flush_loop(
    mut rx: mpsc::Receiver<BatchRequest>,
    mut producer: Producer,
    max_batch_size: usize,
    flush_interval: Duration,
) {
    let mut pending: Vec<BatchRequest> = Vec::with_capacity(max_batch_size);
    let mut interval = tokio::time::interval(flush_interval);
    interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
    // Consume the first immediate tick.
    interval.tick().await;

    loop {
        let should_flush = tokio::select! {
            msg = rx.recv() => match msg {
                Some(req) => {
                    pending.push(req);
                    pending.len() >= max_batch_size
                }
                None => {
                    // Channel closed — flush remaining and exit.
                    if !pending.is_empty() {
                        flush(&mut producer, &mut pending).await;
                    }
                    return;
                }
            },
            _ = interval.tick() => !pending.is_empty(),
        };

        if should_flush {
            flush(&mut producer, &mut pending).await;
        }
    }
}

async fn flush(producer: &mut Producer, pending: &mut Vec<BatchRequest>) {
    let batch: Vec<BatchRequest> = std::mem::take(pending);

    let messages: Vec<ProducerMessage> = batch
        .iter()
        .map(|req| ProducerMessage {
            topic: req.message.topic.clone(),
            key: req.message.key.clone(),
            value: req.message.value.clone(),
            headers: req.message.headers.clone(),
        })
        .collect();

    match producer.send_batch(messages).await {
        Ok(results) => {
            for (req, result) in batch.into_iter().zip(results) {
                let _ = req.reply.send(Ok(result));
            }
        }
        Err(e) => {
            let msg = e.to_string();
            for req in batch {
                let _ = req.reply.send(Err(SqError::Server(msg.clone())));
            }
        }
    }
}
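For reference, a minimal end-to-end sketch of driving this batching producer. It assumes a server is listening on the default address and that a topic named "events" exists; both are assumptions for illustration, not established by this file.

use std::sync::Arc;

use sq_sdk::{BatchProducer, BatchProducerConfig, ProducerMessage};

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    // Defaults: 127.0.0.1:6064, max_batch_size = 1000, 10ms flush interval.
    let producer = Arc::new(BatchProducer::connect(BatchProducerConfig::default()).await?);

    // `send()` takes &self, so the producer can be shared across tasks.
    let mut handles = Vec::new();
    for i in 0..4 {
        let p = Arc::clone(&producer);
        handles.push(tokio::spawn(async move {
            // "events" is an assumed, pre-created topic.
            p.send(ProducerMessage::new("events", format!("payload-{i}"))).await
        }));
    }
    for h in handles {
        let result = h.await.expect("task panicked")?;
        println!("acked at {}/{}@{}", result.topic, result.partition, result.offset);
    }

    // close() drops the sender and awaits the final flush.
    Arc::try_unwrap(producer)
        .ok()
        .expect("all clones dropped")
        .close()
        .await;
    Ok(())
}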
46
crates/sq-sdk/src/capnp_connection.rs
Normal file
@@ -0,0 +1,46 @@
use futures::SinkExt;
use sq_capnp_interface::codec::{Frame, SqCodec};
use tokio::net::TcpStream;
use tokio_stream::StreamExt;
use tokio_util::codec::Framed;

use crate::error::SqError;

/// A TCP connection with Cap'n Proto framing.
pub struct Connection {
    framed: Framed<TcpStream, SqCodec>,
}

impl Connection {
    /// Connect to an SQ server's capnp data plane.
    /// Address should be "host:port" (e.g., "127.0.0.1:6064").
    pub async fn connect(address: &str) -> Result<Self, SqError> {
        let stream = TcpStream::connect(address)
            .await
            .map_err(|e| SqError::Connection(e.to_string()))?;
        stream
            .set_nodelay(true)
            .map_err(|e| SqError::Connection(e.to_string()))?;

        Ok(Self {
            framed: Framed::new(stream, SqCodec::new()),
        })
    }

    /// Send a frame over the connection.
    pub async fn send_frame(&mut self, frame: Frame) -> Result<(), SqError> {
        self.framed
            .send(frame)
            .await
            .map_err(|e| SqError::Connection(e.to_string()))
    }

    /// Receive the next frame from the connection.
    pub async fn recv_frame(&mut self) -> Result<Frame, SqError> {
        match self.framed.next().await {
            Some(Ok(frame)) => Ok(frame),
            Some(Err(e)) => Err(SqError::Connection(e.to_string())),
            None => Err(SqError::Connection("connection closed".to_string())),
        }
    }
}
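The higher-level clients below are built from exactly this request/response loop. A hedged sketch of driving `Connection` directly with a raw ack frame; it assumes a reachable server plus an existing consumer group and topic (illustrative names only).

use sq_capnp_interface::codec::{self, OP_ACK_REQ, OP_ACK_RES, OP_ERROR};
use sq_capnp_interface::data_plane_capnp;
use sq_sdk::Connection;

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    let mut conn = Connection::connect("127.0.0.1:6064").await?;

    // Build an AckRequest payload and wrap it in an opcode-tagged frame.
    let mut builder = capnp::message::Builder::new_default();
    {
        let mut req = builder.init_root::<data_plane_capnp::ack_request::Builder>();
        req.set_consumer_group("demo-group"); // assumed group
        req.set_topic("events"); // assumed topic
        req.set_partition(0);
        req.set_offset(42);
    }
    conn.send_frame(codec::build_frame(OP_ACK_REQ, &builder)).await?;

    // One request, one response frame: the opcode tells us how to decode it.
    let resp = conn.recv_frame().await?;
    match resp.opcode {
        OP_ACK_RES => println!("offset committed"),
        OP_ERROR => println!("server rejected the ack"),
        other => println!("unexpected opcode 0x{other:02x}"),
    }
    Ok(())
}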
224
crates/sq-sdk/src/capnp_consumer.rs
Normal file
@@ -0,0 +1,224 @@
use sq_capnp_interface::codec::{self, OP_ACK_REQ, OP_ACK_RES, OP_ERROR, OP_SUBSCRIBE_END, OP_SUBSCRIBE_REQ, OP_SUBSCRIBE_RES};
use sq_capnp_interface::data_plane_capnp;

use crate::capnp_connection::Connection;
use crate::consumer::ReceivedMessage;
use crate::error::SqError;

/// Configuration for an SQ consumer (Cap'n Proto transport).
pub struct ConsumerConfig {
    /// Server address (e.g., "127.0.0.1:6064").
    pub address: String,
    /// Consumer group name.
    pub consumer_group: String,
    /// Topic to consume from.
    pub topic: String,
    /// Partition to consume from.
    pub partition: u32,
    /// Whether to automatically commit offsets.
    pub auto_commit: bool,
    /// Maximum number of messages per batch.
    pub max_poll_records: u32,
    /// Optional start offset (overrides consumer group committed offset).
    pub start_offset: Option<u64>,
}

impl Default for ConsumerConfig {
    fn default() -> Self {
        Self {
            address: "127.0.0.1:6064".to_string(),
            consumer_group: "default".to_string(),
            topic: String::new(),
            partition: 0,
            auto_commit: true,
            max_poll_records: 100,
            start_offset: None,
        }
    }
}

/// SQ consumer using Cap'n Proto over TCP.
/// Uses two connections: one for subscribe streaming, one for ack requests.
pub struct Consumer {
    subscribe_conn: Connection,
    ack_conn: Connection,
    config: ConsumerConfig,
    stream_started: bool,
    last_offset: Option<u64>,
}

impl Consumer {
    /// Connect to an SQ server and create a new consumer.
    pub async fn connect(config: ConsumerConfig) -> Result<Self, SqError> {
        let subscribe_conn = Connection::connect(&config.address).await?;
        let ack_conn = Connection::connect(&config.address).await?;

        Ok(Self {
            subscribe_conn,
            ack_conn,
            config,
            stream_started: false,
            last_offset: None,
        })
    }

    /// Poll for new messages.
    /// On first call, sends the SubscribeRequest. Subsequent calls read response frames.
    pub async fn poll(&mut self) -> Result<Vec<ReceivedMessage>, SqError> {
        if !self.stream_started {
            self.start_subscribe().await?;
            self.stream_started = true;
        }

        let frame = self.subscribe_conn.recv_frame().await?;

        if frame.opcode == OP_SUBSCRIBE_END {
            return Ok(vec![]);
        }

        if frame.opcode == OP_ERROR {
            let reader = codec::read_capnp(&frame.payload)
                .map_err(|e| SqError::Server(format!("decode error: {e}")))?;
            let err = reader
                .get_root::<data_plane_capnp::error_response::Reader>()
                .map_err(|e| SqError::Server(format!("schema error: {e}")))?;
            return Err(SqError::Server(
                err.get_message()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_string()
                    .map_err(|e| SqError::Server(format!("utf8 error: {e}")))?,
            ));
        }

        if frame.opcode != OP_SUBSCRIBE_RES {
            return Err(SqError::Server(format!(
                "unexpected opcode: 0x{:02x}",
                frame.opcode
            )));
        }

        let reader = codec::read_capnp(&frame.payload)
            .map_err(|e| SqError::Server(format!("decode error: {e}")))?;
        let resp = reader
            .get_root::<data_plane_capnp::subscribe_response::Reader>()
            .map_err(|e| SqError::Server(format!("schema error: {e}")))?;

        let messages = resp
            .get_messages()
            .map_err(|e| SqError::Server(format!("schema error: {e}")))?;

        let mut result = Vec::with_capacity(messages.len() as usize);
        for i in 0..messages.len() {
            let m = messages.get(i);
            let headers_reader = m
                .get_headers()
                .map_err(|e| SqError::Server(format!("schema error: {e}")))?;

            let mut headers = Vec::with_capacity(headers_reader.len() as usize);
            for j in 0..headers_reader.len() {
                let h = headers_reader.get(j);
                headers.push((
                    h.get_key()
                        .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                        .to_string()
                        .map_err(|e| SqError::Server(format!("utf8 error: {e}")))?,
                    h.get_value()
                        .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                        .to_vec(),
                ));
            }

            result.push(ReceivedMessage {
                offset: m.get_offset(),
                topic: m
                    .get_topic()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_string()
                    .map_err(|e| SqError::Server(format!("utf8 error: {e}")))?,
                partition: m.get_partition(),
                key: m
                    .get_key()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_vec(),
                value: m
                    .get_value()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_vec(),
                headers,
                timestamp_ms: m.get_timestamp_ms(),
            });
        }

        if let Some(last) = result.last() {
            self.last_offset = Some(last.offset);

            if self.config.auto_commit {
                let _ = self.commit_internal(last.offset).await;
            }
        }

        Ok(result)
    }

    /// Manually commit an offset.
    pub async fn commit(&mut self, offset: u64) -> Result<(), SqError> {
        self.commit_internal(offset).await
    }

    async fn start_subscribe(&mut self) -> Result<(), SqError> {
        let mut builder = capnp::message::Builder::new_default();
        {
            let mut req = builder.init_root::<data_plane_capnp::subscribe_request::Builder>();
            req.set_topic(&self.config.topic[..]);
            req.set_partition(self.config.partition);
            req.set_consumer_group(&self.config.consumer_group[..]);
            req.set_max_batch_size(self.config.max_poll_records);

            if let Some(offset) = self.config.start_offset {
                req.set_start_offset(offset);
                req.set_has_start_offset(true);
            }
        }

        let frame = codec::build_frame(OP_SUBSCRIBE_REQ, &builder);
        self.subscribe_conn.send_frame(frame).await
    }

    async fn commit_internal(&mut self, offset: u64) -> Result<(), SqError> {
        let mut builder = capnp::message::Builder::new_default();
        {
            let mut req = builder.init_root::<data_plane_capnp::ack_request::Builder>();
            req.set_consumer_group(&self.config.consumer_group[..]);
            req.set_topic(&self.config.topic[..]);
            req.set_partition(self.config.partition);
            req.set_offset(offset);
        }

        let frame = codec::build_frame(OP_ACK_REQ, &builder);
        self.ack_conn.send_frame(frame).await?;

        let resp = self.ack_conn.recv_frame().await?;
        if resp.opcode == OP_ERROR {
            let reader = codec::read_capnp(&resp.payload)
                .map_err(|e| SqError::Server(format!("decode error: {e}")))?;
            let err = reader
                .get_root::<data_plane_capnp::error_response::Reader>()
                .map_err(|e| SqError::Server(format!("schema error: {e}")))?;
            return Err(SqError::Server(
                err.get_message()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_string()
                    .map_err(|e| SqError::Server(format!("utf8 error: {e}")))?,
            ));
        }

        if resp.opcode != OP_ACK_RES {
            return Err(SqError::Server(format!(
                "unexpected opcode: 0x{:02x}",
                resp.opcode
            )));
        }

        Ok(())
    }
}
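A minimal sketch of manual offset management with this consumer, assuming a reachable server and an existing "events" topic (illustrative assumptions):

use sq_sdk::{Consumer, ConsumerConfig};

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    // Disable auto_commit and ack explicitly after processing each batch.
    let mut consumer = Consumer::connect(ConsumerConfig {
        topic: "events".to_string(),
        consumer_group: "demo-group".to_string(),
        auto_commit: false,
        ..Default::default()
    })
    .await?;

    loop {
        let batch = consumer.poll().await?;
        if batch.is_empty() {
            break; // server signalled end of stream
        }
        for msg in &batch {
            println!("{}@{}: {} bytes", msg.topic, msg.offset, msg.value.len());
        }
        // Commit only once the whole batch has been processed.
        let last = batch.last().unwrap().offset;
        consumer.commit(last).await?;
    }
    Ok(())
}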
145
crates/sq-sdk/src/capnp_producer.rs
Normal file
@@ -0,0 +1,145 @@
use sq_capnp_interface::codec::{self, OP_ERROR, OP_PUBLISH_REQ, OP_PUBLISH_RES};
use sq_capnp_interface::data_plane_capnp;

use crate::capnp_connection::Connection;
use crate::error::SqError;
use crate::producer::{ProducerMessage, SendResult};
use crate::types::AckMode;

/// Configuration for an SQ producer (Cap'n Proto transport).
pub struct ProducerConfig {
    /// Server address (e.g., "127.0.0.1:6064").
    pub address: String,
    /// Acknowledgment mode.
    pub ack_mode: AckMode,
    /// Producer identifier.
    pub producer_id: String,
}

impl Default for ProducerConfig {
    fn default() -> Self {
        Self {
            address: "127.0.0.1:6064".to_string(),
            ack_mode: AckMode::All,
            producer_id: "default".to_string(),
        }
    }
}

/// SQ producer using Cap'n Proto over TCP.
pub struct Producer {
    conn: Connection,
    config: ProducerConfig,
}

impl Producer {
    /// Connect to an SQ server and create a new producer.
    pub async fn connect(config: ProducerConfig) -> Result<Self, SqError> {
        let conn = Connection::connect(&config.address).await?;
        Ok(Self { conn, config })
    }

    /// Send a single message.
    pub async fn send(
        &mut self,
        topic: &str,
        key: Option<&[u8]>,
        value: &[u8],
    ) -> Result<SendResult, SqError> {
        let results = self
            .send_batch(vec![ProducerMessage {
                topic: topic.to_string(),
                key: key.map(|k| k.to_vec()),
                value: value.to_vec(),
                headers: Vec::new(),
            }])
            .await?;
        Ok(results.into_iter().next().unwrap())
    }

    /// Send a batch of messages.
    pub async fn send_batch(
        &mut self,
        messages: Vec<ProducerMessage>,
    ) -> Result<Vec<SendResult>, SqError> {
        // Build capnp request.
        let mut builder = capnp::message::Builder::new_default();
        {
            let mut req = builder.init_root::<data_plane_capnp::publish_request::Builder>();
            req.set_ack_mode(self.config.ack_mode.to_capnp_u8());
            req.set_producer_id(&self.config.producer_id[..]);

            let mut msg_list = req.init_messages(messages.len() as u32);
            for (i, m) in messages.iter().enumerate() {
                let mut entry = msg_list.reborrow().get(i as u32);
                entry.set_topic(&m.topic[..]);
                entry.set_key(m.key.as_deref().unwrap_or(&[]));
                entry.set_value(&m.value);

                let mut headers = entry.init_headers(m.headers.len() as u32);
                for (j, (k, v)) in m.headers.iter().enumerate() {
                    let mut h = headers.reborrow().get(j as u32);
                    h.set_key(&k[..]);
                    h.set_value(v);
                }
            }
        }

        let frame = codec::build_frame(OP_PUBLISH_REQ, &builder);
        self.conn.send_frame(frame).await?;

        // Read response.
        let resp_frame = self.conn.recv_frame().await?;

        if resp_frame.opcode == OP_ERROR {
            let msg = decode_error(&resp_frame.payload)?;
            return Err(SqError::Server(msg));
        }

        if resp_frame.opcode != OP_PUBLISH_RES {
            return Err(SqError::Server(format!(
                "unexpected opcode: 0x{:02x}",
                resp_frame.opcode
            )));
        }

        // Decode response.
        let reader = codec::read_capnp(&resp_frame.payload)
            .map_err(|e| SqError::Server(format!("decode error: {e}")))?;
        let resp = reader
            .get_root::<data_plane_capnp::publish_response::Reader>()
            .map_err(|e| SqError::Server(format!("schema error: {e}")))?;

        let results = resp
            .get_results()
            .map_err(|e| SqError::Server(format!("schema error: {e}")))?;

        let mut send_results = Vec::with_capacity(results.len() as usize);
        for i in 0..results.len() {
            let r = results.get(i);
            send_results.push(SendResult {
                topic: r
                    .get_topic()
                    .map_err(|e| SqError::Server(format!("schema error: {e}")))?
                    .to_string()
                    .map_err(|e| SqError::Server(format!("utf8 error: {e}")))?,
                partition: r.get_partition(),
                offset: r.get_offset(),
            });
        }

        Ok(send_results)
    }
}

fn decode_error(payload: &[u8]) -> Result<String, SqError> {
    let reader = codec::read_capnp(payload)
        .map_err(|e| SqError::Server(format!("decode error: {e}")))?;
    let err = reader
        .get_root::<data_plane_capnp::error_response::Reader>()
        .map_err(|e| SqError::Server(format!("schema error: {e}")))?;
    err.get_message()
        .map_err(|e| SqError::Server(format!("schema error: {e}")))?
        .to_string()
        .map_err(|e| SqError::Server(format!("utf8 error: {e}")))
}
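A short usage sketch for the unbatched producer; the server address and "events" topic are assumptions for illustration:

use sq_sdk::{Producer, ProducerConfig, ProducerMessage};

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    let mut producer = Producer::connect(ProducerConfig::default()).await?;

    // Single message with an explicit key.
    let result = producer.send("events", Some(b"user-1".as_slice()), b"hello").await?;
    println!("written to partition {} at offset {}", result.partition, result.offset);

    // One round trip for the whole batch.
    let batch = vec![
        ProducerMessage::new("events", "a"),
        ProducerMessage::new("events", "b").with_header("trace-id", "t-123"),
    ];
    let results = producer.send_batch(batch).await?;
    assert_eq!(results.len(), 2);
    Ok(())
}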
24
crates/sq-sdk/src/connection.rs
Normal file
@@ -0,0 +1,24 @@
use crate::error::SqError;

/// Manages a gRPC channel to an SQ server.
#[derive(Clone)]
pub struct GrpcConnection {
    channel: tonic::transport::Channel,
}

impl GrpcConnection {
    /// Connect to an SQ server at the given address (e.g., "http://127.0.0.1:6060").
    pub async fn connect(address: &str) -> Result<Self, SqError> {
        let channel = tonic::transport::Channel::from_shared(address.to_string())
            .map_err(|e| SqError::Connection(e.to_string()))?
            .connect()
            .await?;

        Ok(Self { channel })
    }

    /// Get the underlying tonic channel.
    pub fn channel(&self) -> tonic::transport::Channel {
        self.channel.clone()
    }
}
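A small sketch of the intended use of `channel()`; it assumes a server on the default gRPC address:

use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_sdk::GrpcConnection;

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    let conn = GrpcConnection::connect("http://127.0.0.1:6060").await?;

    // tonic channels are cheap to clone, so one connection can back
    // several generated clients.
    let _data_plane = DataPlaneServiceClient::new(conn.channel());
    let _another = DataPlaneServiceClient::new(conn.channel());
    Ok(())
}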
154
crates/sq-sdk/src/consumer.rs
Normal file
@@ -0,0 +1,154 @@
use sq_grpc_interface::{
    data_plane_service_client::DataPlaneServiceClient, AckRequest, ConsumedMessage,
    SubscribeRequest, SubscribeResponse,
};
use tokio_stream::StreamExt;

use crate::connection::GrpcConnection;
use crate::error::SqError;

/// Configuration for an SQ gRPC consumer.
pub struct GrpcConsumerConfig {
    /// Server address (e.g., "http://127.0.0.1:6060").
    pub address: String,
    /// Consumer group name.
    pub consumer_group: String,
    /// Topic to consume from.
    pub topic: String,
    /// Partition to consume from.
    pub partition: u32,
    /// Whether to automatically commit offsets.
    pub auto_commit: bool,
    /// Maximum number of messages to receive per batch.
    pub max_poll_records: u32,
}

impl Default for GrpcConsumerConfig {
    fn default() -> Self {
        Self {
            address: "http://127.0.0.1:6060".to_string(),
            consumer_group: "default".to_string(),
            topic: String::new(),
            partition: 0,
            auto_commit: true,
            max_poll_records: 100,
        }
    }
}

/// A message consumed from SQ.
#[derive(Debug, Clone)]
pub struct ReceivedMessage {
    pub offset: u64,
    pub topic: String,
    pub partition: u32,
    pub key: Vec<u8>,
    pub value: Vec<u8>,
    pub headers: Vec<(String, Vec<u8>)>,
    pub timestamp_ms: u64,
}

impl From<ConsumedMessage> for ReceivedMessage {
    fn from(m: ConsumedMessage) -> Self {
        Self {
            offset: m.offset,
            topic: m.topic,
            partition: m.partition,
            key: m.key,
            value: m.value,
            headers: m.headers.into_iter().map(|h| (h.key, h.value)).collect(),
            timestamp_ms: m.timestamp_ms,
        }
    }
}

/// SQ gRPC consumer client. Receives messages from an SQ server via streaming.
pub struct GrpcConsumer {
    client: DataPlaneServiceClient<tonic::transport::Channel>,
    config: GrpcConsumerConfig,
    stream: Option<tonic::Streaming<SubscribeResponse>>,
    last_offset: Option<u64>,
}

impl GrpcConsumer {
    /// Connect to an SQ server and create a new consumer.
    pub async fn connect(config: GrpcConsumerConfig) -> Result<Self, SqError> {
        let conn = GrpcConnection::connect(&config.address).await?;
        let client = DataPlaneServiceClient::new(conn.channel());

        Ok(Self {
            client,
            config,
            stream: None,
            last_offset: None,
        })
    }

    /// Poll for new messages. Establishes the subscription stream on first call.
    /// Returns an empty vec if no messages are available yet.
    pub async fn poll(&mut self) -> Result<Vec<ReceivedMessage>, SqError> {
        // Establish stream if not yet connected.
        if self.stream.is_none() {
            let response = self
                .client
                .subscribe(tonic::Request::new(SubscribeRequest {
                    topic: self.config.topic.clone(),
                    partition: self.config.partition,
                    consumer_group: self.config.consumer_group.clone(),
                    start_offset: None, // Uses committed offset if consumer group set.
                    max_batch_size: self.config.max_poll_records,
                }))
                .await?;

            self.stream = Some(response.into_inner());
        }

        let stream = self.stream.as_mut().unwrap();

        match stream.next().await {
            Some(Ok(response)) => {
                let messages: Vec<ReceivedMessage> = response
                    .messages
                    .into_iter()
                    .map(ReceivedMessage::from)
                    .collect();

                if let Some(last) = messages.last() {
                    self.last_offset = Some(last.offset);

                    // Auto-commit if enabled.
                    if self.config.auto_commit {
                        // Best-effort commit; don't fail the poll on commit error.
                        let _ = self.commit_internal(last.offset).await;
                    }
                }

                Ok(messages)
            }
            Some(Err(status)) => Err(SqError::from(status)),
            None => {
                // Stream ended - reset so next poll reconnects.
                self.stream = None;
                Ok(vec![])
            }
        }
    }

    /// Manually commit an offset for this consumer's group/topic/partition.
    pub async fn commit(&mut self, offset: u64) -> Result<(), SqError> {
        self.commit_internal(offset).await
    }

    async fn commit_internal(&mut self, offset: u64) -> Result<(), SqError> {
        self.client
            .ack(tonic::Request::new(AckRequest {
                consumer_group: self.config.consumer_group.clone(),
                topic: self.config.topic.clone(),
                partition: self.config.partition,
                offset,
            }))
            .await?;

        Ok(())
    }
}
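A parallel sketch for the gRPC transport, again assuming a running server and an "events" topic:

use sq_sdk::{GrpcConsumer, GrpcConsumerConfig};

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
        topic: "events".to_string(),
        consumer_group: "demo-group".to_string(),
        ..Default::default()
    })
    .await?;

    // auto_commit defaults to true, so each non-empty poll commits the
    // last offset in the batch on a best-effort basis.
    for _ in 0..10 {
        for msg in consumer.poll().await? {
            println!("{}@{}", msg.topic, msg.offset);
        }
    }
    Ok(())
}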
34
crates/sq-sdk/src/error.rs
Normal file
@@ -0,0 +1,34 @@
#[derive(Debug, thiserror::Error)]
pub enum SqError {
    #[error("connection failed: {0}")]
    Connection(String),

    #[error("server error: {0}")]
    Server(String),

    #[error("invalid argument: {0}")]
    InvalidArgument(String),

    #[error("not found: {0}")]
    NotFound(String),

    #[error("stream ended")]
    StreamEnded,
}

impl From<tonic::Status> for SqError {
    fn from(status: tonic::Status) -> Self {
        match status.code() {
            tonic::Code::InvalidArgument => SqError::InvalidArgument(status.message().to_string()),
            tonic::Code::NotFound => SqError::NotFound(status.message().to_string()),
            tonic::Code::Unavailable => SqError::Connection(status.message().to_string()),
            _ => SqError::Server(format!("{}: {}", status.code(), status.message())),
        }
    }
}

impl From<tonic::transport::Error> for SqError {
    fn from(err: tonic::transport::Error) -> Self {
        SqError::Connection(err.to_string())
    }
}
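The variant split above is what callers branch on. A hedged sketch of one possible caller-side retry policy; the policy itself is an assumption, not something the SDK prescribes:

use sq_sdk::SqError;

/// Illustrative only: which SqError variants a caller might treat as transient.
fn is_retryable(err: &SqError) -> bool {
    match err {
        // Transport-level failures may be transient.
        SqError::Connection(_) | SqError::StreamEnded => true,
        // The request itself is bad; retrying won't help.
        SqError::InvalidArgument(_) | SqError::NotFound(_) => false,
        // Server-side errors are ambiguous; treated as non-retryable here.
        SqError::Server(_) => false,
    }
}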
28
crates/sq-sdk/src/lib.rs
Normal file
@@ -0,0 +1,28 @@
pub mod batch_producer;
pub mod capnp_batch_producer;
pub mod capnp_connection;
pub mod capnp_consumer;
pub mod capnp_producer;
pub mod connection;
pub mod consumer;
pub mod error;
pub mod producer;
pub mod types;

// Default (capnp) types — these are the primary SDK interface.
pub use capnp_batch_producer::{BatchProducer, BatchProducerConfig};
pub use capnp_connection::Connection;
pub use capnp_consumer::{Consumer, ConsumerConfig};
pub use capnp_producer::{Producer, ProducerConfig};
pub use types::AckMode;

// gRPC types (available but not the default transport).
pub use batch_producer::{GrpcBatchProducer, GrpcBatchProducerConfig};
pub use connection::GrpcConnection;
pub use consumer::{GrpcConsumer, GrpcConsumerConfig};
pub use producer::{GrpcProducer, GrpcProducerConfig};

// Shared types used by both transports.
pub use consumer::ReceivedMessage;
pub use error::SqError;
pub use producer::{ProducerMessage, SendResult};
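The re-exports above define the intended import surface; a short illustrative snippet of how downstream code would use it without touching module paths:

// Illustrative only: the capnp transport is the default import surface.
#[allow(unused_imports)]
use sq_sdk::{AckMode, BatchProducer, Consumer, Producer, ProducerMessage, SqError};
// The gRPC transport is opt-in via the Grpc-prefixed names.
#[allow(unused_imports)]
use sq_sdk::{GrpcConsumer, GrpcProducer};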
143
crates/sq-sdk/src/producer.rs
Normal file
@@ -0,0 +1,143 @@
use sq_grpc_interface::{
    data_plane_service_client::DataPlaneServiceClient, AckMode, MessageHeader, PublishMessage,
    PublishRequest, PublishSettings,
};

use crate::connection::GrpcConnection;
use crate::error::SqError;

/// Configuration for an SQ gRPC producer.
pub struct GrpcProducerConfig {
    /// Server address (e.g., "http://127.0.0.1:6060").
    pub address: String,
    /// Default ack mode for publish requests.
    pub ack_mode: AckMode,
    /// Producer identifier.
    pub producer_id: String,
}

impl Default for GrpcProducerConfig {
    fn default() -> Self {
        Self {
            address: "http://127.0.0.1:6060".to_string(),
            ack_mode: AckMode::All,
            producer_id: "default".to_string(),
        }
    }
}

/// Result of sending a single message.
#[derive(Debug, Clone)]
pub struct SendResult {
    pub topic: String,
    pub partition: u32,
    pub offset: u64,
}

/// A message to be sent by the producer.
pub struct ProducerMessage {
    pub topic: String,
    pub key: Option<Vec<u8>>,
    pub value: Vec<u8>,
    pub headers: Vec<(String, Vec<u8>)>,
}

impl ProducerMessage {
    pub fn new(topic: impl Into<String>, value: impl Into<Vec<u8>>) -> Self {
        Self {
            topic: topic.into(),
            key: None,
            value: value.into(),
            headers: Vec::new(),
        }
    }

    pub fn with_key(mut self, key: impl Into<Vec<u8>>) -> Self {
        self.key = Some(key.into());
        self
    }

    pub fn with_header(mut self, key: impl Into<String>, value: impl Into<Vec<u8>>) -> Self {
        self.headers.push((key.into(), value.into()));
        self
    }
}

/// SQ gRPC producer client. Sends messages to an SQ server.
pub struct GrpcProducer {
    client: DataPlaneServiceClient<tonic::transport::Channel>,
    config: GrpcProducerConfig,
}

impl GrpcProducer {
    /// Connect to an SQ server and create a new producer.
    pub async fn connect(config: GrpcProducerConfig) -> Result<Self, SqError> {
        let conn = GrpcConnection::connect(&config.address).await?;
        let client = DataPlaneServiceClient::new(conn.channel());

        Ok(Self { client, config })
    }

    /// Send a single message.
    pub async fn send(
        &mut self,
        topic: &str,
        key: Option<&[u8]>,
        value: &[u8],
    ) -> Result<SendResult, SqError> {
        let results = self
            .send_batch(vec![ProducerMessage {
                topic: topic.to_string(),
                key: key.map(|k| k.to_vec()),
                value: value.to_vec(),
                headers: Vec::new(),
            }])
            .await?;

        Ok(results.into_iter().next().unwrap())
    }

    /// Send a batch of messages.
    pub async fn send_batch(
        &mut self,
        messages: Vec<ProducerMessage>,
    ) -> Result<Vec<SendResult>, SqError> {
        let publish_messages: Vec<PublishMessage> = messages
            .into_iter()
            .map(|m| PublishMessage {
                topic: m.topic,
                key: m.key.unwrap_or_default(),
                value: m.value,
                headers: m
                    .headers
                    .into_iter()
                    .map(|(k, v)| MessageHeader { key: k, value: v })
                    .collect(),
            })
            .collect();

        let response = self
            .client
            .publish(tonic::Request::new(PublishRequest {
                messages: publish_messages,
                settings: Some(PublishSettings {
                    ack_mode: self.config.ack_mode.into(),
                }),
                producer_id: self.config.producer_id.clone(),
            }))
            .await?;

        let results = response
            .into_inner()
            .results
            .into_iter()
            .map(|r| SendResult {
                topic: r.topic,
                partition: r.partition,
                offset: r.offset,
            })
            .collect();

        Ok(results)
    }
}
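A sketch combining the `ProducerMessage` builder with the gRPC producer; the address and topic are illustrative assumptions:

use sq_sdk::{GrpcProducer, GrpcProducerConfig, ProducerMessage};

#[tokio::main]
async fn main() -> Result<(), sq_sdk::SqError> {
    let mut producer = GrpcProducer::connect(GrpcProducerConfig::default()).await?;

    // The builder keeps the common case short while allowing key/headers.
    let msg = ProducerMessage::new("events", r#"{"kind":"signup"}"#)
        .with_key("user-42")
        .with_header("content-type", "application/json");

    let results = producer.send_batch(vec![msg]).await?;
    println!("offset {}", results[0].offset);
    Ok(())
}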
37
crates/sq-sdk/src/types.rs
Normal file
@@ -0,0 +1,37 @@
/// Acknowledgment mode for publish requests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AckMode {
    /// Wait for all replicas to acknowledge.
    All,
    /// Wait for the local replica only.
    Local,
    /// Fire-and-forget, no acknowledgment.
    None,
}

impl AckMode {
    /// Convert to the wire `u8` used by the capnp protocol.
    pub fn to_capnp_u8(self) -> u8 {
        match self {
            AckMode::All => 1,
            AckMode::Local => 2,
            AckMode::None => 3,
        }
    }
}

impl From<AckMode> for sq_grpc_interface::AckMode {
    fn from(mode: AckMode) -> Self {
        match mode {
            AckMode::All => sq_grpc_interface::AckMode::All,
            AckMode::Local => sq_grpc_interface::AckMode::Local,
            AckMode::None => sq_grpc_interface::AckMode::None,
        }
    }
}

impl Default for AckMode {
    fn default() -> Self {
        AckMode::All
    }
}
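The wire values are load-bearing: the server's publish handler checks `ack_mode == 3` for fire-and-forget. A small test along these lines (not in the diff) would pin them:

#[cfg(test)]
mod tests {
    use super::AckMode;

    // Pins the capnp wire values so they cannot drift from the server's
    // ack_mode check in publish.rs.
    #[test]
    fn capnp_wire_values_are_stable() {
        assert_eq!(AckMode::All.to_capnp_u8(), 1);
        assert_eq!(AckMode::Local.to_capnp_u8(), 2);
        assert_eq!(AckMode::None.to_capnp_u8(), 3);
    }
}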
@@ -4,12 +4,17 @@ version.workspace = true
edition.workspace = true

[dependencies]
sq-capnp-interface = { workspace = true }
sq-grpc-interface = { workspace = true }
sq-models = { workspace = true }
sq-storage = { workspace = true }
sq-cluster = { workspace = true }
sq-sim = { workspace = true }

capnp = { workspace = true }
bytes = { workspace = true }
futures = { workspace = true }

anyhow = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
@@ -24,3 +29,15 @@ axum = { workspace = true }
tower = { workspace = true }
tower-http = { workspace = true }
http = { workspace = true }
tokio-util = { workspace = true }
tokio-stream = { workspace = true }
async-stream = { workspace = true }
tracing-opentelemetry = { workspace = true }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true }
opentelemetry-otlp = { workspace = true }
opentelemetry-semantic-conventions = { workspace = true }

[dev-dependencies]
tempfile = { workspace = true }
sq-sdk = { workspace = true }
50
crates/sq-server/src/capnp/ack.rs
Normal file
@@ -0,0 +1,50 @@
use sq_capnp_interface::codec::{self, Frame, OP_ACK_RES};
use sq_capnp_interface::data_plane_capnp;

use crate::metrics;
use crate::state::State;

pub async fn handle(state: &State, payload: &[u8]) -> Frame {
    match handle_inner(state, payload) {
        Ok(frame) => frame,
        Err(e) => codec::error_frame(&e),
    }
}

fn handle_inner(state: &State, payload: &[u8]) -> Result<Frame, String> {
    let reader = codec::read_capnp(payload).map_err(|e| format!("decode error: {e}"))?;
    let req = reader
        .get_root::<data_plane_capnp::ack_request::Reader>()
        .map_err(|e| format!("schema error: {e}"))?;

    let consumer_group = req
        .get_consumer_group()
        .map_err(|e| format!("schema error: {e}"))?
        .to_str()
        .map_err(|e| format!("utf8 error: {e}"))?;
    let topic = req
        .get_topic()
        .map_err(|e| format!("schema error: {e}"))?
        .to_str()
        .map_err(|e| format!("utf8 error: {e}"))?;
    let partition = req.get_partition();
    let offset = req.get_offset();

    if consumer_group.is_empty() {
        return Err("consumer_group must not be empty".to_string());
    }
    if topic.is_empty() {
        return Err("topic must not be empty".to_string());
    }

    state
        .engine
        .commit_offset(consumer_group, topic, partition, offset)
        .map_err(|e| format!("commit error: {e}"))?;

    metrics::record_ack(topic);

    let mut builder = capnp::message::Builder::new_default();
    builder.init_root::<data_plane_capnp::ack_response::Builder>();
    Ok(codec::build_frame(OP_ACK_RES, &builder))
}
54
crates/sq-server/src/capnp/handler.rs
Normal file
@@ -0,0 +1,54 @@
use futures::SinkExt;
use sq_capnp_interface::codec::{SqCodec, OP_ACK_REQ, OP_PUBLISH_REQ, OP_SUBSCRIBE_REQ};
use tokio::net::TcpStream;
use tokio_stream::StreamExt;
use tokio_util::codec::Framed;
use tokio_util::sync::CancellationToken;

use crate::state::State;

use super::{ack, publish, subscribe};

pub async fn handle_connection(
    stream: TcpStream,
    state: State,
    cancel: CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    stream.set_nodelay(true)?;
    let mut framed = Framed::new(stream, SqCodec::new());

    loop {
        let frame = tokio::select! {
            result = framed.next() => {
                match result {
                    Some(Ok(frame)) => frame,
                    Some(Err(e)) => return Err(e.into()),
                    None => return Ok(()), // connection closed
                }
            }
            () = cancel.cancelled() => return Ok(()),
        };

        match frame.opcode {
            OP_PUBLISH_REQ => {
                let response = publish::handle(&state, &frame.payload).await;
                framed.send(response).await?;
            }
            OP_SUBSCRIBE_REQ => {
                // Subscribe borrows the framed stream mutably to write a
                // sequence of response frames.
                subscribe::handle(&state, &frame.payload, &mut framed, &cancel).await?;
                // After subscribe ends, the connection is done.
                return Ok(());
            }
            OP_ACK_REQ => {
                let response = ack::handle(&state, &frame.payload).await;
                framed.send(response).await?;
            }
            other => {
                let response =
                    sq_capnp_interface::codec::error_frame(&format!("unknown opcode: 0x{other:02x}"));
                framed.send(response).await?;
            }
        }
    }
}
58
crates/sq-server/src/capnp/mod.rs
Normal file
@@ -0,0 +1,58 @@
mod ack;
mod handler;
mod publish;
mod subscribe;

use std::net::SocketAddr;

use notmad::{Component, ComponentInfo, MadError};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;

use crate::state::State;

pub struct CapnpServer {
    pub host: SocketAddr,
    pub state: State,
}

impl Component for CapnpServer {
    fn info(&self) -> ComponentInfo {
        "sq-server/capnp".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        let listener = TcpListener::bind(self.host)
            .await
            .map_err(|e| MadError::Inner(e.into()))?;

        tracing::info!(addr = %self.host, "capnp data plane listening");

        loop {
            tokio::select! {
                result = listener.accept() => {
                    match result {
                        Ok((stream, addr)) => {
                            let state = self.state.clone();
                            let cancel = cancellation_token.clone();
                            tokio::spawn(async move {
                                if let Err(e) = handler::handle_connection(stream, state, cancel).await {
                                    tracing::debug!(peer = %addr, error = %e, "capnp connection ended");
                                }
                            });
                        }
                        Err(e) => {
                            tracing::warn!(error = %e, "capnp accept error");
                        }
                    }
                }
                () = cancellation_token.cancelled() => {
                    tracing::info!("capnp server shutting down");
                    break;
                }
            }
        }

        Ok(())
    }
}
138
crates/sq-server/src/capnp/publish.rs
Normal file
@@ -0,0 +1,138 @@
use sq_capnp_interface::codec::{self, Frame, OP_PUBLISH_RES};
use sq_capnp_interface::data_plane_capnp;

use crate::metrics;
use crate::pipeline::PipelineMessage;
use crate::state::State;

pub async fn handle(state: &State, payload: &[u8]) -> Frame {
    match handle_inner(state, payload).await {
        Ok(frame) => frame,
        Err(e) => codec::error_frame(&e),
    }
}

/// Decode the capnp payload into owned pipeline messages. This is sync (no .await)
/// so the capnp Reader (which is !Send) does not live across an await boundary.
fn decode_request(payload: &[u8]) -> Result<(Vec<PipelineMessage>, u8), String> {
    let reader = codec::read_capnp(payload).map_err(|e| format!("decode error: {e}"))?;
    let req = reader
        .get_root::<data_plane_capnp::publish_request::Reader>()
        .map_err(|e| format!("schema error: {e}"))?;

    let messages = req
        .get_messages()
        .map_err(|e| format!("schema error: {e}"))?;
    if messages.len() == 0 {
        return Err("messages must not be empty".to_string());
    }

    let ack_mode = req.get_ack_mode();
    let timestamp_ms = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_millis() as u64;

    let mut pipeline_msgs = Vec::with_capacity(messages.len() as usize);
    for i in 0..messages.len() {
        let msg = messages.get(i);
        let topic = msg
            .get_topic()
            .map_err(|e| format!("schema error: {e}"))?
            .to_string()
            .map_err(|e| format!("utf8 error: {e}"))?;
        if topic.is_empty() {
            return Err("topic must not be empty".to_string());
        }

        let key = msg
            .get_key()
            .map_err(|e| format!("schema error: {e}"))?
            .to_vec();
        let value = msg
            .get_value()
            .map_err(|e| format!("schema error: {e}"))?
            .to_vec();
        let headers_reader = msg
            .get_headers()
            .map_err(|e| format!("schema error: {e}"))?;

        let mut headers = Vec::with_capacity(headers_reader.len() as usize);
        for j in 0..headers_reader.len() {
            let h = headers_reader.get(j);
            let hkey = h
                .get_key()
                .map_err(|e| format!("schema error: {e}"))?
                .to_string()
                .map_err(|e| format!("utf8 error: {e}"))?;
            let hval = h
                .get_value()
                .map_err(|e| format!("schema error: {e}"))?
                .to_vec();
            headers.push(sq_models::Header {
                key: hkey,
                value: hval,
            });
        }

        pipeline_msgs.push(PipelineMessage {
            topic,
            partition: 0,
            key,
            value,
            headers,
            timestamp_ms,
        });
    }

    Ok((pipeline_msgs, ack_mode))
}

async fn handle_inner(state: &State, payload: &[u8]) -> Result<Frame, String> {
    let (pipeline_msgs, ack_mode) = decode_request(payload)?;

    let start = std::time::Instant::now();
    let first_topic = pipeline_msgs
        .first()
        .map(|m| m.topic.clone())
        .unwrap_or_default();

    // ACK mode 3 = None (fire and forget)
    if ack_mode == 3 {
        let result_count = pipeline_msgs.len();
        state
            .pipeline
            .submit_fire_and_forget(pipeline_msgs)
            .await;
        metrics::record_messages_published(&first_topic, result_count as u64);
        metrics::record_publish_duration(&first_topic, start);
        return Ok(build_publish_response(&[]));
    }

    // Standard ack mode — submit and wait.
    let results = state
        .pipeline
        .submit(pipeline_msgs)
        .await
        .map_err(|e| format!("pipeline error: {e}"))?;

    metrics::record_messages_published(&first_topic, results.len() as u64);
    metrics::record_publish_duration(&first_topic, start);

    Ok(build_publish_response(&results))
}

fn build_publish_response(results: &[crate::pipeline::PipelineResult]) -> Frame {
    let mut builder = capnp::message::Builder::new_default();
    {
        let resp = builder.init_root::<data_plane_capnp::publish_response::Builder>();
        let mut res_list = resp.init_results(results.len() as u32);
        for (i, r) in results.iter().enumerate() {
            let mut entry = res_list.reborrow().get(i as u32);
            entry.set_topic(&r.topic[..]);
            entry.set_partition(r.partition);
            entry.set_offset(r.offset);
        }
    }
    codec::build_frame(OP_PUBLISH_RES, &builder)
}
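The decode-then-await split in `decode_request` generalizes beyond capnp. A self-contained sketch of the same pattern with a hypothetical stand-in parser type (not a real capnp Reader):

use std::rc::Rc;

// Rc makes this parser !Send, like a capnp message::Reader.
struct NonSendParser {
    raw: Rc<Vec<u8>>,
}

impl NonSendParser {
    fn new(payload: &[u8]) -> Self {
        Self { raw: Rc::new(payload.to_vec()) }
    }
    fn first_byte(&self) -> Option<u8> {
        self.raw.first().copied()
    }
}

// Sync decode: the !Send parser lives and dies inside this function...
fn decode(payload: &[u8]) -> Option<u8> {
    NonSendParser::new(payload).first_byte()
}

// ...so the async handler only holds owned, Send data across .await,
// which keeps the future Send and spawnable with tokio::spawn.
async fn handle(payload: Vec<u8>) -> Option<u8> {
    let decoded = decode(&payload);
    tokio::task::yield_now().await;
    decoded
}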
113
crates/sq-server/src/capnp/subscribe.rs
Normal file
@@ -0,0 +1,113 @@
use bytes::Bytes;
use futures::SinkExt;
use sq_capnp_interface::codec::{self, Frame, SqCodec, OP_SUBSCRIBE_END, OP_SUBSCRIBE_RES};
use sq_capnp_interface::data_plane_capnp;
use tokio::net::TcpStream;
use tokio_util::codec::Framed;
use tokio_util::sync::CancellationToken;

use crate::metrics;
use crate::state::State;

pub async fn handle(
    state: &State,
    payload: &[u8],
    framed: &mut Framed<TcpStream, SqCodec>,
    cancel: &CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let reader = codec::read_capnp(payload)?;
    let req = reader.get_root::<data_plane_capnp::subscribe_request::Reader>()?;

    let topic = req.get_topic()?.to_string()?;
    if topic.is_empty() {
        let err = codec::error_frame("topic must not be empty");
        framed.send(err).await?;
        return Ok(());
    }

    let partition = req.get_partition();
    let consumer_group = req.get_consumer_group()?.to_string()?;
    let batch_size = if req.get_max_batch_size() == 0 {
        100
    } else {
        req.get_max_batch_size() as usize
    };

    let start_offset = if req.get_has_start_offset() {
        req.get_start_offset()
    } else if !consumer_group.is_empty() {
        state
            .engine
            .get_committed_offset(&consumer_group, &topic, partition)
            .unwrap_or(0)
    } else {
        0
    };

    let mut current_offset = start_offset;

    loop {
        if cancel.is_cancelled() {
            break;
        }

        let messages = state
            .engine
            .read(&topic, partition, current_offset, batch_size)
            .map_err(|e| format!("read error: {e}"))?;

        if messages.is_empty() {
            tokio::select! {
                () = tokio::time::sleep(tokio::time::Duration::from_millis(100)) => continue,
                () = cancel.cancelled() => break,
            }
        }

        let frame = build_subscribe_response(&messages, &mut current_offset);
        metrics::record_messages_consumed(&topic, messages.len() as u64);
        metrics::record_subscribe_batch(&topic);

        if framed.send(frame).await.is_err() {
            // Client disconnected.
            return Ok(());
        }
    }

    // Send end-of-stream sentinel.
    let end = Frame {
        opcode: OP_SUBSCRIBE_END,
        payload: Bytes::new(),
    };
    let _ = framed.send(end).await;

    Ok(())
}

fn build_subscribe_response(
    messages: &[sq_models::Message],
    current_offset: &mut u64,
) -> Frame {
    let mut builder = capnp::message::Builder::new_default();
    {
        let resp = builder.init_root::<data_plane_capnp::subscribe_response::Builder>();
        let mut msg_list = resp.init_messages(messages.len() as u32);
        for (i, m) in messages.iter().enumerate() {
            *current_offset = m.offset + 1;
            let mut entry = msg_list.reborrow().get(i as u32);
            entry.set_offset(m.offset);
            entry.set_topic(m.topic.as_str());
            entry.set_partition(m.partition);
            entry.set_key(m.key.as_deref().unwrap_or(&[]));
            entry.set_value(&m.value);
            entry.set_timestamp_ms(m.timestamp_ms);

            let mut headers = entry.init_headers(m.headers.len() as u32);
            for (j, h) in m.headers.iter().enumerate() {
                let mut hdr = headers.reborrow().get(j as u32);
                hdr.set_key(&h.key[..]);
                hdr.set_value(&h.value);
            }
        }
    }
    codec::build_frame(OP_SUBSCRIBE_RES, &builder)
}
129
crates/sq-server/src/cli.rs
Normal file
@@ -0,0 +1,129 @@
use std::path::PathBuf;
use std::sync::Arc;

use clap::{Parser, Subcommand};
use sq_models::SyncPolicy;
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::{S3Config, S3ObjectStore};

use crate::pipeline::WritePipeline;
use crate::state::{Config, State};

mod serve;
use serve::*;

#[derive(Parser)]
#[command(author, version, about = "SQ - Stored Queue Server", long_about = None, subcommand_required = true)]
struct Command {
    #[command(subcommand)]
    command: Commands,

    #[arg(long, env = "SQ_NODE_ID", default_value = "node-1")]
    node_id: String,

    #[arg(long, env = "SQ_DATA_DIR", default_value = "./data")]
    data_dir: PathBuf,

    #[arg(long, env = "SQ_SEEDS", value_delimiter = ',')]
    seeds: Vec<String>,

    #[arg(long, env = "SQ_CLUSTER_ID", default_value = "default")]
    cluster_id: String,

    #[arg(long, env = "SQ_S3_BUCKET")]
    s3_bucket: Option<String>,

    #[arg(long, env = "SQ_S3_ENDPOINT")]
    s3_endpoint: Option<String>,

    #[arg(long, env = "SQ_S3_REGION")]
    s3_region: Option<String>,

    /// Fsync policy: "every-batch" (default), "none", or interval in ms (e.g. "200").
    #[arg(long, env = "SQ_SYNC_POLICY", default_value = "every-batch")]
    sync_policy: String,
}

#[derive(Subcommand)]
enum Commands {
    Serve(ServeCommand),
}

impl Commands {
    fn grpc_address(&self) -> String {
        match self {
            Commands::Serve(cmd) => cmd.grpc_host.to_string(),
        }
    }

    async fn execute(&self, state: &State, pipeline: WritePipeline) -> anyhow::Result<()> {
        match self {
            Commands::Serve(cmd) => cmd.execute(state, pipeline).await,
        }
    }
}

pub async fn execute() -> anyhow::Result<()> {
    let cli = Command::parse();
    tracing::debug!("starting sq-server");

    let sync_policy = match cli.sync_policy.as_str() {
        "every-batch" => SyncPolicy::EveryBatch,
        "none" => SyncPolicy::None,
        ms => {
            let millis: u64 = ms
                .parse()
                .map_err(|_| anyhow::anyhow!("invalid sync_policy: expected 'every-batch', 'none', or interval in ms, got '{ms}'"))?;
            SyncPolicy::Interval(std::time::Duration::from_millis(millis))
        }
    };

    let config = Config {
        node_id: cli.node_id,
        data_dir: cli.data_dir,
        seeds: cli.seeds,
        grpc_address: cli.command.grpc_address(),
        cluster_id: cli.cluster_id,
        s3_bucket: cli.s3_bucket,
        s3_endpoint: cli.s3_endpoint,
        s3_region: cli.s3_region,
        sync_policy,
    };
    let (mut state, pipeline) = State::new(config)?;

    // Set up S3 reader if S3 is configured.
    if let Some(bucket) = &state.config.s3_bucket {
        let s3_config = S3Config {
            bucket: bucket.clone(),
            region: state
                .config
                .s3_region
                .clone()
                .unwrap_or_else(|| "us-east-1".to_string()),
            endpoint: state.config.s3_endpoint.clone(),
            access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
            secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
            allow_http: state.config.s3_endpoint.is_some(),
        };

        match S3ObjectStore::new(s3_config) {
            Ok(store) => {
                let cache_dir = state.config.data_dir.join(".s3-cache");
                let reader = ObjectStoreReader::new(
                    Arc::new(RealFileSystem),
                    Arc::new(store),
                    cache_dir,
                );
                state.s3_reader = Some(Arc::new(reader));
            }
            Err(e) => {
                tracing::warn!(error = %e, "failed to initialize S3 reader");
            }
        }
    }

    cli.command.execute(&state, pipeline).await?;

    Ok(())
}
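The SQ_SYNC_POLICY flag accepts exactly three forms, as documented on the arg above. A test sketch pinning that parse rule (not part of the diff; it mirrors the match in `execute` rather than calling it):

#[cfg(test)]
mod sync_policy_tests {
    use sq_models::SyncPolicy;
    use std::time::Duration;

    // Mirrors the sync_policy match in execute(), for illustration.
    fn parse(s: &str) -> anyhow::Result<SyncPolicy> {
        Ok(match s {
            "every-batch" => SyncPolicy::EveryBatch,
            "none" => SyncPolicy::None,
            ms => SyncPolicy::Interval(Duration::from_millis(ms.parse()?)),
        })
    }

    #[test]
    fn accepts_the_three_documented_forms() {
        assert!(matches!(parse("every-batch").unwrap(), SyncPolicy::EveryBatch));
        assert!(matches!(parse("none").unwrap(), SyncPolicy::None));
        assert!(matches!(
            parse("200").unwrap(),
            SyncPolicy::Interval(d) if d == Duration::from_millis(200)
        ));
        assert!(parse("fast").is_err());
    }
}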
173
crates/sq-server/src/cli/serve.rs
Normal file
@@ -0,0 +1,173 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

use notmad::{Component, ComponentInfo, MadError};
use sq_cluster::membership::{Membership, MembershipConfig};
use sq_storage::object_store::s3::{S3Config, S3ObjectStore};
use tokio_util::sync::CancellationToken;

use sq_models::SyncPolicy;

use crate::pipeline::WritePipeline;
use crate::shipper::BackgroundShipper;
use crate::sync_task::BackgroundSync;
use crate::{capnp::CapnpServer, grpc, servehttp::ServeHttp, state::State};

/// Wraps the WritePipeline as a notmad Component.
struct PipelineComponent {
    pipeline: std::sync::Mutex<Option<WritePipeline>>,
}

impl Component for PipelineComponent {
    fn info(&self) -> ComponentInfo {
        "sq-server/pipeline".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        let mut pipeline = self
            .pipeline
            .lock()
            .unwrap()
            .take()
            .expect("pipeline already taken");

        tokio::select! {
            () = pipeline.run() => {}
            () = cancellation_token.cancelled() => {}
        }

        Ok(())
    }
}

#[derive(clap::Parser)]
pub struct ServeCommand {
    #[arg(long, env = "SQ_GRPC_HOST", default_value = "127.0.0.1:6060")]
    pub(crate) grpc_host: SocketAddr,

    #[arg(long, env = "SQ_HTTP_HOST", default_value = "127.0.0.1:6062")]
    http_host: SocketAddr,

    #[arg(long, env = "SQ_CAPNP_HOST", default_value = "127.0.0.1:6064")]
    capnp_host: SocketAddr,
}

impl ServeCommand {
    pub async fn execute(&self, state: &State, pipeline: WritePipeline) -> anyhow::Result<()> {
        tracing::info!(
            node_id = %state.config.node_id,
            grpc = %self.grpc_host,
            http = %self.http_host,
            capnp = %self.capnp_host,
            seeds = ?state.config.seeds,
            "starting sq-server"
        );

        let membership = Arc::new(Membership::new(MembershipConfig {
            node_id: state.config.node_id.clone(),
            address: state.config.grpc_address.clone(),
            seeds: state.config.seeds.clone(),
            ..Default::default()
        }));

        // Optionally set up S3 background shipper.
        let shipper = if let Some(bucket) = &state.config.s3_bucket {
            let s3_config = S3Config {
                bucket: bucket.clone(),
                region: state
                    .config
                    .s3_region
                    .clone()
                    .unwrap_or_else(|| "us-east-1".to_string()),
                endpoint: state.config.s3_endpoint.clone(),
                access_key_id: std::env::var("AWS_ACCESS_KEY_ID").ok(),
                secret_access_key: std::env::var("AWS_SECRET_ACCESS_KEY").ok(),
                allow_http: state.config.s3_endpoint.is_some(),
            };

            match S3ObjectStore::new(s3_config) {
                Ok(store) => {
                    tracing::info!(
                        bucket = %bucket,
                        cluster_id = %state.config.cluster_id,
                        "S3 background shipper enabled"
                    );
                    let store = Arc::new(store);

                    Some(BackgroundShipper::new(
                        state.clone(),
                        store,
                        state.config.cluster_id.clone(),
                        Duration::from_secs(30),
                    ))
                }
                Err(e) => {
                    tracing::warn!(error = %e, "failed to initialize S3 object store, shipper disabled");
                    None
                }
            }
        } else {
            None
        };

        // Optionally set up background sync task for Interval sync policy.
        let background_sync = if let SyncPolicy::Interval(interval) = &state.config.sync_policy {
            tracing::info!(?interval, "background sync enabled");
            Some(BackgroundSync::new(state.engine.clone(), *interval))
        } else {
            None
        };

        // Build the component set. We use match to handle optional components
        // without storing temporaries (Mad::builder() returns a temporary).
        match (shipper, background_sync) {
            (Some(shipper), Some(sync)) => {
                notmad::Mad::builder()
                    .add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
                    .add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
                    .add(CapnpServer { host: self.capnp_host, state: state.clone() })
                    .add(ServeHttp { host: self.http_host })
                    .add(state.drop_queue.clone())
                    .add(shipper)
                    .add(sync)
                    .run()
                    .await?;
            }
            (Some(shipper), None) => {
                notmad::Mad::builder()
                    .add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
                    .add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
                    .add(CapnpServer { host: self.capnp_host, state: state.clone() })
                    .add(ServeHttp { host: self.http_host })
                    .add(state.drop_queue.clone())
                    .add(shipper)
                    .run()
                    .await?;
            }
            (None, Some(sync)) => {
                notmad::Mad::builder()
                    .add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
                    .add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
                    .add(CapnpServer { host: self.capnp_host, state: state.clone() })
                    .add(ServeHttp { host: self.http_host })
                    .add(state.drop_queue.clone())
                    .add(sync)
                    .run()
                    .await?;
            }
            (None, None) => {
                notmad::Mad::builder()
                    .add(PipelineComponent { pipeline: std::sync::Mutex::new(Some(pipeline)) })
                    .add(grpc::GrpcServer { host: self.grpc_host, state: state.clone(), membership: membership.clone() })
                    .add(CapnpServer { host: self.capnp_host, state: state.clone() })
                    .add(ServeHttp { host: self.http_host })
                    .add(state.drop_queue.clone())
                    .run()
                    .await?;
            }
        }

        Ok(())
    }
}
|
||||
170
crates/sq-server/src/grpc/cluster.rs
Normal file
@@ -0,0 +1,170 @@
use std::pin::Pin;
use std::sync::Arc;

use sq_cluster::membership::Membership;
use sq_grpc_interface::{
    cluster_service_server::ClusterService, ClusterNodeInfo, FetchSegmentRequest,
    FetchSegmentResponse, HeartbeatRequest, HeartbeatResponse, JoinRequest, JoinResponse,
    ReplicateEntriesRequest, ReplicateEntriesResponse,
};
use tokio_stream::Stream;
use tonic::Status;

use crate::metrics;
use crate::state::State;

pub struct ClusterServer {
    pub state: State,
    pub membership: Arc<Membership>,
}

#[tonic::async_trait]
impl ClusterService for ClusterServer {
    #[tracing::instrument(skip_all, fields(rpc.method = "ReplicateEntries", sq.topic, sq.partition, sq.entry_count))]
    async fn replicate_entries(
        &self,
        request: tonic::Request<ReplicateEntriesRequest>,
    ) -> Result<tonic::Response<ReplicateEntriesResponse>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.topic", &req.topic);
        span.record("sq.partition", req.partition);
        span.record("sq.entry_count", req.entries.len());

        let mut last_offset = 0u64;
        for entry_bytes in &req.entries {
            let offset = self
                .state
                .engine
                .append(&req.topic, req.partition, None, entry_bytes, &[], 0)
                .map_err(|e| Status::internal(e.to_string()))?;
            last_offset = offset;
        }

        metrics::record_replicate_entries(req.entries.len() as u64);

        Ok(tonic::Response::new(ReplicateEntriesResponse {
            last_replicated_offset: last_offset,
        }))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "Join", sq.joining_node_id, sq.joining_address))]
    async fn join(
        &self,
        request: tonic::Request<JoinRequest>,
    ) -> Result<tonic::Response<JoinResponse>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.joining_node_id", &req.node_id);
        span.record("sq.joining_address", &req.address);

        // Record the joining node.
        self.membership
            .record_heartbeat(&req.node_id, &req.address)
            .await;

        tracing::info!(
            node_id = %req.node_id,
            address = %req.address,
            "node joined cluster"
        );

        // Return current membership list.
        let members = self.membership.all_members().await;
        let member_infos: Vec<ClusterNodeInfo> = members
            .into_iter()
            .map(|m| ClusterNodeInfo {
                node_id: m.node_id,
                address: m.address,
                status: m.status.to_string(),
            })
            .collect();

        Ok(tonic::Response::new(JoinResponse {
            members: member_infos,
        }))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "Heartbeat", sq.from_node))]
    async fn heartbeat(
        &self,
        request: tonic::Request<HeartbeatRequest>,
    ) -> Result<tonic::Response<HeartbeatResponse>, Status> {
        let req = request.into_inner();
        tracing::Span::current().record("sq.from_node", &req.node_id);

        // Record heartbeat from the sender.
        let sender_address = req
            .known_members
            .iter()
            .find(|m| m.node_id == req.node_id)
            .map(|m| m.address.clone())
            .unwrap_or_default();

        self.membership
            .record_heartbeat(&req.node_id, &sender_address)
            .await;

        // Merge any members we don't know about.
        let discovered: Vec<(String, String)> = req
            .known_members
            .iter()
            .map(|m| (m.node_id.clone(), m.address.clone()))
            .collect();
        self.membership.merge_members(discovered).await;

        // Return our view of the membership.
        let members = self.membership.all_members().await;
        let member_infos: Vec<ClusterNodeInfo> = members
            .into_iter()
            .map(|m| ClusterNodeInfo {
                node_id: m.node_id,
                address: m.address,
                status: m.status.to_string(),
            })
            .collect();

        Ok(tonic::Response::new(HeartbeatResponse {
            members: member_infos,
        }))
    }

    type FetchSegmentStream =
        Pin<Box<dyn Stream<Item = Result<FetchSegmentResponse, Status>> + Send + 'static>>;

    #[tracing::instrument(skip_all, fields(rpc.method = "FetchSegment", sq.topic, sq.partition, sq.from_offset))]
    async fn fetch_segment(
        &self,
        request: tonic::Request<FetchSegmentRequest>,
    ) -> Result<tonic::Response<Self::FetchSegmentStream>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.topic", &req.topic);
        span.record("sq.partition", req.partition);
        span.record("sq.from_offset", req.from_offset);

        // Read messages from the requested offset. No lock needed.
        let messages = self
            .state
            .engine
            .read(&req.topic, req.partition, req.from_offset, 10_000)
            .map_err(|e| Status::internal(e.to_string()))?;

        // Stream raw message data back in chunks.
        let stream = async_stream::try_stream! {
            const CHUNK_SIZE: usize = 100;
            for batch in messages.chunks(CHUNK_SIZE) {
                let mut chunk_data = Vec::new();
                for msg in batch {
                    // Simple wire format: offset(8) + value_len(4) + value
                    chunk_data.extend_from_slice(&msg.offset.to_le_bytes());
                    chunk_data.extend_from_slice(&(msg.value.len() as u32).to_le_bytes());
                    chunk_data.extend_from_slice(&msg.value);
                }
                yield FetchSegmentResponse { chunk: chunk_data };
            }
        };

        Ok(tonic::Response::new(Box::pin(stream) as Self::FetchSegmentStream))
    }
}
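The chunk framing in fetch_segment is self-describing, so the receiving side can walk it with plain slice arithmetic. Below is a minimal decoder sketch for one FetchSegmentResponse chunk; decode_fetch_chunk is an illustrative helper name, not part of the crate:

// Decode one fetch_segment chunk into (offset, value) pairs. Each record is
// offset (8-byte LE) + value_len (4-byte LE) + value bytes, matching the
// writer loop above.
fn decode_fetch_chunk(chunk: &[u8]) -> Vec<(u64, Vec<u8>)> {
    let mut out = Vec::new();
    let mut pos = 0;
    while pos + 12 <= chunk.len() {
        let offset = u64::from_le_bytes(chunk[pos..pos + 8].try_into().unwrap());
        let len = u32::from_le_bytes(chunk[pos + 8..pos + 12].try_into().unwrap()) as usize;
        pos += 12;
        if pos + len > chunk.len() {
            break; // Truncated record; stop rather than panic.
        }
        out.push((offset, chunk[pos..pos + len].to_vec()));
        pos += len;
    }
    out
}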
146
crates/sq-server/src/grpc/control_plane.rs
Normal file
@@ -0,0 +1,146 @@
use sq_grpc_interface::{
    control_plane_service_server::ControlPlaneService, CreateConsumerGroupRequest,
    CreateConsumerGroupResponse, CreateTopicRequest, CreateTopicResponse, DeleteTopicRequest,
    DeleteTopicResponse, DescribeTopicRequest, DescribeTopicResponse, ListTopicsRequest,
    ListTopicsResponse, PartitionInfo, TopicInfo,
};
use tonic::Status;

use crate::grpc::error;
use crate::metrics;
use crate::state::State;

pub struct ControlPlaneServer {
    pub state: State,
}

#[tonic::async_trait]
impl ControlPlaneService for ControlPlaneServer {
    #[tracing::instrument(skip_all, fields(rpc.method = "CreateTopic", sq.topic))]
    async fn create_topic(
        &self,
        request: tonic::Request<CreateTopicRequest>,
    ) -> Result<tonic::Response<CreateTopicResponse>, Status> {
        let req = request.into_inner();
        tracing::Span::current().record("sq.topic", &req.name);

        if req.name.is_empty() {
            return Err(Status::invalid_argument("topic name must not be empty"));
        }

        let partitions = if req.partitions == 0 { 1 } else { req.partitions };
        let replication_factor = if req.replication_factor == 0 {
            3
        } else {
            req.replication_factor
        };

        let config = sq_models::TopicConfig::new(req.name.as_str())
            .with_partitions(partitions)
            .with_replication_factor(replication_factor);

        self.state.engine.create_topic(config).map_err(|e| {
            if e.to_string().contains("already exists") {
                Status::already_exists(e.to_string())
            } else {
                error::internal(e)
            }
        })?;

        metrics::record_topic_created();

        Ok(tonic::Response::new(CreateTopicResponse {
            name: req.name,
        }))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "DeleteTopic", sq.topic))]
    async fn delete_topic(
        &self,
        request: tonic::Request<DeleteTopicRequest>,
    ) -> Result<tonic::Response<DeleteTopicResponse>, Status> {
        let req = request.into_inner();
        tracing::Span::current().record("sq.topic", &req.name);

        if req.name.is_empty() {
            return Err(Status::invalid_argument("topic name must not be empty"));
        }

        self.state.engine.delete_topic(&req.name).map_err(|e| {
            if e.to_string().contains("not found") {
                Status::not_found(e.to_string())
            } else {
                error::internal(e)
            }
        })?;

        Ok(tonic::Response::new(DeleteTopicResponse {}))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "ListTopics"))]
    async fn list_topics(
        &self,
        _request: tonic::Request<ListTopicsRequest>,
    ) -> Result<tonic::Response<ListTopicsResponse>, Status> {
        let topics = self.state.engine.list_topics();

        let topic_infos: Vec<TopicInfo> = topics
            .into_iter()
            .map(|t| TopicInfo {
                name: t.name.to_string(),
                partitions: t.partitions,
                replication_factor: t.replication_factor,
            })
            .collect();

        Ok(tonic::Response::new(ListTopicsResponse {
            topics: topic_infos,
        }))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "DescribeTopic", sq.topic))]
    async fn describe_topic(
        &self,
        request: tonic::Request<DescribeTopicRequest>,
    ) -> Result<tonic::Response<DescribeTopicResponse>, Status> {
        let req = request.into_inner();
        tracing::Span::current().record("sq.topic", &req.name);

        let topic_config = self
            .state
            .engine
            .get_topic(&req.name)
            .ok_or_else(|| Status::not_found(format!("topic '{}' not found", req.name)))?;

        let topic_info = TopicInfo {
            name: topic_config.name.to_string(),
            partitions: topic_config.partitions,
            replication_factor: topic_config.replication_factor,
        };

        // Build partition info with offset ranges.
        let mut partition_info = Vec::new();
        for p in 0..topic_config.partitions {
            let latest = self.state.engine.latest_offset(&req.name, p);
            partition_info.push(PartitionInfo {
                partition: p,
                earliest_offset: 0,
                latest_offset: latest,
            });
        }

        Ok(tonic::Response::new(DescribeTopicResponse {
            topic: Some(topic_info),
            partition_info,
        }))
    }

    #[tracing::instrument(skip_all, fields(rpc.method = "CreateConsumerGroup"))]
    async fn create_consumer_group(
        &self,
        _request: tonic::Request<CreateConsumerGroupRequest>,
    ) -> Result<tonic::Response<CreateConsumerGroupResponse>, Status> {
        // Consumer groups are implicit in our design - they exist as soon as someone uses them.
        Ok(tonic::Response::new(CreateConsumerGroupResponse {}))
    }
}
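For reference, a minimal sketch of calling CreateTopic from a client, assuming tonic generates a control_plane_service_client module alongside the server module used above (the tests below use the analogous data-plane client). Zero values fall back to the server-side defaults noted in create_topic:

use sq_grpc_interface::control_plane_service_client::ControlPlaneServiceClient;
use sq_grpc_interface::CreateTopicRequest;

async fn create_orders_topic(endpoint: String) -> anyhow::Result<()> {
    let mut client = ControlPlaneServiceClient::connect(endpoint).await?;
    client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "orders".to_string(), // Illustrative topic name.
            partitions: 0,              // Server defaults this to 1.
            replication_factor: 0,      // Server defaults this to 3.
        }))
        .await?;
    Ok(())
}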
334
crates/sq-server/src/grpc/data_plane.rs
Normal file
@@ -0,0 +1,334 @@
use std::pin::Pin;
use std::sync::Arc;

use sq_grpc_interface::{
    data_plane_service_server::DataPlaneService, AckRequest, AckResponse, AckMode,
    ConsumedMessage, MessageHeader, PublishRequest, PublishResponse, PublishResult,
    SubscribeRequest, SubscribeResponse,
};
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::layout;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::S3ObjectStore;
use tokio_stream::Stream;
use tonic::Status;

use crate::grpc::error;
use crate::metrics;
use crate::pipeline::PipelineMessage;
use crate::state::State;

pub struct DataPlaneServer {
    pub state: State,
}

fn to_pipeline_messages(
    messages: Vec<sq_grpc_interface::PublishMessage>,
) -> Vec<PipelineMessage> {
    let timestamp_ms = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_millis() as u64;

    messages
        .into_iter()
        .map(|msg| PipelineMessage {
            topic: msg.topic,
            partition: 0,
            key: msg.key,
            value: msg.value,
            headers: msg
                .headers
                .into_iter()
                .map(|h| sq_models::Header {
                    key: h.key,
                    value: h.value,
                })
                .collect(),
            timestamp_ms,
        })
        .collect()
}

#[tonic::async_trait]
impl DataPlaneService for DataPlaneServer {
    #[tracing::instrument(
        skip_all,
        fields(
            rpc.method = "Publish",
            sq.message_count,
            sq.ack_mode,
        )
    )]
    async fn publish(
        &self,
        request: tonic::Request<PublishRequest>,
    ) -> Result<tonic::Response<PublishResponse>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.message_count", req.messages.len());

        if req.messages.is_empty() {
            return Err(Status::invalid_argument("messages must not be empty"));
        }

        let start = std::time::Instant::now();

        let ack_mode = req
            .settings
            .as_ref()
            .map(|s| AckMode::try_from(s.ack_mode).unwrap_or(AckMode::All))
            .unwrap_or(AckMode::All);

        // For ACK_MODE_NONE, fire-and-forget via the pipeline.
        if ack_mode == AckMode::None {
            let results: Vec<PublishResult> = req
                .messages
                .iter()
                .map(|msg| PublishResult {
                    topic: msg.topic.clone(),
                    partition: 0,
                    offset: 0,
                })
                .collect();

            let pipeline_msgs = to_pipeline_messages(req.messages);
            self.state.pipeline.submit_fire_and_forget(pipeline_msgs).await;

            let first_topic = results.first().map(|r| r.topic.as_str()).unwrap_or("");
            metrics::record_messages_published(first_topic, results.len() as u64);
            metrics::record_publish_duration(first_topic, start);
            return Ok(tonic::Response::new(PublishResponse { results }));
        }

        // Validate topics before submitting.
        for msg in &req.messages {
            if msg.topic.is_empty() {
                return Err(Status::invalid_argument("topic must not be empty"));
            }
        }

        // Standard (ACK_MODE_ALL / ACK_MODE_LOCAL) - submit to pipeline and wait for ack.
        let pipeline_msgs = to_pipeline_messages(req.messages);
        let pipeline_results = self
            .state
            .pipeline
            .submit(pipeline_msgs)
            .await
            .map_err(|e| error::internal(anyhow::anyhow!(e)))?;

        let results: Vec<PublishResult> = pipeline_results
            .into_iter()
            .map(|r| PublishResult {
                topic: r.topic,
                partition: r.partition,
                offset: r.offset,
            })
            .collect();

        let first_topic = results.first().map(|r| r.topic.as_str()).unwrap_or("");
        metrics::record_messages_published(first_topic, results.len() as u64);
        metrics::record_publish_duration(first_topic, start);

        Ok(tonic::Response::new(PublishResponse { results }))
    }

    type SubscribeStream =
        Pin<Box<dyn Stream<Item = Result<SubscribeResponse, Status>> + Send + 'static>>;

    #[tracing::instrument(
        skip_all,
        fields(
            rpc.method = "Subscribe",
            sq.topic,
            sq.partition,
            sq.consumer_group,
        )
    )]
    async fn subscribe(
        &self,
        request: tonic::Request<SubscribeRequest>,
    ) -> Result<tonic::Response<Self::SubscribeStream>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.topic", &req.topic);
        span.record("sq.partition", req.partition);
        span.record("sq.consumer_group", &req.consumer_group);

        if req.topic.is_empty() {
            return Err(Status::invalid_argument("topic must not be empty"));
        }

        let batch_size = if req.max_batch_size == 0 {
            100
        } else {
            req.max_batch_size as usize
        };

        // If no explicit start_offset, try using the committed offset for the consumer group.
        let start_offset = match req.start_offset {
            Some(offset) => offset,
            None => {
                if !req.consumer_group.is_empty() {
                    self.state
                        .engine
                        .get_committed_offset(&req.consumer_group, &req.topic, req.partition)
                        .unwrap_or(0)
                } else {
                    0
                }
            }
        };
        let topic = req.topic.clone();
        let partition = req.partition;
        let state = self.state.clone();

        let stream = async_stream::try_stream! {
            let mut current_offset = start_offset;

            loop {
                let messages = state.engine
                    .read(&topic, partition, current_offset, batch_size)
                    .map_err(|e| error::internal(e))?;

                // If local WAL is empty and S3 reader is available, try S3 fallback.
                let messages = if messages.is_empty() {
                    if let Some(ref s3_reader) = state.s3_reader {
                        read_from_s3(
                            s3_reader,
                            &state.config.cluster_id,
                            &topic,
                            partition,
                            current_offset,
                            batch_size,
                        )
                        .await
                        .unwrap_or_default()
                    } else {
                        messages
                    }
                } else {
                    messages
                };

                if messages.is_empty() {
                    // Poll interval when caught up.
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                    continue;
                }

                let consumed: Vec<ConsumedMessage> = messages
                    .iter()
                    .map(|m| {
                        current_offset = m.offset + 1;
                        ConsumedMessage {
                            offset: m.offset,
                            topic: m.topic.to_string(),
                            partition: m.partition,
                            key: m.key.clone().unwrap_or_default(),
                            value: m.value.clone(),
                            headers: m
                                .headers
                                .iter()
                                .map(|h| MessageHeader {
                                    key: h.key.clone(),
                                    value: h.value.clone(),
                                })
                                .collect(),
                            timestamp_ms: m.timestamp_ms,
                        }
                    })
                    .collect();

                metrics::record_messages_consumed(&topic, consumed.len() as u64);
                metrics::record_subscribe_batch(&topic);

                yield SubscribeResponse { messages: consumed };
            }
        };

        Ok(tonic::Response::new(Box::pin(stream)))
    }

    #[tracing::instrument(
        skip_all,
        fields(
            rpc.method = "Ack",
            sq.topic,
            sq.partition,
            sq.consumer_group,
            sq.offset,
        )
    )]
    async fn ack(
        &self,
        request: tonic::Request<AckRequest>,
    ) -> Result<tonic::Response<AckResponse>, Status> {
        let req = request.into_inner();
        let span = tracing::Span::current();
        span.record("sq.topic", &req.topic);
        span.record("sq.partition", req.partition);
        span.record("sq.consumer_group", &req.consumer_group);
        span.record("sq.offset", req.offset);

        if req.consumer_group.is_empty() {
            return Err(Status::invalid_argument("consumer_group must not be empty"));
        }
        if req.topic.is_empty() {
            return Err(Status::invalid_argument("topic must not be empty"));
        }

        self.state
            .engine
            .commit_offset(&req.consumer_group, &req.topic, req.partition, req.offset)
            .map_err(|e| error::internal(e))?;

        metrics::record_ack(&req.topic);

        Ok(tonic::Response::new(AckResponse {}))
    }
}

/// Try to read messages from S3 when the local WAL is empty (segments have been shipped and trimmed).
async fn read_from_s3(
    reader: &Arc<ObjectStoreReader<RealFileSystem, S3ObjectStore>>,
    cluster_id: &str,
    topic: &str,
    partition: u32,
    from_offset: u64,
    limit: usize,
) -> anyhow::Result<Vec<sq_models::Message>> {
    // List all segment keys for this topic-partition.
    let prefix = layout::topic_partition_prefix(cluster_id, topic, partition);

    // List the segment keys through the reader's store, then scan them in
    // order, reading only the segments that can contain the requested offset.
    // This works because segment keys are lexicographically ordered.
    let keys = reader.list_segment_keys(&prefix).await?;

    let mut messages = Vec::new();

    for key in &keys {
        if messages.len() >= limit {
            break;
        }

        // Parse the segment key to check the offset range.
        if let Some((_, _, _, _base_offset, end_offset)) = layout::parse_segment_key(key) {
            // Skip segments that are entirely before our requested offset.
            if end_offset < from_offset {
                continue;
            }

            let segment_msgs = reader.read_segment(key, from_offset).await?;
            for msg in segment_msgs {
                if messages.len() >= limit {
                    break;
                }
                messages.push(msg);
            }
        }
    }

    Ok(messages)
}
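Taken together, subscribe and ack give an at-least-once consumption loop: leave start_offset unset to resume from the group's committed offset, then commit after processing each batch. A minimal consumer sketch using the generated DataPlaneServiceClient (as imported in the tests below); the topic and group names are illustrative:

use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
use sq_grpc_interface::{AckRequest, SubscribeRequest};
use tokio_stream::StreamExt;

async fn consume(endpoint: String) -> anyhow::Result<()> {
    let mut client = DataPlaneServiceClient::connect(endpoint).await?;
    let mut stream = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "orders".to_string(),
            partition: 0,
            consumer_group: "billing".to_string(),
            start_offset: None, // Resume from the committed offset.
            max_batch_size: 500,
        }))
        .await?
        .into_inner();

    while let Some(batch) = stream.next().await {
        let batch = batch?;
        // ... process batch.messages here ...
        if let Some(last) = batch.messages.last() {
            // Commit the high-water mark for this group.
            client
                .ack(tonic::Request::new(AckRequest {
                    consumer_group: "billing".to_string(),
                    topic: "orders".to_string(),
                    partition: 0,
                    offset: last.offset,
                }))
                .await?;
        }
    }
    Ok(())
}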
13
crates/sq-server/src/grpc/error.rs
Normal file
@@ -0,0 +1,13 @@
use tonic::Status;

pub fn internal(err: impl std::fmt::Display) -> Status {
    Status::internal(err.to_string())
}

pub fn not_found(msg: impl Into<String>) -> Status {
    Status::not_found(msg)
}

pub fn invalid_argument(msg: impl Into<String>) -> Status {
    Status::invalid_argument(msg)
}
23
crates/sq-server/src/grpc/health.rs
Normal file
@@ -0,0 +1,23 @@
use sq_grpc_interface::{
    status_service_server::StatusService, GetStatusRequest, GetStatusResponse,
};

use crate::state::State;

pub struct HealthServer {
    pub state: State,
}

#[tonic::async_trait]
impl StatusService for HealthServer {
    #[tracing::instrument(skip_all, fields(rpc.method = "Status"))]
    async fn status(
        &self,
        _request: tonic::Request<GetStatusRequest>,
    ) -> Result<tonic::Response<GetStatusResponse>, tonic::Status> {
        Ok(tonic::Response::new(GetStatusResponse {
            node_id: self.state.config.node_id.clone(),
            cluster: None,
        }))
    }
}
79
crates/sq-server/src/grpc/mod.rs
Normal file
@@ -0,0 +1,79 @@
use std::net::SocketAddr;
use std::sync::Arc;

use notmad::MadError;
use sq_cluster::membership::Membership;
use sq_grpc_interface::{
    cluster_service_server::ClusterServiceServer,
    control_plane_service_server::ControlPlaneServiceServer,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_server::StatusServiceServer,
};
use tokio_util::sync::CancellationToken;

use crate::state::State;

pub mod cluster;
pub mod control_plane;
pub mod data_plane;
pub mod error;
pub mod health;

pub struct GrpcServer {
    pub host: SocketAddr,
    pub state: State,
    pub membership: Arc<Membership>,
}

impl GrpcServer {
    pub async fn serve(&self, cancellation_token: CancellationToken) -> anyhow::Result<()> {
        tracing::info!("serving grpc on {}", self.host);

        tonic::transport::Server::builder()
            .trace_fn(|request| {
                tracing::info_span!(
                    "grpc",
                    otel.kind = "server",
                    rpc.system = "grpc",
                    rpc.service = tracing::field::Empty,
                    rpc.method = %request.uri().path(),
                )
            })
            .add_service(StatusServiceServer::new(health::HealthServer {
                state: self.state.clone(),
            }))
            .add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
                state: self.state.clone(),
            }))
            .add_service(ControlPlaneServiceServer::new(
                control_plane::ControlPlaneServer {
                    state: self.state.clone(),
                },
            ))
            .add_service(ClusterServiceServer::new(cluster::ClusterServer {
                state: self.state.clone(),
                membership: self.membership.clone(),
            }))
            .serve_with_shutdown(
                self.host,
                async move { cancellation_token.cancelled().await },
            )
            .await?;

        Ok(())
    }
}

impl notmad::Component for GrpcServer {
    fn info(&self) -> notmad::ComponentInfo {
        "sq-server/grpc".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        self.serve(cancellation_token)
            .await
            .map_err(MadError::Inner)?;

        Ok(())
    }
}
10
crates/sq-server/src/lib.rs
Normal file
@@ -0,0 +1,10 @@
pub mod capnp;
pub mod cli;
pub mod grpc;
pub mod metrics;
pub mod otel;
pub mod pipeline;
pub mod servehttp;
pub mod shipper;
pub mod state;
pub mod sync_task;
@@ -1,3 +1,27 @@
use sq_server::cli;
use sq_server::otel::{LogFormat, OtelConfig};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    dotenvy::dotenv().ok();

    // The output format; the filtering level comes from RUST_LOG via EnvFilter.
    let log_format = match std::env::var("LOG_FORMAT")
        .as_ref()
        .map(|r| r.as_str())
    {
        Ok("json") => LogFormat::Json,
        Ok("short") => LogFormat::Short,
        _ => LogFormat::Pretty,
    };

    let _otel_guard = sq_server::otel::init(OtelConfig {
        service_name: "sq-server".to_string(),
        node_id: std::env::var("SQ_NODE_ID").unwrap_or_else(|_| "node-1".to_string()),
        otlp_endpoint: std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").ok(),
        log_format,
    })?;

    cli::execute().await?;

    Ok(())
}
85
crates/sq-server/src/metrics.rs
Normal file
@@ -0,0 +1,85 @@
use opentelemetry::metrics::{Counter, Histogram, Meter};
use opentelemetry::KeyValue;
use std::sync::LazyLock;
use std::time::Instant;

static METER: LazyLock<Meter> = LazyLock::new(|| opentelemetry::global::meter("sq-server"));

static MESSAGES_PUBLISHED: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.messages.published")
        .with_description("Total messages published")
        .build()
});

static MESSAGES_CONSUMED: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.messages.consumed")
        .with_description("Total messages consumed via subscribe")
        .build()
});

static PUBLISH_DURATION: LazyLock<Histogram<f64>> = LazyLock::new(|| {
    METER
        .f64_histogram("sq.publish.duration_ms")
        .with_description("Publish RPC duration in milliseconds")
        .with_unit("ms")
        .build()
});

static SUBSCRIBE_BATCHES: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.subscribe.batches")
        .with_description("Total subscribe batches sent")
        .build()
});

static ACK_TOTAL: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.ack.total")
        .with_description("Total ack (offset commit) operations")
        .build()
});

static TOPICS_CREATED: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.topics.created")
        .with_description("Total topics created")
        .build()
});

static REPLICATE_ENTRIES: LazyLock<Counter<u64>> = LazyLock::new(|| {
    METER
        .u64_counter("sq.replicate.entries")
        .with_description("Total entries replicated from other nodes")
        .build()
});

pub fn record_messages_published(topic: &str, count: u64) {
    MESSAGES_PUBLISHED.add(count, &[KeyValue::new("sq.topic", topic.to_string())]);
}

pub fn record_messages_consumed(topic: &str, count: u64) {
    MESSAGES_CONSUMED.add(count, &[KeyValue::new("sq.topic", topic.to_string())]);
}

pub fn record_publish_duration(topic: &str, start: Instant) {
    let duration_ms = start.elapsed().as_secs_f64() * 1000.0;
    PUBLISH_DURATION.record(duration_ms, &[KeyValue::new("sq.topic", topic.to_string())]);
}

pub fn record_subscribe_batch(topic: &str) {
    SUBSCRIBE_BATCHES.add(1, &[KeyValue::new("sq.topic", topic.to_string())]);
}

pub fn record_ack(topic: &str) {
    ACK_TOTAL.add(1, &[KeyValue::new("sq.topic", topic.to_string())]);
}

pub fn record_topic_created() {
    TOPICS_CREATED.add(1, &[]);
}

pub fn record_replicate_entries(count: u64) {
    REPLICATE_ENTRIES.add(count, &[]);
}
121
crates/sq-server/src/otel.rs
Normal file
@@ -0,0 +1,121 @@
use opentelemetry::trace::TracerProvider as _;
use opentelemetry::KeyValue;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use opentelemetry_sdk::trace::SdkTracerProvider;
use opentelemetry_sdk::Resource;
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{EnvFilter, Layer};

/// Configuration for OpenTelemetry.
pub struct OtelConfig {
    pub service_name: String,
    pub node_id: String,
    pub otlp_endpoint: Option<String>,
    pub log_format: LogFormat,
}

pub enum LogFormat {
    Pretty,
    Json,
    Short,
}

/// Initialized OTel guard. Drop to flush and shut down providers.
pub struct OtelGuard {
    tracer_provider: Option<SdkTracerProvider>,
    meter_provider: Option<SdkMeterProvider>,
}

impl Drop for OtelGuard {
    fn drop(&mut self) {
        if let Some(provider) = self.meter_provider.take()
            && let Err(e) = provider.shutdown()
        {
            eprintln!("failed to shut down OTel meter provider: {e}");
        }
        if let Some(provider) = self.tracer_provider.take()
            && let Err(e) = provider.shutdown()
        {
            eprintln!("failed to shut down OTel tracer provider: {e}");
        }
    }
}

/// Initialize tracing and metrics with optional OpenTelemetry export.
///
/// If `otlp_endpoint` is set, spans and metrics are exported via OTLP/gRPC.
/// Otherwise, only local logging is configured.
pub fn init(config: OtelConfig) -> anyhow::Result<OtelGuard> {
    let env_filter = EnvFilter::from_default_env().add_directive("notmad=trace".parse()?);

    let resource = Resource::builder()
        .with_attributes([
            KeyValue::new(
                opentelemetry_semantic_conventions::attribute::SERVICE_NAME,
                config.service_name.clone(),
            ),
            KeyValue::new("sq.node_id", config.node_id.clone()),
        ])
        .build();

    let (tracer_provider, meter_provider, otel_layer) = match &config.otlp_endpoint {
        Some(endpoint) => {
            // Traces
            let span_exporter = opentelemetry_otlp::SpanExporter::builder()
                .with_tonic()
                .with_endpoint(endpoint)
                .build()?;

            let tp = SdkTracerProvider::builder()
                .with_resource(resource.clone())
                .with_batch_exporter(span_exporter)
                .build();

            let tracer = tp.tracer("sq-server");

            // Metrics
            let metric_exporter = opentelemetry_otlp::MetricExporter::builder()
                .with_tonic()
                .with_endpoint(endpoint)
                .build()?;

            let mp = SdkMeterProvider::builder()
                .with_resource(resource)
                .with_periodic_exporter(metric_exporter)
                .build();

            // Register the global meter provider so opentelemetry::global::meter() works.
            opentelemetry::global::set_meter_provider(mp.clone());

            let layer = OpenTelemetryLayer::new(tracer);

            (Some(tp), Some(mp), Some(layer))
        }
        None => (None, None, None),
    };

    let fmt_layer = match config.log_format {
        LogFormat::Json => tracing_subscriber::fmt::layer().json().boxed(),
        LogFormat::Short => tracing_subscriber::fmt::layer()
            .with_line_number(false)
            .with_target(false)
            .with_file(false)
            .with_level(true)
            .boxed(),
        LogFormat::Pretty => tracing_subscriber::fmt::layer().pretty().boxed(),
    };

    tracing_subscriber::registry()
        .with(env_filter)
        .with(fmt_layer)
        .with(otel_layer)
        .init();

    Ok(OtelGuard {
        tracer_provider,
        meter_provider,
    })
}
211
crates/sq-server/src/pipeline.rs
Normal file
@@ -0,0 +1,211 @@
use std::collections::HashMap;
use std::sync::Arc;

use sq_models::Header;
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use tokio::sync::{mpsc, oneshot};

/// A single message submitted to the pipeline.
pub struct PipelineMessage {
    pub topic: String,
    pub partition: u32,
    pub key: Vec<u8>,
    pub value: Vec<u8>,
    pub headers: Vec<Header>,
    pub timestamp_ms: u64,
}

/// Result returned for each published message.
pub struct PipelineResult {
    pub topic: String,
    pub partition: u32,
    pub offset: u64,
}

/// A request sent through the channel: a batch of messages + reply channel.
struct PipelineRequest {
    messages: Vec<PipelineMessage>,
    reply: oneshot::Sender<Result<Vec<PipelineResult>, String>>,
}

/// Send-side handle for submitting messages to the write pipeline.
#[derive(Clone)]
pub struct PipelineHandle {
    tx: mpsc::Sender<PipelineRequest>,
}

impl PipelineHandle {
    /// Submit messages to the pipeline and wait for durable ack.
    /// Returns the assigned offsets once the batch has been fsync'd.
    pub async fn submit(
        &self,
        messages: Vec<PipelineMessage>,
    ) -> Result<Vec<PipelineResult>, String> {
        let (reply_tx, reply_rx) = oneshot::channel();
        let req = PipelineRequest {
            messages,
            reply: reply_tx,
        };
        self.tx
            .send(req)
            .await
            .map_err(|_| "pipeline closed".to_string())?;
        reply_rx.await.map_err(|_| "pipeline dropped".to_string())?
    }

    /// Fire-and-forget submit (for ACK_MODE_NONE).
    pub async fn submit_fire_and_forget(&self, messages: Vec<PipelineMessage>) {
        let (reply_tx, _reply_rx) = oneshot::channel();
        let req = PipelineRequest {
            messages,
            reply: reply_tx,
        };
        // Best-effort send, ignore errors.
        let _ = self.tx.send(req).await;
    }
}

/// Receive-side of the pipeline that batches and flushes writes.
pub struct WritePipeline {
    rx: mpsc::Receiver<PipelineRequest>,
    engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
}

/// Create a pipeline handle + runner pair.
pub fn create_pipeline(
    engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
    capacity: usize,
) -> (PipelineHandle, WritePipeline) {
    let (tx, rx) = mpsc::channel(capacity);
    (PipelineHandle { tx }, WritePipeline { rx, engine })
}

impl WritePipeline {
    /// Run the pipeline loop. Exits when all senders are dropped; the caller
    /// should select on this future and its own cancellation token.
    pub async fn run(&mut self) {
        loop {
            // Block until at least one request arrives.
            let first = match self.rx.recv().await {
                Some(req) => req,
                None => return, // Channel closed.
            };

            // Drain any additional pending requests (group commit).
            let mut batch = vec![first];
            while let Ok(req) = self.rx.try_recv() {
                batch.push(req);
            }

            self.flush_batch(batch).await;
        }
    }

    async fn flush_batch(&self, mut requests: Vec<PipelineRequest>) {
        // Group all messages by (topic, partition). We keep track of which
        // request + index each message belongs to so we can route results back.
        struct Tracking {
            request_idx: usize,
            message_idx: usize,
        }

        // Count messages per request before draining (for result slot allocation).
        let msg_counts: Vec<usize> = requests.iter().map(|r| r.messages.len()).collect();

        let mut grouped: HashMap<
            (String, u32),
            (Vec<(Option<Vec<u8>>, Vec<u8>, Vec<Header>, u64)>, Vec<Tracking>),
        > = HashMap::new();

        for (req_idx, req) in requests.iter_mut().enumerate() {
            for (msg_idx, msg) in req.messages.drain(..).enumerate() {
                let key = (msg.topic, msg.partition);
                let entry = grouped.entry(key).or_insert_with(|| (Vec::new(), Vec::new()));
                entry.0.push((
                    if msg.key.is_empty() { None } else { Some(msg.key) },
                    msg.value,
                    msg.headers,
                    msg.timestamp_ms,
                ));
                entry.1.push(Tracking { request_idx: req_idx, message_idx: msg_idx });
            }
        }

        // Prepare result slots.
        let mut results: Vec<Result<Vec<PipelineResult>, String>> = msg_counts
            .iter()
            .map(|&count| {
                Ok((0..count)
                    .map(|_| PipelineResult {
                        topic: String::new(),
                        partition: 0,
                        offset: 0,
                    })
                    .collect())
            })
            .collect();

        // Split grouped data into messages (moved into spawn_blocking) and tracking (kept here).
        let mut partition_messages: Vec<(
            String,
            u32,
            Vec<(Option<Vec<u8>>, Vec<u8>, Vec<Header>, u64)>,
        )> = Vec::new();
        let mut partition_tracking: Vec<Vec<Tracking>> = Vec::new();

        for ((topic, partition), (messages, tracking)) in grouped {
            partition_messages.push((topic, partition, messages));
            partition_tracking.push(tracking);
        }

        // Flush each topic-partition batch concurrently via spawn_blocking.
        // Each partition acquires only its own lock inside the engine.
        let mut handles = Vec::with_capacity(partition_messages.len());
        for (topic, partition, messages) in partition_messages {
            let engine = self.engine.clone();
            handles.push(tokio::task::spawn_blocking(move || {
                let batch_refs: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = messages
                    .iter()
                    .map(|(k, v, h, ts)| (k.as_deref(), v.as_slice(), h.as_slice(), *ts))
                    .collect();
                let result = engine.append_batch(&topic, partition, &batch_refs);
                (topic, partition, result)
            }));
        }

        // Await all writes and route results back.
        for (handle, tracking) in handles.into_iter().zip(partition_tracking) {
            match handle.await {
                Ok((topic, partition, Ok(offsets))) => {
                    for (i, track) in tracking.iter().enumerate() {
                        if let Ok(ref mut res) = results[track.request_idx] {
                            res[track.message_idx] = PipelineResult {
                                topic: topic.clone(),
                                partition,
                                offset: offsets[i],
                            };
                        }
                    }
                }
                Ok((_topic, _partition, Err(e))) => {
                    let err_msg = e.to_string();
                    for track in &tracking {
                        results[track.request_idx] = Err(err_msg.clone());
                    }
                }
                Err(e) => {
                    // spawn_blocking panicked.
                    let err_msg = format!("write task panicked: {e}");
                    for track in &tracking {
                        results[track.request_idx] = Err(err_msg.clone());
                    }
                }
            }
        }

        // Reply to all waiters.
        for (req, result) in requests.into_iter().zip(results) {
            let _ = req.reply.send(result);
        }
    }
}
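A minimal sketch of wiring the pipeline, assuming an engine constructed as in state.rs below. The runner is spawned once; cloned handles can then submit batches from any task and await their durable offsets (the topic and payload here are illustrative):

use std::sync::Arc;
use sq_server::pipeline::{self, PipelineMessage};

async fn demo(
    engine: Arc<sq_storage::engine::StorageEngine<sq_sim::fs::RealFileSystem, sq_sim::RealClock>>,
) -> Result<(), String> {
    // One runner; any number of cloned handles.
    let (handle, mut writer) = pipeline::create_pipeline(engine, 10_000);
    tokio::spawn(async move { writer.run().await });

    let results = handle
        .submit(vec![PipelineMessage {
            topic: "orders".to_string(),
            partition: 0,
            key: b"k1".to_vec(),
            value: b"hello".to_vec(),
            headers: Vec::new(),
            timestamp_ms: 0,
        }])
        .await?;
    assert_eq!(results.len(), 1); // One durable offset per submitted message.
    Ok(())
}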
40
crates/sq-server/src/servehttp.rs
Normal file
@@ -0,0 +1,40 @@
use std::net::SocketAddr;

use anyhow::Context;
use axum::routing::get;
use notmad::{Component, ComponentInfo, MadError};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tower_http::trace::TraceLayer;

pub struct ServeHttp {
    pub host: SocketAddr,
}

impl Component for ServeHttp {
    fn info(&self) -> ComponentInfo {
        "sq-server/http".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        tracing::info!("serving http on {}", self.host);

        let router = axum::Router::new()
            .route("/health", get(|| async { "ok" }))
            .route("/ready", get(|| async { "ok" }))
            .layer(TraceLayer::new_for_http());

        let listener = TcpListener::bind(&self.host)
            .await
            .context("failed to bind http port")?;

        axum::serve(listener, router.into_make_service())
            .with_graceful_shutdown(async move {
                cancellation_token.cancelled().await;
            })
            .await
            .context("http server failed")?;

        Ok(())
    }
}
101
crates/sq-server/src/shipper.rs
Normal file
@@ -0,0 +1,101 @@
use std::sync::Arc;
use std::time::Duration;

use notmad::{Component, ComponentInfo, MadError};
use sq_sim::fs::RealFileSystem;
use sq_storage::object_store::s3::S3ObjectStore;
use sq_storage::object_store::shipper::{SegmentShipper, ShippedSegments};
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;

use crate::state::State;

/// Background component that periodically ships closed WAL segments to S3
/// and trims local files after successful upload.
pub struct BackgroundShipper {
    state: State,
    shipper: SegmentShipper<RealFileSystem, S3ObjectStore>,
    interval: Duration,
}

impl BackgroundShipper {
    pub fn new(
        state: State,
        object_store: Arc<S3ObjectStore>,
        cluster_id: String,
        interval: Duration,
    ) -> Self {
        let fs = Arc::new(sq_sim::fs::RealFileSystem);
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));
        let shipper = SegmentShipper::new(fs, object_store, cluster_id, shipped);

        Self {
            state,
            shipper,
            interval,
        }
    }

    async fn cycle(&self) {
        let closed = match self.state.engine.close_all_segments() {
            Ok(segments) => segments,
            Err(e) => {
                tracing::warn!(error = %e, "failed to close segments for shipping");
                return;
            }
        };

        if closed.is_empty() {
            return;
        }

        let count = self.shipper.ship_all(&closed).await;
        if count > 0 {
            tracing::info!(shipped = count, total = closed.len(), "shipped segments to S3");
        }

        // Trim local WAL files for successfully shipped segments.
        // The shipper tracks which segments were shipped; we delete local copies.
        // For now, we only delete if all segments were shipped successfully.
        if count == closed.len() {
            let fs = sq_sim::fs::RealFileSystem;
            for seg in &closed {
                if let Err(e) = sq_sim::fs::FileSystem::remove_file(&fs, &seg.path) {
                    tracing::warn!(
                        path = %seg.path.display(),
                        error = %e,
                        "failed to trim shipped segment"
                    );
                }
            }
        }
    }
}

impl Component for BackgroundShipper {
    fn info(&self) -> ComponentInfo {
        "sq-server/shipper".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        tracing::info!(
            interval_secs = self.interval.as_secs(),
            "background shipper started"
        );

        loop {
            tokio::select! {
                () = cancellation_token.cancelled() => {
                    // Final flush on shutdown.
                    self.cycle().await;
                    break;
                }
                () = tokio::time::sleep(self.interval) => {
                    self.cycle().await;
                }
            }
        }

        Ok(())
    }
}
62
crates/sq-server/src/state.rs
Normal file
@@ -0,0 +1,62 @@
use std::path::PathBuf;
use std::sync::Arc;

use drop_queue::DropQueue;
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use sq_storage::object_store::reader::ObjectStoreReader;
use sq_storage::object_store::s3::S3ObjectStore;

use crate::pipeline::{self, PipelineHandle, WritePipeline};

#[derive(Clone)]
pub struct State {
    pub engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
    pub pipeline: PipelineHandle,
    pub s3_reader: Option<Arc<ObjectStoreReader<RealFileSystem, S3ObjectStore>>>,
    pub drop_queue: DropQueue,
    pub config: Config,
}

#[derive(Clone)]
pub struct Config {
    pub node_id: String,
    pub data_dir: PathBuf,
    pub seeds: Vec<String>,
    pub grpc_address: String,
    pub cluster_id: String,
    pub s3_bucket: Option<String>,
    pub s3_endpoint: Option<String>,
    pub s3_region: Option<String>,
    pub sync_policy: sq_models::SyncPolicy,
}

impl State {
    pub fn new(config: Config) -> anyhow::Result<(Self, WritePipeline)> {
        let fs = Arc::new(RealFileSystem);
        let clock = Arc::new(RealClock);
        let wal_config = sq_models::WalConfig {
            data_dir: config.data_dir.clone(),
            sync_policy: config.sync_policy.clone(),
            ..Default::default()
        };

        let engine = StorageEngine::new(fs, clock, wal_config)?;
        engine.recover()?;

        let engine = Arc::new(engine);
        let (handle, writer) = pipeline::create_pipeline(engine.clone(), 10_000);

        Ok((
            Self {
                engine,
                pipeline: handle,
                s3_reader: None,
                drop_queue: DropQueue::new(),
                config,
            },
            writer,
        ))
    }
}
56
crates/sq-server/src/sync_task.rs
Normal file
@@ -0,0 +1,56 @@
use std::sync::Arc;
use std::time::Duration;

use notmad::{Component, ComponentInfo, MadError};
use sq_sim::fs::RealFileSystem;
use sq_sim::RealClock;
use sq_storage::engine::StorageEngine;
use tokio_util::sync::CancellationToken;

/// Background task that periodically fsyncs all open WAL writers.
/// Used when SyncPolicy is Interval.
pub struct BackgroundSync {
    engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
    interval: Duration,
}

impl BackgroundSync {
    pub fn new(
        engine: Arc<StorageEngine<RealFileSystem, RealClock>>,
        interval: Duration,
    ) -> Self {
        Self { engine, interval }
    }
}

impl Component for BackgroundSync {
    fn info(&self) -> ComponentInfo {
        "sq-server/background-sync".into()
    }

    async fn run(&self, cancellation_token: CancellationToken) -> Result<(), MadError> {
        let mut interval = tokio::time::interval(self.interval);
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        // Consume the first immediate tick.
        interval.tick().await;

        loop {
            tokio::select! {
                () = cancellation_token.cancelled() => {
                    // Final sync on shutdown.
                    if let Err(e) = self.engine.fsync_all_writers() {
                        tracing::warn!(error = %e, "final sync on shutdown failed");
                    }
                    break;
                }
                _ = interval.tick() => {
                    if let Err(e) = self.engine.fsync_all_writers() {
                        tracing::warn!(error = %e, "background sync failed");
                    }
                }
            }
        }

        Ok(())
    }
}
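For completeness, a sketch of a Config that selects this task, using the fields from state.rs above; all values are illustrative. With SyncPolicy::Interval the serve command wires in a BackgroundSync, while EveryBatch (as in the tests below) presumably syncs on each write batch instead:

let config = sq_server::state::Config {
    node_id: "node-1".to_string(),
    data_dir: std::path::PathBuf::from("/tmp/sq-data"), // Illustrative path.
    seeds: Vec::new(),
    grpc_address: "127.0.0.1:6061".to_string(),
    cluster_id: "dev".to_string(),
    s3_bucket: None,
    s3_endpoint: None,
    s3_region: None,
    // Fsync every 50ms in the background instead of on every batch.
    sync_policy: sq_models::SyncPolicy::Interval(std::time::Duration::from_millis(50)),
};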
462
crates/sq-server/tests/capnp_stress_test.rs
Normal file
@@ -0,0 +1,462 @@
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use sq_cluster::membership::{Membership, MembershipConfig};
|
||||
use sq_grpc_interface::{
|
||||
cluster_service_server::ClusterServiceServer,
|
||||
control_plane_service_server::ControlPlaneServiceServer,
|
||||
data_plane_service_server::DataPlaneServiceServer,
|
||||
status_service_client::StatusServiceClient,
|
||||
status_service_server::StatusServiceServer,
|
||||
GetStatusRequest, SubscribeRequest,
|
||||
};
|
||||
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
|
||||
use sq_sdk::{
|
||||
Consumer, ConsumerConfig, Producer, ProducerConfig,
|
||||
GrpcProducer, GrpcProducerConfig, ProducerMessage,
|
||||
};
|
||||
use sq_server::capnp::CapnpServer;
|
||||
use sq_server::grpc::{cluster, control_plane, data_plane, health};
|
||||
use sq_server::state::{Config, State};
|
||||
use tempfile::TempDir;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test harness — extends TestCluster to include capnp server alongside gRPC
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct TestNode {
|
||||
grpc_addr: SocketAddr,
|
||||
capnp_addr: SocketAddr,
|
||||
cancel: CancellationToken,
|
||||
pipeline_cancel: CancellationToken,
|
||||
_temp_dir: TempDir,
|
||||
_server_handle: tokio::task::JoinHandle<()>,
|
||||
_capnp_handle: tokio::task::JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl TestNode {
|
||||
fn grpc_endpoint(&self) -> String {
|
||||
format!("http://{}", self.grpc_addr)
|
||||
}
|
||||
|
||||
fn capnp_endpoint(&self) -> String {
|
||||
self.capnp_addr.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
struct TestCluster {
|
||||
nodes: Vec<TestNode>,
|
||||
}
|
||||
|
||||
impl TestCluster {
|
||||
async fn start(n: usize) -> Self {
|
||||
let mut grpc_listeners = Vec::new();
|
||||
let mut capnp_listeners = Vec::new();
|
||||
let mut grpc_addrs = Vec::new();
|
||||
let mut capnp_addrs = Vec::new();
|
||||
|
||||
for _ in 0..n {
|
||||
let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
grpc_addrs.push(grpc_listener.local_addr().unwrap());
|
||||
capnp_addrs.push(capnp_listener.local_addr().unwrap());
|
||||
grpc_listeners.push(grpc_listener);
|
||||
capnp_listeners.push(capnp_listener);
|
||||
}
|
||||
|
||||
let mut nodes = Vec::new();
|
||||
for (i, (grpc_listener, capnp_listener)) in
|
||||
grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
|
||||
{
|
||||
let grpc_addr = grpc_addrs[i];
|
||||
let capnp_addr = capnp_addrs[i];
|
||||
let node_id = format!("capnp-stress-node-{}", i + 1);
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
|
||||
let seeds: Vec<String> = grpc_addrs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(j, _)| *j != i)
|
||||
.map(|(_, a)| a.to_string())
|
||||
.collect();
|
||||
|
||||
let config = Config {
|
||||
node_id: node_id.clone(),
|
||||
data_dir: temp_dir.path().to_path_buf(),
|
||||
seeds: seeds.clone(),
|
||||
grpc_address: grpc_addr.to_string(),
|
||||
cluster_id: "test-cluster".to_string(),
|
||||
s3_bucket: None,
|
||||
s3_endpoint: None,
|
||||
s3_region: None,
|
||||
sync_policy: sq_models::SyncPolicy::EveryBatch,
|
||||
};
|
||||
|
||||
let (state, mut pipeline) = State::new(config).unwrap();
|
||||
|
||||
let pipeline_cancel = CancellationToken::new();
|
||||
let pipeline_cancel_clone = pipeline_cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
tokio::select! {
|
||||
() = pipeline.run() => {}
|
||||
() = pipeline_cancel_clone.cancelled() => {}
|
||||
}
|
||||
});
|
||||
|
||||
let membership = Arc::new(Membership::new(MembershipConfig {
|
||||
node_id: node_id.clone(),
|
||||
address: grpc_addr.to_string(),
|
||||
seeds,
|
||||
..Default::default()
|
||||
}));
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Spawn gRPC server.
|
||||
let cancel_clone = cancel.clone();
|
||||
let state_clone = state.clone();
|
||||
let membership_clone = membership.clone();
|
||||
let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
|
||||
let server_handle = tokio::spawn(async move {
|
||||
tonic::transport::Server::builder()
|
||||
.add_service(StatusServiceServer::new(health::HealthServer {
|
||||
state: state_clone.clone(),
|
||||
}))
|
||||
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
|
||||
state: state_clone.clone(),
|
||||
}))
|
||||
.add_service(ControlPlaneServiceServer::new(
|
||||
control_plane::ControlPlaneServer {
|
||||
state: state_clone.clone(),
|
||||
},
|
||||
))
|
||||
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
|
||||
state: state_clone,
|
||||
membership: membership_clone,
|
||||
}))
|
||||
.serve_with_incoming_shutdown(incoming, async move {
|
||||
cancel_clone.cancelled().await;
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Spawn capnp server — use the CapnpServer Component's run method directly.
|
||||
let cancel_clone = cancel.clone();
|
||||
let capnp_state = state.clone();
|
||||
let capnp_handle = tokio::spawn(async move {
|
||||
let server = CapnpServer {
|
||||
host: capnp_addr,
|
||||
state: capnp_state,
|
||||
};
|
||||
// We can't use the TcpListener we already bound because CapnpServer binds its own.
|
||||
// Instead, drop the listener and let CapnpServer rebind.
|
||||
drop(capnp_listener);
|
||||
let _ = notmad::Component::run(&server, cancel_clone).await;
|
||||
});
|
||||
|
||||
nodes.push(TestNode {
|
||||
grpc_addr,
|
||||
capnp_addr,
|
||||
cancel,
|
||||
pipeline_cancel,
|
||||
_temp_dir: temp_dir,
|
||||
_server_handle: server_handle,
|
||||
_capnp_handle: capnp_handle,
|
||||
});
|
||||
}
|
||||
|
||||
// Wait for gRPC to be ready.
|
||||
for node in &nodes {
|
||||
wait_for_ready(&node.grpc_endpoint()).await;
|
||||
}
|
||||
|
||||
// Give capnp server a moment to bind.
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
|
||||
TestCluster { nodes }
|
||||
}
|
||||
|
||||
fn node(&self, index: usize) -> &TestNode {
|
||||
&self.nodes[index]
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestCluster {
|
||||
fn drop(&mut self) {
|
||||
for node in &self.nodes {
|
||||
node.pipeline_cancel.cancel();
|
||||
node.cancel.cancel();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
            if client
                .status(tonic::Request::new(GetStatusRequest {}))
                .await
                .is_ok()
            {
                return;
            }
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}

// ---------------------------------------------------------------------------
// Capnp stress test 1: Single producer — 100K messages via capnp
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_single_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();

    let mut producer = Producer::connect(ProducerConfig {
        address: capnp_ep,
        ..Default::default()
    })
    .await
    .unwrap();

    let total = 100_000u64;
    let batch_size = 500;
    let payload = vec![0u8; 128];

    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("capnp-stress-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "capnp_stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back via gRPC subscribe (capnp subscribe is streaming-only).
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "capnp-stress-topic".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;
    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    assert_eq!(consumed, total, "expected all messages to be consumed");
}

// ---------------------------------------------------------------------------
// Capnp stress test 2: Concurrent producers — 10 producers, 10K messages each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = capnp_ep.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let mut producer = Producer::connect(ProducerConfig {
                address: ep,
                producer_id: format!("capnp-producer-{p}"),
                ..Default::default()
            })
            .await
            .unwrap();

            let topic = format!("capnp-concurrent-{p}");
            for batch_start in (0..msgs_per_producer).step_by(100) {
                let batch_end = (batch_start + 100).min(msgs_per_producer);
                let batch: Vec<ProducerMessage> = (batch_start..batch_end)
                    .map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
                    .collect();
                producer.send_batch(batch).await.unwrap();
            }
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let duration = start.elapsed();
    let total = num_producers as u64 * msgs_per_producer;
    let msgs_per_sec = total as f64 / duration.as_secs_f64();

    eprintln!(
        "capnp_stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_producers,
        msgs_per_producer,
        total,
        duration.as_secs_f64(),
        msgs_per_sec
    );
}

// ---------------------------------------------------------------------------
// Capnp stress test 3: Subscribe via capnp — publish then consume
// ---------------------------------------------------------------------------

#[tokio::test]
async fn capnp_stress_subscribe() {
    let cluster = TestCluster::start(1).await;
    let capnp_ep = cluster.node(0).capnp_endpoint();
    let total = 10_000u64;
    let payload = vec![0u8; 64];

    // Publish via capnp.
    let mut producer = Producer::connect(ProducerConfig {
        address: capnp_ep.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("capnp-sub-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume via capnp.
    let mut consumer = Consumer::connect(ConsumerConfig {
        address: capnp_ep,
        topic: "capnp-sub-topic".to_string(),
        consumer_group: String::new(),
        auto_commit: false,
        start_offset: Some(0),
        max_poll_records: 1000,
        ..Default::default()
    })
    .await
    .unwrap();

    let mut consumed = 0u64;
    let start = Instant::now();

    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), consumer.poll()).await {
            Ok(Ok(msgs)) => consumed += msgs.len() as u64,
            _ => break,
        }
    }

    let consume_duration = start.elapsed();
    eprintln!(
        "capnp_stress_subscribe: consumed {} messages in {:.2}s ({:.0} msg/s)",
        consumed,
        consume_duration.as_secs_f64(),
        consumed as f64 / consume_duration.as_secs_f64()
    );

    assert_eq!(consumed, total, "expected all messages to be consumed");
}

// ---------------------------------------------------------------------------
// Throughput comparison: gRPC vs capnp
// ---------------------------------------------------------------------------

async fn bench_grpc_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
    let endpoint = cluster.node(0).grpc_endpoint();
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint,
        ..Default::default()
    })
    .await
    .unwrap();

    let payload = vec![0u8; 128];
    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("bench-grpc", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    total as f64 / start.elapsed().as_secs_f64()
}

async fn bench_capnp_publish(cluster: &TestCluster, total: u64, batch_size: usize) -> f64 {
    let endpoint = cluster.node(0).capnp_endpoint();
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint,
        ..Default::default()
    })
    .await
    .unwrap();

    let payload = vec![0u8; 128];
    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("bench-capnp", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    total as f64 / start.elapsed().as_secs_f64()
}

#[tokio::test]
async fn capnp_vs_grpc_throughput() {
    let cluster = TestCluster::start(1).await;

    let grpc_rate = bench_grpc_publish(&cluster, 100_000, 500).await;
    let capnp_rate = bench_capnp_publish(&cluster, 100_000, 500).await;

    eprintln!("=== THROUGHPUT COMPARISON (single producer, 100K msgs x 128B) ===");
    eprintln!("gRPC: {:.0} msg/s", grpc_rate);
    eprintln!("capnp: {:.0} msg/s", capnp_rate);
    eprintln!("ratio: {:.2}x", capnp_rate / grpc_rate);
}
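
// Note: both rates come from a single pass against the same freshly started
// node, with the gRPC pass running first, so any one-off startup cost lands
// in the gRPC number. Treat the printed ratio as a rough signal from one run
// rather than a rigorous benchmark.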
763
crates/sq-server/tests/cluster_test.rs
Normal file
@@ -0,0 +1,763 @@
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

use sq_cluster::membership::{Membership, MembershipConfig};
use sq_grpc_interface::{
    cluster_service_client::ClusterServiceClient,
    cluster_service_server::ClusterServiceServer,
    control_plane_service_client::ControlPlaneServiceClient,
    control_plane_service_server::ControlPlaneServiceServer,
    data_plane_service_client::DataPlaneServiceClient,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    ClusterNodeInfo, CreateTopicRequest, DeleteTopicRequest, DescribeTopicRequest,
    FetchSegmentRequest, GetStatusRequest, HeartbeatRequest, JoinRequest, ListTopicsRequest,
    ReplicateEntriesRequest, SubscribeRequest,
};
use sq_sdk::{GrpcConsumer, GrpcConsumerConfig, GrpcProducer, GrpcProducerConfig};
use sq_server::grpc::{cluster, control_plane, data_plane, health};
use sq_server::state::{Config, State};
use tempfile::TempDir;
use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;

// ---------------------------------------------------------------------------
// Test harness
// ---------------------------------------------------------------------------

struct TestNode {
    addr: SocketAddr,
    #[allow(dead_code)]
    node_id: String,
    #[allow(dead_code)]
    state: State,
    membership: Arc<Membership>,
    cancel: CancellationToken,
    pipeline_cancel: CancellationToken,
    _temp_dir: TempDir,
    _server_handle: tokio::task::JoinHandle<()>,
}

impl TestNode {
    fn endpoint(&self) -> String {
        format!("http://{}", self.addr)
    }
}

struct TestCluster {
    nodes: Vec<TestNode>,
}

impl TestCluster {
    /// Start a cluster of `n` real SQ server nodes on random ports.
    async fn start(n: usize) -> Self {
        // Phase 1: Bind all listeners to get ports before starting servers.
        let mut listeners = Vec::new();
        let mut addrs = Vec::new();

        for _ in 0..n {
            let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
            let addr = listener.local_addr().unwrap();
            addrs.push(addr);
            listeners.push(listener);
        }

        // Phase 2: Start each node.
        let mut nodes = Vec::new();
        for (i, listener) in listeners.into_iter().enumerate() {
            let addr = addrs[i];
            let node_id = format!("node-{}", i + 1);
            let temp_dir = TempDir::new().unwrap();

            // Seeds: all addresses except our own.
            let seeds: Vec<String> = addrs
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != i)
                .map(|(_, a)| a.to_string())
                .collect();

            let config = Config {
                node_id: node_id.clone(),
                data_dir: temp_dir.path().to_path_buf(),
                seeds: seeds.clone(),
                grpc_address: addr.to_string(),
                cluster_id: "test-cluster".to_string(),
                s3_bucket: None,
                s3_endpoint: None,
                s3_region: None,
                sync_policy: sq_models::SyncPolicy::EveryBatch,
            };

            let (state, mut pipeline) = State::new(config).unwrap();

            // Spawn the write pipeline for this node.
            let pipeline_cancel = CancellationToken::new();
            let pipeline_cancel_clone = pipeline_cancel.clone();
            tokio::spawn(async move {
                tokio::select! {
                    () = pipeline.run() => {}
                    () = pipeline_cancel_clone.cancelled() => {}
                }
            });

            let membership = Arc::new(Membership::new(MembershipConfig {
                node_id: node_id.clone(),
                address: addr.to_string(),
                seeds,
                ..Default::default()
            }));

            let cancel = CancellationToken::new();
            let cancel_clone = cancel.clone();
            let state_clone = state.clone();
            let membership_clone = membership.clone();

            let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);

            let server_handle = tokio::spawn(async move {
                tonic::transport::Server::builder()
                    .add_service(StatusServiceServer::new(health::HealthServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
                        state: state_clone.clone(),
                    }))
                    .add_service(ControlPlaneServiceServer::new(
                        control_plane::ControlPlaneServer {
                            state: state_clone.clone(),
                        },
                    ))
                    .add_service(ClusterServiceServer::new(cluster::ClusterServer {
                        state: state_clone,
                        membership: membership_clone,
                    }))
                    .serve_with_incoming_shutdown(incoming, async move {
                        cancel_clone.cancelled().await;
                    })
                    .await
                    .unwrap();
            });

            nodes.push(TestNode {
                addr,
                node_id,
                state,
                membership,
                cancel,
                pipeline_cancel,
                _temp_dir: temp_dir,
                _server_handle: server_handle,
            });
        }

        // Phase 3: Wait for all servers to be ready.
        for node in &nodes {
            wait_for_ready(&node.endpoint()).await;
        }

        TestCluster { nodes }
    }

    fn node(&self, index: usize) -> &TestNode {
        &self.nodes[index]
    }
}

impl Drop for TestCluster {
    fn drop(&mut self) {
        for node in &self.nodes {
            node.pipeline_cancel.cancel();
            node.cancel.cancel();
        }
    }
}

/// Poll the Status RPC until the server responds, with a timeout.
async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        match StatusServiceClient::connect(endpoint.to_string()).await {
            Ok(mut client) => {
                if client
                    .status(tonic::Request::new(GetStatusRequest {}))
                    .await
                    .is_ok()
                {
                    return;
                }
            }
            Err(_) => {}
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}

/// Collect messages from a subscribe stream with a timeout.
async fn collect_messages(
    endpoint: &str,
    topic: &str,
    start_offset: u64,
    expected_count: usize,
) -> Vec<sq_grpc_interface::ConsumedMessage> {
    let mut client = DataPlaneServiceClient::connect(endpoint.to_string())
        .await
        .unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: topic.to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(start_offset),
            max_batch_size: 200,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut messages = Vec::new();

    while messages.len() < expected_count {
        match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
            Ok(Some(Ok(batch))) => messages.extend(batch.messages),
            _ => break,
        }
    }

    messages
}

// ---------------------------------------------------------------------------
// Test 1: Single node, 1000 messages via SDK
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_single_node_publish_consume_1000() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Publish 1000 messages via SDK Producer.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..1000u64 {
        let result = producer
            .send("orders", None, format!("msg-{i}").as_bytes())
            .await
            .unwrap();
        assert_eq!(result.offset, i);
        assert_eq!(result.topic, "orders");
    }

    // Consume all 1000 via raw subscribe.
    let messages = collect_messages(&endpoint, "orders", 0, 1000).await;

    assert_eq!(messages.len(), 1000);
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("msg-{i}").as_bytes());
    }
}

// ---------------------------------------------------------------------------
// Test 2: Multi-topic isolation
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_multi_topic_isolation() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let topics = ["alpha", "beta", "gamma"];
    let counts: [usize; 3] = [50, 100, 25];

    // Publish to each topic.
    for (topic, count) in topics.iter().zip(counts.iter()) {
        for i in 0..*count {
            producer
                .send(topic, None, format!("{topic}-{i}").as_bytes())
                .await
                .unwrap();
        }
    }

    // Consume from each topic and verify isolation.
    for (topic, expected_count) in topics.iter().zip(counts.iter()) {
        let messages = collect_messages(&endpoint, topic, 0, *expected_count).await;

        assert_eq!(
            messages.len(),
            *expected_count,
            "topic {topic} expected {expected_count} messages, got {}",
            messages.len()
        );

        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("{topic}-{i}").as_bytes());
        }
    }
}

// ---------------------------------------------------------------------------
// Test 3: Consumer group offset resume
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_consumer_group_offset_resume() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Publish 20 messages.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..20u64 {
        producer
            .send("events", None, format!("msg-{i}").as_bytes())
            .await
            .unwrap();
    }

    // Consumer 1: consume with auto_commit, collect at least 10 messages.
    {
        let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "test-group".to_string(),
            topic: "events".to_string(),
            auto_commit: true,
            ..Default::default()
        })
        .await
        .unwrap();

        let mut received = Vec::new();
        let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
        while received.len() < 10 && tokio::time::Instant::now() < deadline {
            let msgs = consumer.poll().await.unwrap();
            if msgs.is_empty() {
                tokio::time::sleep(Duration::from_millis(50)).await;
                continue;
            }
            received.extend(msgs);
        }
        assert!(
            received.len() >= 10,
            "expected at least 10 messages, got {}",
            received.len()
        );
    }

    // Consumer 2: reconnect with same group, should resume from committed offset.
    {
        let mut consumer = GrpcConsumer::connect(GrpcConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "test-group".to_string(),
            topic: "events".to_string(),
            auto_commit: false,
            ..Default::default()
        })
        .await
        .unwrap();

        let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
        let mut msgs = Vec::new();
        while msgs.is_empty() && tokio::time::Instant::now() < deadline {
            msgs = consumer.poll().await.unwrap();
            if msgs.is_empty() {
                tokio::time::sleep(Duration::from_millis(50)).await;
            }
        }
        assert!(!msgs.is_empty(), "expected messages from resumed consumer");
        // Should start from at least offset 9 (last committed by auto_commit).
        assert!(
            msgs[0].offset >= 9,
            "expected resume from offset >= 9, got {}",
            msgs[0].offset
        );
    }
}

// ---------------------------------------------------------------------------
// Test 4: Topic management CRUD
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_topic_management_crud() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    let mut client = ControlPlaneServiceClient::connect(endpoint.clone())
        .await
        .unwrap();

    // Create topic.
    let resp = client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "orders".to_string(),
            partitions: 4,
            replication_factor: 3,
        }))
        .await
        .unwrap();
    assert_eq!(resp.into_inner().name, "orders");

    // Duplicate should fail.
    let err = client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "orders".to_string(),
            partitions: 4,
            replication_factor: 3,
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::AlreadyExists);

    // Create another.
    client
        .create_topic(tonic::Request::new(CreateTopicRequest {
            name: "events".to_string(),
            partitions: 1,
            replication_factor: 1,
        }))
        .await
        .unwrap();

    // List topics.
    let resp = client
        .list_topics(tonic::Request::new(ListTopicsRequest {}))
        .await
        .unwrap();
    let topics = resp.into_inner().topics;
    assert_eq!(topics.len(), 2);
    let names: Vec<&str> = topics.iter().map(|t| t.name.as_str()).collect();
    assert!(names.contains(&"orders"));
    assert!(names.contains(&"events"));

    // Describe topic.
    let resp = client
        .describe_topic(tonic::Request::new(DescribeTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap()
        .into_inner();
    let topic = resp.topic.unwrap();
    assert_eq!(topic.name, "orders");
    assert_eq!(topic.partitions, 4);
    assert_eq!(topic.replication_factor, 3);
    assert_eq!(resp.partition_info.len(), 4);

    // Describe non-existent topic.
    let err = client
        .describe_topic(tonic::Request::new(DescribeTopicRequest {
            name: "nonexistent".to_string(),
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::NotFound);

    // Delete topic.
    client
        .delete_topic(tonic::Request::new(DeleteTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap();

    // Verify deleted.
    let resp = client
        .list_topics(tonic::Request::new(ListTopicsRequest {}))
        .await
        .unwrap();
    assert_eq!(resp.into_inner().topics.len(), 1);

    // Delete non-existent should fail.
    let err = client
        .delete_topic(tonic::Request::new(DeleteTopicRequest {
            name: "orders".to_string(),
        }))
        .await
        .unwrap_err();
    assert_eq!(err.code(), tonic::Code::NotFound);
}

// ---------------------------------------------------------------------------
// Test 5: Three-node join discovery
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_three_node_join_discovery() {
    let cluster = TestCluster::start(3).await;

    // Node-2 joins node-1.
    let mut client = ClusterServiceClient::connect(cluster.node(0).endpoint())
        .await
        .unwrap();
    let resp = client
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-2".to_string(),
            address: cluster.nodes[1].addr.to_string(),
        }))
        .await
        .unwrap();

    let members = resp.into_inner().members;
    assert!(
        members.len() >= 2,
        "after node-2 join, node-1 should know >= 2 members, got {}",
        members.len()
    );
    let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
    assert!(ids.contains(&"node-1"));
    assert!(ids.contains(&"node-2"));

    // Node-3 joins node-1.
    let resp = client
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-3".to_string(),
            address: cluster.nodes[2].addr.to_string(),
        }))
        .await
        .unwrap();

    let members = resp.into_inner().members;
    assert!(
        members.len() >= 3,
        "after node-3 join, node-1 should know >= 3 members, got {}",
        members.len()
    );
    let ids: Vec<&str> = members.iter().map(|m| m.node_id.as_str()).collect();
    assert!(ids.contains(&"node-1"));
    assert!(ids.contains(&"node-2"));
    assert!(ids.contains(&"node-3"));

    // Verify via membership handle.
    let all = cluster.node(0).membership.all_members().await;
    assert_eq!(all.len(), 3);
}

// ---------------------------------------------------------------------------
// Test 6: Cross-node heartbeat gossip
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_cross_node_heartbeat_gossip() {
    let cluster = TestCluster::start(3).await;

    // Node-2 and node-3 join node-1.
    let mut client1 = ClusterServiceClient::connect(cluster.node(0).endpoint())
        .await
        .unwrap();
    client1
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-2".to_string(),
            address: cluster.nodes[1].addr.to_string(),
        }))
        .await
        .unwrap();
    client1
        .join(tonic::Request::new(JoinRequest {
            node_id: "node-3".to_string(),
            address: cluster.nodes[2].addr.to_string(),
        }))
        .await
        .unwrap();

    // Node-1 now knows about all 3. Send heartbeat to node-2 carrying this info.
    let all_members = cluster.node(0).membership.all_members().await;
    let known: Vec<ClusterNodeInfo> = all_members
        .iter()
        .map(|m| ClusterNodeInfo {
            node_id: m.node_id.clone(),
            address: m.address.clone(),
            status: m.status.to_string(),
        })
        .collect();

    let mut client2 = ClusterServiceClient::connect(cluster.node(1).endpoint())
        .await
        .unwrap();
    let resp = client2
        .heartbeat(tonic::Request::new(HeartbeatRequest {
            node_id: "node-1".to_string(),
            known_members: known,
        }))
        .await
        .unwrap();

    // Node-2 should now know about all 3 nodes via gossip.
    let node2_members = resp.into_inner().members;
    assert!(
        node2_members.len() >= 3,
        "node-2 should know >= 3 members after gossip, got {}",
        node2_members.len()
    );
}

// ---------------------------------------------------------------------------
// Test 7: Cross-node replication via RPC
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_cross_node_replication_via_rpc() {
    let cluster = TestCluster::start(2).await;

    // Publish 10 messages to node-1 via SDK.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: cluster.node(0).endpoint(),
        ..Default::default()
    })
    .await
    .unwrap();

    let mut entry_data = Vec::new();
    for i in 0..10u64 {
        let value = format!("replicated-{i}");
        producer
            .send("repl-topic", None, value.as_bytes())
            .await
            .unwrap();
        entry_data.push(value.into_bytes());
    }

    // Replicate the same data to node-2 via ClusterService RPC.
    let mut cluster_client = ClusterServiceClient::connect(cluster.node(1).endpoint())
        .await
        .unwrap();
    let resp = cluster_client
        .replicate_entries(tonic::Request::new(ReplicateEntriesRequest {
            topic: "repl-topic".to_string(),
            partition: 0,
            entries: entry_data,
        }))
        .await
        .unwrap();

    let last_offset = resp.into_inner().last_replicated_offset;
    assert_eq!(last_offset, 9);

    // Read from node-2 to verify the data is there.
    let messages = collect_messages(&cluster.node(1).endpoint(), "repl-topic", 0, 10).await;

    assert_eq!(messages.len(), 10);
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("replicated-{i}").as_bytes());
    }
}

// ---------------------------------------------------------------------------
// Test 8: FetchSegment recovery
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_fetch_segment_recovery() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();

    // Write 50 messages.
    let mut producer = GrpcProducer::connect(GrpcProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for i in 0..50u64 {
        producer
            .send("recovery-topic", None, format!("data-{i}").as_bytes())
            .await
            .unwrap();
    }

    // Fetch via FetchSegment stream.
    let mut client = ClusterServiceClient::connect(endpoint).await.unwrap();
    let response = client
        .fetch_segment(tonic::Request::new(FetchSegmentRequest {
            topic: "recovery-topic".to_string(),
            partition: 0,
            from_offset: 0,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_chunks = Vec::new();

    while let Ok(Some(Ok(resp))) =
        tokio::time::timeout(Duration::from_secs(5), stream.next()).await
    {
        all_chunks.extend(resp.chunk);
    }

    // Decode the wire format: offset(8 LE) + value_len(4 LE) + value
    let mut cursor = 0;
    let mut decoded = Vec::new();
    while cursor + 12 <= all_chunks.len() {
        let offset = u64::from_le_bytes(all_chunks[cursor..cursor + 8].try_into().unwrap());
        let value_len =
            u32::from_le_bytes(all_chunks[cursor + 8..cursor + 12].try_into().unwrap()) as usize;
        cursor += 12;
        assert!(cursor + value_len <= all_chunks.len());
        let value = all_chunks[cursor..cursor + value_len].to_vec();
        cursor += value_len;
        decoded.push((offset, value));
    }

    assert_eq!(decoded.len(), 50);
    for (i, (offset, value)) in decoded.iter().enumerate() {
        assert_eq!(*offset, i as u64);
        assert_eq!(value, format!("data-{i}").as_bytes());
    }
}
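
// A minimal encoder for the same segment wire layout, included as a hedged
// reference sketch: the field order and little-endian widths mirror the decode
// loop above, but `encode_segment_entry` is illustrative and not part of the
// sq-server API.
#[allow(dead_code)]
fn encode_segment_entry(offset: u64, value: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(12 + value.len());
    buf.extend_from_slice(&offset.to_le_bytes()); // offset: 8 bytes, little-endian
    buf.extend_from_slice(&(value.len() as u32).to_le_bytes()); // value_len: 4 bytes, little-endian
    buf.extend_from_slice(value); // raw value bytes
    buf
}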

// ---------------------------------------------------------------------------
// Test 9: Node status returns correct id
// ---------------------------------------------------------------------------

#[tokio::test]
async fn test_node_status_returns_correct_id() {
    let cluster = TestCluster::start(3).await;

    for (i, node) in cluster.nodes.iter().enumerate() {
        let mut client = StatusServiceClient::connect(node.endpoint()).await.unwrap();
        let resp = client
            .status(tonic::Request::new(GetStatusRequest {}))
            .await
            .unwrap();
        let expected = format!("node-{}", i + 1);
        assert_eq!(
            resp.into_inner().node_id,
            expected,
            "node at index {} should have id '{}'",
            i,
            expected
        );
    }
}
496
crates/sq-server/tests/data_plane_test.rs
Normal file
@@ -0,0 +1,496 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;

use sq_grpc_interface::{
    data_plane_service_client::DataPlaneServiceClient,
    data_plane_service_server::DataPlaneServiceServer,
    status_service_client::StatusServiceClient,
    status_service_server::StatusServiceServer,
    AckMode, GetStatusRequest, MessageHeader, PublishMessage, PublishRequest, PublishSettings,
    SubscribeRequest,
};
use sq_sim::fs::InMemoryFileSystem;
use sq_sim::SimClock;
use sq_storage::engine::StorageEngine;
use tokio::sync::Mutex;
use tokio_stream::StreamExt;

/// A lightweight test harness that starts a gRPC server on a random port
/// and hands back the bound address plus a shutdown handle.
struct TestServer {
    addr: SocketAddr,
    _shutdown: tokio::sync::oneshot::Sender<()>,
}

impl TestServer {
    async fn start() -> Self {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = sq_models::WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        let engine = StorageEngine::new(fs, clock, config).unwrap();
        engine.recover().unwrap();

        let engine = Arc::new(Mutex::new(engine));

        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();

        let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();

        // Define minimal inline service implementations used only by this test server.
        let node_id = "test-node".to_string();

        struct TestHealthServer {
            node_id: String,
        }

        #[tonic::async_trait]
        impl sq_grpc_interface::status_service_server::StatusService for TestHealthServer {
            async fn status(
                &self,
                _request: tonic::Request<GetStatusRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::GetStatusResponse>, tonic::Status> {
                Ok(tonic::Response::new(sq_grpc_interface::GetStatusResponse {
                    node_id: self.node_id.clone(),
                    cluster: None,
                }))
            }
        }

        struct TestDataPlaneServer {
            engine: Arc<Mutex<StorageEngine<InMemoryFileSystem, SimClock>>>,
        }

        #[tonic::async_trait]
        impl sq_grpc_interface::data_plane_service_server::DataPlaneService
            for TestDataPlaneServer
        {
            async fn publish(
                &self,
                request: tonic::Request<PublishRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::PublishResponse>, tonic::Status> {
                let req = request.into_inner();

                if req.messages.is_empty() {
                    return Err(tonic::Status::invalid_argument(
                        "messages must not be empty",
                    ));
                }

                let mut results = Vec::new();
                let engine = self.engine.lock().await;

                for msg in &req.messages {
                    if msg.topic.is_empty() {
                        return Err(tonic::Status::invalid_argument("topic must not be empty"));
                    }

                    let headers: Vec<sq_models::Header> = msg
                        .headers
                        .iter()
                        .map(|h| sq_models::Header {
                            key: h.key.clone(),
                            value: h.value.clone(),
                        })
                        .collect();

                    let key = if msg.key.is_empty() {
                        None
                    } else {
                        Some(msg.key.as_slice())
                    };

                    let offset = engine
                        .append(&msg.topic, 0, key, &msg.value, &headers, 0)
                        .map_err(|e| tonic::Status::internal(e.to_string()))?;

                    results.push(sq_grpc_interface::PublishResult {
                        topic: msg.topic.clone(),
                        partition: 0,
                        offset,
                    });
                }

                Ok(tonic::Response::new(sq_grpc_interface::PublishResponse {
                    results,
                }))
            }

            type SubscribeStream = std::pin::Pin<
                Box<
                    dyn tokio_stream::Stream<
                            Item = Result<sq_grpc_interface::SubscribeResponse, tonic::Status>,
                        > + Send
                        + 'static,
                >,
            >;

            async fn subscribe(
                &self,
                request: tonic::Request<SubscribeRequest>,
            ) -> Result<tonic::Response<Self::SubscribeStream>, tonic::Status> {
                let req = request.into_inner();
                let batch_size = if req.max_batch_size == 0 {
                    100
                } else {
                    req.max_batch_size as usize
                };
                let start_offset = req.start_offset.unwrap_or(0);
                let topic = req.topic.clone();
                let partition = req.partition;
                let engine = self.engine.clone();

                let stream = async_stream::try_stream! {
                    let mut current_offset = start_offset;
                    let mut empty_polls = 0u32;

                    loop {
                        let messages = {
                            let eng = engine.lock().await;
                            eng.read(&topic, partition, current_offset, batch_size)
                                .map_err(|e| tonic::Status::internal(e.to_string()))?
                        };

                        if messages.is_empty() {
                            empty_polls += 1;
                            // In tests, stop after a few empty polls to avoid hanging.
                            if empty_polls > 3 {
                                break;
                            }
                            tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
                            continue;
                        }

                        empty_polls = 0;

                        let consumed: Vec<sq_grpc_interface::ConsumedMessage> = messages
                            .iter()
                            .map(|m| {
                                current_offset = m.offset + 1;
                                sq_grpc_interface::ConsumedMessage {
                                    offset: m.offset,
                                    topic: m.topic.to_string(),
                                    partition: m.partition,
                                    key: m.key.clone().unwrap_or_default(),
                                    value: m.value.clone(),
                                    headers: m
                                        .headers
                                        .iter()
                                        .map(|h| MessageHeader {
                                            key: h.key.clone(),
                                            value: h.value.clone(),
                                        })
                                        .collect(),
                                    timestamp_ms: m.timestamp_ms,
                                }
                            })
                            .collect();

                        yield sq_grpc_interface::SubscribeResponse { messages: consumed };
                    }
                };

                Ok(tonic::Response::new(Box::pin(stream)))
            }

            async fn ack(
                &self,
                _request: tonic::Request<sq_grpc_interface::AckRequest>,
            ) -> Result<tonic::Response<sq_grpc_interface::AckResponse>, tonic::Status> {
                Ok(tonic::Response::new(sq_grpc_interface::AckResponse {}))
            }
        }

        let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener);

        tokio::spawn(async move {
            tonic::transport::Server::builder()
                .add_service(StatusServiceServer::new(TestHealthServer {
                    node_id: node_id.clone(),
                }))
                .add_service(DataPlaneServiceServer::new(TestDataPlaneServer {
                    engine,
                }))
                .serve_with_incoming_shutdown(incoming, async {
                    let _ = shutdown_rx.await;
                })
                .await
                .unwrap();
        });

        // Give the server a moment to start.
        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;

        TestServer {
            addr,
            _shutdown: shutdown_tx,
        }
    }

    fn endpoint(&self) -> String {
        format!("http://{}", self.addr)
    }
}

#[tokio::test]
async fn test_status_rpc() {
    let server = TestServer::start().await;
    let mut client = StatusServiceClient::connect(server.endpoint()).await.unwrap();

    let response = client
        .status(tonic::Request::new(GetStatusRequest {}))
        .await
        .unwrap();

    assert_eq!(response.into_inner().node_id, "test-node");
}

#[tokio::test]
async fn test_publish_single_message() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "orders".to_string(),
                key: vec![],
                value: b"hello world".to_vec(),
                headers: vec![],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].topic, "orders");
    assert_eq!(results[0].offset, 0);
}

#[tokio::test]
async fn test_publish_batch_sequential_offsets() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let messages: Vec<PublishMessage> = (0..100)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: Some(PublishSettings {
                ack_mode: AckMode::All.into(),
            }),
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 100);
    for (i, r) in results.iter().enumerate() {
        assert_eq!(r.offset, i as u64);
        assert_eq!(r.topic, "events");
    }
}

#[tokio::test]
async fn test_publish_empty_topic_returns_error() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let err = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "".to_string(),
                key: vec![],
                value: b"data".to_vec(),
                headers: vec![],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap_err();

    assert_eq!(err.code(), tonic::Code::InvalidArgument);
}

#[tokio::test]
async fn test_publish_empty_messages_returns_error() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let err = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap_err();

    assert_eq!(err.code(), tonic::Code::InvalidArgument);
}

#[tokio::test]
async fn test_publish_with_key_and_headers() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    let response = client
        .publish(tonic::Request::new(PublishRequest {
            messages: vec![PublishMessage {
                topic: "orders".to_string(),
                key: b"order-123".to_vec(),
                value: b"payload".to_vec(),
                headers: vec![MessageHeader {
                    key: "trace-id".to_string(),
                    value: b"abc-123".to_vec(),
                }],
            }],
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    let results = response.into_inner().results;
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].offset, 0);
}

#[tokio::test]
async fn test_subscribe_from_beginning() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    // Publish 10 messages first.
    let messages: Vec<PublishMessage> = (0..10)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    // Subscribe from offset 0.
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "events".to_string(),
            partition: 0,
            consumer_group: "".to_string(),
            start_offset: Some(0),
            max_batch_size: 100,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_messages = Vec::new();

    while let Some(Ok(batch)) = stream.next().await {
        all_messages.extend(batch.messages);
        if all_messages.len() >= 10 {
            break;
        }
    }

    assert_eq!(all_messages.len(), 10);
    for (i, msg) in all_messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
        assert_eq!(msg.value, format!("msg-{i}").as_bytes());
        assert_eq!(msg.topic, "events");
    }
}

#[tokio::test]
async fn test_subscribe_from_middle() {
    let server = TestServer::start().await;
    let mut client = DataPlaneServiceClient::connect(server.endpoint())
        .await
        .unwrap();

    // Publish 10 messages.
    let messages: Vec<PublishMessage> = (0..10)
        .map(|i| PublishMessage {
            topic: "events".to_string(),
            key: vec![],
            value: format!("msg-{i}").into_bytes(),
            headers: vec![],
        })
        .collect();

    client
        .publish(tonic::Request::new(PublishRequest {
            messages,
            settings: None,
            producer_id: "test".to_string(),
        }))
        .await
        .unwrap();

    // Subscribe from offset 5.
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "events".to_string(),
            partition: 0,
            consumer_group: "".to_string(),
            start_offset: Some(5),
            max_batch_size: 100,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut all_messages = Vec::new();

    while let Some(Ok(batch)) = stream.next().await {
        all_messages.extend(batch.messages);
        if all_messages.len() >= 5 {
            break;
        }
    }

    assert_eq!(all_messages.len(), 5);
    assert_eq!(all_messages[0].offset, 5);
    assert_eq!(all_messages[4].offset, 9);
}
965
crates/sq-server/tests/stress_test.rs
Normal file
@@ -0,0 +1,965 @@
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use sq_cluster::membership::{Membership, MembershipConfig};
|
||||
use sq_grpc_interface::{
|
||||
cluster_service_server::ClusterServiceServer,
|
||||
control_plane_service_server::ControlPlaneServiceServer,
|
||||
data_plane_service_server::DataPlaneServiceServer,
|
||||
status_service_client::StatusServiceClient,
|
||||
status_service_server::StatusServiceServer,
|
||||
GetStatusRequest, SubscribeRequest,
|
||||
};
|
||||
use sq_grpc_interface::data_plane_service_client::DataPlaneServiceClient;
|
||||
use sq_sdk::{
|
||||
BatchProducer, BatchProducerConfig, Consumer, ConsumerConfig, Producer, ProducerConfig,
|
||||
ProducerMessage,
|
||||
};
|
||||
use sq_server::capnp::CapnpServer;
|
||||
use sq_server::grpc::{cluster, control_plane, data_plane, health};
|
||||
use sq_server::state::{Config, State};
|
||||
use tempfile::TempDir;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test harness (shared with cluster_test.rs, inlined here for simplicity)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct TestNode {
|
||||
grpc_addr: SocketAddr,
|
||||
capnp_addr: SocketAddr,
|
||||
cancel: CancellationToken,
|
||||
pipeline_cancel: CancellationToken,
|
||||
_temp_dir: TempDir,
|
||||
_server_handle: tokio::task::JoinHandle<()>,
|
||||
_capnp_handle: tokio::task::JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl TestNode {
|
||||
/// Cap'n Proto endpoint (default data plane).
|
||||
fn endpoint(&self) -> String {
|
||||
self.capnp_addr.to_string()
|
||||
}
|
||||
|
||||
/// gRPC endpoint (health checks, subscribe verification).
|
||||
fn grpc_endpoint(&self) -> String {
|
||||
format!("http://{}", self.grpc_addr)
|
||||
}
|
||||
}
|
||||
|
||||
struct TestCluster {
|
||||
nodes: Vec<TestNode>,
|
||||
}
|
||||
|
||||
impl TestCluster {
|
||||
async fn start(n: usize) -> Self {
|
||||
let mut grpc_listeners = Vec::new();
|
||||
let mut capnp_listeners = Vec::new();
|
||||
let mut grpc_addrs = Vec::new();
|
||||
let mut capnp_addrs = Vec::new();
|
||||
|
||||
for _ in 0..n {
|
||||
let grpc_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let capnp_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
grpc_addrs.push(grpc_listener.local_addr().unwrap());
|
||||
capnp_addrs.push(capnp_listener.local_addr().unwrap());
|
||||
grpc_listeners.push(grpc_listener);
|
||||
capnp_listeners.push(capnp_listener);
|
||||
}
|
||||
|
||||
let mut nodes = Vec::new();
|
||||
for (i, (grpc_listener, capnp_listener)) in
|
||||
grpc_listeners.into_iter().zip(capnp_listeners).enumerate()
|
||||
{
|
||||
let grpc_addr = grpc_addrs[i];
|
||||
let capnp_addr = capnp_addrs[i];
|
||||
let node_id = format!("stress-node-{}", i + 1);
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
|
||||
let seeds: Vec<String> = grpc_addrs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(j, _)| *j != i)
|
||||
.map(|(_, a)| a.to_string())
|
||||
.collect();
|
||||
|
||||
let config = Config {
|
||||
node_id: node_id.clone(),
|
||||
data_dir: temp_dir.path().to_path_buf(),
|
||||
seeds: seeds.clone(),
|
||||
grpc_address: grpc_addr.to_string(),
|
||||
cluster_id: "test-cluster".to_string(),
|
||||
s3_bucket: None,
|
||||
s3_endpoint: None,
|
||||
s3_region: None,
|
||||
sync_policy: sq_models::SyncPolicy::EveryBatch,
|
||||
};
|
||||
|
||||
let (state, mut pipeline) = State::new(config).unwrap();
|
||||
|
||||
let pipeline_cancel = CancellationToken::new();
|
||||
let pipeline_cancel_clone = pipeline_cancel.clone();
|
||||
tokio::spawn(async move {
|
||||
tokio::select! {
|
||||
() = pipeline.run() => {}
|
||||
() = pipeline_cancel_clone.cancelled() => {}
|
||||
}
|
||||
});
|
||||
|
||||
let membership = Arc::new(Membership::new(MembershipConfig {
|
||||
node_id: node_id.clone(),
|
||||
address: grpc_addr.to_string(),
|
||||
seeds,
|
||||
..Default::default()
|
||||
}));
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Spawn gRPC server.
|
||||
let cancel_clone = cancel.clone();
|
||||
let state_clone = state.clone();
|
||||
let membership_clone = membership.clone();
|
||||
let incoming = tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
|
||||
let server_handle = tokio::spawn(async move {
|
||||
tonic::transport::Server::builder()
|
||||
.add_service(StatusServiceServer::new(health::HealthServer {
|
||||
state: state_clone.clone(),
|
||||
}))
|
||||
.add_service(DataPlaneServiceServer::new(data_plane::DataPlaneServer {
|
||||
state: state_clone.clone(),
|
||||
}))
|
||||
.add_service(ControlPlaneServiceServer::new(
|
||||
control_plane::ControlPlaneServer {
|
||||
state: state_clone.clone(),
|
||||
},
|
||||
))
|
||||
.add_service(ClusterServiceServer::new(cluster::ClusterServer {
|
||||
state: state_clone,
|
||||
membership: membership_clone,
|
||||
}))
|
||||
.serve_with_incoming_shutdown(incoming, async move {
|
||||
cancel_clone.cancelled().await;
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Spawn capnp server.
|
||||
let cancel_clone = cancel.clone();
|
||||
let capnp_state = state.clone();
|
||||
let capnp_handle = tokio::spawn(async move {
|
||||
let server = CapnpServer {
|
||||
host: capnp_addr,
|
||||
state: capnp_state,
|
||||
};
                drop(capnp_listener);
                let _ = notmad::Component::run(&server, cancel_clone).await;
            });

            nodes.push(TestNode {
                grpc_addr,
                capnp_addr,
                cancel,
                pipeline_cancel,
                _temp_dir: temp_dir,
                _server_handle: server_handle,
                _capnp_handle: capnp_handle,
            });
        }

        for node in &nodes {
            wait_for_ready(&node.grpc_endpoint()).await;
        }
        // Give capnp server a moment to bind.
        tokio::time::sleep(Duration::from_millis(50)).await;

        TestCluster { nodes }
    }

    fn node(&self, index: usize) -> &TestNode {
        &self.nodes[index]
    }
}

impl Drop for TestCluster {
    fn drop(&mut self) {
        for node in &self.nodes {
            node.pipeline_cancel.cancel();
            node.cancel.cancel();
        }
    }
}

async fn wait_for_ready(endpoint: &str) {
    let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_secs(5);
    loop {
        if tokio::time::Instant::now() > deadline {
            panic!("Server at {} did not become ready in time", endpoint);
        }
        if let Ok(mut client) = StatusServiceClient::connect(endpoint.to_string()).await {
            if client
                .status(tonic::Request::new(GetStatusRequest {}))
                .await
                .is_ok()
            {
                return;
            }
        }
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    }
}

// ---------------------------------------------------------------------------
// Stress test 1: High-volume publish — 100K messages from a single producer
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_single_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let total = 100_000u64;
    let batch_size = 500;
    let payload = vec![0u8; 128]; // 128-byte messages

    let start = Instant::now();

    for batch_start in (0..total).step_by(batch_size) {
        let batch_end = (batch_start + batch_size as u64).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("stress-topic", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "stress_single_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back all messages via gRPC subscribe.
    let mut client = DataPlaneServiceClient::connect(grpc_ep)
        .await
        .unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "stress-topic".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;
    let consume_start = Instant::now();
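
    // Drain until every message is seen; a timeout, end-of-stream, or error
    // breaks out, and the assertion below reports any shortfall.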
    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    let consume_duration = consume_start.elapsed();
    let consume_per_sec = consumed as f64 / consume_duration.as_secs_f64();

    eprintln!(
        "stress_single_producer_100k: consumed {} messages in {:.2}s ({:.0} msg/s)",
        consumed,
        consume_duration.as_secs_f64(),
        consume_per_sec
    );

    assert_eq!(consumed, total, "expected all messages to be consumed");
}

// ---------------------------------------------------------------------------
// Stress test 2: Concurrent producers — 10 producers, 10K messages each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = endpoint.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let mut producer = Producer::connect(ProducerConfig {
                address: ep,
                producer_id: format!("producer-{p}"),
                ..Default::default()
            })
            .await
            .unwrap();

            let topic = format!("concurrent-topic-{p}");
            for batch_start in (0..msgs_per_producer).step_by(100) {
                let batch_end = (batch_start + 100).min(msgs_per_producer);
                let batch: Vec<ProducerMessage> = (batch_start..batch_end)
                    .map(|_| ProducerMessage::new(topic.clone(), pl.clone()))
                    .collect();
                producer.send_batch(batch).await.unwrap();
            }
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let duration = start.elapsed();
    let total = num_producers as u64 * msgs_per_producer;
    let msgs_per_sec = total as f64 / duration.as_secs_f64();

    eprintln!(
        "stress_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_producers,
        msgs_per_producer,
        total,
        duration.as_secs_f64(),
        msgs_per_sec
    );

    // Verify each topic has the right count via gRPC.
    for p in 0..num_producers {
        let topic = format!("concurrent-topic-{p}");
        let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
            .await
            .unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: topic.clone(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        while count < msgs_per_producer {
            match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        assert_eq!(
            count, msgs_per_producer,
            "topic {topic} expected {msgs_per_producer} messages, got {count}"
        );
    }
}

// ---------------------------------------------------------------------------
// Stress test 3: Concurrent consumers — publish then read in parallel
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_concurrent_consumers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let total = 50_000u64;
    let payload = vec![0u8; 64];

    // Pre-publish messages.
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("consume-stress", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume in parallel from 5 independent consumers via gRPC (no consumer group — each reads all).
    let num_consumers = 5;
    let start = Instant::now();

    let mut handles = Vec::new();
    for _ in 0..num_consumers {
        let ep = grpc_ep.clone();
        handles.push(tokio::spawn(async move {
            let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
            let response = client
                .subscribe(tonic::Request::new(SubscribeRequest {
                    topic: "consume-stress".to_string(),
                    partition: 0,
                    consumer_group: String::new(),
                    start_offset: Some(0),
                    max_batch_size: 1000,
                }))
                .await
                .unwrap();

            let mut stream = response.into_inner();
            let mut count = 0u64;
            while count < total {
                match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
                    Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                    _ => break,
                }
            }
            count
        }));
    }

    for handle in handles {
        let count = handle.await.unwrap();
        assert_eq!(count, total, "each consumer should read all {total} messages");
    }

    let duration = start.elapsed();
    eprintln!(
        "stress_concurrent_consumers: {} consumers each read {} msgs in {:.2}s",
        num_consumers,
        total,
        duration.as_secs_f64()
    );
}

// ---------------------------------------------------------------------------
// Stress test 4: Sustained load — publish+consume simultaneously over time
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_sustained_load() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let sustain_duration = Duration::from_secs(3);
    let payload = vec![0u8; 256];

    let ep = endpoint.clone();
    let pl = payload.clone();

    // Producer: publish as fast as possible for the sustained duration.
    let producer_handle = tokio::spawn(async move {
        let mut producer = Producer::connect(ProducerConfig {
            address: ep,
            ..Default::default()
        })
        .await
        .unwrap();

        let start = Instant::now();
        let mut total = 0u64;
        while start.elapsed() < sustain_duration {
            let batch: Vec<ProducerMessage> = (0..100)
                .map(|_| ProducerMessage::new("sustained-topic", pl.clone()))
                .collect();
            producer.send_batch(batch).await.unwrap();
            total += 100;
        }
        (total, start.elapsed())
    });

    // Give producer a head start.
    tokio::time::sleep(Duration::from_millis(100)).await;

    // Consumer: read as fast as possible via gRPC subscribe.
    let ep = grpc_ep.clone();
    let consumer_handle = tokio::spawn(async move {
        let mut client = DataPlaneServiceClient::connect(ep).await.unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: "sustained-topic".to_string(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        let start = Instant::now();

        // Read for longer than the producer runs to drain everything.
        let read_deadline = sustain_duration + Duration::from_secs(5);
        while start.elapsed() < read_deadline {
            match tokio::time::timeout(Duration::from_secs(2), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        count
    });

    let (published, pub_duration) = producer_handle.await.unwrap();
    let consumed = consumer_handle.await.unwrap();

    let pub_rate = published as f64 / pub_duration.as_secs_f64();
    let throughput_mb =
        (published as f64 * 256.0) / (1024.0 * 1024.0) / pub_duration.as_secs_f64();

    eprintln!(
        "stress_sustained_load: published {} in {:.2}s ({:.0} msg/s, {:.1} MB/s), consumed {}",
        published,
        pub_duration.as_secs_f64(),
        pub_rate,
        throughput_mb,
        consumed
    );

    assert!(
        published > 0,
        "should have published messages during sustained load"
    );
    assert_eq!(consumed, published, "consumer should eventually read all published messages");
}

// ---------------------------------------------------------------------------
// Stress test 5: Multi-topic fan-out — publish to many topics simultaneously
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_multi_topic_fanout() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();
    let num_topics = 50;
    let msgs_per_topic = 1_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    // Publish to many topics in round-robin batches.
    for batch_start in (0..msgs_per_topic).step_by(100) {
        let batch_end = (batch_start + 100).min(msgs_per_topic);
        for t in 0..num_topics {
            let topic = format!("fanout-{t}");
            let batch: Vec<ProducerMessage> = (batch_start..batch_end)
                .map(|_| ProducerMessage::new(topic.clone(), payload.clone()))
                .collect();
            producer.send_batch(batch).await.unwrap();
        }
    }

    let duration = start.elapsed();
    let total = num_topics as u64 * msgs_per_topic;
    eprintln!(
        "stress_multi_topic_fanout: {} topics x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_topics,
        msgs_per_topic,
        total,
        duration.as_secs_f64(),
        total as f64 / duration.as_secs_f64()
    );

    // Spot-check a few topics via gRPC.
    for t in [0, num_topics / 2, num_topics - 1] {
        let topic = format!("fanout-{t}");
        let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
            .await
            .unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: topic.clone(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        while count < msgs_per_topic {
            match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        assert_eq!(
            count, msgs_per_topic,
            "topic {topic} expected {msgs_per_topic} messages, got {count}"
        );
    }
}

// ---------------------------------------------------------------------------
// Stress test 6: Large message bodies — 10K messages with 4KB payloads
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_large_messages() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let total = 10_000u64;
    let payload = vec![0xABu8; 4096]; // 4KB messages

    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    let start = Instant::now();

    for batch_start in (0..total).step_by(50) {
        let batch_end = (batch_start + 50).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("large-msgs", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    let pub_duration = start.elapsed();
    let data_mb = (total as f64 * 4096.0) / (1024.0 * 1024.0);
    eprintln!(
        "stress_large_messages: published {} x 4KB = {:.1}MB in {:.2}s ({:.1} MB/s)",
        total,
        data_mb,
        pub_duration.as_secs_f64(),
        data_mb / pub_duration.as_secs_f64()
    );

    // Verify all data reads back correctly via gRPC.
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "large-msgs".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 200,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut count = 0u64;
    while count < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => {
                for msg in &batch.messages {
                    assert_eq!(msg.value.len(), 4096, "message body should be 4KB");
                    assert!(msg.value.iter().all(|&b| b == 0xAB), "data integrity check");
                }
                count += batch.messages.len() as u64;
            }
            _ => break,
        }
    }

    assert_eq!(count, total, "all large messages should be consumed");
}

// ---------------------------------------------------------------------------
// Stress test 7: Consumer group offset tracking under load
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_consumer_group_resume() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let total = 10_000u64;
    let payload = vec![0u8; 32];

    // Publish all messages.
    let mut producer = Producer::connect(ProducerConfig {
        address: endpoint.clone(),
        ..Default::default()
    })
    .await
    .unwrap();

    for batch_start in (0..total).step_by(500) {
        let batch_end = (batch_start + 500).min(total);
        let batch: Vec<ProducerMessage> = (batch_start..batch_end)
            .map(|_| ProducerMessage::new("cg-stress", payload.clone()))
            .collect();
        producer.send_batch(batch).await.unwrap();
    }

    // Consume first half with auto-commit.
    let half = total / 2;
    {
        let mut consumer = Consumer::connect(ConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "stress-group".to_string(),
            topic: "cg-stress".to_string(),
            auto_commit: true,
            max_poll_records: 500,
            ..Default::default()
        })
        .await
        .unwrap();

        let mut consumed = 0u64;
        while consumed < half {
            let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
                .await
                .unwrap()
                .unwrap();
            consumed += msgs.len() as u64;
        }
        assert!(consumed >= half, "should have consumed at least half");
    }

    // Reconnect — should resume from the committed offset.
    {
        let mut consumer = Consumer::connect(ConsumerConfig {
            address: endpoint.clone(),
            consumer_group: "stress-group".to_string(),
            topic: "cg-stress".to_string(),
            auto_commit: true,
            max_poll_records: 500,
            ..Default::default()
        })
        .await
        .unwrap();

        let msgs = tokio::time::timeout(Duration::from_secs(5), consumer.poll())
            .await
            .unwrap()
            .unwrap();

        // First message after reconnect should be at or after the halfway point.
        assert!(
            !msgs.is_empty(),
            "should receive messages after resume"
        );
        let first_offset = msgs[0].offset;
        assert!(
            first_offset >= half - 500, // Allow some re-delivery due to batch commit
            "first offset after resume should be near {half}, got {first_offset}"
        );
    }
}

// ---------------------------------------------------------------------------
// Stress test 8: BatchProducer — 100K messages from a single batching producer
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_batch_producer_100k() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let producer = BatchProducer::connect(BatchProducerConfig {
        address: endpoint.clone(),
        max_batch_size: 1000,
        flush_interval_ms: 5,
        channel_capacity: 20_000,
        ..Default::default()
    })
    .await
    .unwrap();

    let producer = Arc::new(producer);
    let total = 100_000u64;
    let payload = vec![0u8; 128];

    let start = Instant::now();

    // Spawn a task per message to fully saturate the batch pipeline.
    let mut handles = Vec::with_capacity(total as usize);
    for _ in 0..total {
        let p = producer.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            p.send(ProducerMessage::new("batch-stress", pl))
                .await
                .unwrap();
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let publish_duration = start.elapsed();
    let msgs_per_sec = total as f64 / publish_duration.as_secs_f64();

    eprintln!(
        "stress_batch_producer_100k: published {} messages in {:.2}s ({:.0} msg/s, {:.1} MB/s)",
        total,
        publish_duration.as_secs_f64(),
        msgs_per_sec,
        (total as f64 * 128.0) / (1024.0 * 1024.0) / publish_duration.as_secs_f64()
    );

    // Verify: read back all messages via gRPC.
    let mut client = DataPlaneServiceClient::connect(grpc_ep).await.unwrap();
    let response = client
        .subscribe(tonic::Request::new(SubscribeRequest {
            topic: "batch-stress".to_string(),
            partition: 0,
            consumer_group: String::new(),
            start_offset: Some(0),
            max_batch_size: 1000,
        }))
        .await
        .unwrap();

    let mut stream = response.into_inner();
    let mut consumed = 0u64;

    while consumed < total {
        match tokio::time::timeout(Duration::from_secs(10), stream.next()).await {
            Ok(Some(Ok(batch))) => consumed += batch.messages.len() as u64,
            _ => break,
        }
    }

    assert_eq!(consumed, total, "expected all messages to be consumed");

    // Close the producer (flushes remaining).
    Arc::try_unwrap(producer).ok().unwrap().close().await;
}

// ---------------------------------------------------------------------------
// Stress test 9: BatchProducer concurrent — 10 batching producers, 10K each
// ---------------------------------------------------------------------------

#[tokio::test]
async fn stress_batch_concurrent_producers() {
    let cluster = TestCluster::start(1).await;
    let endpoint = cluster.node(0).endpoint();
    let grpc_ep = cluster.node(0).grpc_endpoint();

    let num_producers = 10;
    let msgs_per_producer = 10_000u64;
    let payload = vec![0u8; 64];

    let start = Instant::now();

    let mut handles = Vec::new();
    for p in 0..num_producers {
        let ep = endpoint.clone();
        let pl = payload.clone();
        handles.push(tokio::spawn(async move {
            let producer = Arc::new(
                BatchProducer::connect(BatchProducerConfig {
                    address: ep,
                    producer_id: format!("batch-producer-{p}"),
                    max_batch_size: 500,
                    flush_interval_ms: 5,
                    ..Default::default()
                })
                .await
                .unwrap(),
            );

            let topic = format!("batch-concurrent-{p}");
            let mut send_handles = Vec::new();

            // Fire all sends concurrently within each producer.
            for _ in 0..msgs_per_producer {
                let p = producer.clone();
                let t = topic.clone();
                let pl = pl.clone();
                send_handles.push(tokio::spawn(async move {
                    p.send(ProducerMessage::new(t, pl)).await.unwrap();
                }));
            }

            // Await all acks.
            for handle in send_handles {
                handle.await.unwrap();
            }

            Arc::try_unwrap(producer).ok().unwrap().close().await;
        }));
    }

    for handle in handles {
        handle.await.unwrap();
    }

    let duration = start.elapsed();
    let total = num_producers as u64 * msgs_per_producer;
    let msgs_per_sec = total as f64 / duration.as_secs_f64();

    eprintln!(
        "stress_batch_concurrent_producers: {} producers x {} msgs = {} total in {:.2}s ({:.0} msg/s)",
        num_producers,
        msgs_per_producer,
        total,
        duration.as_secs_f64(),
        msgs_per_sec
    );

    // Verify each topic has the right count via gRPC.
    for p in 0..num_producers {
        let topic = format!("batch-concurrent-{p}");
        let mut client = DataPlaneServiceClient::connect(grpc_ep.clone())
            .await
            .unwrap();
        let response = client
            .subscribe(tonic::Request::new(SubscribeRequest {
                topic: topic.clone(),
                partition: 0,
                consumer_group: String::new(),
                start_offset: Some(0),
                max_batch_size: 1000,
            }))
            .await
            .unwrap();

        let mut stream = response.into_inner();
        let mut count = 0u64;
        while count < msgs_per_producer {
            match tokio::time::timeout(Duration::from_secs(5), stream.next()).await {
                Ok(Some(Ok(batch))) => count += batch.messages.len() as u64,
                _ => break,
            }
        }
        assert_eq!(
            count, msgs_per_producer,
            "topic {topic} expected {msgs_per_producer} messages, got {count}"
        );
    }
}
@@ -7,6 +7,9 @@ edition.workspace = true
anyhow = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }

[dev-dependencies]
tokio = { workspace = true, features = ["full", "test-util"] }
sq-storage = { workspace = true }
sq-models = { workspace = true }

131
crates/sq-sim/src/clock.rs
Normal file
@@ -0,0 +1,131 @@
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

/// Trait abstracting time for deterministic simulation.
pub trait Clock: Send + Sync {
    fn now(&self) -> Instant;
    fn elapsed_since(&self, earlier: Instant) -> Duration {
        self.now().duration_since(earlier)
    }
}

/// Real clock delegating to `std::time::Instant`.
#[derive(Clone)]
pub struct RealClock;

impl Clock for RealClock {
    fn now(&self) -> Instant {
        Instant::now()
    }
}

/// Deterministic clock for simulation testing.
/// Time only advances when explicitly ticked.
#[derive(Clone)]
pub struct SimClock {
    inner: Arc<SimClockInner>,
}

struct SimClockInner {
    /// We store a "base" real instant and an offset in nanos.
    /// `now()` returns `base + offset`.
    base: Instant,
    offset_nanos: Mutex<u128>,
}

impl SimClock {
    pub fn new() -> Self {
        Self {
            inner: Arc::new(SimClockInner {
                base: Instant::now(),
                offset_nanos: Mutex::new(0),
            }),
        }
    }

    /// Advance time by the given duration.
    pub fn advance(&self, duration: Duration) {
        let mut offset = self.inner.offset_nanos.lock().unwrap();
        *offset += duration.as_nanos();
    }

    /// Get the current elapsed duration from the start.
    pub fn elapsed(&self) -> Duration {
        let offset = self.inner.offset_nanos.lock().unwrap();
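        // The cast truncates above u64::MAX nanoseconds (~584 years of
        // simulated time), which is safely out of range for tests.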
        Duration::from_nanos(*offset as u64)
    }
}

impl Default for SimClock {
    fn default() -> Self {
        Self::new()
    }
}

impl Clock for SimClock {
    fn now(&self) -> Instant {
        let offset = self.inner.offset_nanos.lock().unwrap();
        self.inner.base + Duration::from_nanos(*offset as u64)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_real_clock_advances() {
        let clock = RealClock;
        let t1 = clock.now();
        // Sleep a tiny bit so real time visibly advances.
        std::thread::sleep(Duration::from_millis(1));
        let t2 = clock.now();
        assert!(t2 > t1);
    }

    #[test]
    fn test_sim_clock_starts_at_base() {
        let clock = SimClock::new();
        assert_eq!(clock.elapsed(), Duration::ZERO);
    }

    #[test]
    fn test_sim_clock_advance() {
        let clock = SimClock::new();
        let t1 = clock.now();

        clock.advance(Duration::from_secs(10));
        let t2 = clock.now();

        assert_eq!(t2.duration_since(t1), Duration::from_secs(10));
        assert_eq!(clock.elapsed(), Duration::from_secs(10));
    }

    #[test]
    fn test_sim_clock_multiple_advances() {
        let clock = SimClock::new();

        clock.advance(Duration::from_millis(100));
        clock.advance(Duration::from_millis(200));
        clock.advance(Duration::from_millis(300));

        assert_eq!(clock.elapsed(), Duration::from_millis(600));
    }

    #[test]
    fn test_sim_clock_clone_shares_state() {
        let clock1 = SimClock::new();
        let clock2 = clock1.clone();

        clock1.advance(Duration::from_secs(5));
        assert_eq!(clock2.elapsed(), Duration::from_secs(5));
    }

    #[test]
    fn test_sim_clock_elapsed_since() {
        let clock = SimClock::new();
        let t1 = clock.now();
        clock.advance(Duration::from_secs(42));
        assert_eq!(clock.elapsed_since(t1), Duration::from_secs(42));
    }
}
666
crates/sq-sim/src/fs.rs
Normal file
@@ -0,0 +1,666 @@
use std::collections::BTreeMap;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};

// ---------------------------------------------------------------------------
// Traits
// ---------------------------------------------------------------------------

/// Trait abstracting filesystem operations for deterministic simulation.
pub trait FileSystem: Send + Sync {
    fn create_dir_all(&self, path: &Path) -> io::Result<()>;
    fn open_write(&self, path: &Path) -> io::Result<Box<dyn FileHandle>>;
    fn open_append(&self, path: &Path) -> io::Result<Box<dyn FileHandle>>;
    fn open_read(&self, path: &Path) -> io::Result<Box<dyn FileHandle>>;
    fn remove_file(&self, path: &Path) -> io::Result<()>;
    fn list_dir(&self, path: &Path) -> io::Result<Vec<PathBuf>>;
    fn exists(&self, path: &Path) -> bool;
    fn file_size(&self, path: &Path) -> io::Result<u64>;
}

/// Trait abstracting a file handle for reads/writes/fsync.
pub trait FileHandle: Send + Sync {
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()>;
    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()>;
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize>;
    fn fsync(&mut self) -> io::Result<()>;
    fn position(&self) -> u64;
    fn seek(&mut self, pos: u64) -> io::Result<()>;
}
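
// Code under test is written against these traits (e.g. `StorageEngine` in the
// simulation scenarios is generic over them), so the same logic runs on either
// backend. A hypothetical caller, for illustration only:
//
//     fn append_record(fs: &dyn FileSystem, path: &Path, rec: &[u8]) -> io::Result<()> {
//         let mut fh = fs.open_append(path)?;
//         fh.write_all(rec)?;
//         fh.fsync()
//     }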

// ---------------------------------------------------------------------------
// RealFileSystem
// ---------------------------------------------------------------------------

/// Real filesystem delegating to std::fs.
pub struct RealFileSystem;

impl FileSystem for RealFileSystem {
    fn create_dir_all(&self, path: &Path) -> io::Result<()> {
        std::fs::create_dir_all(path)
    }

    fn open_write(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let file = std::fs::File::create(path)?;
        Ok(Box::new(RealFileHandle {
            file,
            position: 0,
        }))
    }

    fn open_append(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let file = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(path)?;
        let position = file.metadata()?.len();
        Ok(Box::new(RealFileHandle { file, position }))
    }

    fn open_read(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let file = std::fs::File::open(path)?;
        Ok(Box::new(RealFileHandle {
            file,
            position: 0,
        }))
    }

    fn remove_file(&self, path: &Path) -> io::Result<()> {
        std::fs::remove_file(path)
    }

    fn list_dir(&self, path: &Path) -> io::Result<Vec<PathBuf>> {
        let mut entries = Vec::new();
        for entry in std::fs::read_dir(path)? {
            entries.push(entry?.path());
        }
        entries.sort();
        Ok(entries)
    }

    fn exists(&self, path: &Path) -> bool {
        path.exists()
    }

    fn file_size(&self, path: &Path) -> io::Result<u64> {
        Ok(std::fs::metadata(path)?.len())
    }
}

struct RealFileHandle {
    file: std::fs::File,
    position: u64,
}

impl FileHandle for RealFileHandle {
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        use std::io::Write;
        self.file.write_all(buf)?;
        self.position += buf.len() as u64;
        Ok(())
    }

    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        use std::io::Read;
        self.file.read_exact(buf)?;
        self.position += buf.len() as u64;
        Ok(())
    }

    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        use std::io::Read;
        let n = self.file.read_to_end(buf)?;
        self.position += n as u64;
        Ok(n)
    }

    fn fsync(&mut self) -> io::Result<()> {
        use std::io::Write;
        self.file.flush()?;
        self.file.sync_all()
    }

    fn position(&self) -> u64 {
        self.position
    }

    fn seek(&mut self, pos: u64) -> io::Result<()> {
        use std::io::Seek;
        self.file.seek(io::SeekFrom::Start(pos))?;
        self.position = pos;
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// InMemoryFileSystem
// ---------------------------------------------------------------------------

/// In-memory filesystem for deterministic testing with fault injection.
#[derive(Clone)]
pub struct InMemoryFileSystem {
    inner: Arc<Mutex<InMemoryFsInner>>,
}

struct InMemoryFsInner {
    /// File contents keyed by canonical path.
    files: BTreeMap<PathBuf, Vec<u8>>,
    /// Directories that have been created.
    dirs: std::collections::BTreeSet<PathBuf>,
    /// Fault injection state.
    faults: FaultState,
}

#[derive(Default)]
struct FaultState {
    fail_next_fsync: Option<io::Error>,
    disk_full: bool,
}

impl InMemoryFileSystem {
    pub fn new() -> Self {
        Self {
            inner: Arc::new(Mutex::new(InMemoryFsInner {
                files: BTreeMap::new(),
                dirs: std::collections::BTreeSet::new(),
                faults: FaultState::default(),
            })),
        }
    }

    /// Make the next fsync call fail with the given error.
    pub fn fail_next_fsync(&self, error: io::Error) {
        let mut inner = self.inner.lock().unwrap();
        inner.faults.fail_next_fsync = Some(error);
    }

    /// Simulate disk full: all writes will fail.
    pub fn simulate_disk_full(&self) {
        let mut inner = self.inner.lock().unwrap();
        inner.faults.disk_full = true;
    }

    /// Clear all fault injection state.
    pub fn clear_faults(&self) {
        let mut inner = self.inner.lock().unwrap();
        inner.faults = FaultState::default();
    }

    /// Corrupt bytes at a given offset in a file.
    pub fn corrupt_bytes(&self, path: &Path, offset: u64, len: usize) {
        let mut inner = self.inner.lock().unwrap();
        if let Some(data) = inner.files.get_mut(path) {
            let start = offset as usize;
            let end = (start + len).min(data.len());
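            // XOR with 0xFF flips every bit, so each corrupted byte is
            // guaranteed to differ from the original.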
            for b in &mut data[start..end] {
                *b ^= 0xFF;
            }
        }
    }

    /// Get a snapshot of file contents (for test assertions).
    pub fn read_file_bytes(&self, path: &Path) -> Option<Vec<u8>> {
        let inner = self.inner.lock().unwrap();
        inner.files.get(path).cloned()
    }
}

impl Default for InMemoryFileSystem {
    fn default() -> Self {
        Self::new()
    }
}

impl FileSystem for InMemoryFileSystem {
    fn create_dir_all(&self, path: &Path) -> io::Result<()> {
        let mut inner = self.inner.lock().unwrap();
        // Add this dir and all ancestors.
        let mut current = path.to_path_buf();
        loop {
            inner.dirs.insert(current.clone());
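            // `pop()` returns false once the root is reached.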
            if !current.pop() {
                break;
            }
        }
        Ok(())
    }

    fn open_write(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let inner_ref = self.inner.clone();
        // Truncate/create
        {
            let mut inner = inner_ref.lock().unwrap();
            inner.files.insert(path.to_path_buf(), Vec::new());
        }
        Ok(Box::new(InMemoryFileHandle {
            fs: inner_ref,
            path: path.to_path_buf(),
            position: 0,
        }))
    }

    fn open_append(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let inner_ref = self.inner.clone();
        let position = {
            let mut inner = inner_ref.lock().unwrap();
            let entry = inner
                .files
                .entry(path.to_path_buf())
                .or_insert_with(Vec::new);
            entry.len() as u64
        };
        Ok(Box::new(InMemoryFileHandle {
            fs: inner_ref,
            path: path.to_path_buf(),
            position,
        }))
    }

    fn open_read(&self, path: &Path) -> io::Result<Box<dyn FileHandle>> {
        let inner_ref = self.inner.clone();
        {
            let inner = inner_ref.lock().unwrap();
            if !inner.files.contains_key(path) {
                return Err(io::Error::new(
                    io::ErrorKind::NotFound,
                    format!("file not found: {}", path.display()),
                ));
            }
        }
        Ok(Box::new(InMemoryFileHandle {
            fs: inner_ref,
            path: path.to_path_buf(),
            position: 0,
        }))
    }

    fn remove_file(&self, path: &Path) -> io::Result<()> {
        let mut inner = self.inner.lock().unwrap();
        if inner.files.remove(path).is_none() {
            return Err(io::Error::new(
                io::ErrorKind::NotFound,
                format!("file not found: {}", path.display()),
            ));
        }
        Ok(())
    }

    fn list_dir(&self, path: &Path) -> io::Result<Vec<PathBuf>> {
        let inner = self.inner.lock().unwrap();
        let mut entries = std::collections::BTreeSet::new();

        // Find files that are direct children of this directory.
        for file_path in inner.files.keys() {
            if let Some(parent) = file_path.parent() {
                if parent == path {
                    entries.insert(file_path.clone());
                }
            }
        }

        // Find subdirectories that are direct children of this directory.
        for dir_path in &inner.dirs {
            if let Some(parent) = dir_path.parent() {
                if parent == path && dir_path != path {
                    entries.insert(dir_path.clone());
                }
            }
        }

        Ok(entries.into_iter().collect())
    }

    fn exists(&self, path: &Path) -> bool {
        let inner = self.inner.lock().unwrap();
        inner.files.contains_key(path) || inner.dirs.contains(path)
    }

    fn file_size(&self, path: &Path) -> io::Result<u64> {
        let inner = self.inner.lock().unwrap();
        inner
            .files
            .get(path)
            .map(|data| data.len() as u64)
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::NotFound,
                    format!("file not found: {}", path.display()),
                )
            })
    }
}

struct InMemoryFileHandle {
    fs: Arc<Mutex<InMemoryFsInner>>,
    path: PathBuf,
    position: u64,
}

impl FileHandle for InMemoryFileHandle {
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        let mut inner = self.fs.lock().unwrap();

        if inner.faults.disk_full {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                "disk full (simulated)",
            ));
        }

        let data = inner
            .files
            .get_mut(&self.path)
            .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "file not found"))?;

        let pos = self.position as usize;
        if pos + buf.len() > data.len() {
            data.resize(pos + buf.len(), 0);
        }
        data[pos..pos + buf.len()].copy_from_slice(buf);
        self.position += buf.len() as u64;
        Ok(())
    }

    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        let inner = self.fs.lock().unwrap();
        let data = inner
            .files
            .get(&self.path)
            .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "file not found"))?;

        let pos = self.position as usize;
        if pos + buf.len() > data.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "unexpected eof",
            ));
        }
        buf.copy_from_slice(&data[pos..pos + buf.len()]);
        self.position += buf.len() as u64;
        Ok(())
    }

    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        let inner = self.fs.lock().unwrap();
        let data = inner
            .files
            .get(&self.path)
            .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "file not found"))?;

        let pos = self.position as usize;
        let remaining = &data[pos..];
        buf.extend_from_slice(remaining);
        self.position += remaining.len() as u64;
        Ok(remaining.len())
    }

    fn fsync(&mut self) -> io::Result<()> {
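        // In-memory writes are already "durable"; fsync only surfaces an
        // injected fault, which `take()` consumes so only one call fails.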
        let mut inner = self.fs.lock().unwrap();
        if let Some(err) = inner.faults.fail_next_fsync.take() {
            return Err(err);
        }
        Ok(())
    }

    fn position(&self) -> u64 {
        self.position
    }

    fn seek(&mut self, pos: u64) -> io::Result<()> {
        let inner = self.fs.lock().unwrap();
        let data = inner
            .files
            .get(&self.path)
            .ok_or_else(|| io::Error::new(io::ErrorKind::NotFound, "file not found"))?;

        if pos > data.len() as u64 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                "seek past end of file",
            ));
        }
        drop(inner);
        self.position = pos;
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_inmemory_write_read() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/test.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"hello world").unwrap();
            fh.fsync().unwrap();
        }

        {
            let mut fh = fs.open_read(path).unwrap();
            let mut buf = Vec::new();
            fh.read_to_end(&mut buf).unwrap();
            assert_eq!(buf, b"hello world");
        }
    }

    #[test]
    fn test_inmemory_read_exact() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/exact.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"0123456789").unwrap();
        }

        {
            let mut fh = fs.open_read(path).unwrap();
            let mut buf = [0u8; 5];
            fh.read_exact(&mut buf).unwrap();
            assert_eq!(&buf, b"01234");
            assert_eq!(fh.position(), 5);

            fh.read_exact(&mut buf).unwrap();
            assert_eq!(&buf, b"56789");
            assert_eq!(fh.position(), 10);
        }
    }

    #[test]
    fn test_inmemory_read_exact_eof() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/short.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"hi").unwrap();
        }

        {
            let mut fh = fs.open_read(path).unwrap();
            let mut buf = [0u8; 10];
            let err = fh.read_exact(&mut buf).unwrap_err();
            assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
        }
    }

    #[test]
    fn test_inmemory_append() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/append.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"first").unwrap();
        }

        {
            let mut fh = fs.open_append(path).unwrap();
            assert_eq!(fh.position(), 5);
            fh.write_all(b"second").unwrap();
        }

        assert_eq!(fs.read_file_bytes(path).unwrap(), b"firstsecond");
    }

    #[test]
    fn test_inmemory_seek() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/seek.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"abcdefghij").unwrap();
        }

        {
            let mut fh = fs.open_read(path).unwrap();
            fh.seek(5).unwrap();
            assert_eq!(fh.position(), 5);

            let mut buf = [0u8; 5];
            fh.read_exact(&mut buf).unwrap();
            assert_eq!(&buf, b"fghij");
        }
    }

    #[test]
    fn test_inmemory_create_dir_and_list() {
        let fs = InMemoryFileSystem::new();

        fs.create_dir_all(Path::new("/data/topic/0")).unwrap();
        assert!(fs.exists(Path::new("/data/topic/0")));
        assert!(fs.exists(Path::new("/data/topic")));
        assert!(fs.exists(Path::new("/data")));

        // Create files in the directory
        {
            let mut fh = fs.open_write(Path::new("/data/topic/0/seg1.wal")).unwrap();
            fh.write_all(b"data1").unwrap();
        }
        {
            let mut fh = fs.open_write(Path::new("/data/topic/0/seg2.wal")).unwrap();
            fh.write_all(b"data2").unwrap();
        }

        let entries = fs.list_dir(Path::new("/data/topic/0")).unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains(&PathBuf::from("/data/topic/0/seg1.wal")));
        assert!(entries.contains(&PathBuf::from("/data/topic/0/seg2.wal")));
    }

    #[test]
    fn test_inmemory_remove_file() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/remove.dat");

        fs.open_write(path).unwrap();
        assert!(fs.exists(path));

        fs.remove_file(path).unwrap();
        assert!(!fs.exists(path));
    }

    #[test]
    fn test_inmemory_remove_nonexistent() {
        let fs = InMemoryFileSystem::new();
        let err = fs.remove_file(Path::new("/no/such/file")).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
    }

    #[test]
    fn test_inmemory_open_read_nonexistent() {
        let fs = InMemoryFileSystem::new();
        match fs.open_read(Path::new("/no/such/file")) {
            Err(e) => assert_eq!(e.kind(), io::ErrorKind::NotFound),
            Ok(_) => panic!("expected NotFound error"),
        }
    }

    #[test]
    fn test_inmemory_file_size() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/size.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(b"twelve chars").unwrap();
        }

        assert_eq!(fs.file_size(path).unwrap(), 12);
    }

    // --- Fault injection tests ---

    #[test]
    fn test_fault_fsync_failure() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/fsync.dat");

        fs.fail_next_fsync(io::Error::new(io::ErrorKind::Other, "disk error"));

        let mut fh = fs.open_write(path).unwrap();
        fh.write_all(b"data").unwrap();

        let err = fh.fsync().unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::Other);

        // Second fsync should succeed (fault was consumed)
        fh.fsync().unwrap();
    }

    #[test]
    fn test_fault_disk_full() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/full.dat");

        let mut fh = fs.open_write(path).unwrap();
        fh.write_all(b"before").unwrap();

        fs.simulate_disk_full();

        let err = fh.write_all(b"after").unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::Other);

        // Clear fault, writes should work again
        fs.clear_faults();
        fh.write_all(b"recovered").unwrap();
    }

    #[test]
    fn test_fault_corrupt_bytes() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/corrupt.dat");

        {
            let mut fh = fs.open_write(path).unwrap();
            fh.write_all(&[0x00, 0x00, 0x00, 0x00]).unwrap();
        }

        fs.corrupt_bytes(path, 1, 2);

        let data = fs.read_file_bytes(path).unwrap();
        assert_eq!(data, vec![0x00, 0xFF, 0xFF, 0x00]);
    }

    #[test]
    fn test_inmemory_write_position_tracking() {
        let fs = InMemoryFileSystem::new();
        let path = Path::new("/tmp/pos.dat");

        let mut fh = fs.open_write(path).unwrap();
        assert_eq!(fh.position(), 0);

        fh.write_all(b"hello").unwrap();
        assert_eq!(fh.position(), 5);

        fh.write_all(b" world").unwrap();
        assert_eq!(fh.position(), 11);
    }
}
@@ -0,0 +1,6 @@
pub mod clock;
pub mod fs;
pub mod network;

pub use clock::*;
pub use fs::*;

316
crates/sq-sim/src/network.rs
Normal file
@@ -0,0 +1,316 @@
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::{Arc, Mutex};

/// Identifier for a node in the virtual network.
pub type NodeId = String;

/// A pending message in the virtual network.
#[derive(Debug, Clone)]
struct PendingMessage {
    from: NodeId,
    to: NodeId,
    data: Vec<u8>,
}

/// Virtual network for simulation testing.
/// Supports partitions, deferred delivery (to model latency), and random packet drop.
pub struct VirtualNetwork {
    /// Delivered message queues: node_id -> received messages.
    inbox: Arc<Mutex<HashMap<NodeId, VecDeque<(NodeId, Vec<u8>)>>>>,
    /// Pending messages not yet delivered (used for latency simulation).
    pending: Arc<Mutex<VecDeque<PendingMessage>>>,
    /// Partitioned links: (from, to) pairs that are blocked.
    partitions: Arc<Mutex<HashSet<(NodeId, NodeId)>>>,
    /// Drop probability (0.0 to 1.0).
    drop_probability: Arc<Mutex<f64>>,
}

impl VirtualNetwork {
    pub fn new() -> Self {
        Self {
            inbox: Arc::new(Mutex::new(HashMap::new())),
            pending: Arc::new(Mutex::new(VecDeque::new())),
            partitions: Arc::new(Mutex::new(HashSet::new())),
            drop_probability: Arc::new(Mutex::new(0.0)),
        }
    }

    /// Partition the network between two nodes (bidirectional).
    pub fn partition(&self, a: &str, b: &str) {
        let mut parts = self.partitions.lock().unwrap();
        parts.insert((a.to_string(), b.to_string()));
        parts.insert((b.to_string(), a.to_string()));
    }

    /// Heal the partition between two nodes (bidirectional).
    pub fn heal(&self, a: &str, b: &str) {
        let mut parts = self.partitions.lock().unwrap();
        parts.remove(&(a.to_string(), b.to_string()));
        parts.remove(&(b.to_string(), a.to_string()));
    }

    /// Heal all partitions.
    pub fn heal_all(&self) {
        self.partitions.lock().unwrap().clear();
    }

    /// Set the probability that a message will be dropped (0.0 = no drops, 1.0 = all dropped).
    pub fn set_drop_probability(&self, prob: f64) {
        *self.drop_probability.lock().unwrap() = prob.clamp(0.0, 1.0);
    }

    /// Send a message from one node to another.
    /// If the link is partitioned, the message is silently dropped.
    pub fn send(&self, from: &str, to: &str, data: Vec<u8>) -> Result<(), NetworkError> {
        // Check for partition.
        {
            let parts = self.partitions.lock().unwrap();
            if parts.contains(&(from.to_string(), to.to_string())) {
                return Ok(()); // Silently dropped.
            }
        }

        // Check for random drop.
        {
            let prob = *self.drop_probability.lock().unwrap();
            if prob > 0.0 {
                let random: f64 = simple_random();
                if random < prob {
                    return Ok(()); // Randomly dropped.
                }
            }
        }

        // Queue the message for delivery.
        let mut pending = self.pending.lock().unwrap();
        pending.push_back(PendingMessage {
            from: from.to_string(),
            to: to.to_string(),
            data,
        });

        Ok(())
    }

    /// Deliver all pending messages to their inboxes.
    /// Call this to simulate message delivery (allows controlling when messages arrive).
    pub fn deliver_pending(&self) {
        let messages: Vec<PendingMessage> = {
            let mut pending = self.pending.lock().unwrap();
            pending.drain(..).collect()
        };
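
        // The pending lock is released when the block above ends, so the
        // pending and inbox locks are never held at the same time.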
        let mut inbox = self.inbox.lock().unwrap();
        for msg in messages {
            inbox
                .entry(msg.to.clone())
                .or_default()
                .push_back((msg.from, msg.data));
        }
    }

    /// Receive a message for a given node. Returns None if no messages are available.
    pub fn recv(&self, node: &str) -> Option<(NodeId, Vec<u8>)> {
        let mut inbox = self.inbox.lock().unwrap();
        inbox.get_mut(node).and_then(|q| q.pop_front())
    }

    /// Get the number of pending (undelivered) messages.
    pub fn pending_count(&self) -> usize {
        self.pending.lock().unwrap().len()
    }

    /// Get the number of messages in a node's inbox.
    pub fn inbox_count(&self, node: &str) -> usize {
        self.inbox
            .lock()
            .unwrap()
            .get(node)
            .map(|q| q.len())
            .unwrap_or(0)
    }
}

impl Default for VirtualNetwork {
    fn default() -> Self {
        Self::new()
    }
}

/// Simple deterministic pseudo-random based on thread-local state.
fn simple_random() -> f64 {
    use std::cell::Cell;
    thread_local! {
        static STATE: Cell<u64> = const { Cell::new(12345) };
    }
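    // One step of the xorshift64 generator; the fixed seed makes runs
    // reproducible.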
|
||||
STATE.with(|s| {
|
||||
let mut state = s.get();
|
||||
state ^= state << 13;
|
||||
state ^= state >> 7;
|
||||
state ^= state << 17;
|
||||
s.set(state);
|
||||
(state % 10000) as f64 / 10000.0
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
pub enum NetworkError {
    #[error("node '{0}' not reachable")]
    Unreachable(String),
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_send_and_receive() {
        let net = VirtualNetwork::new();

        net.send("node-1", "node-2", b"hello".to_vec()).unwrap();
        net.deliver_pending();

        let (from, data) = net.recv("node-2").unwrap();
        assert_eq!(from, "node-1");
        assert_eq!(data, b"hello");
    }

    #[test]
    fn test_no_messages_returns_none() {
        let net = VirtualNetwork::new();
        assert!(net.recv("node-1").is_none());
    }

    #[test]
    fn test_partition_drops_messages() {
        let net = VirtualNetwork::new();

        net.partition("node-1", "node-2");

        net.send("node-1", "node-2", b"hello".to_vec()).unwrap();
        net.deliver_pending();

        assert!(net.recv("node-2").is_none());
    }

    #[test]
    fn test_partition_is_bidirectional() {
        let net = VirtualNetwork::new();

        net.partition("node-1", "node-2");

        net.send("node-1", "node-2", b"a->b".to_vec()).unwrap();
        net.send("node-2", "node-1", b"b->a".to_vec()).unwrap();
        net.deliver_pending();

        assert!(net.recv("node-2").is_none());
        assert!(net.recv("node-1").is_none());
    }

    #[test]
    fn test_heal_restores_communication() {
        let net = VirtualNetwork::new();

        net.partition("node-1", "node-2");
        net.send("node-1", "node-2", b"before".to_vec()).unwrap();
        net.deliver_pending();
        assert!(net.recv("node-2").is_none());

        net.heal("node-1", "node-2");
        net.send("node-1", "node-2", b"after".to_vec()).unwrap();
        net.deliver_pending();

        let (_, data) = net.recv("node-2").unwrap();
        assert_eq!(data, b"after");
    }

    #[test]
    fn test_heal_all() {
        let net = VirtualNetwork::new();

        net.partition("a", "b");
        net.partition("a", "c");
        net.heal_all();

        net.send("a", "b", b"msg".to_vec()).unwrap();
        net.send("a", "c", b"msg".to_vec()).unwrap();
        net.deliver_pending();

        assert!(net.recv("b").is_some());
        assert!(net.recv("c").is_some());
    }

    #[test]
    fn test_multiple_messages_ordered() {
        let net = VirtualNetwork::new();

        for i in 0..5 {
            net.send("a", "b", format!("msg-{i}").into_bytes())
                .unwrap();
        }
        net.deliver_pending();

        for i in 0..5 {
            let (_, data) = net.recv("b").unwrap();
            assert_eq!(data, format!("msg-{i}").as_bytes());
        }
        assert!(net.recv("b").is_none());
    }

    #[test]
    fn test_pending_and_inbox_counts() {
        let net = VirtualNetwork::new();

        net.send("a", "b", b"1".to_vec()).unwrap();
        net.send("a", "b", b"2".to_vec()).unwrap();

        assert_eq!(net.pending_count(), 2);
        assert_eq!(net.inbox_count("b"), 0);

        net.deliver_pending();

        assert_eq!(net.pending_count(), 0);
        assert_eq!(net.inbox_count("b"), 2);
    }

    #[test]
    fn test_partition_does_not_affect_other_links() {
        let net = VirtualNetwork::new();

        net.partition("a", "b");

        // a -> c should still work.
        net.send("a", "c", b"hello".to_vec()).unwrap();
        net.deliver_pending();

        assert!(net.recv("c").is_some());
    }

    #[test]
    fn test_drop_probability_all() {
        let net = VirtualNetwork::new();
        net.set_drop_probability(1.0);

        for _ in 0..10 {
            net.send("a", "b", b"msg".to_vec()).unwrap();
        }
        net.deliver_pending();

        // All messages should be dropped.
        assert_eq!(net.inbox_count("b"), 0);
    }

    #[test]
    fn test_drop_probability_none() {
        let net = VirtualNetwork::new();
        net.set_drop_probability(0.0);

        for _ in 0..10 {
            net.send("a", "b", b"msg".to_vec()).unwrap();
        }
        net.deliver_pending();

        // No messages should be dropped.
        assert_eq!(net.inbox_count("b"), 10);
    }
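
    // Illustrative sketch: with an intermediate drop probability, some but
    // typically not all messages survive delivery. This leans on the
    // deterministic simple_random() above, so the delivered count is stable
    // for a given run.
    #[test]
    fn test_drop_probability_partial() {
        let net = VirtualNetwork::new();
        net.set_drop_probability(0.5);

        for _ in 0..100 {
            net.send("a", "b", b"msg".to_vec()).unwrap();
        }
        net.deliver_pending();

        let delivered = net.inbox_count("b");
        assert!(delivered > 0 && delivered < 100);
    }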
}
1
crates/sq-sim/tests/scenarios/mod.rs
Normal file
@@ -0,0 +1 @@
pub mod single_node;
268
crates/sq-sim/tests/scenarios/single_node.rs
Normal file
@@ -0,0 +1,268 @@
use std::path::PathBuf;
use std::sync::Arc;

use sq_models::WalConfig;
use sq_sim::fs::InMemoryFileSystem;
use sq_sim::SimClock;
use sq_storage::engine::StorageEngine;

fn test_engine() -> (
    StorageEngine<InMemoryFileSystem, SimClock>,
    Arc<InMemoryFileSystem>,
    Arc<SimClock>,
) {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 1024 * 1024,
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };
    let engine = StorageEngine::new(fs.clone(), clock.clone(), config).unwrap();
    (engine, fs, clock)
}

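// Note: the helper hands back the fs and clock alongside the engine so that
// scenarios can inject faults (e.g. fs.simulate_disk_full() in S03) or,
// presumably, advance simulated time through the SimClock.
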
/// S01: Single node, single producer, single consumer - baseline correctness.
#[test]
fn s01_single_producer_consumer() {
    let (engine, _fs, _clock) = test_engine();

    // Produce 1000 messages.
    for i in 0..1000u64 {
        let offset = engine
            .append(
                "orders",
                0,
                Some(format!("key-{i}").as_bytes()),
                format!("value-{i}").as_bytes(),
                &[],
                i,
            )
            .unwrap();
        assert_eq!(offset, i, "offset must match sequence");
    }

    // Consume all messages.
    let messages = engine.read("orders", 0, 0, 2000).unwrap();

    // Invariant 1: No message loss.
    assert_eq!(messages.len(), 1000);

    // Invariant 2: Offsets strictly monotonic, no gaps.
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64, "offset gap detected at index {i}");
    }

    // Invariant 3: Content integrity.
    for msg in &messages {
        let expected_key = format!("key-{}", msg.offset);
        let expected_value = format!("value-{}", msg.offset);
        assert_eq!(msg.key.as_ref().unwrap(), expected_key.as_bytes());
        assert_eq!(msg.value, expected_value.as_bytes());
    }
}

/// S02: Single node, concurrent producers to different topics - offset ordering.
#[test]
fn s02_multi_topic_producers() {
    let (engine, _fs, _clock) = test_engine();

    let topics = ["events", "orders", "logs"];

    // Write 100 messages to each topic.
    for topic in &topics {
        for i in 0..100u64 {
            let offset = engine.append(topic, 0, None, b"data", &[], i).unwrap();
            assert_eq!(offset, i);
        }
    }

    // Verify each topic has its own offset space.
    for topic in &topics {
        let messages = engine.read(topic, 0, 0, 200).unwrap();
        assert_eq!(messages.len(), 100, "topic {topic} should have 100 messages");

        // Offsets are monotonic per topic.
        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
        }
    }

    // Cross-topic isolation: reading one topic doesn't return messages from another.
    let events = engine.read("events", 0, 0, 200).unwrap();
    for msg in &events {
        assert_eq!(msg.topic.as_str(), "events");
    }
}

/// S03: Single node, disk full during write - graceful error handling.
#[test]
fn s03_disk_full() {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 1024 * 1024,
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };
    let engine = StorageEngine::new(fs.clone(), clock, config).unwrap();

    // Write some messages successfully.
    for i in 0..10 {
        engine.append("t", 0, None, b"data", &[], i).unwrap();
    }

    // Simulate disk full.
    fs.simulate_disk_full();

    // The next write should fail.
    let result = engine.append("t", 0, None, b"data", &[], 0);
    assert!(result.is_err(), "write should fail when disk is full");

    // Clear the fault - subsequent writes should work.
    fs.clear_faults();
    let _offset = engine.append("t", 0, None, b"after-recovery", &[], 0).unwrap();

    // Verify earlier messages are still readable.
    let messages = engine.read("t", 0, 0, 100).unwrap();
    assert!(messages.len() >= 10, "original messages should survive disk full");
}

/// S04: Single node, crash and restart - WAL recovery.
#[test]
fn s04_crash_recovery() {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 1024 * 1024,
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };

    // Phase 1: Write messages and "crash" (drop the engine).
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        for i in 0..500u64 {
            engine
                .append("orders", 0, None, format!("msg-{i}").as_bytes(), &[], i)
                .unwrap();
        }
        // Engine dropped here - simulates a crash.
    }

    // Phase 2: "Restart" - create a new engine and recover.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        engine.recover().unwrap();

        // Invariant 1: All acked messages survive recovery.
        let messages = engine.read("orders", 0, 0, 1000).unwrap();
        assert_eq!(messages.len(), 500, "all messages must survive crash");

        // Invariant 2: Offsets are intact.
        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("msg-{i}").as_bytes());
        }

        // Can continue writing after recovery.
        let offset = engine.append("orders", 0, None, b"post-crash", &[], 0).unwrap();
        assert_eq!(offset, 500);
    }
}

/// S09: Consumer group offset preservation across restarts.
#[test]
fn s09_consumer_group_offset_persistence() {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 1024 * 1024,
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };

    // Write messages and commit an offset.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        for i in 0..100 {
            engine.append("t", 0, None, b"data", &[], i).unwrap();
        }
        engine.commit_offset("group-1", "t", 0, 50).unwrap();
    }

    // Restart and verify the committed offset survives.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        engine.recover().unwrap();

        // Invariant 4: Consumer group offsets never regress.
        let committed = engine.get_committed_offset("group-1", "t", 0);
        assert_eq!(committed, Some(50));

        // Can resume consuming from the committed offset.
        let messages = engine.read("t", 0, 51, 100).unwrap();
        assert_eq!(messages.len(), 49); // offsets 51-99
    }
}

/// S10: High throughput burst - no message loss.
#[test]
fn s10_high_throughput() {
    let (engine, _fs, _clock) = test_engine();

    let msg_count = 10_000u64;

    // Burst write.
    for i in 0..msg_count {
        engine
            .append("burst", 0, None, format!("msg-{i}").as_bytes(), &[], i)
            .unwrap();
    }

    // Verify no loss.
    let messages = engine.read("burst", 0, 0, (msg_count + 1) as usize).unwrap();
    assert_eq!(messages.len(), msg_count as usize);

    // Verify ordering.
    for (i, msg) in messages.iter().enumerate() {
        assert_eq!(msg.offset, i as u64);
    }
}

/// S06: Segment rotation and recovery - multiple segments survive a crash.
#[test]
fn s06_segment_rotation_recovery() {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 512, // Very small segments to force rotation.
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };

    // Write enough messages to cause multiple segment rotations.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        for i in 0..200u64 {
            engine
                .append("t", 0, None, format!("msg-{i}").as_bytes(), &[], i)
                .unwrap();
        }
    }

    // Recover.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        engine.recover().unwrap();

        let messages = engine.read("t", 0, 0, 300).unwrap();
        assert_eq!(messages.len(), 200, "all messages across segments must survive");

        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
        }

        // Continue writing.
        let offset = engine.append("t", 0, None, b"new", &[], 0).unwrap();
        assert_eq!(offset, 200);
    }
}
1
crates/sq-sim/tests/simulation.rs
Normal file
@@ -0,0 +1 @@
mod scenarios;
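// Illustrative note: this single integration-test binary pulls in every
// scenario module; assuming the crate name matches its directory, the suite
// runs with `cargo test -p sq-sim --test simulation`.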
@@ -8,10 +8,20 @@ sq-models = { workspace = true }
sq-sim = { workspace = true }

anyhow = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
crc32fast = { workspace = true }
bytes = { workspace = true }
futures = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
zstd = { workspace = true }
object_store = { workspace = true }

[dev-dependencies]
tokio = { workspace = true, features = ["full", "test-util"] }

[[bench]]
name = "throughput"
harness = false

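# Because harness = false above, the throughput bench supplies its own main()
# and prints its own results; assuming the crate name matches its directory,
# it runs with `cargo bench -p sq-storage --bench throughput`.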
167
crates/sq-storage/benches/throughput.rs
Normal file
@@ -0,0 +1,167 @@
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;

use sq_models::WalConfig;
use sq_sim::SimClock;
use sq_sim::fs::InMemoryFileSystem;
use sq_storage::engine::StorageEngine;

fn bench_write_throughput(payload_size: usize, msg_count: u64) {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 256 * 1024 * 1024, // 256 MB to avoid rotation overhead.
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };
    let engine = StorageEngine::new(fs, clock, config).unwrap();

    let payload = vec![b'x'; payload_size];

    let start = Instant::now();
    for i in 0..msg_count {
        engine.append("bench", 0, None, &payload, &[], i).unwrap();
    }
    let elapsed = start.elapsed();

    let msgs_per_sec = msg_count as f64 / elapsed.as_secs_f64();
    let mb_per_sec = (msg_count as f64 * payload_size as f64) / elapsed.as_secs_f64() / 1_048_576.0;

    println!(
        " write {msg_count} x {payload_size}B: {:.0} msg/s, {:.1} MB/s ({:.2?})",
        msgs_per_sec, mb_per_sec, elapsed
    );
}

fn bench_read_throughput(payload_size: usize, msg_count: u64) {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 256 * 1024 * 1024,
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };
    let engine = StorageEngine::new(fs, clock, config).unwrap();

    let payload = vec![b'x'; payload_size];
    for i in 0..msg_count {
        engine.append("bench", 0, None, &payload, &[], i).unwrap();
    }

    let start = Instant::now();
    let messages = engine.read("bench", 0, 0, msg_count as usize + 1).unwrap();
    let elapsed = start.elapsed();

    assert_eq!(messages.len(), msg_count as usize);

    let msgs_per_sec = msg_count as f64 / elapsed.as_secs_f64();
    let mb_per_sec = (msg_count as f64 * payload_size as f64) / elapsed.as_secs_f64() / 1_048_576.0;

    println!(
        " read {msg_count} x {payload_size}B: {:.0} msg/s, {:.1} MB/s ({:.2?})",
        msgs_per_sec, mb_per_sec, elapsed
    );
}

fn bench_compression_ratio(payload_size: usize, msg_count: usize) {
    // Build a WAL segment's worth of data.
    let mut raw_data = Vec::new();
    for i in 0..msg_count {
        let payload = format!("message-{i}-{}", "x".repeat(payload_size));
        raw_data.extend_from_slice(payload.as_bytes());
    }

    let compressed = zstd::encode_all(raw_data.as_slice(), 3).unwrap();
    let ratio = raw_data.len() as f64 / compressed.len() as f64;

    println!(
        " compress {msg_count} x ~{payload_size}B: {} -> {} ({:.2}x ratio)",
        format_bytes(raw_data.len()),
        format_bytes(compressed.len()),
        ratio
    );

    // Verify the roundtrip.
    let decompressed = zstd::decode_all(compressed.as_slice()).unwrap();
    assert_eq!(decompressed.len(), raw_data.len());
}
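
// zstd level 3, used in bench_compression_ratio above, is the library's
// default compression level; higher levels generally improve the ratio at
// the cost of write throughput.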

fn bench_recovery(msg_count: u64) {
    let fs = Arc::new(InMemoryFileSystem::new());
    let clock = Arc::new(SimClock::new());
    let config = WalConfig {
        max_segment_bytes: 64 * 1024, // Small segments to test multi-segment recovery.
        max_segment_age_secs: 3600,
        data_dir: PathBuf::from("/data"),
        ..Default::default()
    };

    // Write messages.
    {
        let engine = StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
        for i in 0..msg_count {
            engine
                .append("bench", 0, None, format!("msg-{i}").as_bytes(), &[], i)
                .unwrap();
        }
    }

    // Recover and measure.
    let start = Instant::now();
    let engine = StorageEngine::new(fs, clock, config).unwrap();
    engine.recover().unwrap();
    let elapsed = start.elapsed();

    let msgs_per_sec = msg_count as f64 / elapsed.as_secs_f64();

    println!(
        " recover {msg_count} msgs: {:.0} msg/s ({:.2?})",
        msgs_per_sec, elapsed
    );

    // Verify correctness.
    let messages = engine.read("bench", 0, 0, msg_count as usize + 1).unwrap();
    assert_eq!(messages.len(), msg_count as usize);
}

fn format_bytes(bytes: usize) -> String {
    if bytes >= 1_048_576 {
        format!("{:.1}MB", bytes as f64 / 1_048_576.0)
    } else if bytes >= 1024 {
        format!("{:.1}KB", bytes as f64 / 1024.0)
    } else {
        format!("{bytes}B")
    }
}

fn main() {
    println!("=== SQ Storage Engine Benchmarks ===\n");

    println!("Write throughput:");
    bench_write_throughput(64, 100_000);
    bench_write_throughput(256, 100_000);
    bench_write_throughput(1024, 50_000);
    bench_write_throughput(4096, 10_000);

    println!("\nRead throughput:");
    bench_read_throughput(64, 100_000);
    bench_read_throughput(256, 100_000);
    bench_read_throughput(1024, 50_000);
    bench_read_throughput(4096, 10_000);

    println!("\nCompression ratio:");
    bench_compression_ratio(64, 10_000);
    bench_compression_ratio(256, 10_000);
    bench_compression_ratio(1024, 5_000);
    bench_compression_ratio(4096, 1_000);

    println!("\nRecovery performance:");
    bench_recovery(1_000);
    bench_recovery(10_000);
    bench_recovery(50_000);

    println!("\n=== Done ===");
}
193
crates/sq-storage/src/consumer_offsets.rs
Normal file
@@ -0,0 +1,193 @@
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;

use sq_sim::fs::FileSystem;

/// Key for consumer offset tracking: (consumer_group, topic, partition).
type OffsetKey = (String, String, u32);

/// Stores committed consumer group offsets.
/// Offsets are kept in memory and persisted to a JSON file on every commit.
pub struct ConsumerOffsets<F: FileSystem> {
    offsets: HashMap<OffsetKey, u64>,
    persist_path: PathBuf,
    fs: Arc<F>,
}

impl<F: FileSystem> ConsumerOffsets<F> {
    pub fn new(fs: Arc<F>, data_dir: &Path) -> Self {
        let persist_path = data_dir.join("consumer_offsets.json");
        Self {
            offsets: HashMap::new(),
            persist_path,
            fs,
        }
    }

    /// Commit an offset for a consumer group on a topic-partition.
    pub fn commit(
        &mut self,
        group: &str,
        topic: &str,
        partition: u32,
        offset: u64,
    ) -> anyhow::Result<()> {
        let key = (group.to_string(), topic.to_string(), partition);
        self.offsets.insert(key, offset);
        self.persist()?;
        Ok(())
    }

    /// Get the committed offset for a consumer group on a topic-partition.
    pub fn get_committed(&self, group: &str, topic: &str, partition: u32) -> Option<u64> {
        let key = (group.to_string(), topic.to_string(), partition);
        self.offsets.get(&key).copied()
    }

    /// Persist offsets to disk as JSON.
    fn persist(&self) -> anyhow::Result<()> {
        // Serialize as a simple JSON array of entries.
        let entries: Vec<OffsetEntry> = self
            .offsets
            .iter()
            .map(|((group, topic, partition), offset)| OffsetEntry {
                group: group.clone(),
                topic: topic.clone(),
                partition: *partition,
                offset: *offset,
            })
            .collect();

        let json = serde_json::to_vec(&entries)?;

        // Ensure the parent directory exists.
        if let Some(parent) = self.persist_path.parent() {
            self.fs.create_dir_all(parent)?;
        }

        let mut handle = self.fs.open_write(&self.persist_path)?;
        handle.write_all(&json)?;
        handle.fsync()?;

        Ok(())
    }

    /// Load offsets from disk.
    pub fn load(fs: Arc<F>, data_dir: &Path) -> anyhow::Result<Self> {
        let persist_path = data_dir.join("consumer_offsets.json");

        if !fs.exists(&persist_path) {
            return Ok(Self {
                offsets: HashMap::new(),
                persist_path,
                fs,
            });
        }

        let mut handle = fs.open_read(&persist_path)?;
        let mut buf = Vec::new();
        handle.read_to_end(&mut buf)?;

        let entries: Vec<OffsetEntry> = serde_json::from_slice(&buf)?;

        let mut offsets = HashMap::new();
        for entry in entries {
            offsets.insert(
                (entry.group, entry.topic, entry.partition),
                entry.offset,
            );
        }

        Ok(Self {
            offsets,
            persist_path,
            fs,
        })
    }
}

#[derive(serde::Serialize, serde::Deserialize)]
struct OffsetEntry {
    group: String,
    topic: String,
    partition: u32,
    offset: u64,
}

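// Illustrative example of the on-disk shape produced by the serde derives
// above: a flat JSON array with one entry per (group, topic, partition),
// e.g. [{"group":"group-1","topic":"orders","partition":0,"offset":42}].
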
#[cfg(test)]
mod tests {
    use super::*;
    use sq_sim::fs::InMemoryFileSystem;

    fn test_offsets() -> ConsumerOffsets<InMemoryFileSystem> {
        let fs = Arc::new(InMemoryFileSystem::new());
        ConsumerOffsets::new(fs, Path::new("/data"))
    }

    #[test]
    fn test_commit_and_get() {
        let mut offsets = test_offsets();
        offsets.commit("group-1", "orders", 0, 42).unwrap();

        assert_eq!(offsets.get_committed("group-1", "orders", 0), Some(42));
        assert_eq!(offsets.get_committed("group-1", "orders", 1), None);
        assert_eq!(offsets.get_committed("group-2", "orders", 0), None);
    }

    #[test]
    fn test_commit_overwrites() {
        let mut offsets = test_offsets();
        offsets.commit("g", "t", 0, 10).unwrap();
        offsets.commit("g", "t", 0, 20).unwrap();

        assert_eq!(offsets.get_committed("g", "t", 0), Some(20));
    }

    #[test]
    fn test_multiple_groups() {
        let mut offsets = test_offsets();
        offsets.commit("g1", "t", 0, 100).unwrap();
        offsets.commit("g2", "t", 0, 200).unwrap();

        assert_eq!(offsets.get_committed("g1", "t", 0), Some(100));
        assert_eq!(offsets.get_committed("g2", "t", 0), Some(200));
    }

    #[test]
    fn test_persist_and_load() {
        let fs = Arc::new(InMemoryFileSystem::new());

        {
            let mut offsets = ConsumerOffsets::new(fs.clone(), Path::new("/data"));
            offsets.commit("g1", "orders", 0, 42).unwrap();
            offsets.commit("g1", "events", 0, 100).unwrap();
            offsets.commit("g2", "orders", 1, 55).unwrap();
        }

        let loaded = ConsumerOffsets::load(fs, Path::new("/data")).unwrap();
        assert_eq!(loaded.get_committed("g1", "orders", 0), Some(42));
        assert_eq!(loaded.get_committed("g1", "events", 0), Some(100));
        assert_eq!(loaded.get_committed("g2", "orders", 1), Some(55));
        assert_eq!(loaded.get_committed("g2", "orders", 0), None);
    }

    #[test]
    fn test_load_nonexistent_file() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let offsets = ConsumerOffsets::load(fs, Path::new("/data")).unwrap();
        assert_eq!(offsets.get_committed("g", "t", 0), None);
    }

    #[test]
    fn test_multiple_topics_and_partitions() {
        let mut offsets = test_offsets();
        offsets.commit("g", "t1", 0, 10).unwrap();
        offsets.commit("g", "t1", 1, 20).unwrap();
        offsets.commit("g", "t2", 0, 30).unwrap();

        assert_eq!(offsets.get_committed("g", "t1", 0), Some(10));
        assert_eq!(offsets.get_committed("g", "t1", 1), Some(20));
        assert_eq!(offsets.get_committed("g", "t2", 0), Some(30));
    }
}
634
crates/sq-storage/src/engine.rs
Normal file
@@ -0,0 +1,634 @@
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, Mutex, RwLock};

use sq_models::{ClosedSegment, Header, Message, TopicConfig, TopicName, WalConfig};
use sq_sim::fs::FileSystem;
use sq_sim::Clock;

use crate::consumer_offsets::ConsumerOffsets;
use crate::index::OffsetIndex;
use crate::topic_metadata::TopicMetadata;
use crate::wal::reader::WalReader;
use crate::wal::writer::{segment_dir, segment_path, WalWriter};

/// Unified storage engine wrapping WAL writers, readers, and the offset index.
/// All methods take `&self`; concurrency is handled by fine-grained internal
/// locks, so writers for different (topic, partition) pairs can operate in parallel.
pub struct StorageEngine<F: FileSystem, C: Clock> {
    fs: Arc<F>,
    clock: Arc<C>,
    config: WalConfig,
    /// One writer per (topic, partition), each independently locked.
    writers: RwLock<HashMap<(String, u32), Arc<Mutex<WalWriter<F, C>>>>>,
    /// Offset index for fast seeks.
    index: Mutex<OffsetIndex>,
    /// Reader instance (stateless, no lock needed).
    reader: WalReader<F>,
    /// Consumer group offset tracking.
    consumer_offsets: Mutex<ConsumerOffsets<F>>,
    /// Topic metadata registry.
    topic_metadata: Mutex<TopicMetadata<F>>,
}

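// Illustrative sketch: because every method takes &self, the engine can be
// shared behind an Arc and appended to from several threads; writers for
// different partitions only contend on the outer RwLock. Assumes
// InMemoryFileSystem and SimClock are Send + Sync, as their Arc-based use
// elsewhere in this crate suggests.
#[cfg(test)]
mod concurrency_sketch {
    use super::*;
    use sq_sim::fs::InMemoryFileSystem;
    use sq_sim::SimClock;

    #[test]
    fn parallel_appends_to_distinct_partitions() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };
        let engine = Arc::new(StorageEngine::new(fs, clock, config).unwrap());

        let handles: Vec<_> = (0..2u32)
            .map(|partition| {
                let engine = engine.clone();
                std::thread::spawn(move || {
                    for i in 0..100u64 {
                        engine.append("t", partition, None, b"data", &[], i).unwrap();
                    }
                })
            })
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }

        assert_eq!(engine.latest_offset("t", 0), 100);
        assert_eq!(engine.latest_offset("t", 1), 100);
    }
}
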
impl<F: FileSystem, C: Clock> StorageEngine<F, C> {
    pub fn new(fs: Arc<F>, clock: Arc<C>, config: WalConfig) -> anyhow::Result<Self> {
        fs.create_dir_all(&config.data_dir)?;

        let consumer_offsets = ConsumerOffsets::load(fs.clone(), &config.data_dir)?;
        let topic_metadata = TopicMetadata::load(fs.clone(), &config.data_dir)?;

        Ok(Self {
            reader: WalReader::new(fs.clone()),
            consumer_offsets: Mutex::new(consumer_offsets),
            topic_metadata: Mutex::new(topic_metadata),
            fs,
            clock,
            config,
            writers: RwLock::new(HashMap::new()),
            index: Mutex::new(OffsetIndex::new(1000)),
        })
    }

    /// Append a message to the given topic-partition. Returns the assigned offset.
    pub fn append(
        &self,
        topic: &str,
        partition: u32,
        key: Option<&[u8]>,
        value: &[u8],
        headers: &[Header],
        timestamp_ms: u64,
    ) -> anyhow::Result<u64> {
        let writer_arc = self.get_or_create_writer(topic, partition)?;
        let mut writer = writer_arc.lock().unwrap();

        let old_next = writer.next_offset();
        let offset = writer.append(key, value, headers, timestamp_ms)?;

        // Register the current segment in the index (for the first write).
        if (offset == old_next && offset == 0)
            || (offset > 0 && {
                let index = self.index.lock().unwrap();
                index.segment_for_offset(topic, partition, offset).is_none()
            })
        {
            let seg =
                segment_path(&self.config.data_dir, &TopicName::from(topic), partition, offset);
            let mut index = self.index.lock().unwrap();
            index.register_segment(topic, partition, seg, offset, offset);
        }

        Ok(offset)
    }

    /// Append a batch of messages to a single topic-partition with one fsync.
    /// Returns the assigned offsets.
    pub fn append_batch(
        &self,
        topic: &str,
        partition: u32,
        messages: &[(Option<&[u8]>, &[u8], &[Header], u64)],
    ) -> anyhow::Result<Vec<u64>> {
        if messages.is_empty() {
            return Ok(vec![]);
        }

        let writer_arc = self.get_or_create_writer(topic, partition)?;
        let mut writer = writer_arc.lock().unwrap();

        let first_offset = writer.next_offset();
        let offsets = writer.append_batch(messages)?;

        // Register the segment in the index if this is a new segment.
        {
            let mut index = self.index.lock().unwrap();
            if index
                .segment_for_offset(topic, partition, first_offset)
                .is_none()
            {
                let seg = segment_path(
                    &self.config.data_dir,
                    &TopicName::from(topic),
                    partition,
                    first_offset,
                );
                index.register_segment(topic, partition, seg, first_offset, first_offset);
            }
        }

        Ok(offsets)
    }

    /// Force fsync on all active writer segment files.
    pub fn fsync_all_writers(&self) -> anyhow::Result<()> {
        let writers = self.writers.read().unwrap();
        for writer_arc in writers.values() {
            let mut writer = writer_arc.lock().unwrap();
            writer.fsync()?;
        }
        Ok(())
    }

    /// Read messages from a topic-partition starting at `from_offset`.
    /// Returns up to `limit` messages. Lock-free: reads directly from disk.
    pub fn read(
        &self,
        topic: &str,
        partition: u32,
        from_offset: u64,
        limit: usize,
    ) -> anyhow::Result<Vec<Message>> {
        let topic_name = TopicName::from(topic);
        let seg_dir = segment_dir(&self.config.data_dir, &topic_name, partition);

        if !self.fs.exists(&seg_dir) {
            return Ok(vec![]);
        }

        // List all segment files and sort them.
        let mut segment_files: Vec<PathBuf> = self
            .fs
            .list_dir(&seg_dir)?
            .into_iter()
            .filter(|p| p.extension().map(|e| e == "wal").unwrap_or(false))
            .collect();
        segment_files.sort();

        let mut result = Vec::new();

        for seg_path in &segment_files {
            if result.len() >= limit {
                break;
            }

            let messages = self.reader.read_from_offset(seg_path, from_offset)?;

            for msg in messages {
                if result.len() >= limit {
                    break;
                }
                result.push(msg);
            }
        }

        Ok(result)
    }

    /// Get the latest offset for a topic-partition (the next offset to be assigned).
    pub fn latest_offset(&self, topic: &str, partition: u32) -> u64 {
        let key = (topic.to_string(), partition);
        let writers = self.writers.read().unwrap();
        writers
            .get(&key)
            .map(|w| w.lock().unwrap().next_offset())
            .unwrap_or(0)
    }

    /// Recover state from existing WAL files on disk.
    /// Scans all segment files, rebuilds the index, and sets writers to the correct offset.
    /// Must be called at startup, before any concurrent access.
    pub fn recover(&self) -> anyhow::Result<()> {
        if !self.fs.exists(&self.config.data_dir) {
            return Ok(());
        }

        // Scan for topic directories (skip files like consumer_offsets.json).
        let topic_dirs: Vec<PathBuf> = self
            .fs
            .list_dir(&self.config.data_dir)?
            .into_iter()
            .filter(|p| {
                // Skip entries that have a file extension (they are metadata files, not topic dirs).
                p.extension().is_none()
            })
            .collect();

        let mut writers = self.writers.write().unwrap();
        let mut index = self.index.lock().unwrap();

        for topic_dir in &topic_dirs {
            let topic = topic_dir
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("")
                .to_string();

            if topic.is_empty() {
                continue;
            }

            // Scan for partition directories (skip any non-directory entries).
            let partition_dirs: Vec<PathBuf> = match self.fs.list_dir(topic_dir) {
                Ok(entries) => entries,
                Err(_) => continue, // Skip if not a directory.
            };

            for part_dir in &partition_dirs {
                let partition: u32 = part_dir
                    .file_name()
                    .and_then(|n| n.to_str())
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(0);

                // Scan segment files.
                let mut seg_files: Vec<PathBuf> = self
                    .fs
                    .list_dir(part_dir)?
                    .into_iter()
                    .filter(|p| p.extension().map(|e| e == "wal").unwrap_or(false))
                    .collect();
                seg_files.sort();

                let mut max_offset = 0u64;

                for seg_path in &seg_files {
                    let messages = self.reader.read_segment(seg_path)?;
                    if let (Some(first), Some(last)) = (messages.first(), messages.last()) {
                        index.register_segment(
                            &topic,
                            partition,
                            seg_path.clone(),
                            first.offset,
                            last.offset,
                        );
                        max_offset = max_offset.max(last.offset + 1);
                    }
                }

                // Create a writer at the recovered offset.
                if max_offset > 0 {
                    let writer = WalWriter::new(
                        self.fs.clone(),
                        self.clock.clone(),
                        self.config.clone(),
                        TopicName::from(topic.as_str()),
                        partition,
                    )?
                    .with_next_offset(max_offset);

                    writers.insert((topic.clone(), partition), Arc::new(Mutex::new(writer)));
                }
            }
        }

        Ok(())
    }

    /// Commit a consumer group offset.
    pub fn commit_offset(
        &self,
        group: &str,
        topic: &str,
        partition: u32,
        offset: u64,
    ) -> anyhow::Result<()> {
        let mut offsets = self.consumer_offsets.lock().unwrap();
        offsets.commit(group, topic, partition, offset)
    }

    /// Get the committed offset for a consumer group.
    pub fn get_committed_offset(
        &self,
        group: &str,
        topic: &str,
        partition: u32,
    ) -> Option<u64> {
        let offsets = self.consumer_offsets.lock().unwrap();
        offsets.get_committed(group, topic, partition)
    }

    /// Create a topic in the metadata registry.
    pub fn create_topic(&self, config: TopicConfig) -> anyhow::Result<()> {
        let mut metadata = self.topic_metadata.lock().unwrap();
        metadata.create_topic(config)
    }

    /// Delete a topic from the metadata registry.
    pub fn delete_topic(&self, name: &str) -> anyhow::Result<()> {
        let mut metadata = self.topic_metadata.lock().unwrap();
        metadata.delete_topic(name)
    }

    /// List all topics. Returns owned configs (references cannot escape the Mutex guard).
    pub fn list_topics(&self) -> Vec<TopicConfig> {
        let metadata = self.topic_metadata.lock().unwrap();
        metadata.list_topics().into_iter().cloned().collect()
    }

    /// Get a specific topic's config. Returns an owned config.
    pub fn get_topic(&self, name: &str) -> Option<TopicConfig> {
        let metadata = self.topic_metadata.lock().unwrap();
        metadata.get_topic(name).cloned()
    }

    /// Check whether a topic exists in the metadata registry.
    pub fn topic_exists(&self, name: &str) -> bool {
        let metadata = self.topic_metadata.lock().unwrap();
        metadata.topic_exists(name)
    }

    /// Close all active segments and return them. Used by the S3 shipper.
    pub fn close_all_segments(&self) -> anyhow::Result<Vec<ClosedSegment>> {
        let writers = self.writers.read().unwrap();
        let mut closed = Vec::new();
        for writer_arc in writers.values() {
            let mut writer = writer_arc.lock().unwrap();
            if let Some(seg) = writer.close_active_segment()? {
                closed.push(seg);
            }
        }
        Ok(closed)
    }

    /// Get or create a writer for the given topic-partition.
    /// Uses a read lock for the common case (writer exists), upgrading to a
    /// write lock to create.
    fn get_or_create_writer(
        &self,
        topic: &str,
        partition: u32,
    ) -> anyhow::Result<Arc<Mutex<WalWriter<F, C>>>> {
        let key = (topic.to_string(), partition);

        // Fast path: read lock (most common).
        {
            let writers = self.writers.read().unwrap();
            if let Some(writer) = writers.get(&key) {
                return Ok(writer.clone());
            }
        }

        // Slow path: write lock to create a new writer.
        let mut writers = self.writers.write().unwrap();
        // Double-check - another thread may have created it in the meantime.
        if let Some(writer) = writers.get(&key) {
            return Ok(writer.clone());
        }

        let writer = WalWriter::new(
            self.fs.clone(),
            self.clock.clone(),
            self.config.clone(),
            TopicName::from(topic),
            partition,
        )?;
        let writer = Arc::new(Mutex::new(writer));
        writers.insert(key, writer.clone());
        Ok(writer)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use sq_sim::fs::InMemoryFileSystem;
    use sq_sim::SimClock;

    fn test_engine() -> StorageEngine<InMemoryFileSystem, SimClock> {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };
        StorageEngine::new(fs, clock, config).unwrap()
    }

    #[test]
    fn test_append_and_read() {
        let engine = test_engine();

        for i in 0..10 {
            let offset = engine
                .append("orders", 0, None, format!("msg-{i}").as_bytes(), &[], i * 100)
                .unwrap();
            assert_eq!(offset, i);
        }

        let messages = engine.read("orders", 0, 0, 100).unwrap();
        assert_eq!(messages.len(), 10);
        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("msg-{i}").as_bytes());
        }
    }

    #[test]
    fn test_read_from_middle() {
        let engine = test_engine();

        for i in 0..20 {
            engine.append("t", 0, None, b"data", &[], i).unwrap();
        }

        let messages = engine.read("t", 0, 10, 100).unwrap();
        assert_eq!(messages.len(), 10);
        assert_eq!(messages[0].offset, 10);
        assert_eq!(messages[9].offset, 19);
    }

    #[test]
    fn test_read_with_limit() {
        let engine = test_engine();

        for i in 0..100 {
            engine.append("t", 0, None, b"data", &[], i).unwrap();
        }

        let messages = engine.read("t", 0, 0, 5).unwrap();
        assert_eq!(messages.len(), 5);
        assert_eq!(messages[4].offset, 4);
    }

    #[test]
    fn test_multi_topic_isolation() {
        let engine = test_engine();

        engine.append("alpha", 0, None, b"a-data", &[], 0).unwrap();
        engine.append("beta", 0, None, b"b-data", &[], 0).unwrap();

        let a_msgs = engine.read("alpha", 0, 0, 100).unwrap();
        let b_msgs = engine.read("beta", 0, 0, 100).unwrap();

        assert_eq!(a_msgs.len(), 1);
        assert_eq!(b_msgs.len(), 1);
        assert_eq!(a_msgs[0].value, b"a-data");
        assert_eq!(b_msgs[0].value, b"b-data");
    }

    #[test]
    fn test_multi_partition_isolation() {
        let engine = test_engine();

        engine.append("t", 0, None, b"p0", &[], 0).unwrap();
        engine.append("t", 1, None, b"p1", &[], 0).unwrap();

        let p0 = engine.read("t", 0, 0, 100).unwrap();
        let p1 = engine.read("t", 1, 0, 100).unwrap();

        assert_eq!(p0.len(), 1);
        assert_eq!(p1.len(), 1);
        assert_eq!(p0[0].value, b"p0");
        assert_eq!(p1[0].value, b"p1");
    }

    #[test]
    fn test_read_nonexistent_topic() {
        let engine = test_engine();
        let messages = engine.read("no-topic", 0, 0, 100).unwrap();
        assert!(messages.is_empty());
    }

    #[test]
    fn test_latest_offset() {
        let engine = test_engine();

        assert_eq!(engine.latest_offset("t", 0), 0);

        engine.append("t", 0, None, b"a", &[], 0).unwrap();
        engine.append("t", 0, None, b"b", &[], 0).unwrap();

        assert_eq!(engine.latest_offset("t", 0), 2);
    }

    #[test]
    fn test_recovery() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        // Write some messages.
        {
            let engine =
                StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
            for i in 0..5 {
                engine
                    .append("orders", 0, None, format!("msg-{i}").as_bytes(), &[], 0)
                    .unwrap();
            }
        }

        // Create a new engine and recover.
        {
            let engine = StorageEngine::new(fs, clock, config).unwrap();
            engine.recover().unwrap();

            // Should be able to read all messages.
            let messages = engine.read("orders", 0, 0, 100).unwrap();
            assert_eq!(messages.len(), 5);

            // The next offset should continue from 5.
            assert_eq!(engine.latest_offset("orders", 0), 5);

            // Should be able to write more.
            let offset = engine.append("orders", 0, None, b"msg-5", &[], 0).unwrap();
            assert_eq!(offset, 5);
        }
    }

    /// Regression: recovery must skip metadata JSON files in the data directory.
    #[test]
    fn test_recovery_with_metadata_files() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = WalConfig {
            max_segment_bytes: 1024 * 1024,
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        // Write messages and commit a consumer offset (creates JSON files in data_dir).
        {
            let engine =
                StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
            for i in 0..10 {
                engine
                    .append("orders", 0, None, format!("msg-{i}").as_bytes(), &[], 0)
                    .unwrap();
            }
            engine.commit_offset("group-1", "orders", 0, 5).unwrap();
        }

        // Recover - this used to fail with "Not a directory" because
        // consumer_offsets.json was treated as a topic directory.
        {
            let engine =
                StorageEngine::new(fs.clone(), clock.clone(), config.clone()).unwrap();
            engine.recover().unwrap();

            let messages = engine.read("orders", 0, 0, 100).unwrap();
            assert_eq!(messages.len(), 10);
            assert_eq!(engine.get_committed_offset("group-1", "orders", 0), Some(5));
        }
    }

    #[test]
    fn test_write_1000_read_all() {
        let engine = test_engine();

        for i in 0..1000 {
            engine.append("t", 0, None, b"x", &[], i).unwrap();
        }

        let messages = engine.read("t", 0, 0, 2000).unwrap();
        assert_eq!(messages.len(), 1000);
        assert_eq!(messages[0].offset, 0);
        assert_eq!(messages[999].offset, 999);
    }

    #[test]
    fn test_append_batch_and_read() {
        let engine = test_engine();

        let messages: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = (0..10)
            .map(|i| (None, b"data" as &[u8], &[] as &[Header], i as u64 * 100))
            .collect();

        let offsets = engine.append_batch("orders", 0, &messages).unwrap();
        assert_eq!(offsets.len(), 10);
        assert_eq!(offsets[0], 0);
        assert_eq!(offsets[9], 9);

        let read = engine.read("orders", 0, 0, 100).unwrap();
        assert_eq!(read.len(), 10);
        for (i, msg) in read.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
        }
    }

    #[test]
    fn test_append_batch_then_single() {
        let engine = test_engine();

        let messages: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = vec![
            (None, b"a" as &[u8], &[] as &[Header], 0),
            (None, b"b", &[], 0),
        ];

        let offsets = engine.append_batch("t", 0, &messages).unwrap();
        assert_eq!(offsets, vec![0, 1]);

        let offset = engine.append("t", 0, None, b"c", &[], 0).unwrap();
        assert_eq!(offset, 2);

        let read = engine.read("t", 0, 0, 100).unwrap();
        assert_eq!(read.len(), 3);
    }

    #[test]
    fn test_append_batch_empty() {
        let engine = test_engine();
        let offsets = engine
            .append_batch("t", 0, &[] as &[(Option<&[u8]>, &[u8], &[Header], u64)])
            .unwrap();
        assert!(offsets.is_empty());
    }
}
256
crates/sq-storage/src/index.rs
Normal file
@@ -0,0 +1,256 @@
use std::collections::BTreeMap;
use std::path::PathBuf;

/// An entry in the sparse offset index.
#[derive(Clone, Debug, PartialEq)]
pub struct IndexEntry {
    pub offset: u64,
    pub segment_path: PathBuf,
    /// Byte position within the segment file (past the segment header).
    pub byte_position: u64,
}

/// Location where a segment's data lives.
#[derive(Clone, Debug, PartialEq)]
pub enum SegmentLocation {
    Local(PathBuf),
    ObjectStore(String), // S3 key
}

/// Sparse in-memory offset index for fast consumer seeks.
///
/// Maps (topic, partition) → a sorted list of index entries.
/// Only every Nth offset is indexed (sparse sampling).
/// Lookups use binary search to find the nearest entry at or before the
/// requested offset.
pub struct OffsetIndex {
    /// Per (topic, partition): sorted vec of index entries.
    entries: BTreeMap<(String, u32), Vec<IndexEntry>>,
    /// Sample interval: index every Nth offset.
    sample_interval: u64,
}

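// Back-of-envelope sizing (illustrative): with sample_interval = 1000, a
// partition holding 10_000_000 offsets keeps ~10_000 IndexEntry values; at
// roughly 50-100 bytes each (two u64s plus a heap-allocated PathBuf), that
// is on the order of 1 MB per partition.
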
impl OffsetIndex {
    pub fn new(sample_interval: u64) -> Self {
        Self {
            entries: BTreeMap::new(),
            sample_interval: sample_interval.max(1),
        }
    }

    /// Add an entry to the index. Entries should be added in offset order.
    pub fn add_entry(&mut self, topic: &str, partition: u32, entry: IndexEntry) {
        let key = (topic.to_string(), partition);
        self.entries.entry(key).or_default().push(entry);
    }

    /// Register a segment's offset range, sampling every Nth offset.
    /// `base_offset` is the first offset in the segment.
    /// `end_offset` is the last offset (inclusive).
    pub fn register_segment(
        &mut self,
        topic: &str,
        partition: u32,
        segment_path: PathBuf,
        base_offset: u64,
        end_offset: u64,
    ) {
        // Always add an entry for the base offset.
        self.add_entry(
            topic,
            partition,
            IndexEntry {
                offset: base_offset,
                segment_path: segment_path.clone(),
                byte_position: 0, // will need to scan from the segment header
            },
        );

        // Add sampled entries.
        let mut o = base_offset + self.sample_interval;
        while o <= end_offset {
            self.add_entry(
                topic,
                partition,
                IndexEntry {
                    offset: o,
                    segment_path: segment_path.clone(),
                    byte_position: 0, // approximate; the reader will scan forward
                },
            );
            o += self.sample_interval;
        }
    }

    /// Look up the index entry at or before the given offset.
    /// Returns the nearest entry whose offset <= the requested offset.
    pub fn lookup(&self, topic: &str, partition: u32, offset: u64) -> Option<&IndexEntry> {
        let key = (topic.to_string(), partition);
        let entries = self.entries.get(&key)?;

        if entries.is_empty() {
            return None;
        }

        // Binary search for the largest entry.offset <= offset.
        match entries.binary_search_by_key(&offset, |e| e.offset) {
            Ok(i) => Some(&entries[i]),
            Err(0) => None, // offset is before all entries
            Err(i) => Some(&entries[i - 1]),
        }
    }

    /// Get the segment path containing the given offset (or the nearest segment before it).
    pub fn segment_for_offset(
        &self,
        topic: &str,
        partition: u32,
        offset: u64,
    ) -> Option<&PathBuf> {
        self.lookup(topic, partition, offset)
            .map(|e| &e.segment_path)
    }

    /// Get all known segment paths for a topic-partition, in offset order.
    pub fn segments(&self, topic: &str, partition: u32) -> Vec<PathBuf> {
        let key = (topic.to_string(), partition);
        let Some(entries) = self.entries.get(&key) else {
            return Vec::new();
        };

        let mut seen = std::collections::BTreeSet::new();
        let mut result = Vec::new();
        for entry in entries {
            if seen.insert(entry.segment_path.clone()) {
                result.push(entry.segment_path.clone());
            }
        }
        result
    }

    /// Get the earliest known offset for a topic-partition.
    pub fn earliest_offset(&self, topic: &str, partition: u32) -> Option<u64> {
        let key = (topic.to_string(), partition);
        self.entries
            .get(&key)
            .and_then(|entries| entries.first().map(|e| e.offset))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lookup_exact_offset() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("orders", 0, PathBuf::from("/seg0.wal"), 0, 999);

        let entry = index.lookup("orders", 0, 0).unwrap();
        assert_eq!(entry.offset, 0);
        assert_eq!(entry.segment_path, PathBuf::from("/seg0.wal"));
    }

    #[test]
    fn test_lookup_between_samples() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("orders", 0, PathBuf::from("/seg0.wal"), 0, 999);

        // Offset 50 is between samples 0 and 100, so it should return the entry for 0.
        let entry = index.lookup("orders", 0, 50).unwrap();
        assert_eq!(entry.offset, 0);

        // Offset 150 is between 100 and 200, so it should return the entry for 100.
        let entry = index.lookup("orders", 0, 150).unwrap();
        assert_eq!(entry.offset, 100);
    }

    #[test]
    fn test_lookup_beyond_last_entry() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("t", 0, PathBuf::from("/seg.wal"), 0, 250);

        // Offset 5000 is past all entries, so it should return the last entry.
        let entry = index.lookup("t", 0, 5000).unwrap();
        assert_eq!(entry.offset, 200);
    }

    #[test]
    fn test_lookup_before_first_entry() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("t", 0, PathBuf::from("/seg.wal"), 500, 999);

        // Offset 100 is before the first entry (500).
        assert!(index.lookup("t", 0, 100).is_none());
    }

    #[test]
    fn test_lookup_nonexistent_topic() {
        let index = OffsetIndex::new(100);
        assert!(index.lookup("no-topic", 0, 0).is_none());
    }

    #[test]
    fn test_multiple_segments() {
        let mut index = OffsetIndex::new(1000);
        index.register_segment("t", 0, PathBuf::from("/seg0.wal"), 0, 4999);
        index.register_segment("t", 0, PathBuf::from("/seg1.wal"), 5000, 9999);

        let entry = index.lookup("t", 0, 3000).unwrap();
        assert_eq!(entry.segment_path, PathBuf::from("/seg0.wal"));

        let entry = index.lookup("t", 0, 7000).unwrap();
        assert_eq!(entry.segment_path, PathBuf::from("/seg1.wal"));
    }

    #[test]
    fn test_topic_partition_isolation() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("a", 0, PathBuf::from("/a0.wal"), 0, 999);
        index.register_segment("b", 0, PathBuf::from("/b0.wal"), 0, 999);
        index.register_segment("a", 1, PathBuf::from("/a1.wal"), 0, 999);

        assert_eq!(
            index.segment_for_offset("a", 0, 50).unwrap(),
            &PathBuf::from("/a0.wal")
        );
        assert_eq!(
            index.segment_for_offset("b", 0, 50).unwrap(),
            &PathBuf::from("/b0.wal")
        );
        assert_eq!(
            index.segment_for_offset("a", 1, 50).unwrap(),
            &PathBuf::from("/a1.wal")
        );
    }

    #[test]
    fn test_segments_list() {
        let mut index = OffsetIndex::new(1000);
        index.register_segment("t", 0, PathBuf::from("/seg0.wal"), 0, 4999);
        index.register_segment("t", 0, PathBuf::from("/seg1.wal"), 5000, 9999);

        let segs = index.segments("t", 0);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0], PathBuf::from("/seg0.wal"));
        assert_eq!(segs[1], PathBuf::from("/seg1.wal"));
    }

    #[test]
    fn test_earliest_offset() {
        let mut index = OffsetIndex::new(100);
        index.register_segment("t", 0, PathBuf::from("/seg.wal"), 42, 999);
        assert_eq!(index.earliest_offset("t", 0), Some(42));
    }

    #[test]
    fn test_sample_interval() {
        let mut index = OffsetIndex::new(500);
        index.register_segment("t", 0, PathBuf::from("/seg.wal"), 0, 2000);

        // Should have entries at: 0, 500, 1000, 1500, 2000.
        let key = ("t".to_string(), 0);
        let entries = index.entries.get(&key).unwrap();
        let offsets: Vec<u64> = entries.iter().map(|e| e.offset).collect();
        assert_eq!(offsets, vec![0, 500, 1000, 1500, 2000]);
    }
}
@@ -0,0 +1,6 @@
pub mod consumer_offsets;
pub mod engine;
pub mod index;
pub mod object_store;
pub mod topic_metadata;
pub mod wal;

93
crates/sq-storage/src/object_store/layout.rs
Normal file
@@ -0,0 +1,93 @@
/// S3 key layout for shipped WAL segments.
///
/// Format: `{cluster_id}/{topic}/{partition}/{base_offset:020}-{end_offset:020}.sqseg`
///
/// The 020 zero-padding ensures lexicographic ordering matches offset ordering.
pub fn segment_key(
    cluster_id: &str,
    topic: &str,
    partition: u32,
    base_offset: u64,
    end_offset: u64,
) -> String {
    format!(
        "{}/{}/{}/{:020}-{:020}.sqseg",
        cluster_id, topic, partition, base_offset, end_offset
    )
}

/// Parse a segment key back into its components.
/// Returns (cluster_id, topic, partition, base_offset, end_offset).
pub fn parse_segment_key(key: &str) -> Option<(String, String, u32, u64, u64)> {
    let parts: Vec<&str> = key.split('/').collect();
    if parts.len() != 4 {
        return None;
    }

    let cluster_id = parts[0].to_string();
    let topic = parts[1].to_string();
    let partition: u32 = parts[2].parse().ok()?;

    let filename = parts[3].strip_suffix(".sqseg")?;
    let offsets: Vec<&str> = filename.split('-').collect();
    if offsets.len() != 2 {
        return None;
    }

    let base_offset: u64 = offsets[0].parse().ok()?;
    let end_offset: u64 = offsets[1].parse().ok()?;

    Some((cluster_id, topic, partition, base_offset, end_offset))
}

/// S3 key prefix for listing segments of a topic-partition.
pub fn topic_partition_prefix(cluster_id: &str, topic: &str, partition: u32) -> String {
    format!("{}/{}/{}/", cluster_id, topic, partition)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_segment_key_format() {
        let key = segment_key("cluster-1", "orders", 0, 0, 999);
        assert_eq!(
            key,
            "cluster-1/orders/0/00000000000000000000-00000000000000000999.sqseg"
        );
    }

    #[test]
    fn test_segment_key_lexicographic_order() {
        let k1 = segment_key("c", "t", 0, 0, 999);
        let k2 = segment_key("c", "t", 0, 1000, 1999);
        let k3 = segment_key("c", "t", 0, 2000, 2999);
        assert!(k1 < k2);
        assert!(k2 < k3);
    }

    #[test]
    fn test_parse_segment_key() {
        let key = segment_key("cluster-1", "orders", 2, 1000, 1999);
        let parsed = parse_segment_key(&key).unwrap();
        assert_eq!(parsed.0, "cluster-1");
        assert_eq!(parsed.1, "orders");
        assert_eq!(parsed.2, 2);
        assert_eq!(parsed.3, 1000);
        assert_eq!(parsed.4, 1999);
    }

    #[test]
    fn test_parse_invalid_key() {
        assert!(parse_segment_key("invalid").is_none());
        assert!(parse_segment_key("a/b/c").is_none());
        assert!(parse_segment_key("a/b/c/d.txt").is_none());
    }

    #[test]
    fn test_topic_partition_prefix() {
        let prefix = topic_partition_prefix("cluster-1", "orders", 0);
        assert_eq!(prefix, "cluster-1/orders/0/");
    }
}
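A minimal sketch of how these helpers compose on the consume path: list keys under the topic-partition prefix, then parse each key to find the segment covering a target offset. The `find_segment_for_offset` helper here is hypothetical glue for illustration, not part of the diff; it relies only on the zero-padding guarantee that listed keys come back in offset order.

use crate::object_store::layout::{parse_segment_key, topic_partition_prefix};

/// Hypothetical helper: given keys from `ObjectStore::list`, find the
/// segment whose [base_offset, end_offset] range covers `target`.
fn find_segment_for_offset(keys: &[String], target: u64) -> Option<String> {
    keys.iter()
        .filter_map(|k| parse_segment_key(k).map(|(_, _, _, base, end)| (k, base, end)))
        .find(|&(_, base, end)| base <= target && target <= end)
        .map(|(k, _, _)| k.clone())
}

// Usage sketch:
// let keys = store.list(&topic_partition_prefix("c1", "orders", 0)).await?;
// let key = find_segment_for_offset(&keys, 1500);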
159
crates/sq-storage/src/object_store/mod.rs
Normal file
@@ -0,0 +1,159 @@
pub mod layout;
pub mod reader;
pub mod s3;
pub mod shipper;

use std::collections::HashMap;
use std::sync::Mutex;

/// Trait for object storage backends (S3, MinIO, in-memory for tests).
#[allow(async_fn_in_trait)]
pub trait ObjectStore: Send + Sync + 'static {
    async fn put(&self, key: &str, data: Vec<u8>) -> anyhow::Result<()>;
    async fn get(&self, key: &str) -> anyhow::Result<Vec<u8>>;
    async fn list(&self, prefix: &str) -> anyhow::Result<Vec<String>>;
    async fn delete(&self, key: &str) -> anyhow::Result<()>;
    async fn exists(&self, key: &str) -> anyhow::Result<bool>;
}

/// In-memory object store for testing.
pub struct InMemoryObjectStore {
    data: Mutex<HashMap<String, Vec<u8>>>,
}

impl InMemoryObjectStore {
    pub fn new() -> Self {
        Self {
            data: Mutex::new(HashMap::new()),
        }
    }
}

impl Default for InMemoryObjectStore {
    fn default() -> Self {
        Self::new()
    }
}

impl ObjectStore for InMemoryObjectStore {
    async fn put(&self, key: &str, data: Vec<u8>) -> anyhow::Result<()> {
        self.data
            .lock()
            .unwrap()
            .insert(key.to_string(), data);
        Ok(())
    }

    async fn get(&self, key: &str) -> anyhow::Result<Vec<u8>> {
        self.data
            .lock()
            .unwrap()
            .get(key)
            .cloned()
            .ok_or_else(|| anyhow::anyhow!("key '{}' not found", key))
    }

    async fn list(&self, prefix: &str) -> anyhow::Result<Vec<String>> {
        let data = self.data.lock().unwrap();
        let mut keys: Vec<String> = data
            .keys()
            .filter(|k| k.starts_with(prefix))
            .cloned()
            .collect();
        keys.sort();
        Ok(keys)
    }

    async fn delete(&self, key: &str) -> anyhow::Result<()> {
        self.data.lock().unwrap().remove(key);
        Ok(())
    }

    async fn exists(&self, key: &str) -> anyhow::Result<bool> {
        Ok(self.data.lock().unwrap().contains_key(key))
    }
}

/// No-op object store that silently discards all data.
pub struct NoopObjectStore;

impl ObjectStore for NoopObjectStore {
    async fn put(&self, _key: &str, _data: Vec<u8>) -> anyhow::Result<()> {
        Ok(())
    }

    async fn get(&self, key: &str) -> anyhow::Result<Vec<u8>> {
        anyhow::bail!("NoopObjectStore: key '{}' not found", key)
    }

    async fn list(&self, _prefix: &str) -> anyhow::Result<Vec<String>> {
        Ok(vec![])
    }

    async fn delete(&self, _key: &str) -> anyhow::Result<()> {
        Ok(())
    }

    async fn exists(&self, _key: &str) -> anyhow::Result<bool> {
        Ok(false)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_inmemory_put_get() {
        let store = InMemoryObjectStore::new();
        store.put("test/key", b"hello".to_vec()).await.unwrap();

        let data = store.get("test/key").await.unwrap();
        assert_eq!(data, b"hello");
    }

    #[tokio::test]
    async fn test_inmemory_get_nonexistent() {
        let store = InMemoryObjectStore::new();
        let err = store.get("no/such/key").await.unwrap_err();
        assert!(err.to_string().contains("not found"));
    }

    #[tokio::test]
    async fn test_inmemory_list() {
        let store = InMemoryObjectStore::new();
        store.put("a/1", b"x".to_vec()).await.unwrap();
        store.put("a/2", b"y".to_vec()).await.unwrap();
        store.put("b/1", b"z".to_vec()).await.unwrap();

        let keys = store.list("a/").await.unwrap();
        assert_eq!(keys, vec!["a/1", "a/2"]);
    }

    #[tokio::test]
    async fn test_inmemory_delete() {
        let store = InMemoryObjectStore::new();
        store.put("key", b"data".to_vec()).await.unwrap();
        store.delete("key").await.unwrap();

        assert!(!store.exists("key").await.unwrap());
    }

    #[tokio::test]
    async fn test_inmemory_exists() {
        let store = InMemoryObjectStore::new();
        assert!(!store.exists("key").await.unwrap());

        store.put("key", b"data".to_vec()).await.unwrap();
        assert!(store.exists("key").await.unwrap());
    }

    #[tokio::test]
    async fn test_noop_put_get() {
        let store = NoopObjectStore;
        store.put("key", b"data".to_vec()).await.unwrap();

        // Get always fails on the noop store.
        assert!(store.get("key").await.is_err());
    }
}
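Because `ObjectStore` uses `async fn` in the trait, it is not dyn-compatible, so callers stay generic over the backend; the upside is that the same code path runs against `InMemoryObjectStore` in tests and `S3ObjectStore` in production. A sketch (the `mirror` helper is illustrative, not part of the diff):

use crate::object_store::ObjectStore;

/// Copy one key from `src` to `dst` — identical for any two backends.
async fn mirror<S: ObjectStore, D: ObjectStore>(
    src: &S,
    dst: &D,
    key: &str,
) -> anyhow::Result<()> {
    let data = src.get(key).await?;
    dst.put(key, data).await?;
    Ok(())
}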
209
crates/sq-storage/src/object_store/reader.rs
Normal file
@@ -0,0 +1,209 @@
use sq_models::Message;

use super::ObjectStore;
use crate::wal::reader::WalReader;
use sq_sim::fs::FileSystem;
use std::path::PathBuf;
use std::sync::Arc;

/// Reads segments from object storage, decompressing and parsing them.
pub struct ObjectStoreReader<F: FileSystem, O: ObjectStore> {
    fs: Arc<F>,
    object_store: Arc<O>,
    cache_dir: PathBuf,
    wal_reader: WalReader<F>,
}

impl<F: FileSystem, O: ObjectStore> ObjectStoreReader<F, O> {
    pub fn new(fs: Arc<F>, object_store: Arc<O>, cache_dir: PathBuf) -> Self {
        let wal_reader = WalReader::new(fs.clone());
        Self {
            fs,
            object_store,
            cache_dir,
            wal_reader,
        }
    }

    /// Fetch a segment from object storage, decompress it, cache it locally, and read messages.
    pub async fn read_segment(
        &self,
        key: &str,
        from_offset: u64,
    ) -> anyhow::Result<Vec<Message>> {
        // Check the local cache first.
        let cache_path = self.cache_path(key);

        if !self.fs.exists(&cache_path) {
            // Download from the object store.
            let compressed = self.object_store.get(key).await?;

            // Decompress zstd.
            let decompressed = zstd::decode_all(compressed.as_slice())?;

            // Cache locally.
            if let Some(parent) = cache_path.parent() {
                self.fs.create_dir_all(parent)?;
            }
            let mut handle = self.fs.open_write(&cache_path)?;
            handle.write_all(&decompressed)?;
        }

        // Read from the cached file.
        Ok(self.wal_reader.read_from_offset(&cache_path, from_offset)?)
    }

    /// List segment keys in object storage matching a prefix.
    pub async fn list_segment_keys(&self, prefix: &str) -> anyhow::Result<Vec<String>> {
        self.object_store.list(prefix).await
    }

    fn cache_path(&self, key: &str) -> PathBuf {
        // Replace '/' with '_' for a flat cache directory.
        let safe_name = key.replace('/', "_");
        self.cache_dir.join(safe_name)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::object_store::InMemoryObjectStore;
    use crate::wal::record::encode_record;
    use crate::wal::segment::SegmentHeader;
    use sq_sim::fs::InMemoryFileSystem;

    fn build_test_segment(topic: &str, partition: u32, messages: &[Message]) -> Vec<u8> {
        let mut data = Vec::new();

        // Write the segment header.
        let header = SegmentHeader {
            topic: topic.to_string(),
            partition,
        };
        data.extend_from_slice(&header.encode());

        // Write the records.
        for msg in messages {
            data.extend_from_slice(&encode_record(msg));
        }

        data
    }

    #[tokio::test]
    async fn test_read_from_object_store() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let store = Arc::new(InMemoryObjectStore::new());

        let messages = vec![
            Message {
                offset: 0,
                topic: "orders".into(),
                partition: 0,
                key: None,
                value: b"msg-0".to_vec(),
                headers: vec![],
                timestamp_ms: 100,
            },
            Message {
                offset: 1,
                topic: "orders".into(),
                partition: 0,
                key: None,
                value: b"msg-1".to_vec(),
                headers: vec![],
                timestamp_ms: 200,
            },
        ];

        let segment_data = build_test_segment("orders", 0, &messages);
        let compressed = zstd::encode_all(segment_data.as_slice(), 3).unwrap();

        store
            .put(
                "cluster/orders/0/00000000000000000000-00000000000000000001.sqseg",
                compressed,
            )
            .await
            .unwrap();

        let reader = ObjectStoreReader::new(fs, store, PathBuf::from("/cache"));

        let result = reader
            .read_segment(
                "cluster/orders/0/00000000000000000000-00000000000000000001.sqseg",
                0,
            )
            .await
            .unwrap();

        assert_eq!(result.len(), 2);
        assert_eq!(result[0].value, b"msg-0");
        assert_eq!(result[1].value, b"msg-1");
    }

    #[tokio::test]
    async fn test_cached_read() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let store = Arc::new(InMemoryObjectStore::new());

        let messages = vec![Message {
            offset: 0,
            topic: "t".into(),
            partition: 0,
            key: None,
            value: b"data".to_vec(),
            headers: vec![],
            timestamp_ms: 0,
        }];

        let segment_data = build_test_segment("t", 0, &messages);
        let compressed = zstd::encode_all(segment_data.as_slice(), 3).unwrap();

        let key = "cluster/t/0/00000000000000000000-00000000000000000000.sqseg";
        store.put(key, compressed).await.unwrap();

        let reader = ObjectStoreReader::new(fs.clone(), store.clone(), PathBuf::from("/cache"));

        // First read - fetches from the store.
        let result1 = reader.read_segment(key, 0).await.unwrap();
        assert_eq!(result1.len(), 1);

        // Delete from the store to prove the cached read works.
        store.delete(key).await.unwrap();

        // Second read - uses the cache.
        let result2 = reader.read_segment(key, 0).await.unwrap();
        assert_eq!(result2.len(), 1);
        assert_eq!(result2[0].value, b"data");
    }

    #[tokio::test]
    async fn test_read_from_offset() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let store = Arc::new(InMemoryObjectStore::new());

        let messages: Vec<Message> = (0..5)
            .map(|i| Message {
                offset: i,
                topic: "t".into(),
                partition: 0,
                key: None,
                value: format!("msg-{i}").into_bytes(),
                headers: vec![],
                timestamp_ms: i * 100,
            })
            .collect();

        let segment_data = build_test_segment("t", 0, &messages);
        let compressed = zstd::encode_all(segment_data.as_slice(), 3).unwrap();

        let key = "cluster/t/0/00000000000000000000-00000000000000000004.sqseg";
        store.put(key, compressed).await.unwrap();

        let reader = ObjectStoreReader::new(fs, store, PathBuf::from("/cache"));

        let result = reader.read_segment(key, 3).await.unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].offset, 3);
        assert_eq!(result[1].offset, 4);
    }
}
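One detail worth making concrete: the cache-key flattening. This illustrative check (not in the diff) shows the mapping, and also hints at a caveat — two different keys can collide if cluster or topic names themselves contain '_':

#[test]
fn cache_path_flattening_illustration() {
    // "cluster/orders/0/seg.sqseg" caches as "cluster_orders_0_seg.sqseg".
    let key = "cluster/orders/0/seg.sqseg";
    assert_eq!(key.replace('/', "_"), "cluster_orders_0_seg.sqseg");

    // Caveat: "a/b_c" and "a_b/c" flatten identically.
    assert_eq!("a/b_c".replace('/', "_"), "a_b/c".replace('/', "_"));
}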
106
crates/sq-storage/src/object_store/s3.rs
Normal file
@@ -0,0 +1,106 @@
use object_store::aws::{AmazonS3, AmazonS3Builder};
use object_store::path::Path as ObjectPath;
use object_store::ObjectStore as _;

use super::ObjectStore;

/// S3-backed object store using the `object_store` crate.
/// Works with AWS S3, MinIO, and any S3-compatible endpoint.
pub struct S3ObjectStore {
    store: AmazonS3,
    bucket: String,
}

/// Configuration for the S3 object store.
pub struct S3Config {
    pub bucket: String,
    pub region: String,
    pub endpoint: Option<String>,
    pub access_key_id: Option<String>,
    pub secret_access_key: Option<String>,
    /// Allow HTTP (non-TLS) connections. Required for local MinIO.
    pub allow_http: bool,
}

impl S3ObjectStore {
    pub fn new(config: S3Config) -> anyhow::Result<Self> {
        let mut builder = AmazonS3Builder::new()
            .with_bucket_name(&config.bucket)
            .with_region(&config.region);

        if let Some(endpoint) = &config.endpoint {
            builder = builder.with_endpoint(endpoint);
        }

        if let Some(key) = &config.access_key_id {
            builder = builder.with_access_key_id(key);
        }

        if let Some(secret) = &config.secret_access_key {
            builder = builder.with_secret_access_key(secret);
        }

        if config.allow_http {
            builder = builder.with_allow_http(true);
        }

        let store = builder.build()?;

        Ok(Self {
            store,
            bucket: config.bucket,
        })
    }

    /// Get the bucket name.
    pub fn bucket(&self) -> &str {
        &self.bucket
    }
}

impl ObjectStore for S3ObjectStore {
    async fn put(&self, key: &str, data: Vec<u8>) -> anyhow::Result<()> {
        let path = ObjectPath::from(key);
        self.store
            .put(&path, bytes::Bytes::from(data).into())
            .await?;
        Ok(())
    }

    async fn get(&self, key: &str) -> anyhow::Result<Vec<u8>> {
        let path = ObjectPath::from(key);
        let result = self.store.get(&path).await?;
        let bytes = result.bytes().await?;
        Ok(bytes.to_vec())
    }

    async fn list(&self, prefix: &str) -> anyhow::Result<Vec<String>> {
        use futures::TryStreamExt;

        let prefix_path = ObjectPath::from(prefix);
        let mut keys = Vec::new();

        let mut stream = self.store.list(Some(&prefix_path));
        while let Some(meta) = stream.try_next().await? {
            keys.push(meta.location.to_string());
        }

        keys.sort();
        Ok(keys)
    }

    async fn delete(&self, key: &str) -> anyhow::Result<()> {
        let path = ObjectPath::from(key);
        self.store.delete(&path).await?;
        Ok(())
    }

    async fn exists(&self, key: &str) -> anyhow::Result<bool> {
        let path = ObjectPath::from(key);
        match self.store.head(&path).await {
            Ok(_) => Ok(true),
            Err(object_store::Error::NotFound { .. }) => Ok(false),
            Err(e) => Err(e.into()),
        }
    }
}
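For local development against MinIO, the config might look like the following; the endpoint and credentials are illustrative defaults for a stock MinIO container, not values from the diff:

let config = S3Config {
    bucket: "sq-segments".to_string(),
    region: "us-east-1".to_string(),
    endpoint: Some("http://127.0.0.1:9000".to_string()),
    access_key_id: Some("minioadmin".to_string()),
    secret_access_key: Some("minioadmin".to_string()),
    allow_http: true, // a stock MinIO setup serves plain HTTP
};
let store = S3ObjectStore::new(config)?;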
273
crates/sq-storage/src/object_store/shipper.rs
Normal file
@@ -0,0 +1,273 @@
use std::collections::HashSet;
use std::path::PathBuf;
use std::sync::Arc;

use sq_models::ClosedSegment;
use sq_sim::fs::FileSystem;
use tokio::sync::Mutex;

use super::layout;
use super::ObjectStore;

/// Tracks which segments have been shipped to object storage.
pub struct ShippedSegments {
    shipped: HashSet<PathBuf>,
}

impl ShippedSegments {
    pub fn new() -> Self {
        Self {
            shipped: HashSet::new(),
        }
    }

    pub fn mark_shipped(&mut self, path: PathBuf) {
        self.shipped.insert(path);
    }

    pub fn is_shipped(&self, path: &PathBuf) -> bool {
        self.shipped.contains(path)
    }

    pub fn shipped_paths(&self) -> &HashSet<PathBuf> {
        &self.shipped
    }
}

impl Default for ShippedSegments {
    fn default() -> Self {
        Self::new()
    }
}

/// Ships closed WAL segments to object storage with zstd compression.
pub struct SegmentShipper<F: FileSystem, O: ObjectStore> {
    fs: Arc<F>,
    object_store: Arc<O>,
    cluster_id: String,
    shipped: Arc<Mutex<ShippedSegments>>,
}

impl<F: FileSystem, O: ObjectStore> SegmentShipper<F, O> {
    pub fn new(
        fs: Arc<F>,
        object_store: Arc<O>,
        cluster_id: String,
        shipped: Arc<Mutex<ShippedSegments>>,
    ) -> Self {
        Self {
            fs,
            object_store,
            cluster_id,
            shipped,
        }
    }

    /// Ship a single closed segment to object storage.
    /// Reads the local WAL file, compresses it with zstd, and uploads it.
    pub async fn ship_segment(&self, segment: &ClosedSegment) -> anyhow::Result<()> {
        // Check whether it was already shipped.
        {
            let shipped = self.shipped.lock().await;
            if shipped.is_shipped(&segment.path) {
                return Ok(());
            }
        }

        // Read the local WAL file.
        let mut handle = self.fs.open_read(&segment.path)?;
        let mut raw_data = Vec::new();
        handle.read_to_end(&mut raw_data)?;

        // Compress with zstd.
        let compressed = zstd::encode_all(raw_data.as_slice(), 3)?;

        // Build the S3 key.
        let key = layout::segment_key(
            &self.cluster_id,
            segment.topic.as_str(),
            segment.partition,
            segment.base_offset,
            segment.end_offset,
        );

        // Upload.
        self.object_store.put(&key, compressed).await?;

        tracing::info!(
            topic = %segment.topic,
            partition = segment.partition,
            base_offset = segment.base_offset,
            end_offset = segment.end_offset,
            key = %key,
            "shipped segment to object store"
        );

        // Mark as shipped.
        {
            let mut shipped = self.shipped.lock().await;
            shipped.mark_shipped(segment.path.clone());
        }

        Ok(())
    }

    /// Ship all provided closed segments. Returns the number of successfully shipped segments.
    pub async fn ship_all(&self, segments: &[ClosedSegment]) -> usize {
        let mut shipped_count = 0;
        for segment in segments {
            match self.ship_segment(segment).await {
                Ok(()) => shipped_count += 1,
                Err(e) => {
                    tracing::warn!(
                        topic = %segment.topic,
                        partition = segment.partition,
                        path = %segment.path.display(),
                        error = %e,
                        "failed to ship segment, will retry"
                    );
                }
            }
        }
        shipped_count
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::object_store::InMemoryObjectStore;
    use sq_sim::fs::InMemoryFileSystem;
    use std::path::Path;

    fn setup() -> (
        Arc<InMemoryFileSystem>,
        Arc<InMemoryObjectStore>,
        SegmentShipper<InMemoryFileSystem, InMemoryObjectStore>,
    ) {
        let fs = Arc::new(InMemoryFileSystem::new());
        let store = Arc::new(InMemoryObjectStore::new());
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));
        let shipper = SegmentShipper::new(
            fs.clone(),
            store.clone(),
            "test-cluster".to_string(),
            shipped,
        );
        (fs, store, shipper)
    }

    fn create_test_segment(fs: &InMemoryFileSystem, path: &Path, data: &[u8]) {
        fs.create_dir_all(path.parent().unwrap()).unwrap();
        let mut handle = fs.open_write(path).unwrap();
        handle.write_all(data).unwrap();
    }

    #[tokio::test]
    async fn test_ship_segment() {
        let (fs, store, shipper) = setup();

        let seg_path = PathBuf::from("/data/orders/0/00000000000000000000.wal");
        create_test_segment(&fs, &seg_path, b"wal data here");

        let segment = ClosedSegment {
            topic: "orders".into(),
            partition: 0,
            base_offset: 0,
            end_offset: 99,
            path: seg_path,
            size_bytes: 13,
        };

        shipper.ship_segment(&segment).await.unwrap();

        // Verify it's in the object store.
        let key = layout::segment_key("test-cluster", "orders", 0, 0, 99);
        let data = store.get(&key).await.unwrap();

        // The data should be zstd-compressed, so decompress and verify.
        let decompressed = zstd::decode_all(data.as_slice()).unwrap();
        assert_eq!(decompressed, b"wal data here");
    }

    #[tokio::test]
    async fn test_ship_already_shipped_is_noop() {
        let (fs, store, shipper) = setup();

        let seg_path = PathBuf::from("/data/orders/0/00000000000000000000.wal");
        create_test_segment(&fs, &seg_path, b"data");

        let segment = ClosedSegment {
            topic: "orders".into(),
            partition: 0,
            base_offset: 0,
            end_offset: 99,
            path: seg_path,
            size_bytes: 4,
        };

        shipper.ship_segment(&segment).await.unwrap();
        // Ship again - should be a noop.
        shipper.ship_segment(&segment).await.unwrap();

        let keys = store.list("test-cluster/").await.unwrap();
        assert_eq!(keys.len(), 1);
    }

    #[tokio::test]
    async fn test_ship_all_counts() {
        let (fs, store, shipper) = setup();

        let mut segments = Vec::new();
        for i in 0..3 {
            let path = PathBuf::from(format!("/data/t/0/{:020}.wal", i * 100));
            create_test_segment(&fs, &path, format!("data-{i}").as_bytes());
            segments.push(ClosedSegment {
                topic: "t".into(),
                partition: 0,
                base_offset: i * 100,
                end_offset: i * 100 + 99,
                path,
                size_bytes: 6,
            });
        }

        let count = shipper.ship_all(&segments).await;
        assert_eq!(count, 3);

        let keys = store.list("test-cluster/").await.unwrap();
        assert_eq!(keys.len(), 3);
    }

    #[tokio::test]
    async fn test_s3_key_layout() {
        let (fs, store, shipper) = setup();

        let seg_path = PathBuf::from("/data/events/2/00000000000000001000.wal");
        create_test_segment(&fs, &seg_path, b"data");

        let segment = ClosedSegment {
            topic: "events".into(),
            partition: 2,
            base_offset: 1000,
            end_offset: 1999,
            path: seg_path,
            size_bytes: 4,
        };

        shipper.ship_segment(&segment).await.unwrap();

        let expected_key = "test-cluster/events/2/00000000000000001000-00000000000000001999.sqseg";
        assert!(store.exists(expected_key).await.unwrap());
    }

    #[test]
    fn test_shipped_segments_tracking() {
        let mut shipped = ShippedSegments::new();
        let path = PathBuf::from("/data/t/0/000.wal");

        assert!(!shipped.is_shipped(&path));
        shipped.mark_shipped(path.clone());
        assert!(shipped.is_shipped(&path));
    }
}
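The shipper is pull-driven; what invokes it sits outside this diff, but a plausible driver is a periodic task. A sketch only — `collect_closed_segments` is a hypothetical source of work, standing in for wherever the engine tracks recently closed segments:

async fn shipping_loop<F, O>(shipper: SegmentShipper<F, O>)
where
    F: FileSystem,
    O: ObjectStore,
{
    let mut tick = tokio::time::interval(std::time::Duration::from_secs(10));
    loop {
        tick.tick().await;
        // Hypothetical: fetch segments the WAL writer has closed since the
        // last pass. ship_all skips anything already marked shipped.
        let closed: Vec<ClosedSegment> = collect_closed_segments();
        let shipped = shipper.ship_all(&closed).await;
        tracing::debug!(shipped, total = closed.len(), "shipping pass complete");
    }
}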
225
crates/sq-storage/src/topic_metadata.rs
Normal file
@@ -0,0 +1,225 @@
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;

use sq_models::TopicConfig;
use sq_sim::fs::FileSystem;

/// Manages topic metadata (name, partitions, replication factor).
/// Backed by a JSON file for persistence.
pub struct TopicMetadata<F: FileSystem> {
    topics: HashMap<String, TopicConfig>,
    persist_path: PathBuf,
    fs: Arc<F>,
}

impl<F: FileSystem> TopicMetadata<F> {
    pub fn new(fs: Arc<F>, data_dir: &Path) -> Self {
        let persist_path = data_dir.join("topic_metadata.json");
        Self {
            topics: HashMap::new(),
            persist_path,
            fs,
        }
    }

    /// Create a new topic. Returns an error if the topic already exists.
    pub fn create_topic(&mut self, config: TopicConfig) -> anyhow::Result<()> {
        if self.topics.contains_key(config.name.as_str()) {
            anyhow::bail!("topic '{}' already exists", config.name);
        }
        self.topics.insert(config.name.to_string(), config);
        self.persist()
    }

    /// Delete a topic by name. Returns an error if the topic doesn't exist.
    pub fn delete_topic(&mut self, name: &str) -> anyhow::Result<()> {
        if self.topics.remove(name).is_none() {
            anyhow::bail!("topic '{}' not found", name);
        }
        self.persist()
    }

    /// List all topics, sorted by name.
    pub fn list_topics(&self) -> Vec<&TopicConfig> {
        let mut topics: Vec<_> = self.topics.values().collect();
        topics.sort_by_key(|t| t.name.as_str());
        topics
    }

    /// Get a specific topic's config.
    pub fn get_topic(&self, name: &str) -> Option<&TopicConfig> {
        self.topics.get(name)
    }

    /// Check if a topic exists.
    pub fn topic_exists(&self, name: &str) -> bool {
        self.topics.contains_key(name)
    }

    fn persist(&self) -> anyhow::Result<()> {
        let entries: Vec<TopicEntry> = self
            .topics
            .values()
            .map(|c| TopicEntry {
                name: c.name.to_string(),
                partitions: c.partitions,
                replication_factor: c.replication_factor,
            })
            .collect();

        let json = serde_json::to_vec(&entries)?;

        if let Some(parent) = self.persist_path.parent() {
            self.fs.create_dir_all(parent)?;
        }

        let mut handle = self.fs.open_write(&self.persist_path)?;
        handle.write_all(&json)?;
        handle.fsync()?;

        Ok(())
    }

    /// Load topic metadata from disk.
    pub fn load(fs: Arc<F>, data_dir: &Path) -> anyhow::Result<Self> {
        let persist_path = data_dir.join("topic_metadata.json");

        if !fs.exists(&persist_path) {
            return Ok(Self {
                topics: HashMap::new(),
                persist_path,
                fs,
            });
        }

        let mut handle = fs.open_read(&persist_path)?;
        let mut buf = Vec::new();
        handle.read_to_end(&mut buf)?;

        let entries: Vec<TopicEntry> = serde_json::from_slice(&buf)?;

        let mut topics = HashMap::new();
        for entry in entries {
            let config = TopicConfig::new(entry.name.as_str())
                .with_partitions(entry.partitions)
                .with_replication_factor(entry.replication_factor);
            topics.insert(entry.name, config);
        }

        Ok(Self {
            topics,
            persist_path,
            fs,
        })
    }
}

#[derive(serde::Serialize, serde::Deserialize)]
struct TopicEntry {
    name: String,
    partitions: u32,
    replication_factor: u32,
}

#[cfg(test)]
mod tests {
    use super::*;
    use sq_sim::fs::InMemoryFileSystem;

    fn test_metadata() -> TopicMetadata<InMemoryFileSystem> {
        let fs = Arc::new(InMemoryFileSystem::new());
        TopicMetadata::new(fs, Path::new("/data"))
    }

    #[test]
    fn test_create_and_get_topic() {
        let mut meta = test_metadata();
        meta.create_topic(TopicConfig::new("orders")).unwrap();

        let topic = meta.get_topic("orders").unwrap();
        assert_eq!(topic.name.as_str(), "orders");
        assert_eq!(topic.partitions, 1);
        assert_eq!(topic.replication_factor, 3);
    }

    #[test]
    fn test_create_duplicate_fails() {
        let mut meta = test_metadata();
        meta.create_topic(TopicConfig::new("orders")).unwrap();

        let err = meta.create_topic(TopicConfig::new("orders")).unwrap_err();
        assert!(err.to_string().contains("already exists"));
    }

    #[test]
    fn test_delete_topic() {
        let mut meta = test_metadata();
        meta.create_topic(TopicConfig::new("orders")).unwrap();
        meta.delete_topic("orders").unwrap();

        assert!(meta.get_topic("orders").is_none());
    }

    #[test]
    fn test_delete_nonexistent_fails() {
        let mut meta = test_metadata();
        let err = meta.delete_topic("orders").unwrap_err();
        assert!(err.to_string().contains("not found"));
    }

    #[test]
    fn test_list_topics_sorted() {
        let mut meta = test_metadata();
        meta.create_topic(TopicConfig::new("zebra")).unwrap();
        meta.create_topic(TopicConfig::new("alpha")).unwrap();
        meta.create_topic(TopicConfig::new("middle")).unwrap();

        let topics = meta.list_topics();
        assert_eq!(topics.len(), 3);
        assert_eq!(topics[0].name.as_str(), "alpha");
        assert_eq!(topics[1].name.as_str(), "middle");
        assert_eq!(topics[2].name.as_str(), "zebra");
    }

    #[test]
    fn test_persist_and_load() {
        let fs = Arc::new(InMemoryFileSystem::new());

        {
            let mut meta = TopicMetadata::new(fs.clone(), Path::new("/data"));
            meta.create_topic(
                TopicConfig::new("orders")
                    .with_partitions(4)
                    .with_replication_factor(2),
            )
            .unwrap();
            meta.create_topic(TopicConfig::new("events")).unwrap();
        }

        let loaded = TopicMetadata::load(fs, Path::new("/data")).unwrap();
        assert_eq!(loaded.list_topics().len(), 2);

        let orders = loaded.get_topic("orders").unwrap();
        assert_eq!(orders.partitions, 4);
        assert_eq!(orders.replication_factor, 2);

        assert!(loaded.topic_exists("events"));
    }

    #[test]
    fn test_load_nonexistent_file() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let meta = TopicMetadata::load(fs, Path::new("/data")).unwrap();
        assert!(meta.list_topics().is_empty());
    }

    #[test]
    fn test_topic_exists() {
        let mut meta = test_metadata();
        assert!(!meta.topic_exists("orders"));

        meta.create_topic(TopicConfig::new("orders")).unwrap();
        assert!(meta.topic_exists("orders"));
    }
}
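Given the `TopicEntry` derive, the persisted file is just a JSON array of entries. The values below are illustrative (the 4/2 topic mirrors the persist test; `TopicConfig::new` defaults of 1 partition and replication factor 3 match the tests above):

// Example contents of /data/topic_metadata.json after creating two topics:
//
//   [
//     {"name":"orders","partitions":4,"replication_factor":2},
//     {"name":"events","partitions":1,"replication_factor":3}
//   ]
//
// Note: serde_json::to_vec emits compact JSON; it is pretty-printed here
// for readability only.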
5
crates/sq-storage/src/wal/mod.rs
Normal file
@@ -0,0 +1,5 @@
pub mod reader;
pub mod record;
pub mod segment;
pub mod trimmer;
pub mod writer;
281
crates/sq-storage/src/wal/reader.rs
Normal file
@@ -0,0 +1,281 @@
use std::path::Path;
use std::sync::Arc;

use sq_models::{Message, TopicName};
use sq_sim::fs::FileSystem;

use super::record::{decode_record, RecordError, MIN_RECORD_SIZE};
use super::segment::{SegmentHeader, SegmentHeaderError, SEGMENT_HEADER_SIZE};

/// Errors from reading WAL segments.
#[derive(Debug, thiserror::Error)]
pub enum ReaderError {
    #[error("io error: {0}")]
    Io(#[from] std::io::Error),
    #[error("segment header error: {0}")]
    SegmentHeader(#[from] SegmentHeaderError),
    #[error("record error at byte offset {byte_offset}: {source}")]
    Record {
        byte_offset: usize,
        source: RecordError,
    },
}

/// WAL segment reader. Reads messages from segment files.
pub struct WalReader<F: FileSystem> {
    fs: Arc<F>,
}

impl<F: FileSystem> WalReader<F> {
    pub fn new(fs: Arc<F>) -> Self {
        Self { fs }
    }

    /// Read the segment header from a segment file.
    pub fn read_segment_header(&self, path: &Path) -> Result<SegmentHeader, ReaderError> {
        let mut fh = self.fs.open_read(path)?;
        let mut header_buf = [0u8; SEGMENT_HEADER_SIZE];
        fh.read_exact(&mut header_buf)?;
        Ok(SegmentHeader::decode(&header_buf)?)
    }

    /// Read all messages from a segment file.
    pub fn read_segment(&self, path: &Path) -> Result<Vec<Message>, ReaderError> {
        let header = self.read_segment_header(path)?;
        let topic = TopicName::from(header.topic.as_str());

        let mut fh = self.fs.open_read(path)?;
        let mut all_bytes = Vec::new();
        fh.read_to_end(&mut all_bytes)?;

        let data = &all_bytes[SEGMENT_HEADER_SIZE..];
        Self::decode_records(data, &topic, header.partition)
    }

    /// Read messages from a segment file starting at a given offset.
    /// Returns all messages with offset >= `from_offset`.
    pub fn read_from_offset(
        &self,
        path: &Path,
        from_offset: u64,
    ) -> Result<Vec<Message>, ReaderError> {
        let all = self.read_segment(path)?;
        Ok(all.into_iter().filter(|m| m.offset >= from_offset).collect())
    }

    /// Decode records from a byte buffer. Stops at the first unrecoverable error
    /// or end of data. Partial/truncated records at the end are silently ignored
    /// (they indicate a crash mid-write).
    fn decode_records(
        data: &[u8],
        topic: &TopicName,
        partition: u32,
    ) -> Result<Vec<Message>, ReaderError> {
        let mut messages = Vec::new();
        let mut pos = 0;

        while pos + MIN_RECORD_SIZE <= data.len() {
            match decode_record(&data[pos..], topic, partition) {
                Ok((msg, consumed)) => {
                    messages.push(msg);
                    pos += consumed;
                }
                Err(RecordError::BufferTooShort { .. }) => {
                    // Truncated record at end of segment (partial write) — stop cleanly.
                    break;
                }
                Err(e) => {
                    return Err(ReaderError::Record {
                        byte_offset: SEGMENT_HEADER_SIZE + pos,
                        source: e,
                    });
                }
            }
        }

        Ok(messages)
    }
}

#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use sq_sim::fs::InMemoryFileSystem;
    use sq_sim::SimClock;

    use super::*;
    use crate::wal::writer::WalWriter;
    use sq_models::WalConfig;

    fn test_setup() -> (Arc<InMemoryFileSystem>, Arc<SimClock>, WalConfig) {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let config = WalConfig {
            max_segment_bytes: 1024 * 1024, // large, so no rotation during tests
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };
        (fs, clock, config)
    }

    #[test]
    fn test_read_segment_header() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("orders");

        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 5).unwrap();
        writer.append(None, b"data", &[], 0).unwrap();

        let seg_path = crate::wal::writer::segment_path(
            &PathBuf::from("/data"),
            &topic,
            5,
            0,
        );

        let reader = WalReader::new(fs);
        let header = reader.read_segment_header(&seg_path).unwrap();
        assert_eq!(header.topic, "orders");
        assert_eq!(header.partition, 5);
    }

    #[test]
    fn test_write_then_read_all() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("events");

        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();

        for i in 0..10 {
            writer
                .append(None, format!("msg-{i}").as_bytes(), &[], i * 100)
                .unwrap();
        }

        let seg_path =
            crate::wal::writer::segment_path(&PathBuf::from("/data"), &topic, 0, 0);

        let reader = WalReader::new(fs);
        let messages = reader.read_segment(&seg_path).unwrap();

        assert_eq!(messages.len(), 10);
        for (i, msg) in messages.iter().enumerate() {
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("msg-{i}").as_bytes());
            assert_eq!(msg.timestamp_ms, i as u64 * 100);
        }
    }

    #[test]
    fn test_read_from_offset() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("t");

        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();

        for _ in 0..10 {
            writer.append(None, b"data", &[], 0).unwrap();
        }

        let seg_path =
            crate::wal::writer::segment_path(&PathBuf::from("/data"), &topic, 0, 0);

        let reader = WalReader::new(fs);
        let messages = reader.read_from_offset(&seg_path, 5).unwrap();

        assert_eq!(messages.len(), 5);
        assert_eq!(messages[0].offset, 5);
        assert_eq!(messages[4].offset, 9);
    }

    #[test]
    fn test_read_empty_segment() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("t");

        // Set up a writer and write once so the topic directory exists.
        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();
        writer.append(None, b"x", &[], 0).unwrap();

        // Create a segment with just a header (no records).
        let empty_path = PathBuf::from("/data/t/0/empty.wal");
        {
            let mut fh = fs.open_write(&empty_path).unwrap();
            let header = super::super::segment::SegmentHeader {
                topic: "t".to_string(),
                partition: 0,
            };
            fh.write_all(&header.encode()).unwrap();
        }

        let reader = WalReader::new(fs);
        let messages = reader.read_segment(&empty_path).unwrap();
        assert!(messages.is_empty());
    }

    #[test]
    fn test_corrupted_record_returns_error() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("t");

        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();
        writer.append(None, b"data", &[], 0).unwrap();

        let seg_path =
            crate::wal::writer::segment_path(&PathBuf::from("/data"), &topic, 0, 0);

        // Corrupt a byte in the record area (past the segment header).
        fs.corrupt_bytes(&seg_path, (SEGMENT_HEADER_SIZE + 10) as u64, 1);

        let reader = WalReader::new(fs);
        let result = reader.read_segment(&seg_path);
        assert!(result.is_err());
        match result.unwrap_err() {
            ReaderError::Record { source, .. } => {
                assert!(matches!(source, RecordError::CrcMismatch { .. }));
            }
            other => panic!("expected Record error, got: {other:?}"),
        }
    }

    #[test]
    fn test_truncated_record_at_end_is_ignored() {
        let (fs, clock, config) = test_setup();
        let topic = TopicName::from("t");

        let mut writer =
            WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();
        writer.append(None, b"good message", &[], 0).unwrap();

        let seg_path =
            crate::wal::writer::segment_path(&PathBuf::from("/data"), &topic, 0, 0);

        // Append some garbage bytes (simulating a partial write before a crash).
        {
            let mut fh = fs.open_append(&seg_path).unwrap();
            fh.write_all(&[0xDE, 0xAD, 0xBE, 0xEF, 0x00]).unwrap();
        }

        let reader = WalReader::new(fs);
        let messages = reader.read_segment(&seg_path).unwrap();
        // Should get the one good message and ignore the garbage.
        assert_eq!(messages.len(), 1);
        assert_eq!(messages[0].value, b"good message");
    }

    #[test]
    fn test_read_nonexistent_file() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let reader = WalReader::new(fs);
        let result = reader.read_segment(Path::new("/no/such/file.wal"));
        assert!(result.is_err());
    }
}
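The `BufferTooShort` sentinel is what makes crash recovery quiet: a record whose tail was lost mid-write decodes to that error, never to a bogus message, and `decode_records` treats it as a clean end of segment. An illustrative snippet using the `record.rs` APIs shown in the next file:

// Illustrative only: encode a record, then chop off its final byte to
// simulate a partial write.
let msg = Message {
    offset: 0,
    topic: TopicName::from("t"),
    partition: 0,
    key: None,
    value: b"payload".to_vec(),
    headers: vec![],
    timestamp_ms: 0,
};
let encoded = encode_record(&msg);
let truncated = &encoded[..encoded.len() - 1]; // drop the final byte
assert!(matches!(
    decode_record(truncated, &TopicName::from("t"), 0),
    Err(RecordError::BufferTooShort { .. })
));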
514
crates/sq-storage/src/wal/record.rs
Normal file
@@ -0,0 +1,514 @@
|
||||
use sq_models::{Header, Message, TopicName};
|
||||
|
||||
/// Errors that can occur during record decoding.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RecordError {
|
||||
#[error("crc mismatch: expected {expected:#010x}, got {actual:#010x}")]
|
||||
CrcMismatch { expected: u32, actual: u32 },
|
||||
#[error("buffer too short: need {need} bytes, have {have}")]
|
||||
BufferTooShort { need: usize, have: usize },
|
||||
#[error("invalid utf8 in header key: {0}")]
|
||||
InvalidHeaderKey(std::string::FromUtf8Error),
|
||||
}
|
||||
|
||||
/// Record wire format (little-endian):
|
||||
///
|
||||
/// ```text
|
||||
/// [crc32: u32] CRC32 over everything after this field
|
||||
/// [length: u32] total byte length of record body (after length field)
|
||||
/// [offset: u64]
|
||||
/// [timestamp_ms: u64]
|
||||
/// [key_len: u32] 0 = no key
|
||||
/// [key: [u8; key_len]]
|
||||
/// [value_len: u32]
|
||||
/// [value: [u8; value_len]]
|
||||
/// [headers_count: u16]
|
||||
/// [for each header:]
|
||||
/// [hdr_key_len: u16]
|
||||
/// [hdr_key: [u8; hdr_key_len]]
|
||||
/// [hdr_val_len: u32]
|
||||
/// [hdr_val: [u8; hdr_val_len]]
|
||||
/// ```
|
||||
/// Encode a message into the binary WAL record format.
|
||||
/// Returns the encoded bytes.
|
||||
pub fn encode_record(msg: &Message) -> Vec<u8> {
|
||||
// First, encode the body (everything after crc + length).
|
||||
let body = encode_body(msg);
|
||||
let body_len = body.len() as u32;
|
||||
|
||||
// Compute CRC over length + body.
|
||||
let mut crc_input = Vec::with_capacity(4 + body.len());
|
||||
crc_input.extend_from_slice(&body_len.to_le_bytes());
|
||||
crc_input.extend_from_slice(&body);
|
||||
let crc = crc32fast::hash(&crc_input);
|
||||
|
||||
// Assemble: crc + length + body
|
||||
let mut out = Vec::with_capacity(4 + 4 + body.len());
|
||||
out.extend_from_slice(&crc.to_le_bytes());
|
||||
out.extend_from_slice(&body_len.to_le_bytes());
|
||||
out.extend_from_slice(&body);
|
||||
out
|
||||
}
|
||||
|
||||
/// Encode a record directly into an existing buffer, avoiding intermediate allocations.
|
||||
/// Appends the encoded bytes (crc + length + body) to `buf`.
|
||||
pub fn encode_record_into(
|
||||
buf: &mut Vec<u8>,
|
||||
offset: u64,
|
||||
timestamp_ms: u64,
|
||||
key: Option<&[u8]>,
|
||||
value: &[u8],
|
||||
headers: &[Header],
|
||||
) {
|
||||
// Reserve space for crc(4) + length(4), fill in after writing body.
|
||||
let header_pos = buf.len();
|
||||
buf.extend_from_slice(&[0u8; 8]);
|
||||
|
||||
let body_start = buf.len();
|
||||
|
||||
// offset + timestamp
|
||||
buf.extend_from_slice(&offset.to_le_bytes());
|
||||
buf.extend_from_slice(×tamp_ms.to_le_bytes());
|
||||
|
||||
// key
|
||||
match key {
|
||||
Some(k) => {
|
||||
buf.extend_from_slice(&(k.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(k);
|
||||
}
|
||||
None => {
|
||||
buf.extend_from_slice(&0u32.to_le_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
// value
|
||||
buf.extend_from_slice(&(value.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(value);
|
||||
|
||||
// headers
|
||||
buf.extend_from_slice(&(headers.len() as u16).to_le_bytes());
|
||||
for hdr in headers {
|
||||
buf.extend_from_slice(&(hdr.key.len() as u16).to_le_bytes());
|
||||
buf.extend_from_slice(hdr.key.as_bytes());
|
||||
buf.extend_from_slice(&(hdr.value.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(&hdr.value);
|
||||
}
|
||||
|
||||
// Patch length field.
|
||||
let body_len = (buf.len() - body_start) as u32;
|
||||
buf[header_pos + 4..header_pos + 8].copy_from_slice(&body_len.to_le_bytes());
|
||||
|
||||
// Compute CRC over length(4) + body.
|
||||
let crc = crc32fast::hash(&buf[header_pos + 4..]);
|
||||
buf[header_pos..header_pos + 4].copy_from_slice(&crc.to_le_bytes());
|
||||
}
|
||||
|
||||
fn encode_body(msg: &Message) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
|
||||
// offset
|
||||
buf.extend_from_slice(&msg.offset.to_le_bytes());
|
||||
// timestamp_ms
|
||||
buf.extend_from_slice(&msg.timestamp_ms.to_le_bytes());
|
||||
|
||||
// key
|
||||
match &msg.key {
|
||||
Some(key) => {
|
||||
buf.extend_from_slice(&(key.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(key);
|
||||
}
|
||||
None => {
|
||||
buf.extend_from_slice(&0u32.to_le_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
// value
|
||||
buf.extend_from_slice(&(msg.value.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(&msg.value);
|
||||
|
||||
// headers
|
||||
buf.extend_from_slice(&(msg.headers.len() as u16).to_le_bytes());
|
||||
for hdr in &msg.headers {
|
||||
buf.extend_from_slice(&(hdr.key.len() as u16).to_le_bytes());
|
||||
buf.extend_from_slice(hdr.key.as_bytes());
|
||||
buf.extend_from_slice(&(hdr.value.len() as u32).to_le_bytes());
|
||||
buf.extend_from_slice(&hdr.value);
|
||||
}
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
/// Minimum record size: crc(4) + length(4) + offset(8) + timestamp(8) + key_len(4) + value_len(4) + headers_count(2)
|
||||
pub const MIN_RECORD_SIZE: usize = 4 + 4 + 8 + 8 + 4 + 4 + 2;
|
||||
|
||||
/// Decode a record from the given buffer.
|
||||
/// Returns the decoded Message and the number of bytes consumed.
|
||||
/// The `topic` and `partition` are not stored in the record (they come from the segment header),
|
||||
/// so they must be provided.
|
||||
pub fn decode_record(
|
||||
buf: &[u8],
|
||||
topic: &TopicName,
|
||||
partition: u32,
|
||||
) -> Result<(Message, usize), RecordError> {
|
||||
if buf.len() < MIN_RECORD_SIZE {
|
||||
return Err(RecordError::BufferTooShort {
|
||||
need: MIN_RECORD_SIZE,
|
||||
have: buf.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut pos = 0;
|
||||
|
||||
// crc32
|
||||
let stored_crc = read_u32(buf, &mut pos);
|
||||
|
||||
// length
|
||||
let body_len = read_u32(buf, &mut pos) as usize;
|
||||
|
||||
// Verify we have enough bytes for the full body.
|
||||
let total_record_size = 4 + 4 + body_len; // crc + length + body
|
||||
if buf.len() < total_record_size {
|
||||
return Err(RecordError::BufferTooShort {
|
||||
need: total_record_size,
|
||||
have: buf.len(),
|
||||
});
|
||||
}
|
||||
|
||||
// Verify CRC: computed over length(4 bytes) + body.
|
||||
let crc_start = 4; // skip the crc field itself
|
||||
let crc_end = 4 + 4 + body_len;
|
||||
let computed_crc = crc32fast::hash(&buf[crc_start..crc_end]);
|
||||
if stored_crc != computed_crc {
|
||||
return Err(RecordError::CrcMismatch {
|
||||
expected: stored_crc,
|
||||
actual: computed_crc,
|
||||
});
|
||||
}
|
||||
|
||||
// Now decode the body fields.
|
||||
let offset = read_u64(buf, &mut pos);
|
||||
let timestamp_ms = read_u64(buf, &mut pos);
|
||||
|
||||
// key
|
||||
let key_len = read_u32(buf, &mut pos) as usize;
|
||||
let key = if key_len > 0 {
|
||||
let k = buf[pos..pos + key_len].to_vec();
|
||||
pos += key_len;
|
||||
Some(k)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// value
|
||||
let value_len = read_u32(buf, &mut pos) as usize;
|
||||
let value = buf[pos..pos + value_len].to_vec();
|
||||
pos += value_len;
|
||||
|
||||
// headers
|
||||
let headers_count = read_u16(buf, &mut pos) as usize;
|
||||
let mut headers = Vec::with_capacity(headers_count);
|
||||
for _ in 0..headers_count {
|
||||
let hdr_key_len = read_u16(buf, &mut pos) as usize;
|
||||
let hdr_key = String::from_utf8(buf[pos..pos + hdr_key_len].to_vec())
|
||||
.map_err(RecordError::InvalidHeaderKey)?;
|
||||
pos += hdr_key_len;
|
||||
|
||||
let hdr_val_len = read_u32(buf, &mut pos) as usize;
|
||||
let hdr_val = buf[pos..pos + hdr_val_len].to_vec();
|
||||
pos += hdr_val_len;
|
||||
|
||||
headers.push(Header {
|
||||
key: hdr_key,
|
||||
value: hdr_val,
|
||||
});
|
||||
}
|
||||
|
||||
let msg = Message {
|
||||
offset,
|
||||
topic: topic.clone(),
|
||||
partition,
|
||||
key,
|
||||
value,
|
||||
headers,
|
||||
timestamp_ms,
|
||||
};
|
||||
|
||||
Ok((msg, total_record_size))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_u16(buf: &[u8], pos: &mut usize) -> u16 {
|
||||
let val = u16::from_le_bytes(buf[*pos..*pos + 2].try_into().unwrap());
|
||||
*pos += 2;
|
||||
val
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_u32(buf: &[u8], pos: &mut usize) -> u32 {
|
||||
let val = u32::from_le_bytes(buf[*pos..*pos + 4].try_into().unwrap());
|
||||
*pos += 4;
|
||||
val
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_u64(buf: &[u8], pos: &mut usize) -> u64 {
|
||||
let val = u64::from_le_bytes(buf[*pos..*pos + 8].try_into().unwrap());
|
||||
*pos += 8;
|
||||
val
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn make_msg(offset: u64, value: &[u8]) -> Message {
|
||||
Message {
|
||||
offset,
|
||||
topic: TopicName::from("test-topic"),
|
||||
partition: 0,
|
||||
key: None,
|
||||
value: value.to_vec(),
|
||||
headers: vec![],
|
||||
timestamp_ms: 1700000000000,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_simple() {
|
||||
let msg = make_msg(0, b"hello world");
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, consumed) =
|
||||
decode_record(&encoded, &TopicName::from("test-topic"), 0).unwrap();
|
||||
|
||||
assert_eq!(consumed, encoded.len());
|
||||
assert_eq!(decoded, msg);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_with_key() {
|
||||
let msg = Message {
|
||||
offset: 42,
|
||||
topic: TopicName::from("orders"),
|
||||
partition: 3,
|
||||
key: Some(b"user-123".to_vec()),
|
||||
value: b"order data".to_vec(),
|
||||
headers: vec![],
|
||||
timestamp_ms: 999,
|
||||
};
|
||||
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, _) = decode_record(&encoded, &TopicName::from("orders"), 3).unwrap();
|
||||
assert_eq!(decoded, msg);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_with_headers() {
|
||||
let msg = Message {
|
||||
offset: 1,
|
||||
topic: TopicName::from("events"),
|
||||
partition: 0,
|
||||
key: None,
|
||||
value: b"event payload".to_vec(),
|
||||
headers: vec![
|
||||
Header {
|
||||
key: "content-type".to_string(),
|
||||
value: b"application/json".to_vec(),
|
||||
},
|
||||
Header {
|
||||
key: "trace-id".to_string(),
|
||||
value: b"abc-123".to_vec(),
|
||||
},
|
||||
],
|
||||
timestamp_ms: 5000,
|
||||
};
|
||||
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, _) = decode_record(&encoded, &TopicName::from("events"), 0).unwrap();
|
||||
assert_eq!(decoded, msg);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_empty_value() {
|
||||
let msg = make_msg(0, b"");
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, _) = decode_record(&encoded, &TopicName::from("test-topic"), 0).unwrap();
|
||||
assert_eq!(decoded.value, b"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_large_value() {
|
||||
let large = vec![0xAB; 256 * 1024]; // 256KB
|
||||
let msg = make_msg(99, &large);
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, _) = decode_record(&encoded, &TopicName::from("test-topic"), 0).unwrap();
|
||||
assert_eq!(decoded.value, large);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_many_headers() {
|
||||
let headers: Vec<Header> = (0..50)
|
||||
.map(|i| Header {
|
||||
key: format!("h{i}"),
|
||||
value: format!("v{i}").into_bytes(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
let msg = Message {
|
||||
offset: 0,
|
||||
topic: TopicName::from("t"),
|
||||
partition: 0,
|
||||
key: None,
|
||||
value: b"data".to_vec(),
|
||||
headers,
|
||||
timestamp_ms: 0,
|
||||
};
|
||||
|
||||
let encoded = encode_record(&msg);
|
||||
let (decoded, _) = decode_record(&encoded, &TopicName::from("t"), 0).unwrap();
|
||||
assert_eq!(decoded.headers.len(), 50);
|
||||
assert_eq!(decoded, msg);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_crc_corruption_detected() {
|
||||
let msg = make_msg(0, b"important data");
|
||||
let mut encoded = encode_record(&msg);
|
||||
|
||||
// Flip a byte in the value section (past the header).
|
||||
let corruption_offset = encoded.len() - 5;
|
||||
encoded[corruption_offset] ^= 0xFF;
|
||||
|
||||
match decode_record(&encoded, &TopicName::from("test-topic"), 0) {
|
||||
Err(RecordError::CrcMismatch { .. }) => {} // expected
|
||||
other => panic!("expected CrcMismatch, got: {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_crc_corruption_in_header() {
|
||||
let msg = make_msg(0, b"data");
|
||||
let mut encoded = encode_record(&msg);
|
||||
|
||||
// Corrupt the length field (bytes 4-7).
|
||||
        encoded[5] ^= 0x01;

        match decode_record(&encoded, &TopicName::from("test-topic"), 0) {
            Err(RecordError::CrcMismatch { .. }) => {}
            Err(RecordError::BufferTooShort { .. }) => {} // also valid if the length field becomes huge
            other => panic!("expected CrcMismatch or BufferTooShort, got: {other:?}"),
        }
    }

    #[test]
    fn test_buffer_too_short() {
        let buf = [0u8; 4]; // way too small
        match decode_record(&buf, &TopicName::from("t"), 0) {
            Err(RecordError::BufferTooShort { need, have }) => {
                assert_eq!(need, MIN_RECORD_SIZE);
                assert_eq!(have, 4);
            }
            other => panic!("expected BufferTooShort, got: {other:?}"),
        }
    }

    #[test]
    fn test_decode_from_middle_of_buffer() {
        // Encode two records back-to-back and decode them sequentially.
        let msg1 = make_msg(0, b"first");
        let msg2 = make_msg(1, b"second");

        let mut buf = encode_record(&msg1);
        buf.extend_from_slice(&encode_record(&msg2));

        let (decoded1, consumed1) = decode_record(&buf, &TopicName::from("test-topic"), 0).unwrap();
        assert_eq!(decoded1, msg1);

        let (decoded2, consumed2) =
            decode_record(&buf[consumed1..], &TopicName::from("test-topic"), 0).unwrap();
        assert_eq!(decoded2, msg2);

        assert_eq!(consumed1 + consumed2, buf.len());
    }

    #[test]
    fn test_record_size_consistency() {
        // Verify that encode produces exactly crc(4) + length(4) + body(length) bytes.
        let msg = make_msg(0, b"test");
        let encoded = encode_record(&msg);

        let stored_len = u32::from_le_bytes(encoded[4..8].try_into().unwrap()) as usize;
        assert_eq!(encoded.len(), 4 + 4 + stored_len);
    }

    #[test]
    fn test_encode_record_into_matches_encode_record() {
        let msg = Message {
            offset: 42,
            topic: TopicName::from("orders"),
            partition: 3,
            key: Some(b"user-123".to_vec()),
            value: b"order data".to_vec(),
            headers: vec![Header {
                key: "content-type".to_string(),
                value: b"application/json".to_vec(),
            }],
            timestamp_ms: 999,
        };

        let old = encode_record(&msg);

        let mut new = Vec::new();
        encode_record_into(
            &mut new,
            msg.offset,
            msg.timestamp_ms,
            msg.key.as_deref(),
            &msg.value,
            &msg.headers,
        );

        assert_eq!(old, new, "encode_record and encode_record_into must produce identical bytes");
    }

    #[test]
    fn test_encode_record_into_decodable() {
        let mut buf = Vec::new();
        let headers = vec![Header {
            key: "h1".to_string(),
            value: b"v1".to_vec(),
        }];
        encode_record_into(&mut buf, 7, 5000, Some(b"key1"), b"value1", &headers);
        encode_record_into(&mut buf, 8, 5001, None, b"value2", &[]);

        let (msg1, consumed1) = decode_record(&buf, &TopicName::from("t"), 0).unwrap();
        assert_eq!(msg1.offset, 7);
        assert_eq!(msg1.key, Some(b"key1".to_vec()));
        assert_eq!(msg1.value, b"value1");
        assert_eq!(msg1.headers.len(), 1);

        let (msg2, _) = decode_record(&buf[consumed1..], &TopicName::from("t"), 0).unwrap();
        assert_eq!(msg2.offset, 8);
        assert_eq!(msg2.key, None);
        assert_eq!(msg2.value, b"value2");
    }

    #[test]
    fn test_offset_and_timestamp_preserved() {
        let msg = Message {
            offset: u64::MAX,
            topic: TopicName::from("t"),
            partition: 0,
            key: None,
            value: vec![],
            headers: vec![],
            timestamp_ms: u64::MAX,
        };

        let encoded = encode_record(&msg);
        let (decoded, _) = decode_record(&encoded, &TopicName::from("t"), 0).unwrap();
        assert_eq!(decoded.offset, u64::MAX);
        assert_eq!(decoded.timestamp_ms, u64::MAX);
    }
}
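The `(message, bytes_consumed)` return shape above is what makes sequential scans work. A minimal sketch of a scan helper built on it (the `decode_all` name and the stop-at-end policy are illustrative, not part of this diff):

```rust
// Hypothetical helper: walk a buffer of back-to-back records, relying on the
// `decode_record(buf, topic, partition) -> Result<(Message, usize), RecordError>`
// shape exercised by the tests above. Trailing garbage surfaces as an error.
fn decode_all(
    mut buf: &[u8],
    topic: &TopicName,
    partition: u32,
) -> Result<Vec<Message>, RecordError> {
    let mut out = Vec::new();
    while !buf.is_empty() {
        let (msg, consumed) = decode_record(buf, topic, partition)?;
        buf = &buf[consumed..];
        out.push(msg);
    }
    Ok(out)
}
```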
174
crates/sq-storage/src/wal/segment.rs
Normal file
@@ -0,0 +1,174 @@
/// WAL segment header format (32 bytes fixed):
///
/// ```text
/// [magic: [u8; 4]]  = b"SQWL"
/// [version: u16]    = 1
/// [topic_len: u16]  actual topic name length
/// [topic: [u8; 20]] topic name, zero-padded
/// [partition: u32]
/// ```
pub const SEGMENT_HEADER_SIZE: usize = 32;
pub const SEGMENT_MAGIC: &[u8; 4] = b"SQWL";
pub const SEGMENT_VERSION: u16 = 1;
const TOPIC_FIELD_SIZE: usize = 20;
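The fixed layout puts every field at a statically known offset (4 + 2 + 2 + 20 + 4 = 32 bytes). For reference, a sketch of those offsets; the names are illustrative, since the code below walks the buffer with a running `pos` cursor instead:

```rust
// Byte offsets implied by the layout above. All multi-byte integers are
// little-endian, matching the to_le_bytes()/from_le_bytes() calls below.
const MAGIC_OFFSET: usize = 0;      // 4 bytes: b"SQWL"
const VERSION_OFFSET: usize = 4;    // 2 bytes
const TOPIC_LEN_OFFSET: usize = 6;  // 2 bytes
const TOPIC_OFFSET: usize = 8;      // 20 bytes, zero-padded
const PARTITION_OFFSET: usize = 28; // 4 bytes
```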
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentHeader {
    pub topic: String,
    pub partition: u32,
}

#[derive(Debug, thiserror::Error)]
pub enum SegmentHeaderError {
    #[error("invalid magic bytes")]
    InvalidMagic,
    #[error("unsupported version: {0}")]
    UnsupportedVersion(u16),
    #[error("buffer too short: need {need}, have {have}")]
    BufferTooShort { need: usize, have: usize },
    #[error("topic length {0} exceeds field size")]
    TopicTooLong(usize),
    #[error("invalid utf8 in topic: {0}")]
    InvalidUtf8(#[from] std::string::FromUtf8Error),
}

impl SegmentHeader {
    pub fn encode(&self) -> [u8; SEGMENT_HEADER_SIZE] {
        let mut buf = [0u8; SEGMENT_HEADER_SIZE];
        let mut pos = 0;

        // magic
        buf[pos..pos + 4].copy_from_slice(SEGMENT_MAGIC);
        pos += 4;

        // version
        buf[pos..pos + 2].copy_from_slice(&SEGMENT_VERSION.to_le_bytes());
        pos += 2;

        // topic_len (byte-based truncation: topics longer than the field are
        // silently cut to 20 bytes, which assumes ASCII-safe topic names)
        let topic_bytes = self.topic.as_bytes();
        let topic_len = topic_bytes.len().min(TOPIC_FIELD_SIZE) as u16;
        buf[pos..pos + 2].copy_from_slice(&topic_len.to_le_bytes());
        pos += 2;

        // topic (zero-padded)
        let copy_len = topic_len as usize;
        buf[pos..pos + copy_len].copy_from_slice(&topic_bytes[..copy_len]);
        pos += TOPIC_FIELD_SIZE;

        // partition
        buf[pos..pos + 4].copy_from_slice(&self.partition.to_le_bytes());

        buf
    }

    pub fn decode(buf: &[u8]) -> Result<Self, SegmentHeaderError> {
        if buf.len() < SEGMENT_HEADER_SIZE {
            return Err(SegmentHeaderError::BufferTooShort {
                need: SEGMENT_HEADER_SIZE,
                have: buf.len(),
            });
        }

        let mut pos = 0;

        // magic
        if &buf[pos..pos + 4] != SEGMENT_MAGIC {
            return Err(SegmentHeaderError::InvalidMagic);
        }
        pos += 4;

        // version
        let version = u16::from_le_bytes(buf[pos..pos + 2].try_into().unwrap());
        if version != SEGMENT_VERSION {
            return Err(SegmentHeaderError::UnsupportedVersion(version));
        }
        pos += 2;

        // topic_len (validated before slicing: a corrupt length > 20 would
        // otherwise panic on the topic slice below)
        let topic_len = u16::from_le_bytes(buf[pos..pos + 2].try_into().unwrap()) as usize;
        if topic_len > TOPIC_FIELD_SIZE {
            return Err(SegmentHeaderError::TopicTooLong(topic_len));
        }
        pos += 2;

        // topic
        let topic = String::from_utf8(buf[pos..pos + topic_len].to_vec())?;
        pos += TOPIC_FIELD_SIZE;

        // partition
        let partition = u32::from_le_bytes(buf[pos..pos + 4].try_into().unwrap());

        Ok(Self { topic, partition })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_segment_header_roundtrip() {
        let header = SegmentHeader {
            topic: "orders".to_string(),
            partition: 7,
        };

        let encoded = header.encode();
        assert_eq!(encoded.len(), SEGMENT_HEADER_SIZE);

        let decoded = SegmentHeader::decode(&encoded).unwrap();
        assert_eq!(decoded, header);
    }

    #[test]
    fn test_segment_header_magic_bytes() {
        let header = SegmentHeader {
            topic: "test".to_string(),
            partition: 0,
        };
        let encoded = header.encode();
        assert_eq!(&encoded[..4], b"SQWL");
    }

    #[test]
    fn test_segment_header_invalid_magic() {
        let mut buf = [0u8; SEGMENT_HEADER_SIZE];
        buf[..4].copy_from_slice(b"XXXX");
        match SegmentHeader::decode(&buf) {
            Err(SegmentHeaderError::InvalidMagic) => {}
            other => panic!("expected InvalidMagic, got: {other:?}"),
        }
    }

    #[test]
    fn test_segment_header_unsupported_version() {
        let header = SegmentHeader {
            topic: "t".to_string(),
            partition: 0,
        };
        let mut encoded = header.encode();
        // Set version to 99
        encoded[4..6].copy_from_slice(&99u16.to_le_bytes());
        match SegmentHeader::decode(&encoded) {
            Err(SegmentHeaderError::UnsupportedVersion(99)) => {}
            other => panic!("expected UnsupportedVersion(99), got: {other:?}"),
        }
    }

    #[test]
    fn test_segment_header_long_topic_truncated() {
        let header = SegmentHeader {
            topic: "a-very-long-topic-name-exceeding-20-bytes".to_string(),
            partition: 0,
        };
        let encoded = header.encode();
        let decoded = SegmentHeader::decode(&encoded).unwrap();
        // Topic should be truncated to 20 bytes
        assert_eq!(decoded.topic, "a-very-long-topic-na");
    }

    #[test]
    fn test_segment_header_buffer_too_short() {
        let buf = [0u8; 10];
        match SegmentHeader::decode(&buf) {
            Err(SegmentHeaderError::BufferTooShort { need: 32, have: 10 }) => {}
            other => panic!("expected BufferTooShort, got: {other:?}"),
        }
    }
}
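When a segment is read back, the header is validated before any record is touched. A minimal sketch of that read path (the `open_segment` helper is illustrative; `data` is assumed to be the full segment file contents):

```rust
// Validate the fixed 32-byte header, then expose the record region. Any
// corruption in the header surfaces as a SegmentHeaderError before the
// record decoder ever runs.
fn open_segment(data: &[u8]) -> Result<(SegmentHeader, &[u8]), SegmentHeaderError> {
    let header = SegmentHeader::decode(data)?;
    // Records start immediately after the fixed-size header.
    Ok((header, &data[SEGMENT_HEADER_SIZE..]))
}
```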
130
crates/sq-storage/src/wal/trimmer.rs
Normal file
@@ -0,0 +1,130 @@
use std::path::PathBuf;
use std::sync::Arc;

use sq_sim::fs::FileSystem;
use tokio::sync::Mutex;

use crate::object_store::shipper::ShippedSegments;

/// Trims (deletes) local WAL segment files that have been shipped to object storage.
pub struct WalTrimmer<F: FileSystem> {
    fs: Arc<F>,
    shipped: Arc<Mutex<ShippedSegments>>,
}

impl<F: FileSystem> WalTrimmer<F> {
    pub fn new(fs: Arc<F>, shipped: Arc<Mutex<ShippedSegments>>) -> Self {
        Self { fs, shipped }
    }

    /// Trim all segments that have been shipped to object storage.
    /// Returns the list of paths that were successfully deleted.
    pub async fn trim(&self) -> anyhow::Result<Vec<PathBuf>> {
        // Snapshot the shipped set so the lock is not held across file I/O.
        let shipped_paths: Vec<PathBuf> = {
            let shipped = self.shipped.lock().await;
            shipped.shipped_paths().iter().cloned().collect()
        };

        let mut trimmed = Vec::new();

        for path in &shipped_paths {
            if self.fs.exists(path) {
                match self.fs.remove_file(path) {
                    Ok(()) => {
                        tracing::info!(path = %path.display(), "trimmed shipped WAL segment");
                        trimmed.push(path.clone());
                    }
                    Err(e) => {
                        tracing::warn!(
                            path = %path.display(),
                            error = %e,
                            "failed to trim WAL segment"
                        );
                    }
                }
            }
        }

        Ok(trimmed)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::object_store::shipper::ShippedSegments;
    use sq_sim::fs::InMemoryFileSystem;
    use std::path::Path;

    fn create_file(fs: &InMemoryFileSystem, path: &Path) {
        fs.create_dir_all(path.parent().unwrap()).unwrap();
        let mut handle = fs.open_write(path).unwrap();
        handle.write_all(b"wal data").unwrap();
    }

    #[tokio::test]
    async fn test_trim_shipped_segment() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));

        let path = PathBuf::from("/data/t/0/000.wal");
        create_file(&fs, &path);

        shipped.lock().await.mark_shipped(path.clone());

        let trimmer = WalTrimmer::new(fs.clone(), shipped);
        let trimmed = trimmer.trim().await.unwrap();

        assert_eq!(trimmed.len(), 1);
        assert!(!fs.exists(&path));
    }

    #[tokio::test]
    async fn test_unshipped_segment_not_trimmed() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));

        let path = PathBuf::from("/data/t/0/000.wal");
        create_file(&fs, &path);

        // Don't mark as shipped.
        let trimmer = WalTrimmer::new(fs.clone(), shipped);
        let trimmed = trimmer.trim().await.unwrap();

        assert!(trimmed.is_empty());
        assert!(fs.exists(&path));
    }

    #[tokio::test]
    async fn test_trim_multiple_segments() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));

        for i in 0..3 {
            let path = PathBuf::from(format!("/data/t/0/{:020}.wal", i * 100));
            create_file(&fs, &path);
            shipped.lock().await.mark_shipped(path);
        }

        let trimmer = WalTrimmer::new(fs.clone(), shipped);
        let trimmed = trimmer.trim().await.unwrap();

        assert_eq!(trimmed.len(), 3);
    }

    #[tokio::test]
    async fn test_trim_already_deleted_is_noop() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let shipped = Arc::new(Mutex::new(ShippedSegments::new()));

        let path = PathBuf::from("/data/t/0/000.wal");
        // Mark as shipped but don't create the file.
        shipped.lock().await.mark_shipped(path);

        let trimmer = WalTrimmer::new(fs, shipped);
        let trimmed = trimmer.trim().await.unwrap();

        // File didn't exist, so nothing to trim.
        assert!(trimmed.is_empty());
    }
}
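Nothing in this file schedules trimming, so presumably a background task drives it. A sketch of such a loop, assuming a tokio runtime (the 60-second cadence is illustrative, not part of this diff):

```rust
// Hypothetical driver: run a trim pass on a fixed cadence. Errors are logged
// and the loop keeps going, since a failed pass can simply retry next tick.
async fn run_trim_loop<F: sq_sim::fs::FileSystem>(trimmer: WalTrimmer<F>) {
    let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60));
    loop {
        ticker.tick().await;
        if let Err(e) = trimmer.trim().await {
            tracing::warn!(error = %e, "WAL trim pass failed");
        }
    }
}
```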
547
crates/sq-storage/src/wal/writer.rs
Normal file
@@ -0,0 +1,547 @@
use std::path::{Path, PathBuf};
use std::time::Instant;

use sq_models::{ClosedSegment, Header, SyncPolicy, TopicName, WalConfig};
use sq_sim::fs::{FileHandle, FileSystem};
use sq_sim::Clock;

use super::record::encode_record_into;
use super::segment::{SegmentHeader, SEGMENT_HEADER_SIZE};

/// WAL writer for a single topic-partition.
/// Appends records to segment files, fsyncing according to the configured
/// `SyncPolicy` for durability.
pub struct WalWriter<F: FileSystem, C: Clock> {
    fs: std::sync::Arc<F>,
    clock: std::sync::Arc<C>,
    config: WalConfig,
    topic: TopicName,
    partition: u32,
    /// Currently active segment file handle.
    active_segment: Option<Box<dyn FileHandle>>,
    /// Path of the active segment file.
    active_segment_path: Option<PathBuf>,
    /// Base offset of the active segment.
    segment_base_offset: u64,
    /// Current byte position in the active segment.
    segment_position: u64,
    /// Next offset to assign.
    next_offset: u64,
    /// When the active segment was opened.
    segment_opened_at: Instant,
}

impl<F: FileSystem, C: Clock> WalWriter<F, C> {
    pub fn new(
        fs: std::sync::Arc<F>,
        clock: std::sync::Arc<C>,
        config: WalConfig,
        topic: TopicName,
        partition: u32,
    ) -> anyhow::Result<Self> {
        let segment_dir = segment_dir(&config.data_dir, &topic, partition);
        fs.create_dir_all(&segment_dir)?;

        Ok(Self {
            fs,
            clock: clock.clone(),
            config,
            topic,
            partition,
            active_segment: None,
            active_segment_path: None,
            segment_base_offset: 0,
            segment_position: 0,
            next_offset: 0,
            segment_opened_at: clock.now(),
        })
    }

    /// Restore a writer at a known offset (used during recovery).
    pub fn with_next_offset(mut self, offset: u64) -> Self {
        self.next_offset = offset;
        self
    }

    /// Append a message to the WAL. Returns the assigned offset.
    /// The record is fsync'd before returning when the sync policy is
    /// `EveryBatch`; under `Interval`, durability is deferred to the
    /// background sync task.
    pub fn append(
        &mut self,
        key: Option<&[u8]>,
        value: &[u8],
        headers: &[Header],
        timestamp_ms: u64,
    ) -> anyhow::Result<u64> {
        // Check if we need to rotate the segment.
        self.maybe_rotate()?;

        let offset = self.next_offset;

        let mut buf = Vec::new();
        encode_record_into(&mut buf, offset, timestamp_ms, key, value, headers);

        let should_fsync = self.config.sync_policy == SyncPolicy::EveryBatch;
        let fh = self.ensure_segment()?;
        fh.write_all(&buf)?;
        if should_fsync {
            fh.fsync()?;
        }

        self.segment_position += buf.len() as u64;
        self.next_offset += 1;

        Ok(offset)
    }

    /// Append a batch of messages. Fsync depends on the configured SyncPolicy.
    pub fn append_batch(
        &mut self,
        messages: &[(Option<&[u8]>, &[u8], &[Header], u64)],
    ) -> anyhow::Result<Vec<u64>> {
        if messages.is_empty() {
            return Ok(vec![]);
        }

        self.maybe_rotate()?;

        // Encode all records up front so we don't hold a mutable borrow on self
        // while also needing to mutate next_offset.
        let mut offsets = Vec::with_capacity(messages.len());
        let mut buf = Vec::new();
        let mut offset = self.next_offset;

        for (key, value, headers, timestamp_ms) in messages {
            encode_record_into(&mut buf, offset, *timestamp_ms, *key, value, headers);
            offsets.push(offset);
            offset += 1;
        }

        let should_fsync = self.config.sync_policy == SyncPolicy::EveryBatch;
        let fh = self.ensure_segment()?;
        fh.write_all(&buf)?;
        if should_fsync {
            fh.fsync()?;
        }

        self.segment_position += buf.len() as u64;
        self.next_offset = offset;

        Ok(offsets)
    }

    /// Close the active segment and return it as a ClosedSegment (if any).
    pub fn close_active_segment(&mut self) -> anyhow::Result<Option<ClosedSegment>> {
        if self.active_segment.is_none() {
            return Ok(None);
        }

        let path = self.active_segment_path.take().unwrap();
        let base_offset = self.segment_base_offset;
        let end_offset = if self.next_offset > 0 {
            self.next_offset - 1
        } else {
            0
        };
        let size_bytes = self.segment_position;

        self.active_segment = None;
        self.segment_position = 0;

        Ok(Some(ClosedSegment {
            path,
            topic: self.topic.clone(),
            partition: self.partition,
            base_offset,
            end_offset,
            size_bytes,
        }))
    }

    /// Get the next offset that will be assigned.
    pub fn next_offset(&self) -> u64 {
        self.next_offset
    }

    /// Force an fsync on the active segment file.
    /// Used by the background sync task when SyncPolicy is Interval.
    pub fn fsync(&mut self) -> anyhow::Result<()> {
        if let Some(fh) = self.active_segment.as_mut() {
            fh.fsync()?;
        }
        Ok(())
    }

    /// Get the current segment position in bytes.
    pub fn segment_position(&self) -> u64 {
        self.segment_position
    }

    fn maybe_rotate(&mut self) -> anyhow::Result<()> {
        if self.active_segment.is_none() {
            return Ok(());
        }

        let size_exceeded = self.segment_position >= self.config.max_segment_bytes;
        let age_exceeded = self
            .clock
            .elapsed_since(self.segment_opened_at)
            .as_secs()
            >= self.config.max_segment_age_secs;

        if size_exceeded || age_exceeded {
            // Close current segment.
            let _closed = self.close_active_segment()?;
            // Next call to ensure_segment will open a new one.
        }

        Ok(())
    }

    fn ensure_segment(&mut self) -> anyhow::Result<&mut Box<dyn FileHandle>> {
        if self.active_segment.is_none() {
            let seg_path = segment_path(
                &self.config.data_dir,
                &self.topic,
                self.partition,
                self.next_offset,
            );

            let mut fh = self.fs.open_write(&seg_path)?;

            // Write segment header.
            let header = SegmentHeader {
                topic: self.topic.0.clone(),
                partition: self.partition,
            };
            let header_bytes = header.encode();
            fh.write_all(&header_bytes)?;

            self.active_segment = Some(fh);
            self.active_segment_path = Some(seg_path);
            self.segment_base_offset = self.next_offset;
            self.segment_position = SEGMENT_HEADER_SIZE as u64;
            self.segment_opened_at = self.clock.now();
        }

        Ok(self.active_segment.as_mut().unwrap())
    }
}
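Taken together with the path helpers and tests below, the writer's lifecycle is: construct one per topic-partition, append, and hand closed segments off for shipping. A hedged sketch of typical use (constructor names match the tests below; the default `WalConfig` and the logging step are assumptions):

```rust
use std::sync::Arc;

// Illustrative lifecycle only; error handling and shipping are elided.
fn writer_lifecycle() -> anyhow::Result<()> {
    let fs = Arc::new(sq_sim::fs::InMemoryFileSystem::new());
    let clock = Arc::new(sq_sim::SimClock::new());
    let config = sq_models::WalConfig::default(); // assumes usable defaults

    let mut writer = WalWriter::new(fs, clock, config, TopicName::from("orders"), 0)?;
    let offset = writer.append(Some(b"user-123"), b"order data", &[], 1_000)?;
    assert_eq!(offset, 0);

    // On rotation or shutdown, the closed segment would go to the shipper.
    if let Some(closed) = writer.close_active_segment()? {
        tracing::info!(base = closed.base_offset, end = closed.end_offset, "segment closed");
    }
    Ok(())
}
```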
/// Build the directory path for a topic-partition's WAL segments.
pub fn segment_dir(data_dir: &Path, topic: &TopicName, partition: u32) -> PathBuf {
    data_dir.join(topic.as_str()).join(partition.to_string())
}

/// Build the file path for a specific segment.
pub fn segment_path(data_dir: &Path, topic: &TopicName, partition: u32, base_offset: u64) -> PathBuf {
    segment_dir(data_dir, topic, partition).join(format!("{base_offset:020}.wal"))
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::time::Duration;

    use sq_sim::fs::InMemoryFileSystem;
    use sq_sim::SimClock;

    use super::*;
    use crate::wal::record::decode_record;
    use crate::wal::segment::SegmentHeader;

    fn test_config() -> WalConfig {
        WalConfig {
            max_segment_bytes: 1024, // small for testing
            max_segment_age_secs: 60,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        }
    }

    #[test]
    fn test_write_single_message() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let topic = TopicName::from("orders");

        let mut writer = WalWriter::new(fs.clone(), clock, test_config(), topic.clone(), 0).unwrap();

        let offset = writer.append(None, b"hello", &[], 1000).unwrap();
        assert_eq!(offset, 0);
        assert_eq!(writer.next_offset(), 1);

        // Verify file exists
        let seg_path = segment_path(&PathBuf::from("/data"), &topic, 0, 0);
        assert!(fs.exists(&seg_path));

        // Verify contents
        let data = fs.read_file_bytes(&seg_path).unwrap();
        assert!(data.len() > SEGMENT_HEADER_SIZE);

        // Decode header
        let header = SegmentHeader::decode(&data).unwrap();
        assert_eq!(header.topic, "orders");
        assert_eq!(header.partition, 0);

        // Decode record
        let (msg, _) = decode_record(&data[SEGMENT_HEADER_SIZE..], &topic, 0).unwrap();
        assert_eq!(msg.offset, 0);
        assert_eq!(msg.value, b"hello");
    }

    #[test]
    fn test_write_multiple_monotonic_offsets() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs, clock, test_config(), TopicName::from("t"), 0).unwrap();

        for i in 0..100 {
            let offset = writer.append(None, b"data", &[], 0).unwrap();
            assert_eq!(offset, i);
        }

        assert_eq!(writer.next_offset(), 100);
    }

    #[test]
    fn test_segment_rotation_by_size() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let topic = TopicName::from("t");

        let config = WalConfig {
            max_segment_bytes: 200, // very small, forces rotation
            max_segment_age_secs: 3600,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        let mut writer = WalWriter::new(fs.clone(), clock, config, topic.clone(), 0).unwrap();

        // Write enough messages to cause rotation
        for _ in 0..20 {
            writer.append(None, b"some data here", &[], 0).unwrap();
        }

        // Should have multiple segment files
        let entries = fs.list_dir(&segment_dir(&PathBuf::from("/data"), &topic, 0)).unwrap();
        assert!(
            entries.len() > 1,
            "expected multiple segments, got {}",
            entries.len()
        );
    }

    #[test]
    fn test_segment_rotation_by_time() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let topic = TopicName::from("t");

        let config = WalConfig {
            max_segment_bytes: 1024 * 1024, // large
            max_segment_age_secs: 10,
            data_dir: PathBuf::from("/data"),
            ..Default::default()
        };

        let mut writer =
            WalWriter::new(fs.clone(), clock.clone(), config, topic.clone(), 0).unwrap();

        writer.append(None, b"msg1", &[], 0).unwrap();

        // Advance time past the threshold
        clock.advance(Duration::from_secs(15));

        writer.append(None, b"msg2", &[], 0).unwrap();

        let entries = fs.list_dir(&segment_dir(&PathBuf::from("/data"), &topic, 0)).unwrap();
        assert_eq!(entries.len(), 2, "expected 2 segments after time rotation");
    }

    #[test]
    fn test_fsync_failure_does_not_advance_offset() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs.clone(), clock, test_config(), TopicName::from("t"), 0).unwrap();

        // First write succeeds
        writer.append(None, b"good", &[], 0).unwrap();
        assert_eq!(writer.next_offset(), 1);

        // Inject fsync failure
        fs.fail_next_fsync(std::io::Error::new(
            std::io::ErrorKind::Other,
            "disk error",
        ));

        // This write should fail
        let result = writer.append(None, b"bad", &[], 0);
        assert!(result.is_err());

        // The fsync error propagates before next_offset is bumped, so the
        // offset does not advance and the caller can retry the append. The
        // bytes written before the failed fsync are not considered durable.
        assert_eq!(writer.next_offset(), 1);
    }

    #[test]
    fn test_close_active_segment() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs, clock, test_config(), TopicName::from("t"), 0).unwrap();

        writer.append(None, b"msg1", &[], 0).unwrap();
        writer.append(None, b"msg2", &[], 0).unwrap();

        let closed = writer.close_active_segment().unwrap().unwrap();
        assert_eq!(closed.base_offset, 0);
        assert_eq!(closed.end_offset, 1);
        assert_eq!(closed.topic.as_str(), "t");
        assert_eq!(closed.partition, 0);
        assert!(closed.size_bytes > SEGMENT_HEADER_SIZE as u64);
    }

    #[test]
    fn test_close_empty_returns_none() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs, clock, test_config(), TopicName::from("t"), 0).unwrap();

        assert!(writer.close_active_segment().unwrap().is_none());
    }

    #[test]
    fn test_segment_path_format() {
        let path = segment_path(&PathBuf::from("/data"), &TopicName::from("orders"), 0, 42);
        assert_eq!(
            path,
            PathBuf::from("/data/orders/0/00000000000000000042.wal")
        );
    }

    #[test]
    fn test_write_with_key_and_headers() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let topic = TopicName::from("t");

        let mut writer = WalWriter::new(fs.clone(), clock, test_config(), topic.clone(), 0).unwrap();

        let headers = vec![Header {
            key: "ct".to_string(),
            value: b"json".to_vec(),
        }];
        let offset = writer
            .append(Some(b"key1"), b"value1", &headers, 5000)
            .unwrap();
        assert_eq!(offset, 0);

        // Read back and verify
        let seg_path = segment_path(&PathBuf::from("/data"), &topic, 0, 0);
        let data = fs.read_file_bytes(&seg_path).unwrap();
        let (msg, _) = decode_record(&data[SEGMENT_HEADER_SIZE..], &topic, 0).unwrap();

        assert_eq!(msg.key.as_deref(), Some(b"key1".as_slice()));
        assert_eq!(msg.value, b"value1");
        assert_eq!(msg.headers.len(), 1);
        assert_eq!(msg.headers[0].key, "ct");
        assert_eq!(msg.timestamp_ms, 5000);
    }

    #[test]
    fn test_append_batch_basic() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());
        let topic = TopicName::from("t");

        let mut writer = WalWriter::new(fs.clone(), clock, test_config(), topic.clone(), 0).unwrap();

        let messages: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = vec![
            (None, b"msg-0", &[], 100),
            (None, b"msg-1", &[], 200),
            (None, b"msg-2", &[], 300),
        ];

        let offsets = writer.append_batch(&messages).unwrap();
        assert_eq!(offsets, vec![0, 1, 2]);
        assert_eq!(writer.next_offset(), 3);

        // Verify all records are readable.
        let seg_path = segment_path(&PathBuf::from("/data"), &topic, 0, 0);
        let data = fs.read_file_bytes(&seg_path).unwrap();
        let mut pos = SEGMENT_HEADER_SIZE;
        for i in 0..3 {
            let (msg, consumed) = decode_record(&data[pos..], &topic, 0).unwrap();
            assert_eq!(msg.offset, i as u64);
            assert_eq!(msg.value, format!("msg-{i}").as_bytes());
            pos += consumed;
        }
    }

    #[test]
    fn test_append_batch_empty() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs, clock, test_config(), TopicName::from("t"), 0).unwrap();

        let offsets = writer.append_batch(&[]).unwrap();
        assert!(offsets.is_empty());
        assert_eq!(writer.next_offset(), 0);
    }

    #[test]
    fn test_append_batch_continues_offset() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs, clock, test_config(), TopicName::from("t"), 0).unwrap();

        // Single append first.
        writer.append(None, b"solo", &[], 0).unwrap();
        assert_eq!(writer.next_offset(), 1);

        // Then batch.
        let messages: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = vec![
            (None, b"batch-0", &[], 0),
            (None, b"batch-1", &[], 0),
        ];
        let offsets = writer.append_batch(&messages).unwrap();
        assert_eq!(offsets, vec![1, 2]);
        assert_eq!(writer.next_offset(), 3);
    }

    #[test]
    fn test_append_batch_fsync_failure() {
        let fs = Arc::new(InMemoryFileSystem::new());
        let clock = Arc::new(SimClock::new());

        let mut writer =
            WalWriter::new(fs.clone(), clock, test_config(), TopicName::from("t"), 0).unwrap();

        // Write one message to open segment.
        writer.append(None, b"ok", &[], 0).unwrap();

        // Inject fsync failure.
        fs.fail_next_fsync(std::io::Error::new(
            std::io::ErrorKind::Other,
            "disk error",
        ));

        let messages: Vec<(Option<&[u8]>, &[u8], &[Header], u64)> = vec![
            (None, b"a", &[], 0),
            (None, b"b", &[], 0),
        ];
        let result = writer.append_batch(&messages);
        assert!(result.is_err());
    }
}