feat: add weight-based rebalancing (FairShare) and a rebalancing stress-test binary

This commit is contained in:
2026-03-06 12:54:41 +01:00
parent 0fa906a8cf
commit 5e1cd2b1e7
11 changed files with 680 additions and 39 deletions

View File

@@ -29,7 +29,6 @@ async fn main() -> anyhow::Result<()> {
EnvFilter::from_default_env()
.add_directive("nocontrol=trace".parse().unwrap())
.add_directive("postgres_backend=trace".parse().unwrap())
.add_directive("debug".parse().unwrap()),
)
.with_file(false)

View File

@@ -0,0 +1,16 @@
[package]
name = "rebalancing-stress"
version = "0.1.0"
edition = "2024"
# Internal stress-test binary; never published to a registry.
publish = false

[dependencies]
nocontrol.workspace = true
anyhow.workspace = true
tokio.workspace = true
serde.workspace = true
tracing-subscriber.workspace = true
tracing.workspace = true
uuid.workspace = true
# NOTE(review): "rt" feature enabled for task utilities — confirm it is
# actually required for CancellationToken in tokio-util 0.7.
tokio-util = { version = "0.7", features = ["rt"] }

View File

@@ -0,0 +1,268 @@
//! Stress test for weight-based rebalancing.
//!
//! Simulates multiple workers sharing an in-process backing store.
//! Manifests have varying weights. Workers have capacity limits and
//! use FairShare rebalancing to redistribute work as nodes join/leave.
//!
//! Run with: RUST_LOG=info cargo run -p rebalancing-stress
use std::time::Duration;
use nocontrol::{
ControlPlane, Operator, OperatorConfig, OperatorState, RebalancePolicy, Specification,
manifests::{Action, Manifest, ManifestMetadata, ManifestState},
stores::{BackingStore, BackingStoreEdge},
};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing_subscriber::EnvFilter;
/// Drives the stress scenario end to end: seeds weighted manifests into a
/// shared in-process store, then runs four phases of adding/removing
/// workers with settle-time pauses, printing the ownership distribution
/// after each phase.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Honor RUST_LOG when set; otherwise default to "info".
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
        )
        .with_target(false)
        .without_time()
        .init();
    // Single in-process store shared by every worker in this process.
    let store = BackingStore::in_process();
    // Create 20 manifests with varying weights
    // (total weight = 70: 3x10 heavy + 6x5 medium + 10x1 light + 1x0 tiny).
    let manifests = vec![
        ("heavy-job-1", 10),
        ("heavy-job-2", 10),
        ("heavy-job-3", 10),
        ("medium-job-1", 5),
        ("medium-job-2", 5),
        ("medium-job-3", 5),
        ("medium-job-4", 5),
        ("medium-job-5", 5),
        ("medium-job-6", 5),
        ("light-job-1", 1),
        ("light-job-2", 1),
        ("light-job-3", 1),
        ("light-job-4", 1),
        ("light-job-5", 1),
        ("light-job-6", 1),
        ("light-job-7", 1),
        ("light-job-8", 1),
        ("light-job-9", 1),
        ("light-job-10", 1),
        // Zero-weight job: exercises the "free" end of the weight range.
        ("tiny-job-1", 0),
    ];
    let total_weight: u64 = manifests.iter().map(|(_, w)| *w).sum();
    tracing::info!(
        manifest_count = manifests.len(),
        total_weight,
        "creating manifests"
    );
    // Insert all manifests into the shared store using a temporary control
    // plane. Note: seed_cp is never executed as a worker; it is only used
    // as a client for add_manifest and the distribution reports below.
    let seed_operator = OperatorState::new(StressOperator);
    let seed_cp = ControlPlane::new(seed_operator, store.clone());
    for (name, weight) in &manifests {
        seed_cp
            .add_manifest(Manifest {
                name: name.to_string(),
                metadata: ManifestMetadata {},
                spec: WeightedJob {
                    weight: *weight,
                    name: name.to_string(),
                },
            })
            .await?;
    }
    // Root token: cancelling it shuts down every worker spawned below.
    let cancellation = CancellationToken::new();
    // --- Phase 1: Start 2 workers ---
    tracing::info!("=== PHASE 1: Starting 2 workers (capacity=60 each, headroom=5) ===");
    let worker1 = spawn_worker("worker-1", store.clone(), 60, 5, cancellation.child_token());
    let worker2 = spawn_worker("worker-2", store.clone(), 60, 5, cancellation.child_token());
    // Let them stabilize
    tokio::time::sleep(Duration::from_secs(15)).await;
    print_distribution(&seed_cp).await;
    // --- Phase 2: Add a 3rd worker ---
    tracing::info!("=== PHASE 2: Adding worker-3 ===");
    let worker3 = spawn_worker("worker-3", store.clone(), 60, 5, cancellation.child_token());
    // Let rebalancing happen
    tokio::time::sleep(Duration::from_secs(25)).await;
    print_distribution(&seed_cp).await;
    // --- Phase 3: Add a 4th worker with low capacity ---
    tracing::info!("=== PHASE 3: Adding worker-4 (capacity=15) ===");
    let worker4 = spawn_worker("worker-4", store.clone(), 15, 2, cancellation.child_token());
    tokio::time::sleep(Duration::from_secs(25)).await;
    print_distribution(&seed_cp).await;
    // --- Phase 4: Kill worker-1, observe redistribution ---
    tracing::info!("=== PHASE 4: Killing worker-1, observing redistribution ===");
    worker1.cancel();
    // Wait for lease expiry (10s in-process) + sync cycles.
    // NOTE(review): the 10s lease figure is presumably the in-process
    // store's default — confirm the 25s wait still covers it if that
    // default changes.
    tokio::time::sleep(Duration::from_secs(25)).await;
    print_distribution(&seed_cp).await;
    // Cleanup
    tracing::info!("=== DONE: Shutting down all workers ===");
    cancellation.cancel();
    worker2.cancel();
    worker3.cancel();
    worker4.cancel();
    // Give workers time to shut down gracefully
    tokio::time::sleep(Duration::from_secs(2)).await;
    Ok(())
}
/// Launches one worker control plane on its own task and returns a token
/// that cancels just this worker.
///
/// The returned per-worker token is OR-ed with the global `cancellation`
/// token: tripping either one stops the worker's control plane.
fn spawn_worker<TStore: BackingStoreEdge<WeightedJob> + 'static>(
    name: &'static str,
    store: BackingStore<WeightedJob, TStore>,
    max_capacity: u64,
    headroom: u64,
    cancellation: CancellationToken,
) -> CancellationToken {
    let per_worker = CancellationToken::new();
    let task_cancel = per_worker.clone();
    tokio::spawn(async move {
        // Capacity limit + FairShare policy for this worker; everything
        // else stays at the library defaults.
        let config = OperatorConfig {
            max_capacity: Some(max_capacity),
            rebalance_policy: RebalancePolicy::FairShare { headroom },
            resync_interval: Duration::from_secs(60),
            ..Default::default()
        };
        let cp = ControlPlane::new(
            OperatorState::new_with_config(StressOperator, config),
            store,
        );
        tracing::info!(%name, max_capacity, headroom, "worker started");
        // Merge the per-worker and global tokens into one token the
        // control plane can watch: a helper task trips `merged` as soon
        // as either source token fires.
        let merged = CancellationToken::new();
        let watch = merged.child_token();
        tokio::spawn({
            let merged = merged.clone();
            async move {
                tokio::select! {
                    _ = task_cancel.cancelled() => {}
                    _ = cancellation.cancelled() => {}
                }
                merged.cancel();
            }
        });
        if let Err(e) = cp.execute_with_cancellation(watch).await {
            tracing::error!(%name, error = %e, "worker failed");
        }
        tracing::info!(%name, "worker stopped");
    });
    per_worker
}
async fn print_distribution<TOperator, TStore>(cp: &ControlPlane<TOperator, TStore>)
where
TOperator: Operator<Specifications = WeightedJob>,
TStore: BackingStoreEdge<WeightedJob>,
{
let manifests = cp.get_manifests().await.unwrap_or_default();
let mut by_worker: std::collections::HashMap<String, (usize, u64)> =
std::collections::HashMap::new();
let mut unowned = Vec::new();
for m in &manifests {
let w = m.manifest.spec.weight;
match &m.lease {
Some(lease) => {
let entry = by_worker
.entry(format!("{}", lease.owner))
.or_insert((0, 0));
entry.0 += 1;
entry.1 += w;
}
None => {
unowned.push(m.manifest.name.as_str());
}
}
}
tracing::info!("--- Distribution ---");
let mut workers: Vec<_> = by_worker.into_iter().collect();
workers.sort_by_key(|(id, _)| id.clone());
for (worker_id, (count, weight)) in &workers {
tracing::info!(worker = %worker_id, count, weight, "");
}
if !unowned.is_empty() {
tracing::info!(count = unowned.len(), "unowned manifests");
}
let total_owned_weight: u64 = workers.iter().map(|(_, (_, w))| w).sum();
tracing::info!(
total_owned_weight,
total_manifests = manifests.len(),
workers = workers.len(),
"summary"
);
tracing::info!("--------------------");
}
// --- Operator and Specification ---

/// Operator whose reconcile cost scales with the manifest's weight.
#[derive(Clone)]
struct StressOperator;

impl Operator for StressOperator {
    type Specifications = WeightedJob;
    type Error = anyhow::Error;

    /// Sleeps 10ms per unit of weight to simulate proportional work,
    /// then asks to be requeued in 5 seconds.
    async fn reconcile(
        &self,
        manifest: &mut ManifestState<WeightedJob>,
    ) -> Result<Action, Self::Error> {
        let weight = manifest.manifest.spec.weight;
        tokio::time::sleep(Duration::from_millis(weight * 10)).await;
        Ok(Action::Requeue(Duration::from_secs(5)))
    }

    /// Logs at debug level when this worker loses a manifest's lease.
    async fn on_lease_lost(
        &self,
        manifest: &ManifestState<WeightedJob>,
    ) -> Result<(), Self::Error> {
        tracing::debug!(
            manifest = %manifest.manifest.name,
            "lease lost, cleaning up"
        );
        Ok(())
    }
}
/// Specification payload for a stress-test job.
///
/// `weight` feeds the `Specification::weight` hook so the FairShare
/// rebalancer can account for uneven job sizes.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct WeightedJob {
    // Human-readable job name (mirrors the manifest name in this binary).
    pub name: String,
    // Relative cost of the job; 0 is allowed (see "tiny-job-1").
    pub weight: u64,
}

impl Specification for WeightedJob {
    // NOTE(review): presumably a type discriminator used by the store —
    // confirm against the Specification trait's contract.
    fn kind(&self) -> &'static str {
        "weighted-job"
    }

    // Exposes the job's weight to the rebalancer.
    fn weight(&self) -> u64 {
        self.weight
    }
}