feat: add postgresql backing store

Signed-off-by: kjuulh <contact@kjuulh.io>
This commit is contained in:
2026-01-18 22:52:30 +01:00
parent e0d6172e21
commit 2b7014e038
10 changed files with 477 additions and 28 deletions

View File

@@ -1 +1,21 @@
-- Add migration script here
-- Persistent storage for manifests, their reconciliation status and the
-- lease that records which worker is currently handling each manifest.
create table manifests (
-- Row identifier; the application supplies a freshly generated UUID on insert.
id UUID PRIMARY KEY NOT NULL,
-- Optimistic-concurrency counter: inserted as 0, incremented when a lease is
-- acquired or the state is updated, and compared in the WHERE clause of writes.
generation BIGINT NOT NULL,
name TEXT NOT NULL,
kind TEXT NOT NULL,
-- JSON-serialized manifest status.
status JSONB NOT NULL,
-- JSON-serialized manifest.
manifest_content JSONB NOT NULL,
-- SHA-256 digest of the serialized manifest content.
manifest_hash BYTEA NOT NULL,
created TIMESTAMPTZ NOT NULL,
updated TIMESTAMPTZ NOT NULL,
-- Worker that holds the lease; NULL when nobody holds it.
lease_owner_id UUID,
-- Set to now() whenever the lease is acquired or refreshed; NULL without a lease.
lease_last_updated TIMESTAMPTZ
);
-- A manifest is unique per (name, kind); the upsert relies on this index for
-- its ON CONFLICT (name, kind) clause.
CREATE UNIQUE INDEX idx_manifest_name ON manifests(name, kind);

View File

@@ -17,6 +17,7 @@ pub struct ControlPlane<TOperator: Operator, TStore: BackingStoreEdge<TOperator:
reconciler: Reconciler<TOperator, TStore>,
worker_id: uuid::Uuid,
store: BackingStore<TOperator::Specifications, TStore>,
cancellation: CancellationToken,
deadline: Option<std::time::Duration>,
}
@@ -33,6 +34,7 @@ impl<TOperator: Operator, TStore: BackingStoreEdge<TOperator::Specifications>>
let reconciler = Reconciler::new(worker_id, &store, operator);
Self {
cancellation: CancellationToken::new(),
reconciler,
worker_id,
deadline: None,
@@ -59,12 +61,24 @@ impl<TOperator: Operator, TStore: BackingStoreEdge<TOperator::Specifications>>
let cancellation_token = cancellation;
let child_token = cancellation_token.child_token();
if let Some(deadline) = self.deadline {
tokio::spawn(async move {
tokio::time::sleep(deadline).await;
cancellation_token.cancel();
tokio::spawn({
let cancellation_token = cancellation_token.clone();
async move {
tokio::time::sleep(deadline).await;
cancellation_token.cancel();
}
});
tokio::spawn({
let self_cancel = self.cancellation.child_token();
async move {
self_cancel.cancelled().await;
cancellation_token.cancel();
}
});
}
tracing::debug_span!("reconcile", worker = self.worker_id.to_string());
self.reconciler.reconcile(&child_token).await?;
Ok(())
@@ -86,4 +100,8 @@ impl<TOperator: Operator, TStore: BackingStoreEdge<TOperator::Specifications>>
) -> anyhow::Result<Vec<ManifestState<TOperator::Specifications>>> {
self.store.get_manifests().await
}
/// Requests a graceful shutdown of the control plane.
///
/// Cancels the control plane's own cancellation token. Any task awaiting a
/// child of that token (for example the watcher that forwards the signal to
/// the reconciler's token) is woken up as a result.
///
/// This only signals the shutdown; it does not wait for the reconciler to
/// finish. Calling it more than once is harmless.
pub async fn shutdown(&self) -> anyhow::Result<()> {
    self.cancellation.cancel();
    Ok(())
}
}

View File

@@ -47,11 +47,11 @@ impl<T: Specification> BackingStore<T, BackingStoreInProcess<T>> {
#[cfg(feature = "postgres")]
impl<T: Specification> BackingStore<T, postgres::BackingStorePostgres<T>> {
    /// Creates a backing store that persists manifests in PostgreSQL.
    ///
    /// `database_url` is handed to [`postgres::BackingStorePostgres::new`],
    /// which connects to the database and runs the migrations.
    ///
    /// # Errors
    /// Propagates any error reported while setting up the PostgreSQL store.
    pub async fn postgres(database_url: &str) -> anyhow::Result<Self> {
        let inner = postgres::BackingStorePostgres::new(database_url).await?;

        Ok(Self {
            inner,
            _marker: PhantomData,
        })
    }
}
@@ -81,6 +81,12 @@ pub trait BackingStoreEdge<T: Specification>: Send + Sync + Clone {
worker_id: &WorkerId,
) -> impl std::future::Future<Output = anyhow::Result<()>> + Send;
/// Releases the lease held on `manifest`.
///
/// `worker_id` identifies the worker that is giving the lease up.
/// NOTE(review): the stores visible here differ in how strictly they check
/// that `worker_id` really owns the lease — confirm the intended contract.
///
/// The returned future must be `Send`.
fn delete_lease(
&self,
manifest: &ManifestState<T>,
worker_id: &WorkerId,
) -> impl std::future::Future<Output = anyhow::Result<()>> + Send;
fn upsert_manifest(
&self,
manifest: Manifest<T>,

View File

@@ -166,6 +166,23 @@ impl<T: Specification> BackingStoreEdge<T> for BackingStoreInProcess<T> {
Ok(())
}
/// Releases the lease on `manifest` if it is currently held by `worker_id`.
///
/// The manifest is looked up by name. The call is a no-op returning
/// `Ok(())` when the manifest is unknown, carries no lease, or is leased
/// by a different worker, so a caller working from a stale snapshot cannot
/// drop a lease that another worker has taken over in the meantime.
async fn delete_lease(
    &self,
    manifest: &ManifestState<T>,
    worker_id: &WorkerId,
) -> anyhow::Result<()> {
    let mut manifests = self.manifests.write().await;

    if let Some(state) = manifests
        .iter_mut()
        .find(|m| m.manifest.name == manifest.manifest.name)
    {
        // Only the current owner may give the lease up.
        let owned_by_caller = state
            .lease
            .as_ref()
            .is_some_and(|lease| &lease.owner == worker_id);

        if owned_by_caller {
            state.lease = None;
        }
    }

    Ok(())
}
}
impl<T: Specification> BackingStoreInProcess<T> {

View File

@@ -1,9 +1,17 @@
use std::marker::PhantomData;
use anyhow::Context;
use jiff::Timestamp;
use sha2::Digest;
use sqlx::PgPool;
use crate::{Specification, stores::BackingStoreEdge};
use crate::{
Specification,
manifests::{
Manifest, ManifestLease, ManifestState, ManifestStatus, ManifestStatusState, WorkerId,
},
stores::BackingStoreEdge,
};
#[derive(Clone)]
pub struct BackingStorePostgres<T: Specification> {
@@ -12,10 +20,13 @@ pub struct BackingStorePostgres<T: Specification> {
}
impl<T: Specification> BackingStorePostgres<T> {
pub(crate) async fn new(database_url: &str) -> anyhow::Result<Self> {
tracing::debug!("connecting to postgres database");
let pool = sqlx::PgPool::connect(database_url)
.await
.context("failed to connect to database")?;
tracing::debug!("migrating database");
sqlx::migrate!("migrations/postgres/")
.run(&pool)
.await
@@ -33,40 +44,357 @@ impl<T: Specification> BackingStoreEdge<T> for BackingStorePostgres<T> {
&self,
worker_id: &uuid::Uuid,
) -> anyhow::Result<Vec<crate::manifests::ManifestState<T>>> {
todo!()
let recs = sqlx::query!(
r#"
SELECT
id,
generation,
name,
kind,
status,
manifest_content,
manifest_hash,
created,
updated,
lease_owner_id,
lease_last_updated
FROM
manifests
WHERE
lease_last_updated < now() - INTERVAL '30 seconds'
OR (lease_owner_id = $1 AND lease_last_updated > now() - INTERVAL '15 seconds')
OR lease_owner_id IS NULL
"#,
worker_id
)
.fetch_all(&self.pool)
.await?;
recs.into_iter()
.map(|r| {
let content: Manifest<T> = serde_json::from_value(r.manifest_content)?;
Ok(ManifestState {
manifest: content,
manifest_hash: r.manifest_hash,
generation: r.generation as u64,
status: serde_json::from_value(r.status)?,
created: Timestamp::from_millisecond(r.created.timestamp_millis())?,
updated: Timestamp::from_millisecond(r.updated.timestamp_millis())?,
lease: {
match (r.lease_owner_id, r.lease_last_updated) {
(Some(owner_id), Some(last_updated)) => Some(ManifestLease {
owner: owner_id,
last_seen: Timestamp::from_millisecond(
last_updated.timestamp_millis(),
)?,
}),
(_, _) => None,
}
},
})
})
.collect::<anyhow::Result<Vec<_>>>()
}
async fn get_manifests(&self) -> anyhow::Result<Vec<crate::manifests::ManifestState<T>>> {
todo!()
let recs = sqlx::query!(
r#"
SELECT
id,
generation,
name,
kind,
status,
manifest_content,
manifest_hash,
created,
updated,
lease_owner_id,
lease_last_updated
FROM
manifests
"#
)
.fetch_all(&self.pool)
.await
.context("failed to get manifests from database")?;
recs.into_iter()
.map(|r| {
let content: Manifest<T> = serde_json::from_value(r.manifest_content)?;
Ok(ManifestState {
manifest: content,
manifest_hash: r.manifest_hash,
generation: r.generation as u64,
status: serde_json::from_value(r.status)?,
created: Timestamp::from_millisecond(r.created.timestamp_millis())?,
updated: Timestamp::from_millisecond(r.updated.timestamp_millis())?,
lease: {
match (r.lease_owner_id, r.lease_last_updated) {
(Some(owner_id), Some(last_updated)) => Some(ManifestLease {
owner: owner_id,
last_seen: Timestamp::from_millisecond(
last_updated.timestamp_millis(),
)?,
}),
(_, _) => None,
}
},
})
})
.collect::<anyhow::Result<Vec<_>>>()
}
async fn get(&self, name: &str) -> anyhow::Result<Option<crate::manifests::ManifestState<T>>> {
todo!()
async fn get(&self, name: &str) -> anyhow::Result<Option<ManifestState<T>>> {
let rec = sqlx::query!(
r#"
SELECT
id,
generation,
name,
kind,
status,
manifest_content,
manifest_hash,
created,
updated,
lease_owner_id,
lease_last_updated
FROM
manifests
WHERE
name = $1
"#,
name
)
.fetch_optional(&self.pool)
.await
.context("failed to get")?;
let Some(rec) = rec else { return Ok(None) };
let content: Manifest<T> = serde_json::from_value(rec.manifest_content)?;
Ok(Some(ManifestState {
manifest: content,
manifest_hash: rec.manifest_hash,
generation: rec.generation as u64,
status: serde_json::from_value(rec.status)?,
created: Timestamp::from_millisecond(rec.created.timestamp_millis())?,
updated: Timestamp::from_millisecond(rec.updated.timestamp_millis())?,
lease: {
match (rec.lease_owner_id, rec.lease_last_updated) {
(Some(owner_id), Some(last_updated)) => Some(ManifestLease {
owner: owner_id,
last_seen: Timestamp::from_millisecond(last_updated.timestamp_millis())?,
}),
(_, _) => None,
}
},
}))
}
/// Refreshes the lease heartbeat of a manifest by setting
/// `lease_last_updated` to the database's current time.
///
/// The row is matched on name, kind and the generation carried by
/// `manifest_state`; if the generation has moved on in the meantime no row
/// matches and the call fails.
///
/// NOTE(review): the statement does not filter on `lease_owner_id` because
/// this method receives no worker id; an ownership filter was sketched but
/// never enabled. Confirm whether the generation check alone is sufficient.
///
/// # Errors
/// Fails when the query fails or when no row was updated.
async fn update_lease(
    &self,
    manifest_state: &crate::manifests::ManifestState<T>,
) -> anyhow::Result<()> {
    let resp = sqlx::query!(
        r#"
        UPDATE manifests
        SET
            lease_last_updated = now()
        WHERE
            name = $1
            AND kind = $2
            AND generation = $3
        "#,
        manifest_state.manifest.name,
        manifest_state.manifest.spec.kind(),
        manifest_state.generation as i64,
    )
    .execute(&self.pool)
    .await
    .context("failed to update lease")?;

    if resp.rows_affected() == 0 {
        anyhow::bail!("failed to update lease, the host is no longer the owner")
    }

    Ok(())
}
/// Tries to take the lease on a manifest for `worker_id`.
///
/// The update is an optimistic compare-and-swap on `generation`: the row is
/// only claimed when it still has the generation carried by
/// `manifest_state`, and claiming it increments the stored generation. A
/// concurrent writer working from the same snapshot therefore matches no
/// row.
///
/// The in-memory `manifest_state` is not updated; callers have to re-read
/// the manifest to observe the new generation.
///
/// # Errors
/// Fails when the query fails or when no row matched.
async fn acquire_lease(
    &self,
    manifest_state: &ManifestState<T>,
    worker_id: &WorkerId,
) -> anyhow::Result<()> {
    let name = &manifest_state.manifest.name;
    let kind = manifest_state.manifest.spec.kind();
    let generation = manifest_state.generation;

    let resp = sqlx::query!(
        r#"
        UPDATE manifests
        SET
            lease_owner_id = $4,
            lease_last_updated = now(),
            generation = $3 + 1
        WHERE
            name = $1
            AND kind = $2
            AND generation = $3
        "#,
        name,
        kind,
        generation as i64,
        worker_id
    )
    .execute(&self.pool)
    .await
    .context("failed to acquire lease")?;

    if resp.rows_affected() == 0 {
        anyhow::bail!("failed to acquire lease: {}/{}@{}", kind, name, generation);
    }

    // TODO: maybe we should update fence as well
    // manifest_state.generation = generation + 1;
    Ok(())
}
/// Inserts a manifest, or replaces the content of the existing manifest
/// with the same `(name, kind)`.
///
/// A new row starts at generation 0 with a pending status and no lease.
/// On conflict the stored content, its hash and the `updated` timestamp are
/// replaced; generation, status and lease columns are left untouched.
///
/// # Errors
/// Fails when the manifest cannot be serialized or the statement fails.
async fn upsert_manifest(&self, manifest: crate::manifests::Manifest<T>) -> anyhow::Result<()> {
    // Only used when a new row is inserted.
    let id = uuid::Uuid::now_v7();
    let name = &manifest.name;
    let kind = manifest.spec.kind();

    let content = serde_json::to_value(&manifest)?;
    let digest = sha2::Sha256::digest(serde_json::to_vec(&content)?);
    let hash = &digest[..];

    let status = serde_json::to_value(ManifestStatus {
        status: ManifestStatusState::Pending,
        events: vec![],
        changes: vec![],
    })?;

    // The hash has to be rewritten together with the content on conflict,
    // otherwise the row would keep the digest of the previous content.
    sqlx::query!(
        r#"
        INSERT INTO manifests (
            id,
            generation,
            name,
            kind,
            status,
            manifest_content,
            manifest_hash,
            lease_owner_id,
            lease_last_updated,
            created,
            updated
        ) VALUES (
            $1,
            0,
            $2,
            $3,
            $4,
            $5,
            $6,
            NULL,
            NULL,
            now(),
            now()
        )
        ON CONFLICT (name, kind) DO UPDATE
        SET
            manifest_content = $5,
            manifest_hash = $6,
            updated = now()
        "#,
        id,
        name,
        kind,
        status,
        content,
        hash
    )
    .execute(&self.pool)
    .await
    .context("failed to upsert manifest")?;

    Ok(())
}
/// Persists the status of a manifest and increments its generation.
///
/// The row is matched on name, kind and the generation carried by
/// `manifest`, so the write only succeeds when nobody else has modified the
/// row since it was read.
///
/// # Errors
/// Fails when the generation does not fit the BIGINT column, when the
/// status cannot be serialized, when the query fails, or when no row
/// matched.
async fn update_state(
    &self,
    manifest: &crate::manifests::ManifestState<T>,
) -> anyhow::Result<()> {
    // The column is BIGINT; bind a 64-bit value instead of truncating to 32 bits.
    let generation = i64::try_from(manifest.generation)
        .context("manifest generation does not fit in a BIGINT")?;
    let status = serde_json::to_value(&manifest.status)?;

    let resp = sqlx::query!(
        r#"
        UPDATE manifests
        SET
            generation = $3::BIGINT + 1,
            status = $4,
            updated = now()
        WHERE
            name = $1
            AND kind = $2
            AND generation = $3
        "#,
        manifest.manifest.name,
        manifest.manifest.spec.kind(),
        generation,
        status
    )
    .execute(&self.pool)
    .await
    .context("failed to update state")?;

    if resp.rows_affected() == 0 {
        anyhow::bail!("failed to update state")
    }

    Ok(())
}
/// Releases the lease `worker_id` holds on `manifest` by clearing both
/// lease columns.
///
/// The row is matched on name, kind, generation and `lease_owner_id`, so
/// only the current owner working from an up-to-date snapshot can release
/// the lease.
///
/// # Errors
/// Fails when the query fails or when no row matched.
async fn delete_lease(
    &self,
    manifest: &ManifestState<T>,
    worker_id: &WorkerId,
) -> anyhow::Result<()> {
    let name = &manifest.manifest.name;
    let kind = manifest.manifest.spec.kind();
    let generation = manifest.generation;

    let resp = sqlx::query!(
        r#"
        UPDATE manifests
        SET
            lease_owner_id = NULL,
            lease_last_updated = NULL
        WHERE
            name = $1
            AND kind = $2
            AND generation = $3
            AND lease_owner_id = $4
        "#,
        name,
        kind,
        generation as i64,
        worker_id,
    )
    .execute(&self.pool)
    .await
    .context("failed to delete lease")?;

    if resp.rows_affected() == 0 {
        anyhow::bail!("failed to delete lease, the host is no longer the owner")
    }

    Ok(())
}
}

View File

@@ -55,6 +55,8 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
}
tracing::debug!("reconciler shutting down");
self.relinquish_manifests().await?;
Ok(())
}
@@ -73,7 +75,27 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
tracing::warn!(error = %e, "failed to sync manifests");
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
tokio::time::sleep(std::time::Duration::from_secs(10)).await;
}
Ok(())
}
async fn relinquish_manifests(&self) -> anyhow::Result<()> {
tracing::info!("relinquishing all known manifests");
let manifests = self
.store
.get_owned_and_potential_leases(&self.worker_id)
.await?;
for manifest in manifests {
if let Some(lease) = &manifest.lease
&& lease.owner == self.worker_id
&& let Err(e) = self.store.delete_lease(&manifest, &self.worker_id).await
{
tracing::warn!("failed to relinquish manifest: {e:#}");
}
}
Ok(())
@@ -81,15 +103,20 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
/// Single sync iteration - check for manifests, acquire leases, enqueue work.
async fn sync_once(&self) -> anyhow::Result<()> {
for manifest_state in self
let manifests = self
.store
.get_owned_and_potential_leases(&self.worker_id)
.await?
{
.await?;
tracing::trace!(manifests = manifests.len(), "sync once manifests");
for manifest_state in manifests {
let manifest_name = manifest_state.manifest.name.clone();
match &manifest_state.lease {
Some(lease) if lease.owner == self.worker_id => {
tracing::trace!("updating lease");
// We own the lease, update it
self.store
.update_lease(&manifest_state)
@@ -101,7 +128,9 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
self.reconcile_queue.enqueue(manifest_name).await;
}
}
None => {
_ => {
tracing::trace!("acquiring lease");
// No lease, try to acquire
self.store
.acquire_lease(&manifest_state, &self.worker_id)
@@ -111,10 +140,6 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
// Enqueue for reconciliation
self.reconcile_queue.enqueue(manifest_name).await;
}
_ => {
// Someone else owns the lease, skip
continue;
}
}
}
@@ -218,7 +243,13 @@ impl<T: Operator, TStore: BackingStoreEdge<T::Specifications>> Reconciler<T, TSt
match &manifest.lease {
Some(lease) if lease.owner == self.worker_id => {}
_ => {
tracing::debug!(%manifest_name, "we don't own the lease, skipping");
tracing::debug!(%manifest_name, "we don't own the lease, shutting down owned resources");
self.operator
.on_lease_lost(&manifest)
.await
.map_err(|_e| anyhow::anyhow!("failed handle lease lost event"))?;
return Ok(());
}
}

View File

@@ -16,6 +16,13 @@ pub trait Operator: Send + Sync + 'static {
desired_manifest: &mut ManifestState<Self::Specifications>,
) -> impl Future<Output = Result<Action, Self::Error>>;
/// Hook invoked with the manifest whose lease this worker does not hold.
///
/// The provided implementation ignores `manifest` and resolves to `Ok(())`;
/// implementors override it when they have something to tear down.
fn on_lease_lost(
    &self,
    manifest: &ManifestState<Self::Specifications>,
) -> impl Future<Output = Result<(), Self::Error>> {
    // The default has nothing to clean up for this manifest.
    let _ = manifest;

    async { Ok(()) }
}
fn on_error(
&self,
desired_manifest: &mut ManifestState<Self::Specifications>,

View File

@@ -41,6 +41,13 @@ impl<T: Operator> Operator for OperatorState<T> {
self.inner.lock().await.reconcile(desired_manifest).await
}
/// Forwards the lease-lost notification to the wrapped operator.
///
/// The lock on the inner operator is held for the duration of the call.
async fn on_lease_lost(
    &self,
    manifest: &crate::manifests::ManifestState<Self::Specifications>,
) -> Result<(), Self::Error> {
    let operator = self.inner.lock().await;

    operator.on_lease_lost(manifest).await
}
async fn on_error(
&self,
desired_manifest: &mut crate::manifests::ManifestState<Self::Specifications>,