Skip to content

Commit ca7a790

Browse files
committed
Consistent schema across partitions
Summary: This PR makes sure that partition processors (PPs) have a consistent service schema by writing an upsert-schema record to bifrost. The leader processor makes sure to write this record to bifrost whenever a schema change is detected.
1 parent 1ebb5ab commit ca7a790

File tree

8 files changed

+126
-13
lines changed

8 files changed

+126
-13
lines changed

crates/types/src/schema/metadata/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ use crate::schema::{deployment, service};
3838
use crate::time::MillisSinceEpoch;
3939
use crate::{Version, Versioned, identifiers};
4040

41-
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
41+
#[derive(derive_more::Debug, Clone, serde::Serialize, serde::Deserialize)]
4242
#[serde(from = "serde_hacks::Schema", into = "serde_hacks::Schema")]
43+
#[debug("Schema(version: {version})")]
4344
pub struct Schema {
4445
/// This gets bumped on each update.
4546
version: Version,

crates/wal-protocol/src/lib.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use restate_types::invocation::{
1818
use restate_types::logs;
1919
use restate_types::logs::{HasRecordKeys, Keys, MatchKeyQuery};
2020
use restate_types::message::MessageIndex;
21+
use restate_types::schema::Schema;
2122
use restate_types::state_mut::ExternalStateMutation;
2223

2324
use crate::control::{AnnounceLeader, VersionBarrier};
@@ -29,7 +30,7 @@ pub mod control;
2930
pub mod timer;
3031

3132
/// The primary envelope for all messages in the system.
32-
#[derive(Debug, Clone, PartialEq, Eq)]
33+
#[derive(Debug, Clone)]
3334
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
3435
pub struct Envelope {
3536
pub header: Header,
@@ -125,7 +126,7 @@ pub enum Destination {
125126
}
126127

127128
/// State machine input commands
128-
#[derive(Debug, Clone, PartialEq, Eq, strum::EnumDiscriminants, strum::VariantNames)]
129+
#[derive(Debug, Clone, strum::EnumDiscriminants, strum::VariantNames)]
129130
#[strum_discriminants(derive(strum::IntoStaticStr))]
130131
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
131132
pub enum Command {
@@ -183,6 +184,9 @@ pub enum Command {
183184
NotifyGetInvocationOutputResponse(GetInvocationOutputResponse),
184185
/// Notify a signal.
185186
NotifySignal(NotifySignalRequest),
187+
188+
/// Upsert schema for consistent schema across replicas
189+
UpsertSchema(Schema),
186190
}
187191

188192
impl Command {
@@ -233,6 +237,7 @@ impl HasRecordKeys for Envelope {
233237
Command::InvocationResponse(response) => Keys::Single(response.partition_key()),
234238
Command::NotifySignal(sig) => Keys::Single(sig.partition_key()),
235239
Command::NotifyGetInvocationOutputResponse(res) => Keys::Single(res.partition_key()),
240+
Command::UpsertSchema(_) => Keys::Single(self.partition_key()),
236241
}
237242
}
238243
}
@@ -337,6 +342,7 @@ mod envelope {
337342
UpdatePartitionDurability = 17, // bilrost
338343
ResumeInvocation = 18, // flexbuffers
339344
RestartAsNewInvocation = 19, // flexbuffers
345+
UpsertSchema = 20, // flexbuffers
340346
}
341347

342348
#[derive(bilrost::Message)]
@@ -524,6 +530,10 @@ mod envelope {
524530
let value = protobuf::outbox_message::NotifySignal::from(value.clone());
525531
(CommandKind::NotifySignal, Field::encode_protobuf(&value))
526532
}
533+
Command::UpsertSchema(value) => (
534+
CommandKind::UpsertSchema,
535+
Field::encode_serde(StorageCodecKind::FlexbuffersSerde, value),
536+
),
527537
};
528538

529539
let dto = Envelope {
@@ -631,6 +641,10 @@ mod envelope {
631641

632642
Command::NotifySignal(value.try_into()?)
633643
}
644+
CommandKind::UpsertSchema => {
645+
codec_or_error!(envelope.command, StorageCodecKind::FlexbuffersSerde);
646+
Command::UpsertSchema(envelope.command.decode_serde()?)
647+
}
634648
};
635649

636650
Ok(super::Envelope { header, command })

crates/worker/src/partition/leadership/leader_state.rs

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@ use futures::future::OptionFuture;
2020
use futures::stream::FuturesUnordered;
2121
use futures::{FutureExt, StreamExt, stream};
2222
use metrics::counter;
23-
use tokio_stream::wrappers::ReceiverStream;
23+
use tokio_stream::wrappers::{ReceiverStream, WatchStream};
2424
use tracing::{debug, trace};
2525

2626
use restate_bifrost::CommitToken;
2727
use restate_core::network::{Oneshot, Reciprocal};
28-
use restate_core::{TaskCenter, TaskHandle, TaskId};
28+
use restate_core::{Metadata, MetadataKind, TaskCenter, TaskHandle, TaskId};
2929
use restate_partition_store::PartitionStore;
3030
use restate_types::identifiers::{
3131
InvocationId, LeaderEpoch, PartitionId, PartitionKey, PartitionProcessorRpcRequestId,
@@ -36,6 +36,7 @@ use restate_types::net::partition_processor::{
3636
PartitionProcessorRpcError, PartitionProcessorRpcResponse,
3737
};
3838
use restate_types::time::MillisSinceEpoch;
39+
use restate_types::{Version, Versioned};
3940
use restate_wal_protocol::Command;
4041
use restate_wal_protocol::timer::TimerKeyValue;
4142

@@ -45,7 +46,7 @@ use crate::partition::leadership::self_proposer::SelfProposer;
4546
use crate::partition::leadership::{ActionEffect, Error, InvokerStream, TimerService};
4647
use crate::partition::shuffle;
4748
use crate::partition::shuffle::HintSender;
48-
use crate::partition::state_machine::Action;
49+
use crate::partition::state_machine::{Action, StateMachineRef};
4950

5051
use super::durability_tracker::DurabilityTracker;
5152

@@ -73,6 +74,7 @@ pub struct LeaderState {
7374

7475
invoker_stream: InvokerStream,
7576
shuffle_stream: ReceiverStream<shuffle::OutboxTruncation>,
77+
schema_stream: WatchStream<Version>,
7678
pub pending_cleanup_timers_to_schedule: VecDeque<(InvocationId, Duration)>,
7779
cleaner_task_id: TaskId,
7880
trimmer_task_id: TaskId,
@@ -103,6 +105,9 @@ impl LeaderState {
103105
cleaner_task_id,
104106
trimmer_task_id,
105107
shuffle_hint_tx,
108+
schema_stream: Metadata::with_current(|m| {
109+
WatchStream::new(m.watch(MetadataKind::Schema))
110+
}),
106111
timer_service: Box::pin(timer_service),
107112
self_proposer,
108113
awaiting_rpc_actions: Default::default(),
@@ -119,7 +124,10 @@ impl LeaderState {
119124
///
120125
/// Important: The future needs to be cancellation safe since it is polled as a tokio::select
121126
/// arm!
122-
pub async fn run(&mut self) -> Result<Vec<ActionEffect>, Error> {
127+
pub async fn run(
128+
&mut self,
129+
state_machine: StateMachineRef<'_>,
130+
) -> Result<Vec<ActionEffect>, Error> {
123131
let timer_stream = std::pin::pin!(stream::unfold(
124132
&mut self.timer_service,
125133
|timer_service| async {
@@ -128,6 +136,21 @@ impl LeaderState {
128136
}
129137
));
130138

139+
let schema_stream = (&mut self.schema_stream)
140+
.filter(|version| {
141+
// only upsert the schema if the watched version is newer than the one the state machine holds (or it holds none)
142+
futures::future::ready(
143+
state_machine
144+
.schema
145+
.as_ref()
146+
.map(|schema| schema.version() < *version)
147+
.unwrap_or(true),
148+
)
149+
})
150+
.map(|_| {
151+
let schema = Metadata::with_current(|m| m.schema().clone());
152+
ActionEffect::UpsertSchema(schema)
153+
});
131154
let invoker_stream = (&mut self.invoker_stream).map(ActionEffect::Invoker);
132155
let shuffle_stream = (&mut self.shuffle_stream).map(ActionEffect::Shuffle);
133156
let dur_tracker_stream =
@@ -155,7 +178,8 @@ impl LeaderState {
155178
timer_stream,
156179
action_effects_stream,
157180
awaiting_rpc_self_propose_stream,
158-
dur_tracker_stream
181+
dur_tracker_stream,
182+
schema_stream
159183
);
160184
let mut all_streams = all_streams.ready_chunks(BATCH_READY_UP_TO);
161185

@@ -284,6 +308,16 @@ impl LeaderState {
284308
)
285309
.await?;
286310
}
311+
ActionEffect::UpsertSchema(schema) => {
312+
debug!(
313+
"Self purposing {schema:?} for partition key {}",
314+
self.own_partition_key
315+
);
316+
317+
self.self_proposer
318+
.propose(self.own_partition_key, Command::UpsertSchema(schema))
319+
.await?;
320+
}
287321
ActionEffect::AwaitingRpcSelfProposeDone => {
288322
// Nothing to do here
289323
}

crates/worker/src/partition/leadership/mod.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ use restate_types::net::partition_processor::{
5252
use restate_types::partitions::Partition;
5353
use restate_types::partitions::state::PartitionReplicaSetStates;
5454
use restate_types::retries::with_jitter;
55+
use restate_types::schema::Schema;
5556
use restate_types::storage::StorageEncodeError;
5657
use restate_wal_protocol::Command;
5758
use restate_wal_protocol::control::{AnnounceLeader, PartitionDurability};
@@ -63,7 +64,7 @@ use crate::partition::leadership::leader_state::LeaderState;
6364
use crate::partition::leadership::self_proposer::SelfProposer;
6465
use crate::partition::shuffle;
6566
use crate::partition::shuffle::{OutboxReaderError, Shuffle, ShuffleMetadata};
66-
use crate::partition::state_machine::Action;
67+
use crate::partition::state_machine::{Action, StateMachineRef};
6768
use crate::partition::types::InvokerEffect;
6869

6970
use self::durability_tracker::DurabilityTracker;
@@ -124,6 +125,7 @@ pub(crate) enum ActionEffect {
124125
Timer(TimerKeyValue),
125126
ScheduleCleanupTimer(InvocationId, Duration),
126127
PartitionMaintenance(PartitionDurability),
128+
UpsertSchema(Schema),
127129
AwaitingRpcSelfProposeDone,
128130
}
129131
enum State {
@@ -518,7 +520,10 @@ where
518520
/// * Follower: Nothing to do
519521
/// * Candidate: Monitor appender task
520522
/// * Leader: Await action effects and monitor appender task
521-
pub async fn run(&mut self) -> Result<Vec<ActionEffect>, Error> {
523+
pub async fn run(
524+
&mut self,
525+
state_machine: StateMachineRef<'_>,
526+
) -> Result<Vec<ActionEffect>, Error> {
522527
match &mut self.state {
523528
State::Follower => Ok(futures::future::pending::<Vec<_>>().await),
524529
State::Candidate { self_proposer, .. } => Err(self_proposer
@@ -527,7 +532,7 @@ where
527532
.join_on_err()
528533
.await
529534
.expect_err("never should never be returned")),
530-
State::Leader(leader_state) => leader_state.run().await,
535+
State::Leader(leader_state) => leader_state.run(state_machine).await,
531536
}
532537
}
533538

crates/worker/src/partition/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ where
181181
let outbox_seq_number = partition_store.get_outbox_seq_number().await?;
182182
let outbox_head_seq_number = partition_store.get_outbox_head_seq_number().await?;
183183
let min_restate_version = partition_store.get_min_restate_version().await?;
184+
let schema = partition_store.get_schema().await?;
184185

185186
if !SemanticRestateVersion::current().is_equal_or_newer_than(&min_restate_version) {
186187
gauge!(PARTITION_BLOCKED_FLARE, PARTITION_LABEL =>
@@ -199,6 +200,7 @@ where
199200
partition_store.partition_key_range().clone(),
200201
min_restate_version,
201202
EnumSet::empty(),
203+
schema,
202204
);
203205

204206
Ok(state_machine)
@@ -561,7 +563,7 @@ where
561563
transaction.commit().await?;
562564
self.leadership_state.handle_actions(action_collector.drain(..))?;
563565
},
564-
result = self.leadership_state.run() => {
566+
result = self.leadership_state.run(self.state_machine.state_machine_ref()) => {
565567
let action_effects = result?;
566568
// We process the action_effects not directly in the run future because it
567569
// requires the run future to be cancellation safe. In the future this could be

crates/worker/src/partition/state_machine/lifecycle/version_barrier.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ mod tests {
6464
PartitionKey::MIN..=PartitionKey::MAX,
6565
SemanticRestateVersion::unknown().clone(),
6666
Default::default(),
67+
None,
6768
);
6869
// this is fine as we are always above the unknown version (current > 0.0.0)
6970
let mut test_env = TestEnv::create_with_state_machine(state_machine).await;
@@ -106,6 +107,7 @@ mod tests {
106107
PartitionKey::MIN..=PartitionKey::MAX,
107108
SemanticRestateVersion::unknown().clone(),
108109
Default::default(),
110+
None,
109111
);
110112
// this is fine as we are always above the unknown version (current > 0.0.0)
111113
let mut test_env = TestEnv::create_with_state_machine(state_machine).await;

0 commit comments

Comments
 (0)