fix(dsrepl): reassign timestamp at the time of submission

This is needed to ensure a total message order within a shard, and to
guarantee that no messages are written "in the past", which could break
replay consistency.
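
The invariant this enforces: every message appended to a shard must carry
a timestamp strictly greater than the latest one already assigned there.
A minimal sketch of that rule (`next_timestamp/2` is a hypothetical helper
for illustration, not code from this commit):

    %% Hypothetical helper, illustration only: keep the message's own
    %% timestamp when it is already ahead of the shard's latest assigned
    %% timestamp, otherwise bump just past it.
    next_timestamp(Latest, MsgTs) when MsgTs > Latest -> MsgTs;
    next_timestamp(Latest, _MsgTs) -> Latest + 1.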
Andrew Mayorov 2024-02-01 13:03:57 +01:00
parent 146f082fdc
commit be793e4735
3 changed files with 32 additions and 18 deletions


@@ -555,8 +555,7 @@ ra_start_shard(DB, Shard) ->
 ra_store_batch(DB, Shard, Messages) ->
     Command = #{
         ?tag => ?BATCH,
-        ?batch_messages => Messages,
-        ?timestamp => emqx_ds:timestamp_us()
+        ?batch_messages => Messages
     },
     case ra:process_command(ra_leader_servers(DB, Shard), Command) of
         {ok, Result, _Leader} ->
@@ -652,8 +651,7 @@ apply(
     #{index := RaftIdx},
     #{
         ?tag := ?BATCH,
-        ?batch_messages := MessagesIn,
-        ?timestamp := TimestampLocal
+        ?batch_messages := MessagesIn
     },
     #{latest := Latest} = State
 ) ->
@@ -661,18 +659,30 @@
     %% Unique timestamp tracking real time closely.
     %% With microsecond granularity it should be nearly impossible for it to run
     %% too far ahead of the real-time clock.
-    Timestamp = max(Latest + 1, TimestampLocal),
-    Messages = assign_timestamps(Timestamp, MessagesIn),
+    {NLatest, Messages} = assign_timestamps(Latest, MessagesIn),
+    %% TODO
+    %% The batch is now reversed, but that should not make much difference.
+    %% Even if it were in order, it would still be possible to write messages far
+    %% in the past, e.g. when a replica catches up with the leader. The storage
+    %% layer currently relies on wall-clock time to decide whether it is safe to
+    %% iterate over the next epoch, which is likely wrong. Ideally it should rely
+    %% on the consensus clock instead.
     Result = emqx_ds_storage_layer:store_batch(erlang:get(emqx_ds_db_shard), Messages, #{}),
-    %% NOTE: Last assigned timestamp.
-    NLatest = Timestamp + length(Messages) - 1,
     NState = State#{latest := NLatest},
     %% TODO: Need to measure effects of changing frequency of `release_cursor`.
     Effect = {release_cursor, RaftIdx, NState},
     {NState, Result, Effect}.
 
-assign_timestamps(Timestamp, [MessageIn | Rest]) ->
-    Message = emqx_message:set_timestamp(Timestamp, MessageIn),
-    [Message | assign_timestamps(Timestamp + 1, Rest)];
-assign_timestamps(_Timestamp, []) ->
-    [].
+assign_timestamps(Latest, Messages) ->
+    assign_timestamps(Latest, Messages, []).
+
+assign_timestamps(Latest, [MessageIn | Rest], Acc) ->
+    case emqx_message:timestamp(MessageIn) of
+        Later when Later > Latest ->
+            assign_timestamps(Later, Rest, [MessageIn | Acc]);
+        _Earlier ->
+            Message = emqx_message:set_timestamp(Latest + 1, MessageIn),
+            assign_timestamps(Latest + 1, Rest, [Message | Acc])
+    end;
+assign_timestamps(Latest, [], Acc) ->
+    {Latest, Acc}.
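
A worked trace of the new assign_timestamps/3, assuming plain integer
timestamps (the values are illustrative only):

    %% Latest = 95, incoming timestamps [100, 90, 120]:
    %%   100 > 95   -> kept,          Latest becomes 100
    %%   90 =< 100  -> bumped to 101, Latest becomes 101
    %%   120 > 101  -> kept,          Latest becomes 120
    %% Returns {120, Messages} with the batch reversed (see the TODO above)
    %% and timestamps [120, 101, 100]: strictly increasing in input order.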


@@ -67,8 +67,9 @@ store_batch(DB, Messages, Opts) ->
     case maps:get(atomic, Opts, false) of
         false ->
             lists:foreach(
-                fun(Message) ->
-                    Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+                fun(MessageIn) ->
+                    Shard = emqx_ds_replication_layer:shard_of_message(DB, MessageIn, clientid),
+                    Message = emqx_message:set_timestamp(emqx_ds:timestamp_us(), MessageIn),
                     gen_server:call(
                         ?via(DB, Shard),
                         #enqueue_req{
@@ -83,10 +84,14 @@ store_batch(DB, Messages, Opts) ->
         true ->
             maps:foreach(
                 fun(Shard, Batch) ->
+                    Timestamp = emqx_ds:timestamp_us(),
                     gen_server:call(
                         ?via(DB, Shard),
                         #enqueue_atomic_req{
-                            batch = Batch,
+                            batch = [
+                                emqx_message:set_timestamp(Timestamp, Message)
+                             || Message <- Batch
+                            ],
                             sync = Sync
                         },
                         infinity
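
For context, `emqx_ds:timestamp_us/0` used above supplies the
submission-time stamp in microseconds; a plausible stand-in (an
assumption here, the real implementation may differ) is:

    %% Assumed behaviour of emqx_ds:timestamp_us/0, sketched for clarity:
    %% wall-clock time at microsecond granularity.
    timestamp_us() ->
        erlang:system_time(microsecond).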


@@ -118,8 +118,7 @@ store_batch(_ShardId, #s{db = DB, cf = CF}, Messages, _Options = #{atomic := true}) ->
 store_batch(_ShardId, #s{db = DB, cf = CF}, Messages, _Options) ->
     lists:foreach(
         fun(Msg) ->
-            Id = erlang:unique_integer([monotonic]),
-            Key = <<Id:64>>,
+            Key = <<(emqx_message:timestamp(Msg)):64>>,
             Val = term_to_binary(Msg),
             rocksdb:put(DB, CF, Key, Val, [])
         end,
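
Since the key is now the message timestamp encoded as a 64-bit big-endian
unsigned integer (the default for `<<Ts:64>>`), RocksDB's byte-wise key
order coincides with time order, so iteration replays messages
chronologically. A self-contained sketch of the encoding:

    %% <<Ts:64>> defaults to big-endian unsigned, so lexicographic key
    %% comparison matches numeric timestamp comparison, e.g.
    %% <<1:64>> = <<0,0,0,0,0,0,0,1>> sorts before <<2:64>>.
    key(Ts) when is_integer(Ts), Ts >= 0 -> <<Ts:64>>.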