fix(dsrepl): handle errors gracefully in shard egress process

Also add cooldown on timeout / unavailability.
This commit is contained in:
Andrew Mayorov 2024-02-19 19:09:22 +01:00
parent e16aee99b4
commit 887e151be5
No known key found for this signature in database
GPG Key ID: 2837C62ACFBFED5D
2 changed files with 39 additions and 20 deletions

View File

@ -492,23 +492,27 @@ list_nodes() ->
%% %%
%% TODO
%% Too large for normal operation, need better backpressure mechanism.
-define(RA_TIMEOUT, 60 * 1000).
%% Submit a batch of messages to the Raft cluster that backs `Shard` of `DB`.
%%
%% The command is sent through `ra:process_command/3` to the shard's servers
%% (leader preferred), bounded by `?RA_TIMEOUT`.
%%
%% Returns the state machine's reply when the command went through. Any other
%% outcome is handed back to the caller unchanged rather than raised, so the
%% caller decides how to react.
%% NOTE(review): per the ra API this covers both `{error, _}` and
%% `{timeout, _}` shapes; callers need to match both.
ra_store_batch(DB, Shard, Messages) ->
    Targets = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred),
    BatchCommand = #{?tag => ?BATCH, ?batch_messages => Messages},
    case ra:process_command(Targets, BatchCommand, ?RA_TIMEOUT) of
        {ok, Reply, _Leader} -> Reply;
        Failure -> Failure
    end.
ra_add_generation(DB, Shard) -> ra_add_generation(DB, Shard) ->
Command = #{?tag => add_generation}, Command = #{?tag => add_generation},
Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred), Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred),
case ra:process_command(Servers, Command) of case ra:process_command(Servers, Command, ?RA_TIMEOUT) of
{ok, Result, _Leader} -> {ok, Result, _Leader} ->
Result; Result;
Error -> Error ->
@ -518,7 +522,7 @@ ra_add_generation(DB, Shard) ->
ra_update_config(DB, Shard, Opts) -> ra_update_config(DB, Shard, Opts) ->
Command = #{?tag => update_config, ?config => Opts}, Command = #{?tag => update_config, ?config => Opts},
Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred), Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred),
case ra:process_command(Servers, Command) of case ra:process_command(Servers, Command, ?RA_TIMEOUT) of
{ok, Result, _Leader} -> {ok, Result, _Leader} ->
Result; Result;
Error -> Error ->
@ -528,7 +532,7 @@ ra_update_config(DB, Shard, Opts) ->
ra_drop_generation(DB, Shard, GenId) -> ra_drop_generation(DB, Shard, GenId) ->
Command = #{?tag => drop_generation, ?generation => GenId}, Command = #{?tag => drop_generation, ?generation => GenId},
Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred), Servers = emqx_ds_replication_layer_shard:servers(DB, Shard, leader_preferred),
case ra:process_command(Servers, Command) of case ra:process_command(Servers, Command, ?RA_TIMEOUT) of
{ok, Result, _Leader} -> {ok, Result, _Leader} ->
Result; Result;
Error -> Error ->

View File

@ -83,15 +83,13 @@ store_batch(DB, Messages, Opts) ->
); );
true -> true ->
maps:foreach( maps:foreach(
fun(Shard, Batch) -> fun(Shard, BatchIn) ->
Timestamp = emqx_ds:timestamp_us(), Timestamp = emqx_ds:timestamp_us(),
Batch = [emqx_message:set_timestamp(Timestamp, Message) || Message <- BatchIn],
gen_server:call( gen_server:call(
?via(DB, Shard), ?via(DB, Shard),
#enqueue_atomic_req{ #enqueue_atomic_req{
batch = [ batch = Batch,
emqx_message:set_timestamp(Timestamp, Message)
|| Message <- Batch
],
sync = Sync sync = Sync
}, },
infinity infinity
@ -156,22 +154,39 @@ terminate(_Reason, _S) ->
%% Internal functions %% Internal functions
%%================================================================================ %%================================================================================
-define(COOLDOWN_MIN, 1000).
-define(COOLDOWN_MAX, 5000).
%% Flush the accumulated batch to the shard's Raft cluster.
%%
%% With an empty batch only the flush timer is re-armed. Otherwise the batch
%% is reversed and submitted via `emqx_ds_replication_layer:ra_store_batch/3`:
%%  * on `ok`, every pending caller is answered with `ok`, the buffer is reset
%%    and the timer is re-armed;
%%  * on `{error, Reason}` or `{timeout, ServerId}`, the batch and the pending
%%    replies are left untouched in the state (so a later flush submits them
%%    again) and the process cools down before re-arming the timer.
%%
%% Returns the updated state.
do_flush(S = #s{batch = []}) ->
    S#s{tref = start_timer()};
do_flush(
    S = #s{batch = Messages, pending_replies = Replies, db = DB, shard = Shard}
) ->
    %% FIXME (marker carried over from the original; it does not say what needs fixing)
    case emqx_ds_replication_layer:ra_store_batch(DB, Shard, lists:reverse(Messages)) of
        ok ->
            lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Replies),
            ?tp(
                emqx_ds_replication_layer_egress_flush,
                #{db => DB, shard => Shard, batch => Messages}
            ),
            true = erlang:garbage_collect(),
            S#s{
                n = 0,
                batch = [],
                pending_replies = [],
                tref = start_timer()
            };
        {error, Reason} ->
            flush_failed(Reason, S);
        {timeout, _ServerId} = Timeout ->
            %% `ra:process_command/3` reports timeouts as `{timeout, ServerId}`
            %% rather than `{error, _}`. Without this clause a timeout crashes
            %% the process with `case_clause` instead of cooling down.
            flush_failed(Timeout, S)
    end.

%% Report a failed flush and back off for a randomized cooldown before
%% re-arming the flush timer. The batch and pending replies stay in the state.
flush_failed(Reason, S = #s{db = DB, shard = Shard}) ->
    ?tp(
        warning,
        emqx_ds_replication_layer_egress_flush_failed,
        #{db => DB, shard => Shard, reason => Reason}
    ),
    %% rand:uniform(N) yields 1..N, so the cooldown lies in (COOLDOWN_MIN, COOLDOWN_MAX].
    Cooldown = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
    %% NOTE(review): the sleep blocks this process, so requests queue up in the
    %% mailbox while cooling down — presumably intended as backpressure; confirm.
    ok = timer:sleep(Cooldown),
    S#s{tref = start_timer()}.
do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies = Replies}) -> do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies = Replies}) ->
NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000), NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),