refactor(buffer_worker): ensure flush message is never missed
This commit is contained in:
parent
dbfdeec5e9
commit
3a6dbbdd05
|
@ -70,6 +70,18 @@
|
||||||
-define(RETRY_IDX, 3).
|
-define(RETRY_IDX, 3).
|
||||||
-define(WORKER_MREF_IDX, 4).
|
-define(WORKER_MREF_IDX, 4).
|
||||||
|
|
||||||
|
-define(ENSURE_ASYNC_FLUSH(InflightTID, EXPR),
|
||||||
|
(fun() ->
|
||||||
|
IsFullBefore = is_inflight_full(InflightTID),
|
||||||
|
case (EXPR) of
|
||||||
|
blocked ->
|
||||||
|
ok;
|
||||||
|
ok ->
|
||||||
|
maybe_flush_after_async_reply(IsFullBefore)
|
||||||
|
end
|
||||||
|
end)()
|
||||||
|
).
|
||||||
|
|
||||||
-type id() :: binary().
|
-type id() :: binary().
|
||||||
-type index() :: pos_integer().
|
-type index() :: pos_integer().
|
||||||
-type expire_at() :: infinity | integer().
|
-type expire_at() :: infinity | integer().
|
||||||
|
@ -194,8 +206,8 @@ init({Id, Index, Opts}) ->
|
||||||
?tp(buffer_worker_init, #{id => Id, index => Index}),
|
?tp(buffer_worker_init, #{id => Id, index => Index}),
|
||||||
{ok, running, Data}.
|
{ok, running, Data}.
|
||||||
|
|
||||||
running(enter, _, Data) ->
|
running(enter, _, #{tref := _Tref} = Data) ->
|
||||||
?tp(buffer_worker_enter_running, #{id => maps:get(id, Data)}),
|
?tp(buffer_worker_enter_running, #{id => maps:get(id, Data), tref => _Tref}),
|
||||||
%% According to `gen_statem' laws, we mustn't call `maybe_flush'
|
%% According to `gen_statem' laws, we mustn't call `maybe_flush'
|
||||||
%% directly because it may decide to return `{next_state, blocked, _}',
|
%% directly because it may decide to return `{next_state, blocked, _}',
|
||||||
%% and that's an invalid response for a state enter call.
|
%% and that's an invalid response for a state enter call.
|
||||||
|
@ -212,9 +224,8 @@ running(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data) ->
|
||||||
handle_query_requests(Request0, Data);
|
handle_query_requests(Request0, Data);
|
||||||
running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) ->
|
running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) ->
|
||||||
flush(St#{tref := undefined});
|
flush(St#{tref := undefined});
|
||||||
running(internal, flush, St) ->
|
|
||||||
flush(St);
|
|
||||||
running(info, {flush, _Ref}, _St) ->
|
running(info, {flush, _Ref}, _St) ->
|
||||||
|
?tp(discarded_stale_flush, #{}),
|
||||||
keep_state_and_data;
|
keep_state_and_data;
|
||||||
running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
||||||
is_map_key(Pid, AsyncWorkers0)
|
is_map_key(Pid, AsyncWorkers0)
|
||||||
|
@ -472,10 +483,15 @@ flush(Data0) ->
|
||||||
Data1 = cancel_flush_timer(Data0),
|
Data1 = cancel_flush_timer(Data0),
|
||||||
CurrentCount = queue_count(Q0),
|
CurrentCount = queue_count(Q0),
|
||||||
IsFull = is_inflight_full(InflightTID),
|
IsFull = is_inflight_full(InflightTID),
|
||||||
?tp(buffer_worker_flush, #{queue_count => CurrentCount, is_full => IsFull}),
|
InflightCount = inflight_num_batches(InflightTID),
|
||||||
|
?tp(buffer_worker_flush, #{
|
||||||
|
queued => CurrentCount,
|
||||||
|
is_inflight_full => IsFull,
|
||||||
|
inflight => InflightCount
|
||||||
|
}),
|
||||||
case {CurrentCount, IsFull} of
|
case {CurrentCount, IsFull} of
|
||||||
{0, _} ->
|
{0, _} ->
|
||||||
?tp(buffer_worker_queue_drained, #{inflight => inflight_num_batches(InflightTID)}),
|
?tp(buffer_worker_queue_drained, #{inflight => InflightCount}),
|
||||||
{keep_state, Data1};
|
{keep_state, Data1};
|
||||||
{_, true} ->
|
{_, true} ->
|
||||||
?tp(buffer_worker_flush_but_inflight_full, #{}),
|
?tp(buffer_worker_flush_but_inflight_full, #{}),
|
||||||
|
@ -714,18 +730,18 @@ batch_reply_caller_defer_metrics(Id, BatchResult, Batch, QueryOpts) ->
|
||||||
end,
|
end,
|
||||||
Batch
|
Batch
|
||||||
),
|
),
|
||||||
{Action, PostFn1} = reply_caller_defer_metrics(Id, hd(Replies), QueryOpts),
|
{ShouldAck, PostFns} =
|
||||||
PostFns =
|
|
||||||
lists:foldl(
|
lists:foldl(
|
||||||
fun(Reply, PostFns) ->
|
fun(Reply, {_ShouldAck, PostFns}) ->
|
||||||
{_, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
%% _ShouldAck should be the same as ShouldAck starting from the second reply
|
||||||
[PostFn | PostFns]
|
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
||||||
|
{ShouldAck, [PostFn | PostFns]}
|
||||||
end,
|
end,
|
||||||
[PostFn1],
|
{ack, []},
|
||||||
tl(Replies)
|
Replies
|
||||||
),
|
),
|
||||||
PostFn = fun() -> lists:foreach(fun(F) -> F() end, lists:reverse(PostFns)) end,
|
PostFn = fun() -> lists:foreach(fun(F) -> F() end, lists:reverse(PostFns)) end,
|
||||||
{Action, PostFn}.
|
{ShouldAck, PostFn}.
|
||||||
|
|
||||||
reply_caller(Id, Reply, QueryOpts) ->
|
reply_caller(Id, Reply, QueryOpts) ->
|
||||||
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
||||||
|
@ -978,7 +994,7 @@ handle_async_reply(
|
||||||
discard ->
|
discard ->
|
||||||
ok;
|
ok;
|
||||||
continue ->
|
continue ->
|
||||||
handle_async_reply1(ReplyContext, Result)
|
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_reply1(ReplyContext, Result))
|
||||||
end.
|
end.
|
||||||
|
|
||||||
handle_async_reply1(
|
handle_async_reply1(
|
||||||
|
@ -999,10 +1015,8 @@ handle_async_reply1(
|
||||||
Now = now_(),
|
Now = now_(),
|
||||||
case is_expired(ExpireAt, Now) of
|
case is_expired(ExpireAt, Now) of
|
||||||
true ->
|
true ->
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
|
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
|
||||||
?tp(handle_async_reply_expired, #{expired => [_Query]}),
|
?tp(handle_async_reply_expired, #{expired => [_Query]}),
|
||||||
ok;
|
ok;
|
||||||
false ->
|
false ->
|
||||||
|
@ -1034,16 +1048,15 @@ do_handle_async_reply(
|
||||||
ref => Ref,
|
ref => Ref,
|
||||||
result => Result
|
result => Result
|
||||||
}),
|
}),
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
case Action of
|
case Action of
|
||||||
nack ->
|
nack ->
|
||||||
%% Keep retrying.
|
%% Keep retrying.
|
||||||
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
?MODULE:block(Pid);
|
ok = ?MODULE:block(Pid),
|
||||||
|
blocked;
|
||||||
ack ->
|
ack ->
|
||||||
do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
||||||
end,
|
end.
|
||||||
ok = maybe_flush_after_async_reply(IsFullBefore).
|
|
||||||
|
|
||||||
handle_async_batch_reply(
|
handle_async_batch_reply(
|
||||||
#{
|
#{
|
||||||
|
@ -1056,7 +1069,7 @@ handle_async_batch_reply(
|
||||||
discard ->
|
discard ->
|
||||||
ok;
|
ok;
|
||||||
continue ->
|
continue ->
|
||||||
handle_async_batch_reply1(ReplyContext, Result)
|
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_batch_reply1(ReplyContext, Result))
|
||||||
end.
|
end.
|
||||||
|
|
||||||
handle_async_batch_reply1(
|
handle_async_batch_reply1(
|
||||||
|
@ -1072,21 +1085,19 @@ handle_async_batch_reply1(
|
||||||
#{batch_or_query => Batch, ref => Ref, result => Result}
|
#{batch_or_query => Batch, ref => Ref, result => Result}
|
||||||
),
|
),
|
||||||
Now = now_(),
|
Now = now_(),
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
case sieve_expired_requests(Batch, Now) of
|
case sieve_expired_requests(Batch, Now) of
|
||||||
{_NotExpired, []} ->
|
{_NotExpired, []} ->
|
||||||
%% this is the critical code path,
|
%% this is the critical code path,
|
||||||
%% we try not to do ets:lookup in this case
|
%% we try not to do ets:lookup in this case
|
||||||
%% because the batch can be quite big
|
%% because the batch can be quite big
|
||||||
ok = do_handle_async_batch_reply(ReplyContext, Result);
|
do_handle_async_batch_reply(ReplyContext, Result);
|
||||||
{_NotExpired, _Expired} ->
|
{_NotExpired, _Expired} ->
|
||||||
%% at least one is expired
|
%% at least one is expired
|
||||||
%% the batch from reply context is minimized, so it cannot be used
|
%% the batch from reply context is minimized, so it cannot be used
|
||||||
%% to update the inflight items, hence discard Batch and lookup the RealBatch
|
%% to update the inflight items, hence discard Batch and lookup the RealBatch
|
||||||
?tp(handle_async_reply_expired, #{expired => _Expired}),
|
?tp(handle_async_reply_expired, #{expired => _Expired}),
|
||||||
ok = handle_async_batch_reply2(ets:lookup(InflightTID, Ref), ReplyContext, Result, Now)
|
handle_async_batch_reply2(ets:lookup(InflightTID, Ref), ReplyContext, Result, Now)
|
||||||
end,
|
end.
|
||||||
ok = maybe_flush_after_async_reply(IsFullBefore).
|
|
||||||
|
|
||||||
handle_async_batch_reply2([], _, _, _) ->
|
handle_async_batch_reply2([], _, _, _) ->
|
||||||
%% should have caused the unknown_async_reply_discarded
|
%% should have caused the unknown_async_reply_discarded
|
||||||
|
@ -1124,9 +1135,8 @@ handle_async_batch_reply2([Inflight], ReplyContext, Result, Now) ->
|
||||||
%% some queries are not expired, put them back to the inflight batch
|
%% some queries are not expired, put them back to the inflight batch
|
||||||
%% so it can be either acked now or retried later
|
%% so it can be either acked now or retried later
|
||||||
ok = update_inflight_item(InflightTID, Ref, RealNotExpired, NumExpired),
|
ok = update_inflight_item(InflightTID, Ref, RealNotExpired, NumExpired),
|
||||||
ok = do_handle_async_batch_reply(ReplyContext#{min_batch := RealNotExpired}, Result)
|
do_handle_async_batch_reply(ReplyContext#{min_batch := RealNotExpired}, Result)
|
||||||
end,
|
end.
|
||||||
ok.
|
|
||||||
|
|
||||||
do_handle_async_batch_reply(
|
do_handle_async_batch_reply(
|
||||||
#{
|
#{
|
||||||
|
@ -1151,7 +1161,8 @@ do_handle_async_batch_reply(
|
||||||
nack ->
|
nack ->
|
||||||
%% Keep retrying.
|
%% Keep retrying.
|
||||||
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
ok = ?MODULE:block(Pid);
|
ok = ?MODULE:block(Pid),
|
||||||
|
blocked;
|
||||||
ack ->
|
ack ->
|
||||||
ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
||||||
end.
|
end.
|
||||||
|
@ -1173,9 +1184,11 @@ maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = false) ->
|
||||||
%% after it is handled, the inflight table must be even smaller
|
%% after it is handled, the inflight table must be even smaller
|
||||||
%% hence we can rely on the buffer worker's flush timer to trigger
|
%% hence we can rely on the buffer worker's flush timer to trigger
|
||||||
%% the next flush
|
%% the next flush
|
||||||
|
?tp(skip_flushing_worker, #{}),
|
||||||
ok;
|
ok;
|
||||||
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = true) ->
|
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = true) ->
|
||||||
%% the inflight table was full before handling async reply
|
%% the inflight table was full before handling async reply
|
||||||
|
?tp(do_flushing_worker, #{}),
|
||||||
ok = ?MODULE:flush_worker(self()).
|
ok = ?MODULE:flush_worker(self()).
|
||||||
|
|
||||||
%% check if the async reply is valid.
|
%% check if the async reply is valid.
|
||||||
|
@ -1189,7 +1202,6 @@ maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = true) ->
|
||||||
maybe_handle_unknown_async_reply(InflightTID, Ref) ->
|
maybe_handle_unknown_async_reply(InflightTID, Ref) ->
|
||||||
try ets:member(InflightTID, Ref) of
|
try ets:member(InflightTID, Ref) of
|
||||||
true ->
|
true ->
|
||||||
%% NOTE: this does not mean the
|
|
||||||
continue;
|
continue;
|
||||||
false ->
|
false ->
|
||||||
?tp(
|
?tp(
|
||||||
|
@ -1446,7 +1458,7 @@ mark_inflight_items_as_retriable(Data, WorkerMRef) ->
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%% used to update a batch after dropping expired individual queries.
|
%% used to update a batch after dropping expired individual queries.
|
||||||
update_inflight_item(InflightTID, Ref, NewBatch, NumExpired) ->
|
update_inflight_item(InflightTID, Ref, NewBatch, NumExpired) when NumExpired > 0 ->
|
||||||
_ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}),
|
_ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}),
|
||||||
ok = dec_inflight(InflightTID, NumExpired),
|
ok = dec_inflight(InflightTID, NumExpired),
|
||||||
ok.
|
ok.
|
||||||
|
|
|
@ -314,6 +314,8 @@ counter_loop(
|
||||||
%% drain the buffer (and inflights table)
|
%% drain the buffer (and inflights table)
|
||||||
ReplyCount = 1 + (RandNum rem 3),
|
ReplyCount = 1 + (RandNum rem 3),
|
||||||
Results = random_replies(ReplyCount),
|
Results = random_replies(ReplyCount),
|
||||||
|
%% add a delay to trigger inflight full
|
||||||
|
timer:sleep(5),
|
||||||
lists:foreach(
|
lists:foreach(
|
||||||
fun(Result) ->
|
fun(Result) ->
|
||||||
apply_reply(ReplyFun, Result)
|
apply_reply(ReplyFun, Result)
|
||||||
|
|
|
@ -1510,8 +1510,9 @@ t_retry_async_inflight_full(_Config) ->
|
||||||
%% this test case is to ensure the buffer worker will not go crazy even
|
%% this test case is to ensure the buffer worker will not go crazy even
|
||||||
%% if the underlying connector is misbehaving: evaluate async callbacks multiple times
|
%% if the underlying connector is misbehaving: evaluate async callbacks multiple times
|
||||||
t_async_reply_multi_eval(_Config) ->
|
t_async_reply_multi_eval(_Config) ->
|
||||||
ResumeInterval = 20,
|
ResumeInterval = 5,
|
||||||
AsyncInflightWindow = 5,
|
TotalTime = 5_000,
|
||||||
|
AsyncInflightWindow = 3,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
{ok, _} = emqx_resource:create(
|
{ok, _} = emqx_resource:create(
|
||||||
?ID,
|
?ID,
|
||||||
|
@ -1528,29 +1529,31 @@ t_async_reply_multi_eval(_Config) ->
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
?check_trace(
|
?check_trace(
|
||||||
#{timetrap => 15_000},
|
#{timetrap => 30_000},
|
||||||
begin
|
begin
|
||||||
%% block
|
%% block
|
||||||
ok = emqx_resource:simple_sync_query(?ID, block),
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
|
|
||||||
{ok, {ok, _}} =
|
|
||||||
?wait_async_action(
|
?wait_async_action(
|
||||||
inc_counter_in_parallel(
|
inc_counter_in_parallel(
|
||||||
AsyncInflightWindow * 2,
|
AsyncInflightWindow * 5,
|
||||||
fun() ->
|
fun() ->
|
||||||
Rand = rand:uniform(1000),
|
Rand = rand:uniform(1000),
|
||||||
{random_reply, Rand}
|
{random_reply, Rand}
|
||||||
end,
|
end,
|
||||||
#{}
|
#{}
|
||||||
),
|
),
|
||||||
#{?snk_kind := buffer_worker_queue_drained, inflight := 0},
|
#{?snk_kind := buffer_worker_flush, inflight := 0, queued := 0},
|
||||||
ResumeInterval * 200
|
TotalTime
|
||||||
),
|
),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[
|
[
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
?assertMatch([#{inflight := 0}], ?of_kind(buffer_worker_queue_drained, Trace))
|
?assertMatch(
|
||||||
|
[#{inflight := 0} | _],
|
||||||
|
lists:reverse(?of_kind(buffer_worker_queue_drained, Trace))
|
||||||
|
)
|
||||||
end
|
end
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
Loading…
Reference in New Issue