Merge pull request #9922 from keynslug/fix/EMQX-8876/inflight-stuck-full
fix(bufworker): do not avoid retry if inflight table is full
This commit is contained in:
commit
73d5592b5a
|
@ -1,7 +1,7 @@
|
||||||
%% -*- mode: erlang -*-
|
%% -*- mode: erlang -*-
|
||||||
{application, emqx_resource, [
|
{application, emqx_resource, [
|
||||||
{description, "Manager for all external resources"},
|
{description, "Manager for all external resources"},
|
||||||
{vsn, "0.1.6"},
|
{vsn, "0.1.7"},
|
||||||
{registered, []},
|
{registered, []},
|
||||||
{mod, {emqx_resource_app, []}},
|
{mod, {emqx_resource_app, []}},
|
||||||
{applications, [
|
{applications, [
|
||||||
|
|
|
@ -334,12 +334,7 @@ resume_from_blocked(Data) ->
|
||||||
{single, Ref, Query} ->
|
{single, Ref, Query} ->
|
||||||
%% We retry msgs in inflight window sync, as if we send them
|
%% We retry msgs in inflight window sync, as if we send them
|
||||||
%% async, they will be appended to the end of inflight window again.
|
%% async, they will be appended to the end of inflight window again.
|
||||||
case is_inflight_full(InflightTID) of
|
retry_inflight_sync(Ref, Query, Data);
|
||||||
true ->
|
|
||||||
{keep_state, Data};
|
|
||||||
false ->
|
|
||||||
retry_inflight_sync(Ref, Query, Data)
|
|
||||||
end;
|
|
||||||
{batch, Ref, NotExpired, Expired} ->
|
{batch, Ref, NotExpired, Expired} ->
|
||||||
update_inflight_item(InflightTID, Ref, NotExpired),
|
update_inflight_item(InflightTID, Ref, NotExpired),
|
||||||
NumExpired = length(Expired),
|
NumExpired = length(Expired),
|
||||||
|
@ -347,12 +342,7 @@ resume_from_blocked(Data) ->
|
||||||
NumExpired > 0 andalso ?tp(buffer_worker_retry_expired, #{expired => Expired}),
|
NumExpired > 0 andalso ?tp(buffer_worker_retry_expired, #{expired => Expired}),
|
||||||
%% We retry msgs in inflight window sync, as if we send them
|
%% We retry msgs in inflight window sync, as if we send them
|
||||||
%% async, they will be appended to the end of inflight window again.
|
%% async, they will be appended to the end of inflight window again.
|
||||||
case is_inflight_full(InflightTID) of
|
retry_inflight_sync(Ref, NotExpired, Data)
|
||||||
true ->
|
|
||||||
{keep_state, Data};
|
|
||||||
false ->
|
|
||||||
retry_inflight_sync(Ref, NotExpired, Data)
|
|
||||||
end
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
retry_inflight_sync(Ref, QueryOrBatch, Data0) ->
|
retry_inflight_sync(Ref, QueryOrBatch, Data0) ->
|
||||||
|
|
|
@ -134,6 +134,17 @@ on_query(_InstId, get_counter, #{pid := Pid}) ->
|
||||||
{ReqRef, Num} -> {ok, Num}
|
{ReqRef, Num} -> {ok, Num}
|
||||||
after 1000 ->
|
after 1000 ->
|
||||||
{error, timeout}
|
{error, timeout}
|
||||||
|
end;
|
||||||
|
on_query(_InstId, {sleep, For}, #{pid := Pid}) ->
|
||||||
|
?tp(connector_demo_sleep, #{mode => sync, for => For}),
|
||||||
|
ReqRef = make_ref(),
|
||||||
|
From = {self(), ReqRef},
|
||||||
|
Pid ! {From, {sleep, For}},
|
||||||
|
receive
|
||||||
|
{ReqRef, Result} ->
|
||||||
|
Result
|
||||||
|
after 1000 ->
|
||||||
|
{error, timeout}
|
||||||
end.
|
end.
|
||||||
|
|
||||||
on_query_async(_InstId, {inc_counter, N}, ReplyFun, #{pid := Pid}) ->
|
on_query_async(_InstId, {inc_counter, N}, ReplyFun, #{pid := Pid}) ->
|
||||||
|
@ -147,6 +158,10 @@ on_query_async(_InstId, block_now, ReplyFun, #{pid := Pid}) ->
|
||||||
{ok, Pid};
|
{ok, Pid};
|
||||||
on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) ->
|
on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) ->
|
||||||
Pid ! {big_payload, Payload, ReplyFun},
|
Pid ! {big_payload, Payload, ReplyFun},
|
||||||
|
{ok, Pid};
|
||||||
|
on_query_async(_InstId, {sleep, For}, ReplyFun, #{pid := Pid}) ->
|
||||||
|
?tp(connector_demo_sleep, #{mode => async, for => For}),
|
||||||
|
Pid ! {{sleep, For}, ReplyFun},
|
||||||
{ok, Pid}.
|
{ok, Pid}.
|
||||||
|
|
||||||
on_batch_query(InstId, BatchReq, State) ->
|
on_batch_query(InstId, BatchReq, State) ->
|
||||||
|
@ -283,10 +298,28 @@ counter_loop(
|
||||||
State;
|
State;
|
||||||
{{FromPid, ReqRef}, get} ->
|
{{FromPid, ReqRef}, get} ->
|
||||||
FromPid ! {ReqRef, Num},
|
FromPid ! {ReqRef, Num},
|
||||||
|
State;
|
||||||
|
{{sleep, _} = SleepQ, ReplyFun} ->
|
||||||
|
apply_reply(ReplyFun, handle_query(async, SleepQ, Status)),
|
||||||
|
State;
|
||||||
|
{{FromPid, ReqRef}, {sleep, _} = SleepQ} ->
|
||||||
|
FromPid ! {ReqRef, handle_query(sync, SleepQ, Status)},
|
||||||
State
|
State
|
||||||
end,
|
end,
|
||||||
counter_loop(NewState).
|
counter_loop(NewState).
|
||||||
|
|
||||||
|
handle_query(Mode, {sleep, For} = Query, Status) ->
|
||||||
|
ok = timer:sleep(For),
|
||||||
|
Result =
|
||||||
|
case Status of
|
||||||
|
running -> ok;
|
||||||
|
blocked -> {error, {recoverable_error, blocked}}
|
||||||
|
end,
|
||||||
|
?tp(connector_demo_sleep_handled, #{
|
||||||
|
mode => Mode, query => Query, slept => For, result => Result
|
||||||
|
}),
|
||||||
|
Result.
|
||||||
|
|
||||||
maybe_register(Name, Pid, true) ->
|
maybe_register(Name, Pid, true) ->
|
||||||
ct:pal("---- Register Name: ~p", [Name]),
|
ct:pal("---- Register Name: ~p", [Name]),
|
||||||
ct:pal("---- whereis(): ~p", [whereis(Name)]),
|
ct:pal("---- whereis(): ~p", [whereis(Name)]),
|
||||||
|
|
|
@ -1452,6 +1452,61 @@ t_retry_async_inflight(_Config) ->
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
t_retry_async_inflight_full(_Config) ->
|
||||||
|
ResumeInterval = 1_000,
|
||||||
|
AsyncInflightWindow = 5,
|
||||||
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
|
{ok, _} = emqx_resource:create(
|
||||||
|
?ID,
|
||||||
|
?DEFAULT_RESOURCE_GROUP,
|
||||||
|
?TEST_RESOURCE,
|
||||||
|
#{name => ?FUNCTION_NAME},
|
||||||
|
#{
|
||||||
|
query_mode => async,
|
||||||
|
async_inflight_window => AsyncInflightWindow,
|
||||||
|
batch_size => 1,
|
||||||
|
batch_time => 20,
|
||||||
|
worker_pool_size => 1,
|
||||||
|
resume_interval => ResumeInterval
|
||||||
|
}
|
||||||
|
),
|
||||||
|
?check_trace(
|
||||||
|
#{timetrap => 15_000},
|
||||||
|
begin
|
||||||
|
%% block
|
||||||
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
|
|
||||||
|
{ok, {ok, _}} =
|
||||||
|
?wait_async_action(
|
||||||
|
inc_counter_in_parallel(
|
||||||
|
AsyncInflightWindow * 2,
|
||||||
|
fun() ->
|
||||||
|
For = (ResumeInterval div 4) + rand:uniform(ResumeInterval div 4),
|
||||||
|
{sleep, For}
|
||||||
|
end,
|
||||||
|
#{async_reply_fun => {fun(Res) -> ct:pal("Res = ~p", [Res]) end, []}}
|
||||||
|
),
|
||||||
|
#{?snk_kind := buffer_worker_flush_but_inflight_full},
|
||||||
|
ResumeInterval * 2
|
||||||
|
),
|
||||||
|
|
||||||
|
%% will reply with success after the resource is healed
|
||||||
|
{ok, {ok, _}} =
|
||||||
|
?wait_async_action(
|
||||||
|
emqx_resource:simple_sync_query(?ID, resume),
|
||||||
|
#{?snk_kind := buffer_worker_enter_running}
|
||||||
|
),
|
||||||
|
ok
|
||||||
|
end,
|
||||||
|
[
|
||||||
|
fun(Trace) ->
|
||||||
|
?assertMatch([#{} | _], ?of_kind(buffer_worker_flush_but_inflight_full, Trace))
|
||||||
|
end
|
||||||
|
]
|
||||||
|
),
|
||||||
|
?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)),
|
||||||
|
ok.
|
||||||
|
|
||||||
t_retry_async_inflight_batch(_Config) ->
|
t_retry_async_inflight_batch(_Config) ->
|
||||||
ResumeInterval = 1_000,
|
ResumeInterval = 1_000,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
|
@ -2241,18 +2296,16 @@ t_expiration_retry_batch_multiple_times(_Config) ->
|
||||||
%%------------------------------------------------------------------------------
|
%%------------------------------------------------------------------------------
|
||||||
|
|
||||||
inc_counter_in_parallel(N) ->
|
inc_counter_in_parallel(N) ->
|
||||||
inc_counter_in_parallel(N, #{}).
|
inc_counter_in_parallel(N, {inc_counter, 1}, #{}).
|
||||||
|
|
||||||
inc_counter_in_parallel(N, Opts0) ->
|
inc_counter_in_parallel(N, Opts0) ->
|
||||||
|
inc_counter_in_parallel(N, {inc_counter, 1}, Opts0).
|
||||||
|
|
||||||
|
inc_counter_in_parallel(N, Query, Opts) ->
|
||||||
Parent = self(),
|
Parent = self(),
|
||||||
Pids = [
|
Pids = [
|
||||||
erlang:spawn(fun() ->
|
erlang:spawn(fun() ->
|
||||||
Opts =
|
emqx_resource:query(?ID, maybe_apply(Query), maybe_apply(Opts)),
|
||||||
case is_function(Opts0) of
|
|
||||||
true -> Opts0();
|
|
||||||
false -> Opts0
|
|
||||||
end,
|
|
||||||
emqx_resource:query(?ID, {inc_counter, 1}, Opts),
|
|
||||||
Parent ! {complete, self()}
|
Parent ! {complete, self()}
|
||||||
end)
|
end)
|
||||||
|| _ <- lists:seq(1, N)
|
|| _ <- lists:seq(1, N)
|
||||||
|
@ -2267,16 +2320,11 @@ inc_counter_in_parallel(N, Opts0) ->
|
||||||
],
|
],
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
inc_counter_in_parallel_increasing(N, StartN, Opts0) ->
|
inc_counter_in_parallel_increasing(N, StartN, Opts) ->
|
||||||
Parent = self(),
|
Parent = self(),
|
||||||
Pids = [
|
Pids = [
|
||||||
erlang:spawn(fun() ->
|
erlang:spawn(fun() ->
|
||||||
Opts =
|
emqx_resource:query(?ID, {inc_counter, M}, maybe_apply(Opts)),
|
||||||
case is_function(Opts0) of
|
|
||||||
true -> Opts0();
|
|
||||||
false -> Opts0
|
|
||||||
end,
|
|
||||||
emqx_resource:query(?ID, {inc_counter, M}, Opts),
|
|
||||||
Parent ! {complete, self()}
|
Parent ! {complete, self()}
|
||||||
end)
|
end)
|
||||||
|| M <- lists:seq(StartN, StartN + N - 1)
|
|| M <- lists:seq(StartN, StartN + N - 1)
|
||||||
|
@ -2290,6 +2338,14 @@ inc_counter_in_parallel_increasing(N, StartN, Opts0) ->
|
||||||
|| Pid <- Pids
|
|| Pid <- Pids
|
||||||
].
|
].
|
||||||
|
|
||||||
|
maybe_apply(FunOrTerm) ->
|
||||||
|
maybe_apply(FunOrTerm, []).
|
||||||
|
|
||||||
|
maybe_apply(Fun, Args) when is_function(Fun) ->
|
||||||
|
erlang:apply(Fun, Args);
|
||||||
|
maybe_apply(Term, _Args) ->
|
||||||
|
Term.
|
||||||
|
|
||||||
bin_config() ->
|
bin_config() ->
|
||||||
<<"\"name\": \"test_resource\"">>.
|
<<"\"name\": \"test_resource\"">>.
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Fix an issue where the bridge resource buffer could become stuck if enough async queries filled the inflight window before failing with retryable errors.
|
|
@ -0,0 +1 @@
|
||||||
|
修复桥接资源缓冲区的一个问题:如果足够多的异步查询在因可重试错误而失败之前将飞行窗口填满,缓冲区可能会卡住。
|
Loading…
Reference in New Issue