chore: don't cancel inflight items upon worker death; retry them
This commit is contained in:
parent
087b667263
commit
5c2ac0ac81
|
@ -771,7 +771,7 @@ handle_async_worker_down(Data0, Pid) ->
|
||||||
#{async_workers := AsyncWorkers0} = Data0,
|
#{async_workers := AsyncWorkers0} = Data0,
|
||||||
{WorkerMRef, AsyncWorkers} = maps:take(Pid, AsyncWorkers0),
|
{WorkerMRef, AsyncWorkers} = maps:take(Pid, AsyncWorkers0),
|
||||||
Data = Data0#{async_workers := AsyncWorkers},
|
Data = Data0#{async_workers := AsyncWorkers},
|
||||||
cancel_inflight_items(Data, WorkerMRef),
|
mark_inflight_items_as_retriable(Data, WorkerMRef),
|
||||||
{keep_state, Data}.
|
{keep_state, Data}.
|
||||||
|
|
||||||
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
||||||
|
@ -1118,37 +1118,19 @@ ack_inflight(InflightTID, Ref, Id, Index) ->
|
||||||
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
||||||
IsAcked.
|
IsAcked.
|
||||||
|
|
||||||
cancel_inflight_items(Data, WorkerMRef) ->
|
mark_inflight_items_as_retriable(Data, WorkerMRef) ->
|
||||||
#{inflight_tid := InflightTID} = Data,
|
#{inflight_tid := InflightTID} = Data,
|
||||||
|
IsRetriable = true,
|
||||||
MatchSpec =
|
MatchSpec =
|
||||||
ets:fun2ms(
|
ets:fun2ms(
|
||||||
fun(?INFLIGHT_ITEM(Ref, _BatchOrQuery, _IsRetriable, WorkerMRef0)) when
|
fun(?INFLIGHT_ITEM(Ref, BatchOrQuery, _IsRetriable, WorkerMRef0)) when
|
||||||
WorkerMRef =:= WorkerMRef0
|
WorkerMRef =:= WorkerMRef0
|
||||||
->
|
->
|
||||||
Ref
|
?INFLIGHT_ITEM(Ref, BatchOrQuery, IsRetriable, WorkerMRef0)
|
||||||
end
|
end
|
||||||
),
|
),
|
||||||
Refs = ets:select(InflightTID, MatchSpec),
|
_NumAffected = ets:select_replace(InflightTID, MatchSpec),
|
||||||
lists:foreach(fun(Ref) -> do_cancel_inflight_item(Data, Ref) end, Refs).
|
?tp(resource_worker_worker_down_update, #{num_affected => _NumAffected}),
|
||||||
|
|
||||||
do_cancel_inflight_item(Data, Ref) ->
|
|
||||||
#{id := Id, index := Index, inflight_tid := InflightTID} = Data,
|
|
||||||
{Count, Batch} =
|
|
||||||
case ets:take(InflightTID, Ref) of
|
|
||||||
[?INFLIGHT_ITEM(Ref, ?QUERY(_, _, _) = Query, _IsRetriable, _WorkerMRef)] ->
|
|
||||||
{1, [Query]};
|
|
||||||
[?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _) | _] = Batch0, _IsRetriable, _WorkerMRef)] ->
|
|
||||||
{length(Batch0), Batch0};
|
|
||||||
_ ->
|
|
||||||
{0, []}
|
|
||||||
end,
|
|
||||||
IsAcked = Count > 0,
|
|
||||||
IsAcked andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, -Count, 0, 0}),
|
|
||||||
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
|
||||||
Result = {error, interrupted},
|
|
||||||
QueryOpts = #{simple_query => false},
|
|
||||||
_ = batch_reply_caller(Id, Result, Batch, QueryOpts),
|
|
||||||
?tp(resource_worker_cancelled_inflight, #{ref => Ref}),
|
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%%==============================================================================
|
%%==============================================================================
|
||||||
|
|
|
@ -1432,6 +1432,7 @@ t_retry_async_inflight_batch(_Config) ->
|
||||||
%% requests if they die.
|
%% requests if they die.
|
||||||
t_async_pool_worker_death(_Config) ->
|
t_async_pool_worker_death(_Config) ->
|
||||||
ResumeInterval = 1_000,
|
ResumeInterval = 1_000,
|
||||||
|
NumBufferWorkers = 2,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
{ok, _} = emqx_resource:create(
|
{ok, _} = emqx_resource:create(
|
||||||
?ID,
|
?ID,
|
||||||
|
@ -1441,7 +1442,7 @@ t_async_pool_worker_death(_Config) ->
|
||||||
#{
|
#{
|
||||||
query_mode => async,
|
query_mode => async,
|
||||||
batch_size => 1,
|
batch_size => 1,
|
||||||
worker_pool_size => 2,
|
worker_pool_size => NumBufferWorkers,
|
||||||
resume_interval => ResumeInterval
|
resume_interval => ResumeInterval
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
|
@ -1471,9 +1472,9 @@ t_async_pool_worker_death(_Config) ->
|
||||||
%% grab one of the worker pids and kill it
|
%% grab one of the worker pids and kill it
|
||||||
{ok, SRef1} =
|
{ok, SRef1} =
|
||||||
snabbkaffe:subscribe(
|
snabbkaffe:subscribe(
|
||||||
?match_event(#{?snk_kind := resource_worker_cancelled_inflight}),
|
?match_event(#{?snk_kind := resource_worker_worker_down_update}),
|
||||||
NumReqs,
|
NumBufferWorkers,
|
||||||
1_000
|
10_000
|
||||||
),
|
),
|
||||||
{ok, #{pid := Pid0}} = emqx_resource:simple_sync_query(?ID, get_state),
|
{ok, #{pid := Pid0}} = emqx_resource:simple_sync_query(?ID, get_state),
|
||||||
MRef = monitor(process, Pid0),
|
MRef = monitor(process, Pid0),
|
||||||
|
@ -1487,21 +1488,11 @@ t_async_pool_worker_death(_Config) ->
|
||||||
ct:fail("worker should have died")
|
ct:fail("worker should have died")
|
||||||
end,
|
end,
|
||||||
|
|
||||||
%% inflight requests should have been cancelled
|
%% inflight requests should have been marked as retriable
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef1),
|
{ok, _} = snabbkaffe:receive_events(SRef1),
|
||||||
Inflight1 = emqx_resource_metrics:inflight_get(?ID),
|
Inflight1 = emqx_resource_metrics:inflight_get(?ID),
|
||||||
?assertEqual(0, Inflight1),
|
?assertEqual(NumReqs, Inflight1),
|
||||||
|
|
||||||
?assert(
|
|
||||||
lists:all(
|
|
||||||
fun
|
|
||||||
({_, {error, interrupted}}) -> true;
|
|
||||||
(_) -> false
|
|
||||||
end,
|
|
||||||
ets:tab2list(Tab0)
|
|
||||||
),
|
|
||||||
#{tab => ets:tab2list(Tab0)}
|
|
||||||
),
|
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[]
|
[]
|
||||||
|
|
Loading…
Reference in New Issue