Merge pull request #10020 from zmstone/0222-fix-bridge-async-mode-counters
fix(bridge): fix dropped counter and inflight gauge
This commit is contained in:
commit
79bf77c2f1
|
@ -2,7 +2,7 @@ version: '3.9'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
zookeeper:
|
zookeeper:
|
||||||
image: wurstmeister/zookeeper
|
image: docker.io/library/zookeeper:3.6
|
||||||
ports:
|
ports:
|
||||||
- "2181:2181"
|
- "2181:2181"
|
||||||
container_name: zookeeper
|
container_name: zookeeper
|
||||||
|
|
|
@ -720,4 +720,4 @@ pub_props_to_packet(Properties) ->
|
||||||
safe_filename(Filename) when is_binary(Filename) ->
|
safe_filename(Filename) when is_binary(Filename) ->
|
||||||
binary:replace(Filename, <<":">>, <<"-">>, [global]);
|
binary:replace(Filename, <<":">>, <<"-">>, [global]);
|
||||||
safe_filename(Filename) when is_list(Filename) ->
|
safe_filename(Filename) when is_list(Filename) ->
|
||||||
string:replace(Filename, ":", "-", all).
|
lists:flatten(string:replace(Filename, ":", "-", all)).
|
||||||
|
|
|
@ -70,6 +70,18 @@
|
||||||
-define(RETRY_IDX, 3).
|
-define(RETRY_IDX, 3).
|
||||||
-define(WORKER_MREF_IDX, 4).
|
-define(WORKER_MREF_IDX, 4).
|
||||||
|
|
||||||
|
-define(ENSURE_ASYNC_FLUSH(InflightTID, EXPR),
|
||||||
|
(fun() ->
|
||||||
|
IsFullBefore = is_inflight_full(InflightTID),
|
||||||
|
case (EXPR) of
|
||||||
|
blocked ->
|
||||||
|
ok;
|
||||||
|
ok ->
|
||||||
|
ok = maybe_flush_after_async_reply(IsFullBefore)
|
||||||
|
end
|
||||||
|
end)()
|
||||||
|
).
|
||||||
|
|
||||||
-type id() :: binary().
|
-type id() :: binary().
|
||||||
-type index() :: pos_integer().
|
-type index() :: pos_integer().
|
||||||
-type expire_at() :: infinity | integer().
|
-type expire_at() :: infinity | integer().
|
||||||
|
@ -97,6 +109,7 @@ start_link(Id, Index, Opts) ->
|
||||||
|
|
||||||
-spec sync_query(id(), request(), query_opts()) -> Result :: term().
|
-spec sync_query(id(), request(), query_opts()) -> Result :: term().
|
||||||
sync_query(Id, Request, Opts0) ->
|
sync_query(Id, Request, Opts0) ->
|
||||||
|
?tp(sync_query, #{id => Id, request => Request, query_opts => Opts0}),
|
||||||
Opts1 = ensure_timeout_query_opts(Opts0, sync),
|
Opts1 = ensure_timeout_query_opts(Opts0, sync),
|
||||||
Opts = ensure_expire_at(Opts1),
|
Opts = ensure_expire_at(Opts1),
|
||||||
PickKey = maps:get(pick_key, Opts, self()),
|
PickKey = maps:get(pick_key, Opts, self()),
|
||||||
|
@ -106,6 +119,7 @@ sync_query(Id, Request, Opts0) ->
|
||||||
|
|
||||||
-spec async_query(id(), request(), query_opts()) -> Result :: term().
|
-spec async_query(id(), request(), query_opts()) -> Result :: term().
|
||||||
async_query(Id, Request, Opts0) ->
|
async_query(Id, Request, Opts0) ->
|
||||||
|
?tp(async_query, #{id => Id, request => Request, query_opts => Opts0}),
|
||||||
Opts1 = ensure_timeout_query_opts(Opts0, async),
|
Opts1 = ensure_timeout_query_opts(Opts0, async),
|
||||||
Opts = ensure_expire_at(Opts1),
|
Opts = ensure_expire_at(Opts1),
|
||||||
PickKey = maps:get(pick_key, Opts, self()),
|
PickKey = maps:get(pick_key, Opts, self()),
|
||||||
|
@ -121,6 +135,7 @@ simple_sync_query(Id, Request) ->
|
||||||
%% call ends up calling buffering functions, that's a bug and
|
%% call ends up calling buffering functions, that's a bug and
|
||||||
%% would mess up the metrics anyway. `undefined' is ignored by
|
%% would mess up the metrics anyway. `undefined' is ignored by
|
||||||
%% `emqx_resource_metrics:*_shift/3'.
|
%% `emqx_resource_metrics:*_shift/3'.
|
||||||
|
?tp(simple_sync_query, #{id => Id, request => Request}),
|
||||||
Index = undefined,
|
Index = undefined,
|
||||||
QueryOpts = simple_query_opts(),
|
QueryOpts = simple_query_opts(),
|
||||||
emqx_resource_metrics:matched_inc(Id),
|
emqx_resource_metrics:matched_inc(Id),
|
||||||
|
@ -132,6 +147,7 @@ simple_sync_query(Id, Request) ->
|
||||||
%% simple async-query the resource without batching and queuing.
|
%% simple async-query the resource without batching and queuing.
|
||||||
-spec simple_async_query(id(), request(), query_opts()) -> term().
|
-spec simple_async_query(id(), request(), query_opts()) -> term().
|
||||||
simple_async_query(Id, Request, QueryOpts0) ->
|
simple_async_query(Id, Request, QueryOpts0) ->
|
||||||
|
?tp(simple_async_query, #{id => Id, request => Request, query_opts => QueryOpts0}),
|
||||||
Index = undefined,
|
Index = undefined,
|
||||||
QueryOpts = maps:merge(simple_query_opts(), QueryOpts0),
|
QueryOpts = maps:merge(simple_query_opts(), QueryOpts0),
|
||||||
emqx_resource_metrics:matched_inc(Id),
|
emqx_resource_metrics:matched_inc(Id),
|
||||||
|
@ -194,8 +210,8 @@ init({Id, Index, Opts}) ->
|
||||||
?tp(buffer_worker_init, #{id => Id, index => Index}),
|
?tp(buffer_worker_init, #{id => Id, index => Index}),
|
||||||
{ok, running, Data}.
|
{ok, running, Data}.
|
||||||
|
|
||||||
running(enter, _, Data) ->
|
running(enter, _, #{tref := _Tref} = Data) ->
|
||||||
?tp(buffer_worker_enter_running, #{id => maps:get(id, Data)}),
|
?tp(buffer_worker_enter_running, #{id => maps:get(id, Data), tref => _Tref}),
|
||||||
%% According to `gen_statem' laws, we mustn't call `maybe_flush'
|
%% According to `gen_statem' laws, we mustn't call `maybe_flush'
|
||||||
%% directly because it may decide to return `{next_state, blocked, _}',
|
%% directly because it may decide to return `{next_state, blocked, _}',
|
||||||
%% and that's an invalid response for a state enter call.
|
%% and that's an invalid response for a state enter call.
|
||||||
|
@ -212,9 +228,8 @@ running(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data) ->
|
||||||
handle_query_requests(Request0, Data);
|
handle_query_requests(Request0, Data);
|
||||||
running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) ->
|
running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) ->
|
||||||
flush(St#{tref := undefined});
|
flush(St#{tref := undefined});
|
||||||
running(internal, flush, St) ->
|
|
||||||
flush(St);
|
|
||||||
running(info, {flush, _Ref}, _St) ->
|
running(info, {flush, _Ref}, _St) ->
|
||||||
|
?tp(discarded_stale_flush, #{}),
|
||||||
keep_state_and_data;
|
keep_state_and_data;
|
||||||
running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
||||||
is_map_key(Pid, AsyncWorkers0)
|
is_map_key(Pid, AsyncWorkers0)
|
||||||
|
@ -225,21 +240,24 @@ running(info, Info, _St) ->
|
||||||
?SLOG(error, #{msg => unexpected_msg, state => running, info => Info}),
|
?SLOG(error, #{msg => unexpected_msg, state => running, info => Info}),
|
||||||
keep_state_and_data.
|
keep_state_and_data.
|
||||||
|
|
||||||
blocked(enter, _, #{resume_interval := ResumeT} = _St) ->
|
blocked(enter, _, #{resume_interval := ResumeT} = St0) ->
|
||||||
?tp(buffer_worker_enter_blocked, #{}),
|
?tp(buffer_worker_enter_blocked, #{}),
|
||||||
{keep_state_and_data, {state_timeout, ResumeT, unblock}};
|
%% discard the old timer, new timer will be started when entering running state again
|
||||||
|
St = cancel_flush_timer(St0),
|
||||||
|
{keep_state, St, {state_timeout, ResumeT, unblock}};
|
||||||
blocked(cast, block, _St) ->
|
blocked(cast, block, _St) ->
|
||||||
keep_state_and_data;
|
keep_state_and_data;
|
||||||
blocked(cast, resume, St) ->
|
blocked(cast, resume, St) ->
|
||||||
resume_from_blocked(St);
|
resume_from_blocked(St);
|
||||||
blocked(cast, flush, Data) ->
|
blocked(cast, flush, St) ->
|
||||||
resume_from_blocked(Data);
|
resume_from_blocked(St);
|
||||||
blocked(state_timeout, unblock, St) ->
|
blocked(state_timeout, unblock, St) ->
|
||||||
resume_from_blocked(St);
|
resume_from_blocked(St);
|
||||||
blocked(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data0) ->
|
blocked(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data0) ->
|
||||||
Data = collect_and_enqueue_query_requests(Request0, Data0),
|
Data = collect_and_enqueue_query_requests(Request0, Data0),
|
||||||
{keep_state, Data};
|
{keep_state, Data};
|
||||||
blocked(info, {flush, _Ref}, _Data) ->
|
blocked(info, {flush, _Ref}, _Data) ->
|
||||||
|
%% ignore stale timer
|
||||||
keep_state_and_data;
|
keep_state_and_data;
|
||||||
blocked(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
blocked(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
|
||||||
is_map_key(Pid, AsyncWorkers0)
|
is_map_key(Pid, AsyncWorkers0)
|
||||||
|
@ -335,11 +353,13 @@ resume_from_blocked(Data) ->
|
||||||
%% We retry msgs in inflight window sync, as if we send them
|
%% We retry msgs in inflight window sync, as if we send them
|
||||||
%% async, they will be appended to the end of inflight window again.
|
%% async, they will be appended to the end of inflight window again.
|
||||||
retry_inflight_sync(Ref, Query, Data);
|
retry_inflight_sync(Ref, Query, Data);
|
||||||
|
{batch, Ref, NotExpired, []} ->
|
||||||
|
retry_inflight_sync(Ref, NotExpired, Data);
|
||||||
{batch, Ref, NotExpired, Expired} ->
|
{batch, Ref, NotExpired, Expired} ->
|
||||||
update_inflight_item(InflightTID, Ref, NotExpired),
|
|
||||||
NumExpired = length(Expired),
|
NumExpired = length(Expired),
|
||||||
|
ok = update_inflight_item(InflightTID, Ref, NotExpired, NumExpired),
|
||||||
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
|
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
|
||||||
NumExpired > 0 andalso ?tp(buffer_worker_retry_expired, #{expired => Expired}),
|
?tp(buffer_worker_retry_expired, #{expired => Expired}),
|
||||||
%% We retry msgs in inflight window sync, as if we send them
|
%% We retry msgs in inflight window sync, as if we send them
|
||||||
%% async, they will be appended to the end of inflight window again.
|
%% async, they will be appended to the end of inflight window again.
|
||||||
retry_inflight_sync(Ref, NotExpired, Data)
|
retry_inflight_sync(Ref, NotExpired, Data)
|
||||||
|
@ -470,9 +490,14 @@ flush(Data0) ->
|
||||||
Data1 = cancel_flush_timer(Data0),
|
Data1 = cancel_flush_timer(Data0),
|
||||||
CurrentCount = queue_count(Q0),
|
CurrentCount = queue_count(Q0),
|
||||||
IsFull = is_inflight_full(InflightTID),
|
IsFull = is_inflight_full(InflightTID),
|
||||||
?tp(buffer_worker_flush, #{queue_count => CurrentCount, is_full => IsFull}),
|
?tp(buffer_worker_flush, #{
|
||||||
|
queued => CurrentCount,
|
||||||
|
is_inflight_full => IsFull,
|
||||||
|
inflight => inflight_count(InflightTID)
|
||||||
|
}),
|
||||||
case {CurrentCount, IsFull} of
|
case {CurrentCount, IsFull} of
|
||||||
{0, _} ->
|
{0, _} ->
|
||||||
|
?tp(buffer_worker_queue_drained, #{inflight => inflight_count(InflightTID)}),
|
||||||
{keep_state, Data1};
|
{keep_state, Data1};
|
||||||
{_, true} ->
|
{_, true} ->
|
||||||
?tp(buffer_worker_flush_but_inflight_full, #{}),
|
?tp(buffer_worker_flush_but_inflight_full, #{}),
|
||||||
|
@ -487,7 +512,7 @@ flush(Data0) ->
|
||||||
%% if the request has expired, the caller is no longer
|
%% if the request has expired, the caller is no longer
|
||||||
%% waiting for a response.
|
%% waiting for a response.
|
||||||
case sieve_expired_requests(Batch, Now) of
|
case sieve_expired_requests(Batch, Now) of
|
||||||
all_expired ->
|
{[], _AllExpired} ->
|
||||||
ok = replayq:ack(Q1, QAckRef),
|
ok = replayq:ack(Q1, QAckRef),
|
||||||
emqx_resource_metrics:dropped_expired_inc(Id, length(Batch)),
|
emqx_resource_metrics:dropped_expired_inc(Id, length(Batch)),
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
||||||
|
@ -496,7 +521,7 @@ flush(Data0) ->
|
||||||
{NotExpired, Expired} ->
|
{NotExpired, Expired} ->
|
||||||
NumExpired = length(Expired),
|
NumExpired = length(Expired),
|
||||||
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
|
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
|
||||||
IsBatch = BatchSize =/= 1,
|
IsBatch = (BatchSize > 1),
|
||||||
%% We *must* use the new queue, because we currently can't
|
%% We *must* use the new queue, because we currently can't
|
||||||
%% `nack' a `pop'.
|
%% `nack' a `pop'.
|
||||||
%% Maybe we could re-open the queue?
|
%% Maybe we could re-open the queue?
|
||||||
|
@ -506,7 +531,6 @@ flush(Data0) ->
|
||||||
),
|
),
|
||||||
Ref = make_request_ref(),
|
Ref = make_request_ref(),
|
||||||
do_flush(Data2, #{
|
do_flush(Data2, #{
|
||||||
new_queue => Q1,
|
|
||||||
is_batch => IsBatch,
|
is_batch => IsBatch,
|
||||||
batch => NotExpired,
|
batch => NotExpired,
|
||||||
ref => Ref,
|
ref => Ref,
|
||||||
|
@ -519,18 +543,16 @@ flush(Data0) ->
|
||||||
is_batch := boolean(),
|
is_batch := boolean(),
|
||||||
batch := [queue_query()],
|
batch := [queue_query()],
|
||||||
ack_ref := replayq:ack_ref(),
|
ack_ref := replayq:ack_ref(),
|
||||||
ref := inflight_key(),
|
ref := inflight_key()
|
||||||
new_queue := replayq:q()
|
|
||||||
}) ->
|
}) ->
|
||||||
gen_statem:event_handler_result(state(), data()).
|
gen_statem:event_handler_result(state(), data()).
|
||||||
do_flush(
|
do_flush(
|
||||||
Data0,
|
#{queue := Q1} = Data0,
|
||||||
#{
|
#{
|
||||||
is_batch := false,
|
is_batch := false,
|
||||||
batch := Batch,
|
batch := Batch,
|
||||||
ref := Ref,
|
ref := Ref,
|
||||||
ack_ref := QAckRef,
|
ack_ref := QAckRef
|
||||||
new_queue := Q1
|
|
||||||
}
|
}
|
||||||
) ->
|
) ->
|
||||||
#{
|
#{
|
||||||
|
@ -606,16 +628,18 @@ do_flush(
|
||||||
}),
|
}),
|
||||||
flush_worker(self());
|
flush_worker(self());
|
||||||
false ->
|
false ->
|
||||||
|
?tp(buffer_worker_queue_drained, #{
|
||||||
|
inflight => inflight_count(InflightTID)
|
||||||
|
}),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{keep_state, Data1}
|
{keep_state, Data1}
|
||||||
end;
|
end;
|
||||||
do_flush(Data0, #{
|
do_flush(#{queue := Q1} = Data0, #{
|
||||||
is_batch := true,
|
is_batch := true,
|
||||||
batch := Batch,
|
batch := Batch,
|
||||||
ref := Ref,
|
ref := Ref,
|
||||||
ack_ref := QAckRef,
|
ack_ref := QAckRef
|
||||||
new_queue := Q1
|
|
||||||
}) ->
|
}) ->
|
||||||
#{
|
#{
|
||||||
id := Id,
|
id := Id,
|
||||||
|
@ -685,6 +709,9 @@ do_flush(Data0, #{
|
||||||
Data2 =
|
Data2 =
|
||||||
case {CurrentCount > 0, CurrentCount >= BatchSize} of
|
case {CurrentCount > 0, CurrentCount >= BatchSize} of
|
||||||
{false, _} ->
|
{false, _} ->
|
||||||
|
?tp(buffer_worker_queue_drained, #{
|
||||||
|
inflight => inflight_count(InflightTID)
|
||||||
|
}),
|
||||||
Data1;
|
Data1;
|
||||||
{true, true} ->
|
{true, true} ->
|
||||||
?tp(buffer_worker_flush_ack_reflush, #{
|
?tp(buffer_worker_flush_ack_reflush, #{
|
||||||
|
@ -718,13 +745,14 @@ batch_reply_caller_defer_metrics(Id, BatchResult, Batch, QueryOpts) ->
|
||||||
{ShouldAck, PostFns} =
|
{ShouldAck, PostFns} =
|
||||||
lists:foldl(
|
lists:foldl(
|
||||||
fun(Reply, {_ShouldAck, PostFns}) ->
|
fun(Reply, {_ShouldAck, PostFns}) ->
|
||||||
|
%% _ShouldAck should be the same as ShouldAck starting from the second reply
|
||||||
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
|
||||||
{ShouldAck, [PostFn | PostFns]}
|
{ShouldAck, [PostFn | PostFns]}
|
||||||
end,
|
end,
|
||||||
{ack, []},
|
{ack, []},
|
||||||
Replies
|
Replies
|
||||||
),
|
),
|
||||||
PostFn = fun() -> lists:foreach(fun(F) -> F() end, PostFns) end,
|
PostFn = fun() -> lists:foreach(fun(F) -> F() end, lists:reverse(PostFns)) end,
|
||||||
{ShouldAck, PostFn}.
|
{ShouldAck, PostFn}.
|
||||||
|
|
||||||
reply_caller(Id, Reply, QueryOpts) ->
|
reply_caller(Id, Reply, QueryOpts) ->
|
||||||
|
@ -853,7 +881,7 @@ handle_async_worker_down(Data0, Pid) ->
|
||||||
{keep_state, Data}.
|
{keep_state, Data}.
|
||||||
|
|
||||||
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
||||||
?tp(call_query_enter, #{id => Id, query => Query}),
|
?tp(call_query_enter, #{id => Id, query => Query, query_mode => QM0}),
|
||||||
case emqx_resource_manager:ets_lookup(Id) of
|
case emqx_resource_manager:ets_lookup(Id) of
|
||||||
{ok, _Group, #{status := stopped}} ->
|
{ok, _Group, #{status := stopped}} ->
|
||||||
?RESOURCE_ERROR(stopped, "resource stopped or disabled");
|
?RESOURCE_ERROR(stopped, "resource stopped or disabled");
|
||||||
|
@ -919,7 +947,7 @@ apply_query_fun(async, Mod, Id, Index, Ref, ?QUERY(_, Request, _, _) = Query, Re
|
||||||
inflight_tid => InflightTID,
|
inflight_tid => InflightTID,
|
||||||
request_ref => Ref,
|
request_ref => Ref,
|
||||||
query_opts => QueryOpts,
|
query_opts => QueryOpts,
|
||||||
query => minimize(Query)
|
min_query => minimize(Query)
|
||||||
},
|
},
|
||||||
IsRetriable = false,
|
IsRetriable = false,
|
||||||
WorkerMRef = undefined,
|
WorkerMRef = undefined,
|
||||||
|
@ -952,7 +980,7 @@ apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _, _) | _] = Batch, Re
|
||||||
inflight_tid => InflightTID,
|
inflight_tid => InflightTID,
|
||||||
request_ref => Ref,
|
request_ref => Ref,
|
||||||
query_opts => QueryOpts,
|
query_opts => QueryOpts,
|
||||||
batch => minimize(Batch)
|
min_batch => minimize(Batch)
|
||||||
},
|
},
|
||||||
Requests = lists:map(
|
Requests = lists:map(
|
||||||
fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch
|
fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch
|
||||||
|
@ -968,27 +996,39 @@ apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _, _) | _] = Batch, Re
|
||||||
).
|
).
|
||||||
|
|
||||||
handle_async_reply(
|
handle_async_reply(
|
||||||
|
#{
|
||||||
|
request_ref := Ref,
|
||||||
|
inflight_tid := InflightTID,
|
||||||
|
query_opts := Opts
|
||||||
|
} = ReplyContext,
|
||||||
|
Result
|
||||||
|
) ->
|
||||||
|
case maybe_handle_unknown_async_reply(InflightTID, Ref, Opts) of
|
||||||
|
discard ->
|
||||||
|
ok;
|
||||||
|
continue ->
|
||||||
|
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_reply1(ReplyContext, Result))
|
||||||
|
end.
|
||||||
|
|
||||||
|
handle_async_reply1(
|
||||||
#{
|
#{
|
||||||
request_ref := Ref,
|
request_ref := Ref,
|
||||||
inflight_tid := InflightTID,
|
inflight_tid := InflightTID,
|
||||||
resource_id := Id,
|
resource_id := Id,
|
||||||
worker_index := Index,
|
worker_index := Index,
|
||||||
buffer_worker := Pid,
|
min_query := ?QUERY(_, _, _, ExpireAt) = _Query
|
||||||
query := ?QUERY(_, _, _, ExpireAt) = _Query
|
|
||||||
} = ReplyContext,
|
} = ReplyContext,
|
||||||
Result
|
Result
|
||||||
) ->
|
) ->
|
||||||
?tp(
|
?tp(
|
||||||
handle_async_reply_enter,
|
handle_async_reply_enter,
|
||||||
#{batch_or_query => [_Query], ref => Ref}
|
#{batch_or_query => [_Query], ref => Ref, result => Result}
|
||||||
),
|
),
|
||||||
Now = now_(),
|
Now = now_(),
|
||||||
case is_expired(ExpireAt, Now) of
|
case is_expired(ExpireAt, Now) of
|
||||||
true ->
|
true ->
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
|
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
|
||||||
?tp(handle_async_reply_expired, #{expired => [_Query]}),
|
?tp(handle_async_reply_expired, #{expired => [_Query]}),
|
||||||
ok;
|
ok;
|
||||||
false ->
|
false ->
|
||||||
|
@ -1003,7 +1043,7 @@ do_handle_async_reply(
|
||||||
worker_index := Index,
|
worker_index := Index,
|
||||||
buffer_worker := Pid,
|
buffer_worker := Pid,
|
||||||
inflight_tid := InflightTID,
|
inflight_tid := InflightTID,
|
||||||
query := ?QUERY(ReplyTo, _, Sent, _ExpireAt) = _Query
|
min_query := ?QUERY(ReplyTo, _, Sent, _ExpireAt) = _Query
|
||||||
},
|
},
|
||||||
Result
|
Result
|
||||||
) ->
|
) ->
|
||||||
|
@ -1020,46 +1060,95 @@ do_handle_async_reply(
|
||||||
ref => Ref,
|
ref => Ref,
|
||||||
result => Result
|
result => Result
|
||||||
}),
|
}),
|
||||||
|
|
||||||
case Action of
|
case Action of
|
||||||
nack ->
|
nack ->
|
||||||
%% Keep retrying.
|
%% Keep retrying.
|
||||||
mark_inflight_as_retriable(InflightTID, Ref),
|
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
?MODULE:block(Pid);
|
ok = ?MODULE:block(Pid),
|
||||||
|
blocked;
|
||||||
ack ->
|
ack ->
|
||||||
do_ack(InflightTID, Ref, Id, Index, PostFn, Pid, QueryOpts)
|
ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
handle_async_batch_reply(
|
handle_async_batch_reply(
|
||||||
#{
|
#{
|
||||||
buffer_worker := Pid,
|
|
||||||
resource_id := Id,
|
|
||||||
worker_index := Index,
|
|
||||||
inflight_tid := InflightTID,
|
inflight_tid := InflightTID,
|
||||||
request_ref := Ref,
|
request_ref := Ref,
|
||||||
batch := Batch
|
query_opts := Opts
|
||||||
|
} = ReplyContext,
|
||||||
|
Result
|
||||||
|
) ->
|
||||||
|
case maybe_handle_unknown_async_reply(InflightTID, Ref, Opts) of
|
||||||
|
discard ->
|
||||||
|
ok;
|
||||||
|
continue ->
|
||||||
|
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_batch_reply1(ReplyContext, Result))
|
||||||
|
end.
|
||||||
|
|
||||||
|
handle_async_batch_reply1(
|
||||||
|
#{
|
||||||
|
inflight_tid := InflightTID,
|
||||||
|
request_ref := Ref,
|
||||||
|
min_batch := Batch
|
||||||
} = ReplyContext,
|
} = ReplyContext,
|
||||||
Result
|
Result
|
||||||
) ->
|
) ->
|
||||||
?tp(
|
?tp(
|
||||||
handle_async_reply_enter,
|
handle_async_reply_enter,
|
||||||
#{batch_or_query => Batch, ref => Ref}
|
#{batch_or_query => Batch, ref => Ref, result => Result}
|
||||||
),
|
),
|
||||||
Now = now_(),
|
Now = now_(),
|
||||||
case sieve_expired_requests(Batch, Now) of
|
case sieve_expired_requests(Batch, Now) of
|
||||||
all_expired ->
|
{_NotExpired, []} ->
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
%% this is the critical code path,
|
||||||
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
%% we try not to do ets:lookup in this case
|
||||||
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
|
%% because the batch can be quite big
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
do_handle_async_batch_reply(ReplyContext, Result);
|
||||||
?tp(handle_async_reply_expired, #{expired => Batch}),
|
{_NotExpired, _Expired} ->
|
||||||
|
%% at least one is expired
|
||||||
|
%% the batch from reply context is minimized, so it cannot be used
|
||||||
|
%% to update the inflight items, hence discard Batch and lookup the RealBatch
|
||||||
|
?tp(handle_async_reply_expired, #{expired => _Expired}),
|
||||||
|
handle_async_batch_reply2(ets:lookup(InflightTID, Ref), ReplyContext, Result, Now)
|
||||||
|
end.
|
||||||
|
|
||||||
|
handle_async_batch_reply2([], _, _, _) ->
|
||||||
|
%% this usually should never happen unless the async callback is being evaluated concurrently
|
||||||
|
ok;
|
||||||
|
handle_async_batch_reply2([Inflight], ReplyContext, Result, Now) ->
|
||||||
|
?INFLIGHT_ITEM(_, RealBatch, _IsRetriable, _WorkerMRef) = Inflight,
|
||||||
|
#{
|
||||||
|
resource_id := Id,
|
||||||
|
worker_index := Index,
|
||||||
|
inflight_tid := InflightTID,
|
||||||
|
request_ref := Ref,
|
||||||
|
min_batch := Batch
|
||||||
|
} = ReplyContext,
|
||||||
|
%% All batch items share the same HasBeenSent flag
|
||||||
|
%% So we just take the original flag from the ReplyContext batch
|
||||||
|
%% and put it back to the batch found in inflight table
|
||||||
|
%% which must have already been set to `false`
|
||||||
|
[?QUERY(_ReplyTo, _, HasBeenSent, _ExpireAt) | _] = Batch,
|
||||||
|
{RealNotExpired0, RealExpired} = sieve_expired_requests(RealBatch, Now),
|
||||||
|
RealNotExpired =
|
||||||
|
lists:map(
|
||||||
|
fun(?QUERY(ReplyTo, CoreReq, _HasBeenSent, ExpireAt)) ->
|
||||||
|
?QUERY(ReplyTo, CoreReq, HasBeenSent, ExpireAt)
|
||||||
|
end,
|
||||||
|
RealNotExpired0
|
||||||
|
),
|
||||||
|
NumExpired = length(RealExpired),
|
||||||
|
emqx_resource_metrics:late_reply_inc(Id, NumExpired),
|
||||||
|
case RealNotExpired of
|
||||||
|
[] ->
|
||||||
|
%% all expired, no need to update back the inflight batch
|
||||||
|
_ = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
ok;
|
ok;
|
||||||
{NotExpired, Expired} ->
|
_ ->
|
||||||
NumExpired = length(Expired),
|
%% some queries are not expired, put them back to the inflight batch
|
||||||
emqx_resource_metrics:late_reply_inc(Id, NumExpired),
|
%% so it can be either acked now or retried later
|
||||||
NumExpired > 0 andalso
|
ok = update_inflight_item(InflightTID, Ref, RealNotExpired, NumExpired),
|
||||||
?tp(handle_async_reply_expired, #{expired => Expired}),
|
do_handle_async_batch_reply(ReplyContext#{min_batch := RealNotExpired}, Result)
|
||||||
do_handle_async_batch_reply(ReplyContext#{batch := NotExpired}, Result)
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
do_handle_async_batch_reply(
|
do_handle_async_batch_reply(
|
||||||
|
@ -1069,7 +1158,7 @@ do_handle_async_batch_reply(
|
||||||
worker_index := Index,
|
worker_index := Index,
|
||||||
inflight_tid := InflightTID,
|
inflight_tid := InflightTID,
|
||||||
request_ref := Ref,
|
request_ref := Ref,
|
||||||
batch := Batch,
|
min_batch := Batch,
|
||||||
query_opts := QueryOpts
|
query_opts := QueryOpts
|
||||||
},
|
},
|
||||||
Result
|
Result
|
||||||
|
@ -1084,14 +1173,14 @@ do_handle_async_batch_reply(
|
||||||
case Action of
|
case Action of
|
||||||
nack ->
|
nack ->
|
||||||
%% Keep retrying.
|
%% Keep retrying.
|
||||||
mark_inflight_as_retriable(InflightTID, Ref),
|
ok = mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
?MODULE:block(Pid);
|
ok = ?MODULE:block(Pid),
|
||||||
|
blocked;
|
||||||
ack ->
|
ack ->
|
||||||
do_ack(InflightTID, Ref, Id, Index, PostFn, Pid, QueryOpts)
|
ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
do_ack(InflightTID, Ref, Id, Index, PostFn, WorkerPid, QueryOpts) ->
|
do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts) ->
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
IsKnownRef = ack_inflight(InflightTID, Ref, Id, Index),
|
IsKnownRef = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
case maps:get(simple_query, QueryOpts, false) of
|
case maps:get(simple_query, QueryOpts, false) of
|
||||||
true ->
|
true ->
|
||||||
|
@ -1101,9 +1190,47 @@ do_ack(InflightTID, Ref, Id, Index, PostFn, WorkerPid, QueryOpts) ->
|
||||||
false ->
|
false ->
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(WorkerPid),
|
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = false) ->
|
||||||
|
%% inflight was not full before async reply is handled,
|
||||||
|
%% after it is handled, the inflight table must be even smaller
|
||||||
|
%% hance we can rely on the buffer worker's flush timer to trigger
|
||||||
|
%% the next flush
|
||||||
|
?tp(skip_flushing_worker, #{}),
|
||||||
|
ok;
|
||||||
|
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = true) ->
|
||||||
|
%% the inflight table was full before handling aync reply
|
||||||
|
?tp(do_flushing_worker, #{}),
|
||||||
|
ok = ?MODULE:flush_worker(self()).
|
||||||
|
|
||||||
|
%% check if the async reply is valid.
|
||||||
|
%% e.g. if a connector evaluates the callback more than once:
|
||||||
|
%% 1. If the request was previously deleted from inflight table due to
|
||||||
|
%% either succeeded previously or expired, this function logs a
|
||||||
|
%% warning message and returns 'discard' instruction.
|
||||||
|
%% 2. If the request was previously failed and now pending on a retry,
|
||||||
|
%% then this function will return 'continue' as there is no way to
|
||||||
|
%% tell if this reply is stae or not.
|
||||||
|
maybe_handle_unknown_async_reply(undefined, _Ref, #{simple_query := true}) ->
|
||||||
|
continue;
|
||||||
|
maybe_handle_unknown_async_reply(InflightTID, Ref, #{}) ->
|
||||||
|
try ets:member(InflightTID, Ref) of
|
||||||
|
true ->
|
||||||
|
continue;
|
||||||
|
false ->
|
||||||
|
?tp(
|
||||||
|
warning,
|
||||||
|
unknown_async_reply_discarded,
|
||||||
|
#{inflight_key => Ref}
|
||||||
|
),
|
||||||
|
discard
|
||||||
|
catch
|
||||||
|
error:badarg ->
|
||||||
|
%% shutdown ?
|
||||||
|
discard
|
||||||
|
end.
|
||||||
|
|
||||||
%%==============================================================================
|
%%==============================================================================
|
||||||
%% operations for queue
|
%% operations for queue
|
||||||
queue_item_marshaller(Bin) when is_binary(Bin) ->
|
queue_item_marshaller(Bin) when is_binary(Bin) ->
|
||||||
|
@ -1202,10 +1329,8 @@ inflight_get_first_retriable(InflightTID, Now) ->
|
||||||
{single, Ref, Query}
|
{single, Ref, Query}
|
||||||
end;
|
end;
|
||||||
{[{Ref, Batch = [_ | _]}], _Continuation} ->
|
{[{Ref, Batch = [_ | _]}], _Continuation} ->
|
||||||
%% batch is non-empty because we check that in
|
|
||||||
%% `sieve_expired_requests'.
|
|
||||||
case sieve_expired_requests(Batch, Now) of
|
case sieve_expired_requests(Batch, Now) of
|
||||||
all_expired ->
|
{[], _AllExpired} ->
|
||||||
{expired, Ref, Batch};
|
{expired, Ref, Batch};
|
||||||
{NotExpired, Expired} ->
|
{NotExpired, Expired} ->
|
||||||
{batch, Ref, NotExpired, Expired}
|
{batch, Ref, NotExpired, Expired}
|
||||||
|
@ -1218,10 +1343,10 @@ is_inflight_full(InflightTID) ->
|
||||||
[{_, MaxSize}] = ets:lookup(InflightTID, ?MAX_SIZE_REF),
|
[{_, MaxSize}] = ets:lookup(InflightTID, ?MAX_SIZE_REF),
|
||||||
%% we consider number of batches rather than number of messages
|
%% we consider number of batches rather than number of messages
|
||||||
%% because one batch request may hold several messages.
|
%% because one batch request may hold several messages.
|
||||||
Size = inflight_num_batches(InflightTID),
|
Size = inflight_count(InflightTID),
|
||||||
Size >= MaxSize.
|
Size >= MaxSize.
|
||||||
|
|
||||||
inflight_num_batches(InflightTID) ->
|
inflight_count(InflightTID) ->
|
||||||
case ets:info(InflightTID, size) of
|
case ets:info(InflightTID, size) of
|
||||||
undefined -> 0;
|
undefined -> 0;
|
||||||
Size -> max(0, Size - ?INFLIGHT_META_ROWS)
|
Size -> max(0, Size - ?INFLIGHT_META_ROWS)
|
||||||
|
@ -1243,7 +1368,7 @@ inflight_append(
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef),
|
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef),
|
||||||
IsNew = ets:insert_new(InflightTID, InflightItem),
|
IsNew = ets:insert_new(InflightTID, InflightItem),
|
||||||
BatchSize = length(Batch),
|
BatchSize = length(Batch),
|
||||||
IsNew andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, BatchSize}),
|
IsNew andalso inc_inflight(InflightTID, BatchSize),
|
||||||
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
||||||
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
|
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
|
||||||
ok;
|
ok;
|
||||||
|
@ -1258,7 +1383,7 @@ inflight_append(
|
||||||
Query = mark_as_sent(Query0),
|
Query = mark_as_sent(Query0),
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Query, IsRetriable, WorkerMRef),
|
InflightItem = ?INFLIGHT_ITEM(Ref, Query, IsRetriable, WorkerMRef),
|
||||||
IsNew = ets:insert_new(InflightTID, InflightItem),
|
IsNew = ets:insert_new(InflightTID, InflightItem),
|
||||||
IsNew andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, 1}),
|
IsNew andalso inc_inflight(InflightTID, 1),
|
||||||
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
||||||
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
|
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
|
||||||
ok;
|
ok;
|
||||||
|
@ -1274,6 +1399,8 @@ mark_inflight_as_retriable(undefined, _Ref) ->
|
||||||
ok;
|
ok;
|
||||||
mark_inflight_as_retriable(InflightTID, Ref) ->
|
mark_inflight_as_retriable(InflightTID, Ref) ->
|
||||||
_ = ets:update_element(InflightTID, Ref, {?RETRY_IDX, true}),
|
_ = ets:update_element(InflightTID, Ref, {?RETRY_IDX, true}),
|
||||||
|
%% the old worker's DOWN should not affect this inflight any more
|
||||||
|
_ = ets:update_element(InflightTID, Ref, {?WORKER_MREF_IDX, erased}),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%% Track each worker pid only once.
|
%% Track each worker pid only once.
|
||||||
|
@ -1317,13 +1444,18 @@ ack_inflight(InflightTID, Ref, Id, Index) ->
|
||||||
1;
|
1;
|
||||||
[?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch, _IsRetriable, _WorkerMRef)] ->
|
[?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch, _IsRetriable, _WorkerMRef)] ->
|
||||||
length(Batch);
|
length(Batch);
|
||||||
_ ->
|
[] ->
|
||||||
0
|
0
|
||||||
end,
|
end,
|
||||||
IsAcked = Count > 0,
|
ok = dec_inflight(InflightTID, Count),
|
||||||
IsAcked andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, -Count, 0, 0}),
|
IsKnownRef = (Count > 0),
|
||||||
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
|
case IsKnownRef of
|
||||||
IsAcked.
|
true ->
|
||||||
|
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID));
|
||||||
|
false ->
|
||||||
|
ok
|
||||||
|
end,
|
||||||
|
IsKnownRef.
|
||||||
|
|
||||||
mark_inflight_items_as_retriable(Data, WorkerMRef) ->
|
mark_inflight_items_as_retriable(Data, WorkerMRef) ->
|
||||||
#{inflight_tid := InflightTID} = Data,
|
#{inflight_tid := InflightTID} = Data,
|
||||||
|
@ -1341,9 +1473,18 @@ mark_inflight_items_as_retriable(Data, WorkerMRef) ->
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%% used to update a batch after dropping expired individual queries.
|
%% used to update a batch after dropping expired individual queries.
|
||||||
update_inflight_item(InflightTID, Ref, NewBatch) ->
|
update_inflight_item(InflightTID, Ref, NewBatch, NumExpired) ->
|
||||||
_ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}),
|
_ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}),
|
||||||
?tp(buffer_worker_worker_update_inflight_item, #{ref => Ref}),
|
ok = dec_inflight(InflightTID, NumExpired).
|
||||||
|
|
||||||
|
inc_inflight(InflightTID, Count) ->
|
||||||
|
_ = ets:update_counter(InflightTID, ?SIZE_REF, {2, Count}),
|
||||||
|
ok.
|
||||||
|
|
||||||
|
dec_inflight(_InflightTID, 0) ->
|
||||||
|
ok;
|
||||||
|
dec_inflight(InflightTID, Count) when Count > 0 ->
|
||||||
|
_ = ets:update_counter(InflightTID, ?SIZE_REF, {2, -Count, 0, 0}),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
%%==============================================================================
|
%%==============================================================================
|
||||||
|
@ -1453,22 +1594,12 @@ is_async_return(_) ->
|
||||||
false.
|
false.
|
||||||
|
|
||||||
sieve_expired_requests(Batch, Now) ->
|
sieve_expired_requests(Batch, Now) ->
|
||||||
{Expired, NotExpired} =
|
lists:partition(
|
||||||
lists:partition(
|
fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)) ->
|
||||||
fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)) ->
|
not is_expired(ExpireAt, Now)
|
||||||
is_expired(ExpireAt, Now)
|
end,
|
||||||
end,
|
Batch
|
||||||
Batch
|
).
|
||||||
),
|
|
||||||
case {NotExpired, Expired} of
|
|
||||||
{[], []} ->
|
|
||||||
%% Should be impossible for batch_size >= 1.
|
|
||||||
all_expired;
|
|
||||||
{[], [_ | _]} ->
|
|
||||||
all_expired;
|
|
||||||
{[_ | _], _} ->
|
|
||||||
{NotExpired, Expired}
|
|
||||||
end.
|
|
||||||
|
|
||||||
-spec is_expired(infinity | integer(), integer()) -> boolean().
|
-spec is_expired(infinity | integer(), integer()) -> boolean().
|
||||||
is_expired(infinity = _ExpireAt, _Now) ->
|
is_expired(infinity = _ExpireAt, _Now) ->
|
||||||
|
|
|
@ -135,11 +135,11 @@ on_query(_InstId, get_counter, #{pid := Pid}) ->
|
||||||
after 1000 ->
|
after 1000 ->
|
||||||
{error, timeout}
|
{error, timeout}
|
||||||
end;
|
end;
|
||||||
on_query(_InstId, {sleep, For}, #{pid := Pid}) ->
|
on_query(_InstId, {sleep_before_reply, For}, #{pid := Pid}) ->
|
||||||
?tp(connector_demo_sleep, #{mode => sync, for => For}),
|
?tp(connector_demo_sleep, #{mode => sync, for => For}),
|
||||||
ReqRef = make_ref(),
|
ReqRef = make_ref(),
|
||||||
From = {self(), ReqRef},
|
From = {self(), ReqRef},
|
||||||
Pid ! {From, {sleep, For}},
|
Pid ! {From, {sleep_before_reply, For}},
|
||||||
receive
|
receive
|
||||||
{ReqRef, Result} ->
|
{ReqRef, Result} ->
|
||||||
Result
|
Result
|
||||||
|
@ -159,9 +159,9 @@ on_query_async(_InstId, block_now, ReplyFun, #{pid := Pid}) ->
|
||||||
on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) ->
|
on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) ->
|
||||||
Pid ! {big_payload, Payload, ReplyFun},
|
Pid ! {big_payload, Payload, ReplyFun},
|
||||||
{ok, Pid};
|
{ok, Pid};
|
||||||
on_query_async(_InstId, {sleep, For}, ReplyFun, #{pid := Pid}) ->
|
on_query_async(_InstId, {sleep_before_reply, For}, ReplyFun, #{pid := Pid}) ->
|
||||||
?tp(connector_demo_sleep, #{mode => async, for => For}),
|
?tp(connector_demo_sleep, #{mode => async, for => For}),
|
||||||
Pid ! {{sleep, For}, ReplyFun},
|
Pid ! {{sleep_before_reply, For}, ReplyFun},
|
||||||
{ok, Pid}.
|
{ok, Pid}.
|
||||||
|
|
||||||
on_batch_query(InstId, BatchReq, State) ->
|
on_batch_query(InstId, BatchReq, State) ->
|
||||||
|
@ -173,10 +173,13 @@ on_batch_query(InstId, BatchReq, State) ->
|
||||||
get_counter ->
|
get_counter ->
|
||||||
batch_get_counter(sync, InstId, State);
|
batch_get_counter(sync, InstId, State);
|
||||||
{big_payload, _Payload} ->
|
{big_payload, _Payload} ->
|
||||||
batch_big_payload(sync, InstId, BatchReq, State)
|
batch_big_payload(sync, InstId, BatchReq, State);
|
||||||
|
{random_reply, Num} ->
|
||||||
|
%% async batch retried
|
||||||
|
make_random_reply(Num)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, State) ->
|
on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, #{pid := Pid} = State) ->
|
||||||
%% Requests can be of multiple types, but cannot be mixed.
|
%% Requests can be of multiple types, but cannot be mixed.
|
||||||
case hd(BatchReq) of
|
case hd(BatchReq) of
|
||||||
{inc_counter, _} ->
|
{inc_counter, _} ->
|
||||||
|
@ -186,7 +189,11 @@ on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, State) ->
|
||||||
block_now ->
|
block_now ->
|
||||||
on_query_async(InstId, block_now, ReplyFunAndArgs, State);
|
on_query_async(InstId, block_now, ReplyFunAndArgs, State);
|
||||||
{big_payload, _Payload} ->
|
{big_payload, _Payload} ->
|
||||||
batch_big_payload({async, ReplyFunAndArgs}, InstId, BatchReq, State)
|
batch_big_payload({async, ReplyFunAndArgs}, InstId, BatchReq, State);
|
||||||
|
{random_reply, Num} ->
|
||||||
|
%% only take the first Num in the batch should be random enough
|
||||||
|
Pid ! {{random_reply, Num}, ReplyFunAndArgs},
|
||||||
|
{ok, Pid}
|
||||||
end.
|
end.
|
||||||
|
|
||||||
batch_inc_counter(CallMode, InstId, BatchReq, State) ->
|
batch_inc_counter(CallMode, InstId, BatchReq, State) ->
|
||||||
|
@ -299,16 +306,33 @@ counter_loop(
|
||||||
{{FromPid, ReqRef}, get} ->
|
{{FromPid, ReqRef}, get} ->
|
||||||
FromPid ! {ReqRef, Num},
|
FromPid ! {ReqRef, Num},
|
||||||
State;
|
State;
|
||||||
{{sleep, _} = SleepQ, ReplyFun} ->
|
{{random_reply, RandNum}, ReplyFun} ->
|
||||||
|
%% usually a behaving connector should reply once and only once for
|
||||||
|
%% each (batch) request
|
||||||
|
%% but we try to reply random results a random number of times
|
||||||
|
%% with 'ok' in the result, the buffer worker should eventually
|
||||||
|
%% drain the buffer (and inflights table)
|
||||||
|
ReplyCount = 1 + (RandNum rem 3),
|
||||||
|
Results = make_random_replies(ReplyCount),
|
||||||
|
%% add a delay to trigger inflight full
|
||||||
|
lists:foreach(
|
||||||
|
fun(Result) ->
|
||||||
|
timer:sleep(rand:uniform(5)),
|
||||||
|
apply_reply(ReplyFun, Result)
|
||||||
|
end,
|
||||||
|
Results
|
||||||
|
),
|
||||||
|
State;
|
||||||
|
{{sleep_before_reply, _} = SleepQ, ReplyFun} ->
|
||||||
apply_reply(ReplyFun, handle_query(async, SleepQ, Status)),
|
apply_reply(ReplyFun, handle_query(async, SleepQ, Status)),
|
||||||
State;
|
State;
|
||||||
{{FromPid, ReqRef}, {sleep, _} = SleepQ} ->
|
{{FromPid, ReqRef}, {sleep_before_reply, _} = SleepQ} ->
|
||||||
FromPid ! {ReqRef, handle_query(sync, SleepQ, Status)},
|
FromPid ! {ReqRef, handle_query(sync, SleepQ, Status)},
|
||||||
State
|
State
|
||||||
end,
|
end,
|
||||||
counter_loop(NewState).
|
counter_loop(NewState).
|
||||||
|
|
||||||
handle_query(Mode, {sleep, For} = Query, Status) ->
|
handle_query(Mode, {sleep_before_reply, For} = Query, Status) ->
|
||||||
ok = timer:sleep(For),
|
ok = timer:sleep(For),
|
||||||
Result =
|
Result =
|
||||||
case Status of
|
case Status of
|
||||||
|
@ -329,3 +353,18 @@ maybe_register(_Name, _Pid, false) ->
|
||||||
|
|
||||||
apply_reply({ReplyFun, Args}, Result) when is_function(ReplyFun) ->
|
apply_reply({ReplyFun, Args}, Result) when is_function(ReplyFun) ->
|
||||||
apply(ReplyFun, Args ++ [Result]).
|
apply(ReplyFun, Args ++ [Result]).
|
||||||
|
|
||||||
|
make_random_replies(0) ->
|
||||||
|
[];
|
||||||
|
make_random_replies(N) ->
|
||||||
|
[make_random_reply(N) | make_random_replies(N - 1)].
|
||||||
|
|
||||||
|
make_random_reply(N) ->
|
||||||
|
case rand:uniform(3) of
|
||||||
|
1 ->
|
||||||
|
{ok, N};
|
||||||
|
2 ->
|
||||||
|
{error, {recoverable_error, N}};
|
||||||
|
3 ->
|
||||||
|
{error, {unrecoverable_error, N}}
|
||||||
|
end.
|
||||||
|
|
|
@ -1482,7 +1482,7 @@ t_retry_async_inflight_full(_Config) ->
|
||||||
AsyncInflightWindow * 2,
|
AsyncInflightWindow * 2,
|
||||||
fun() ->
|
fun() ->
|
||||||
For = (ResumeInterval div 4) + rand:uniform(ResumeInterval div 4),
|
For = (ResumeInterval div 4) + rand:uniform(ResumeInterval div 4),
|
||||||
{sleep, For}
|
{sleep_before_reply, For}
|
||||||
end,
|
end,
|
||||||
#{async_reply_fun => {fun(Res) -> ct:pal("Res = ~p", [Res]) end, []}}
|
#{async_reply_fun => {fun(Res) -> ct:pal("Res = ~p", [Res]) end, []}}
|
||||||
),
|
),
|
||||||
|
@ -1507,6 +1507,59 @@ t_retry_async_inflight_full(_Config) ->
|
||||||
?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)),
|
?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
%% this test case is to ensure the buffer worker will not go crazy even
|
||||||
|
%% if the underlying connector is misbehaving: evaluate async callbacks multiple times
|
||||||
|
t_async_reply_multi_eval(_Config) ->
|
||||||
|
ResumeInterval = 5,
|
||||||
|
TotalTime = 5_000,
|
||||||
|
AsyncInflightWindow = 3,
|
||||||
|
TotalQueries = AsyncInflightWindow * 5,
|
||||||
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
|
{ok, _} = emqx_resource:create(
|
||||||
|
?ID,
|
||||||
|
?DEFAULT_RESOURCE_GROUP,
|
||||||
|
?TEST_RESOURCE,
|
||||||
|
#{name => ?FUNCTION_NAME},
|
||||||
|
#{
|
||||||
|
query_mode => async,
|
||||||
|
async_inflight_window => AsyncInflightWindow,
|
||||||
|
batch_size => 3,
|
||||||
|
batch_time => 10,
|
||||||
|
worker_pool_size => 1,
|
||||||
|
resume_interval => ResumeInterval
|
||||||
|
}
|
||||||
|
),
|
||||||
|
%% block
|
||||||
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
|
inc_counter_in_parallel(
|
||||||
|
TotalQueries,
|
||||||
|
fun() ->
|
||||||
|
Rand = rand:uniform(1000),
|
||||||
|
{random_reply, Rand}
|
||||||
|
end,
|
||||||
|
#{}
|
||||||
|
),
|
||||||
|
?retry(
|
||||||
|
ResumeInterval,
|
||||||
|
TotalTime div ResumeInterval,
|
||||||
|
begin
|
||||||
|
Metrics = tap_metrics(?LINE),
|
||||||
|
#{
|
||||||
|
counters := Counters,
|
||||||
|
gauges := #{queuing := 0, inflight := 0}
|
||||||
|
} = Metrics,
|
||||||
|
#{
|
||||||
|
matched := Matched,
|
||||||
|
success := Success,
|
||||||
|
dropped := Dropped,
|
||||||
|
late_reply := LateReply,
|
||||||
|
failed := Failed
|
||||||
|
} = Counters,
|
||||||
|
?assertEqual(TotalQueries, Matched - 1),
|
||||||
|
?assertEqual(Matched, Success + Dropped + LateReply + Failed)
|
||||||
|
end
|
||||||
|
).
|
||||||
|
|
||||||
t_retry_async_inflight_batch(_Config) ->
|
t_retry_async_inflight_batch(_Config) ->
|
||||||
ResumeInterval = 1_000,
|
ResumeInterval = 1_000,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
|
@ -1944,7 +1997,7 @@ t_expiration_async_batch_after_reply(_Config) ->
|
||||||
#{name => test_resource},
|
#{name => test_resource},
|
||||||
#{
|
#{
|
||||||
query_mode => async,
|
query_mode => async,
|
||||||
batch_size => 2,
|
batch_size => 3,
|
||||||
batch_time => 100,
|
batch_time => 100,
|
||||||
worker_pool_size => 1,
|
worker_pool_size => 1,
|
||||||
resume_interval => 2_000
|
resume_interval => 2_000
|
||||||
|
@ -1959,7 +2012,7 @@ do_t_expiration_async_after_reply(IsBatch) ->
|
||||||
NAcks =
|
NAcks =
|
||||||
case IsBatch of
|
case IsBatch of
|
||||||
batch -> 1;
|
batch -> 1;
|
||||||
single -> 2
|
single -> 3
|
||||||
end,
|
end,
|
||||||
?force_ordering(
|
?force_ordering(
|
||||||
#{?snk_kind := buffer_worker_flush_ack},
|
#{?snk_kind := buffer_worker_flush_ack},
|
||||||
|
@ -1980,6 +2033,10 @@ do_t_expiration_async_after_reply(IsBatch) ->
|
||||||
ok,
|
ok,
|
||||||
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
|
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
|
||||||
),
|
),
|
||||||
|
?assertEqual(
|
||||||
|
ok,
|
||||||
|
emqx_resource:query(?ID, {inc_counter, 299}, #{timeout => TimeoutMS})
|
||||||
|
),
|
||||||
?assertEqual(
|
?assertEqual(
|
||||||
ok, emqx_resource:query(?ID, {inc_counter, 99}, #{timeout => infinity})
|
ok, emqx_resource:query(?ID, {inc_counter, 99}, #{timeout => infinity})
|
||||||
),
|
),
|
||||||
|
@ -1997,30 +2054,44 @@ do_t_expiration_async_after_reply(IsBatch) ->
|
||||||
{ok, _} = ?block_until(
|
{ok, _} = ?block_until(
|
||||||
#{?snk_kind := handle_async_reply_expired}, 10 * TimeoutMS
|
#{?snk_kind := handle_async_reply_expired}, 10 * TimeoutMS
|
||||||
),
|
),
|
||||||
|
wait_telemetry_event(success, #{n_events => 1, timeout => 4_000}),
|
||||||
|
|
||||||
unlink(Pid0),
|
unlink(Pid0),
|
||||||
exit(Pid0, kill),
|
exit(Pid0, kill),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
?assertMatch(
|
case IsBatch of
|
||||||
[
|
batch ->
|
||||||
#{
|
?assertMatch(
|
||||||
expired := [{query, _, {inc_counter, 199}, _, _}]
|
[
|
||||||
}
|
#{
|
||||||
],
|
expired := [
|
||||||
?of_kind(handle_async_reply_expired, Trace)
|
{query, _, {inc_counter, 199}, _, _},
|
||||||
),
|
{query, _, {inc_counter, 299}, _, _}
|
||||||
wait_telemetry_event(success, #{n_events => 1, timeout => 4_000}),
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
?of_kind(handle_async_reply_expired, Trace)
|
||||||
|
);
|
||||||
|
single ->
|
||||||
|
?assertMatch(
|
||||||
|
[
|
||||||
|
#{expired := [{query, _, {inc_counter, 199}, _, _}]},
|
||||||
|
#{expired := [{query, _, {inc_counter, 299}, _, _}]}
|
||||||
|
],
|
||||||
|
?of_kind(handle_async_reply_expired, Trace)
|
||||||
|
)
|
||||||
|
end,
|
||||||
Metrics = tap_metrics(?LINE),
|
Metrics = tap_metrics(?LINE),
|
||||||
?assertMatch(
|
?assertMatch(
|
||||||
#{
|
#{
|
||||||
counters := #{
|
counters := #{
|
||||||
matched := 2,
|
matched := 3,
|
||||||
%% the request with infinity timeout.
|
%% the request with infinity timeout.
|
||||||
success := 1,
|
success := 1,
|
||||||
dropped := 0,
|
dropped := 0,
|
||||||
late_reply := 1,
|
late_reply := 2,
|
||||||
retried := 0,
|
retried := 0,
|
||||||
failed := 0
|
failed := 0
|
||||||
}
|
}
|
||||||
|
@ -2042,7 +2113,7 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
|
||||||
#{name => test_resource},
|
#{name => test_resource},
|
||||||
#{
|
#{
|
||||||
query_mode => async,
|
query_mode => async,
|
||||||
batch_size => 2,
|
batch_size => 3,
|
||||||
batch_time => 100,
|
batch_time => 100,
|
||||||
worker_pool_size => 1,
|
worker_pool_size => 1,
|
||||||
resume_interval => ResumeInterval
|
resume_interval => ResumeInterval
|
||||||
|
@ -2067,6 +2138,10 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
|
||||||
ok,
|
ok,
|
||||||
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
|
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
|
||||||
),
|
),
|
||||||
|
?assertEqual(
|
||||||
|
ok,
|
||||||
|
emqx_resource:query(?ID, {inc_counter, 299}, #{timeout => TimeoutMS})
|
||||||
|
),
|
||||||
Pid0 =
|
Pid0 =
|
||||||
spawn_link(fun() ->
|
spawn_link(fun() ->
|
||||||
?tp(delay_enter, #{}),
|
?tp(delay_enter, #{}),
|
||||||
|
@ -2087,7 +2162,10 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
|
||||||
?assertMatch(
|
?assertMatch(
|
||||||
[
|
[
|
||||||
#{
|
#{
|
||||||
expired := [{query, _, {inc_counter, 199}, _, _}]
|
expired := [
|
||||||
|
{query, _, {inc_counter, 199}, _, _},
|
||||||
|
{query, _, {inc_counter, 299}, _, _}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
?of_kind(handle_async_reply_expired, Trace)
|
?of_kind(handle_async_reply_expired, Trace)
|
||||||
|
@ -2096,12 +2174,16 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
|
||||||
?assertMatch(
|
?assertMatch(
|
||||||
#{
|
#{
|
||||||
counters := #{
|
counters := #{
|
||||||
matched := 1,
|
matched := 2,
|
||||||
success := 0,
|
success := 0,
|
||||||
dropped := 0,
|
dropped := 0,
|
||||||
late_reply := 1,
|
late_reply := 2,
|
||||||
retried := 0,
|
retried := 0,
|
||||||
failed := 0
|
failed := 0
|
||||||
|
},
|
||||||
|
gauges := #{
|
||||||
|
inflight := 0,
|
||||||
|
queuing := 0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Metrics
|
Metrics
|
||||||
|
@ -2217,6 +2299,16 @@ do_t_expiration_retry(IsBatch) ->
|
||||||
[#{expired := [{query, _, {inc_counter, 1}, _, _}]}],
|
[#{expired := [{query, _, {inc_counter, 1}, _, _}]}],
|
||||||
?of_kind(buffer_worker_retry_expired, Trace)
|
?of_kind(buffer_worker_retry_expired, Trace)
|
||||||
),
|
),
|
||||||
|
Metrics = tap_metrics(?LINE),
|
||||||
|
?assertMatch(
|
||||||
|
#{
|
||||||
|
gauges := #{
|
||||||
|
inflight := 0,
|
||||||
|
queuing := 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Metrics
|
||||||
|
),
|
||||||
ok
|
ok
|
||||||
end
|
end
|
||||||
),
|
),
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Fix bridge metrics when running in async mode with batching enabled (`batch_size` > 1).
|
|
@ -0,0 +1 @@
|
||||||
|
修复使用异步和批量配置的桥接计数不准确的问题。
|
|
@ -268,7 +268,7 @@ kafka_bridge_rest_api_helper(Config) ->
|
||||||
CreateBodyTmp = #{
|
CreateBodyTmp = #{
|
||||||
<<"type">> => <<"kafka">>,
|
<<"type">> => <<"kafka">>,
|
||||||
<<"name">> => <<"my_kafka_bridge">>,
|
<<"name">> => <<"my_kafka_bridge">>,
|
||||||
<<"bootstrap_hosts">> => maps:get(<<"bootstrap_hosts">>, Config),
|
<<"bootstrap_hosts">> => iolist_to_binary(maps:get(<<"bootstrap_hosts">>, Config)),
|
||||||
<<"enable">> => true,
|
<<"enable">> => true,
|
||||||
<<"authentication">> => maps:get(<<"authentication">>, Config),
|
<<"authentication">> => maps:get(<<"authentication">>, Config),
|
||||||
<<"producer">> => #{
|
<<"producer">> => #{
|
||||||
|
@ -276,7 +276,7 @@ kafka_bridge_rest_api_helper(Config) ->
|
||||||
topic => <<"t/#">>
|
topic => <<"t/#">>
|
||||||
},
|
},
|
||||||
<<"kafka">> => #{
|
<<"kafka">> => #{
|
||||||
<<"topic">> => erlang:list_to_binary(KafkaTopic),
|
<<"topic">> => iolist_to_binary(KafkaTopic),
|
||||||
<<"buffer">> => #{
|
<<"buffer">> => #{
|
||||||
<<"memory_overload_protection">> => <<"false">>
|
<<"memory_overload_protection">> => <<"false">>
|
||||||
},
|
},
|
||||||
|
|
|
@ -21,6 +21,12 @@ help() {
|
||||||
echo " otherwise it runs the entire app's CT"
|
echo " otherwise it runs the entire app's CT"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if command -v docker-compose; then
|
||||||
|
DC='docker-compose'
|
||||||
|
else
|
||||||
|
DC='docker compose'
|
||||||
|
fi
|
||||||
|
|
||||||
WHICH_APP='novalue'
|
WHICH_APP='novalue'
|
||||||
CONSOLE='no'
|
CONSOLE='no'
|
||||||
KEEP_UP='no'
|
KEEP_UP='no'
|
||||||
|
@ -201,7 +207,7 @@ if [ "$STOP" = 'no' ]; then
|
||||||
# some left-over log file has to be deleted before a new docker-compose up
|
# some left-over log file has to be deleted before a new docker-compose up
|
||||||
rm -f '.ci/docker-compose-file/redis/*.log'
|
rm -f '.ci/docker-compose-file/redis/*.log'
|
||||||
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
||||||
docker compose $F_OPTIONS up -d --build --remove-orphans
|
$DC $F_OPTIONS up -d --build --remove-orphans
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Fixing file owners and permissions for $UID_GID"
|
echo "Fixing file owners and permissions for $UID_GID"
|
||||||
|
@ -218,7 +224,7 @@ set +e
|
||||||
|
|
||||||
if [ "$STOP" = 'yes' ]; then
|
if [ "$STOP" = 'yes' ]; then
|
||||||
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
||||||
docker compose $F_OPTIONS down --remove-orphans
|
$DC $F_OPTIONS down --remove-orphans
|
||||||
elif [ "$ATTACH" = 'yes' ]; then
|
elif [ "$ATTACH" = 'yes' ]; then
|
||||||
docker exec -it "$ERLANG_CONTAINER" bash
|
docker exec -it "$ERLANG_CONTAINER" bash
|
||||||
elif [ "$CONSOLE" = 'yes' ]; then
|
elif [ "$CONSOLE" = 'yes' ]; then
|
||||||
|
@ -235,11 +241,11 @@ else
|
||||||
LOG='_build/test/logs/docker-compose.log'
|
LOG='_build/test/logs/docker-compose.log'
|
||||||
echo "Dumping docker-compose log to $LOG"
|
echo "Dumping docker-compose log to $LOG"
|
||||||
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
||||||
docker compose $F_OPTIONS logs --no-color --timestamps > "$LOG"
|
$DC $F_OPTIONS logs --no-color --timestamps > "$LOG"
|
||||||
fi
|
fi
|
||||||
if [ "$KEEP_UP" != 'yes' ]; then
|
if [ "$KEEP_UP" != 'yes' ]; then
|
||||||
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
# shellcheck disable=2086 # no quotes for F_OPTIONS
|
||||||
docker compose $F_OPTIONS down
|
$DC $F_OPTIONS down
|
||||||
fi
|
fi
|
||||||
exit $RESULT
|
exit $RESULT
|
||||||
fi
|
fi
|
||||||
|
|
Loading…
Reference in New Issue