Merge pull request #10020 from zmstone/0222-fix-bridge-async-mode-counters

fix(bridge): fix dropped counter and inflight gauge
This commit is contained in:
Zaiming (Stone) Shi 2023-02-24 19:51:24 +01:00 committed by GitHub
commit 79bf77c2f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 397 additions and 127 deletions

View File

@ -2,7 +2,7 @@ version: '3.9'
services: services:
zookeeper: zookeeper:
image: wurstmeister/zookeeper image: docker.io/library/zookeeper:3.6
ports: ports:
- "2181:2181" - "2181:2181"
container_name: zookeeper container_name: zookeeper

View File

@ -720,4 +720,4 @@ pub_props_to_packet(Properties) ->
safe_filename(Filename) when is_binary(Filename) -> safe_filename(Filename) when is_binary(Filename) ->
binary:replace(Filename, <<":">>, <<"-">>, [global]); binary:replace(Filename, <<":">>, <<"-">>, [global]);
safe_filename(Filename) when is_list(Filename) -> safe_filename(Filename) when is_list(Filename) ->
string:replace(Filename, ":", "-", all). lists:flatten(string:replace(Filename, ":", "-", all)).

View File

@ -70,6 +70,18 @@
-define(RETRY_IDX, 3). -define(RETRY_IDX, 3).
-define(WORKER_MREF_IDX, 4). -define(WORKER_MREF_IDX, 4).
-define(ENSURE_ASYNC_FLUSH(InflightTID, EXPR),
(fun() ->
IsFullBefore = is_inflight_full(InflightTID),
case (EXPR) of
blocked ->
ok;
ok ->
ok = maybe_flush_after_async_reply(IsFullBefore)
end
end)()
).
-type id() :: binary(). -type id() :: binary().
-type index() :: pos_integer(). -type index() :: pos_integer().
-type expire_at() :: infinity | integer(). -type expire_at() :: infinity | integer().
@ -97,6 +109,7 @@ start_link(Id, Index, Opts) ->
-spec sync_query(id(), request(), query_opts()) -> Result :: term(). -spec sync_query(id(), request(), query_opts()) -> Result :: term().
sync_query(Id, Request, Opts0) -> sync_query(Id, Request, Opts0) ->
?tp(sync_query, #{id => Id, request => Request, query_opts => Opts0}),
Opts1 = ensure_timeout_query_opts(Opts0, sync), Opts1 = ensure_timeout_query_opts(Opts0, sync),
Opts = ensure_expire_at(Opts1), Opts = ensure_expire_at(Opts1),
PickKey = maps:get(pick_key, Opts, self()), PickKey = maps:get(pick_key, Opts, self()),
@ -106,6 +119,7 @@ sync_query(Id, Request, Opts0) ->
-spec async_query(id(), request(), query_opts()) -> Result :: term(). -spec async_query(id(), request(), query_opts()) -> Result :: term().
async_query(Id, Request, Opts0) -> async_query(Id, Request, Opts0) ->
?tp(async_query, #{id => Id, request => Request, query_opts => Opts0}),
Opts1 = ensure_timeout_query_opts(Opts0, async), Opts1 = ensure_timeout_query_opts(Opts0, async),
Opts = ensure_expire_at(Opts1), Opts = ensure_expire_at(Opts1),
PickKey = maps:get(pick_key, Opts, self()), PickKey = maps:get(pick_key, Opts, self()),
@ -121,6 +135,7 @@ simple_sync_query(Id, Request) ->
%% call ends up calling buffering functions, that's a bug and %% call ends up calling buffering functions, that's a bug and
%% would mess up the metrics anyway. `undefined' is ignored by %% would mess up the metrics anyway. `undefined' is ignored by
%% `emqx_resource_metrics:*_shift/3'. %% `emqx_resource_metrics:*_shift/3'.
?tp(simple_sync_query, #{id => Id, request => Request}),
Index = undefined, Index = undefined,
QueryOpts = simple_query_opts(), QueryOpts = simple_query_opts(),
emqx_resource_metrics:matched_inc(Id), emqx_resource_metrics:matched_inc(Id),
@ -132,6 +147,7 @@ simple_sync_query(Id, Request) ->
%% simple async-query the resource without batching and queuing. %% simple async-query the resource without batching and queuing.
-spec simple_async_query(id(), request(), query_opts()) -> term(). -spec simple_async_query(id(), request(), query_opts()) -> term().
simple_async_query(Id, Request, QueryOpts0) -> simple_async_query(Id, Request, QueryOpts0) ->
?tp(simple_async_query, #{id => Id, request => Request, query_opts => QueryOpts0}),
Index = undefined, Index = undefined,
QueryOpts = maps:merge(simple_query_opts(), QueryOpts0), QueryOpts = maps:merge(simple_query_opts(), QueryOpts0),
emqx_resource_metrics:matched_inc(Id), emqx_resource_metrics:matched_inc(Id),
@ -194,8 +210,8 @@ init({Id, Index, Opts}) ->
?tp(buffer_worker_init, #{id => Id, index => Index}), ?tp(buffer_worker_init, #{id => Id, index => Index}),
{ok, running, Data}. {ok, running, Data}.
running(enter, _, Data) -> running(enter, _, #{tref := _Tref} = Data) ->
?tp(buffer_worker_enter_running, #{id => maps:get(id, Data)}), ?tp(buffer_worker_enter_running, #{id => maps:get(id, Data), tref => _Tref}),
%% According to `gen_statem' laws, we mustn't call `maybe_flush' %% According to `gen_statem' laws, we mustn't call `maybe_flush'
%% directly because it may decide to return `{next_state, blocked, _}', %% directly because it may decide to return `{next_state, blocked, _}',
%% and that's an invalid response for a state enter call. %% and that's an invalid response for a state enter call.
@ -212,9 +228,8 @@ running(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data) ->
handle_query_requests(Request0, Data); handle_query_requests(Request0, Data);
running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) -> running(info, {flush, Ref}, St = #{tref := {_TRef, Ref}}) ->
flush(St#{tref := undefined}); flush(St#{tref := undefined});
running(internal, flush, St) ->
flush(St);
running(info, {flush, _Ref}, _St) -> running(info, {flush, _Ref}, _St) ->
?tp(discarded_stale_flush, #{}),
keep_state_and_data; keep_state_and_data;
running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when running(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
is_map_key(Pid, AsyncWorkers0) is_map_key(Pid, AsyncWorkers0)
@ -225,21 +240,24 @@ running(info, Info, _St) ->
?SLOG(error, #{msg => unexpected_msg, state => running, info => Info}), ?SLOG(error, #{msg => unexpected_msg, state => running, info => Info}),
keep_state_and_data. keep_state_and_data.
blocked(enter, _, #{resume_interval := ResumeT} = _St) -> blocked(enter, _, #{resume_interval := ResumeT} = St0) ->
?tp(buffer_worker_enter_blocked, #{}), ?tp(buffer_worker_enter_blocked, #{}),
{keep_state_and_data, {state_timeout, ResumeT, unblock}}; %% discard the old timer, new timer will be started when entering running state again
St = cancel_flush_timer(St0),
{keep_state, St, {state_timeout, ResumeT, unblock}};
blocked(cast, block, _St) -> blocked(cast, block, _St) ->
keep_state_and_data; keep_state_and_data;
blocked(cast, resume, St) -> blocked(cast, resume, St) ->
resume_from_blocked(St); resume_from_blocked(St);
blocked(cast, flush, Data) -> blocked(cast, flush, St) ->
resume_from_blocked(Data); resume_from_blocked(St);
blocked(state_timeout, unblock, St) -> blocked(state_timeout, unblock, St) ->
resume_from_blocked(St); resume_from_blocked(St);
blocked(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data0) -> blocked(info, ?SEND_REQ(_ReplyTo, _Req) = Request0, Data0) ->
Data = collect_and_enqueue_query_requests(Request0, Data0), Data = collect_and_enqueue_query_requests(Request0, Data0),
{keep_state, Data}; {keep_state, Data};
blocked(info, {flush, _Ref}, _Data) -> blocked(info, {flush, _Ref}, _Data) ->
%% ignore stale timer
keep_state_and_data; keep_state_and_data;
blocked(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when blocked(info, {'DOWN', _MRef, process, Pid, Reason}, Data0 = #{async_workers := AsyncWorkers0}) when
is_map_key(Pid, AsyncWorkers0) is_map_key(Pid, AsyncWorkers0)
@ -335,11 +353,13 @@ resume_from_blocked(Data) ->
%% We retry msgs in inflight window sync, as if we send them %% We retry msgs in inflight window sync, as if we send them
%% async, they will be appended to the end of inflight window again. %% async, they will be appended to the end of inflight window again.
retry_inflight_sync(Ref, Query, Data); retry_inflight_sync(Ref, Query, Data);
{batch, Ref, NotExpired, []} ->
retry_inflight_sync(Ref, NotExpired, Data);
{batch, Ref, NotExpired, Expired} -> {batch, Ref, NotExpired, Expired} ->
update_inflight_item(InflightTID, Ref, NotExpired),
NumExpired = length(Expired), NumExpired = length(Expired),
ok = update_inflight_item(InflightTID, Ref, NotExpired, NumExpired),
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired), emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
NumExpired > 0 andalso ?tp(buffer_worker_retry_expired, #{expired => Expired}), ?tp(buffer_worker_retry_expired, #{expired => Expired}),
%% We retry msgs in inflight window sync, as if we send them %% We retry msgs in inflight window sync, as if we send them
%% async, they will be appended to the end of inflight window again. %% async, they will be appended to the end of inflight window again.
retry_inflight_sync(Ref, NotExpired, Data) retry_inflight_sync(Ref, NotExpired, Data)
@ -470,9 +490,14 @@ flush(Data0) ->
Data1 = cancel_flush_timer(Data0), Data1 = cancel_flush_timer(Data0),
CurrentCount = queue_count(Q0), CurrentCount = queue_count(Q0),
IsFull = is_inflight_full(InflightTID), IsFull = is_inflight_full(InflightTID),
?tp(buffer_worker_flush, #{queue_count => CurrentCount, is_full => IsFull}), ?tp(buffer_worker_flush, #{
queued => CurrentCount,
is_inflight_full => IsFull,
inflight => inflight_count(InflightTID)
}),
case {CurrentCount, IsFull} of case {CurrentCount, IsFull} of
{0, _} -> {0, _} ->
?tp(buffer_worker_queue_drained, #{inflight => inflight_count(InflightTID)}),
{keep_state, Data1}; {keep_state, Data1};
{_, true} -> {_, true} ->
?tp(buffer_worker_flush_but_inflight_full, #{}), ?tp(buffer_worker_flush_but_inflight_full, #{}),
@ -487,7 +512,7 @@ flush(Data0) ->
%% if the request has expired, the caller is no longer %% if the request has expired, the caller is no longer
%% waiting for a response. %% waiting for a response.
case sieve_expired_requests(Batch, Now) of case sieve_expired_requests(Batch, Now) of
all_expired -> {[], _AllExpired} ->
ok = replayq:ack(Q1, QAckRef), ok = replayq:ack(Q1, QAckRef),
emqx_resource_metrics:dropped_expired_inc(Id, length(Batch)), emqx_resource_metrics:dropped_expired_inc(Id, length(Batch)),
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)), emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
@ -496,7 +521,7 @@ flush(Data0) ->
{NotExpired, Expired} -> {NotExpired, Expired} ->
NumExpired = length(Expired), NumExpired = length(Expired),
emqx_resource_metrics:dropped_expired_inc(Id, NumExpired), emqx_resource_metrics:dropped_expired_inc(Id, NumExpired),
IsBatch = BatchSize =/= 1, IsBatch = (BatchSize > 1),
%% We *must* use the new queue, because we currently can't %% We *must* use the new queue, because we currently can't
%% `nack' a `pop'. %% `nack' a `pop'.
%% Maybe we could re-open the queue? %% Maybe we could re-open the queue?
@ -506,7 +531,6 @@ flush(Data0) ->
), ),
Ref = make_request_ref(), Ref = make_request_ref(),
do_flush(Data2, #{ do_flush(Data2, #{
new_queue => Q1,
is_batch => IsBatch, is_batch => IsBatch,
batch => NotExpired, batch => NotExpired,
ref => Ref, ref => Ref,
@ -519,18 +543,16 @@ flush(Data0) ->
is_batch := boolean(), is_batch := boolean(),
batch := [queue_query()], batch := [queue_query()],
ack_ref := replayq:ack_ref(), ack_ref := replayq:ack_ref(),
ref := inflight_key(), ref := inflight_key()
new_queue := replayq:q()
}) -> }) ->
gen_statem:event_handler_result(state(), data()). gen_statem:event_handler_result(state(), data()).
do_flush( do_flush(
Data0, #{queue := Q1} = Data0,
#{ #{
is_batch := false, is_batch := false,
batch := Batch, batch := Batch,
ref := Ref, ref := Ref,
ack_ref := QAckRef, ack_ref := QAckRef
new_queue := Q1
} }
) -> ) ->
#{ #{
@ -606,16 +628,18 @@ do_flush(
}), }),
flush_worker(self()); flush_worker(self());
false -> false ->
?tp(buffer_worker_queue_drained, #{
inflight => inflight_count(InflightTID)
}),
ok ok
end, end,
{keep_state, Data1} {keep_state, Data1}
end; end;
do_flush(Data0, #{ do_flush(#{queue := Q1} = Data0, #{
is_batch := true, is_batch := true,
batch := Batch, batch := Batch,
ref := Ref, ref := Ref,
ack_ref := QAckRef, ack_ref := QAckRef
new_queue := Q1
}) -> }) ->
#{ #{
id := Id, id := Id,
@ -685,6 +709,9 @@ do_flush(Data0, #{
Data2 = Data2 =
case {CurrentCount > 0, CurrentCount >= BatchSize} of case {CurrentCount > 0, CurrentCount >= BatchSize} of
{false, _} -> {false, _} ->
?tp(buffer_worker_queue_drained, #{
inflight => inflight_count(InflightTID)
}),
Data1; Data1;
{true, true} -> {true, true} ->
?tp(buffer_worker_flush_ack_reflush, #{ ?tp(buffer_worker_flush_ack_reflush, #{
@ -718,13 +745,14 @@ batch_reply_caller_defer_metrics(Id, BatchResult, Batch, QueryOpts) ->
{ShouldAck, PostFns} = {ShouldAck, PostFns} =
lists:foldl( lists:foldl(
fun(Reply, {_ShouldAck, PostFns}) -> fun(Reply, {_ShouldAck, PostFns}) ->
%% _ShouldAck should be the same as ShouldAck starting from the second reply
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts), {ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply, QueryOpts),
{ShouldAck, [PostFn | PostFns]} {ShouldAck, [PostFn | PostFns]}
end, end,
{ack, []}, {ack, []},
Replies Replies
), ),
PostFn = fun() -> lists:foreach(fun(F) -> F() end, PostFns) end, PostFn = fun() -> lists:foreach(fun(F) -> F() end, lists:reverse(PostFns)) end,
{ShouldAck, PostFn}. {ShouldAck, PostFn}.
reply_caller(Id, Reply, QueryOpts) -> reply_caller(Id, Reply, QueryOpts) ->
@ -853,7 +881,7 @@ handle_async_worker_down(Data0, Pid) ->
{keep_state, Data}. {keep_state, Data}.
call_query(QM0, Id, Index, Ref, Query, QueryOpts) -> call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
?tp(call_query_enter, #{id => Id, query => Query}), ?tp(call_query_enter, #{id => Id, query => Query, query_mode => QM0}),
case emqx_resource_manager:ets_lookup(Id) of case emqx_resource_manager:ets_lookup(Id) of
{ok, _Group, #{status := stopped}} -> {ok, _Group, #{status := stopped}} ->
?RESOURCE_ERROR(stopped, "resource stopped or disabled"); ?RESOURCE_ERROR(stopped, "resource stopped or disabled");
@ -919,7 +947,7 @@ apply_query_fun(async, Mod, Id, Index, Ref, ?QUERY(_, Request, _, _) = Query, Re
inflight_tid => InflightTID, inflight_tid => InflightTID,
request_ref => Ref, request_ref => Ref,
query_opts => QueryOpts, query_opts => QueryOpts,
query => minimize(Query) min_query => minimize(Query)
}, },
IsRetriable = false, IsRetriable = false,
WorkerMRef = undefined, WorkerMRef = undefined,
@ -952,7 +980,7 @@ apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _, _) | _] = Batch, Re
inflight_tid => InflightTID, inflight_tid => InflightTID,
request_ref => Ref, request_ref => Ref,
query_opts => QueryOpts, query_opts => QueryOpts,
batch => minimize(Batch) min_batch => minimize(Batch)
}, },
Requests = lists:map( Requests = lists:map(
fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch fun(?QUERY(_ReplyTo, Request, _, _ExpireAt)) -> Request end, Batch
@ -968,27 +996,39 @@ apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _, _) | _] = Batch, Re
). ).
handle_async_reply( handle_async_reply(
#{
request_ref := Ref,
inflight_tid := InflightTID,
query_opts := Opts
} = ReplyContext,
Result
) ->
case maybe_handle_unknown_async_reply(InflightTID, Ref, Opts) of
discard ->
ok;
continue ->
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_reply1(ReplyContext, Result))
end.
handle_async_reply1(
#{ #{
request_ref := Ref, request_ref := Ref,
inflight_tid := InflightTID, inflight_tid := InflightTID,
resource_id := Id, resource_id := Id,
worker_index := Index, worker_index := Index,
buffer_worker := Pid, min_query := ?QUERY(_, _, _, ExpireAt) = _Query
query := ?QUERY(_, _, _, ExpireAt) = _Query
} = ReplyContext, } = ReplyContext,
Result Result
) -> ) ->
?tp( ?tp(
handle_async_reply_enter, handle_async_reply_enter,
#{batch_or_query => [_Query], ref => Ref} #{batch_or_query => [_Query], ref => Ref, result => Result}
), ),
Now = now_(), Now = now_(),
case is_expired(ExpireAt, Now) of case is_expired(ExpireAt, Now) of
true -> true ->
IsFullBefore = is_inflight_full(InflightTID),
IsAcked = ack_inflight(InflightTID, Ref, Id, Index), IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id), IsAcked andalso emqx_resource_metrics:late_reply_inc(Id),
IsFullBefore andalso ?MODULE:flush_worker(Pid),
?tp(handle_async_reply_expired, #{expired => [_Query]}), ?tp(handle_async_reply_expired, #{expired => [_Query]}),
ok; ok;
false -> false ->
@ -1003,7 +1043,7 @@ do_handle_async_reply(
worker_index := Index, worker_index := Index,
buffer_worker := Pid, buffer_worker := Pid,
inflight_tid := InflightTID, inflight_tid := InflightTID,
query := ?QUERY(ReplyTo, _, Sent, _ExpireAt) = _Query min_query := ?QUERY(ReplyTo, _, Sent, _ExpireAt) = _Query
}, },
Result Result
) -> ) ->
@ -1020,46 +1060,95 @@ do_handle_async_reply(
ref => Ref, ref => Ref,
result => Result result => Result
}), }),
case Action of case Action of
nack -> nack ->
%% Keep retrying. %% Keep retrying.
mark_inflight_as_retriable(InflightTID, Ref), ok = mark_inflight_as_retriable(InflightTID, Ref),
?MODULE:block(Pid); ok = ?MODULE:block(Pid),
blocked;
ack -> ack ->
do_ack(InflightTID, Ref, Id, Index, PostFn, Pid, QueryOpts) ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
end. end.
handle_async_batch_reply( handle_async_batch_reply(
#{ #{
buffer_worker := Pid,
resource_id := Id,
worker_index := Index,
inflight_tid := InflightTID, inflight_tid := InflightTID,
request_ref := Ref, request_ref := Ref,
batch := Batch query_opts := Opts
} = ReplyContext,
Result
) ->
case maybe_handle_unknown_async_reply(InflightTID, Ref, Opts) of
discard ->
ok;
continue ->
?ENSURE_ASYNC_FLUSH(InflightTID, handle_async_batch_reply1(ReplyContext, Result))
end.
handle_async_batch_reply1(
#{
inflight_tid := InflightTID,
request_ref := Ref,
min_batch := Batch
} = ReplyContext, } = ReplyContext,
Result Result
) -> ) ->
?tp( ?tp(
handle_async_reply_enter, handle_async_reply_enter,
#{batch_or_query => Batch, ref => Ref} #{batch_or_query => Batch, ref => Ref, result => Result}
), ),
Now = now_(), Now = now_(),
case sieve_expired_requests(Batch, Now) of case sieve_expired_requests(Batch, Now) of
all_expired -> {_NotExpired, []} ->
IsFullBefore = is_inflight_full(InflightTID), %% this is the critical code path,
IsAcked = ack_inflight(InflightTID, Ref, Id, Index), %% we try not to do ets:lookup in this case
IsAcked andalso emqx_resource_metrics:late_reply_inc(Id), %% because the batch can be quite big
IsFullBefore andalso ?MODULE:flush_worker(Pid), do_handle_async_batch_reply(ReplyContext, Result);
?tp(handle_async_reply_expired, #{expired => Batch}), {_NotExpired, _Expired} ->
%% at least one is expired
%% the batch from reply context is minimized, so it cannot be used
%% to update the inflight items, hence discard Batch and lookup the RealBatch
?tp(handle_async_reply_expired, #{expired => _Expired}),
handle_async_batch_reply2(ets:lookup(InflightTID, Ref), ReplyContext, Result, Now)
end.
handle_async_batch_reply2([], _, _, _) ->
%% this usually should never happen unless the async callback is being evaluated concurrently
ok;
handle_async_batch_reply2([Inflight], ReplyContext, Result, Now) ->
?INFLIGHT_ITEM(_, RealBatch, _IsRetriable, _WorkerMRef) = Inflight,
#{
resource_id := Id,
worker_index := Index,
inflight_tid := InflightTID,
request_ref := Ref,
min_batch := Batch
} = ReplyContext,
%% All batch items share the same HasBeenSent flag
%% So we just take the original flag from the ReplyContext batch
%% and put it back to the batch found in inflight table
%% which must have already been set to `false`
[?QUERY(_ReplyTo, _, HasBeenSent, _ExpireAt) | _] = Batch,
{RealNotExpired0, RealExpired} = sieve_expired_requests(RealBatch, Now),
RealNotExpired =
lists:map(
fun(?QUERY(ReplyTo, CoreReq, _HasBeenSent, ExpireAt)) ->
?QUERY(ReplyTo, CoreReq, HasBeenSent, ExpireAt)
end,
RealNotExpired0
),
NumExpired = length(RealExpired),
emqx_resource_metrics:late_reply_inc(Id, NumExpired),
case RealNotExpired of
[] ->
%% all expired, no need to update back the inflight batch
_ = ack_inflight(InflightTID, Ref, Id, Index),
ok; ok;
{NotExpired, Expired} -> _ ->
NumExpired = length(Expired), %% some queries are not expired, put them back to the inflight batch
emqx_resource_metrics:late_reply_inc(Id, NumExpired), %% so it can be either acked now or retried later
NumExpired > 0 andalso ok = update_inflight_item(InflightTID, Ref, RealNotExpired, NumExpired),
?tp(handle_async_reply_expired, #{expired => Expired}), do_handle_async_batch_reply(ReplyContext#{min_batch := RealNotExpired}, Result)
do_handle_async_batch_reply(ReplyContext#{batch := NotExpired}, Result)
end. end.
do_handle_async_batch_reply( do_handle_async_batch_reply(
@ -1069,7 +1158,7 @@ do_handle_async_batch_reply(
worker_index := Index, worker_index := Index,
inflight_tid := InflightTID, inflight_tid := InflightTID,
request_ref := Ref, request_ref := Ref,
batch := Batch, min_batch := Batch,
query_opts := QueryOpts query_opts := QueryOpts
}, },
Result Result
@ -1084,14 +1173,14 @@ do_handle_async_batch_reply(
case Action of case Action of
nack -> nack ->
%% Keep retrying. %% Keep retrying.
mark_inflight_as_retriable(InflightTID, Ref), ok = mark_inflight_as_retriable(InflightTID, Ref),
?MODULE:block(Pid); ok = ?MODULE:block(Pid),
blocked;
ack -> ack ->
do_ack(InflightTID, Ref, Id, Index, PostFn, Pid, QueryOpts) ok = do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts)
end. end.
do_ack(InflightTID, Ref, Id, Index, PostFn, WorkerPid, QueryOpts) -> do_async_ack(InflightTID, Ref, Id, Index, PostFn, QueryOpts) ->
IsFullBefore = is_inflight_full(InflightTID),
IsKnownRef = ack_inflight(InflightTID, Ref, Id, Index), IsKnownRef = ack_inflight(InflightTID, Ref, Id, Index),
case maps:get(simple_query, QueryOpts, false) of case maps:get(simple_query, QueryOpts, false) of
true -> true ->
@ -1101,9 +1190,47 @@ do_ack(InflightTID, Ref, Id, Index, PostFn, WorkerPid, QueryOpts) ->
false -> false ->
ok ok
end, end,
IsFullBefore andalso ?MODULE:flush_worker(WorkerPid),
ok. ok.
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = false) ->
%% inflight was not full before async reply is handled,
%% after it is handled, the inflight table must be even smaller
%% hence we can rely on the buffer worker's flush timer to trigger
%% the next flush
?tp(skip_flushing_worker, #{}),
ok;
maybe_flush_after_async_reply(_WasFullBeforeReplyHandled = true) ->
%% the inflight table was full before handling the async reply
?tp(do_flushing_worker, #{}),
ok = ?MODULE:flush_worker(self()).
%% check if the async reply is valid.
%% e.g. if a connector evaluates the callback more than once:
%% 1. If the request was previously deleted from the inflight table because
%%    it either succeeded or expired, this function logs a
%%    warning message and returns the 'discard' instruction.
%% 2. If the request previously failed and is now pending a retry,
%%    then this function will return 'continue' as there is no way to
%%    tell if this reply is stale or not.
maybe_handle_unknown_async_reply(undefined, _Ref, #{simple_query := true}) ->
continue;
maybe_handle_unknown_async_reply(InflightTID, Ref, #{}) ->
try ets:member(InflightTID, Ref) of
true ->
continue;
false ->
?tp(
warning,
unknown_async_reply_discarded,
#{inflight_key => Ref}
),
discard
catch
error:badarg ->
%% shutdown ?
discard
end.
%%============================================================================== %%==============================================================================
%% operations for queue %% operations for queue
queue_item_marshaller(Bin) when is_binary(Bin) -> queue_item_marshaller(Bin) when is_binary(Bin) ->
@ -1202,10 +1329,8 @@ inflight_get_first_retriable(InflightTID, Now) ->
{single, Ref, Query} {single, Ref, Query}
end; end;
{[{Ref, Batch = [_ | _]}], _Continuation} -> {[{Ref, Batch = [_ | _]}], _Continuation} ->
%% batch is non-empty because we check that in
%% `sieve_expired_requests'.
case sieve_expired_requests(Batch, Now) of case sieve_expired_requests(Batch, Now) of
all_expired -> {[], _AllExpired} ->
{expired, Ref, Batch}; {expired, Ref, Batch};
{NotExpired, Expired} -> {NotExpired, Expired} ->
{batch, Ref, NotExpired, Expired} {batch, Ref, NotExpired, Expired}
@ -1218,10 +1343,10 @@ is_inflight_full(InflightTID) ->
[{_, MaxSize}] = ets:lookup(InflightTID, ?MAX_SIZE_REF), [{_, MaxSize}] = ets:lookup(InflightTID, ?MAX_SIZE_REF),
%% we consider number of batches rather than number of messages %% we consider number of batches rather than number of messages
%% because one batch request may hold several messages. %% because one batch request may hold several messages.
Size = inflight_num_batches(InflightTID), Size = inflight_count(InflightTID),
Size >= MaxSize. Size >= MaxSize.
inflight_num_batches(InflightTID) -> inflight_count(InflightTID) ->
case ets:info(InflightTID, size) of case ets:info(InflightTID, size) of
undefined -> 0; undefined -> 0;
Size -> max(0, Size - ?INFLIGHT_META_ROWS) Size -> max(0, Size - ?INFLIGHT_META_ROWS)
@ -1243,7 +1368,7 @@ inflight_append(
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef), InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef),
IsNew = ets:insert_new(InflightTID, InflightItem), IsNew = ets:insert_new(InflightTID, InflightItem),
BatchSize = length(Batch), BatchSize = length(Batch),
IsNew andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, BatchSize}), IsNew andalso inc_inflight(InflightTID, BatchSize),
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)), emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}), ?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
ok; ok;
@ -1258,7 +1383,7 @@ inflight_append(
Query = mark_as_sent(Query0), Query = mark_as_sent(Query0),
InflightItem = ?INFLIGHT_ITEM(Ref, Query, IsRetriable, WorkerMRef), InflightItem = ?INFLIGHT_ITEM(Ref, Query, IsRetriable, WorkerMRef),
IsNew = ets:insert_new(InflightTID, InflightItem), IsNew = ets:insert_new(InflightTID, InflightItem),
IsNew andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, 1}), IsNew andalso inc_inflight(InflightTID, 1),
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)), emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)),
?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}), ?tp(buffer_worker_appended_to_inflight, #{item => InflightItem, is_new => IsNew}),
ok; ok;
@ -1274,6 +1399,8 @@ mark_inflight_as_retriable(undefined, _Ref) ->
ok; ok;
mark_inflight_as_retriable(InflightTID, Ref) -> mark_inflight_as_retriable(InflightTID, Ref) ->
_ = ets:update_element(InflightTID, Ref, {?RETRY_IDX, true}), _ = ets:update_element(InflightTID, Ref, {?RETRY_IDX, true}),
%% the old worker's DOWN should not affect this inflight any more
_ = ets:update_element(InflightTID, Ref, {?WORKER_MREF_IDX, erased}),
ok. ok.
%% Track each worker pid only once. %% Track each worker pid only once.
@ -1317,13 +1444,18 @@ ack_inflight(InflightTID, Ref, Id, Index) ->
1; 1;
[?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch, _IsRetriable, _WorkerMRef)] -> [?INFLIGHT_ITEM(Ref, [?QUERY(_, _, _, _) | _] = Batch, _IsRetriable, _WorkerMRef)] ->
length(Batch); length(Batch);
_ -> [] ->
0 0
end, end,
IsAcked = Count > 0, ok = dec_inflight(InflightTID, Count),
IsAcked andalso ets:update_counter(InflightTID, ?SIZE_REF, {2, -Count, 0, 0}), IsKnownRef = (Count > 0),
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID)), case IsKnownRef of
IsAcked. true ->
emqx_resource_metrics:inflight_set(Id, Index, inflight_num_msgs(InflightTID));
false ->
ok
end,
IsKnownRef.
mark_inflight_items_as_retriable(Data, WorkerMRef) -> mark_inflight_items_as_retriable(Data, WorkerMRef) ->
#{inflight_tid := InflightTID} = Data, #{inflight_tid := InflightTID} = Data,
@ -1341,9 +1473,18 @@ mark_inflight_items_as_retriable(Data, WorkerMRef) ->
ok. ok.
%% used to update a batch after dropping expired individual queries. %% used to update a batch after dropping expired individual queries.
update_inflight_item(InflightTID, Ref, NewBatch) -> update_inflight_item(InflightTID, Ref, NewBatch, NumExpired) ->
_ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}), _ = ets:update_element(InflightTID, Ref, {?ITEM_IDX, NewBatch}),
?tp(buffer_worker_worker_update_inflight_item, #{ref => Ref}), ok = dec_inflight(InflightTID, NumExpired).
inc_inflight(InflightTID, Count) ->
_ = ets:update_counter(InflightTID, ?SIZE_REF, {2, Count}),
ok.
dec_inflight(_InflightTID, 0) ->
ok;
dec_inflight(InflightTID, Count) when Count > 0 ->
_ = ets:update_counter(InflightTID, ?SIZE_REF, {2, -Count, 0, 0}),
ok. ok.
%%============================================================================== %%==============================================================================
@ -1453,22 +1594,12 @@ is_async_return(_) ->
false. false.
sieve_expired_requests(Batch, Now) -> sieve_expired_requests(Batch, Now) ->
{Expired, NotExpired} = lists:partition(
lists:partition( fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)) ->
fun(?QUERY(_ReplyTo, _CoreReq, _HasBeenSent, ExpireAt)) -> not is_expired(ExpireAt, Now)
is_expired(ExpireAt, Now) end,
end, Batch
Batch ).
),
case {NotExpired, Expired} of
{[], []} ->
%% Should be impossible for batch_size >= 1.
all_expired;
{[], [_ | _]} ->
all_expired;
{[_ | _], _} ->
{NotExpired, Expired}
end.
-spec is_expired(infinity | integer(), integer()) -> boolean(). -spec is_expired(infinity | integer(), integer()) -> boolean().
is_expired(infinity = _ExpireAt, _Now) -> is_expired(infinity = _ExpireAt, _Now) ->

View File

@ -135,11 +135,11 @@ on_query(_InstId, get_counter, #{pid := Pid}) ->
after 1000 -> after 1000 ->
{error, timeout} {error, timeout}
end; end;
on_query(_InstId, {sleep, For}, #{pid := Pid}) -> on_query(_InstId, {sleep_before_reply, For}, #{pid := Pid}) ->
?tp(connector_demo_sleep, #{mode => sync, for => For}), ?tp(connector_demo_sleep, #{mode => sync, for => For}),
ReqRef = make_ref(), ReqRef = make_ref(),
From = {self(), ReqRef}, From = {self(), ReqRef},
Pid ! {From, {sleep, For}}, Pid ! {From, {sleep_before_reply, For}},
receive receive
{ReqRef, Result} -> {ReqRef, Result} ->
Result Result
@ -159,9 +159,9 @@ on_query_async(_InstId, block_now, ReplyFun, #{pid := Pid}) ->
on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) -> on_query_async(_InstId, {big_payload, Payload}, ReplyFun, #{pid := Pid}) ->
Pid ! {big_payload, Payload, ReplyFun}, Pid ! {big_payload, Payload, ReplyFun},
{ok, Pid}; {ok, Pid};
on_query_async(_InstId, {sleep, For}, ReplyFun, #{pid := Pid}) -> on_query_async(_InstId, {sleep_before_reply, For}, ReplyFun, #{pid := Pid}) ->
?tp(connector_demo_sleep, #{mode => async, for => For}), ?tp(connector_demo_sleep, #{mode => async, for => For}),
Pid ! {{sleep, For}, ReplyFun}, Pid ! {{sleep_before_reply, For}, ReplyFun},
{ok, Pid}. {ok, Pid}.
on_batch_query(InstId, BatchReq, State) -> on_batch_query(InstId, BatchReq, State) ->
@ -173,10 +173,13 @@ on_batch_query(InstId, BatchReq, State) ->
get_counter -> get_counter ->
batch_get_counter(sync, InstId, State); batch_get_counter(sync, InstId, State);
{big_payload, _Payload} -> {big_payload, _Payload} ->
batch_big_payload(sync, InstId, BatchReq, State) batch_big_payload(sync, InstId, BatchReq, State);
{random_reply, Num} ->
%% async batch retried
make_random_reply(Num)
end. end.
on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, State) -> on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, #{pid := Pid} = State) ->
%% Requests can be of multiple types, but cannot be mixed. %% Requests can be of multiple types, but cannot be mixed.
case hd(BatchReq) of case hd(BatchReq) of
{inc_counter, _} -> {inc_counter, _} ->
@ -186,7 +189,11 @@ on_batch_query_async(InstId, BatchReq, ReplyFunAndArgs, State) ->
block_now -> block_now ->
on_query_async(InstId, block_now, ReplyFunAndArgs, State); on_query_async(InstId, block_now, ReplyFunAndArgs, State);
{big_payload, _Payload} -> {big_payload, _Payload} ->
batch_big_payload({async, ReplyFunAndArgs}, InstId, BatchReq, State) batch_big_payload({async, ReplyFunAndArgs}, InstId, BatchReq, State);
{random_reply, Num} ->
%% taking only the Num of the first request in the batch is random enough
Pid ! {{random_reply, Num}, ReplyFunAndArgs},
{ok, Pid}
end. end.
batch_inc_counter(CallMode, InstId, BatchReq, State) -> batch_inc_counter(CallMode, InstId, BatchReq, State) ->
@ -299,16 +306,33 @@ counter_loop(
{{FromPid, ReqRef}, get} -> {{FromPid, ReqRef}, get} ->
FromPid ! {ReqRef, Num}, FromPid ! {ReqRef, Num},
State; State;
{{sleep, _} = SleepQ, ReplyFun} -> {{random_reply, RandNum}, ReplyFun} ->
%% A well-behaved connector replies once and only once for each
%% (batch) request.
%% Here we deliberately reply with random results a random number of times.
%% With 'ok' among the possible results, the buffer worker should
%% eventually drain the buffer (and the inflight table).
ReplyCount = 1 + (RandNum rem 3),
Results = make_random_replies(ReplyCount),
%% add a delay to trigger inflight full
lists:foreach(
fun(Result) ->
timer:sleep(rand:uniform(5)),
apply_reply(ReplyFun, Result)
end,
Results
),
State;
{{sleep_before_reply, _} = SleepQ, ReplyFun} ->
apply_reply(ReplyFun, handle_query(async, SleepQ, Status)), apply_reply(ReplyFun, handle_query(async, SleepQ, Status)),
State; State;
{{FromPid, ReqRef}, {sleep, _} = SleepQ} -> {{FromPid, ReqRef}, {sleep_before_reply, _} = SleepQ} ->
FromPid ! {ReqRef, handle_query(sync, SleepQ, Status)}, FromPid ! {ReqRef, handle_query(sync, SleepQ, Status)},
State State
end, end,
counter_loop(NewState). counter_loop(NewState).
handle_query(Mode, {sleep, For} = Query, Status) -> handle_query(Mode, {sleep_before_reply, For} = Query, Status) ->
ok = timer:sleep(For), ok = timer:sleep(For),
Result = Result =
case Status of case Status of
@ -329,3 +353,18 @@ maybe_register(_Name, _Pid, false) ->
%% Invoke a reply callback given as a {Fun, Args} pair: Result is appended
%% as the final argument after the pre-bound Args.
apply_reply({ReplyFun, Args}, Result) when is_function(ReplyFun) ->
    CallArgs = Args ++ [Result],
    apply(ReplyFun, CallArgs).
%% Build a list of N random replies (one make_random_reply/1 call per
%% element), tagged N, N-1, ..., 1 in that order. Returns [] for N = 0.
%% The guard on the recursive clause makes a negative or non-integer count
%% fail fast with function_clause; without it such a count would step past
%% the 0 base case and recurse forever.
make_random_replies(0) ->
    [];
make_random_replies(N) when is_integer(N), N > 0 ->
    [make_random_reply(N) | make_random_replies(N - 1)].
%% Pick one of three possible outcomes uniformly at random, each tagged
%% with N: a success, a recoverable error, or an unrecoverable error.
make_random_reply(N) ->
    Outcomes = {
        {ok, N},
        {error, {recoverable_error, N}},
        {error, {unrecoverable_error, N}}
    },
    %% rand:uniform(3) yields 1, 2 or 3, which indexes the tuple directly
    element(rand:uniform(3), Outcomes).

View File

@ -1482,7 +1482,7 @@ t_retry_async_inflight_full(_Config) ->
AsyncInflightWindow * 2, AsyncInflightWindow * 2,
fun() -> fun() ->
For = (ResumeInterval div 4) + rand:uniform(ResumeInterval div 4), For = (ResumeInterval div 4) + rand:uniform(ResumeInterval div 4),
{sleep, For} {sleep_before_reply, For}
end, end,
#{async_reply_fun => {fun(Res) -> ct:pal("Res = ~p", [Res]) end, []}} #{async_reply_fun => {fun(Res) -> ct:pal("Res = ~p", [Res]) end, []}}
), ),
@ -1507,6 +1507,59 @@ t_retry_async_inflight_full(_Config) ->
?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)), ?assertEqual(0, emqx_resource_metrics:inflight_get(?ID)),
ok. ok.
%% This test case ensures the buffer worker keeps its metrics consistent even
%% if the underlying connector misbehaves by evaluating the async reply
%% callback multiple times for the same request.
%%
%% Flow: create an async resource with a small inflight window, send
%% `{random_reply, Rand}' queries in parallel, then poll the metrics until
%% both gauges are back to 0 and the counters add up.
t_async_reply_multi_eval(_Config) ->
ResumeInterval = 5,
TotalTime = 5_000,
AsyncInflightWindow = 3,
%% five times the inflight window
TotalQueries = AsyncInflightWindow * 5,
emqx_connector_demo:set_callback_mode(async_if_possible),
{ok, _} = emqx_resource:create(
?ID,
?DEFAULT_RESOURCE_GROUP,
?TEST_RESOURCE,
#{name => ?FUNCTION_NAME},
#{
query_mode => async,
async_inflight_window => AsyncInflightWindow,
batch_size => 3,
batch_time => 10,
worker_pool_size => 1,
resume_interval => ResumeInterval
}
),
%% block
%% NOTE(review): presumably this puts the demo connector into its blocked
%% state before the queries are sent -- confirm against emqx_connector_demo.
ok = emqx_resource:simple_sync_query(?ID, block),
inc_counter_in_parallel(
TotalQueries,
fun() ->
%% each query carries its own random number in 1..1000
Rand = rand:uniform(1000),
{random_reply, Rand}
end,
#{}
),
%% NOTE(review): the arguments look like (interval, attempts, body), which
%% would bound the polling by roughly TotalTime -- confirm against the
%% ?retry macro definition.
?retry(
ResumeInterval,
TotalTime div ResumeInterval,
begin
Metrics = tap_metrics(?LINE),
%% both gauges must be 0, otherwise this match fails
#{
counters := Counters,
gauges := #{queuing := 0, inflight := 0}
} = Metrics,
#{
matched := Matched,
success := Success,
dropped := Dropped,
late_reply := LateReply,
failed := Failed
} = Counters,
%% one extra match is expected on top of TotalQueries; presumably that is
%% the `block' query issued above -- TODO confirm
?assertEqual(TotalQueries, Matched - 1),
%% every matched request must be counted in exactly one outcome counter,
%% no matter how many times the connector replied for it
?assertEqual(Matched, Success + Dropped + LateReply + Failed)
end
).
t_retry_async_inflight_batch(_Config) -> t_retry_async_inflight_batch(_Config) ->
ResumeInterval = 1_000, ResumeInterval = 1_000,
emqx_connector_demo:set_callback_mode(async_if_possible), emqx_connector_demo:set_callback_mode(async_if_possible),
@ -1944,7 +1997,7 @@ t_expiration_async_batch_after_reply(_Config) ->
#{name => test_resource}, #{name => test_resource},
#{ #{
query_mode => async, query_mode => async,
batch_size => 2, batch_size => 3,
batch_time => 100, batch_time => 100,
worker_pool_size => 1, worker_pool_size => 1,
resume_interval => 2_000 resume_interval => 2_000
@ -1959,7 +2012,7 @@ do_t_expiration_async_after_reply(IsBatch) ->
NAcks = NAcks =
case IsBatch of case IsBatch of
batch -> 1; batch -> 1;
single -> 2 single -> 3
end, end,
?force_ordering( ?force_ordering(
#{?snk_kind := buffer_worker_flush_ack}, #{?snk_kind := buffer_worker_flush_ack},
@ -1980,6 +2033,10 @@ do_t_expiration_async_after_reply(IsBatch) ->
ok, ok,
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS}) emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
), ),
?assertEqual(
ok,
emqx_resource:query(?ID, {inc_counter, 299}, #{timeout => TimeoutMS})
),
?assertEqual( ?assertEqual(
ok, emqx_resource:query(?ID, {inc_counter, 99}, #{timeout => infinity}) ok, emqx_resource:query(?ID, {inc_counter, 99}, #{timeout => infinity})
), ),
@ -1997,30 +2054,44 @@ do_t_expiration_async_after_reply(IsBatch) ->
{ok, _} = ?block_until( {ok, _} = ?block_until(
#{?snk_kind := handle_async_reply_expired}, 10 * TimeoutMS #{?snk_kind := handle_async_reply_expired}, 10 * TimeoutMS
), ),
wait_telemetry_event(success, #{n_events => 1, timeout => 4_000}),
unlink(Pid0), unlink(Pid0),
exit(Pid0, kill), exit(Pid0, kill),
ok ok
end, end,
fun(Trace) -> fun(Trace) ->
?assertMatch( case IsBatch of
[ batch ->
#{ ?assertMatch(
expired := [{query, _, {inc_counter, 199}, _, _}] [
} #{
], expired := [
?of_kind(handle_async_reply_expired, Trace) {query, _, {inc_counter, 199}, _, _},
), {query, _, {inc_counter, 299}, _, _}
wait_telemetry_event(success, #{n_events => 1, timeout => 4_000}), ]
}
],
?of_kind(handle_async_reply_expired, Trace)
);
single ->
?assertMatch(
[
#{expired := [{query, _, {inc_counter, 199}, _, _}]},
#{expired := [{query, _, {inc_counter, 299}, _, _}]}
],
?of_kind(handle_async_reply_expired, Trace)
)
end,
Metrics = tap_metrics(?LINE), Metrics = tap_metrics(?LINE),
?assertMatch( ?assertMatch(
#{ #{
counters := #{ counters := #{
matched := 2, matched := 3,
%% the request with infinity timeout. %% the request with infinity timeout.
success := 1, success := 1,
dropped := 0, dropped := 0,
late_reply := 1, late_reply := 2,
retried := 0, retried := 0,
failed := 0 failed := 0
} }
@ -2042,7 +2113,7 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
#{name => test_resource}, #{name => test_resource},
#{ #{
query_mode => async, query_mode => async,
batch_size => 2, batch_size => 3,
batch_time => 100, batch_time => 100,
worker_pool_size => 1, worker_pool_size => 1,
resume_interval => ResumeInterval resume_interval => ResumeInterval
@ -2067,6 +2138,10 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
ok, ok,
emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS}) emqx_resource:query(?ID, {inc_counter, 199}, #{timeout => TimeoutMS})
), ),
?assertEqual(
ok,
emqx_resource:query(?ID, {inc_counter, 299}, #{timeout => TimeoutMS})
),
Pid0 = Pid0 =
spawn_link(fun() -> spawn_link(fun() ->
?tp(delay_enter, #{}), ?tp(delay_enter, #{}),
@ -2087,7 +2162,10 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
?assertMatch( ?assertMatch(
[ [
#{ #{
expired := [{query, _, {inc_counter, 199}, _, _}] expired := [
{query, _, {inc_counter, 199}, _, _},
{query, _, {inc_counter, 299}, _, _}
]
} }
], ],
?of_kind(handle_async_reply_expired, Trace) ?of_kind(handle_async_reply_expired, Trace)
@ -2096,12 +2174,16 @@ t_expiration_batch_all_expired_after_reply(_Config) ->
?assertMatch( ?assertMatch(
#{ #{
counters := #{ counters := #{
matched := 1, matched := 2,
success := 0, success := 0,
dropped := 0, dropped := 0,
late_reply := 1, late_reply := 2,
retried := 0, retried := 0,
failed := 0 failed := 0
},
gauges := #{
inflight := 0,
queuing := 0
} }
}, },
Metrics Metrics
@ -2217,6 +2299,16 @@ do_t_expiration_retry(IsBatch) ->
[#{expired := [{query, _, {inc_counter, 1}, _, _}]}], [#{expired := [{query, _, {inc_counter, 1}, _, _}]}],
?of_kind(buffer_worker_retry_expired, Trace) ?of_kind(buffer_worker_retry_expired, Trace)
), ),
Metrics = tap_metrics(?LINE),
?assertMatch(
#{
gauges := #{
inflight := 0,
queuing := 0
}
},
Metrics
),
ok ok
end end
), ),

View File

@ -0,0 +1 @@
Fix bridge metrics when running in async mode with batching enabled (`batch_size` > 1).

View File

@ -0,0 +1 @@
修复使用异步和批量配置的桥接计数不准确的问题。

View File

@ -268,7 +268,7 @@ kafka_bridge_rest_api_helper(Config) ->
CreateBodyTmp = #{ CreateBodyTmp = #{
<<"type">> => <<"kafka">>, <<"type">> => <<"kafka">>,
<<"name">> => <<"my_kafka_bridge">>, <<"name">> => <<"my_kafka_bridge">>,
<<"bootstrap_hosts">> => maps:get(<<"bootstrap_hosts">>, Config), <<"bootstrap_hosts">> => iolist_to_binary(maps:get(<<"bootstrap_hosts">>, Config)),
<<"enable">> => true, <<"enable">> => true,
<<"authentication">> => maps:get(<<"authentication">>, Config), <<"authentication">> => maps:get(<<"authentication">>, Config),
<<"producer">> => #{ <<"producer">> => #{
@ -276,7 +276,7 @@ kafka_bridge_rest_api_helper(Config) ->
topic => <<"t/#">> topic => <<"t/#">>
}, },
<<"kafka">> => #{ <<"kafka">> => #{
<<"topic">> => erlang:list_to_binary(KafkaTopic), <<"topic">> => iolist_to_binary(KafkaTopic),
<<"buffer">> => #{ <<"buffer">> => #{
<<"memory_overload_protection">> => <<"false">> <<"memory_overload_protection">> => <<"false">>
}, },

View File

@ -21,6 +21,12 @@ help() {
echo " otherwise it runs the entire app's CT" echo " otherwise it runs the entire app's CT"
} }
# Select the Compose front-end: use the standalone `docker-compose` binary
# when it is on PATH, otherwise fall back to the `docker compose` form.
# `command -v` prints the resolved path on success; only its exit status is
# needed here, so the output is discarded to keep the script's stdout clean.
# $DC is later expanded unquoted so that 'docker compose' splits into two words.
if command -v docker-compose >/dev/null 2>&1; then
    DC='docker-compose'
else
    DC='docker compose'
fi
WHICH_APP='novalue' WHICH_APP='novalue'
CONSOLE='no' CONSOLE='no'
KEEP_UP='no' KEEP_UP='no'
@ -155,7 +161,7 @@ for dep in ${CT_DEPS}; do
;; ;;
tdengine) tdengine)
FILES+=( '.ci/docker-compose-file/docker-compose-tdengine-restful.yaml' ) FILES+=( '.ci/docker-compose-file/docker-compose-tdengine-restful.yaml' )
;; ;;
*) *)
echo "unknown_ct_dependency $dep" echo "unknown_ct_dependency $dep"
exit 1 exit 1
@ -201,7 +207,7 @@ if [ "$STOP" = 'no' ]; then
# some left-over log file has to be deleted before a new docker-compose up # some left-over log file has to be deleted before a new docker-compose up
rm -f '.ci/docker-compose-file/redis/*.log' rm -f '.ci/docker-compose-file/redis/*.log'
# shellcheck disable=2086 # no quotes for F_OPTIONS # shellcheck disable=2086 # no quotes for F_OPTIONS
docker compose $F_OPTIONS up -d --build --remove-orphans $DC $F_OPTIONS up -d --build --remove-orphans
fi fi
echo "Fixing file owners and permissions for $UID_GID" echo "Fixing file owners and permissions for $UID_GID"
@ -218,7 +224,7 @@ set +e
if [ "$STOP" = 'yes' ]; then if [ "$STOP" = 'yes' ]; then
# shellcheck disable=2086 # no quotes for F_OPTIONS # shellcheck disable=2086 # no quotes for F_OPTIONS
docker compose $F_OPTIONS down --remove-orphans $DC $F_OPTIONS down --remove-orphans
elif [ "$ATTACH" = 'yes' ]; then elif [ "$ATTACH" = 'yes' ]; then
docker exec -it "$ERLANG_CONTAINER" bash docker exec -it "$ERLANG_CONTAINER" bash
elif [ "$CONSOLE" = 'yes' ]; then elif [ "$CONSOLE" = 'yes' ]; then
@ -235,11 +241,11 @@ else
LOG='_build/test/logs/docker-compose.log' LOG='_build/test/logs/docker-compose.log'
echo "Dumping docker-compose log to $LOG" echo "Dumping docker-compose log to $LOG"
# shellcheck disable=2086 # no quotes for F_OPTIONS # shellcheck disable=2086 # no quotes for F_OPTIONS
docker compose $F_OPTIONS logs --no-color --timestamps > "$LOG" $DC $F_OPTIONS logs --no-color --timestamps > "$LOG"
fi fi
if [ "$KEEP_UP" != 'yes' ]; then if [ "$KEEP_UP" != 'yes' ]; then
# shellcheck disable=2086 # no quotes for F_OPTIONS # shellcheck disable=2086 # no quotes for F_OPTIONS
docker compose $F_OPTIONS down $DC $F_OPTIONS down
fi fi
exit $RESULT exit $RESULT
fi fi