chore: retry as much as possible, don't reply to caller too soon
This commit is contained in:
parent
b82009bc29
commit
fa01deb3eb
|
@ -82,7 +82,7 @@ authorize(
|
||||||
} = Config
|
} = Config
|
||||||
) ->
|
) ->
|
||||||
Request = generate_request(PubSub, Topic, Client, Config),
|
Request = generate_request(PubSub, Topic, Client, Config),
|
||||||
case emqx_resource:simple_sync_query(ResourceID, {Method, Request, RequestTimeout}) of
|
try emqx_resource:simple_sync_query(ResourceID, {Method, Request, RequestTimeout}) of
|
||||||
{ok, 204, _Headers} ->
|
{ok, 204, _Headers} ->
|
||||||
{matched, allow};
|
{matched, allow};
|
||||||
{ok, 200, Headers, Body} ->
|
{ok, 200, Headers, Body} ->
|
||||||
|
@ -112,6 +112,16 @@ authorize(
|
||||||
reason => Reason
|
reason => Reason
|
||||||
}),
|
}),
|
||||||
ignore
|
ignore
|
||||||
|
catch
|
||||||
|
error:timeout ->
|
||||||
|
Reason = timeout,
|
||||||
|
?tp(authz_http_request_failure, #{error => Reason}),
|
||||||
|
?SLOG(error, #{
|
||||||
|
msg => "http_server_query_failed",
|
||||||
|
resource => ResourceID,
|
||||||
|
reason => Reason
|
||||||
|
}),
|
||||||
|
ignore
|
||||||
end.
|
end.
|
||||||
|
|
||||||
log_nomtach_msg(Status, Headers, Body) ->
|
log_nomtach_msg(Status, Headers, Body) ->
|
||||||
|
|
|
@ -172,7 +172,7 @@ t_response_handling(_Config) ->
|
||||||
[
|
[
|
||||||
#{
|
#{
|
||||||
?snk_kind := authz_http_request_failure,
|
?snk_kind := authz_http_request_failure,
|
||||||
error := {recoverable_error, econnrefused}
|
error := timeout
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
?of_kind(authz_http_request_failure, Trace)
|
?of_kind(authz_http_request_failure, Trace)
|
||||||
|
|
|
@ -170,8 +170,11 @@ send_message(BridgeId, Message) ->
|
||||||
case emqx:get_config([bridges, BridgeType, BridgeName], not_found) of
|
case emqx:get_config([bridges, BridgeType, BridgeName], not_found) of
|
||||||
not_found ->
|
not_found ->
|
||||||
{error, {bridge_not_found, BridgeId}};
|
{error, {bridge_not_found, BridgeId}};
|
||||||
#{enable := true} ->
|
#{enable := true} = Config ->
|
||||||
emqx_resource:query(ResId, {send_message, Message});
|
Timeout = emqx_map_lib:deep_get(
|
||||||
|
[resource_opts, request_timeout], Config, timer:seconds(15)
|
||||||
|
),
|
||||||
|
emqx_resource:query(ResId, {send_message, Message}, #{timeout => Timeout});
|
||||||
#{enable := false} ->
|
#{enable := false} ->
|
||||||
{error, {bridge_stopped, BridgeId}}
|
{error, {bridge_stopped, BridgeId}}
|
||||||
end.
|
end.
|
||||||
|
|
|
@ -145,10 +145,12 @@ set_special_configs(_) ->
|
||||||
|
|
||||||
init_per_testcase(_, Config) ->
|
init_per_testcase(_, Config) ->
|
||||||
{ok, _} = emqx_cluster_rpc:start_link(node(), emqx_cluster_rpc, 1000),
|
{ok, _} = emqx_cluster_rpc:start_link(node(), emqx_cluster_rpc, 1000),
|
||||||
|
ok = snabbkaffe:start_trace(),
|
||||||
Config.
|
Config.
|
||||||
end_per_testcase(_, _Config) ->
|
end_per_testcase(_, _Config) ->
|
||||||
clear_resources(),
|
clear_resources(),
|
||||||
emqx_common_test_helpers:call_janitor(),
|
emqx_common_test_helpers:call_janitor(),
|
||||||
|
snabbkaffe:stop(),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
clear_resources() ->
|
clear_resources() ->
|
||||||
|
@ -478,8 +480,6 @@ t_egress_custom_clientid_prefix(_Config) ->
|
||||||
end,
|
end,
|
||||||
|
|
||||||
{ok, 204, <<>>} = request(delete, uri(["bridges", BridgeIDEgress]), []),
|
{ok, 204, <<>>} = request(delete, uri(["bridges", BridgeIDEgress]), []),
|
||||||
{ok, 200, <<"[]">>} = request(get, uri(["bridges"]), []),
|
|
||||||
|
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
t_mqtt_conn_bridge_ingress_and_egress(_) ->
|
t_mqtt_conn_bridge_ingress_and_egress(_) ->
|
||||||
|
@ -830,6 +830,7 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
|
||||||
<<"resource_opts">> => #{
|
<<"resource_opts">> => #{
|
||||||
<<"worker_pool_size">> => 2,
|
<<"worker_pool_size">> => 2,
|
||||||
<<"query_mode">> => <<"sync">>,
|
<<"query_mode">> => <<"sync">>,
|
||||||
|
<<"request_timeout">> => <<"500ms">>,
|
||||||
%% to make it check the healthy quickly
|
%% to make it check the healthy quickly
|
||||||
<<"health_check_interval">> => <<"0.5s">>
|
<<"health_check_interval">> => <<"0.5s">>
|
||||||
}
|
}
|
||||||
|
@ -880,17 +881,14 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
|
||||||
ok = emqx_listeners:stop_listener('tcp:default'),
|
ok = emqx_listeners:stop_listener('tcp:default'),
|
||||||
ct:sleep(1500),
|
ct:sleep(1500),
|
||||||
|
|
||||||
%% PUBLISH 2 messages to the 'local' broker, the message should
|
%% PUBLISH 2 messages to the 'local' broker, the messages should
|
||||||
ok = snabbkaffe:start_trace(),
|
%% be enqueued and the resource will block
|
||||||
{ok, SRef} =
|
{ok, SRef} =
|
||||||
snabbkaffe:subscribe(
|
snabbkaffe:subscribe(
|
||||||
fun
|
fun
|
||||||
(
|
(#{?snk_kind := resource_worker_retry_inflight_failed}) ->
|
||||||
#{
|
true;
|
||||||
?snk_kind := call_query_enter,
|
(#{?snk_kind := resource_worker_flush_nack}) ->
|
||||||
query := {query, _From, {send_message, #{}}, _Sent}
|
|
||||||
}
|
|
||||||
) ->
|
|
||||||
true;
|
true;
|
||||||
(_) ->
|
(_) ->
|
||||||
false
|
false
|
||||||
|
@ -903,7 +901,6 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
|
||||||
emqx:publish(emqx_message:make(LocalTopic, Payload1)),
|
emqx:publish(emqx_message:make(LocalTopic, Payload1)),
|
||||||
emqx:publish(emqx_message:make(LocalTopic, Payload2)),
|
emqx:publish(emqx_message:make(LocalTopic, Payload2)),
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef),
|
{ok, _} = snabbkaffe:receive_events(SRef),
|
||||||
ok = snabbkaffe:stop(),
|
|
||||||
|
|
||||||
%% verify the metrics of the bridge, the message should be queued
|
%% verify the metrics of the bridge, the message should be queued
|
||||||
{ok, 200, BridgeStr1} = request(get, uri(["bridges", BridgeIDEgress]), []),
|
{ok, 200, BridgeStr1} = request(get, uri(["bridges", BridgeIDEgress]), []),
|
||||||
|
|
|
@ -89,6 +89,17 @@ For bridges only have ingress direction data flow, it can be set to 0 otherwise
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
request_timeout {
|
||||||
|
desc {
|
||||||
|
en: """Timeout for requests. If <code>query_mode</code> is <code>sync</code>, calls to the resource will be blocked for this amount of time before timing out."""
|
||||||
|
zh: """请求的超时。 如果<code>query_mode</code>是<code>sync</code>,对资源的调用将在超时前被阻断这一时间。"""
|
||||||
|
}
|
||||||
|
label {
|
||||||
|
en: """Request timeout"""
|
||||||
|
zh: """请求超时"""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
enable_batch {
|
enable_batch {
|
||||||
desc {
|
desc {
|
||||||
en: """Batch mode enabled."""
|
en: """Batch mode enabled."""
|
||||||
|
|
|
@ -100,7 +100,7 @@ start_link(Id, Index, Opts) ->
|
||||||
-spec sync_query(id(), request(), query_opts()) -> Result :: term().
|
-spec sync_query(id(), request(), query_opts()) -> Result :: term().
|
||||||
sync_query(Id, Request, Opts) ->
|
sync_query(Id, Request, Opts) ->
|
||||||
PickKey = maps:get(pick_key, Opts, self()),
|
PickKey = maps:get(pick_key, Opts, self()),
|
||||||
Timeout = maps:get(timeout, Opts, infinity),
|
Timeout = maps:get(timeout, Opts, timer:seconds(15)),
|
||||||
emqx_resource_metrics:matched_inc(Id),
|
emqx_resource_metrics:matched_inc(Id),
|
||||||
pick_call(Id, PickKey, {query, Request, Opts}, Timeout).
|
pick_call(Id, PickKey, {query, Request, Opts}, Timeout).
|
||||||
|
|
||||||
|
@ -234,10 +234,7 @@ blocked(cast, flush, Data) ->
|
||||||
blocked(state_timeout, unblock, St) ->
|
blocked(state_timeout, unblock, St) ->
|
||||||
resume_from_blocked(St);
|
resume_from_blocked(St);
|
||||||
blocked(info, ?SEND_REQ(_ReqFrom, {query, _Request, _Opts}) = Request0, Data0) ->
|
blocked(info, ?SEND_REQ(_ReqFrom, {query, _Request, _Opts}) = Request0, Data0) ->
|
||||||
#{id := Id} = Data0,
|
{_Queries, Data} = collect_and_enqueue_query_requests(Request0, Data0),
|
||||||
{Queries, Data} = collect_and_enqueue_query_requests(Request0, Data0),
|
|
||||||
Error = ?RESOURCE_ERROR(blocked, "resource is blocked"),
|
|
||||||
_ = batch_reply_caller(Id, Error, Queries),
|
|
||||||
{keep_state, Data};
|
{keep_state, Data};
|
||||||
blocked(info, {flush, _Ref}, _Data) ->
|
blocked(info, {flush, _Ref}, _Data) ->
|
||||||
keep_state_and_data;
|
keep_state_and_data;
|
||||||
|
@ -337,10 +334,16 @@ retry_inflight_sync(Ref, QueryOrBatch, Data0) ->
|
||||||
} = Data0,
|
} = Data0,
|
||||||
?tp(resource_worker_retry_inflight, #{query_or_batch => QueryOrBatch, ref => Ref}),
|
?tp(resource_worker_retry_inflight, #{query_or_batch => QueryOrBatch, ref => Ref}),
|
||||||
QueryOpts = #{},
|
QueryOpts = #{},
|
||||||
%% if we are retrying an inflight query, it has been sent
|
|
||||||
HasBeenSent = true,
|
|
||||||
Result = call_query(sync, Id, Index, Ref, QueryOrBatch, QueryOpts),
|
Result = call_query(sync, Id, Index, Ref, QueryOrBatch, QueryOpts),
|
||||||
case handle_query_result_pure(Id, Result, HasBeenSent) of
|
ReplyResult =
|
||||||
|
case QueryOrBatch of
|
||||||
|
?QUERY(From, CoreReq, HasBeenSent) ->
|
||||||
|
Reply = ?REPLY(From, CoreReq, HasBeenSent, Result),
|
||||||
|
reply_caller_defer_metrics(Id, Reply);
|
||||||
|
[?QUERY(_, _, _) | _] = Batch ->
|
||||||
|
batch_reply_caller_defer_metrics(Id, Result, Batch)
|
||||||
|
end,
|
||||||
|
case ReplyResult of
|
||||||
%% Send failed because resource is down
|
%% Send failed because resource is down
|
||||||
{nack, PostFn} ->
|
{nack, PostFn} ->
|
||||||
PostFn(),
|
PostFn(),
|
||||||
|
@ -476,27 +479,20 @@ do_flush(
|
||||||
Reply = ?REPLY(From, CoreReq, HasBeenSent, Result),
|
Reply = ?REPLY(From, CoreReq, HasBeenSent, Result),
|
||||||
case reply_caller(Id, Reply) of
|
case reply_caller(Id, Reply) of
|
||||||
%% Failed; remove the request from the queue, as we cannot pop
|
%% Failed; remove the request from the queue, as we cannot pop
|
||||||
%% from it again. But we must ensure it's in the inflight
|
%% from it again, but we'll retry it using the inflight table.
|
||||||
%% table, even if it's full, so we don't lose the request.
|
|
||||||
%% And only in that case.
|
|
||||||
nack ->
|
nack ->
|
||||||
ok = replayq:ack(Q1, QAckRef),
|
ok = replayq:ack(Q1, QAckRef),
|
||||||
%% We might get a retriable response without having added
|
|
||||||
%% the request to the inflight table (e.g.: sync request,
|
|
||||||
%% but resource health check failed prior to calling and
|
|
||||||
%% so we didn't even call it). In that case, we must then
|
|
||||||
%% add it to the inflight table.
|
|
||||||
IsRetriable =
|
|
||||||
is_recoverable_error_result(Result) orelse
|
|
||||||
is_not_connected_result(Result),
|
|
||||||
ShouldPreserveInInflight = is_not_connected_result(Result),
|
|
||||||
%% we set it atomically just below; a limitation of having
|
%% we set it atomically just below; a limitation of having
|
||||||
%% to use tuples for atomic ets updates
|
%% to use tuples for atomic ets updates
|
||||||
|
IsRetriable = true,
|
||||||
WorkerMRef0 = undefined,
|
WorkerMRef0 = undefined,
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Request, IsRetriable, WorkerMRef0),
|
InflightItem = ?INFLIGHT_ITEM(Ref, Request, IsRetriable, WorkerMRef0),
|
||||||
ShouldPreserveInInflight andalso
|
%% we must append again to the table to ensure that the
|
||||||
|
%% request will be retried (i.e., it might not have been
|
||||||
|
%% inserted during `call_query' if the resource was down
|
||||||
|
%% and/or if it was a sync request).
|
||||||
inflight_append(InflightTID, InflightItem, Id, Index),
|
inflight_append(InflightTID, InflightItem, Id, Index),
|
||||||
IsRetriable andalso mark_inflight_as_retriable(InflightTID, Ref),
|
mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
||||||
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
||||||
|
@ -513,11 +509,21 @@ do_flush(
|
||||||
%% Success; just ack.
|
%% Success; just ack.
|
||||||
ack ->
|
ack ->
|
||||||
ok = replayq:ack(Q1, QAckRef),
|
ok = replayq:ack(Q1, QAckRef),
|
||||||
|
%% Async requests are acked later when the async worker
|
||||||
|
%% calls the corresponding callback function. Also, we
|
||||||
|
%% must ensure the async worker is being monitored for
|
||||||
|
%% such requests.
|
||||||
is_async(Id) orelse ack_inflight(InflightTID, Ref, Id, Index),
|
is_async(Id) orelse ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
||||||
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
||||||
?tp(resource_worker_flush_ack, #{batch_or_query => Request}),
|
?tp(
|
||||||
|
resource_worker_flush_ack,
|
||||||
|
#{
|
||||||
|
batch_or_query => Request,
|
||||||
|
result => Result
|
||||||
|
}
|
||||||
|
),
|
||||||
case queue_count(Q1) > 0 of
|
case queue_count(Q1) > 0 of
|
||||||
true ->
|
true ->
|
||||||
{keep_state, Data1, [{next_event, internal, flush}]};
|
{keep_state, Data1, [{next_event, internal, flush}]};
|
||||||
|
@ -542,27 +548,20 @@ do_flush(Data0, #{
|
||||||
Result = call_query(configured, Id, Index, Ref, Batch, QueryOpts),
|
Result = call_query(configured, Id, Index, Ref, Batch, QueryOpts),
|
||||||
case batch_reply_caller(Id, Result, Batch) of
|
case batch_reply_caller(Id, Result, Batch) of
|
||||||
%% Failed; remove the request from the queue, as we cannot pop
|
%% Failed; remove the request from the queue, as we cannot pop
|
||||||
%% from it again. But we must ensure it's in the inflight
|
%% from it again, but we'll retry it using the inflight table.
|
||||||
%% table, even if it's full, so we don't lose the request.
|
|
||||||
%% And only in that case.
|
|
||||||
nack ->
|
nack ->
|
||||||
ok = replayq:ack(Q1, QAckRef),
|
ok = replayq:ack(Q1, QAckRef),
|
||||||
%% We might get a retriable response without having added
|
|
||||||
%% the request to the inflight table (e.g.: sync request,
|
|
||||||
%% but resource health check failed prior to calling and
|
|
||||||
%% so we didn't even call it). In that case, we must then
|
|
||||||
%% add it to the inflight table.
|
|
||||||
IsRetriable =
|
|
||||||
is_recoverable_error_result(Result) orelse
|
|
||||||
is_not_connected_result(Result),
|
|
||||||
ShouldPreserveInInflight = is_not_connected_result(Result),
|
|
||||||
%% we set it atomically just below; a limitation of having
|
%% we set it atomically just below; a limitation of having
|
||||||
%% to use tuples for atomic ets updates
|
%% to use tuples for atomic ets updates
|
||||||
|
IsRetriable = true,
|
||||||
WorkerMRef0 = undefined,
|
WorkerMRef0 = undefined,
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef0),
|
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef0),
|
||||||
ShouldPreserveInInflight andalso
|
%% we must append again to the table to ensure that the
|
||||||
|
%% request will be retried (i.e., it might not have been
|
||||||
|
%% inserted during `call_query' if the resource was down
|
||||||
|
%% and/or if it was a sync request).
|
||||||
inflight_append(InflightTID, InflightItem, Id, Index),
|
inflight_append(InflightTID, InflightItem, Id, Index),
|
||||||
IsRetriable andalso mark_inflight_as_retriable(InflightTID, Ref),
|
mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
||||||
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
||||||
|
@ -579,11 +578,21 @@ do_flush(Data0, #{
|
||||||
%% Success; just ack.
|
%% Success; just ack.
|
||||||
ack ->
|
ack ->
|
||||||
ok = replayq:ack(Q1, QAckRef),
|
ok = replayq:ack(Q1, QAckRef),
|
||||||
|
%% Async requests are acked later when the async worker
|
||||||
|
%% calls the corresponding callback function. Also, we
|
||||||
|
%% must ensure the async worker is being monitored for
|
||||||
|
%% such requests.
|
||||||
is_async(Id) orelse ack_inflight(InflightTID, Ref, Id, Index),
|
is_async(Id) orelse ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
{Data1, WorkerMRef} = ensure_async_worker_monitored(Data0, Result),
|
||||||
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
store_async_worker_reference(InflightTID, Ref, WorkerMRef),
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q1)),
|
||||||
?tp(resource_worker_flush_ack, #{batch_or_query => Batch}),
|
?tp(
|
||||||
|
resource_worker_flush_ack,
|
||||||
|
#{
|
||||||
|
batch_or_query => Batch,
|
||||||
|
result => Result
|
||||||
|
}
|
||||||
|
),
|
||||||
CurrentCount = queue_count(Q1),
|
CurrentCount = queue_count(Q1),
|
||||||
case {CurrentCount > 0, CurrentCount >= BatchSize} of
|
case {CurrentCount > 0, CurrentCount >= BatchSize} of
|
||||||
{false, _} ->
|
{false, _} ->
|
||||||
|
@ -597,54 +606,79 @@ do_flush(Data0, #{
|
||||||
end.
|
end.
|
||||||
|
|
||||||
batch_reply_caller(Id, BatchResult, Batch) ->
|
batch_reply_caller(Id, BatchResult, Batch) ->
|
||||||
{ShouldBlock, PostFns} = batch_reply_caller_defer_metrics(Id, BatchResult, Batch),
|
{ShouldBlock, PostFn} = batch_reply_caller_defer_metrics(Id, BatchResult, Batch),
|
||||||
lists:foreach(fun(F) -> F() end, PostFns),
|
PostFn(),
|
||||||
ShouldBlock.
|
ShouldBlock.
|
||||||
|
|
||||||
batch_reply_caller_defer_metrics(Id, BatchResult, Batch) ->
|
batch_reply_caller_defer_metrics(Id, BatchResult, Batch) ->
|
||||||
|
{ShouldAck, PostFns} =
|
||||||
lists:foldl(
|
lists:foldl(
|
||||||
fun(Reply, {_ShouldBlock, PostFns}) ->
|
fun(Reply, {_ShouldAck, PostFns}) ->
|
||||||
{ShouldBlock, PostFn} = reply_caller_defer_metrics(Id, Reply),
|
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply),
|
||||||
{ShouldBlock, [PostFn | PostFns]}
|
{ShouldAck, [PostFn | PostFns]}
|
||||||
end,
|
end,
|
||||||
{ack, []},
|
{ack, []},
|
||||||
%% the `Mod:on_batch_query/3` returns a single result for a batch,
|
%% the `Mod:on_batch_query/3` returns a single result for a batch,
|
||||||
%% so we need to expand
|
%% so we need to expand
|
||||||
?EXPAND(BatchResult, Batch)
|
?EXPAND(BatchResult, Batch)
|
||||||
).
|
),
|
||||||
|
PostFn = fun() -> lists:foreach(fun(F) -> F() end, PostFns) end,
|
||||||
|
{ShouldAck, PostFn}.
|
||||||
|
|
||||||
reply_caller(Id, Reply) ->
|
reply_caller(Id, Reply) ->
|
||||||
{ShouldBlock, PostFn} = reply_caller_defer_metrics(Id, Reply),
|
{ShouldAck, PostFn} = reply_caller_defer_metrics(Id, Reply),
|
||||||
PostFn(),
|
PostFn(),
|
||||||
ShouldBlock.
|
ShouldAck.
|
||||||
|
|
||||||
|
%% Should only reply to the caller when the decision is final (not
|
||||||
|
%% retriable). See comment on `handle_query_result_pure'.
|
||||||
reply_caller_defer_metrics(Id, ?REPLY(undefined, _, HasBeenSent, Result)) ->
|
reply_caller_defer_metrics(Id, ?REPLY(undefined, _, HasBeenSent, Result)) ->
|
||||||
handle_query_result_pure(Id, Result, HasBeenSent);
|
handle_query_result_pure(Id, Result, HasBeenSent);
|
||||||
reply_caller_defer_metrics(Id, ?REPLY({ReplyFun, Args}, _, HasBeenSent, Result)) when
|
reply_caller_defer_metrics(Id, ?REPLY({ReplyFun, Args}, _, HasBeenSent, Result)) when
|
||||||
is_function(ReplyFun)
|
is_function(ReplyFun)
|
||||||
->
|
->
|
||||||
_ =
|
{ShouldAck, PostFn} = handle_query_result_pure(Id, Result, HasBeenSent),
|
||||||
case Result of
|
case {ShouldAck, Result} of
|
||||||
{async_return, _} -> no_reply_for_now;
|
{nack, _} ->
|
||||||
_ -> apply(ReplyFun, Args ++ [Result])
|
ok;
|
||||||
|
{ack, {async_return, _}} ->
|
||||||
|
ok;
|
||||||
|
{ack, _} ->
|
||||||
|
apply(ReplyFun, Args ++ [Result]),
|
||||||
|
ok
|
||||||
end,
|
end,
|
||||||
handle_query_result_pure(Id, Result, HasBeenSent);
|
{ShouldAck, PostFn};
|
||||||
reply_caller_defer_metrics(Id, ?REPLY(From, _, HasBeenSent, Result)) ->
|
reply_caller_defer_metrics(Id, ?REPLY(From, _, HasBeenSent, Result)) ->
|
||||||
|
{ShouldAck, PostFn} = handle_query_result_pure(Id, Result, HasBeenSent),
|
||||||
|
case {ShouldAck, Result} of
|
||||||
|
{nack, _} ->
|
||||||
|
ok;
|
||||||
|
{ack, {async_return, _}} ->
|
||||||
|
ok;
|
||||||
|
{ack, _} ->
|
||||||
gen_statem:reply(From, Result),
|
gen_statem:reply(From, Result),
|
||||||
handle_query_result_pure(Id, Result, HasBeenSent).
|
ok
|
||||||
|
end,
|
||||||
|
{ShouldAck, PostFn}.
|
||||||
|
|
||||||
handle_query_result(Id, Result, HasBeenSent) ->
|
handle_query_result(Id, Result, HasBeenSent) ->
|
||||||
{ShouldBlock, PostFn} = handle_query_result_pure(Id, Result, HasBeenSent),
|
{ShouldBlock, PostFn} = handle_query_result_pure(Id, Result, HasBeenSent),
|
||||||
PostFn(),
|
PostFn(),
|
||||||
ShouldBlock.
|
ShouldBlock.
|
||||||
|
|
||||||
handle_query_result_pure(Id, ?RESOURCE_ERROR_M(exception, Msg), HasBeenSent) ->
|
%% We should always retry (nack), except when:
|
||||||
|
%% * resource is not found
|
||||||
|
%% * resource is stopped
|
||||||
|
%% * the result is a success (or at least a delayed result)
|
||||||
|
%% We also retry even sync requests. In that case, we shouldn't reply
|
||||||
|
%% the caller until one of those final results above happen.
|
||||||
|
handle_query_result_pure(_Id, ?RESOURCE_ERROR_M(exception, Msg), _HasBeenSent) ->
|
||||||
PostFn = fun() ->
|
PostFn = fun() ->
|
||||||
?SLOG(error, #{msg => resource_exception, info => Msg}),
|
?SLOG(error, #{msg => resource_exception, info => Msg}),
|
||||||
inc_sent_failed(Id, HasBeenSent),
|
%% inc_sent_failed(Id, HasBeenSent),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{ack, PostFn};
|
{nack, PostFn};
|
||||||
handle_query_result_pure(_Id, ?RESOURCE_ERROR_M(NotWorking, _), _HasBeenSent) when
|
handle_query_result_pure(_Id, ?RESOURCE_ERROR_M(NotWorking, _), _HasBeenSent) when
|
||||||
NotWorking == not_connected; NotWorking == blocked
|
NotWorking == not_connected; NotWorking == blocked
|
||||||
->
|
->
|
||||||
|
@ -666,10 +700,12 @@ handle_query_result_pure(Id, ?RESOURCE_ERROR_M(stopped, Msg), _HasBeenSent) ->
|
||||||
handle_query_result_pure(Id, ?RESOURCE_ERROR_M(Reason, _), _HasBeenSent) ->
|
handle_query_result_pure(Id, ?RESOURCE_ERROR_M(Reason, _), _HasBeenSent) ->
|
||||||
PostFn = fun() ->
|
PostFn = fun() ->
|
||||||
?SLOG(error, #{id => Id, msg => other_resource_error, reason => Reason}),
|
?SLOG(error, #{id => Id, msg => other_resource_error, reason => Reason}),
|
||||||
emqx_resource_metrics:dropped_other_inc(Id),
|
%% emqx_resource_metrics:dropped_other_inc(Id),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{ack, PostFn};
|
{nack, PostFn};
|
||||||
|
%% TODO: invert this logic: we should differentiate errors that are
|
||||||
|
%% irrecoverable; all others are deemed recoverable.
|
||||||
handle_query_result_pure(Id, {error, {recoverable_error, Reason}}, _HasBeenSent) ->
|
handle_query_result_pure(Id, {error, {recoverable_error, Reason}}, _HasBeenSent) ->
|
||||||
%% the message will be queued in replayq or inflight window,
|
%% the message will be queued in replayq or inflight window,
|
||||||
%% i.e. the counter 'queuing' or 'dropped' will increase, so we pretend that we have not
|
%% i.e. the counter 'queuing' or 'dropped' will increase, so we pretend that we have not
|
||||||
|
@ -679,22 +715,18 @@ handle_query_result_pure(Id, {error, {recoverable_error, Reason}}, _HasBeenSent)
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{nack, PostFn};
|
{nack, PostFn};
|
||||||
handle_query_result_pure(Id, {error, Reason}, HasBeenSent) ->
|
handle_query_result_pure(Id, {error, Reason}, _HasBeenSent) ->
|
||||||
PostFn = fun() ->
|
PostFn = fun() ->
|
||||||
?SLOG(error, #{id => Id, msg => send_error, reason => Reason}),
|
?SLOG(error, #{id => Id, msg => send_error, reason => Reason}),
|
||||||
inc_sent_failed(Id, HasBeenSent),
|
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{ack, PostFn};
|
{nack, PostFn};
|
||||||
handle_query_result_pure(_Id, {async_return, inflight_full}, _HasBeenSent) ->
|
handle_query_result_pure(Id, {async_return, {error, Msg}}, _HasBeenSent) ->
|
||||||
{nack, fun() -> ok end};
|
|
||||||
handle_query_result_pure(Id, {async_return, {error, Msg}}, HasBeenSent) ->
|
|
||||||
PostFn = fun() ->
|
PostFn = fun() ->
|
||||||
?SLOG(error, #{id => Id, msg => async_send_error, info => Msg}),
|
?SLOG(error, #{id => Id, msg => async_send_error, info => Msg}),
|
||||||
inc_sent_failed(Id, HasBeenSent),
|
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
{ack, PostFn};
|
{nack, PostFn};
|
||||||
handle_query_result_pure(_Id, {async_return, ok}, _HasBeenSent) ->
|
handle_query_result_pure(_Id, {async_return, ok}, _HasBeenSent) ->
|
||||||
{ack, fun() -> ok end};
|
{ack, fun() -> ok end};
|
||||||
handle_query_result_pure(_Id, {async_return, {ok, Pid}}, _HasBeenSent) when is_pid(Pid) ->
|
handle_query_result_pure(_Id, {async_return, {ok, Pid}}, _HasBeenSent) when is_pid(Pid) ->
|
||||||
|
@ -714,18 +746,6 @@ handle_async_worker_down(Data0, Pid) ->
|
||||||
cancel_inflight_items(Data, WorkerMRef),
|
cancel_inflight_items(Data, WorkerMRef),
|
||||||
{keep_state, Data}.
|
{keep_state, Data}.
|
||||||
|
|
||||||
is_not_connected_result(?RESOURCE_ERROR_M(Error, _)) when
|
|
||||||
Error =:= not_connected; Error =:= blocked
|
|
||||||
->
|
|
||||||
true;
|
|
||||||
is_not_connected_result(_) ->
|
|
||||||
false.
|
|
||||||
|
|
||||||
is_recoverable_error_result({error, {recoverable_error, _Reason}}) ->
|
|
||||||
true;
|
|
||||||
is_recoverable_error_result(_) ->
|
|
||||||
false.
|
|
||||||
|
|
||||||
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
||||||
?tp(call_query_enter, #{id => Id, query => Query}),
|
?tp(call_query_enter, #{id => Id, query => Query}),
|
||||||
case emqx_resource_manager:ets_lookup(Id) of
|
case emqx_resource_manager:ets_lookup(Id) of
|
||||||
|
@ -735,8 +755,9 @@ call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
||||||
configured -> maps:get(query_mode, Data);
|
configured -> maps:get(query_mode, Data);
|
||||||
_ -> QM0
|
_ -> QM0
|
||||||
end,
|
end,
|
||||||
CM = maps:get(callback_mode, Data),
|
CBM = maps:get(callback_mode, Data),
|
||||||
apply_query_fun(call_mode(QM, CM), Mod, Id, Index, Ref, Query, ResSt, QueryOpts);
|
CallMode = call_mode(QM, CBM),
|
||||||
|
apply_query_fun(CallMode, Mod, Id, Index, Ref, Query, ResSt, QueryOpts);
|
||||||
{ok, _Group, #{status := stopped}} ->
|
{ok, _Group, #{status := stopped}} ->
|
||||||
?RESOURCE_ERROR(stopped, "resource stopped or disabled");
|
?RESOURCE_ERROR(stopped, "resource stopped or disabled");
|
||||||
{ok, _Group, #{status := S}} when S == connecting; S == disconnected ->
|
{ok, _Group, #{status := S}} when S == connecting; S == disconnected ->
|
||||||
|
@ -763,20 +784,9 @@ call_query(QM0, Id, Index, Ref, Query, QueryOpts) ->
|
||||||
end
|
end
|
||||||
).
|
).
|
||||||
|
|
||||||
apply_query_fun(sync, Mod, Id, Index, Ref, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts) ->
|
apply_query_fun(sync, Mod, Id, _Index, _Ref, ?QUERY(_, Request, _) = _Query, ResSt, _QueryOpts) ->
|
||||||
?tp(call_query, #{id => Id, mod => Mod, query => Query, res_st => ResSt, call_mode => sync}),
|
?tp(call_query, #{id => Id, mod => Mod, query => _Query, res_st => ResSt, call_mode => sync}),
|
||||||
InflightTID = maps:get(inflight_tid, QueryOpts, undefined),
|
?APPLY_RESOURCE(call_query, Mod:on_query(Id, Request, ResSt), Request);
|
||||||
?APPLY_RESOURCE(
|
|
||||||
call_query,
|
|
||||||
begin
|
|
||||||
IsRetriable = false,
|
|
||||||
WorkerMRef = undefined,
|
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Query, IsRetriable, WorkerMRef),
|
|
||||||
ok = inflight_append(InflightTID, InflightItem, Id, Index),
|
|
||||||
Mod:on_query(Id, Request, ResSt)
|
|
||||||
end,
|
|
||||||
Request
|
|
||||||
);
|
|
||||||
apply_query_fun(async, Mod, Id, Index, Ref, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts) ->
|
apply_query_fun(async, Mod, Id, Index, Ref, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts) ->
|
||||||
?tp(call_query_async, #{
|
?tp(call_query_async, #{
|
||||||
id => Id, mod => Mod, query => Query, res_st => ResSt, call_mode => async
|
id => Id, mod => Mod, query => Query, res_st => ResSt, call_mode => async
|
||||||
|
@ -796,23 +806,12 @@ apply_query_fun(async, Mod, Id, Index, Ref, ?QUERY(_, Request, _) = Query, ResSt
|
||||||
end,
|
end,
|
||||||
Request
|
Request
|
||||||
);
|
);
|
||||||
apply_query_fun(sync, Mod, Id, Index, Ref, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts) ->
|
apply_query_fun(sync, Mod, Id, _Index, _Ref, [?QUERY(_, _, _) | _] = Batch, ResSt, _QueryOpts) ->
|
||||||
?tp(call_batch_query, #{
|
?tp(call_batch_query, #{
|
||||||
id => Id, mod => Mod, batch => Batch, res_st => ResSt, call_mode => sync
|
id => Id, mod => Mod, batch => Batch, res_st => ResSt, call_mode => sync
|
||||||
}),
|
}),
|
||||||
InflightTID = maps:get(inflight_tid, QueryOpts, undefined),
|
|
||||||
Requests = [Request || ?QUERY(_From, Request, _) <- Batch],
|
Requests = [Request || ?QUERY(_From, Request, _) <- Batch],
|
||||||
?APPLY_RESOURCE(
|
?APPLY_RESOURCE(call_batch_query, Mod:on_batch_query(Id, Requests, ResSt), Batch);
|
||||||
call_batch_query,
|
|
||||||
begin
|
|
||||||
IsRetriable = false,
|
|
||||||
WorkerMRef = undefined,
|
|
||||||
InflightItem = ?INFLIGHT_ITEM(Ref, Batch, IsRetriable, WorkerMRef),
|
|
||||||
ok = inflight_append(InflightTID, InflightItem, Id, Index),
|
|
||||||
Mod:on_batch_query(Id, Requests, ResSt)
|
|
||||||
end,
|
|
||||||
Batch
|
|
||||||
);
|
|
||||||
apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts) ->
|
apply_query_fun(async, Mod, Id, Index, Ref, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts) ->
|
||||||
?tp(call_batch_query_async, #{
|
?tp(call_batch_query_async, #{
|
||||||
id => Id, mod => Mod, batch => Batch, res_st => ResSt, call_mode => async
|
id => Id, mod => Mod, batch => Batch, res_st => ResSt, call_mode => async
|
||||||
|
@ -839,27 +838,27 @@ reply_after_query(Pid, Id, Index, InflightTID, Ref, ?QUERY(From, Request, HasBee
|
||||||
%% but received no ACK, NOT the number of messages queued in the
|
%% but received no ACK, NOT the number of messages queued in the
|
||||||
%% inflight window.
|
%% inflight window.
|
||||||
{Action, PostFn} = reply_caller_defer_metrics(Id, ?REPLY(From, Request, HasBeenSent, Result)),
|
{Action, PostFn} = reply_caller_defer_metrics(Id, ?REPLY(From, Request, HasBeenSent, Result)),
|
||||||
%% Should always ack async inflight requests that
|
|
||||||
%% returned, otherwise the request will get retried. The
|
|
||||||
%% caller has just been notified of the failure and should
|
|
||||||
%% decide if it wants to retry or not.
|
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
|
||||||
IsAcked andalso PostFn(),
|
|
||||||
case Action of
|
case Action of
|
||||||
nack ->
|
nack ->
|
||||||
|
%% Keep retrying.
|
||||||
?tp(resource_worker_reply_after_query, #{
|
?tp(resource_worker_reply_after_query, #{
|
||||||
action => nack,
|
action => Action,
|
||||||
batch_or_query => ?QUERY(From, Request, HasBeenSent),
|
batch_or_query => ?QUERY(From, Request, HasBeenSent),
|
||||||
|
ref => Ref,
|
||||||
result => Result
|
result => Result
|
||||||
}),
|
}),
|
||||||
|
mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
?MODULE:block(Pid);
|
?MODULE:block(Pid);
|
||||||
ack ->
|
ack ->
|
||||||
?tp(resource_worker_reply_after_query, #{
|
?tp(resource_worker_reply_after_query, #{
|
||||||
action => ack,
|
action => Action,
|
||||||
batch_or_query => ?QUERY(From, Request, HasBeenSent),
|
batch_or_query => ?QUERY(From, Request, HasBeenSent),
|
||||||
|
ref => Ref,
|
||||||
result => Result
|
result => Result
|
||||||
}),
|
}),
|
||||||
|
IsFullBefore = is_inflight_full(InflightTID),
|
||||||
|
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
|
IsAcked andalso PostFn(),
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
||||||
ok
|
ok
|
||||||
end.
|
end.
|
||||||
|
@ -868,24 +867,28 @@ batch_reply_after_query(Pid, Id, Index, InflightTID, Ref, Batch, Result) ->
|
||||||
%% NOTE: 'inflight' is the count of messages that were sent async
|
%% NOTE: 'inflight' is the count of messages that were sent async
|
||||||
%% but received no ACK, NOT the number of messages queued in the
|
%% but received no ACK, NOT the number of messages queued in the
|
||||||
%% inflight window.
|
%% inflight window.
|
||||||
{Action, PostFns} = batch_reply_caller_defer_metrics(Id, Result, Batch),
|
{Action, PostFn} = batch_reply_caller_defer_metrics(Id, Result, Batch),
|
||||||
%% Should always ack async inflight requests that
|
|
||||||
%% returned, otherwise the request will get retried. The
|
|
||||||
%% caller has just been notified of the failure and should
|
|
||||||
%% decide if it wants to retry or not.
|
|
||||||
IsFullBefore = is_inflight_full(InflightTID),
|
|
||||||
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
|
||||||
IsAcked andalso lists:foreach(fun(F) -> F() end, PostFns),
|
|
||||||
case Action of
|
case Action of
|
||||||
nack ->
|
nack ->
|
||||||
|
%% Keep retrying.
|
||||||
?tp(resource_worker_reply_after_query, #{
|
?tp(resource_worker_reply_after_query, #{
|
||||||
action => nack, batch_or_query => Batch, result => Result
|
action => nack,
|
||||||
|
batch_or_query => Batch,
|
||||||
|
ref => Ref,
|
||||||
|
result => Result
|
||||||
}),
|
}),
|
||||||
|
mark_inflight_as_retriable(InflightTID, Ref),
|
||||||
?MODULE:block(Pid);
|
?MODULE:block(Pid);
|
||||||
ack ->
|
ack ->
|
||||||
?tp(resource_worker_reply_after_query, #{
|
?tp(resource_worker_reply_after_query, #{
|
||||||
action => ack, batch_or_query => Batch, result => Result
|
action => ack,
|
||||||
|
batch_or_query => Batch,
|
||||||
|
ref => Ref,
|
||||||
|
result => Result
|
||||||
}),
|
}),
|
||||||
|
IsFullBefore = is_inflight_full(InflightTID),
|
||||||
|
IsAcked = ack_inflight(InflightTID, Ref, Id, Index),
|
||||||
|
IsAcked andalso PostFn(),
|
||||||
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
IsFullBefore andalso ?MODULE:flush_worker(Pid),
|
||||||
ok
|
ok
|
||||||
end.
|
end.
|
||||||
|
@ -919,7 +922,14 @@ append_queue(Id, Index, Q, Queries) when not is_binary(Q) ->
|
||||||
Q1
|
Q1
|
||||||
end,
|
end,
|
||||||
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q2)),
|
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q2)),
|
||||||
?tp(resource_worker_appended_to_queue, #{id => Id, items => Queries}),
|
?tp(
|
||||||
|
resource_worker_appended_to_queue,
|
||||||
|
#{
|
||||||
|
id => Id,
|
||||||
|
items => Queries,
|
||||||
|
queue_count => queue_count(Q2)
|
||||||
|
}
|
||||||
|
),
|
||||||
Q2.
|
Q2.
|
||||||
|
|
||||||
%%==============================================================================
|
%%==============================================================================
|
||||||
|
@ -1110,11 +1120,6 @@ do_cancel_inflight_item(Data, Ref) ->
|
||||||
|
|
||||||
%%==============================================================================
|
%%==============================================================================
|
||||||
|
|
||||||
inc_sent_failed(Id, _HasBeenSent = true) ->
|
|
||||||
emqx_resource_metrics:retried_failed_inc(Id);
|
|
||||||
inc_sent_failed(Id, _HasBeenSent) ->
|
|
||||||
emqx_resource_metrics:failed_inc(Id).
|
|
||||||
|
|
||||||
inc_sent_success(Id, _HasBeenSent = true) ->
|
inc_sent_success(Id, _HasBeenSent = true) ->
|
||||||
emqx_resource_metrics:retried_success_inc(Id);
|
emqx_resource_metrics:retried_success_inc(Id);
|
||||||
inc_sent_success(Id, _HasBeenSent) ->
|
inc_sent_success(Id, _HasBeenSent) ->
|
||||||
|
|
|
@ -48,6 +48,7 @@ fields("creation_opts") ->
|
||||||
{health_check_interval, fun health_check_interval/1},
|
{health_check_interval, fun health_check_interval/1},
|
||||||
{auto_restart_interval, fun auto_restart_interval/1},
|
{auto_restart_interval, fun auto_restart_interval/1},
|
||||||
{query_mode, fun query_mode/1},
|
{query_mode, fun query_mode/1},
|
||||||
|
{request_timeout, fun request_timeout/1},
|
||||||
{async_inflight_window, fun async_inflight_window/1},
|
{async_inflight_window, fun async_inflight_window/1},
|
||||||
{enable_batch, fun enable_batch/1},
|
{enable_batch, fun enable_batch/1},
|
||||||
{batch_size, fun batch_size/1},
|
{batch_size, fun batch_size/1},
|
||||||
|
@ -80,6 +81,11 @@ query_mode(default) -> async;
|
||||||
query_mode(required) -> false;
|
query_mode(required) -> false;
|
||||||
query_mode(_) -> undefined.
|
query_mode(_) -> undefined.
|
||||||
|
|
||||||
|
request_timeout(type) -> hoconsc:union([infinity, emqx_schema:duration_ms()]);
|
||||||
|
request_timeout(desc) -> ?DESC("request_timeout");
|
||||||
|
request_timeout(default) -> <<"15s">>;
|
||||||
|
request_timeout(_) -> undefined.
|
||||||
|
|
||||||
enable_batch(type) -> boolean();
|
enable_batch(type) -> boolean();
|
||||||
enable_batch(required) -> false;
|
enable_batch(required) -> false;
|
||||||
enable_batch(default) -> true;
|
enable_batch(default) -> true;
|
||||||
|
|
|
@ -259,6 +259,9 @@ counter_loop(
|
||||||
apply_reply(ReplyFun, ok),
|
apply_reply(ReplyFun, ok),
|
||||||
?tp(connector_demo_inc_counter_async, #{n => N}),
|
?tp(connector_demo_inc_counter_async, #{n => N}),
|
||||||
State#{counter => Num + N};
|
State#{counter => Num + N};
|
||||||
|
{big_payload, _Payload, ReplyFun} when Status == blocked ->
|
||||||
|
apply_reply(ReplyFun, {error, blocked}),
|
||||||
|
State;
|
||||||
{{FromPid, ReqRef}, {inc, N}} when Status == running ->
|
{{FromPid, ReqRef}, {inc, N}} when Status == running ->
|
||||||
%ct:pal("sync counter recv: ~p", [{inc, N}]),
|
%ct:pal("sync counter recv: ~p", [{inc, N}]),
|
||||||
FromPid ! {ReqRef, ok},
|
FromPid ! {ReqRef, ok},
|
||||||
|
@ -269,6 +272,9 @@ counter_loop(
|
||||||
{{FromPid, ReqRef}, {big_payload, _Payload}} when Status == blocked ->
|
{{FromPid, ReqRef}, {big_payload, _Payload}} when Status == blocked ->
|
||||||
FromPid ! {ReqRef, incorrect_status},
|
FromPid ! {ReqRef, incorrect_status},
|
||||||
State#{incorrect_status_count := IncorrectCount + 1};
|
State#{incorrect_status_count := IncorrectCount + 1};
|
||||||
|
{{FromPid, ReqRef}, {big_payload, _Payload}} when Status == running ->
|
||||||
|
FromPid ! {ReqRef, ok},
|
||||||
|
State;
|
||||||
{get, ReplyFun} ->
|
{get, ReplyFun} ->
|
||||||
apply_reply(ReplyFun, Num),
|
apply_reply(ReplyFun, Num),
|
||||||
State;
|
State;
|
||||||
|
|
|
@ -411,35 +411,18 @@ t_query_counter_async_inflight(_) ->
|
||||||
|
|
||||||
%% send async query to make the inflight window full
|
%% send async query to make the inflight window full
|
||||||
?check_trace(
|
?check_trace(
|
||||||
begin
|
{_, {ok, _}} =
|
||||||
{ok, SRef} = snabbkaffe:subscribe(
|
?wait_async_action(
|
||||||
?match_event(
|
|
||||||
#{
|
|
||||||
?snk_kind := resource_worker_appended_to_inflight,
|
|
||||||
is_new := true
|
|
||||||
}
|
|
||||||
),
|
|
||||||
WindowSize,
|
|
||||||
_Timeout = 5_000
|
|
||||||
),
|
|
||||||
inc_counter_in_parallel(WindowSize, ReqOpts),
|
inc_counter_in_parallel(WindowSize, ReqOpts),
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef),
|
#{?snk_kind := resource_worker_flush_but_inflight_full},
|
||||||
ok
|
1_000
|
||||||
end,
|
),
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
QueryTrace = ?of_kind(call_query_async, Trace),
|
QueryTrace = ?of_kind(call_query_async, Trace),
|
||||||
?assertMatch([#{query := {query, _, {inc_counter, 1}, _}} | _], QueryTrace)
|
?assertMatch([#{query := {query, _, {inc_counter, 1}, _}} | _], QueryTrace)
|
||||||
end
|
end
|
||||||
),
|
),
|
||||||
tap_metrics(?LINE),
|
tap_metrics(?LINE),
|
||||||
|
|
||||||
%% this will block the resource_worker as the inflight window is full now
|
|
||||||
{ok, {ok, _}} =
|
|
||||||
?wait_async_action(
|
|
||||||
emqx_resource:query(?ID, {inc_counter, 199}),
|
|
||||||
#{?snk_kind := resource_worker_flush_but_inflight_full},
|
|
||||||
1_000
|
|
||||||
),
|
|
||||||
?assertMatch(0, ets:info(Tab0, size)),
|
?assertMatch(0, ets:info(Tab0, size)),
|
||||||
|
|
||||||
tap_metrics(?LINE),
|
tap_metrics(?LINE),
|
||||||
|
@ -464,9 +447,9 @@ t_query_counter_async_inflight(_) ->
|
||||||
%% all responses should be received after the resource is resumed.
|
%% all responses should be received after the resource is resumed.
|
||||||
{ok, SRef0} = snabbkaffe:subscribe(
|
{ok, SRef0} = snabbkaffe:subscribe(
|
||||||
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
||||||
%% +2 because the tmp_query above will be retried and succeed
|
%% +1 because the tmp_query above will be retried and succeed
|
||||||
%% this time, and there was the inc 199 request as well.
|
%% this time.
|
||||||
WindowSize + 2,
|
WindowSize + 1,
|
||||||
_Timeout0 = 10_000
|
_Timeout0 = 10_000
|
||||||
),
|
),
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
||||||
|
@ -504,8 +487,12 @@ t_query_counter_async_inflight(_) ->
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
||||||
%% again, send async query to make the inflight window full
|
%% again, send async query to make the inflight window full
|
||||||
?check_trace(
|
?check_trace(
|
||||||
?TRACE_OPTS,
|
{_, {ok, _}} =
|
||||||
|
?wait_async_action(
|
||||||
inc_counter_in_parallel(WindowSize, ReqOpts),
|
inc_counter_in_parallel(WindowSize, ReqOpts),
|
||||||
|
#{?snk_kind := resource_worker_flush_but_inflight_full},
|
||||||
|
1_000
|
||||||
|
),
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
QueryTrace = ?of_kind(call_query_async, Trace),
|
QueryTrace = ?of_kind(call_query_async, Trace),
|
||||||
?assertMatch([#{query := {query, _, {inc_counter, 1}, _}} | _], QueryTrace)
|
?assertMatch([#{query := {query, _, {inc_counter, 1}, _}} | _], QueryTrace)
|
||||||
|
@ -584,7 +571,7 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
end,
|
end,
|
||||||
ReqOpts = fun() -> #{async_reply_fun => {Insert0, [Tab0, make_ref()]}} end,
|
ReqOpts = fun() -> #{async_reply_fun => {Insert0, [Tab0, make_ref()]}} end,
|
||||||
BatchSize = 2,
|
BatchSize = 2,
|
||||||
WindowSize = 3,
|
WindowSize = 15,
|
||||||
{ok, _} = emqx_resource:create_local(
|
{ok, _} = emqx_resource:create_local(
|
||||||
?ID,
|
?ID,
|
||||||
?DEFAULT_RESOURCE_GROUP,
|
?DEFAULT_RESOURCE_GROUP,
|
||||||
|
@ -606,16 +593,12 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
%% send async query to make the inflight window full
|
%% send async query to make the inflight window full
|
||||||
NumMsgs = BatchSize * WindowSize,
|
NumMsgs = BatchSize * WindowSize,
|
||||||
?check_trace(
|
?check_trace(
|
||||||
begin
|
{_, {ok, _}} =
|
||||||
{ok, SRef} = snabbkaffe:subscribe(
|
?wait_async_action(
|
||||||
?match_event(#{?snk_kind := call_batch_query_async}),
|
|
||||||
WindowSize,
|
|
||||||
_Timeout = 60_000
|
|
||||||
),
|
|
||||||
inc_counter_in_parallel(NumMsgs, ReqOpts),
|
inc_counter_in_parallel(NumMsgs, ReqOpts),
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef),
|
#{?snk_kind := resource_worker_flush_but_inflight_full},
|
||||||
ok
|
5_000
|
||||||
end,
|
),
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
QueryTrace = ?of_kind(call_batch_query_async, Trace),
|
QueryTrace = ?of_kind(call_batch_query_async, Trace),
|
||||||
?assertMatch(
|
?assertMatch(
|
||||||
|
@ -674,7 +657,7 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
%% +1 because the tmp_query above will be retried and succeed
|
%% +1 because the tmp_query above will be retried and succeed
|
||||||
%% this time.
|
%% this time.
|
||||||
WindowSize + 1,
|
WindowSize + 1,
|
||||||
_Timeout = 60_000
|
10_000
|
||||||
),
|
),
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
||||||
tap_metrics(?LINE),
|
tap_metrics(?LINE),
|
||||||
|
@ -695,7 +678,7 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
{ok, SRef} = snabbkaffe:subscribe(
|
{ok, SRef} = snabbkaffe:subscribe(
|
||||||
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
||||||
NumBatches1,
|
NumBatches1,
|
||||||
_Timeout = 60_000
|
10_000
|
||||||
),
|
),
|
||||||
inc_counter_in_parallel(NumMsgs1, ReqOpts),
|
inc_counter_in_parallel(NumMsgs1, ReqOpts),
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef),
|
{ok, _} = snabbkaffe:receive_events(SRef),
|
||||||
|
@ -720,8 +703,12 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
||||||
%% again, send async query to make the inflight window full
|
%% again, send async query to make the inflight window full
|
||||||
?check_trace(
|
?check_trace(
|
||||||
?TRACE_OPTS,
|
{_, {ok, _}} =
|
||||||
inc_counter_in_parallel(WindowSize, ReqOpts),
|
?wait_async_action(
|
||||||
|
inc_counter_in_parallel(NumMsgs, ReqOpts),
|
||||||
|
#{?snk_kind := resource_worker_flush_but_inflight_full},
|
||||||
|
5_000
|
||||||
|
),
|
||||||
fun(Trace) ->
|
fun(Trace) ->
|
||||||
QueryTrace = ?of_kind(call_batch_query_async, Trace),
|
QueryTrace = ?of_kind(call_batch_query_async, Trace),
|
||||||
?assertMatch(
|
?assertMatch(
|
||||||
|
@ -734,11 +721,11 @@ t_query_counter_async_inflight_batch(_) ->
|
||||||
%% this will block the resource_worker
|
%% this will block the resource_worker
|
||||||
ok = emqx_resource:query(?ID, {inc_counter, 1}),
|
ok = emqx_resource:query(?ID, {inc_counter, 1}),
|
||||||
|
|
||||||
Sent = NumMsgs + NumMsgs1 + WindowSize,
|
Sent = NumMsgs + NumMsgs1 + NumMsgs,
|
||||||
{ok, SRef1} = snabbkaffe:subscribe(
|
{ok, SRef1} = snabbkaffe:subscribe(
|
||||||
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
?match_event(#{?snk_kind := connector_demo_inc_counter_async}),
|
||||||
WindowSize,
|
WindowSize,
|
||||||
_Timeout = 60_000
|
10_000
|
||||||
),
|
),
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, resume)),
|
||||||
{ok, _} = snabbkaffe:receive_events(SRef1),
|
{ok, _} = snabbkaffe:receive_events(SRef1),
|
||||||
|
@ -785,10 +772,8 @@ t_healthy_timeout(_) ->
|
||||||
%% the ?TEST_RESOURCE always returns the `Mod:on_get_status/2` 300ms later.
|
%% the ?TEST_RESOURCE always returns the `Mod:on_get_status/2` 300ms later.
|
||||||
#{health_check_interval => 200}
|
#{health_check_interval => 200}
|
||||||
),
|
),
|
||||||
?assertMatch(
|
?assertError(timeout, emqx_resource:query(?ID, get_state, #{timeout => 1_000})),
|
||||||
?RESOURCE_ERROR(not_connected),
|
?assertMatch({ok, _Group, #{status := disconnected}}, emqx_resource_manager:ets_lookup(?ID)),
|
||||||
emqx_resource:query(?ID, get_state)
|
|
||||||
),
|
|
||||||
ok = emqx_resource:remove_local(?ID).
|
ok = emqx_resource:remove_local(?ID).
|
||||||
|
|
||||||
t_healthy(_) ->
|
t_healthy(_) ->
|
||||||
|
@ -1131,6 +1116,7 @@ t_retry_batch(_Config) ->
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
t_delete_and_re_create_with_same_name(_Config) ->
|
t_delete_and_re_create_with_same_name(_Config) ->
|
||||||
|
NumBufferWorkers = 2,
|
||||||
{ok, _} = emqx_resource:create(
|
{ok, _} = emqx_resource:create(
|
||||||
?ID,
|
?ID,
|
||||||
?DEFAULT_RESOURCE_GROUP,
|
?DEFAULT_RESOURCE_GROUP,
|
||||||
|
@ -1139,7 +1125,7 @@ t_delete_and_re_create_with_same_name(_Config) ->
|
||||||
#{
|
#{
|
||||||
query_mode => sync,
|
query_mode => sync,
|
||||||
batch_size => 1,
|
batch_size => 1,
|
||||||
worker_pool_size => 2,
|
worker_pool_size => NumBufferWorkers,
|
||||||
queue_seg_bytes => 100,
|
queue_seg_bytes => 100,
|
||||||
resume_interval => 1_000
|
resume_interval => 1_000
|
||||||
}
|
}
|
||||||
|
@ -1154,19 +1140,21 @@ t_delete_and_re_create_with_same_name(_Config) ->
|
||||||
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
?assertMatch(ok, emqx_resource:simple_sync_query(?ID, block)),
|
||||||
NumRequests = 10,
|
NumRequests = 10,
|
||||||
{ok, SRef} = snabbkaffe:subscribe(
|
{ok, SRef} = snabbkaffe:subscribe(
|
||||||
?match_event(#{?snk_kind := resource_worker_appended_to_queue}),
|
?match_event(#{?snk_kind := resource_worker_enter_blocked}),
|
||||||
NumRequests,
|
NumBufferWorkers,
|
||||||
_Timeout = 5_000
|
_Timeout = 5_000
|
||||||
),
|
),
|
||||||
%% ensure replayq offloads to disk
|
%% ensure replayq offloads to disk
|
||||||
Payload = binary:copy(<<"a">>, 119),
|
Payload = binary:copy(<<"a">>, 119),
|
||||||
lists:foreach(
|
lists:foreach(
|
||||||
fun(N) ->
|
fun(N) ->
|
||||||
|
spawn_link(fun() ->
|
||||||
{error, _} =
|
{error, _} =
|
||||||
emqx_resource:query(
|
emqx_resource:query(
|
||||||
?ID,
|
?ID,
|
||||||
{big_payload, <<(integer_to_binary(N))/binary, Payload/binary>>}
|
{big_payload, <<(integer_to_binary(N))/binary, Payload/binary>>}
|
||||||
)
|
)
|
||||||
|
end)
|
||||||
end,
|
end,
|
||||||
lists:seq(1, NumRequests)
|
lists:seq(1, NumRequests)
|
||||||
),
|
),
|
||||||
|
@ -1177,10 +1165,11 @@ t_delete_and_re_create_with_same_name(_Config) ->
|
||||||
tap_metrics(?LINE),
|
tap_metrics(?LINE),
|
||||||
Queuing1 = emqx_resource_metrics:queuing_get(?ID),
|
Queuing1 = emqx_resource_metrics:queuing_get(?ID),
|
||||||
Inflight1 = emqx_resource_metrics:inflight_get(?ID),
|
Inflight1 = emqx_resource_metrics:inflight_get(?ID),
|
||||||
?assertEqual(NumRequests - 1, Queuing1),
|
?assert(Queuing1 > 0),
|
||||||
?assertEqual(1, Inflight1),
|
?assertEqual(2, Inflight1),
|
||||||
|
|
||||||
%% now, we delete the resource
|
%% now, we delete the resource
|
||||||
|
process_flag(trap_exit, true),
|
||||||
ok = emqx_resource:remove_local(?ID),
|
ok = emqx_resource:remove_local(?ID),
|
||||||
?assertEqual({error, not_found}, emqx_resource_manager:lookup(?ID)),
|
?assertEqual({error, not_found}, emqx_resource_manager:lookup(?ID)),
|
||||||
|
|
||||||
|
@ -1275,9 +1264,13 @@ t_retry_sync_inflight(_Config) ->
|
||||||
%% now really make the resource go into `blocked' state.
|
%% now really make the resource go into `blocked' state.
|
||||||
%% this results in a retriable error when sync.
|
%% this results in a retriable error when sync.
|
||||||
ok = emqx_resource:simple_sync_query(?ID, block),
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
{{error, {recoverable_error, incorrect_status}}, {ok, _}} =
|
TestPid = self(),
|
||||||
|
{_, {ok, _}} =
|
||||||
?wait_async_action(
|
?wait_async_action(
|
||||||
emqx_resource:query(?ID, {big_payload, <<"a">>}, QueryOpts),
|
spawn_link(fun() ->
|
||||||
|
Res = emqx_resource:query(?ID, {big_payload, <<"a">>}, QueryOpts),
|
||||||
|
TestPid ! {res, Res}
|
||||||
|
end),
|
||||||
#{?snk_kind := resource_worker_retry_inflight_failed},
|
#{?snk_kind := resource_worker_retry_inflight_failed},
|
||||||
ResumeInterval * 2
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
|
@ -1287,9 +1280,15 @@ t_retry_sync_inflight(_Config) ->
|
||||||
#{?snk_kind := resource_worker_retry_inflight_succeeded},
|
#{?snk_kind := resource_worker_retry_inflight_succeeded},
|
||||||
ResumeInterval * 3
|
ResumeInterval * 3
|
||||||
),
|
),
|
||||||
|
receive
|
||||||
|
{res, Res} ->
|
||||||
|
?assertEqual(ok, Res)
|
||||||
|
after 5_000 ->
|
||||||
|
ct:fail("no response")
|
||||||
|
end,
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[fun ?MODULE:assert_retry_fail_then_succeed_inflight/1]
|
[fun ?MODULE:assert_sync_retry_fail_then_succeed_inflight/1]
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
@ -1312,12 +1311,17 @@ t_retry_sync_inflight_batch(_Config) ->
|
||||||
QueryOpts = #{},
|
QueryOpts = #{},
|
||||||
?check_trace(
|
?check_trace(
|
||||||
begin
|
begin
|
||||||
%% now really make the resource go into `blocked' state.
|
%% make the resource go into `blocked' state. this
|
||||||
%% this results in a retriable error when sync.
|
%% results in a retriable error when sync.
|
||||||
ok = emqx_resource:simple_sync_query(?ID, block),
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
{{error, {recoverable_error, incorrect_status}}, {ok, _}} =
|
process_flag(trap_exit, true),
|
||||||
|
TestPid = self(),
|
||||||
|
{_, {ok, _}} =
|
||||||
?wait_async_action(
|
?wait_async_action(
|
||||||
emqx_resource:query(?ID, {big_payload, <<"a">>}, QueryOpts),
|
spawn_link(fun() ->
|
||||||
|
Res = emqx_resource:query(?ID, {big_payload, <<"a">>}, QueryOpts),
|
||||||
|
TestPid ! {res, Res}
|
||||||
|
end),
|
||||||
#{?snk_kind := resource_worker_retry_inflight_failed},
|
#{?snk_kind := resource_worker_retry_inflight_failed},
|
||||||
ResumeInterval * 2
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
|
@ -1327,13 +1331,19 @@ t_retry_sync_inflight_batch(_Config) ->
|
||||||
#{?snk_kind := resource_worker_retry_inflight_succeeded},
|
#{?snk_kind := resource_worker_retry_inflight_succeeded},
|
||||||
ResumeInterval * 3
|
ResumeInterval * 3
|
||||||
),
|
),
|
||||||
|
receive
|
||||||
|
{res, Res} ->
|
||||||
|
?assertEqual(ok, Res)
|
||||||
|
after 5_000 ->
|
||||||
|
ct:fail("no response")
|
||||||
|
end,
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[fun ?MODULE:assert_retry_fail_then_succeed_inflight/1]
|
[fun ?MODULE:assert_sync_retry_fail_then_succeed_inflight/1]
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
t_dont_retry_async_inflight(_Config) ->
|
t_retry_async_inflight(_Config) ->
|
||||||
ResumeInterval = 1_000,
|
ResumeInterval = 1_000,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
{ok, _} = emqx_resource:create(
|
{ok, _} = emqx_resource:create(
|
||||||
|
@ -1351,33 +1361,31 @@ t_dont_retry_async_inflight(_Config) ->
|
||||||
QueryOpts = #{},
|
QueryOpts = #{},
|
||||||
?check_trace(
|
?check_trace(
|
||||||
begin
|
begin
|
||||||
%% block,
|
%% block
|
||||||
{ok, {ok, _}} =
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
?wait_async_action(
|
|
||||||
emqx_resource:query(?ID, block_now),
|
|
||||||
#{?snk_kind := resource_worker_enter_blocked},
|
|
||||||
ResumeInterval * 2
|
|
||||||
),
|
|
||||||
|
|
||||||
%% then send an async request; that shouldn't be retriable.
|
%% then send an async request; that should be retriable.
|
||||||
{ok, {ok, _}} =
|
{ok, {ok, _}} =
|
||||||
?wait_async_action(
|
?wait_async_action(
|
||||||
emqx_resource:query(?ID, {big_payload, <<"b">>}, QueryOpts),
|
emqx_resource:query(?ID, {big_payload, <<"b">>}, QueryOpts),
|
||||||
#{?snk_kind := resource_worker_flush_ack},
|
#{?snk_kind := resource_worker_retry_inflight_failed},
|
||||||
ResumeInterval * 2
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
|
|
||||||
%% will re-enter running because the single request is not retriable
|
%% will reply with success after the resource is healed
|
||||||
{ok, _} = ?block_until(
|
{ok, {ok, _}} =
|
||||||
#{?snk_kind := resource_worker_enter_running}, ResumeInterval * 2
|
?wait_async_action(
|
||||||
|
emqx_resource:simple_sync_query(?ID, resume),
|
||||||
|
#{?snk_kind := resource_worker_enter_running},
|
||||||
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[fun ?MODULE:assert_no_retry_inflight/1]
|
[fun ?MODULE:assert_async_retry_fail_then_succeed_inflight/1]
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
t_dont_retry_async_inflight_batch(_Config) ->
|
t_retry_async_inflight_batch(_Config) ->
|
||||||
ResumeInterval = 1_000,
|
ResumeInterval = 1_000,
|
||||||
emqx_connector_demo:set_callback_mode(async_if_possible),
|
emqx_connector_demo:set_callback_mode(async_if_possible),
|
||||||
{ok, _} = emqx_resource:create(
|
{ok, _} = emqx_resource:create(
|
||||||
|
@ -1396,29 +1404,27 @@ t_dont_retry_async_inflight_batch(_Config) ->
|
||||||
QueryOpts = #{},
|
QueryOpts = #{},
|
||||||
?check_trace(
|
?check_trace(
|
||||||
begin
|
begin
|
||||||
%% block,
|
%% block
|
||||||
{ok, {ok, _}} =
|
ok = emqx_resource:simple_sync_query(?ID, block),
|
||||||
?wait_async_action(
|
|
||||||
emqx_resource:query(?ID, block_now),
|
|
||||||
#{?snk_kind := resource_worker_enter_blocked},
|
|
||||||
ResumeInterval * 2
|
|
||||||
),
|
|
||||||
|
|
||||||
%% then send an async request; that shouldn't be retriable.
|
%% then send an async request; that should be retriable.
|
||||||
{ok, {ok, _}} =
|
{ok, {ok, _}} =
|
||||||
?wait_async_action(
|
?wait_async_action(
|
||||||
emqx_resource:query(?ID, {big_payload, <<"b">>}, QueryOpts),
|
emqx_resource:query(?ID, {big_payload, <<"b">>}, QueryOpts),
|
||||||
#{?snk_kind := resource_worker_flush_ack},
|
#{?snk_kind := resource_worker_retry_inflight_failed},
|
||||||
ResumeInterval * 2
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
|
|
||||||
%% will re-enter running because the single request is not retriable
|
%% will reply with success after the resource is healed
|
||||||
{ok, _} = ?block_until(
|
{ok, {ok, _}} =
|
||||||
#{?snk_kind := resource_worker_enter_running}, ResumeInterval * 2
|
?wait_async_action(
|
||||||
|
emqx_resource:simple_sync_query(?ID, resume),
|
||||||
|
#{?snk_kind := resource_worker_enter_running},
|
||||||
|
ResumeInterval * 2
|
||||||
),
|
),
|
||||||
ok
|
ok
|
||||||
end,
|
end,
|
||||||
[fun ?MODULE:assert_no_retry_inflight/1]
|
[fun ?MODULE:assert_async_retry_fail_then_succeed_inflight/1]
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
@ -1529,7 +1535,8 @@ inc_counter_in_parallel(N, Opts0) ->
|
||||||
ct:fail({wait_for_query_timeout, Pid})
|
ct:fail({wait_for_query_timeout, Pid})
|
||||||
end
|
end
|
||||||
|| Pid <- Pids
|
|| Pid <- Pids
|
||||||
].
|
],
|
||||||
|
ok.
|
||||||
|
|
||||||
inc_counter_in_parallel_increasing(N, StartN, Opts0) ->
|
inc_counter_in_parallel_increasing(N, StartN, Opts0) ->
|
||||||
Parent = self(),
|
Parent = self(),
|
||||||
|
@ -1566,12 +1573,8 @@ tap_metrics(Line) ->
|
||||||
ct:pal("metrics (l. ~b): ~p", [Line, #{counters => C, gauges => G}]),
|
ct:pal("metrics (l. ~b): ~p", [Line, #{counters => C, gauges => G}]),
|
||||||
#{counters => C, gauges => G}.
|
#{counters => C, gauges => G}.
|
||||||
|
|
||||||
assert_no_retry_inflight(Trace) ->
|
assert_sync_retry_fail_then_succeed_inflight(Trace) ->
|
||||||
?assertEqual([], ?of_kind(resource_worker_retry_inflight_failed, Trace)),
|
ct:pal(" ~p", [Trace]),
|
||||||
?assertEqual([], ?of_kind(resource_worker_retry_inflight_succeeded, Trace)),
|
|
||||||
ok.
|
|
||||||
|
|
||||||
assert_retry_fail_then_succeed_inflight(Trace) ->
|
|
||||||
?assert(
|
?assert(
|
||||||
?strict_causality(
|
?strict_causality(
|
||||||
#{?snk_kind := resource_worker_flush_nack, ref := _Ref},
|
#{?snk_kind := resource_worker_flush_nack, ref := _Ref},
|
||||||
|
@ -1589,3 +1592,23 @@ assert_retry_fail_then_succeed_inflight(Trace) ->
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
assert_async_retry_fail_then_succeed_inflight(Trace) ->
|
||||||
|
ct:pal(" ~p", [Trace]),
|
||||||
|
?assert(
|
||||||
|
?strict_causality(
|
||||||
|
#{?snk_kind := resource_worker_reply_after_query, action := nack, ref := _Ref},
|
||||||
|
#{?snk_kind := resource_worker_retry_inflight_failed, ref := _Ref},
|
||||||
|
Trace
|
||||||
|
)
|
||||||
|
),
|
||||||
|
%% not strict causality because it might retry more than once
|
||||||
|
%% before restoring the resource health.
|
||||||
|
?assert(
|
||||||
|
?causality(
|
||||||
|
#{?snk_kind := resource_worker_retry_inflight_failed, ref := _Ref},
|
||||||
|
#{?snk_kind := resource_worker_retry_inflight_succeeded, ref := _Ref},
|
||||||
|
Trace
|
||||||
|
)
|
||||||
|
),
|
||||||
|
ok.
|
||||||
|
|
|
@ -108,6 +108,7 @@ end_per_group(_Group, _Config) ->
|
||||||
init_per_testcase(TestCase, Config0) when
|
init_per_testcase(TestCase, Config0) when
|
||||||
TestCase =:= t_publish_success_batch
|
TestCase =:= t_publish_success_batch
|
||||||
->
|
->
|
||||||
|
ct:timetrap({seconds, 30}),
|
||||||
case ?config(batch_size, Config0) of
|
case ?config(batch_size, Config0) of
|
||||||
1 ->
|
1 ->
|
||||||
[{skip_due_to_no_batching, true}];
|
[{skip_due_to_no_batching, true}];
|
||||||
|
@ -120,6 +121,7 @@ init_per_testcase(TestCase, Config0) when
|
||||||
[{telemetry_table, Tid} | Config]
|
[{telemetry_table, Tid} | Config]
|
||||||
end;
|
end;
|
||||||
init_per_testcase(TestCase, Config0) ->
|
init_per_testcase(TestCase, Config0) ->
|
||||||
|
ct:timetrap({seconds, 30}),
|
||||||
{ok, _} = start_echo_http_server(),
|
{ok, _} = start_echo_http_server(),
|
||||||
delete_all_bridges(),
|
delete_all_bridges(),
|
||||||
Tid = install_telemetry_handler(TestCase),
|
Tid = install_telemetry_handler(TestCase),
|
||||||
|
@ -283,6 +285,7 @@ gcp_pubsub_config(Config) ->
|
||||||
" pool_size = 1\n"
|
" pool_size = 1\n"
|
||||||
" pipelining = ~b\n"
|
" pipelining = ~b\n"
|
||||||
" resource_opts = {\n"
|
" resource_opts = {\n"
|
||||||
|
" request_timeout = 500ms\n"
|
||||||
" worker_pool_size = 1\n"
|
" worker_pool_size = 1\n"
|
||||||
" query_mode = ~s\n"
|
" query_mode = ~s\n"
|
||||||
" batch_size = ~b\n"
|
" batch_size = ~b\n"
|
||||||
|
@ -1266,7 +1269,6 @@ t_failure_no_body(Config) ->
|
||||||
|
|
||||||
t_unrecoverable_error(Config) ->
|
t_unrecoverable_error(Config) ->
|
||||||
ResourceId = ?config(resource_id, Config),
|
ResourceId = ?config(resource_id, Config),
|
||||||
TelemetryTable = ?config(telemetry_table, Config),
|
|
||||||
QueryMode = ?config(query_mode, Config),
|
QueryMode = ?config(query_mode, Config),
|
||||||
TestPid = self(),
|
TestPid = self(),
|
||||||
FailureNoBodyHandler =
|
FailureNoBodyHandler =
|
||||||
|
@ -1328,26 +1330,14 @@ t_unrecoverable_error(Config) ->
|
||||||
ok
|
ok
|
||||||
end
|
end
|
||||||
),
|
),
|
||||||
wait_telemetry_event(TelemetryTable, failed, ResourceId),
|
|
||||||
ExpectedInflightEvents =
|
wait_until_gauge_is(queuing, 0, _Timeout = 400),
|
||||||
case QueryMode of
|
wait_until_gauge_is(inflight, 1, _Timeout = 400),
|
||||||
sync -> 1;
|
|
||||||
async -> 3
|
|
||||||
end,
|
|
||||||
wait_telemetry_event(
|
|
||||||
TelemetryTable,
|
|
||||||
inflight,
|
|
||||||
ResourceId,
|
|
||||||
#{n_events => ExpectedInflightEvents, timeout => 5_000}
|
|
||||||
),
|
|
||||||
%% even waiting, hard to avoid flakiness... simpler to just sleep
|
|
||||||
%% a bit until stabilization.
|
|
||||||
ct:sleep(200),
|
|
||||||
assert_metrics(
|
assert_metrics(
|
||||||
#{
|
#{
|
||||||
dropped => 0,
|
dropped => 0,
|
||||||
failed => 1,
|
failed => 0,
|
||||||
inflight => 0,
|
inflight => 1,
|
||||||
matched => 1,
|
matched => 1,
|
||||||
queuing => 0,
|
queuing => 0,
|
||||||
retried => 0,
|
retried => 0,
|
||||||
|
|
Loading…
Reference in New Issue