fix(resource-manager): ensure no false creation
Update is implemented as remove + create. If a delete call is made while the create is in progress, the remove call is likely to time out too. This causes the following creation to falsely succeed, because there is already a running child under the supervisor. As a result, the resource is permanently removed after resource_manager eventually handles the remove call.
This commit is contained in:
parent
f5e5c59763
commit
0d8ffc0d59
|
@ -53,7 +53,18 @@
|
||||||
|
|
||||||
% State record
|
% State record
|
||||||
-record(data, {
|
-record(data, {
|
||||||
id, group, mod, callback_mode, query_mode, config, opts, status, state, error, pid
|
id,
|
||||||
|
group,
|
||||||
|
mod,
|
||||||
|
callback_mode,
|
||||||
|
query_mode,
|
||||||
|
config,
|
||||||
|
opts,
|
||||||
|
status,
|
||||||
|
state,
|
||||||
|
error,
|
||||||
|
pid,
|
||||||
|
extra
|
||||||
}).
|
}).
|
||||||
-type data() :: #data{}.
|
-type data() :: #data{}.
|
||||||
|
|
||||||
|
@ -181,7 +192,15 @@ remove(ResId) when is_binary(ResId) ->
|
||||||
%% @doc Stops a running resource_manager and optionally clears the metrics for the resource
|
%% @doc Stops a running resource_manager and optionally clears the metrics for the resource
|
||||||
-spec remove(resource_id(), boolean()) -> ok | {error, Reason :: term()}.
|
-spec remove(resource_id(), boolean()) -> ok | {error, Reason :: term()}.
|
||||||
remove(ResId, ClearMetrics) when is_binary(ResId) ->
|
remove(ResId, ClearMetrics) when is_binary(ResId) ->
|
||||||
safe_call(ResId, {remove, ClearMetrics}, ?T_OPERATION).
|
ResourceManagerPid = gproc:whereis_name(?NAME(ResId)),
|
||||||
|
try
|
||||||
|
safe_call(ResId, {remove, ClearMetrics}, ?T_OPERATION)
|
||||||
|
after
|
||||||
|
%% Ensure the supervisor has it removed, otherwise the immediate re-add will see a stale process
|
||||||
|
%% If the 'remove' call above had succeeded, this is mostly a no-op but still needed to avoid a race condition.
|
||||||
|
%% Otherwise this is an 'infinity' shutdown, so it may take arbitrarily long.
|
||||||
|
emqx_resource_manager_sup:delete_child(ResourceManagerPid)
|
||||||
|
end.
|
||||||
|
|
||||||
%% @doc Stops and then starts an instance that was already running
|
%% @doc Stops and then starts an instance that was already running
|
||||||
-spec restart(resource_id(), creation_opts()) -> ok | {error, Reason :: term()}.
|
-spec restart(resource_id(), creation_opts()) -> ok | {error, Reason :: term()}.
|
||||||
|
@ -439,8 +458,10 @@ health_check_actions(Data) ->
|
||||||
[{state_timeout, health_check_interval(Data#data.opts), health_check}].
|
[{state_timeout, health_check_interval(Data#data.opts), health_check}].
|
||||||
|
|
||||||
handle_remove_event(From, ClearMetrics, Data) ->
|
handle_remove_event(From, ClearMetrics, Data) ->
|
||||||
_ = stop_resource(Data),
|
%% stop the buffer workers first, brutal_kill, so it should be fast
|
||||||
ok = emqx_resource_buffer_worker_sup:stop_workers(Data#data.id, Data#data.opts),
|
ok = emqx_resource_buffer_worker_sup:stop_workers(Data#data.id, Data#data.opts),
|
||||||
|
%% now stop the resource, this can be slow
|
||||||
|
_ = stop_resource(Data),
|
||||||
case ClearMetrics of
|
case ClearMetrics of
|
||||||
true -> ok = emqx_metrics_worker:clear_metrics(?RES_METRICS, Data#data.id);
|
true -> ok = emqx_metrics_worker:clear_metrics(?RES_METRICS, Data#data.id);
|
||||||
false -> ok
|
false -> ok
|
||||||
|
|
Loading…
Reference in New Issue