From eb2d3a3b7e76bc55992e124687c97f875747fc4c Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 19 Jul 2024 10:53:30 -0300 Subject: [PATCH] chore: attempt to reduce race condition supervisor shutdown errors Fixes https://emqx.atlassian.net/browse/EMQX-12442 e.g.: ``` 2024-05-23T08:52:39.811845+00:00 [error] Supervisor: {local,emqx_resource_manager_sup}. Context: shutdown_error. Reason: noproc. Offender: id=<<99, 101, 110, 115, 111, 114, 101, 100>>,pid=<0.7752.1030>. ``` It could be just a race condition, as it seems to be the case for resource manager: i) a call is made to the process to stop it; ii) the call times out; iii) the after clause ends up calling supervisor:terminate_child; iv) while the supervisor is finding the child to terminate, the process actually finishes terminating, and the supervisor receives a noproc reason back. --- .../src/emqx_resource_manager.erl | 55 ++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index c3b746d8e..e06537568 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -59,7 +59,7 @@ ]). % Server --export([start_link/5]). +-export([start_link/5, where/1]). % Behaviour -export([init/1, callback_mode/0, handle_event/4, terminate/3]). @@ -147,6 +147,9 @@ %% API %%------------------------------------------------------------------------------ +where(ResId) -> + gproc:where(?NAME(ResId)). + %% @doc Called from emqx_resource when starting a resource instance. %% %% Triggers the emqx_resource_manager_sup supervisor to actually create @@ -268,17 +271,7 @@ remove(ResId) when is_binary(ResId) -> -spec remove(resource_id(), boolean()) -> ok | {error, Reason :: term()}. remove(ResId, ClearMetrics) when is_binary(ResId) -> try - case safe_call(ResId, {remove, ClearMetrics}, ?T_OPERATION) of - {error, timeout} -> - ?tp(error, "forcefully_stopping_resource_due_to_timeout", #{ - action => remove, - resource_id => ResId - }), - force_kill(ResId), - ok; - Res -> - Res - end + do_remove(ResId, ClearMetrics) after %% Ensure the supervisor has it removed, otherwise the immediate re-add will see a stale process %% If the 'remove' call above had succeeded, this is mostly a no-op but still needed to avoid race condition. @@ -286,6 +279,31 @@ remove(ResId, ClearMetrics) when is_binary(ResId) -> emqx_resource_manager_sup:delete_child(ResId) end. +do_remove(ResId, ClearMetrics) -> + case gproc:whereis_name(?NAME(ResId)) of + undefined -> + ok; + Pid when is_pid(Pid) -> + MRef = monitor(process, Pid), + case safe_call(ResId, {remove, ClearMetrics}, ?T_OPERATION) of + {error, timeout} -> + ?tp(error, "forcefully_stopping_resource_due_to_timeout", #{ + action => remove, + resource_id => ResId + }), + force_kill(ResId, MRef), + ok; + ok -> + receive + {'DOWN', MRef, process, Pid, _} -> + ok + end, + ok; + Res -> + Res + end + end. + %% @doc Stops and then starts an instance that was already running -spec restart(resource_id(), creation_opts()) -> ok | {error, Reason :: term()}. restart(ResId, Opts) when is_binary(ResId) -> @@ -323,7 +341,7 @@ stop(ResId, Timeout) -> action => stop, resource_id => ResId }), - force_kill(ResId), + force_kill(ResId, _MRef = undefined), ok; {error, _Reason} = Error -> Error @@ -460,12 +478,21 @@ get_error(ResId, #{added_channels := #{} = Channels} = ResourceData) when get_error(_ResId, #{error := Error}) -> Error. -force_kill(ResId) -> +force_kill(ResId, MRef0) -> case gproc:whereis_name(?NAME(ResId)) of undefined -> ok; Pid when is_pid(Pid) -> + MRef = + case MRef0 of + undefined -> monitor(process, Pid); + _ -> MRef0 + end, exit(Pid, kill), + receive + {'DOWN', MRef, process, Pid, _} -> + ok + end, try_clean_allocated_resources(ResId), ok end.