fix: handle channel updated during health check

This commit fixes an issue found by CI test case
emqx_bridge_influxdb_SUITE:t_start_stop and others. While the channel
health check process is running, the channel could be removed or updated
which could cause a crash in the resource manager or non up-to-date
alarms being triggered.
This commit is contained in:
Kjell Winblad 2024-05-22 10:39:19 +02:00
parent 39d758c4d6
commit cff8b97e8a
1 changed files with 33 additions and 6 deletions

View File

@ -1228,14 +1228,29 @@ trigger_health_check_for_added_channels(Data0 = #data{hc_workers = HCWorkers0})
start_channel_health_check(Data1, ChannelId)
end.
-spec continue_channel_health_check_connected(channel_id(), channel_status_map(), data()) -> data().
continue_channel_health_check_connected(ChannelId, OldStatus, Data0) ->
-spec continue_channel_health_check_connected(
channel_id(), channel_status_map(), channel_status_map(), data()
) -> data().
continue_channel_health_check_connected(ChannelId, OldStatus, CurrentStatus, Data0) ->
#data{hc_workers = HCWorkers0} = Data0,
#{channel := CHCWorkers0} = HCWorkers0,
CHCWorkers = emqx_utils_maps:deep_remove([ongoing, ChannelId], CHCWorkers0),
Data1 = Data0#data{hc_workers = HCWorkers0#{channel := CHCWorkers}},
case OldStatus =:= CurrentStatus of
true ->
continue_channel_health_check_connected_no_update_during_check(
ChannelId, OldStatus, Data1
);
false ->
%% Channel has been updated while the health check process was working so
%% we should not clear any alarm or remove the channel from the
%% connector
Data1
end.
continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data1) ->
%% Remove the added channels with a status different from connected or connecting
NewStatus = maps:get(ChannelId, Data0#data.added_channels),
NewStatus = maps:get(ChannelId, Data1#data.added_channels),
ChannelsToRemove = [ChannelId || not channel_status_is_channel_added(NewStatus)],
Data = remove_channels_in_list(ChannelsToRemove, Data1, true),
%% Raise/clear alarms
@ -1299,13 +1314,23 @@ handle_channel_health_check_worker_down(Data0, WorkerRef, ExitResult) ->
CHCWorkers = CHCWorkers3#{pending := Rest},
HCWorkers = HCWorkers0#{channel := CHCWorkers},
Data3 = Data2#data{hc_workers = HCWorkers},
Data4 = continue_channel_health_check_connected(ChannelId, PreviousChanStatus, Data3),
Data4 = continue_channel_health_check_connected(
ChannelId,
PreviousChanStatus,
CurrentStatus,
Data3
),
Data = start_channel_health_check(Data4, NextChannelId),
{keep_state, update_state(Data, Data0), Replies};
#{pending := []} ->
HCWorkers = HCWorkers0#{channel := CHCWorkers3},
Data3 = Data2#data{hc_workers = HCWorkers},
Data = continue_channel_health_check_connected(ChannelId, PreviousChanStatus, Data3),
Data = continue_channel_health_check_connected(
ChannelId,
PreviousChanStatus,
CurrentStatus,
Data3
),
{keep_state, update_state(Data, Data0), Replies}
end.
@ -1326,7 +1351,9 @@ handle_channel_health_check_worker_down_new_channels_and_status(
) ->
%% The checked config is different from the current config which means we
%% should not update AddedChannels because the channel has been removed or
%% updated while the health check was in progress
%% updated while the health check was in progress. We can still reply with
%% NewStatus because the health check must have been issued before the
%% configuration changed or the channel got removed.
{AddedChannels, NewStatus}.
reply_pending_channel_health_check_callers(