From c75e9bbe0d1e53b7bd43a4819d0eda6d7689fe04 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Fri, 2 Jun 2023 11:32:24 +0200 Subject: [PATCH] fix(emqx_cm): fix channel data registration race-condition when clustered, there is a chance that an mqtt client process gets killed (e.g. for holding the channel registration lock for too long); if the channel data inserts happen before casting out the message for channel process monitoring, there is a chance for stale channel data to be left in the ets tables indefinitely. this commit changes the order of the non-atomic operations: it casts out the monitor request message before inserting channel data. --- apps/emqx/src/emqx_cm.erl | 4 +++- changes/ce/fix-10923.en.md | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 changes/ce/fix-10923.en.md diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index ebcf9c434..db3d48f5d 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -176,11 +176,13 @@ insert_channel_info(ClientId, Info, Stats) -> %% Note that: It should be called on a lock transaction register_channel(ClientId, ChanPid, #{conn_mod := ConnMod}) when is_pid(ChanPid) -> Chan = {ClientId, ChanPid}, + %% cast (for process monitor) before inserting ets tables + cast({registered, Chan}), true = ets:insert(?CHAN_TAB, Chan), true = ets:insert(?CHAN_CONN_TAB, {Chan, ConnMod}), ok = emqx_cm_registry:register_channel(Chan), mark_channel_connected(ChanPid), - cast({registered, Chan}). + ok. %% @doc Unregister a channel. -spec unregister_channel(emqx_types:clientid()) -> ok. diff --git a/changes/ce/fix-10923.en.md b/changes/ce/fix-10923.en.md new file mode 100644 index 000000000..accd547fc --- /dev/null +++ b/changes/ce/fix-10923.en.md @@ -0,0 +1,4 @@ +Fix a race-condition in channel info registration. 
+ +Prior to this fix, when the system is under heavy load, it might happen that a client is disconnected (or has its session expired) but can still be found on the clients page in the dashboard. +One of the possible reasons is a race condition fixed in this PR: the connection is killed in the middle of channel data registration.