From f1a2f354c4aa79a50511e6adb8ffb67fe4ae8018 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 4 Oct 2022 12:42:40 +0200 Subject: [PATCH 01/34] test: ensure module loaded for ct-slave node before starting apps --- test/emqx_node_helpers.erl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/emqx_node_helpers.erl b/test/emqx_node_helpers.erl index ad530b3d7..af7093dd7 100644 --- a/test/emqx_node_helpers.erl +++ b/test/emqx_node_helpers.erl @@ -40,7 +40,9 @@ start_slave(Name, Opts) -> {ok, _} -> ok; {error, started_not_connected, _} -> - ok + ok; + Other -> + throw(Other) end, pong = net_adm:ping(Node), setup_node(Node, Opts), @@ -92,7 +94,11 @@ setup_node(Node, #{} = Opts) -> end, EnvHandler = maps:get(env_handler, Opts, DefaultEnvHandler), - [ok = rpc:call(Node, application, load, [App]) || App <- [gen_rpc, emqx]], + %% apps need to be loaded before starting for ekka to find and create mnesia tables + LoadApps = lists:usort([gen_rpc, emqx] ++ ?SLAVE_START_APPS), + lists:foreach(fun(App) -> + rpc:call(Node, application, load, [App]) + end, LoadApps), ok = rpc:call(Node, emqx_ct_helpers, start_apps, [StartApps, EnvHandler]), case maps:get(no_join, Opts, false) of From 9989f4df7ef450bb16928acbf13cb842f78dcd39 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 4 Oct 2022 18:09:38 +0200 Subject: [PATCH 02/34] chore: fix shared subscription redispatch --- src/emqx_session.erl | 35 ++++++++++++-------------- src/emqx_shared_sub.erl | 54 +++++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/src/emqx_session.erl b/src/emqx_session.erl index 94483bb87..73eacea8b 100644 --- a/src/emqx_session.erl +++ b/src/emqx_session.erl @@ -638,26 +638,21 @@ run_terminate_hooks(ClientInfo, takeovered, Session) -> run_terminate_hooks(ClientInfo, Reason, Session) -> run_hook('session.terminated', [ClientInfo, Reason, info(Session)]). 
-redispatch_shared_messages(#session{inflight = Inflight}) -> - InflightList = emqx_inflight:to_list(Inflight), - lists:foreach(fun - %% Only QoS1 messages get redispatched, because QoS2 messages - %% must be sent to the same client, once they're in flight - ({_, {#message{qos = ?QOS_2} = Msg, _}}) -> - ?LOG(warning, "Not redispatching qos2 msg: ~s", [emqx_message:format(Msg)]); - ({_, {#message{topic = Topic, qos = ?QOS_1} = Msg, _}}) -> - case emqx_shared_sub:get_group(Msg) of - {ok, Group} -> - %% Note that dispatch is called with self() in failed subs - %% This is done to avoid dispatching back to caller - Delivery = #delivery{sender = self(), message = Msg}, - emqx_shared_sub:dispatch_to_non_self(Group, Topic, Delivery); - _ -> - false - end; - (_) -> - ok - end, InflightList). +redispatch_shared_messages(#session{inflight = Inflight, mqueue = Q}) -> + InflightList = lists:map(fun({_, {Msg, _Ts}}) -> Msg end, + emqx_inflight:to_list(sort_fun(), Inflight)), + MqList = mqueue_to_list(Q, []), + emqx_shared_sub:redispatch(InflightList ++ MqList). + +%% convert mqueue to a list +%% the messages at the head of the list is to be dispatched before the tail +mqueue_to_list(Q, Acc) -> + case emqx_mqueue:out(Q) of + {empty, _Q} -> + lists:reverse(Acc); + {{value, Msg}, Q1} -> + mqueue_to_list(Q1, [Msg | Acc]) + end. -compile({inline, [run_hook/2]}). run_hook(Name, Args) -> diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index cc57e001f..4987248cf 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -39,7 +39,7 @@ ]). -export([ dispatch/3 - , dispatch_to_non_self/3 + , redispatch/1 ]). -export([ maybe_ack/1 @@ -47,7 +47,6 @@ , nack_no_connection/1 , is_ack_required/1 , is_retry_dispatch/1 - , get_group/1 ]). %% for testing @@ -84,6 +83,7 @@ -define(ACK, shared_sub_ack). -define(NACK(Reason), {shared_sub_nack, Reason}). -define(NO_ACK, no_ack). +-define(REDISPATCH_TO(GROUP, TOPIC), {GROUP, TOPIC}). -record(state, {pmon}). 
@@ -134,11 +134,12 @@ dispatch(Group, Topic, Delivery) -> Strategy = strategy(Group), dispatch(Strategy, Group, Topic, Delivery, _FailedSubs = #{}). -dispatch(Strategy, Group, Topic, Delivery = #delivery{message = Msg}, FailedSubs) -> - #message{from = ClientId, topic = SourceTopic} = Msg, +dispatch(Strategy, Group, Topic, Delivery = #delivery{message = Msg0}, FailedSubs) -> + #message{from = ClientId, topic = SourceTopic} = Msg0, case pick(Strategy, ClientId, SourceTopic, Group, Topic, FailedSubs) of false -> {error, no_subscribers}; {Type, SubPid} -> + Msg = with_redispatch_to(Msg0, Group, Topic), case do_dispatch(SubPid, Group, Topic, Msg, Type) of ok -> {ok, 1}; {error, Reason} -> @@ -162,7 +163,7 @@ ack_enabled() -> emqx:get_env(shared_dispatch_ack_enabled, false). do_dispatch(SubPid, _Group, Topic, Msg, _Type) when SubPid =:= self() -> - %% Deadlock otherwise + %% dispatch without ack, deadlock otherwise send(SubPid, Topic, {deliver, Topic, Msg}); %% return either 'ok' (when everything is fine) or 'error' do_dispatch(SubPid, _Group, Topic, #message{qos = ?QOS_0} = Msg, _Type) -> @@ -176,6 +177,10 @@ do_dispatch(SubPid, Group, Topic, Msg, Type) -> send(SubPid, Topic, {deliver, Topic, Msg}) end. +with_redispatch_to(#message{qos = ?QOS_0} = Msg, _Group, _Topic) -> Msg; +with_redispatch_to(Msg, Group, Topic) -> + emqx_message:set_headers(#{redispatch_to => ?REDISPATCH_TO(Group, Topic)}, Msg). + dispatch_with_ack(SubPid, Group, Topic, Msg, Type) -> %% For QoS 1/2 message, expect an ack Ref = erlang:monitor(process, SubPid), @@ -228,13 +233,22 @@ without_group_ack(Msg) -> get_group_ack(Msg) -> emqx_message:get_header(shared_dispatch_ack, Msg, ?NO_ACK). --spec(get_group(emqx_types:message()) -> {ok, any()} | error). -get_group(Msg) -> - case get_group_ack(Msg) of - {_Sender, {_Type, Group, _Ref}} -> {ok, Group}; - _ -> error +%% @hidden Redispatch is neede only for the messages with redispatch_to header added. 
+is_redispatch_needed(Msg) -> + case get_redispatch_to(Msg) of + ?REDISPATCH_TO(_, _) -> + true; + _ -> + false end. +%% @hidden Return the `redispatch_to` group-topic in the message header. +%% `false` is returned if the message is not a shared dispatch. +%% or when it's a QoS 0 message. +-spec(get_redispatch_to(emqx_types:message()) -> emqx_types:topic() | false). +get_redispatch_to(Msg) -> + emqx_message:get_header(redispatch_to, Msg, false). + -spec(is_ack_required(emqx_types:message()) -> boolean()). is_ack_required(Msg) -> ?NO_ACK =/= get_group_ack(Msg). @@ -245,6 +259,26 @@ is_retry_dispatch(Msg) -> _ -> false end. +%% @doc Redispatch shared deliveries to other members in the group. +redispatch(Messages0) -> + Messages = lists:filter(fun is_redispatch_needed/1, Messages0), + case length(Messages) of + L when L > 0 -> + ?LOG(info, "Redispatching ~p shared subscription messages", [L]), + lists:foreach(fun redispatch_shared_message/1, Messages); + _ -> + ok + end. + +redispatch_shared_message(Msg) -> + %% As long as it's still a #message{} record in inflight, + %% we should try to re-dispatch + ?REDISPATCH_TO(Group, Topic) = get_redispatch_to(Msg), + %% Note that dispatch is called with self() in failed subs + %% This is done to avoid dispatching back to caller + Delivery = #delivery{sender = self(), message = Msg}, + dispatch_to_non_self(Group, Topic, Delivery). + %% @doc Negative ack dropped message due to inflight window or message queue being full. -spec(maybe_nack_dropped(emqx_types:message()) -> store | drop). 
maybe_nack_dropped(Msg) -> From ba1c276c75433c53902475a408733c842376b8a7 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 10:04:09 +0200 Subject: [PATCH 03/34] fix(typespec): fix type spec for emqx_shared_sub:redispatch_to --- src/emqx_shared_sub.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index 4987248cf..be897569a 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -85,6 +85,8 @@ -define(NO_ACK, no_ack). -define(REDISPATCH_TO(GROUP, TOPIC), {GROUP, TOPIC}). +-type redispatch_to() :: ?REDISPATCH_TO(emqx_topic:group(), emqx_topic:topic()). + -record(state, {pmon}). -record(emqx_shared_subscription, {group, topic, subpid}). @@ -245,7 +247,7 @@ is_redispatch_needed(Msg) -> %% @hidden Return the `redispatch_to` group-topic in the message header. %% `false` is returned if the message is not a shared dispatch. %% or when it's a QoS 0 message. --spec(get_redispatch_to(emqx_types:message()) -> emqx_types:topic() | false). +-spec(get_redispatch_to(emqx_types:message()) -> redispatch_to() | false). get_redispatch_to(Msg) -> emqx_message:get_header(redispatch_to, Msg, false). From 6769bd4edc598cf82f7b666718633ea2017ee054 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 10:30:59 +0200 Subject: [PATCH 04/34] fix(shared): drop pubrel from inflight collection before redispatch --- src/emqx_session.erl | 13 +++++++++++-- src/emqx_shared_sub.erl | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/emqx_session.erl b/src/emqx_session.erl index 73eacea8b..f87fc1f23 100644 --- a/src/emqx_session.erl +++ b/src/emqx_session.erl @@ -639,8 +639,17 @@ run_terminate_hooks(ClientInfo, Reason, Session) -> run_hook('session.terminated', [ClientInfo, Reason, info(Session)]). 
redispatch_shared_messages(#session{inflight = Inflight, mqueue = Q}) -> - InflightList = lists:map(fun({_, {Msg, _Ts}}) -> Msg end, - emqx_inflight:to_list(sort_fun(), Inflight)), + F = fun({_, {Msg, _Ts}}) -> + case Msg of + #message{} -> + {true, Msg}; + _ -> + %% QoS 2, after pubrec is received + %% the inflight record is updated to an atom + false + end + end, + InflightList = lists:filtermap(F, emqx_inflight:to_list(sort_fun(), Inflight)), MqList = mqueue_to_list(Q, []), emqx_shared_sub:redispatch(InflightList ++ MqList). diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index be897569a..8667ae6c4 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -236,7 +236,7 @@ get_group_ack(Msg) -> emqx_message:get_header(shared_dispatch_ack, Msg, ?NO_ACK). %% @hidden Redispatch is neede only for the messages with redispatch_to header added. -is_redispatch_needed(Msg) -> +is_redispatch_needed(#message{} = Msg) -> case get_redispatch_to(Msg) of ?REDISPATCH_TO(_, _) -> true; @@ -272,7 +272,7 @@ redispatch(Messages0) -> ok end. -redispatch_shared_message(Msg) -> +redispatch_shared_message(#message{} = Msg) -> %% As long as it's still a #message{} record in inflight, %% we should try to re-dispatch ?REDISPATCH_TO(Group, Topic) = get_redispatch_to(Msg), From 3339df8b249a0acba545292beabad15dd85daf2a Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 12:33:15 +0200 Subject: [PATCH 05/34] test: Add test case to cover shared sub QoS2 pubrel in inflights --- src/emqx_session.erl | 3 +- src/emqx_shared_sub.erl | 2 +- test/emqx_shared_sub_SUITE.erl | 192 +++++++++++++++++++++++++-------- 3 files changed, 149 insertions(+), 48 deletions(-) diff --git a/src/emqx_session.erl b/src/emqx_session.erl index f87fc1f23..40c8eacc0 100644 --- a/src/emqx_session.erl +++ b/src/emqx_session.erl @@ -639,6 +639,7 @@ run_terminate_hooks(ClientInfo, Reason, Session) -> run_hook('session.terminated', [ClientInfo, Reason, info(Session)]). 
redispatch_shared_messages(#session{inflight = Inflight, mqueue = Q}) -> + AllInflights = emqx_inflight:to_list(sort_fun(), Inflight), F = fun({_, {Msg, _Ts}}) -> case Msg of #message{} -> @@ -649,7 +650,7 @@ redispatch_shared_messages(#session{inflight = Inflight, mqueue = Q}) -> false end end, - InflightList = lists:filtermap(F, emqx_inflight:to_list(sort_fun(), Inflight)), + InflightList = lists:filtermap(F, AllInflights), MqList = mqueue_to_list(Q, []), emqx_shared_sub:redispatch(InflightList ++ MqList). diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index 8667ae6c4..6130ccc0a 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -266,7 +266,7 @@ redispatch(Messages0) -> Messages = lists:filter(fun is_redispatch_needed/1, Messages0), case length(Messages) of L when L > 0 -> - ?LOG(info, "Redispatching ~p shared subscription messages", [L]), + ?LOG(info, "Redispatching ~p shared subscription message(s)", [L]), lists:foreach(fun redispatch_shared_message/1, Messages); _ -> ok diff --git a/test/emqx_shared_sub_SUITE.erl b/test/emqx_shared_sub_SUITE.erl index 5e79ee983..d8a7d12b4 100644 --- a/test/emqx_shared_sub_SUITE.erl +++ b/test/emqx_shared_sub_SUITE.erl @@ -25,13 +25,24 @@ -define(SUITE, ?MODULE). --define(wait(For, Timeout), - emqx_ct_helpers:wait_for( - ?FUNCTION_NAME, ?LINE, fun() -> For end, Timeout)). - -define(ack, shared_sub_ack). -define(no_ack, no_ack). +-define(WAIT(TIMEOUT, PATTERN, Res), + (fun() -> + receive + PATTERN -> + Res; + Other -> + ct:fail(#{expected => ??PATTERN, + got => Other + }) + after + TIMEOUT -> + ct:fail({timeout, ??PATTERN}) + end + end)()). + all() -> emqx_ct:all(?SUITE). 
init_per_suite(Config) -> @@ -135,40 +146,7 @@ t_no_connection_nack(_) -> SendF(1), ct:sleep(200), %% This is the connection which was picked by broker to dispatch (sticky) for 1st message - ?assertMatch([#{packet_id := 1}], recv_msgs(1)), - %% Now kill the connection, expect all following messages to be delivered to the other - %% subscriber. - %emqx_mock_client:stop(ConnPid), - %% sleep then make synced calls to session processes to ensure that - %% the connection pid's 'EXIT' message is propagated to the session process - %% also to be sure sessions are still alive - % timer:sleep(2), - % _ = emqx_session:info(SPid1), - % _ = emqx_session:info(SPid2), - % %% Now we know what is the other still alive connection - % [TheOtherConnPid] = [SubConnPid1, SubConnPid2] -- [ConnPid], - % %% Send some more messages - % PacketIdList = lists:seq(2, 10), - % lists:foreach(fun(Id) -> - % SendF(Id), - % ?wait(Received(Id, TheOtherConnPid), 1000) - % end, PacketIdList), - % %% Now close the 2nd (last connection) - % emqx_mock_client:stop(TheOtherConnPid), - % timer:sleep(2), - % %% both sessions should have conn_pid = undefined - % ?assertEqual({conn_pid, undefined}, lists:keyfind(conn_pid, 1, emqx_session:info(SPid1))), - % ?assertEqual({conn_pid, undefined}, lists:keyfind(conn_pid, 1, emqx_session:info(SPid2))), - % %% send more messages, but all should be queued in session state - % lists:foreach(fun(Id) -> SendF(Id) end, PacketIdList), - % {_, L1} = lists:keyfind(mqueue_len, 1, emqx_session:info(SPid1)), - % {_, L2} = lists:keyfind(mqueue_len, 1, emqx_session:info(SPid2)), - % ?assertEqual(length(PacketIdList), L1 + L2), - % %% clean up - % emqx_mock_client:close_session(PubConnPid), - % emqx_sm:close_session(SPid1), - % emqx_sm:close_session(SPid2), ok. 
t_random(_) -> @@ -422,8 +400,14 @@ t_local_fallback(_) -> %% This one tests that broker tries to select another shared subscriber %% If the first one doesn't return an ACK -t_redispatch(_) -> - ok = ensure_config(sticky, true), +t_redispatch_with_ack(Config) -> + test_redispatch(Config, true). + +t_redispatch_no_ack(Config) -> + test_redispatch(Config, false). + +test_redispatch(_Config, AckEnabled) -> + ok = ensure_config(sticky, AckEnabled), application:set_env(emqx, shared_dispatch_ack_enabled, true), Group = <<"group1">>, @@ -453,15 +437,55 @@ t_redispatch(_) -> emqtt:stop(UsedSubPid2), ok. -t_dispatch_when_inflights_are_full(_) -> - ok = ensure_config(round_robin, true), +t_redispatch_wildcard_with_ack(Config) -> + redispatch_wildcard(Config, true). + +t_redispatch_wildcard_no_ack(Config) -> + redispatch_wildcard(Config, false). + +%% This one tests that broker tries to redispatch to another member in the group +%% if the first one disconnected before acking (auto_ack set to false) +redispatch_wildcard(_Config, AckEnabled) -> + ok = ensure_config(sticky, AckEnabled), + + Group = <<"group1">>, + + Topic = <<"foo/bar/1">>, + ClientId1 = <<"ClientId1">>, + ClientId2 = <<"ClientId2">>, + {ok, ConnPid1} = emqtt:start_link([{clientid, ClientId1}, {auto_ack, false}]), + {ok, ConnPid2} = emqtt:start_link([{clientid, ClientId2}, {auto_ack, false}]), + {ok, _} = emqtt:connect(ConnPid1), + {ok, _} = emqtt:connect(ConnPid2), + + emqtt:subscribe(ConnPid1, {<<"$share/", Group/binary, "/foo/bar/#">>, 1}), + emqtt:subscribe(ConnPid2, {<<"$share/", Group/binary, "/foo/bar/#">>, 1}), + + Message = emqx_message:make(ClientId1, 1, Topic, <<"hello1">>), + + emqx:publish(Message), + + {true, UsedSubPid1} = last_message(<<"hello1">>, [ConnPid1, ConnPid2]), + ok = emqtt:stop(UsedSubPid1), + + Res = last_message(<<"hello1">>, [ConnPid1, ConnPid2], 6000), + ?assertMatch({true, Pid} when Pid =/= UsedSubPid1, Res), + + {true, UsedSubPid2} = Res, + emqtt:stop(UsedSubPid2), + ok. 
+ +t_dispatch_when_inflights_are_full_with_ack(Config) when is_list(Config) -> + ok = ensure_config(round_robin, _AckEnabled = true), Topic = <<"foo/bar">>, ClientId1 = <<"ClientId1">>, ClientId2 = <<"ClientId2">>, - %% Note that max_inflight is 1 - {ok, ConnPid1} = emqtt:start_link([{clientid, ClientId1}, {max_inflight, 1}]), - {ok, ConnPid2} = emqtt:start_link([{clientid, ClientId2}, {max_inflight, 1}]), + %% make sure broker does not push more than one inflight + meck:new(emqx_zone, [passthrough, no_history]), + meck:expect(emqx_zone, max_inflight, fun(_Zone) -> 1 end), + {ok, ConnPid1} = emqtt:start_link([{clientid, ClientId1}]), + {ok, ConnPid2} = emqtt:start_link([{clientid, ClientId2}]), {ok, _} = emqtt:connect(ConnPid1), {ok, _} = emqtt:connect(ConnPid2), @@ -484,8 +508,7 @@ t_dispatch_when_inflights_are_full(_) -> ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message2)), %% Now kill any client - erlang:exit(ConnPid1, normal), - ct:sleep(100), + ok = kill_process(ConnPid1), %% And try to send the message ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message3)), @@ -497,13 +520,90 @@ t_dispatch_when_inflights_are_full(_) -> ?assertMatch({true, ConnPid2}, last_message(<<"hello3">>, [ConnPid1, ConnPid2])), ?assertMatch({true, ConnPid2}, last_message(<<"hello4">>, [ConnPid1, ConnPid2])), + meck:unload(emqx_zone), emqtt:stop(ConnPid2), ok. 
+%% No ack, QoS 2 subscriptions, +%% client1 receives one message, send pubrec, then suspend +%% client2 acts normal (aot_ack=true) +%% Expected behaviour: +%% the messages sent to client1's inflight and mq are re-dispatched after client1 is down +t_dispatch_qos2(Config) when is_list(Config) -> + ok = ensure_config(round_robin, _AckEnabled = false), + Topic = <<"foo/bar/1">>, + ClientId1 = <<"ClientId1">>, + ClientId2 = <<"ClientId2">>, + meck:new(emqx_zone, [passthrough, no_history]), + meck:expect(emqx_zone, max_inflight, fun(_Zone) -> 1 end), + + {ok, ConnPid1} = emqtt:start_link([{clientid, ClientId1}, {auto_ack, false}]), + {ok, ConnPid2} = emqtt:start_link([{clientid, ClientId2}, {auto_ack, true}]), + {ok, _} = emqtt:connect(ConnPid1), + {ok, _} = emqtt:connect(ConnPid2), + + emqtt:subscribe(ConnPid1, {<<"$share/group/foo/bar/#">>, 2}), + emqtt:subscribe(ConnPid2, {<<"$share/group/foo/bar/#">>, 2}), + + Message1 = emqx_message:make(ClientId1, 2, Topic, <<"hello1">>), + Message2 = emqx_message:make(ClientId1, 2, Topic, <<"hello2">>), + Message3 = emqx_message:make(ClientId1, 2, Topic, <<"hello3">>), + Message4 = emqx_message:make(ClientId1, 2, Topic, <<"hello4">>), + ct:sleep(100), + + ok = sys:suspend(ConnPid1), + + %% One message is inflight + ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message1)), + ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message2)), + ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message3)), + ?assertMatch([{_, _, {ok, 1}}], emqx:publish(Message4)), + + MsgRec1 = ?WAIT(2000, {publish, #{client_pid := ConnPid2, payload := P1}}, P1), + MsgRec2 = ?WAIT(2000, {publish, #{client_pid := ConnPid2, payload := P2}}, P2), + %% assert hello2 > hello1 or hello4 > hello3 + ?assert(MsgRec2 > MsgRec1), + + sys:resume(ConnPid1), + %% emqtt automatically send PUBREC, but since auto_ack is set to false + %% so it will never send PUBCOMP, hence EMQX should not attempt to send + %% the 4th message yet since max_inflight is 1. 
+ MsgRec3 = ?WAIT(2000, {publish, #{client_pid := ConnPid1, payload := P3}}, P3), + ct:sleep(100), + %% no message expected + ?assertEqual([], collect_msgs([])), + %% now kill client 1 + kill_process(ConnPid1), + %% client 2 should receive the message + MsgRec4 = ?WAIT(2000, {publish, #{client_pid := ConnPid2, payload := P4}}, P4), + %% assert hello2 > hello1 or hello4 > hello3 + ?assert(MsgRec4 > MsgRec3), + emqtt:stop(ConnPid2), + meck:unload(emqx_zone), + ok. + %%-------------------------------------------------------------------- %% help functions %%-------------------------------------------------------------------- +kill_process(Pid) -> + _ = unlink(Pid), + _ = monitor(process, Pid), + erlang:exit(Pid, kill), + receive + {'DOWN', _, process, Pid, _} -> + ok + end. + +collect_msgs(Acc) -> + receive + Msg -> + collect_msgs([Msg | Acc]) + after + 0 -> + lists:reverse(Acc) + end. + ensure_config(Strategy) -> ensure_config(Strategy, _AckEnabled = true). From 8d42589bf5045742ae60732f4c1042796a30a32e Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 13:32:48 +0200 Subject: [PATCH 06/34] chore: update appup --- src/emqx.appup.src | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/emqx.appup.src b/src/emqx.appup.src index 3db3d2c45..51576061c 100644 --- a/src/emqx.appup.src +++ b/src/emqx.appup.src @@ -2,7 +2,9 @@ %% Unless you know what you are doing, DO NOT edit manually!! 
{VSN, [{"4.3.21", - [{add_module,emqx_secret}, + [{load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, + {load_module,emqx_session,brutal_purge,soft_purge,[]}, + {add_module,emqx_secret}, {load_module,emqx_alarm,brutal_purge,soft_purge,[]}, {load_module,emqx_app,brutal_purge,soft_purge,[]}, {load_module,emqx_connection,brutal_purge,soft_purge,[]}, @@ -17,7 +19,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx,brutal_purge,soft_purge,[]}]}, {"4.3.20", - [{add_module,emqx_secret}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {add_module,emqx_secret}, {load_module,emqx_plugins,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, @@ -34,7 +37,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx,brutal_purge,soft_purge,[]}]}, {"4.3.19", - [{add_module,emqx_secret}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {add_module,emqx_secret}, {load_module,emqx_listeners,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, @@ -52,7 +56,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx_channel,brutal_purge,soft_purge,[]}]}, {"4.3.18", - [{add_module,emqx_secret}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {add_module,emqx_secret}, {load_module,emqx_listeners,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, @@ -857,9 +862,10 @@ {load_module,emqx_message,brutal_purge,soft_purge,[]}, {load_module,emqx_limiter,brutal_purge,soft_purge,[]}]}, {<<".*">>,[]}], - [ - {"4.3.21", - [{load_module,emqx_alarm,brutal_purge,soft_purge,[]}, + [{"4.3.21", + [{load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, + {load_module,emqx_session,brutal_purge,soft_purge,[]}, + 
{load_module,emqx_alarm,brutal_purge,soft_purge,[]}, {load_module,emqx_app,brutal_purge,soft_purge,[]}, {load_module,emqx_connection,brutal_purge,soft_purge,[]}, {load_module,emqx_plugins,brutal_purge,soft_purge,[]}, @@ -873,7 +879,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx,brutal_purge,soft_purge,[]}]}, {"4.3.20", - [{load_module,emqx_plugins,brutal_purge,soft_purge,[]}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {load_module,emqx_plugins,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, {load_module,emqx_tracer,brutal_purge,soft_purge,[]}, @@ -889,7 +896,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx,brutal_purge,soft_purge,[]}]}, {"4.3.19", - [{load_module,emqx_listeners,brutal_purge,soft_purge,[]}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {load_module,emqx_listeners,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, {load_module,emqx_tracer,brutal_purge,soft_purge,[]}, @@ -906,7 +914,8 @@ {load_module,emqx_http_lib,brutal_purge,soft_purge,[]}, {load_module,emqx_channel,brutal_purge,soft_purge,[]}]}, {"4.3.18", - [{load_module,emqx_listeners,brutal_purge,soft_purge,[]}, + [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + {load_module,emqx_listeners,brutal_purge,soft_purge,[]}, {load_module,emqx_router_helper,brutal_purge,soft_purge,[]}, {load_module,emqx_router,brutal_purge,soft_purge,[]}, {load_module,emqx_tracer,brutal_purge,soft_purge,[]}, From d23dfcca39efa778f056c4a2094cd6e900c62fcf Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 16:03:00 +0200 Subject: [PATCH 07/34] fix(shared): only re-dispatch QoS1 inflights --- CHANGES-4.3.md | 10 ++++++++++ src/emqx_session.erl | 20 ++++++++++++-------- test/emqx_shared_sub_SUITE.erl | 4 ++-- 3 
files changed, 24 insertions(+), 10 deletions(-) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 713f89f37..ef96ce745 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -32,6 +32,11 @@ File format: - Added a test to prevent a last will testament message to be published when a client is denied connection. [#8894](https://github.com/emqx/emqx/pull/8894) +- QoS1 and QoS2 messages in session's buffer are re-dispatched to other members in the group + when the session terminates [#9094](https://github.com/emqx/emqx/pull/9094). + Prior to this enhancement, one would have to set `broker.shared_dispatch_ack_enabled` to true + to prevent sessions from buffering messages, however this acknowledgement comes with a cost. + ### Bug fixes - Fix delayed publish inaccurate caused by os time change. [#8908](https://github.com/emqx/emqx/pull/8908) @@ -48,6 +53,11 @@ File format: Same `format_status` callback is added here too for `gen_server`s which hold password in their state. +- Fix shared subscription message re-dispatches [#9094](https://github.com/emqx/emqx/pull/9094). + - When discarding QoS 2 inflight messages, there were excessive logs + - For wildcard deliveries, the re-dispatch used the wrong topic (the publishing topic, + but not the subscrbing topic), caused messages to be lost when dispatching. 
+ ## v4.3.20 ### Bug fixes diff --git a/src/emqx_session.erl b/src/emqx_session.erl index 40c8eacc0..d2379a3fd 100644 --- a/src/emqx_session.erl +++ b/src/emqx_session.erl @@ -641,14 +641,18 @@ run_terminate_hooks(ClientInfo, Reason, Session) -> redispatch_shared_messages(#session{inflight = Inflight, mqueue = Q}) -> AllInflights = emqx_inflight:to_list(sort_fun(), Inflight), F = fun({_, {Msg, _Ts}}) -> - case Msg of - #message{} -> - {true, Msg}; - _ -> - %% QoS 2, after pubrec is received - %% the inflight record is updated to an atom - false - end + case Msg of + #message{qos = ?QOS_1} -> + %% For QoS 2, here is what the spec says: + %% If the Client's Session terminates before the Client reconnects, + %% the Server MUST NOT send the Application Message to any other + %% subscribed Client [MQTT-4.8.2-5]. + {true, Msg}; + _ -> + %% QoS 2, after pubrec is received + %% the inflight record is updated to an atom + false + end end, InflightList = lists:filtermap(F, AllInflights), MqList = mqueue_to_list(Q, []), diff --git a/test/emqx_shared_sub_SUITE.erl b/test/emqx_shared_sub_SUITE.erl index ba216abba..a16f948ff 100644 --- a/test/emqx_shared_sub_SUITE.erl +++ b/test/emqx_shared_sub_SUITE.erl @@ -550,7 +550,7 @@ t_dispatch_when_inflights_are_full(Config) when is_list(Config) -> %% No ack, QoS 2 subscriptions, %% client1 receives one message, send pubrec, then suspend -%% client2 acts normal (aot_ack=true) +%% client2 acts normal (auto_ack=true) %% Expected behaviour: %% the messages sent to client1's inflight and mq are re-dispatched after client1 is down t_dispatch_qos2({init, Config}) when is_list(Config) -> @@ -593,7 +593,7 @@ t_dispatch_qos2(Config) when is_list(Config) -> ?assert(MsgRec2 > MsgRec1), sys:resume(ConnPid1), - %% emqtt automatically send PUBREC, but since auto_ack is set to false + %% emqtt subscriber automatically sends PUBREC, but since auto_ack is set to false %% so it will never send PUBCOMP, hence EMQX should not attempt to send %% the 4th 
message yet since max_inflight is 1. MsgRec3 = ?WAIT(2000, {publish, #{client_pid := ConnPid1, payload := P3}}, P3), From a1032db4e18a4ebb5ed25657dbf4b10bfbad4869 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 5 Oct 2022 17:17:51 +0200 Subject: [PATCH 08/34] test: add test case to verify QoS 0 message is never redispatched --- CHANGES-4.3.md | 2 +- test/emqx_shared_sub_SUITE.erl | 56 +++++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index ef96ce745..26dd529a2 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -56,7 +56,7 @@ File format: - Fix shared subscription message re-dispatches [#9094](https://github.com/emqx/emqx/pull/9094). - When discarding QoS 2 inflight messages, there were excessive logs - For wildcard deliveries, the re-dispatch used the wrong topic (the publishing topic, - but not the subscrbing topic), caused messages to be lost when dispatching. + but not the subscribing topic), caused messages to be lost when dispatching. ## v4.3.20 diff --git a/test/emqx_shared_sub_SUITE.erl b/test/emqx_shared_sub_SUITE.erl index a16f948ff..2c4ecf265 100644 --- a/test/emqx_shared_sub_SUITE.erl +++ b/test/emqx_shared_sub_SUITE.erl @@ -599,7 +599,7 @@ t_dispatch_qos2(Config) when is_list(Config) -> MsgRec3 = ?WAIT(2000, {publish, #{client_pid := ConnPid1, payload := P3}}, P3), ct:sleep(100), %% no message expected - ?assertEqual([], collect_msgs([])), + ?assertEqual([], collect_msgs(0)), %% now kill client 1 kill_process(ConnPid1), %% client 2 should receive the message @@ -609,6 +609,51 @@ t_dispatch_qos2(Config) when is_list(Config) -> emqtt:stop(ConnPid2), ok. 
+t_dispatch_qos0({init, Config}) when is_list(Config) -> + Config; +t_dispatch_qos0({'end', Config}) when is_list(Config) -> + ok; +t_dispatch_qos0(Config) when is_list(Config) -> + ok = ensure_config(round_robin, _AckEnabled = false), + Topic = <<"foo/bar/1">>, + ClientId1 = <<"ClientId1">>, + ClientId2 = <<"ClientId2">>, + + {ok, ConnPid1} = emqtt:start_link([{clientid, ClientId1}, {auto_ack, false}]), + {ok, ConnPid2} = emqtt:start_link([{clientid, ClientId2}, {auto_ack, true}]), + {ok, _} = emqtt:connect(ConnPid1), + {ok, _} = emqtt:connect(ConnPid2), + + %% subscribe with QoS 0 + emqtt:subscribe(ConnPid1, {<<"$share/group/foo/bar/#">>, 0}), + emqtt:subscribe(ConnPid2, {<<"$share/group/foo/bar/#">>, 0}), + + %% publish with QoS 2, but should be downgraded to 0 as the subscribers + %% subscribe with QoS 0 + Message1 = emqx_message:make(ClientId1, 2, Topic, <<"hello1">>), + Message2 = emqx_message:make(ClientId1, 2, Topic, <<"hello2">>), + Message3 = emqx_message:make(ClientId1, 2, Topic, <<"hello3">>), + Message4 = emqx_message:make(ClientId1, 2, Topic, <<"hello4">>), + ct:sleep(100), + + ok = sys:suspend(ConnPid1), + + ?assertMatch([_], emqx:publish(Message1)), + ?assertMatch([_], emqx:publish(Message2)), + ?assertMatch([_], emqx:publish(Message3)), + ?assertMatch([_], emqx:publish(Message4)), + + MsgRec1 = ?WAIT(2000, {publish, #{client_pid := ConnPid2, payload := P1}}, P1), + MsgRec2 = ?WAIT(2000, {publish, #{client_pid := ConnPid2, payload := P2}}, P2), + %% assert hello2 > hello1 or hello4 > hello3 + ?assert(MsgRec2 > MsgRec1), + + kill_process(ConnPid1), + %% expect no redispatch + ?assertEqual([], collect_msgs(timer:seconds(2))), + emqtt:stop(ConnPid2), + ok. + %%-------------------------------------------------------------------- %% help functions %%-------------------------------------------------------------------- @@ -622,12 +667,15 @@ kill_process(Pid) -> ok end. -collect_msgs(Acc) -> +collect_msgs(Timeout) -> + collect_msgs([], Timeout). 
+ +collect_msgs(Acc, Timeout) -> receive Msg -> - collect_msgs([Msg | Acc]) + collect_msgs([Msg | Acc], Timeout) after - 0 -> + Timeout -> lists:reverse(Acc) end. From 206ab125a4b1157d7f1a17ebe6cb920980bc1f7b Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 6 Oct 2022 09:21:52 +0200 Subject: [PATCH 09/34] build: support additional checks before cutting a release tag --- scripts/rel/cut4x.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/rel/cut4x.sh b/scripts/rel/cut4x.sh index 58131c757..df142b936 100755 --- a/scripts/rel/cut4x.sh +++ b/scripts/rel/cut4x.sh @@ -230,6 +230,16 @@ if [ "$HAS_RELUP_DB" = 'yes' ]; then ./scripts/relup-base-vsns.escript check-vsn-db "$PKG_VSN" "$RELUP_PATHS" fi +## Run some additional checks (e.g. some for enterprise edition only) +CHECKS_DIR="./scripts/rel/checks" +if [ -d "${CHECKS_DIR}" ]; then + CHECKS="$(find "${CHECKS_DIR}" -name "*.sh" -print0 2>/dev/null | xargs -0)" + for c in $CHECKS; do + logmsg "Executing $c" + $c + done +fi + if [ "$DRYRUN" = 'yes' ]; then logmsg "Release tag is ready to be created with command: git tag $TAG" else From 13f3dafe2297df37aab0d48511889275fb13350f Mon Sep 17 00:00:00 2001 From: DDDHuang <44492639+DDDHuang@users.noreply.github.com> Date: Sun, 24 Apr 2022 14:10:51 +0800 Subject: [PATCH 10/34] refactor: enhance the flapping detect accuracy Count the `flapping` event whenever a client tries to connect to the server, whether it succeeds or fails. This helps improve stability. 
--- src/emqx_channel.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/emqx_channel.erl b/src/emqx_channel.erl index b8f3c5b2c..38cc8f989 100644 --- a/src/emqx_channel.erl +++ b/src/emqx_channel.erl @@ -288,7 +288,8 @@ handle_in(?CONNECT_PACKET(ConnPkt), Channel) -> fun enrich_client/2, fun set_log_meta/2, fun check_banned/2, - fun auth_connect/2 + fun auth_connect/2, + fun flapping_detect/2 ], ConnPkt, Channel#channel{conn_state = connecting}) of {ok, NConnPkt, NChannel = #channel{clientinfo = ClientInfo}} -> NChannel1 = NChannel#channel{ @@ -1022,11 +1023,7 @@ handle_info({sock_closed, Reason}, Channel = #channel{conn_state = idle}) -> handle_info({sock_closed, Reason}, Channel = #channel{conn_state = connecting}) -> shutdown(Reason, Channel); -handle_info({sock_closed, Reason}, Channel = - #channel{conn_state = connected, - clientinfo = ClientInfo = #{zone := Zone}}) -> - emqx_zone:enable_flapping_detect(Zone) - andalso emqx_flapping:detect(ClientInfo), +handle_info({sock_closed, Reason}, Channel = #channel{conn_state = connected}) -> Channel1 = ensure_disconnected(Reason, maybe_publish_will_msg(Channel)), case maybe_shutdown(Reason, Channel1) of {ok, Channel2} -> {ok, {event, disconnected}, Channel2}; @@ -1335,6 +1332,13 @@ auth_connect(#mqtt_packet_connect{password = Password}, {error, emqx_reason_codes:connack_error(Reason)} end. +%%-------------------------------------------------------------------- +%% Flapping + +flapping_detect(_ConnPkt, Channel = #channel{clientinfo = ClientInfo = #{zone := Zone}}) -> + _ = emqx_zone:enable_flapping_detect(Zone) andalso emqx_flapping:detect(ClientInfo), + {ok, Channel}. 
+ %%-------------------------------------------------------------------- %% Enhanced Authentication From 165842ded4822ef49cf7d6f6743816cc7539a816 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Mon, 26 Sep 2022 14:08:28 +0800 Subject: [PATCH 11/34] chore: update changes.md --- CHANGES-4.3.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 713f89f37..62df98f2d 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -32,6 +32,8 @@ File format: - Added a test to prevent a last will testament message to be published when a client is denied connection. [#8894](https://github.com/emqx/emqx/pull/8894) +- More rigorous checking of flapping to improve stability of the system. [#9045](https://github.com/emqx/emqx/pull/9045) + ### Bug fixes - Fix delayed publish inaccurate caused by os time change. [#8908](https://github.com/emqx/emqx/pull/8908) From e2b0048e88850389f42e3b6c4abbae60f183dbf5 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Mon, 26 Sep 2022 15:03:30 +0800 Subject: [PATCH 12/34] refactor(flapping): count flapping event if connecting failed --- src/emqx_channel.erl | 6 +++--- test/emqx_channel_SUITE.erl | 38 +++++++++++++++++++++++++++++------- test/emqx_flapping_SUITE.erl | 2 +- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/emqx_channel.erl b/src/emqx_channel.erl index 38cc8f989..1c0c6ee72 100644 --- a/src/emqx_channel.erl +++ b/src/emqx_channel.erl @@ -288,8 +288,8 @@ handle_in(?CONNECT_PACKET(ConnPkt), Channel) -> fun enrich_client/2, fun set_log_meta/2, fun check_banned/2, - fun auth_connect/2, - fun flapping_detect/2 + fun count_flapping_event/2, + fun auth_connect/2 ], ConnPkt, Channel#channel{conn_state = connecting}) of {ok, NConnPkt, NChannel = #channel{clientinfo = ClientInfo}} -> NChannel1 = NChannel#channel{ @@ -1335,7 +1335,7 @@ auth_connect(#mqtt_packet_connect{password = Password}, %%-------------------------------------------------------------------- %% Flapping -flapping_detect(_ConnPkt, 
Channel = #channel{clientinfo = ClientInfo = #{zone := Zone}}) -> +count_flapping_event(_ConnPkt, Channel = #channel{clientinfo = ClientInfo = #{zone := Zone}}) -> _ = emqx_zone:enable_flapping_detect(Zone) andalso emqx_flapping:detect(ClientInfo), {ok, Channel}. diff --git a/test/emqx_channel_SUITE.erl b/test/emqx_channel_SUITE.erl index 00edde5b1..4f250bd4a 100644 --- a/test/emqx_channel_SUITE.erl +++ b/test/emqx_channel_SUITE.erl @@ -33,11 +33,6 @@ all() -> emqx_ct:all(?MODULE). init_per_suite(Config) -> %% CM Meck ok = meck:new(emqx_cm, [passthrough, no_history, no_link]), - %% Access Control Meck - ok = meck:new(emqx_access_control, [passthrough, no_history, no_link]), - ok = meck:expect(emqx_access_control, authenticate, - fun(_) -> {ok, #{auth_result => success}} end), - ok = meck:expect(emqx_access_control, check_acl, fun(_, _, _) -> allow end), %% Broker Meck ok = meck:new(emqx_broker, [passthrough, no_history, no_link]), %% Hooks Meck @@ -53,8 +48,7 @@ init_per_suite(Config) -> Config. end_per_suite(_Config) -> - meck:unload([emqx_access_control, - emqx_metrics, + meck:unload([emqx_metrics, emqx_session, emqx_broker, emqx_hooks, @@ -63,10 +57,16 @@ end_per_suite(_Config) -> init_per_testcase(_TestCase, Config) -> meck:new(emqx_zone, [passthrough, no_history, no_link]), + %% Access Control Meck + ok = meck:new(emqx_access_control, [passthrough, no_history, no_link]), + ok = meck:expect(emqx_access_control, authenticate, + fun(_) -> {ok, #{auth_result => success}} end), + ok = meck:expect(emqx_access_control, check_acl, fun(_, _, _) -> allow end), Config. end_per_testcase(_TestCase, Config) -> meck:unload([emqx_zone]), + meck:unload([emqx_access_control]), Config. %%-------------------------------------------------------------------- @@ -853,6 +853,30 @@ t_ws_cookie_init(_) -> Channel = emqx_channel:init(ConnInfo, [{zone, zone}]), ?assertMatch(#{ws_cookie := WsCookie}, emqx_channel:info(clientinfo, Channel)). 
+%%-------------------------------------------------------------------- +%% Test cases for other mechnisms +%%-------------------------------------------------------------------- + +t_flapping_detect(_) -> + Parent = self(), + ok = meck:expect(emqx_cm, open_session, + fun(true, _ClientInfo, _ConnInfo) -> + {ok, #{session => session(), present => false}} + end), + ok = meck:expect(emqx_access_control, authenticate, fun(_) -> {error, not_authorized} end), + ok = meck:new(emqx_flapping, [passthrough, no_history, no_link]), + ok = meck:expect(emqx_flapping, detect, fun(_) -> Parent ! flapping_detect end), + ok = meck:expect(emqx_zone, enable_flapping_detect, fun(_) -> true end), + IdleChannel = channel(#{conn_state => idle}), + {shutdown, not_authorized, _ConnAck, _Channel} = + emqx_channel:handle_in(?CONNECT_PACKET(connpkt()), IdleChannel), + receive + flapping_detect -> ok + after 2000 -> + ?assert(false, "Flapping detect should be exected in connecting progress") + end, + meck:unload([emqx_flapping]). + %%-------------------------------------------------------------------- %% Helper functions %%-------------------------------------------------------------------- diff --git a/test/emqx_flapping_SUITE.erl b/test/emqx_flapping_SUITE.erl index eadd89192..8074a8607 100644 --- a/test/emqx_flapping_SUITE.erl +++ b/test/emqx_flapping_SUITE.erl @@ -72,4 +72,4 @@ t_expired_detecting(_) -> (_) -> false end, ets:tab2list(emqx_flapping))), timer:sleep(200), ?assertEqual(true, lists:all(fun({flapping, <<"clientid">>, _, _, _}) -> false; - (_) -> true end, ets:tab2list(emqx_flapping))). \ No newline at end of file + (_) -> true end, ets:tab2list(emqx_flapping))). 
From e5a673376feba0a4313cf616b001e717f92a1f58 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Sun, 9 Oct 2022 17:25:48 +0800 Subject: [PATCH 13/34] refactor: support the retry option --- .../emqx_auth_http/include/emqx_auth_http.hrl | 19 +++++++++++++++++++ apps/emqx_auth_http/src/emqx_acl_http.erl | 8 +++++--- apps/emqx_auth_http/src/emqx_auth_http.erl | 14 +++++++++----- .../emqx_auth_http/src/emqx_auth_http_cli.erl | 14 +++++++++----- 4 files changed, 42 insertions(+), 13 deletions(-) diff --git a/apps/emqx_auth_http/include/emqx_auth_http.hrl b/apps/emqx_auth_http/include/emqx_auth_http.hrl index 0eaa59daf..4e659293f 100644 --- a/apps/emqx_auth_http/include/emqx_auth_http.hrl +++ b/apps/emqx_auth_http/include/emqx_auth_http.hrl @@ -1 +1,20 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + -define(APP, emqx_auth_http). + +%% equals to the default value of ehttpc +-define(DEFAULT_RETRY_TIMES, 2). diff --git a/apps/emqx_auth_http/src/emqx_acl_http.erl b/apps/emqx_auth_http/src/emqx_acl_http.erl index d4fd96a95..73cf6ce11 100644 --- a/apps/emqx_auth_http/src/emqx_acl_http.erl +++ b/apps/emqx_auth_http/src/emqx_acl_http.erl @@ -24,7 +24,7 @@ -logger_header("[ACL http]"). 
-import(emqx_auth_http_cli, - [ request/6 + [ request/7 , feedvar/2 ]). @@ -56,13 +56,15 @@ description() -> "ACL with HTTP API". %% Internal functions %%-------------------------------------------------------------------- -check_acl_request(#{pool_name := PoolName, +check_acl_request(Params = + #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout). + Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry). access(subscribe) -> 1; access(publish) -> 2. diff --git a/apps/emqx_auth_http/src/emqx_auth_http.erl b/apps/emqx_auth_http/src/emqx_auth_http.erl index 98a897a8c..0c8c46670 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http.erl +++ b/apps/emqx_auth_http/src/emqx_auth_http.erl @@ -25,7 +25,7 @@ -logger_header("[Auth http]"). -import(emqx_auth_http_cli, - [ request/6 + [ request/7 , feedvar/2 ]). @@ -63,24 +63,28 @@ description() -> "Authentication by HTTP API". %% Requests %%-------------------------------------------------------------------- -authenticate(#{pool_name := PoolName, +authenticate(Params = + #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout). + Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry). -spec(is_superuser(maybe(map()), emqx_types:client()) -> boolean()). 
is_superuser(undefined, _ClientInfo) -> false; -is_superuser(#{pool_name := PoolName, +is_superuser(Params = + #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - case request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout) of + Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + case request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry) of {ok, 200, _Body} -> true; {ok, _Code, _Body} -> false; {error, Error} -> ?LOG(error, "Request superuser path ~s, error: ~p", [Path, Error]), diff --git a/apps/emqx_auth_http/src/emqx_auth_http_cli.erl b/apps/emqx_auth_http/src/emqx_auth_http_cli.erl index 3c7efd9c9..16c2c8574 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http_cli.erl +++ b/apps/emqx_auth_http/src/emqx_auth_http_cli.erl @@ -19,6 +19,7 @@ -include("emqx_auth_http.hrl"). -export([ request/6 + , request/7 , feedvar/2 , feedvar/3 ]). @@ -27,18 +28,21 @@ %% HTTP Request %%-------------------------------------------------------------------- -request(PoolName, get, Path, Headers, Params, Timeout) -> - NewPath = Path ++ "?" ++ binary_to_list(cow_qs:qs(bin_kw(Params))), - reply(ehttpc:request(PoolName, get, {NewPath, Headers}, Timeout)); +request(PoolName, Method, Path, Headers, Params, Timeout) -> + request(PoolName, Method, Path, Headers, Params, ?DEFAULT_RETRY_TIMES). -request(PoolName, post, Path, Headers, Params, Timeout) -> +request(PoolName, get, Path, Headers, Params, Timeout, Retry) -> + NewPath = Path ++ "?" 
++ binary_to_list(cow_qs:qs(bin_kw(Params))), + reply(ehttpc:request(PoolName, get, {NewPath, Headers}, Timeout, Retry)); + +request(PoolName, post, Path, Headers, Params, Timeout, Retry) -> Body = case proplists:get_value(<<"content-type">>, Headers) of "application/x-www-form-urlencoded" -> cow_qs:qs(bin_kw(Params)); "application/json" -> emqx_json:encode(bin_kw(Params)) end, - reply(ehttpc:request(PoolName, post, {Path, Headers, Body}, Timeout)). + reply(ehttpc:request(PoolName, post, {Path, Headers, Body}, Timeout, Retry)). reply({ok, StatusCode, _Headers}) -> {ok, StatusCode, <<>>}; From 8e8ff08973c06951aea98a05c3523ff1db5d4792 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sun, 9 Oct 2022 11:31:54 +0200 Subject: [PATCH 14/34] fix(shared): check sticky if sticky pid is still a member Prior to this fix, if a subscriber unsubscribes without disconnecting, the sticky dispatch strategy will continue to pick the old member. This commit fixes it by checking if the member is still in the group. --- src/emqx_shared_sub.erl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index cc57e001f..2d85a834a 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -301,7 +301,8 @@ fetch_sender_ref({Sender, Ref}) -> {Sender, Ref}. pick(sticky, ClientId, SourceTopic, Group, Topic, FailedSubs) -> Sub0 = erlang:get({shared_sub_sticky, Group, Topic}), - case is_active_sub(Sub0, FailedSubs) of + All = subscribers(Group, Topic), + case is_active_sub(Sub0, FailedSubs, All) of true -> %% the old subscriber is still alive %% keep using it for sticky strategy @@ -471,8 +472,10 @@ update_stats(State) -> State. %% Return 'true' if the subscriber process is alive AND not in the failed list -is_active_sub(Pid, FailedSubs) -> - not maps:is_key(Pid, FailedSubs) andalso is_alive_sub(Pid). 
+is_active_sub(Pid, FailedSubs, All) -> + lists:member(Pid, All) andalso + (not maps:is_key(Pid, FailedSubs)) andalso + is_alive_sub(Pid). %% erlang:is_process_alive/1 does not work with remote pid. is_alive_sub(Pid) when ?IS_LOCAL_PID(Pid) -> From 68dd29420ddc024b96852f5eb35c678c4a12c827 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Sun, 9 Oct 2022 17:43:30 +0800 Subject: [PATCH 15/34] chore: fix duplicated variable name --- apps/emqx_auth_http/src/emqx_acl_http.erl | 4 ++-- apps/emqx_auth_http/src/emqx_auth_http.app.src | 2 +- apps/emqx_auth_http/src/emqx_auth_http.erl | 8 ++++---- apps/emqx_auth_http/src/emqx_auth_http_cli.erl | 2 +- apps/emqx_auth_mnesia/test/emqx_auth_mnesia_SUITE.erl | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/emqx_auth_http/src/emqx_acl_http.erl b/apps/emqx_auth_http/src/emqx_acl_http.erl index 73cf6ce11..51bf9c303 100644 --- a/apps/emqx_auth_http/src/emqx_acl_http.erl +++ b/apps/emqx_auth_http/src/emqx_acl_http.erl @@ -56,14 +56,14 @@ description() -> "ACL with HTTP API". %% Internal functions %%-------------------------------------------------------------------- -check_acl_request(Params = +check_acl_request(ACLParams = #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + Retry = maps:get(retry_times, ACLParams, ?DEFAULT_RETRY_TIMES), request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry). access(subscribe) -> 1; diff --git a/apps/emqx_auth_http/src/emqx_auth_http.app.src b/apps/emqx_auth_http/src/emqx_auth_http.app.src index e943317f8..87d087bae 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http.app.src +++ b/apps/emqx_auth_http/src/emqx_auth_http.app.src @@ -1,6 +1,6 @@ {application, emqx_auth_http, [{description, "EMQ X Authentication/ACL with HTTP API"}, - {vsn, "4.3.7"}, % strict semver, bump manually! 
+ {vsn, "4.3.8"}, % strict semver, bump manually! {modules, []}, {registered, [emqx_auth_http_sup]}, {applications, [kernel,stdlib,ehttpc]}, diff --git a/apps/emqx_auth_http/src/emqx_auth_http.erl b/apps/emqx_auth_http/src/emqx_auth_http.erl index 0c8c46670..620750bd0 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http.erl +++ b/apps/emqx_auth_http/src/emqx_auth_http.erl @@ -63,27 +63,27 @@ description() -> "Authentication by HTTP API". %% Requests %%-------------------------------------------------------------------- -authenticate(Params = +authenticate(AuthParams = #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + Retry = maps:get(retry_times, AuthParams, ?DEFAULT_RETRY_TIMES), request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry). -spec(is_superuser(maybe(map()), emqx_types:client()) -> boolean()). is_superuser(undefined, _ClientInfo) -> false; -is_superuser(Params = +is_superuser(SuperParams = #{pool_name := PoolName, path := Path, method := Method, headers := Headers, params := Params, timeout := Timeout}, ClientInfo) -> - Retry = maps:get(retry_times, Params, ?DEFAULT_RETRY_TIMES), + Retry = maps:get(retry_times, SuperParams, ?DEFAULT_RETRY_TIMES), case request(PoolName, Method, Path, Headers, feedvar(Params, ClientInfo), Timeout, Retry) of {ok, 200, _Body} -> true; {ok, _Code, _Body} -> false; diff --git a/apps/emqx_auth_http/src/emqx_auth_http_cli.erl b/apps/emqx_auth_http/src/emqx_auth_http_cli.erl index 16c2c8574..c747b778a 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http_cli.erl +++ b/apps/emqx_auth_http/src/emqx_auth_http_cli.erl @@ -29,7 +29,7 @@ %%-------------------------------------------------------------------- request(PoolName, Method, Path, Headers, Params, Timeout) -> - request(PoolName, Method, Path, Headers, Params, ?DEFAULT_RETRY_TIMES). 
+ request(PoolName, Method, Path, Headers, Params, Timeout, ?DEFAULT_RETRY_TIMES). request(PoolName, get, Path, Headers, Params, Timeout, Retry) -> NewPath = Path ++ "?" ++ binary_to_list(cow_qs:qs(bin_kw(Params))), diff --git a/apps/emqx_auth_mnesia/test/emqx_auth_mnesia_SUITE.erl b/apps/emqx_auth_mnesia/test/emqx_auth_mnesia_SUITE.erl index f7071bc17..8529fb143 100644 --- a/apps/emqx_auth_mnesia/test/emqx_auth_mnesia_SUITE.erl +++ b/apps/emqx_auth_mnesia/test/emqx_auth_mnesia_SUITE.erl @@ -408,7 +408,7 @@ t_password_hash(_) -> ok = application:start(emqx_auth_mnesia). t_will_message_connection_denied(Config) when is_list(Config) -> - ClientId = Username = <<"subscriber">>, + ClientId = <<"subscriber">>, Password = <<"p">>, application:stop(emqx_auth_mnesia), ok = emqx_ct_helpers:start_apps([emqx_auth_mnesia]), From 761283f61639de11411d650e115b1d8a30383f54 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sun, 9 Oct 2022 11:49:11 +0200 Subject: [PATCH 16/34] docs: update change log --- CHANGES-4.3.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 26dd529a2..c15ac220b 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -58,6 +58,12 @@ File format: - For wildcard deliveries, the re-dispatch used the wrong topic (the publishing topic, but not the subscribing topic), caused messages to be lost when dispatching. +- Fix shared subscription group member unsubscribe issue when 'sticky' strategy is used. + Prior to this fix, if a previously picked member unsubscribes from the group (without reconnect) + the message is still dispatched to it. + This issue only occurs when unsubscribe with the session kept. 
+ Fixed in [#9119](https://github.com/emqx/emqx/pull/9119) + ## v4.3.20 ### Bug fixes From bc68f60bb5f3372deaa6fcd5ff6b86dd61660b76 Mon Sep 17 00:00:00 2001 From: JianBo He Date: Sun, 9 Oct 2022 17:52:59 +0800 Subject: [PATCH 17/34] chore: update appup.src --- .../src/emqx_auth_http.appup.src | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/apps/emqx_auth_http/src/emqx_auth_http.appup.src b/apps/emqx_auth_http/src/emqx_auth_http.appup.src index f5c2bfe42..01d756b9e 100644 --- a/apps/emqx_auth_http/src/emqx_auth_http.appup.src +++ b/apps/emqx_auth_http/src/emqx_auth_http.appup.src @@ -1,17 +1,27 @@ %% -*- mode: erlang -*- +%% Unless you know what you are doing, DO NOT edit manually!! {VSN, - [{"4.3.6", - [ %% There are only changes to the schema file, so we don't need any - %% commands here - ]}, + [{"4.3.7", + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, + {"4.3.6", + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, {"4.3.5", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}]}, {"4.3.4", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}]}, {"4.3.3", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + 
[{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, {"4.3.2", @@ -20,21 +30,29 @@ {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}]}, - {<<"4.3.[0-1]">>, - [{restart_application,emqx_auth_http}]}, + {<<"4.3.[0-1]">>,[{restart_application,emqx_auth_http}]}, {<<".*">>,[]}], - [{"4.3.6", - [ %% There are only changes to the schema file, so we don't need any - %% commands here - ]}, + [{"4.3.7", + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, + {"4.3.6", + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, {"4.3.5", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}]}, {"4.3.4", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}]}, {"4.3.3", - [{load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}, + {load_module,emqx_auth_http_app,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, 
{load_module,emqx_acl_http,brutal_purge,soft_purge,[]}]}, {"4.3.2", @@ -43,6 +61,5 @@ {load_module,emqx_auth_http,brutal_purge,soft_purge,[]}, {load_module,emqx_acl_http,brutal_purge,soft_purge,[]}, {load_module,emqx_auth_http_cli,brutal_purge,soft_purge,[]}]}, - {<<"4.3.[0-1]">>, - [{restart_application,emqx_auth_http}]}, + {<<"4.3.[0-1]">>,[{restart_application,emqx_auth_http}]}, {<<".*">>,[]}]}. From 4f8a7349bfc7ae114e8fc9daa09d10fe95beb320 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sun, 9 Oct 2022 20:13:52 +0200 Subject: [PATCH 18/34] fix(shared): ensure sticky strategy sticks to the first pick Prior to this fix, the alive pids are never inserted due to a missing insert when handling remote pids from mnesia events. --- CHANGES-4.3.md | 7 +++++++ src/emqx_shared_sub.erl | 1 + 2 files changed, 8 insertions(+) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index c15ac220b..22a909780 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -64,6 +64,13 @@ File format: This issue only occurs when unsubscribe with the session kept. Fixed in [#9119](https://github.com/emqx/emqx/pull/9119) +- Fix shared subscription 'sticky' strategy when there is no local subscriptions at all. + Prior to this change, it may take a few rounds to randomly pick group members until a local subscriber + is hit (and then start sticking to it). + After this fix, it will start sticking to whichever randomly picked member even when it is a + subscriber from another node in the cluster. 
+ Fixed in [#9122](https://github.com/emqx/emqx/pull/9122) + ## v4.3.20 ### Bug fixes diff --git a/src/emqx_shared_sub.erl b/src/emqx_shared_sub.erl index 65645c86a..9c051c62a 100644 --- a/src/emqx_shared_sub.erl +++ b/src/emqx_shared_sub.erl @@ -456,6 +456,7 @@ handle_cast(Msg, State) -> handle_info({mnesia_table_event, {write, NewRecord, _}}, State = #state{pmon = PMon}) -> #emqx_shared_subscription{subpid = SubPid} = NewRecord, + ok = maybe_insert_alive_tab(SubPid), {noreply, update_stats(State#state{pmon = emqx_pmon:monitor(SubPid, PMon)})}; %% The subscriber may have subscribed multiple topics, so we need to keep monitoring the PID until From 338b11ab95f879116a90dcd70415c06dcec00ed0 Mon Sep 17 00:00:00 2001 From: Shawn <506895667@qq.com> Date: Mon, 10 Oct 2022 17:11:57 +0800 Subject: [PATCH 19/34] fix: cannot reset metrics for fallback actions --- .../src/emqx_rule_metrics.erl | 13 ++- .../test/emqx_rule_engine_SUITE.erl | 79 +++++++++++++++---- 2 files changed, 73 insertions(+), 19 deletions(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_metrics.erl b/apps/emqx_rule_engine/src/emqx_rule_metrics.erl index 624032056..2e30f40dc 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_metrics.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_metrics.erl @@ -131,16 +131,15 @@ clear_metrics(Id) -> -spec(reset_metrics(rule_id()) -> ok). reset_metrics(Id) -> reset_speeds(Id), - reset_metrics(Id, rule_metrics()), + do_reset_metrics(Id, rule_metrics()), case emqx_rule_registry:get_rule(Id) of not_found -> ok; {ok, #rule{actions = Actions}} -> - [ reset_metrics(ActionId, action_metrics()) - || #action_instance{ id = ActionId} <- Actions], + reset_action_metrics(Actions), ok end. -reset_metrics(Id, Metrics) -> +do_reset_metrics(Id, Metrics) -> case couters_ref(Id) of not_found -> ok; Ref -> [counters:put(Ref, metrics_idx(Idx), 0) @@ -148,6 +147,12 @@ reset_metrics(Id, Metrics) -> ok end. 
+reset_action_metrics(Actions) -> + lists:foreach(fun(#action_instance{id = ActionId, fallbacks = FallbackActions}) -> + do_reset_metrics(ActionId, action_metrics()), + reset_action_metrics(FallbackActions) + end, Actions). + reset_speeds(Id) -> gen_server:call(?MODULE, {reset_speeds, Id}). diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl index c8f66ea00..ec9717a75 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_SUITE.erl @@ -50,6 +50,7 @@ groups() -> t_unregister_provider, t_create_rule, t_reset_metrics, + t_reset_metrics_fallbacks, t_create_resource ]}, {actions, [], @@ -379,18 +380,14 @@ t_inspect_action(_Config) -> t_reset_metrics(_Config) -> ok = emqx_rule_engine:load_providers(), - {ok, #resource{id = ResId}} = emqx_rule_engine:create_resource( - #{type => built_in, - config => #{}, - description => <<"debug resource">>}), - {ok, #rule{id = Id}} = emqx_rule_engine:create_rule( - #{rawsql => "select clientid as c, username as u " - "from \"t1\" ", - actions => [#{name => 'inspect', - args => #{'$resource' => ResId, a=>1, b=>2}}], - type => built_in, - description => <<"Inspect rule">> - }), + {ok, #rule{id = Id, actions = [#action_instance{id = ActId0}]}} = + emqx_rule_engine:create_rule( + #{rawsql => "select clientid as c, username as u " + "from \"t1\" ", + actions => [#{name => 'inspect', args => #{a=>1, b=>2}}], + type => built_in, + description => <<"Inspect rule">> + }), {ok, Client} = emqtt:start_link([{username, <<"emqx">>}]), {ok, _} = emqtt:connect(Client), [ begin @@ -398,16 +395,68 @@ t_reset_metrics(_Config) -> timer:sleep(100) end || _ <- lists:seq(1,10)], + ?assertMatch(#{exception := 0, failed := 0, + matched := 10, no_result := 0, passed := 10}, + emqx_rule_metrics:get_rule_metrics(Id)), + ?assertMatch(#{failed := 0, success := 10, taken := 10}, + emqx_rule_metrics:get_action_metrics(ActId0)), 
emqx_rule_metrics:reset_metrics(Id), ?assertEqual(#{exception => 0,failed => 0, matched => 0,no_result => 0,passed => 0, speed => 0.0,speed_last5m => 0.0,speed_max => 0.0}, emqx_rule_metrics:get_rule_metrics(Id)), - ?assertEqual(#{failed => 0,success => 0,taken => 0}, - emqx_rule_metrics:get_action_metrics(ResId)), + ?assertEqual(#{failed => 0, success => 0, taken => 0}, + emqx_rule_metrics:get_action_metrics(ActId0)), emqtt:stop(Client), emqx_rule_registry:remove_rule(Id), - emqx_rule_registry:remove_resource(ResId), + ok. + +t_reset_metrics_fallbacks(_Config) -> + ok = emqx_rule_engine:load_providers(), + ok = emqx_rule_registry:add_action( + #action{name = 'crash_action', app = ?APP, + module = ?MODULE, on_create = crash_action, + types=[], params_spec = #{}, + title = #{en => <<"Crash Action">>}, + description = #{en => <<"This action will always fail!">>}}), + {ok, #rule{id = Id, actions = [#action_instance{id = ActId0, fallbacks = [ + #action_instance{id = ActId1}, + #action_instance{id = ActId2} + ]}]}} = + emqx_rule_engine:create_rule( + #{rawsql => "select clientid as c, username as u " + "from \"t1\" ", + actions => [#{name => 'crash_action', args => #{a=>1, b=>2}, fallbacks => [ + #{name => 'inspect', args => #{}, fallbacks => []}, + #{name => 'inspect', args => #{}, fallbacks => []} + ]}], + type => built_in, + description => <<"Inspect rule">> + }), + {ok, Client} = emqtt:start_link([{username, <<"emqx">>}]), + {ok, _} = emqtt:connect(Client), + [ begin + emqtt:publish(Client, <<"t1">>, <<"{\"id\": 1, \"name\": \"ha\"}">>, 0), + timer:sleep(100) + end + || _ <- lists:seq(1,10)], + ?assertMatch(#{exception := 0, failed := 0, + matched := 10, no_result := 0, passed := 10}, + emqx_rule_metrics:get_rule_metrics(Id)), + [?assertMatch(#{failed := 10, success := 0, taken := 10}, + emqx_rule_metrics:get_action_metrics(AId)) || AId <- [ActId0]], + [?assertMatch(#{failed := 0, success := 10, taken := 10}, + emqx_rule_metrics:get_action_metrics(AId)) || AId <- [ 
ActId1, ActId2]], + emqx_rule_metrics:reset_metrics(Id), + ?assertEqual(#{exception => 0,failed => 0, + matched => 0,no_result => 0,passed => 0, + speed => 0.0,speed_last5m => 0.0,speed_max => 0.0}, + emqx_rule_metrics:get_rule_metrics(Id)), + [?assertEqual(#{failed => 0, success => 0, taken => 0}, + emqx_rule_metrics:get_action_metrics(AId)) || AId <- [ActId0, ActId1, ActId2]], + emqtt:stop(Client), + emqx_rule_registry:remove_rule(Id), + ok = emqx_rule_registry:remove_action('crash_action'), ok. t_republish_action(_Config) -> From 6d52f908d1689a2fe2a4636cec4e7e19a05ed5bc Mon Sep 17 00:00:00 2001 From: Shawn <506895667@qq.com> Date: Mon, 10 Oct 2022 17:17:21 +0800 Subject: [PATCH 20/34] chore: update emqx_rule_engine.appup.src --- CHANGES-4.3.md | 2 ++ .../src/emqx_rule_engine.appup.src | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 22a909780..7762f5eb1 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -71,6 +71,8 @@ File format: subscriber from another node in the cluster. Fixed in [#9122](https://github.com/emqx/emqx/pull/9122) +- Fix cannot reset metrics for fallback actions. 
[#9125](https://github.com/emqx/emqx/pull/9125) + ## v4.3.20 ### Bug fixes diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src b/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src index 6aa35d2d6..130e5d4d4 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src @@ -5,12 +5,14 @@ [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}]}, {"4.3.14", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}]}, {"4.3.13", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -19,6 +21,7 @@ {load_module,emqx_rule_actions,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_utils,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.12", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -27,6 +30,7 @@ {load_module,emqx_rule_actions,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_utils,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.11", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -36,6 +40,7 @@ {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, 
{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_validator,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}]}, {"4.3.10", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -45,6 +50,7 @@ {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}]}, {"4.3.9", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -56,6 +62,7 @@ {load_module,emqx_rule_funcs,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_events,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}, {add_module,emqx_rule_date}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, @@ -202,12 +209,14 @@ [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}]}, {"4.3.14", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}]}, {"4.3.13", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -216,6 +225,7 @@ {load_module,emqx_rule_actions,brutal_purge,soft_purge,[]}, 
{load_module,emqx_rule_utils,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.12", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -224,6 +234,7 @@ {load_module,emqx_rule_actions,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_utils,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.11", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -233,6 +244,7 @@ {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_validator,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}]}, {"4.3.10", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -242,6 +254,7 @@ {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_actions,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}]}, {"4.3.9", [{load_module,emqx_rule_sqltester,brutal_purge,soft_purge,[]}, @@ -254,6 +267,7 @@ {load_module,emqx_rule_events,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine,brutal_purge,soft_purge,[]}, + {load_module,emqx_rule_metrics,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {delete_module,emqx_rule_date}]}, {"4.3.8", From edf69cee885b1d1976bc0183a750fd960c0819b5 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 4 
Oct 2022 16:01:36 +0200 Subject: [PATCH 21/34] feat: mute emqx shutdown log in rpc calls --- src/emqx.erl | 1 + src/emqx_misc.erl | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/emqx.erl b/src/emqx.erl index ae78e5795..4f5f1e3f2 100644 --- a/src/emqx.erl +++ b/src/emqx.erl @@ -227,6 +227,7 @@ shutdown() -> shutdown(normal). shutdown(Reason) -> + ok = emqx_misc:maybe_mute_rpc_log(), ?LOG(critical, "emqx shutdown for ~s", [Reason]), on_shutdown(Reason), _ = emqx_plugins:unload(), diff --git a/src/emqx_misc.erl b/src/emqx_misc.erl index d256569fb..021596957 100644 --- a/src/emqx_misc.erl +++ b/src/emqx_misc.erl @@ -23,6 +23,7 @@ -export([ merge_opts/2 , maybe_apply/2 + , maybe_mute_rpc_log/0 , compose/1 , compose/2 , run_fold/3 @@ -444,6 +445,27 @@ do_parallel_map(Fun, List) -> PidList ). +%% @doc Call this function to avoid logs printed to RPC caller node. +-spec maybe_mute_rpc_log() -> ok. +maybe_mute_rpc_log() -> + GlNode = node(group_leader()), + maybe_mute_rpc_log(GlNode). + +maybe_mute_rpc_log(Node) when Node =:= node() -> + %% do nothing, this is a local call + ok; +maybe_mute_rpc_log(Node) -> + case atom_to_list(Node) of + "remsh_" ++ _ -> + %% this is either an upgrade script or nodetool + %% do nothing, the log may go to the 'emqx' command line console + ok; + _ -> + %% otherwise set group leader to local node + _ = group_leader(whereis(init), self()), + ok + end. + -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). From e72e1567a1c8581f108da685587b1c6b145bcd68 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 10 Oct 2022 09:19:50 -0300 Subject: [PATCH 22/34] ci(test): stop containers at the beginning of the run An attempt to prevent updated container definitions clashing in CI GH runners between different branches. A self-hosted runner only runs a single job at a time. If a container is already running there, an updated docker compose file might fail to recreate that container, failing the run. 
--- .github/workflows/run_cts_tests.yaml | 21 +++++++++++++++++++++ .github/workflows/run_test_cases.yaml | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/.github/workflows/run_cts_tests.yaml b/.github/workflows/run_cts_tests.yaml index 6b05a014e..269b8bc65 100644 --- a/.github/workflows/run_cts_tests.yaml +++ b/.github/workflows/run_cts_tests.yaml @@ -24,6 +24,11 @@ jobs: steps: - uses: actions/checkout@v1 + # to avoid dirty self-hosted runners + - name: stop containers + run: | + docker rm -f $(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker compose up env: LDAP_TAG: ${{ matrix.ldap_tag }} @@ -79,6 +84,10 @@ jobs: steps: - uses: actions/checkout@v1 + - name: stop containers + run: | + docker rm -f $(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker-compose up run: | docker-compose \ @@ -150,6 +159,10 @@ jobs: steps: - uses: actions/checkout@v1 + - name: stop containers + run: | + docker rm -f $(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker-compose up timeout-minutes: 5 run: | @@ -236,6 +249,10 @@ jobs: - tcp steps: - uses: actions/checkout@v1 + - name: stop containers + run: | + docker rm -f $(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker-compose up run: | docker-compose \ @@ -317,6 +334,10 @@ jobs: steps: - uses: actions/checkout@v1 + - name: stop containers + run: | + docker rm -f $(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker-compose up run: | docker-compose \ diff --git a/.github/workflows/run_test_cases.yaml b/.github/workflows/run_test_cases.yaml index def12cb16..8b16e47db 100644 --- a/.github/workflows/run_test_cases.yaml +++ b/.github/workflows/run_test_cases.yaml @@ -42,6 +42,11 @@ jobs: use-self-hosted: false steps: - uses: actions/checkout@v2 + # to avoid dirty self-hosted runners + - name: stop containers + run: | + docker rm -f 
$(docker ps -qa) || true + docker network rm $(docker network ls -q) || true - name: docker compose up if: endsWith(github.repository, 'emqx') env: From cfd1d7eea1f69f07c05d7d95ef3657432c707fec Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Tue, 11 Oct 2022 17:41:47 +0200 Subject: [PATCH 23/34] chore(bin/emqx): no need to disable SC2086 --- bin/emqx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/emqx b/bin/emqx index 82eb90e16..89fc564b1 100755 --- a/bin/emqx +++ b/bin/emqx @@ -483,7 +483,7 @@ esac if [ "$IS_BOOT_COMMAND" = 'no' ]; then # for non-boot commands, inspect vm.