From f08342c704ffd7e98c37009c8624d5c6b54bde24 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Mon, 1 Apr 2024 16:32:56 +0300 Subject: [PATCH 01/46] feat: cluster link prototype WIP --- apps/emqx/src/emqx_broker.erl | 85 ++- apps/emqx/src/emqx_external_broker.erl | 117 ++++ apps/emqx/src/emqx_router.erl | 52 +- apps/emqx/src/emqx_topic.erl | 44 +- apps/emqx/src/emqx_topic_index.erl | 7 + apps/emqx/src/emqx_trie_search.erl | 30 +- apps/emqx/src/emqx_types.erl | 1 + apps/emqx_cluster_link/BSL.txt | 94 +++ .../include/emqx_cluster_link.hrl | 10 + .../src/emqx_cluster_link.app.src | 24 + .../src/emqx_cluster_link.erl | 155 +++++ .../src/emqx_cluster_link_app.erl | 61 ++ .../src/emqx_cluster_link_config.erl | 162 ++++++ .../src/emqx_cluster_link_coord_sup.erl | 57 ++ .../src/emqx_cluster_link_coordinator.erl | 454 +++++++++++++++ .../src/emqx_cluster_link_mqtt.erl | 547 ++++++++++++++++++ .../src/emqx_cluster_link_schema.erl | 56 ++ .../src/emqx_cluster_link_sup.erl | 36 ++ apps/emqx_conf/include/emqx_conf.hrl | 4 +- apps/emqx_conf/src/emqx_conf_schema.erl | 2 +- apps/emqx_machine/priv/reboot_lists.eterm | 3 +- 21 files changed, 1947 insertions(+), 54 deletions(-) create mode 100644 apps/emqx/src/emqx_external_broker.erl create mode 100644 apps/emqx_cluster_link/BSL.txt create mode 100644 apps/emqx_cluster_link/include/emqx_cluster_link.hrl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link.app.src create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_app.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_config.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl create mode 100644 
apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl diff --git a/apps/emqx/src/emqx_broker.erl b/apps/emqx/src/emqx_broker.erl index ed29ea614..d42258611 100644 --- a/apps/emqx/src/emqx_broker.erl +++ b/apps/emqx/src/emqx_broker.erl @@ -244,11 +244,22 @@ publish(Msg) when is_record(Msg, message) -> topic => Topic }), []; - Msg1 = #message{topic = Topic} -> - PersistRes = persist_publish(Msg1), - route(aggre(emqx_router:match_routes(Topic)), delivery(Msg1), PersistRes) + Msg1 = #message{} -> + do_publish(Msg1); + Msgs when is_list(Msgs) -> do_publish_many(Msgs) end. +do_publish_many([]) -> + []; +do_publish_many([Msg | T]) -> + do_publish(Msg) ++ do_publish_many(T). + +do_publish(#message{topic = Topic} = Msg) -> + PersistRes = persist_publish(Msg), + {Routes, ExtRoutes} = aggre(emqx_router:match_routes(Topic)), + Routes1 = maybe_add_ext_routes(ExtRoutes, Routes, Msg), + route(Routes1, delivery(Msg), PersistRes). + persist_publish(Msg) -> case emqx_persistent_message:persist(Msg) of ok -> @@ -311,26 +322,40 @@ do_route({To, Node}, Delivery) when Node =:= node() -> {Node, To, dispatch(To, Delivery)}; do_route({To, Node}, Delivery) when is_atom(Node) -> {Node, To, forward(Node, To, Delivery, emqx:get_config([rpc, mode]))}; +do_route({To, {external, _} = ExtDest}, Delivery) -> + {ExtDest, To, emqx_external_broker:forward(ExtDest, Delivery)}; do_route({To, Group}, Delivery) when is_tuple(Group); is_binary(Group) -> {share, To, emqx_shared_sub:dispatch(Group, To, Delivery)}. aggre([]) -> - []; + {[], []}; aggre([#route{topic = To, dest = Node}]) when is_atom(Node) -> - [{To, Node}]; + {[{To, Node}], []}; +aggre([#route{topic = To, dest = {external, _} = ExtDest}]) -> + {[], [{To, ExtDest}]}; aggre([#route{topic = To, dest = {Group, _Node}}]) -> - [{To, Group}]; + {[{To, Group}], []}; aggre(Routes) -> - aggre(Routes, false, []). + aggre(Routes, false, {[], []}). 
-aggre([#route{topic = To, dest = Node} | Rest], Dedup, Acc) when is_atom(Node) -> - aggre(Rest, Dedup, [{To, Node} | Acc]); -aggre([#route{topic = To, dest = {Group, _Node}} | Rest], _Dedup, Acc) -> - aggre(Rest, true, [{To, Group} | Acc]); +aggre([#route{topic = To, dest = Node} | Rest], Dedup, {Acc, ExtAcc}) when is_atom(Node) -> + aggre(Rest, Dedup, {[{To, Node} | Acc], ExtAcc}); +aggre([#route{topic = To, dest = {external, _} = ExtDest} | Rest], Dedup, {Acc, ExtAcc}) -> + aggre(Rest, Dedup, {Acc, [{To, ExtDest} | ExtAcc]}); +aggre([#route{topic = To, dest = {Group, _Node}} | Rest], _Dedup, {Acc, ExtAcc}) -> + aggre(Rest, true, {[{To, Group} | Acc], ExtAcc}); aggre([], false, Acc) -> Acc; -aggre([], true, Acc) -> - lists:usort(Acc). +aggre([], true, {Acc, ExtAcc}) -> + {lists:usort(Acc), lists:usort(ExtAcc)}. + +maybe_add_ext_routes([] = _ExtRoutes, Routes, _Msg) -> + Routes; +maybe_add_ext_routes(ExtRoutes, Routes, Msg) -> + case emqx_external_broker:should_route_to_external_dests(Msg) of + true -> Routes ++ ExtRoutes; + false -> Routes + end. %% @doc Forward message to another node. -spec forward( @@ -643,19 +668,27 @@ maybe_delete_route(Topic) -> sync_route(Action, Topic, ReplyTo) -> EnabledOn = emqx_config:get([broker, routing, batch_sync, enable_on]), - case EnabledOn of - all -> - push_sync_route(Action, Topic, ReplyTo); - none -> - regular_sync_route(Action, Topic); - Role -> - case Role =:= mria_config:whoami() of - true -> - push_sync_route(Action, Topic, ReplyTo); - false -> - regular_sync_route(Action, Topic) - end - end. + Res = + case EnabledOn of + all -> + push_sync_route(Action, Topic, ReplyTo); + none -> + regular_sync_route(Action, Topic); + Role -> + case Role =:= mria_config:whoami() of + true -> + push_sync_route(Action, Topic, ReplyTo); + false -> + regular_sync_route(Action, Topic) + end + end, + _ = external_sync_route(Action, Topic), + Res. 
+ +external_sync_route(add, Topic) -> + emqx_external_broker:maybe_add_route(Topic); +external_sync_route(delete, Topic) -> + emqx_external_broker:maybe_delete_route(Topic). push_sync_route(Action, Topic, Opts) -> emqx_router_syncer:push(Action, Topic, node(), Opts). diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl new file mode 100644 index 000000000..a9af9ddc9 --- /dev/null +++ b/apps/emqx/src/emqx_external_broker.erl @@ -0,0 +1,117 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_external_broker). + +-callback forward(emqx_router:external_dest(), emqx_types:delivery()) -> + emqx_types:deliver_result(). + +-callback should_route_to_external_dests(emqx_types:message()) -> boolean(). + +-callback maybe_add_route(emqx_types:topic()) -> ok. +-callback maybe_delete_route(emqx_types:topic()) -> ok. + +-export([ + provider/0, + register_provider/1, + unregister_provider/1, + forward/2, + should_route_to_external_dests/1, + maybe_add_route/1, + maybe_delete_route/1 +]). + +-include("logger.hrl"). + +-define(PROVIDER, {?MODULE, external_broker}). 
+ +-define(safe_with_provider(IfRegistered, IfNotRegistered), + case persistent_term:get(?PROVIDER, undefined) of + undefined -> + IfNotRegistered; + Provider -> + try + Provider:IfRegistered + catch + Err:Reason:St -> + ?SLOG(error, #{ + msg => "external_broker_crashed", + provider => Provider, + callback => ?FUNCTION_NAME, + stacktrace => St, + error => Err, + reason => Reason + }), + {error, Reason} + end + end +). + +%% TODO: provider API copied from emqx_external_traces, +%% but it can be moved to a common module. + +%%-------------------------------------------------------------------- +%% Provider API +%%-------------------------------------------------------------------- + +-spec register_provider(module()) -> ok | {error, term()}. +register_provider(Module) when is_atom(Module) -> + case is_valid_provider(Module) of + true -> + persistent_term:put(?PROVIDER, Module); + false -> + {error, invalid_provider} + end. + +-spec unregister_provider(module()) -> ok | {error, term()}. +unregister_provider(Module) -> + case persistent_term:get(?PROVIDER, undefined) of + Module -> + persistent_term:erase(?PROVIDER), + ok; + _ -> + {error, not_registered} + end. + +-spec provider() -> module() | undefined. +provider() -> + persistent_term:get(?PROVIDER, undefined). + +%%-------------------------------------------------------------------- +%% Broker API +%%-------------------------------------------------------------------- + +forward(ExternalDest, Delivery) -> + ?safe_with_provider(?FUNCTION_NAME(ExternalDest, Delivery), {error, unknown_dest}). + +should_route_to_external_dests(Message) -> + ?safe_with_provider(?FUNCTION_NAME(Message), false). + +maybe_add_route(Topic) -> + ?safe_with_provider(?FUNCTION_NAME(Topic), ok). + +maybe_delete_route(Topic) -> + ?safe_with_provider(?FUNCTION_NAME(Topic), ok). 
+ +%%-------------------------------------------------------------------- +%% Internal functions +%%-------------------------------------------------------------------- + +is_valid_provider(Module) -> + lists:all( + fun({F, A}) -> erlang:function_exported(Module, F, A) end, + ?MODULE:behaviour_info(callbacks) + ). diff --git a/apps/emqx/src/emqx_router.erl b/apps/emqx/src/emqx_router.erl index c2616f98a..55b9ab079 100644 --- a/apps/emqx/src/emqx_router.erl +++ b/apps/emqx/src/emqx_router.erl @@ -91,11 +91,12 @@ deinit_schema/0 ]). --export_type([dest/0]). +-export_type([dest/0, external_dest/0]). -export_type([schemavsn/0]). -type group() :: binary(). --type dest() :: node() | {group(), node()}. +-type external_dest() :: {external, term()}. +-type dest() :: node() | {group(), node()} | external_dest(). -type schemavsn() :: v1 | v2. %% Operation :: {add, ...} | {delete, ...}. @@ -107,7 +108,14 @@ unused = [] :: nil() }). --define(node_patterns(Node), [Node, {'_', Node}]). +-define(dest_patterns(NodeOrExtDest), + case is_atom(NodeOrExtDest) of + %% node + true -> [NodeOrExtDest, {'_', NodeOrExtDest}]; + %% external destination + false -> [NodeOrExtDest] + end +). -define(UNSUPPORTED, unsupported). @@ -306,14 +314,14 @@ print_routes(Topic) -> match_routes(Topic) ). --spec cleanup_routes(node()) -> ok. -cleanup_routes(Node) -> - cleanup_routes(get_schema_vsn(), Node). +-spec cleanup_routes(node() | external_dest()) -> ok. +cleanup_routes(NodeOrExtDest) -> + cleanup_routes(get_schema_vsn(), NodeOrExtDest). -cleanup_routes(v2, Node) -> - cleanup_routes_v2(Node); -cleanup_routes(v1, Node) -> - cleanup_routes_v1(Node). +cleanup_routes(v2, NodeOrExtDest) -> + cleanup_routes_v2(NodeOrExtDest); +cleanup_routes(v1, NodeOrExtDest) -> + cleanup_routes_v1(NodeOrExtDest). -spec foldl_routes(fun((emqx_types:route(), Acc) -> Acc), Acc) -> Acc. 
foldl_routes(FoldFun, AccIn) -> @@ -430,19 +438,19 @@ has_route_v1(Topic, Dest) -> has_route_tab_entry(Topic, Dest) -> [] =/= ets:match(?ROUTE_TAB, #route{topic = Topic, dest = Dest}). -cleanup_routes_v1(Node) -> +cleanup_routes_v1(NodeOrExtDest) -> ?with_fallback( lists:foreach( fun(Pattern) -> throw_unsupported(mria:match_delete(?ROUTE_TAB, make_route_rec_pat(Pattern))) end, - ?node_patterns(Node) + ?dest_patterns(NodeOrExtDest) ), - cleanup_routes_v1_fallback(Node) + cleanup_routes_v1_fallback(NodeOrExtDest) ). -cleanup_routes_v1_fallback(Node) -> - Patterns = [make_route_rec_pat(P) || P <- ?node_patterns(Node)], +cleanup_routes_v1_fallback(NodeOrExtDest) -> + Patterns = [make_route_rec_pat(P) || P <- ?dest_patterns(NodeOrExtDest)], mria:transaction(?ROUTE_SHARD, fun() -> [ mnesia:delete_object(?ROUTE_TAB, Route, write) @@ -525,7 +533,7 @@ has_route_v2(Topic, Dest) -> has_route_tab_entry(Topic, Dest) end. -cleanup_routes_v2(Node) -> +cleanup_routes_v2(NodeOrExtDest) -> ?with_fallback( lists:foreach( fun(Pattern) -> @@ -537,18 +545,18 @@ cleanup_routes_v2(Node) -> ), throw_unsupported(mria:match_delete(?ROUTE_TAB, make_route_rec_pat(Pattern))) end, - ?node_patterns(Node) + ?dest_patterns(NodeOrExtDest) ), - cleanup_routes_v2_fallback(Node) + cleanup_routes_v2_fallback(NodeOrExtDest) ). -cleanup_routes_v2_fallback(Node) -> +cleanup_routes_v2_fallback(NodeOrExtDest) -> %% NOTE %% No point in transaction here because all the operations on filters table are dirty. ok = ets:foldl( fun(#routeidx{entry = K}, ok) -> case get_dest_node(emqx_topic_index:get_id(K)) of - Node -> + NodeOrExtDest -> mria:dirty_delete(?ROUTE_TAB_FILTERS, K); _ -> ok @@ -560,7 +568,7 @@ cleanup_routes_v2_fallback(Node) -> ok = ets:foldl( fun(#route{dest = Dest} = Route, ok) -> case get_dest_node(Dest) of - Node -> + NodeOrExtDest -> mria:dirty_delete_object(?ROUTE_TAB, Route); _ -> ok @@ -570,6 +578,8 @@ cleanup_routes_v2_fallback(Node) -> ?ROUTE_TAB ). 
+get_dest_node({external, _} = ExtDest) -> + ExtDest; get_dest_node({_, Node}) -> Node; get_dest_node(Node) -> diff --git a/apps/emqx/src/emqx_topic.erl b/apps/emqx/src/emqx_topic.erl index cdb27b052..9cd631508 100644 --- a/apps/emqx/src/emqx_topic.erl +++ b/apps/emqx/src/emqx_topic.erl @@ -33,7 +33,8 @@ feed_var/3, systop/1, parse/1, - parse/2 + parse/2, + intersection/2 ]). -export([ @@ -52,6 +53,8 @@ ((C =:= '#' orelse C =:= <<"#">>) andalso REST =/= []) ). +-define(IS_WILDCARD(W), W =:= '+' orelse W =:= '#'). + %%-------------------------------------------------------------------- %% APIs %%-------------------------------------------------------------------- @@ -98,6 +101,45 @@ match(_, ['#']) -> match(_, _) -> false. +%% @doc Finds an intersection between two topics, two filters or a topic and a filter. +%% The function is commutative: reversing parameters doesn't affect the returned value. +%% Two topics intersect only when they are equal. +%% The intersection of a topic and a filter is always either the topic itself or false (no intersection). +%% The intersection of two filters is either false or a new topic filter that would match only those topics, +%% that can be matched by both input filters. +%% For example, the intersection of "t/global/#" and "t/+/1/+" is "t/global/1/+". +-spec intersection(TopicOrFilter, TopicOrFilter) -> TopicOrFilter | false when + TopicOrFilter :: emqx_types:topic(). +intersection(Topic1, Topic2) when is_binary(Topic1), is_binary(Topic2) -> + case intersection(words(Topic1), words(Topic2), []) of + [] -> false; + Intersection -> join(lists:reverse(Intersection)) + end. 
+ +intersection(Words1, ['#'], Acc) -> + lists:reverse(Words1, Acc); +intersection(['#'], Words2, Acc) -> + lists:reverse(Words2, Acc); +intersection([W1], ['+'], Acc) -> + [W1 | Acc]; +intersection(['+'], [W2], Acc) -> + [W2 | Acc]; +intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W1), ?IS_WILDCARD(W2) -> + intersection(T1, T2, [wildcard_intersection(W1, W2) | Acc]); +intersection([W | T1], [W | T2], Acc) -> + intersection(T1, T2, [W | Acc]); +intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W1) -> + intersection(T1, T2, [W2 | Acc]); +intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W2) -> + intersection(T1, T2, [W1 | Acc]); +intersection([], [], Acc) -> + Acc; +intersection(_, _, _) -> + []. + +wildcard_intersection(W, W) -> W; +wildcard_intersection(_, _) -> '+'. + -spec match_share(Name, Filter) -> boolean() when Name :: share(), Filter :: topic() | share(). diff --git a/apps/emqx/src/emqx_topic_index.erl b/apps/emqx/src/emqx_topic_index.erl index f416aabc4..8c7011f7a 100644 --- a/apps/emqx/src/emqx_topic_index.erl +++ b/apps/emqx/src/emqx_topic_index.erl @@ -23,6 +23,7 @@ -export([delete/3]). -export([match/2]). -export([matches/3]). +-export([matches_filter/3]). -export([make_key/2]). @@ -72,6 +73,12 @@ match(Topic, Tab) -> matches(Topic, Tab, Opts) -> emqx_trie_search:matches(Topic, make_nextf(Tab), Opts). +%% @doc Match given topic filter against the index and return _all_ matches. +%% If `unique` option is given, return only unique matches by record ID. +-spec matches_filter(emqx_types:topic(), ets:table(), emqx_trie_search:opts()) -> [match(_ID)]. +matches_filter(TopicFilter, Tab, Opts) -> + emqx_trie_search:matches_filter(TopicFilter, make_nextf(Tab), Opts). + %% @doc Extract record ID from the match. -spec get_id(match(ID)) -> ID. 
get_id(Key) -> diff --git a/apps/emqx/src/emqx_trie_search.erl b/apps/emqx/src/emqx_trie_search.erl index 94a04f5ae..080fad74b 100644 --- a/apps/emqx/src/emqx_trie_search.erl +++ b/apps/emqx/src/emqx_trie_search.erl @@ -99,7 +99,7 @@ -module(emqx_trie_search). -export([make_key/2, make_pat/2, filter/1]). --export([match/2, matches/3, get_id/1, get_topic/1]). +-export([match/2, matches/3, get_id/1, get_topic/1, matches_filter/3]). -export_type([key/1, word/0, words/0, nextf/0, opts/0]). -define(END, '$end_of_table'). @@ -183,9 +183,20 @@ match(Topic, NextF) -> matches(Topic, NextF, Opts) -> search(Topic, NextF, Opts). +%% @doc Match given topic filter against the index and return _all_ matches. +-spec matches_filter(emqx_types:topic(), nextf(), opts()) -> [key(_)]. +matches_filter(TopicFilter, NextF, Opts) -> + search(TopicFilter, NextF, [topic_filter | Opts]). + %% @doc Entrypoint of the search for a given topic. search(Topic, NextF, Opts) -> - Words = topic_words(Topic), + %% A private opt + IsFilter = proplists:get_bool(topic_filter, Opts), + Words = + case IsFilter of + true -> filter_words(Topic); + false -> topic_words(Topic) + end, Base = base_init(Words), ORetFirst = proplists:get_bool(return_first, Opts), OUnique = proplists:get_bool(unique, Opts), @@ -200,8 +211,10 @@ search(Topic, NextF, Opts) -> end, Matches = case search_new(Words, Base, NextF, Acc0) of - {Cursor, Acc} -> + {Cursor, Acc} when not IsFilter -> match_topics(Topic, Cursor, NextF, Acc); + {_Cursor, Acc} -> + Acc; Acc -> Acc end, @@ -275,6 +288,17 @@ compare(['#'], _Words, _) -> % Closest possible next entries that we must not miss: % * a/+/+/d/# (same topic but a different ID) match_full; +%% Filter search %% +compare(_Filter, ['#'], _) -> + match_full; +compare([_ | TF], ['+' | TW], Pos) -> + case compare(TF, TW, Pos + 1) of + lower -> + lower; + Other -> + Other + end; +%% Filter search end %% compare(['+' | TF], [HW | TW], Pos) -> case compare(TF, TW, Pos + 1) of lower -> diff --git 
a/apps/emqx/src/emqx_types.erl b/apps/emqx/src/emqx_types.erl index 322cc1c05..03a3c8a0f 100644 --- a/apps/emqx/src/emqx_types.erl +++ b/apps/emqx/src/emqx_types.erl @@ -267,6 +267,7 @@ [ {node(), topic(), deliver_result()} | {share, topic(), deliver_result()} + | {emqx_router:external_dest(), topic(), deliver_result()} | persisted ] | disconnect. diff --git a/apps/emqx_cluster_link/BSL.txt b/apps/emqx_cluster_link/BSL.txt new file mode 100644 index 000000000..c22445af8 --- /dev/null +++ b/apps/emqx_cluster_link/BSL.txt @@ -0,0 +1,94 @@ +Business Source License 1.1 + +Licensor: Hangzhou EMQ Technologies Co., Ltd. +Licensed Work: EMQX Enterprise Edition + The Licensed Work is (c) 2024 + Hangzhou EMQ Technologies Co., Ltd. +Additional Use Grant: Students and educators are granted right to copy, + modify, and create derivative work for research + or education. +Change Date: 2028-04-17 +Change License: Apache License, Version 2.0 + +For information about alternative licensing arrangements for the Software, +please contact Licensor: https://www.emqx.com/en/contact + +Notice + +The Business Source License (this document, or the “License”) is not an Open +Source license. However, the Licensed Work will eventually be made available +under an Open Source License, as stated in this License. + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +“Business Source License” is a trademark of MariaDB Corporation Ab. + +----------------------------------------------------------------------------- + +Business Source License 1.1 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. 
+ +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. + +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). + +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. 
+ +MariaDB hereby grants you permission to use this License’s text to license +your works, and to refer to it using the trademark “Business Source License”, +as long as you comply with the Covenants of Licensor below. + +Covenants of Licensor + +In consideration of the right to use this License’s text and the “Business +Source License” name and trademark, Licensor covenants to MariaDB, and to all +other recipients of the licensed work to be provided by Licensor: + +1. To specify as the Change License the GPL Version 2.0 or any later version, + or a license that is compatible with GPL Version 2.0 or a later version, + where “compatible” means that software provided under the Change License can + be included in a program with software provided under GPL Version 2.0 or a + later version. Licensor may specify additional Change Licenses without + limitation. + +2. To either: (a) specify an additional grant of rights to use that does not + impose any additional restriction on the right granted in this License, as + the Additional Use Grant; or (b) insert the text “None”. + +3. To specify a Change Date. + +4. Not to modify this License in any other way. diff --git a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl new file mode 100644 index 000000000..42eb7ca7b --- /dev/null +++ b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl @@ -0,0 +1,10 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-define(TOPIC_PREFIX, "$LINK/cluster/"). +-define(CTRL_TOPIC_PREFIX, ?TOPIC_PREFIX "ctrl/"). +-define(ROUTE_TOPIC_PREFIX, ?TOPIC_PREFIX "route/"). +-define(MSG_TOPIC_PREFIX, ?TOPIC_PREFIX "msg/"). + +-define(DEST(FromClusterName), {external, {link, FromClusterName}}). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.app.src b/apps/emqx_cluster_link/src/emqx_cluster_link.app.src new file mode 100644 index 000000000..d8da0c1ee --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.app.src @@ -0,0 +1,24 @@ +%% -*- mode: erlang -*- +{application, emqx_cluster_link, [ + {description, "EMQX Cluster Linking"}, + % strict semver, bump manually! + {vsn, "0.1.0"}, + {modules, []}, + {registered, []}, + {applications, [ + kernel, + stdlib, + emqtt, + ecpool, + emqx, + emqx_resource + ]}, + {mod, {emqx_cluster_link_app, []}}, + {env, []}, + {licenses, ["Business Source License 1.1"]}, + {maintainers, ["EMQX Team "]}, + {links, [ + {"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx/emqx"} + ]} +]}. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl new file mode 100644 index 000000000..f0b0c95ba --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -0,0 +1,155 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link). + +-behaviour(emqx_external_broker). + +-export([ + register_external_broker/0, + unregister_external_broker/0, + maybe_add_route/1, + maybe_delete_route/1, + forward/2, + should_route_to_external_dests/1 +]). + +%% emqx hooks +-export([ + put_hook/0, + delete_hook/0, + on_message_publish/1 +]). + +-include("emqx_cluster_link.hrl"). +-include_lib("emqx/include/emqx.hrl"). +-include_lib("emqx/include/emqx_hooks.hrl"). +-include_lib("emqx/include/logger.hrl"). + +%%-------------------------------------------------------------------- +%% emqx_external_broker API +%%-------------------------------------------------------------------- + +register_external_broker() -> + emqx_external_broker:register_provider(?MODULE). 
+ +unregister_external_broker() -> + emqx_external_broker:unregister_provider(?MODULE). + +maybe_add_route(Topic) -> + emqx_cluster_link_coordinator:route_op(<<"add">>, Topic). + +maybe_delete_route(_Topic) -> + %% Not implemented yet + %% emqx_cluster_link_coordinator:route_op(<<"delete">>, Topic). + ok. + +forward(ExternalDest, Delivery) -> + emqx_cluster_link_mqtt:forward(ExternalDest, Delivery). + +%% Do not forward any external messages to other links. +%% Only forward locally originated messages to all the relevant links, i.e. no gossip message forwarding. +should_route_to_external_dests(#message{extra = #{link_origin := _}}) -> + false; +should_route_to_external_dests(_Msg) -> + true. + +%%-------------------------------------------------------------------- +%% EMQX Hooks +%%-------------------------------------------------------------------- + +on_message_publish(#message{topic = <<?ROUTE_TOPIC_PREFIX, ClusterName/binary>>, payload = Payload}) -> + _ = + case emqx_cluster_link_mqtt:decode_route_op(Payload) of + {add, Topics} when is_list(Topics) -> + add_routes(Topics, ClusterName); + {add, Topic} -> + emqx_router_syncer:push(add, Topic, ?DEST(ClusterName), #{}); + {delete, _} -> + %% Not implemented yet + ok; + cleanup_routes -> + cleanup_routes(ClusterName) + end, + {stop, []}; +on_message_publish(#message{topic = <<?MSG_TOPIC_PREFIX, ClusterName/binary>>, payload = Payload}) -> + case emqx_cluster_link_mqtt:decode_forwarded_msg(Payload) of + #message{} = ForwardedMsg -> + {stop, with_sender_name(ForwardedMsg, ClusterName)}; + _Err -> + %% Just ignore it. 
It must be already logged by the decoder + {stop, []} + end; +on_message_publish( + #message{topic = <<?CTRL_TOPIC_PREFIX, ClusterName/binary>>, payload = Payload} = Msg +) -> + case emqx_cluster_link_mqtt:decode_ctrl_msg(Payload, ClusterName) of + {init_link, InitRes} -> + on_init(InitRes, ClusterName, Msg); + {ack_link, Res} -> + on_init_ack(Res, ClusterName, Msg); + unlink -> + %% Stop pushing messages to the cluster that requested unlink, + %% It brings the link to a half-closed (unidirectional) state, + %% as this cluster may still replicate routes and receive messages from ClusterName. + emqx_cluster_link_mqtt:stop_msg_fwd_resource(ClusterName), + cleanup_routes(ClusterName) + end, + {stop, []}; +on_message_publish(_Msg) -> + ok. + +put_hook() -> + emqx_hooks:put('message.publish', {?MODULE, on_message_publish, []}, ?HP_SYS_MSGS). + +delete_hook() -> + emqx_hooks:del('message.publish', {?MODULE, on_message_publish, []}). + +%%-------------------------------------------------------------------- +%% Internal functions +%%-------------------------------------------------------------------- + +cleanup_routes(ClusterName) -> + emqx_router:cleanup_routes(?DEST(ClusterName)). + +lookup_link_conf(ClusterName) -> + lists:search( + fun(#{upstream := N}) -> N =:= ClusterName end, + emqx:get_config([cluster, links], []) + ). + +on_init(Res, ClusterName, Msg) -> + #{ + 'Correlation-Data' := ReqId, + 'Response-Topic' := RespTopic + } = emqx_message:get_header(properties, Msg), + case lookup_link_conf(ClusterName) of + {value, LinkConf} -> + _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), + emqx_cluster_link_mqtt:ack_link(ClusterName, Res, RespTopic, ReqId); + false -> + ?SLOG(error, #{ + msg => "init_link_request_from_unknown_cluster", + link_name => ClusterName + }), + %% Cannot ack/reply since we don't know how to reach the link cluster, + %% The cluster that tried to initiate this link is expected to eventually fail with timeout. + ok + end. 
+ +on_init_ack(Res, ClusterName, Msg) -> + #{'Correlation-Data' := ReqId} = emqx_message:get_header(properties, Msg), + emqx_cluster_link_coordinator:on_link_ack(ClusterName, ReqId, Res). + +add_routes(Topics, ClusterName) -> + lists:foreach( + fun(T) -> emqx_router_syncer:push(add, T, ?DEST(ClusterName), #{}) end, + Topics + ). + +%% let it crash if extra is not a map, +%% we don't expect the message to be forwarded from an older EMQX release, +%% that doesn't set extra = #{} by default. +with_sender_name(#message{extra = Extra} = Msg, ClusterName) when is_map(Extra) -> + Msg#message{extra = Extra#{link_origin => ClusterName}}. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl new file mode 100644 index 000000000..68dc07f48 --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl @@ -0,0 +1,61 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_app). + +-behaviour(application). + +-export([start/2, prep_stop/1, stop/1]). + +-define(BROKER_MOD, emqx_cluster_link). + +start(_StartType, _StartArgs) -> + emqx_cluster_link_config:add_handler(), + LinksConf = enabled_links(), + _ = + case LinksConf of + [_ | _] -> + ok = emqx_cluster_link:register_external_broker(), + ok = emqx_cluster_link:put_hook(), + ok = start_msg_fwd_resources(LinksConf); + _ -> + ok + end, + emqx_cluster_link_sup:start_link(LinksConf). + +prep_stop(State) -> + emqx_cluster_link_config:remove_handler(), + State. + +stop(_State) -> + _ = emqx_cluster_link:delete_hook(), + _ = emqx_cluster_link:unregister_external_broker(), + _ = stop_msg_fwd_resources(emqx:get_config([cluster, links], [])), + ok. 
+ +%%-------------------------------------------------------------------- +%% Internal functions +%%-------------------------------------------------------------------- + +enabled_links() -> + lists:filter( + fun(#{enable := IsEnabled}) -> IsEnabled =:= true end, + emqx:get_config([cluster, links], []) + ). + +start_msg_fwd_resources(LinksConf) -> + lists:foreach( + fun(LinkConf) -> + {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf) + end, + LinksConf + ). + +stop_msg_fwd_resources(LinksConf) -> + lists:foreach( + fun(#{upstream := Name}) -> + emqx_cluster_link_mqtt:stop_msg_fwd_resource(Name) + end, + LinksConf + ). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl new file mode 100644 index 000000000..ade3a8c97 --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -0,0 +1,162 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_config). + +-behaviour(emqx_config_handler). + +-include_lib("emqx/include/logger.hrl"). + +-define(LINKS_PATH, [cluster, links]). +-define(CERTS_PATH(LinkName), filename:join(["cluster", "links", LinkName])). + +-export([ + add_handler/0, + remove_handler/0 +]). + +-export([ + pre_config_update/3, + post_config_update/5 +]). + +add_handler() -> + ok = emqx_config_handler:add_handler(?LINKS_PATH, ?MODULE). + +remove_handler() -> + ok = emqx_config_handler:remove_handler(?LINKS_PATH). + +pre_config_update(?LINKS_PATH, RawConf, RawConf) -> + {ok, RawConf}; +pre_config_update(?LINKS_PATH, NewRawConf, _RawConf) -> + {ok, convert_certs(NewRawConf)}. 
+
+%% Applies a links config change: toggles the hook/provider, then stops
+%% removed links, starts added ones and restarts changed ones.
+%% Returns `ok' only if every per-link operation succeeded, otherwise an
+%% error map with the per-link results.
+post_config_update(?LINKS_PATH, _Req, Old, Old, _AppEnvs) ->
+    ok;
+post_config_update(?LINKS_PATH, _Req, New, Old, _AppEnvs) ->
+    ok = maybe_toggle_hook_and_provider(New),
+    #{
+        removed := Removed,
+        added := Added,
+        changed := Changed
+    } = emqx_utils:diff_lists(New, Old, fun upstream_name/1),
+    RemovedRes = remove_links(Removed),
+    AddedRes = add_links(Added),
+    UpdatedRes = update_links(Changed),
+    IsAllOk = all_ok(RemovedRes) andalso all_ok(AddedRes) andalso all_ok(UpdatedRes),
+    case IsAllOk of
+        true ->
+            ok;
+        false ->
+            {error, #{added => AddedRes, removed => RemovedRes, updated => UpdatedRes}}
+    end.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Installs the external broker and the hook if any link is enabled,
+%% removes both otherwise.
+maybe_toggle_hook_and_provider(LinksConf) ->
+    case is_any_enabled(LinksConf) of
+        true ->
+            ok = emqx_cluster_link:register_external_broker(),
+            ok = emqx_cluster_link:put_hook();
+        false ->
+            _ = emqx_cluster_link:delete_hook(),
+            _ = emqx_cluster_link:unregister_external_broker(),
+            ok
+    end.
+
+is_any_enabled(LinksConf) ->
+    lists:any(
+        fun(#{enable := IsEnabled}) -> IsEnabled =:= true end,
+        LinksConf
+    ).
+
+%% True if every result is `ok' or `{ok, _}'.
+all_ok(Results) ->
+    lists:all(
+        fun
+            (ok) -> true;
+            ({ok, _}) -> true;
+            (_) -> false
+        end,
+        Results
+    ).
+
+add_links(LinksConf) ->
+    [add_link(Link) || Link <- LinksConf].
+
+%% The flag is named `enable' everywhere else in this patch; matching on
+%% `enabled' here meant that an enabled link was never actually started.
+add_link(#{enable := true} = LinkConf) ->
+    %% NOTE: this can be started later during init_link phase, but it looks not harmful to start it beforehand...
+    MsgFwdRes = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf),
+    CoordRes = ensure_coordinator(LinkConf),
+    combine_results(CoordRes, MsgFwdRes);
+add_link(_DisabledLinkConf) ->
+    ok.
+
+%% A removed link may have been disabled and thus have no coordinator,
+%% which must not fail the whole config update.
+remove_links(LinksConf) ->
+    [ensure_link_removed(Link) || Link <- LinksConf].
+
+remove_link(LinkConf) ->
+    emqx_cluster_link_coord_sup:stop_coordinator(LinkConf).
+
+%% Like remove_link/1, but a coordinator that is not running counts as success.
+ensure_link_removed(LinkConf) ->
+    case remove_link(LinkConf) of
+        {error, not_found} -> ok;
+        Other -> Other
+    end.
+
+update_links(Changed) ->
+    [update_link(changed_link_conf(Item)) || Item <- Changed].
+
+%% emqx_utils:diff_lists/3 is expected to report changed elements as
+%% {Old, New} pairs (TODO confirm against emqx_utils); a bare link config
+%% is accepted as well.
+changed_link_conf({_OldLinkConf, NewLinkConf}) -> NewLinkConf;
+changed_link_conf(LinkConf) when is_map(LinkConf) -> LinkConf.
+
+%% TODO: do some updates without restart (at least without coordinator restart and re-election)
+update_link(#{enable := true} = LinkConf) ->
+    _ = remove_link(LinkConf),
+    add_link(LinkConf);
+update_link(#{enable := false} = LinkConf) ->
+    ensure_link_removed(LinkConf).
+
+%% Starts the coordinator of the link, or returns/restarts an existing one.
+%% Any other start error is returned to the caller instead of crashing the
+%% config handler with case_clause.
+ensure_coordinator(LinkConf) ->
+    case emqx_cluster_link_coord_sup:start_coordinator(LinkConf) of
+        {ok, Pid} ->
+            {ok, Pid};
+        {error, {already_started, Pid}} ->
+            {ok, Pid};
+        {error, already_present} ->
+            emqx_cluster_link_coord_sup:restart_coordinator(LinkConf);
+        {error, _} = Err ->
+            Err
+    end.
+
+%% `ok' when both the coordinator and the message forwarding resource
+%% results are successful (`ok' or `{ok, _}'), an error map otherwise.
+combine_results(CoordRes, MsgFwdRes) ->
+    case all_ok([CoordRes, MsgFwdRes]) of
+        true ->
+            ok;
+        false ->
+            {error, #{coordinator => CoordRes, msg_fwd_resource => MsgFwdRes}}
+    end.
+
+%% Link name, taken from either a checked (atom key) or a raw (binary key) config.
+upstream_name(#{upstream := N}) -> N;
+upstream_name(#{<<"upstream">> := N}) -> N.
+
+%% Runs the SSL options of every link (if present) through do_convert_certs/2.
+convert_certs(LinksConf) ->
+    lists:map(
+        fun
+            (#{ssl := SSLOpts} = LinkConf) ->
+                LinkConf#{ssl => do_convert_certs(upstream_name(LinkConf), SSLOpts)};
+            (#{<<"ssl">> := SSLOpts} = LinkConf) ->
+                LinkConf#{<<"ssl">> => do_convert_certs(upstream_name(LinkConf), SSLOpts)};
+            (LinkConf) ->
+                LinkConf
+        end,
+        LinksConf
+    ).
+
+%% Returns the SSL options produced by emqx_tls_lib:ensure_ssl_files/2
+%% (or the input options when it yields `undefined');
+%% logs and throws `{bad_ssl_config, Reason}' on failure.
+do_convert_certs(LinkName, SSLOpts) ->
+    case emqx_tls_lib:ensure_ssl_files(?CERTS_PATH(LinkName), SSLOpts) of
+        {ok, undefined} ->
+            SSLOpts;
+        {ok, SSLOpts1} ->
+            SSLOpts1;
+        {error, Reason} ->
+            ?SLOG(
+                error,
+                #{
+                    msg => "bad_ssl_config",
+                    config_path => ?LINKS_PATH,
+                    name => LinkName,
+                    reason => Reason
+                }
+            ),
+            throw({bad_ssl_config, Reason})
+    end.
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl
new file mode 100644
index 000000000..78fa030f2
--- /dev/null
+++ b/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl
@@ -0,0 +1,57 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_cluster_link_coord_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/1]).
+-export([init/1]).
+
+-export([
+    start_coordinator/1,
+    restart_coordinator/1,
+    stop_coordinator/1
+]).
+
+-define(SERVER, ?MODULE).
+-define(COORDINATOR_MOD, emqx_cluster_link_coordinator).
+
+%% Starts the locally registered supervisor of link coordinators.
+start_link(LinksConf) ->
+    supervisor:start_link({local, ?SERVER}, ?SERVER, LinksConf).
+
+%% one_for_one supervisor with one child per enabled link.
+init(LinksConf) ->
+    SupFlags = #{strategy => one_for_one, intensity => 10, period => 5},
+    ChildSpecs = children(LinksConf),
+    {ok, {SupFlags, ChildSpecs}}.
+
+%% Starts a coordinator child identified by the link (upstream) name.
+start_coordinator(#{upstream := ChildId} = LinkConf) ->
+    ChildSpec = worker_spec(ChildId, LinkConf),
+    supervisor:start_child(?SERVER, ChildSpec).
+
+%% Restarts the coordinator child of the given link.
+restart_coordinator(#{upstream := ChildId}) ->
+    supervisor:restart_child(?SERVER, ChildId).
+
+%% Terminates the coordinator child and, if that succeeded, deletes its spec.
+stop_coordinator(#{upstream := ChildId}) ->
+    Terminated = supervisor:terminate_child(?SERVER, ChildId),
+    delete_if_terminated(Terminated, ChildId).
+
+delete_if_terminated(ok, ChildId) ->
+    supervisor:delete_child(?SERVER, ChildId);
+delete_if_terminated(Err, _ChildId) ->
+    Err.
+
+%% Child spec of a permanent coordinator worker.
+worker_spec(Id, LinkConf) ->
+    #{
+        id => Id,
+        start => {?COORDINATOR_MOD, start_link, [LinkConf]},
+        restart => permanent,
+        shutdown => 5000,
+        type => worker,
+        modules => [?COORDINATOR_MOD]
+    }.
+
+%% Child specs for the links that have `enable := true'; other entries are skipped.
+children(LinksConf) ->
+    lists:filtermap(fun enabled_child_spec/1, LinksConf).
+
+enabled_child_spec(#{upstream := Name, enable := true} = Conf) ->
+    {true, worker_spec(Name, Conf)};
+enabled_child_spec(_Other) ->
+    false.
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl
new file mode 100644
index 000000000..4b8b9be8f
--- /dev/null
+++ b/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl
@@ -0,0 +1,454 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% @doc experimental prototype implementation.
+%% The idea is to add a sync point for all cluster route operations,
+%% so that routes can be batched/shrunk (by using emqx_router_syncer) before pushing them to linked clusters.
+%% The expected result is reduced communication between linked clusters:
+%% each node communicates with other clusters through the coordinator.
+%% The drawbacks are numerous though:
+%% - complexity/leader elections,
+%% - routes removal seems hard to implement unless remote cluster routes are stored per node,
+%%   in that case global coordinator per cluster is not needed any more. - TBD
+-module(emqx_cluster_link_coordinator).
+
+-behaviour(gen_statem).
+
+%% API
+-export([
+    route_op/2,
+    on_link_ack/3
+]).
+
+-export([start_link/1]).
+
+%% gen_statem
+-export([
+    callback_mode/0,
+    init/1,
+    terminate/3
+]).
+
+%% gen_statem state functions
+-export([
+    wait_for_coordinator/3,
+    connecting/3,
+    init_linking/3,
+    bootstrapping/3,
+    coordinating/3,
+    following/3
+]).
+
+-export([select_routes/1]).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_router.hrl").
+-include_lib("emqx/include/logger.hrl").
+
+-define(COORDINATOR(UpstreamName), {?MODULE, UpstreamName}).
+-define(SERVER, ?MODULE).
+-define(WAIT_COORD_RETRY_INTERVAL, 100).
+-define(CONN_RETRY_INTERVAL, 5000).
+-define(INIT_LINK_RESP_TIMEOUT, 15_000).
+-define(INIT_LINK_RETRIES, 5).
+-define(UPSTREAM_DEST, {external, {link, _}}).
+-define(IS_ROUTE_OP(Op), Op =:= <<"add">>; Op =:= <<"delete">>).
+
+%% Starts a (not locally registered) coordinator state machine for one link.
+start_link(Conf) ->
+    gen_statem:start_link(?MODULE, Conf, []).
+
+%% Casts the route operation to the coordinator of every configured link
+%% whose topic filters intersect with Topic; the intersection (not the
+%% original topic) is what gets sent.
+route_op(Op, Topic) ->
+    lists:foreach(
+        fun(#{upstream := UpstreamName, topics := LinkFilters}) ->
+            case topic_intersect_any(Topic, LinkFilters) of
+                false -> ok;
+                TopicOrFilter -> maybe_cast(UpstreamName, {Op, TopicOrFilter})
+            end
+        end,
+        %% Default to no links, like every other lookup of this config path,
+        %% so that a route op cannot crash when no links are configured.
+        emqx:get_config([cluster, links], [])
+    ).
+
+%% Forwards a link ack to the coordinator of the given cluster, if one is registered.
+on_link_ack(ClusterName, ReqId, Res) ->
+    maybe_cast(ClusterName, {ack_link, ClusterName, ReqId, Res}).
+
+callback_mode() ->
+    [state_functions, state_enter].
+
+%% gen_statem init: traps exits, syncs `global' and starts in the
+%% wait_for_coordinator state as a non-coordinator.
+init(LinkConf) ->
+    process_flag(trap_exit, true),
+    %% It helps to avoid unnecessary global name conflicts (and, as a result, coordinator re-election),
+    %% e.g. when a down node comes back
+    %% TODO: need to better understand `global` behaviour
+    _ = global:sync(),
+    Data = #{is_coordinator => false, link_conf => LinkConf},
+    {ok, wait_for_coordinator, Data}.
+
+%% State: no coordinator is known yet.
+%% Either registers itself under the global coordinator name (-> connecting)
+%% or starts monitoring the registered coordinator (-> following).
+wait_for_coordinator(enter, _OldState, _Data) ->
+    %% zero timeout: run the lookup/registration right after entering the state
+    {keep_state_and_data, [{state_timeout, 0, do_wait_for_coordinator}]};
+wait_for_coordinator(_, do_wait_for_coordinator, Data) ->
+    #{link_conf := #{upstream := Name}} = Data,
+    case global:whereis_name(?COORDINATOR(Name)) of
+        undefined ->
+            case register_coordinator(Name) of
+                yes ->
+                    {next_state, connecting, Data#{is_coordinator => true}};
+                no ->
+                    %% TODO: this should not happen forever, if it does, we need to detect it
+                    {keep_state_and_data, [
+                        {state_timeout, ?WAIT_COORD_RETRY_INTERVAL, do_wait_for_coordinator}
+                    ]}
+            end;
+        %% Can be a prev stale pid?
+        %% Let it crash with case_clause if it happens...
+        Pid when is_pid(Pid) andalso Pid =/= self() ->
+            Data1 = Data#{coordinator_mon => erlang:monitor(process, Pid), coordinator_pid => Pid},
+            {next_state, following, Data1}
+    end;
+wait_for_coordinator(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) ->
+    %% Ignore any route op, until bootstrapping is started.
+    %% All ignored route ops are expected to be caught up during the bootstrap.
+    keep_state_and_data;
+wait_for_coordinator(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% State (coordinator only): starting the routing connection pool.
+%% ensure_conn_pool/2 moves on to init_linking on success and schedules
+%% another `reconnect' otherwise.
+connecting(enter, _OldState, _Data) ->
+    {keep_state_and_data, [{state_timeout, 0, reconnect}]};
+connecting(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) ->
+    %% Ignore any route op, until bootstrapping is started.
+    %% All ignored route ops are expected to be caught up during the bootstrap.
+    keep_state_and_data;
+connecting(_EventType, reconnect, Data) ->
+    ensure_conn_pool(init_linking, Data);
+connecting(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% State: the link request is sent and its ack is awaited.
+%% The request is re-sent on every ?INIT_LINK_RESP_TIMEOUT expiry, at most
+%% ?INIT_LINK_RETRIES times; only an ack carrying the current request id is
+%% handled here.
+init_linking(enter, _OldState, Data) ->
+    {keep_state, Data#{link_retries => ?INIT_LINK_RETRIES}, [{state_timeout, 0, init_link}]};
+init_linking(cast, {ack_link, _ClusterName, ReqId, Res}, #{link_req_id := ReqId} = Data) ->
+    %% NOTE(review): any other `Res' (e.g. a different error reason) raises case_clause here.
+    case Res of
+        %% This state machine is not suitable to bootstrap the upstream cluster conditionally,
+        %% since it ignores any route ops received before bootstrapping...
+        {ok, #{proto_ver := _, need_bootstrap := _}} ->
+            {next_state, bootstrapping, maps:without([link_req_id, link_retries], Data)};
+        {error, <<"bad_upstream_name">>} ->
+            %% unrecoverable error that needs a user intervention,
+            %% TODO: maybe need to transition to some error state
+            {keep_state, maps:without([link_req_id, link_retries], Data), [{state_timeout, cancel}]}
+    end;
+init_linking(_, init_link, #{link_conf := #{upstream := Name}, link_retries := Retries} = Data) ->
+    case Retries > 0 of
+        true ->
+            %% crashes (badmatch) if the request could not be published
+            {ReqId, {ok, _}} = emqx_cluster_link_mqtt:init_link(Name),
+            Data1 = Data#{link_req_id => ReqId, link_retries => Retries - 1},
+            {keep_state, Data1, [{state_timeout, ?INIT_LINK_RESP_TIMEOUT, init_link}]};
+        false ->
+            ?SLOG(error, #{
+                msg => "no_link_ack_response_received",
+                link_name => Name
+            }),
+            %% unrecoverable error that needs a user intervention,
+            %% TODO: maybe need to transition to some error state
+            keep_state_and_data
+    end;
+init_linking(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) ->
+    %% Ignore any route op, until bootstrapping is started.
+    %% All ignored route ops are expected to be caught up during the bootstrap.
+    keep_state_and_data;
+init_linking(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% State: existing local routes are being published to the linked cluster
+%% by a separate, monitored process.
+bootstrapping(enter, _OldState, #{link_conf := LinkConf} = Data) ->
+    #{topics := LinkFilters, upstream := ClusterName} = LinkConf,
+    %% TODO add timeout?
+    {Pid, Ref} = erlang:spawn_monitor(fun() -> bootstrap(ClusterName, LinkFilters) end),
+    {keep_state, Data#{bootstrap_pid => Pid, bootstrap_ref => Ref}};
+bootstrapping(info, {'DOWN', Ref, process, _Pid, Reason}, #{bootstrap_ref := Ref} = Data) ->
+    %% TODO: think about the best way to proceed if bootstrapping failed,
+    %% perhaps just transition back to connecting state?
+    %% For now, a failed bootstrap crashes the coordinator (badmatch).
+    normal = Reason,
+    Data1 = maps:without([bootstrap_ref, bootstrap_pid], Data),
+    {next_state, coordinating, Data1};
+%% Accumulate new route ops, since there is no guarantee
+%% they will be included in the bootstrapped data
+bootstrapping(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) ->
+    {keep_state_and_data, [postpone]};
+bootstrapping(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% State: bootstrap is done; every route op is published to the linked
+%% cluster asynchronously. A publish error or a global name conflict stops
+%% the connection pool and leaves this state.
+coordinating(enter, _OldState, _Data) ->
+    keep_state_and_data;
+coordinating(cast, {Op, Topic}, Data) when ?IS_ROUTE_OP(Op) ->
+    #{link_conf := #{upstream := ClusterName}} = Data,
+    %% TODO: batching
+    case emqx_cluster_link_mqtt:publish_route_op(async, ClusterName, Op, Topic) of
+        {error, _} ->
+            %% Conn pool error, reconnect.
+            {next_state, connecting, stop_conn_pool(Data)};
+        _Ref ->
+            keep_state_and_data
+    end;
+%% TODO: this can also be received in other states, move to generic handler?
+coordinating(info, {global_name_conflict, CoordName}, Data) ->
+    LogData = #{
+        msg => "emqx_cluster_link_coordinator_name_conflict",
+        coordinator_name => CoordName
+    },
+    LogData1 =
+        %% TODO: this can be a previous (self) coordinator?
+        case global:whereis_name(CoordName) of
+            undefined -> LogData;
+            Pid -> LogData#{new_coordinator => Pid, coordinator_node => node(Pid)}
+        end,
+    ?SLOG(warning, LogData1),
+    Data1 = stop_conn_pool(Data),
+    {next_state, wait_for_coordinator, Data1#{is_coordinator => false}};
+%% only errors results are expected
+%% TODO: a single error causes reconnection and re-bootstrapping,
+%% it's worth considering some optimizations.
+coordinating(info, {pub_result, _Ref, {error, Reason}}, #{link_conf := #{upstream := Name}} = Data) ->
+    ?SLOG(error, #{
+        msg => "failed_to_replicate_route_op_to_linked_cluster",
+        link_name => Name,
+        reason => Reason
+    }),
+    %% TODO: check errors, some may be not possible to correct by re-connecting
+    Data1 = stop_conn_pool(Data),
+    {next_state, connecting, Data1};
+coordinating(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% State: another process is the coordinator and is being monitored;
+%% when it goes down, go back to wait_for_coordinator.
+following(enter, _OldState, _Data) ->
+    keep_state_and_data;
+following(info, {'DOWN', MRef, process, _Pid, _Info}, #{coordinator_mon := MRef} = Data) ->
+    {next_state, wait_for_coordinator, maps:without([coordinator_mon, coordinator_pid], Data)};
+following(EventType, Event, Data) ->
+    handle_event_(?FUNCTION_NAME, EventType, Event, Data).
+
+%% Fallback for events not handled by the current state function.
+%% A 'DOWN' of a monitored pool worker makes the coordinator stop the pool
+%% and reconnect; anything else is logged (or silently dropped for
+%% unrelated 'DOWN' messages) and ignored.
+handle_event_(_State, info, {'DOWN', Ref, process, _Pid, Reason}, Data) ->
+    case Data of
+        #{conn_pool_mons := #{Ref := WorkerName}, is_coordinator := true} ->
+            ?SLOG(warning, #{
+                msg => "cluster_link_route_connection_is_down",
+                reason => Reason,
+                worker => WorkerName
+            }),
+            {next_state, connecting, stop_conn_pool(Data)};
+        _ ->
+            %% Must be a stale 'DOWN' msg (e.g., from the next worker) which is already handled.
+            keep_state_and_data
+    end;
+handle_event_(State, EventType, Event, Data) ->
+    ?SLOG(warning, #{
+        msg => "unexpected_event",
+        event => Event,
+        event_type => EventType,
+        state => State,
+        data => Data
+    }),
+    keep_state_and_data.
+
+%% On a `shutdown' of the coordinator, asks the linked cluster to drop the
+%% link before the connection pool is stopped.
+terminate(Reason, _State, #{link_conf := #{upstream := ClusterName}} = Data) ->
+    %% TODO unregister coordinator?
+    IsCoordinator = maps:get(is_coordinator, Data, false),
+    ShouldUnlink = Reason =:= shutdown andalso IsCoordinator =:= true,
+    case ShouldUnlink of
+        true ->
+            %% must be sync, since we are going to stop the pool
+            %% NOTE: there is no guarantee that unlink op will arrive the last one
+            %% (since there may be other route op sent over another pool worker)
+            %% and clear everything, but it must be good enough to GC most of the routes.
+            _ = emqx_cluster_link_mqtt:remove_link(ClusterName);
+        false ->
+            ok
+    end,
+    _ = stop_conn_pool(Data),
+    ok.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Intersection of Topic with the first link filter it intersects with,
+%% or `false' if there is none.
+topic_intersect_any(_Topic, []) ->
+    false;
+topic_intersect_any(Topic, [Filter | Rest]) ->
+    case emqx_topic:intersection(Topic, Filter) of
+        false -> topic_intersect_any(Topic, Rest);
+        Intersection -> Intersection
+    end.
+
+%% Publishes all currently selected routes to the linked cluster, synchronously.
+bootstrap(ClusterName, LinkFilters) ->
+    %% TODO: do this in chunks
+    SelectedTopics = select_routes(LinkFilters),
+    {ok, _} = emqx_cluster_link_mqtt:publish_routes(sync, ClusterName, SelectedTopics).
+
+%% TODO: if a local route matches link filter exactly,
+%% it's enough to only select this matching filter itself and skip any other routes?
+%% E.g., local routes: "t/global/#", "t/global/1/+", cluster link topics = ["t/global/#"],
+%% it's enough to replicate "t/global/#" only to the linked cluster.
+%% What to do when the "t/global/#" subscriber unsubscribes
+%% and we start to get forwarded messages (e.g. "t/global/2/3") matching no subscribers?
+%% How can we efficiently replace "t/global/#" route with "t/global/1/+"
+%% (intersection of "t/global/#" and "t/global/1/+")?
+%% So maybe better not to do it at all and replicate both "t/global/1/+" and "t/global/#" ?
+select_routes(LinkFilters) ->
+    {WildcardFilters, PlainTopics} = lists:partition(fun emqx_topic:wildcard/1, LinkFilters),
+    TopicRoutes = select_routes_by_topics(PlainTopics),
+    WildcardRoutes = intersecting_routes(WildcardFilters),
+    AllRoutes = TopicRoutes ++ WildcardRoutes,
+    case emqx_router:get_schema_vsn() of
+        v1 ->
+            AllRoutes;
+        v2 ->
+            %% v2 stores filters (wildcard subscription routes) in a separate index,
+            %% so WildcardRoutes contains only non-wildcard routes matching wildcard link filters.
+            %% Thus, we need to select wildcard routes additionally
+            intersecting_routes_v2(WildcardFilters) ++ AllRoutes
+    end.
+
+%% Keeps the non-wildcard link topics that have at least one route which
+%% does not point to a linked cluster.
+select_routes_by_topics(Topics) ->
+    %% These are non-wildcard link topics, so we don't care about actual
+    %% routes as long as they are matched, and just need to replicate
+    %% topic routes to the linked cluster
+    [Topic || Topic <- Topics, has_local_route(Topic)].
+
+has_local_route(Topic) ->
+    case filter_out_upstream_routes(emqx_router:match_routes(Topic)) of
+        [_ | _] -> true;
+        _ -> false
+    end.
+
+%% Drops the routes whose destination is a linked cluster.
+filter_out_upstream_routes(Routes) ->
+    lists:filter(fun is_local_route/1, Routes).
+
+is_local_route(#route{dest = ?UPSTREAM_DEST}) -> false;
+is_local_route(_) -> true.
+
+%% selects only non-wildcard routes that match wildcards (filters),
+%% can only be done as a linear search over all routes
+intersecting_routes([]) ->
+    [];
+intersecting_routes(Wildcards) ->
+    Collect = fun(Route, Acc) -> add_route_intersection(Route, Wildcards, Acc) end,
+    %% a map is used as a set to deduplicate the intersections
+    maps:keys(ets:foldl(Collect, #{}, ?ROUTE_TAB)).
+
+add_route_intersection(#route{dest = ?UPSTREAM_DEST}, _Wildcards, Acc) ->
+    Acc;
+add_route_intersection(#route{topic = T}, Wildcards, Acc) ->
+    %% TODO: probably nice to validate cluster link topic filters
+    %% to have no intersections between each other?
+    case topic_intersect_any(T, Wildcards) of
+        false -> Acc;
+        Intersection -> Acc#{Intersection => undefined}
+    end.
+
+%% Intersections of the wildcard link filters with the filters found by
+%% matched_filters_v2/1.
+intersecting_routes_v2([]) ->
+    [];
+intersecting_routes_v2(Wildcards) ->
+    AddIntersections = fun(Wildcard, Acc) ->
+        all_intersections(Wildcard, matched_filters_v2(Wildcard), Acc)
+    end,
+    lists:foldl(AddIntersections, [], Wildcards).
+
+%% Distinct topics of the ?ROUTE_TAB_FILTERS entries matching Wildcard,
+%% excluding entries whose id is a linked-cluster destination.
+matched_filters_v2(Wildcard) ->
+    Matches = emqx_topic_index:matches_filter(Wildcard, ?ROUTE_TAB_FILTERS, []),
+    %% a map is used as a set to deduplicate the topics
+    TopicSet = maps:from_list([
+        {emqx_topic_index:get_topic(M), undefined}
+     || M <- Matches, not is_upstream_match(M)
+    ]),
+    maps:keys(TopicSet).
+
+is_upstream_match(Match) ->
+    case emqx_topic_index:get_id(Match) of
+        ?UPSTREAM_DEST -> true;
+        _ -> false
+    end.
+
+%% Adds the intersections of Wildcard with each filter to Acc0 and
+%% returns the result sorted and deduplicated.
+all_intersections(Wildcard, Filters, Acc0) ->
+    AddIntersection = fun(Filter, Acc) ->
+        case emqx_topic:intersection(Wildcard, Filter) of
+            false -> Acc;
+            Intersection -> [Intersection | Acc]
+        end
+    end,
+    lists:usort(lists:foldl(AddIntersection, Acc0, Filters)).
+
+%% Casts Msg to the globally registered coordinator of the link, if any.
+maybe_cast(UpstreamName, Msg) ->
+    case global:whereis_name(?COORDINATOR(UpstreamName)) of
+        undefined ->
+            %% Ignore and rely on coordinator bootstrapping once it's elected
+            ok;
+        Pid when is_pid(Pid) ->
+            gen_statem:cast(Pid, Msg)
+    end.
+
+%% Only core nodes try to register as the coordinator; other nodes get `no'.
+register_coordinator(UpstreamName) ->
+    IsCore = mria_config:role() =:= core,
+    case IsCore of
+        true ->
+            global:register_name(
+                ?COORDINATOR(UpstreamName), self(), fun global:random_notify_name/3
+            );
+        false ->
+            no
+    end.
+
+%% connecting state helper
+%% Starts the routing pool and stores the start result in the state data;
+%% on success the pool workers get monitored and the state machine moves
+%% to NextState, otherwise a `reconnect' is scheduled.
+ensure_conn_pool(NextState, #{link_conf := LinkConf} = Data) ->
+    StartRes = start_conn_pool(LinkConf),
+    conn_pool_transition(StartRes, NextState, LinkConf, Data#{conn_pool => StartRes}).
+
+conn_pool_transition({ok, _}, NextState, LinkConf, Data) ->
+    {next_state, NextState, Data#{conn_pool_mons => mon_pool_workers(LinkConf)}};
+conn_pool_transition(_Err, _NextState, _LinkConf, Data) ->
+    {keep_state, Data, [{state_timeout, ?CONN_RETRY_INTERVAL, reconnect}]}.
+
+%% Starts the routing pool of the link; a failure is logged and returned.
+start_conn_pool(#{upstream := Name} = LinkConf) ->
+    case emqx_cluster_link_mqtt:start_routing_pool(LinkConf) of
+        {ok, _Pid} = Ok ->
+            Ok;
+        {error, Reason} = Err ->
+            ?SLOG(error, #{
+                msg => "failed_to_connect_to_linked_cluster",
+                cluster_name => Name,
+                reason => Reason
+            }),
+            Err
+    end.
+
+%% Stops the routing pool if it was started successfully: demonitors its
+%% workers and records the stop result under `conn_pool'.
+%% The state data is returned unchanged otherwise.
+stop_conn_pool(#{link_conf := #{upstream := Name}, conn_pool := {ok, _}} = Data) ->
+    Data1 = maybe_unmointor_workers(Data),
+    StopRes = emqx_cluster_link_mqtt:stop_routing_pool(Name),
+    Data1#{conn_pool => {stopped, StopRes}};
+stop_conn_pool(#{link_conf := #{upstream := _Name}} = Data) ->
+    Data.
+
+%% Demonitors all monitored pool workers (without flushing) and forgets the monitors.
+maybe_unmointor_workers(#{conn_pool_mons := MonitorsMap} = Data) ->
+    ok = lists:foreach(fun erlang:demonitor/1, maps:keys(MonitorsMap)),
+    maps:remove(conn_pool_mons, Data);
+maybe_unmointor_workers(Data) ->
+    Data.
+
+%% Monitors every routing pool worker; returns a monitor ref => worker name map.
+mon_pool_workers(LinkConf) ->
+    Workers = emqx_cluster_link_mqtt:routing_pool_workers(LinkConf),
+    lists:foldl(
+        fun({Name, Pid}, Mons) -> Mons#{erlang:monitor(process, Pid) => Name} end,
+        #{},
+        Workers
+    ).
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl
new file mode 100644
index 000000000..1e9310aca
--- /dev/null
+++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl
@@ -0,0 +1,547 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_cluster_link_mqtt).
+
+-include("emqx_cluster_link.hrl").
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+
+%-include_lib("emqtt/include/emqtt.hrl").
+
+-behaviour(emqx_resource).
+-behaviour(ecpool_worker).
+
+%% ecpool
+-export([connect/1]).
+
+%% callbacks of behaviour emqx_resource
+-export([
+    callback_mode/0,
+    on_start/2,
+    on_stop/2,
+    on_query/3,
+    on_query_async/4,
+    on_get_status/2
+]).
+
+-export([
+    ensure_msg_fwd_resource/1,
+    stop_msg_fwd_resource/1,
+    start_routing_pool/1,
+    stop_routing_pool/1,
+    routing_pool_workers/1,
+    init_link/1,
+    ack_link/4,
+    remove_link/1,
+    publish_route_op/4,
+    publish_routes/3,
+    cleanup_routes/1,
+    decode_ctrl_msg/2,
+    decode_route_op/1,
+    decode_forwarded_msg/1
+]).
+
+-export([
+    forward/2
+]).
+
+-define(ROUTE_CLIENTID_SUFFIX, ":route:").
+-define(MSG_CLIENTID_SUFFIX, ":msg:").
+-define(CLIENTID(Base, Suffix), emqx_bridge_mqtt_lib:clientid_base([Base, Suffix])).
+
+-define(MQTT_HOST_OPTS, #{default_port => 1883}).
+-define(MY_CLUSTER_NAME, atom_to_binary(emqx_config:get([cluster, name]))).
+
+%% NOTE(review): the bodies of the three topic macros below were lost when
+%% this patch was extracted (only `<>' is left, which is not valid Erlang).
+%% Presumably each one is a binary made of a topic prefix followed by a
+%% cluster name - restore them from the original commit.
+-define(ROUTE_TOPIC, <>).
+-define(MSG_FWD_TOPIC, <>).
+-define(CTRL_TOPIC(ClusterName), <>).
+
+%% ecpool and emqx_resource names
+-define(ROUTE_POOL_PREFIX, "emqx_cluster_link_mqtt:route:").
+-define(MSG_POOL_PREFIX, "emqx_cluster_link_mqtt:msg:").
+%% Restored after extraction damage: a string literal prefix followed by
+%% the (binary) cluster name, as implied by the two macros that use it.
+%% TODO confirm against the original commit.
+-define(RES_NAME(Prefix, ClusterName), <<Prefix, ClusterName/binary>>).
+-define(ROUTE_POOL_NAME(ClusterName), ?RES_NAME(?ROUTE_POOL_PREFIX, ClusterName)).
+-define(MSG_RES_ID(ClusterName), ?RES_NAME(?MSG_POOL_PREFIX, ClusterName)).
+-define(HEALTH_CHECK_TIMEOUT, 1000).
+-define(RES_GROUP, <<"emqx_cluster_link">>).
+-define(DEFAULT_POOL_KEY, <<"default">>).
+
+%% Protocol
+-define(PROTO_VER, <<"1.0">>).
+-define(INIT_LINK_OP, <<"init_link">>).
+-define(ACK_LINK_OP, <<"ack_link">>).
+-define(UNLINK_OP, <<"unlink">>).
+-define(BATCH_ROUTES_OP, <<"add_routes">>).
+-define(CLEANUP_ROUTES_OP, <<"cleanup_routes">>).
+%% It's worth optimizing non-batch op payload size,
+%% thus it's encoded as a plain binary.
+%% Restored after extraction damage: the format is the one matched by
+%% decode_route_op1/1 (<<"add_", Topic/binary>>, <<"delete_", Topic/binary>>).
+-define(TOPIC_WITH_OP(Op, Topic), <<Op/binary, "_", Topic/binary>>).
+-define(DECODE(Payload), erlang:binary_to_term(Payload, [safe])).
+-define(ENCODE(Payload), erlang:term_to_binary(Payload)).
+
+-define(PUB_TIMEOUT, 10_000).
+
+%% Creates the local message forwarding resource of the link, with as many
+%% resource buffer workers as the link has pool workers.
+ensure_msg_fwd_resource(#{upstream := Name, pool_size := PoolSize} = ClusterConf) ->
+    ResConf = #{
+        query_mode => async,
+        start_after_created => true,
+        start_timeout => 5000,
+        health_check_interval => 5000,
+        %% TODO: configure res_buf_worker pool separately?
+        worker_pool_size => PoolSize
+    },
+    emqx_resource:create_local(?MSG_RES_ID(Name), ?RES_GROUP, ?MODULE, ClusterConf, ResConf).
+
+%% Stops the message forwarding resource of the given cluster.
+stop_msg_fwd_resource(ClusterName) ->
+    emqx_resource:stop(?MSG_RES_ID(ClusterName)).
+
+%%--------------------------------------------------------------------
+%% emqx_resource callbacks (message forwarding)
+%%--------------------------------------------------------------------
+
+callback_mode() -> async_if_possible.
+
+%% Starts a hash-typed pool of MQTT clients named after the resource id.
+on_start(ResourceId, #{pool_size := PoolSize} = ClusterConf) ->
+    PoolName = ResourceId,
+    ClientOpts = emqtt_client_opts(?MSG_CLIENTID_SUFFIX, ClusterConf),
+    PoolOpts = [
+        {name, PoolName},
+        {pool_size, PoolSize},
+        {pool_type, hash},
+        {client_opts, ClientOpts}
+    ],
+    ok = emqx_resource:allocate_resource(ResourceId, pool_name, PoolName),
+    case emqx_resource_pool:start(PoolName, ?MODULE, PoolOpts) of
+        ok ->
+            {ok, #{pool_name => PoolName, topic => ?MSG_FWD_TOPIC}};
+        {error, {start_pool_failed, _, Reason}} ->
+            {error, Reason}
+    end.
+
+%% Stops the pool that was allocated for the resource in on_start/2.
+on_stop(ResourceId, _State) ->
+    #{pool_name := PoolName} = emqx_resource:get_allocated_resources(ResourceId),
+    emqx_resource_pool:stop(PoolName).
+
+%% Synchronous publish; the pool worker is picked by the topic.
+%% A `#message{}' is encoded as a whole and published to the link topic,
+%% a `{Topic, Props, Payload, QoS}' query is published to Topic itself.
+on_query(_ResourceId, FwdMsg, #{pool_name := PoolName, topic := LinkTopic} = _State) when
+    is_record(FwdMsg, message)
+->
+    #message{topic = Topic, qos = QoS} = FwdMsg,
+    Publish = fun(ConnPid) ->
+        emqtt:publish(ConnPid, LinkTopic, ?ENCODE(FwdMsg), QoS)
+    end,
+    SendRes = ecpool:pick_and_do({PoolName, Topic}, Publish, no_handover),
+    handle_send_result(SendRes);
+on_query(_ResourceId, {Topic, Props, Payload, QoS}, #{pool_name := PoolName} = _State) ->
+    Publish = fun(ConnPid) ->
+        emqtt:publish(ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}])
+    end,
+    SendRes = ecpool:pick_and_do({PoolName, Topic}, Publish, no_handover),
+    handle_send_result(SendRes).
+
+%% Asynchronous publish of a forwarded message to the link topic;
+%% the publish result is passed to CallbackIn through on_async_result/2.
+on_query_async(
+    _ResourceId, FwdMsg, CallbackIn, #{pool_name := PoolName, topic := LinkTopic} = _State
+) ->
+    Callback = {fun on_async_result/2, [CallbackIn]},
+    #message{topic = Topic, qos = QoS} = FwdMsg,
+    PublishAsync = fun(ConnPid) ->
+        %% #delivery{} record has no valuable data for a remote link...
+        Payload = ?ENCODE(FwdMsg),
+        %% TODO: check override QOS requirements (if any)
+        emqtt:publish_async(ConnPid, LinkTopic, Payload, QoS, Callback)
+    end,
+    %% TODO check message ordering, pick by topic,client pair?
+    ecpool:pick_and_do({PoolName, Topic}, PublishAsync, no_handover).
+
+%% copied from emqx_bridge_mqtt_connector
+
+on_async_result(Callback, Result) ->
+    apply_callback_function(Callback, handle_send_result(Result)).
+
+%% Calls the callback with Result appended to its arguments (if any).
+apply_callback_function(F, Result) when is_function(F) ->
+    F(Result);
+apply_callback_function({F, A}, Result) when is_function(F), is_list(A) ->
+    erlang:apply(F, A ++ [Result]);
+apply_callback_function({M, F, A}, Result) when is_atom(M), is_atom(F), is_list(A) ->
+    erlang:apply(M, F, A ++ [Result]).
+
+%% Maps a publish result to `ok' or to a classified error.
+handle_send_result(ok) ->
+    ok;
+handle_send_result({ok, #{reason_code := ?RC_SUCCESS}}) ->
+    ok;
+handle_send_result({ok, #{reason_code := ?RC_NO_MATCHING_SUBSCRIBERS}}) ->
+    ok;
+handle_send_result({ok, Reply}) ->
+    {error, classify_reply(Reply)};
+handle_send_result({error, Reason}) ->
+    {error, classify_error(Reason)}.
+
+classify_reply(Reply = #{reason_code := _}) ->
+    {unrecoverable_error, Reply}.
+
+%% Disconnects and shutdowns are recoverable, anything else is not.
+classify_error(ecpool_empty) ->
+    {recoverable_error, disconnected};
+classify_error(Reason) ->
+    case is_recoverable(Reason) of
+        true -> {recoverable_error, Reason};
+        false -> {unrecoverable_error, Reason}
+    end.
+
+is_recoverable(disconnected) -> true;
+is_recoverable({disconnected, _RC, _}) -> true;
+is_recoverable({shutdown, _}) -> true;
+is_recoverable(shutdown) -> true;
+is_recoverable(_) -> false.
+
+%% copied from emqx_bridge_mqtt_connector
+%% Combined status of all pool workers; `connecting' if the check times out.
+on_get_status(_ResourceId, #{pool_name := PoolName} = _State) ->
+    Workers = [Worker || {_Name, Worker} <- ecpool:workers(PoolName)],
+    try emqx_utils:pmap(fun get_status/1, Workers, ?HEALTH_CHECK_TIMEOUT) of
+        Statuses ->
+            combine_status(Statuses)
+    catch
+        exit:timeout ->
+            connecting
+    end.
+
+%% Status of the client owned by a pool worker; `disconnected' if there is no client.
+get_status(Worker) ->
+    case ecpool_worker:client(Worker) of
+        {error, _} -> disconnected;
+        {ok, Client} -> status(Client)
+    end.
+
+%% Status of one MQTT client process: `connected' if it has a socket,
+%% `connecting' if not, `disconnected' if the process is gone.
+status(Pid) ->
+    try
+        case proplists:get_value(socket, emqtt:info(Pid)) of
+            Socket when Socket /= undefined ->
+                connected;
+            undefined ->
+                connecting
+        end
+    catch
+        exit:{noproc, _} ->
+            disconnected
+    end.
+
+combine_status(Statuses) ->
+    %% NOTE
+    %% Natural order of statuses: [connected, connecting, disconnected]
+    %% * `disconnected` wins over any other status
+    %% * `connecting` wins over `connected`
+    case lists:reverse(lists:usort(Statuses)) of
+        [Status | _] ->
+            Status;
+        [] ->
+            disconnected
+    end.
+
+%%--------------------------------------------------------------------
+%% ecpool
+%%--------------------------------------------------------------------
+
+%% ecpool worker callback: starts an MQTT client with a per-worker client id
+%% and connects it.
+connect(Options) ->
+    WorkerId = proplists:get_value(ecpool_worker_id, Options),
+    #{clientid := ClientId} = ClientOpts = proplists:get_value(client_opts, Options),
+    ClientId1 = emqx_bridge_mqtt_lib:bytes23([ClientId], WorkerId),
+    ClientOpts1 = ClientOpts#{clientid => ClientId1},
+    case emqtt:start_link(ClientOpts1) of
+        {ok, Pid} ->
+            case emqtt:connect(Pid) of
+                {ok, _Props} ->
+                    {ok, Pid};
+                Error ->
+                    Error
+            end;
+        {error, Reason} = Error ->
+            ?SLOG(error, #{
+                msg => "client_start_failed",
+                config => emqx_utils:redact(ClientOpts),
+                reason => Reason
+            }),
+            Error
+    end.
+
+%%--------------------------------------------------------------------
+%% Routing
+%%--------------------------------------------------------------------
+
+routing_pool_workers(#{upstream := ClusterName} = _ClusterConf) ->
+    ecpool:workers(?ROUTE_POOL_NAME(ClusterName)).
+
+start_routing_pool(#{upstream := ClusterName} = ClusterConf) ->
+    start_pool(?ROUTE_POOL_NAME(ClusterName), ?ROUTE_CLIENTID_SUFFIX, ClusterConf).
+
+stop_routing_pool(ClusterName) ->
+    ecpool:stop_sup_pool(?ROUTE_POOL_NAME(ClusterName)).
+
+%% Publishes (synchronously, QoS 1) a link request to the given cluster.
+%% Returns `{ReqId, PublishResult}', where ReqId is sent as `Correlation-Data'.
+init_link(ClusterName) ->
+    Payload = #{
+        <<"op">> => ?INIT_LINK_OP,
+        <<"proto_ver">> => ?PROTO_VER,
+        <<"upstream">> => ClusterName,
+        %% TODO: there may be no need to reserve it, as the payload is a map already?
+        <<"extra">> => #{}
+    },
+    ReqId = emqx_utils_conv:bin(emqx_utils:gen_id(16)),
+    Properties = #{
+        'Response-Topic' => ?CTRL_TOPIC(ClusterName),
+        'Correlation-Data' => ReqId
+    },
+    Topic = ?CTRL_TOPIC(?MY_CLUSTER_NAME),
+    {ReqId, publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, Properties, Topic, ?QOS_1)}.
+
+%% Sends the reply to a link request back to the requesting cluster:
+%% published to RespTopic with ReqId as `Correlation-Data'.
+ack_link(ClusterName, Result, RespTopic, ReqId) ->
+    Payload = #{
+        <<"op">> => ?ACK_LINK_OP,
+        %% The links may compare and downgrade/adjust protocol in future
+        <<"proto_ver">> => ?PROTO_VER,
+        %% may be used in future to avoid re-bootstrapping all the routes,
+        %% for example, if the connection was interrupted for a while but the cluster was healthy
+        %% and didn't lose any routes. In that case, retrying lost route updates would be sufficient.
+        %% For now, it's always true for simplicity reasons.
+        <<"need_bootstrap">> => true,
+        <<"extra">> => #{}
+    },
+    Payload1 =
+        case Result of
+            {ok, _} ->
+                Payload#{<<"result">> => <<"ok">>};
+            {error, Reason} ->
+                %% The key must be a binary, like all the other payload keys:
+                %% decode_ctrl_msg1/2 matches on <<"reason">>.
+                Payload#{<<"result">> => <<"error">>, <<"reason">> => Reason}
+        end,
+    Props = #{'Correlation-Data' => ReqId},
+    Query = {RespTopic, Props, Payload1, ?QOS_1},
+    %% Using msg forwarding resource to send the response back.
+    %% TODO: maybe async query?
+    emqx_resource:query(?MSG_RES_ID(ClusterName), Query, #{
+        query_mode => simple_sync, pick_key => RespTopic
+    }).
+
+%% Publishes (synchronously, QoS 0) an unlink request to the given cluster.
+remove_link(ClusterName) ->
+    Payload = #{<<"op">> => ?UNLINK_OP},
+    Topic = ?CTRL_TOPIC(?MY_CLUSTER_NAME),
+    publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, #{}, Topic, ?QOS_0).
+
+publish_routes(QueryType, ClusterName, Topics) ->
+    %% Picks the same pool worker consistently.
+    %% Although, as writes are idempotent we can pick it randomly - TBD.
+    publish_routes(QueryType, ClusterName, ?DEFAULT_POOL_KEY, Topics).
+
+%% Publishes a batch of topics to be added as routes on the linked cluster.
+publish_routes(QueryType, ClusterName, PoolKey, Topics) ->
+    Payload = #{<<"op">> => ?BATCH_ROUTES_OP, <<"topics">> => Topics},
+    publish(QueryType, ClusterName, PoolKey, Payload).
+
+%% Synchronously publishes a cleanup-routes op to ?ROUTE_TOPIC at QoS 0,
+%% using the default pool key.
+cleanup_routes(ClusterName) ->
+    Payload = #{<<"op">> => ?CLEANUP_ROUTES_OP},
+    publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, #{}, ?ROUTE_TOPIC, ?QOS_0).
+
+%% Publishes a single route op (`<<"add">>` or `<<"delete">>`) for `Topic`.
+%% The topic itself is passed as the pool pick key.
+publish_route_op(QueryType, ClusterName, Op, Topic) when Op =:= <<"add">>; Op =:= <<"delete">> ->
+    Payload = ?TOPIC_WITH_OP(Op, Topic),
+    publish(QueryType, ClusterName, Topic, Payload).
+
+%% publish/4: same as publish/5 with empty properties.
+publish(QueryType, ClusterName, PoolKey, Payload) ->
+    publish(QueryType, ClusterName, PoolKey, Payload, #{}).
+
+%% publish/5: same as publish/7 on ?ROUTE_TOPIC at QoS 1.
+publish(QueryType, ClusterName, PoolKey, Payload, Props) ->
+    %% Deletes are not implemented for now, writes are idempotent, so QOS_1 is fine.
+    publish(QueryType, ClusterName, PoolKey, Payload, Props, ?ROUTE_TOPIC, ?QOS_1).
+
+%% publish/7: encodes `Payload` with ?ENCODE and publishes it through a worker
+%% of the routing pool picked by `PoolKey`.
+%% * `async`: the pool fun calls `emqtt:publish_async/7` and returns a fresh
+%%   reference; publish_result/3 is registered as the completion callback.
+%% * `sync`: the pool fun returns whatever `emqtt:publish/5` returns.
+%% NOTE(review): `self()` is evaluated inside the pool fun; this assumes that
+%% with `no_handover` the fun runs in the calling process -- confirm against ecpool.
+publish(async, ClusterName, PoolKey, Payload, Props, Topic, QoS) ->
+    ecpool:pick_and_do(
+        {?ROUTE_POOL_NAME(ClusterName), PoolKey},
+        fun(ConnPid) ->
+            Ref = erlang:make_ref(),
+            Cb = {fun publish_result/3, [self(), Ref]},
+            emqtt:publish_async(
+                ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}], ?PUB_TIMEOUT, Cb
+            ),
+            Ref
+        end,
+        no_handover
+    );
+publish(sync, ClusterName, PoolKey, Payload, Props, Topic, QoS) ->
+    ecpool:pick_and_do(
+        {?ROUTE_POOL_NAME(ClusterName), PoolKey},
+        fun(ConnPid) ->
+            emqtt:publish(ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}])
+        end,
+        no_handover
+    ).
+
+%% Completion callback for async publishes: sends `{pub_result, Ref, Err}` to
+%% `Caller` only when handle_send_result/1 returns something other than `ok`.
+publish_result(Caller, Ref, Result) ->
+    case handle_send_result(Result) of
+        ok ->
+            %% avoid extra message passing, we only care about errors for now
+            ok;
+        Err ->
+            Caller ! {pub_result, Ref, Err}
+    end.
+
+%%--------------------------------------------------------------------
+%% Protocol
+%%--------------------------------------------------------------------
+
+%% Decodes a control-topic payload with ?DECODE and classifies it (see decode_ctrl_msg1/2).
+decode_ctrl_msg(Payload, ClusterName) ->
+    decode_ctrl_msg1(?DECODE(Payload), ClusterName).
+
+%% Classifies a decoded control message. Returns one of:
+%% * `{init_link, {ok, #{proto_ver := _}}}` or `{init_link, {error, <<"bad_upstream_name">>}}`
+%% * `{ack_link, {ok, #{proto_ver := _, need_bootstrap := _}}}` or `{ack_link, {error, Reason}}`
+%% * `unlink`
+%% There is no catch-all clause: any other payload fails with `function_clause`.
+decode_ctrl_msg1(
+    #{
+        <<"op">> := ?INIT_LINK_OP,
+        <<"proto_ver">> := ProtoVer,
+        <<"upstream">> := UpstreamName
+    },
+    ClusterName
+) ->
+    ProtoVer1 = decode_proto_ver(ProtoVer, ClusterName),
+    %% UpstreamName is the name the remote linked cluster uses to refer to this cluster,
+    %% so it must be equal to the local cluster name; clearer naming is desired...
+    MyClusterName = ?MY_CLUSTER_NAME,
+    case UpstreamName of
+        MyClusterName ->
+            {init_link, {ok, #{proto_ver => ProtoVer1}}};
+        _ ->
+            ?SLOG(error, #{
+                msg => "misconfigured_cluster_link_name",
+                %% How this cluster names itself
+                local_name => MyClusterName,
+                %% How the remote cluster names itself
+                link_name => ClusterName,
+                %% How the remote cluster names this local cluster
+                upstream_name => UpstreamName
+            }),
+            {init_link, {error, <<"bad_upstream_name">>}}
+    end;
+decode_ctrl_msg1(
+    #{
+        <<"op">> := ?ACK_LINK_OP,
+        <<"result">> := <<"ok">>,
+        <<"proto_ver">> := ProtoVer,
+        <<"need_bootstrap">> := IsBootstrapNeeded
+    },
+    ClusterName
+) ->
+    ProtoVer1 = decode_proto_ver(ProtoVer, ClusterName),
+    {ack_link, {ok, #{proto_ver => ProtoVer1, need_bootstrap => IsBootstrapNeeded}}};
+decode_ctrl_msg1(
+    #{
+        <<"op">> := ?ACK_LINK_OP,
+        <<"result">> := <<"error">>,
+        <<"reason">> := Reason
+    },
+    _ClusterName
+) ->
+    {ack_link, {error, Reason}};
+decode_ctrl_msg1(#{<<"op">> := ?UNLINK_OP}, _ClusterName) ->
+    unlink.
+
+%% Decodes a route-topic payload with ?DECODE and classifies it (see decode_route_op1/1).
+decode_route_op(Payload) ->
+    decode_route_op1(?DECODE(Payload)).
+
+%% Classifies a decoded route op:
+%% * `<<"add_", Topic>>` / `<<"delete_", Topic>>` binaries -> `{add, Topic}` / `{delete, Topic}`
+%% * batch-routes map -> `{add, Topics}` (a list of topics)
+%% * cleanup-routes map -> `cleanup_routes`
+%% * anything else is logged as a warning and returned as `{error, Payload}`.
+decode_route_op1(<<"add_", Topic/binary>>) ->
+    {add, Topic};
+decode_route_op1(<<"delete_", Topic/binary>>) ->
+    {delete, Topic};
+decode_route_op1(#{<<"op">> := ?BATCH_ROUTES_OP, <<"topics">> := Topics}) when is_list(Topics) ->
+    {add, Topics};
+decode_route_op1(#{<<"op">> := ?CLEANUP_ROUTES_OP}) ->
+    cleanup_routes;
+decode_route_op1(Payload) ->
+    ?SLOG(warning, #{
+        msg => "unexpected_cluster_link_route_op_payload",
+        payload => Payload
+    }),
+    {error, Payload}.
+
+%% Decodes a forwarded-message payload. Returns the `#message{}` record, or
+%% `{error, Payload}` (after logging a warning) when the decoded term is
+%% anything else.
+decode_forwarded_msg(Payload) ->
+    case ?DECODE(Payload) of
+        #message{} = Msg ->
+            Msg;
+        _ ->
+            ?SLOG(warning, #{
+                msg => "unexpected_cluster_link_forwarded_msg_payload",
+                payload => Payload
+            }),
+            {error, Payload}
+    end.
+
+%% Parses the remote protocol version into `{Major, Minor}` and logs a notice
+%% when it compares greater than the local ?PROTO_VER. The parsed version is
+%% returned in either case.
+%% NOTE(review): the guard compares Major and Minor independently, so e.g. a
+%% remote "0.9" against a local "1.0" is reported while an older minor of the
+%% same major is not. Only logging is affected; confirm whether "remote is
+%% newer" or "versions differ" is the intended condition.
+decode_proto_ver(ProtoVer, ClusterName) ->
+    {MyMajor, MyMinor} = decode_proto_ver1(?PROTO_VER),
+    case decode_proto_ver1(ProtoVer) of
+        {Major, Minor} = Res when
+            Major > MyMajor;
+            Minor > MyMinor
+        ->
+            ?SLOG(notice, #{
+                msg => "different_cluster_link_protocol_versions",
+                protocol_version => ?PROTO_VER,
+                link_protocol_version => ProtoVer,
+                link_name => ClusterName
+            }),
+            Res;
+        Res ->
+            Res
+    end.
+
+%% Splits a `<<"Major.Minor">>` binary into a pair of integers.
+decode_proto_ver1(ProtoVer) ->
+    [Major, Minor] = binary:split(ProtoVer, <<".">>),
+    %% Let it fail (for now), we don't expect invalid data to pass through the linking protocol..
+    {emqx_utils_conv:int(Major), emqx_utils_conv:int(Minor)}.
+
+%%--------------------------------------------------------------------
+%% emqx_external_broker
+%%--------------------------------------------------------------------
+
+%% Forwards the message of a delivery to the linked cluster through the
+%% message-forwarding resource, using the message topic as the pick key.
+forward({external, {link, ClusterName}}, #delivery{message = #message{topic = Topic} = Msg}) ->
+    QueryOpts = #{pick_key => Topic},
+    emqx_resource:query(?MSG_RES_ID(ClusterName), Msg, QueryOpts).
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Builds the emqtt client options for a link from its configuration.
+%% The base client id is the configured `clientid`, falling back to the local
+%% cluster name; `ClientIdSuffix` is applied through ?CLIENTID.
+emqtt_client_opts(
+    ClientIdSuffix, #{server := Server, ssl := #{enable := EnableSsl} = Ssl} = ClusterConf
+) ->
+    %% The schema field is named `clientid` (see emqx_cluster_link_schema),
+    %% like the other keys read from ClusterConf here. The former `client_id`
+    %% lookup never matched, so a configured client id was silently ignored.
+    BaseClientId = maps:get(clientid, ClusterConf, ?MY_CLUSTER_NAME),
+    ClientId = ?CLIENTID(BaseClientId, ClientIdSuffix),
+    #{hostname := Host, port := Port} = emqx_schema:parse_server(Server, ?MQTT_HOST_OPTS),
+    Opts = #{
+        host => Host,
+        port => Port,
+        clientid => ClientId,
+        proto_ver => v5,
+        ssl => EnableSsl,
+        ssl_opts => maps:to_list(maps:remove(enable, Ssl))
+    },
+    with_password(with_user(Opts, ClusterConf), ClusterConf).
+
+%% Adds `username` to the client options when the link config has one.
+with_user(Opts, #{username := U} = _ClusterConf) ->
+    Opts#{username => U};
+with_user(Opts, _ClusterConf) ->
+    Opts.
+
+%% Adds the unwrapped `password` to the client options when the link config has one.
+with_password(Opts, #{password := P} = _ClusterConf) ->
+    Opts#{password => emqx_secret:unwrap(P)};
+with_password(Opts, _ClusterConf) ->
+    Opts.
+
+%% Starts a supervised ecpool of `pool_size` workers of type `hash`, with this
+%% module as the worker callback module; the emqtt options are handed over
+%% under the `client_opts` key (read back in connect/1).
+start_pool(PoolName, ClientIdSuffix, #{pool_size := PoolSize} = ClusterConf) ->
+    ClientOpts = emqtt_client_opts(ClientIdSuffix, ClusterConf),
+    Opts = [
+        {name, PoolName},
+        {pool_size, PoolSize},
+        {pool_type, hash},
+        {client_opts, ClientOpts}
+    ],
+    ecpool:start_sup_pool(PoolName, ?MODULE, Opts).
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl
new file mode 100644
index 000000000..abdfff39f
--- /dev/null
+++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl
@@ -0,0 +1,56 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_cluster_link_schema).
+
+-behaviour(emqx_schema_hooks).
+
+-include_lib("hocon/include/hoconsc.hrl").
+
+-export([injected_fields/0]).
+
+-export([
+    roots/0,
+    fields/1,
+    namespace/0,
+    desc/1
+]).
+
+-define(MQTT_HOST_OPTS, #{default_port => 1883}).
+
+namespace() -> "cluster_linking".
+
+roots() -> [].
+
+injected_fields() ->
+    #{cluster => fields("cluster_linking")}.
+
+fields("cluster_linking") ->
+    [
+        %% TODO: validate and ensure upstream names are unique!
+        {links, ?HOCON(?ARRAY(?R_REF("link")), #{default => []})}
+    ];
+fields("link") ->
+    [
+        {enable, ?HOCON(boolean(), #{default => false})},
+        {upstream, ?HOCON(binary(), #{required => true})},
+        {server,
+            emqx_schema:servers_sc(#{required => true, desc => ?DESC("server")}, ?MQTT_HOST_OPTS)},
+        {clientid, ?HOCON(binary(), #{desc => ?DESC("clientid")})},
+        {username, ?HOCON(binary(), #{desc => ?DESC("username")})},
+        {password, emqx_schema_secret:mk(#{desc => ?DESC("password")})},
+        {ssl, #{
+            type => ?R_REF(emqx_schema, "ssl_client_opts"),
+            default => #{<<"enable">> => false},
+            desc => ?DESC("ssl")
+        }},
+        %% TODO: validate topics:
+        %% - basic topic validation
+        %% - non-overlapping (not intersecting) filters ?
+        {topics, ?HOCON(?ARRAY(binary()), #{required => true})},
+        {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})}
+    ].
+
+desc(_) ->
+    "todo".
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl
new file mode 100644
index 000000000..c98b9f4c5
--- /dev/null
+++ b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl
@@ -0,0 +1,36 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_cluster_link_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/1]).
+
+-export([init/1]).
+
+-define(COORD_SUP, emqx_cluster_link_coord_sup).
+-define(SERVER, ?MODULE).
+
+start_link(LinksConf) ->
+    supervisor:start_link({local, ?SERVER}, ?SERVER, LinksConf).
+
+init(LinksConf) ->
+    SupFlags = #{
+        strategy => one_for_one,
+        intensity => 10,
+        period => 5
+    },
+    Children = [sup_spec(?COORD_SUP, ?COORD_SUP, LinksConf)],
+    {ok, {SupFlags, Children}}.
+ +sup_spec(Id, Mod, Conf) -> + #{ + id => Id, + start => {Mod, start_link, [Conf]}, + restart => permanent, + shutdown => infinity, + type => supervisor, + modules => [Mod] + }. diff --git a/apps/emqx_conf/include/emqx_conf.hrl b/apps/emqx_conf/include/emqx_conf.hrl index 786b0f685..a0b5c820f 100644 --- a/apps/emqx_conf/include/emqx_conf.hrl +++ b/apps/emqx_conf/include/emqx_conf.hrl @@ -74,7 +74,9 @@ (?CE_AUTHN_PROVIDER_SCHEMA_MODS ++ ?EE_AUTHN_PROVIDER_SCHEMA_MODS) ). --define(OTHER_INJECTING_CONFIGS, ?AUTH_EXT_SCHEMA_MODS). +-define(CLUSTER_LINKING_SCHEMA_MODS, [emqx_cluster_link_schema]). + +-define(OTHER_INJECTING_CONFIGS, ?AUTH_EXT_SCHEMA_MODS ++ ?CLUSTER_LINKING_SCHEMA_MODS). -else. diff --git a/apps/emqx_conf/src/emqx_conf_schema.erl b/apps/emqx_conf/src/emqx_conf_schema.erl index 2c0de10aa..b4c59d291 100644 --- a/apps/emqx_conf/src/emqx_conf_schema.erl +++ b/apps/emqx_conf/src/emqx_conf_schema.erl @@ -255,7 +255,7 @@ fields("cluster") -> importance => ?IMPORTANCE_HIDDEN } )} - ]; + ] ++ emqx_schema_hooks:injection_point(cluster); fields(cluster_static) -> [ {"seeds", diff --git a/apps/emqx_machine/priv/reboot_lists.eterm b/apps/emqx_machine/priv/reboot_lists.eterm index e0e62d123..d9bcf9d25 100644 --- a/apps/emqx_machine/priv/reboot_lists.eterm +++ b/apps/emqx_machine/priv/reboot_lists.eterm @@ -133,7 +133,8 @@ emqx_bridge_syskeeper, emqx_bridge_confluent, emqx_ds_shared_sub, - emqx_auth_ext + emqx_auth_ext, + emqx_cluster_link ], %% must always be of type `load' ce_business_apps => From 2dd99c5a081d8e31592cdaa2d3e1b4231f7389e7 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 6 May 2024 17:29:20 +0200 Subject: [PATCH 02/46] feat(clusterlink): add facility to reconstruct remote routing table --- .../src/emqx_cluster_link_extrouter.erl | 318 ++++++++++++++++++ .../emqx_cluster_link_extrouter_SUITE.erl | 286 ++++++++++++++++ 2 files changed, 604 insertions(+) create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl create mode 
100644 apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl new file mode 100644 index 000000000..ec25461a7 --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -0,0 +1,318 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_extrouter). + +-export([create_tables/0]). + +%% Router API +-export([ + match_routes/1, + lookup_routes/1, + topics/0 +]). + +%% Actor API +-export([ + actor_init/3, + actor_apply_operation/3, + actor_gc/1 +]). + +%% Strictly monotonically increasing integer. +-type smint() :: integer(). + +%% Actor. +%% Identifies an independent route replication actor on the remote broker. +%% Usually something like `node()` or `{node(), _Shard}`. +-type actor() :: term(). + +%% Identifies incarnation of the actor. +%% In the event of actor restart, it's the actor's responsibility to keep track of +%% monotonicity of its incarnation number. Each time actor's incarnation increases, +%% we assume that all the state of the previous incarnations is lost. +-type incarnation() :: smint(). + +%% Operation. +%% RouteID should come in handy when two or more different routes on the actor side +%% are "intersected" to the same topic filter that needs to be replicated here. +-type op() :: {add | del, _TopicFilter :: binary(), _RouteID} | heartbeat. + +%% Basically a bit offset. +%% Each actor + incarnation pair occupies a separate lane in the multi-counter. 
+%% Example:
+%%   Actors | n1@ds  n2@ds  n3@ds
+%%    Lanes |   0      1      2
+%%      Op1 | n3@ds add client/42/# → MCounter += 1 bsl 2 = 4
+%%      Op2 | n2@ds add client/42/# → MCounter += 1 bsl 1 = 6
+%%      Op3 | n3@ds del client/42/# → MCounter -= 1 bsl 2 = 2
+%%      Op4 | n2@ds del client/42/# → MCounter -= 1 bsl 1 = 0 → route deleted
+-type lane() :: non_neg_integer().
+
+-define(DEFAULT_ACTOR_TTL_MS, 30_000).
+
+-define(EXTROUTE_SHARD, ?MODULE).
+-define(EXTROUTE_TAB, emqx_external_router_route).
+-define(EXTROUTE_ACTOR_TAB, emqx_external_router_actor).
+
+%% One record per (topic filter, route ID) entry; `mcounter` holds one bit per
+%% actor lane that currently has this route.
+-record(extroute, {
+    entry :: emqx_topic_index:key(_RouteID),
+    mcounter = 0 :: non_neg_integer()
+}).
+
+%% One record per known actor; `until` is the timestamp after which the actor
+%% is eligible for garbage collection (see actor_gc/1).
+-record(actor, {
+    id :: actor(),
+    incarnation :: incarnation(),
+    lane :: lane(),
+    until :: _Timestamp
+}).
+
+%%
+
+%% Creates the actor and route tables in the ?EXTROUTE_SHARD shard and returns
+%% the list of tables the caller should wait for.
+create_tables() ->
+    %% TODO: Table per link viable?
+    mria_config:set_dirty_shard(?EXTROUTE_SHARD, true),
+    ok = mria:create_table(?EXTROUTE_ACTOR_TAB, [
+        {type, set},
+        {rlog_shard, ?EXTROUTE_SHARD},
+        {storage, ram_copies},
+        {record_name, actor},
+        {attributes, record_info(fields, actor)}
+    ]),
+    ok = mria:create_table(?EXTROUTE_TAB, [
+        {type, ordered_set},
+        {rlog_shard, ?EXTROUTE_SHARD},
+        {storage, ram_copies},
+        {record_name, extroute},
+        {attributes, record_info(fields, extroute)},
+        {storage_properties, [
+            {ets, [
+                {read_concurrency, true},
+                {write_concurrency, true},
+                {decentralized_counters, true}
+            ]}
+        ]}
+    ]),
+    %% Both tables are created above, so both must be reported: callers pass
+    %% this list to `mria:wait_for_tables/1` and would otherwise never wait
+    %% for the actor table.
+    [?EXTROUTE_ACTOR_TAB, ?EXTROUTE_TAB].
+
+%%
+
+%% Returns the topic filters of all routes matching the concrete `Topic`.
+match_routes(Topic) ->
+    Matches = emqx_topic_index:matches(Topic, ?EXTROUTE_TAB, [unique]),
+    [match_to_route(M) || M <- Matches].
+
+%% Returns the topic filter of every route entry stored under exactly `Topic`
+%% (one element per route ID).
+lookup_routes(Topic) ->
+    Pat = #extroute{entry = emqx_topic_index:make_key(Topic, '$1'), _ = '_'},
+    %% `ets:match/2` returns variable bindings (here: the route IDs), not
+    %% records, so accessing `#extroute.entry` on its results raised
+    %% `badrecord`. `ets:match_object/2` returns the matching records.
+    [match_to_route(R#extroute.entry) || R <- ets:match_object(?EXTROUTE_TAB, Pat)].
+
+%% Returns the topic filter of every entry in the route table; the same filter
+%% appears once per route ID.
+topics() ->
+    Pat = #extroute{entry = '$1', _ = '_'},
+    [emqx_topic_index:get_topic(K) || [K] <- ets:match(?EXTROUTE_TAB, Pat)].
+
+match_to_route(M) ->
+    emqx_topic_index:get_topic(M).
+
+%%
+
+%% Per-actor handle returned by actor_init/3 and threaded through
+%% actor_apply_operation/3.
+-record(state, {
+    actor :: actor(),
+    incarnation :: incarnation(),
+    lane :: lane() | undefined
+}).
+
+-type state() :: #state{}.
+
+-type env() :: #{timestamp := _Milliseconds}.
+
+%% Registers (or re-registers) `Actor` with the given incarnation and returns
+%% its state. When an older incarnation is on record, its routes are cleaned
+%% up first and the initialization is retried. Raises (via transaction/2) when
+%% a newer incarnation is already registered.
+-spec actor_init(actor(), incarnation(), env()) -> {ok, state()}.
+actor_init(Actor, Incarnation, Env = #{timestamp := Now}) ->
+    %% FIXME: Sane transactions.
+    case transaction(fun mnesia_actor_init/3, [Actor, Incarnation, Now]) of
+        {ok, State} ->
+            {ok, State};
+        {reincarnate, Rec} ->
+            %% TODO: Do this asynchronously.
+            ok = clean_incarnation(Rec),
+            actor_init(Actor, Incarnation, Env)
+    end.
+
+%% Transaction body of actor_init/3. Outcomes:
+%% * same incarnation on record -> TTL bumped, existing lane reused
+%% * no record                  -> new lane assigned, record written
+%% * older incarnation on record -> `{reincarnate, OldRecord}`, nothing written
+%% * newer incarnation on record -> transaction aborted
+mnesia_actor_init(Actor, Incarnation, TS) ->
+    %% NOTE
+    %% We perform this heavy-weight transaction only in the case of a new route
+    %% replication connection. The implicit assumption is that each replication
+    %% channel is uniquely identified by the ClientID (reflecting the Actor), and
+    %% the broker will take care of ensuring that there's only one connection per
+    %% ClientID. There's always a chance of having stray process severely lagging
+    %% that applies some update out of the blue, but it seems impossible to prevent
+    %% it completely w/o transactions.
+    State = #state{actor = Actor, incarnation = Incarnation},
+    case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of
+        [#actor{incarnation = Incarnation, lane = Lane} = Rec] ->
+            ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write),
+            {ok, State#state{lane = Lane}};
+        [] ->
+            Lane = mnesia_assign_lane(),
+            Rec = #actor{
+                id = Actor,
+                incarnation = Incarnation,
+                lane = Lane,
+                until = bump_actor_ttl(TS)
+            },
+            ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec, write),
+            {ok, State#state{lane = Lane}};
+        [#actor{incarnation = Outdated} = Rec] when Incarnation > Outdated ->
+            {reincarnate, Rec};
+        [#actor{incarnation = Newer}] ->
+            mnesia:abort({outdated_incarnation_actor, Actor, Incarnation, Newer})
+    end.
+
+-spec actor_apply_operation(op(), state(), env()) -> state().
+%% Applies one operation on behalf of an actor and returns the state unchanged.
+%% * `{add | del, TopicFilter, ID}`: checks that the actor's incarnation is
+%%   still current, then sets/clears the actor's lane bit on the route entry.
+%% * `heartbeat`: bumps the actor's TTL inside a transaction.
+actor_apply_operation(
+    {OpName, TopicFilter, ID},
+    State = #state{actor = Actor, incarnation = Incarnation, lane = Lane},
+    _Env
+) ->
+    _ = assert_current_incarnation(Actor, Incarnation),
+    _ = apply_operation(emqx_topic_index:make_key(TopicFilter, ID), OpName, Lane),
+    State;
+actor_apply_operation(
+    heartbeat,
+    State = #state{actor = Actor, incarnation = Incarnation},
+    _Env = #{timestamp := Now}
+) ->
+    ok = transaction(fun mnesia_actor_heartbeat/3, [Actor, Incarnation, Now]),
+    State.
+
+%% apply_operation/3: reads the current multi-counter of `Entry` (0 when the
+%% entry does not exist) and delegates to apply_operation/4.
+apply_operation(Entry, OpName, Lane) ->
+    %% NOTE
+    %% This is safe sequence of operations only on core nodes. On replicants,
+    %% `mria:dirty_update_counter/3` will be replicated asynchronously, which
+    %% means this read can be stale.
+    % MCounter = ets:lookup_element(Tab, Entry, 2, 0),
+    case mnesia:dirty_read(?EXTROUTE_TAB, Entry) of
+        [#extroute{mcounter = MCounter}] ->
+            apply_operation(Entry, MCounter, OpName, Lane);
+        [] ->
+            apply_operation(Entry, 0, OpName, Lane)
+    end.
+
+%% apply_operation/4: sets (`add`) or clears (`del`) the bit `1 bsl Lane` in
+%% the entry's multi-counter and returns the resulting counter value.
+%% Repeating an `add` or a `del` is a no-op. When a `del` brings the counter
+%% to 0, the route record is deleted.
+apply_operation(Entry, MCounter, OpName, Lane) ->
+    %% NOTE
+    %% We are relying on the fact that changes to each individual lane of this
+    %% multi-counter are synchronized. Without this, such counter updates would
+    %% be unsafe. Instead, we would have to use another, more complex approach,
+    %% that runs `ets:lookup/2` + `ets:select_replace/2` in a loop until the
+    %% counter is updated accordingly.
+    Marker = 1 bsl Lane,
+    case MCounter band Marker of
+        0 when OpName =:= add ->
+            mria:dirty_update_counter(?EXTROUTE_TAB, Entry, Marker);
+        Marker when OpName =:= add ->
+            %% Already added.
+            MCounter;
+        Marker when OpName =:= del ->
+            case mria:dirty_update_counter(?EXTROUTE_TAB, Entry, -Marker) of
+                0 ->
+                    Record = #extroute{entry = Entry, mcounter = 0},
+                    ok = mria:dirty_delete_object(?EXTROUTE_TAB, Record),
+                    0;
+                C ->
+                    C
+            end;
+        0 when OpName =:= del ->
+            %% Already deleted.
+            MCounter
+    end.
+
+-spec actor_gc(env()) -> ok.
+%% Cleans up at most one actor whose `until` timestamp is older than `Now`;
+%% call repeatedly to collect several expired actors.
+actor_gc(#{timestamp := Now}) ->
+    MS = [{#actor{until = '$1', _ = '_'}, [{'<', '$1', Now}], ['$_']}],
+    case mnesia:dirty_select(?EXTROUTE_ACTOR_TAB, MS) of
+        [Rec | _Rest] ->
+            %% NOTE: One at a time.
+            clean_incarnation(Rec);
+        [] ->
+            ok
+    end.
+
+%% Returns the lowest lane number not used by any record in the actor table.
+%% Must run inside a transaction (uses `mnesia:foldl/4` with a write lock).
+mnesia_assign_lane() ->
+    Assignment = mnesia:foldl(
+        fun(#actor{lane = Lane}, Acc) ->
+            Acc bor (1 bsl Lane)
+        end,
+        0,
+        ?EXTROUTE_ACTOR_TAB,
+        write
+    ),
+    Lane = first_zero_bit(Assignment),
+    Lane.
+
+%% Transaction body: bumps the actor's TTL when the incarnation on record
+%% matches; aborts when the actor is unknown or registered under a different
+%% incarnation.
+mnesia_actor_heartbeat(Actor, Incarnation, TS) ->
+    case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of
+        [#actor{incarnation = Incarnation} = Rec] ->
+            ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write);
+        [#actor{incarnation = Outdated}] ->
+            mnesia:abort({outdated_incarnation_actor, Actor, Incarnation, Outdated});
+        [] ->
+            mnesia:abort({nonexistent_actor, Actor})
+    end.
+
+%% Removes the given actor incarnation and its routes in a transaction.
+clean_incarnation(Rec) ->
+    transaction(fun mnesia_clean_incarnation/1, [Rec]).
+
+%% Transaction body: if the actor is still registered under the same
+%% incarnation, clears its lane from every route and deletes the actor record;
+%% otherwise (record gone or renewed) does nothing.
+mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = Lane}) ->
+    case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of
+        [#actor{incarnation = Incarnation}] ->
+            _ = clean_lane(Lane),
+            mnesia:delete(?EXTROUTE_ACTOR_TAB, Actor, write);
+        _Renewed ->
+            ok
+    end.
+
+%% Applies a `del` for `Lane` to every entry of the route table; the fold
+%% accumulator is not used.
+clean_lane(Lane) ->
+    ets:foldl(
+        fun(#extroute{entry = Entry, mcounter = MCounter}, _) ->
+            apply_operation(Entry, MCounter, del, Lane)
+        end,
+        0,
+        ?EXTROUTE_TAB
+    ).
+
+%% Crashes with `badmatch` unless `Actor` is currently registered under
+%% exactly `Incarnation`.
+assert_current_incarnation(Actor, Incarnation) ->
+    %% NOTE
+    %% Ugly, but should not really happen anyway. This is a safety net for the case
+    %% when this process tries to apply some outdated operation for whatever reason
+    %% (e.g. heavy CPU starvation). Still, w/o transactions, it's just a best-effort
+    %% attempt.
+    [#actor{incarnation = Incarnation}] = mnesia:dirty_read(?EXTROUTE_ACTOR_TAB, Actor),
+    ok.
+
+%%
+
+%% Runs `Fun(Args...)` as a mria transaction on ?EXTROUTE_SHARD. Returns the
+%% transaction result, or raises `error(Reason)` when it is aborted.
+transaction(Fun, Args) ->
+    case mria:transaction(?EXTROUTE_SHARD, Fun, Args) of
+        {atomic, Result} ->
+            Result;
+        {aborted, Reason} ->
+            error(Reason)
+    end.
+
+%%
+
+%% Returns the zero-based index of the least significant 0 bit of `N`
+%% (e.g. 2#0111 -> 3, 2#0101 -> 1, 0 -> 0).
+first_zero_bit(N) ->
+    first_zero_bit(N, 0).
+
+first_zero_bit(N, I) ->
+    case N band 1 of
+        0 -> I;
+        _ -> first_zero_bit(N bsr 1, I + 1)
+    end.
+
+%%
+
+%% Expiry timestamp for an actor seen at `TS`.
+bump_actor_ttl(TS) ->
+    TS + get_actor_ttl().
+
+get_actor_ttl() ->
+    ?DEFAULT_ACTOR_TTL_MS.
diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl
new file mode 100644
index 000000000..e83698895
--- /dev/null
+++ b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl
@@ -0,0 +1,286 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_cluster_link_extrouter_SUITE).
+
+-include_lib("common_test/include/ct.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("emqx/include/asserts.hrl").
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+%%
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    Apps = emqx_cth_suite:start([], #{work_dir => emqx_cth_suite:work_dir(Config)}),
+    ok = init_db(),
+    [{apps, Apps} | Config].
+
+end_per_suite(Config) ->
+    ok = emqx_cth_suite:stop(?config(apps, Config)).
+
+init_per_testcase(TC, Config) ->
+    emqx_common_test_helpers:init_per_testcase(?MODULE, TC, Config).
+
+end_per_testcase(TC, Config) ->
+    emqx_common_test_helpers:end_per_testcase(?MODULE, TC, Config).
+
+init_db() ->
+    mria:wait_for_tables(emqx_cluster_link_extrouter:create_tables()).
+
+%%
+
+%% Three actors add and delete overlapping routes sequentially; the routing
+%% view must reflect the union of what the actors currently hold, and repeated
+%% operations must be idempotent.
+t_consistent_routing_view(_Config) ->
+    Actor1 = {?FUNCTION_NAME, 1},
+    Actor2 = {?FUNCTION_NAME, 2},
+    Actor3 = {?FUNCTION_NAME, 3},
+    {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()),
+    {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()),
+    {ok, AS30} = emqx_cluster_link_extrouter:actor_init(Actor3, 1, env()),
+    %% Add a few routes originating from different actors.
+    %% Also test that route operations are idempotent.
+    AS11 = apply_operation({add, <<"t/client/#">>, id}, AS10),
+    _AS11 = apply_operation({add, <<"t/client/#">>, id}, AS10),
+    AS21 = apply_operation({add, <<"t/client/#">>, id}, AS20),
+    AS31 = apply_operation({add, <<"t/client/+/+">>, id1}, AS30),
+    AS32 = apply_operation({add, <<"t/client/+/+">>, id2}, AS31),
+    _AS22 = apply_operation({del, <<"t/client/#">>, id}, AS21),
+    AS12 = apply_operation({add, <<"t/client/+/+">>, id1}, AS11),
+    AS33 = apply_operation({del, <<"t/client/+/+">>, id1}, AS32),
+    _AS34 = apply_operation({del, <<"t/client/+/+">>, id2}, AS33),
+    ?assertEqual(
+        [<<"t/client/#">>, <<"t/client/+/+">>],
+        topics_sorted()
+    ),
+    ?assertEqual(
+        [<<"t/client/#">>],
+        lists:sort(emqx_cluster_link_extrouter:match_routes(<<"t/client/42">>))
+    ),
+    %% Remove all routes from the actors.
+    AS13 = apply_operation({del, <<"t/client/#">>, id}, AS12),
+    AS14 = apply_operation({del, <<"t/client/+/+">>, id1}, AS13),
+    %% Repeating the delete must yield the very same state (AS14 is already bound).
+    AS14 = apply_operation({del, <<"t/client/+/+">>, id1}, AS13),
+    ?assertEqual(
+        [],
+        topics_sorted()
+    ).
+
+%% Re-initializing an actor with a higher incarnation drops the routes of its
+%% previous incarnation, and operations carrying the stale state must fail.
+t_actor_reincarnation(_Config) ->
+    Actor1 = {?FUNCTION_NAME, 1},
+    Actor2 = {?FUNCTION_NAME, 2},
+    {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()),
+    {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()),
+    AS11 = apply_operation({add, <<"topic/#">>, id}, AS10),
+    AS12 = apply_operation({add, <<"topic/42/+">>, id}, AS11),
+    AS21 = apply_operation({add, <<"topic/#">>, id}, AS20),
+    ?assertEqual(
+        [<<"topic/#">>, <<"topic/42/+">>],
+        topics_sorted()
+    ),
+    {ok, _AS3} = emqx_cluster_link_extrouter:actor_init(Actor1, 2, env()),
+    ?assertError(
+        _IncarnationMismatch,
+        apply_operation({add, <<"toolate/#">>, id}, AS12)
+    ),
+    ?assertEqual(
+        [<<"topic/#">>],
+        topics_sorted()
+    ),
+    {ok, _AS4} = emqx_cluster_link_extrouter:actor_init(Actor2, 2, env()),
+    ?assertError(
+        _IncarnationMismatch,
+        apply_operation({add, <<"toolate/#">>, id}, AS21)
+    ),
+    ?assertEqual(
+        [],
+        topics_sorted()
+    ).
+
+%% Expired actors are garbage-collected together with their routes.
+%% Both actors are initialized at timestamp 42 (see env/0); with the 30_000 ms
+%% actor TTL of the extrouter module, only Actor1 -- whose heartbeat at 50_000
+%% extends its lifetime -- survives the GC run at 60_000.
+t_actor_gc(_Config) ->
+    Actor1 = {?FUNCTION_NAME, 1},
+    Actor2 = {?FUNCTION_NAME, 2},
+    {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()),
+    {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()),
+    AS11 = apply_operation({add, <<"topic/#">>, id}, AS10),
+    AS12 = apply_operation({add, <<"topic/42/+">>, id}, AS11),
+    AS21 = apply_operation({add, <<"global/#">>, id}, AS20),
+    ?assertEqual(
+        [<<"global/#">>, <<"topic/#">>, <<"topic/42/+">>],
+        topics_sorted()
+    ),
+    _AS13 = apply_operation(heartbeat, AS12, 50_000),
+    ok = emqx_cluster_link_extrouter:actor_gc(env(60_000)),
+    ?assertEqual(
+        [<<"topic/#">>, <<"topic/42/+">>],
+        topics_sorted()
+    ),
+    ?assertError(
+        _IncarnationMismatch,
+        apply_operation({add, <<"toolate/#">>, id}, AS21)
+    ),
+    ok = emqx_cluster_link_extrouter:actor_gc(env(120_000)),
+    ?assertEqual(
+        [],
+        topics_sorted()
+    ).
+
+%% Runs three actors and a GC loop concurrently on the local node (each
+%% sequence repeated 10 times) and checks the final routing view.
+t_consistent_routing_view_concurrent_updates(_Config) ->
+    A1Seq = repeat(10, [
+        reincarnate,
+        {add, <<"t/client/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id1},
+        {del, <<"t/client/#">>, id}
+    ]),
+    A2Seq = repeat(10, [
+        {add, <<"global/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id2},
+        {del, <<"t/client/+/+">>, id1},
+        heartbeat
+    ]),
+    A3Seq = repeat(10, [
+        {add, <<"global/#">>, id},
+        {del, <<"global/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {del, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id2},
+        {del, <<"t/client/+/+">>, id2},
+        reincarnate
+    ]),
+    A4Seq = repeat(10, [
+        gc,
+        {sleep, 1}
+    ]),
+    _ = emqx_utils:pmap(
+        fun run_actor/1,
+        [
+            {{?FUNCTION_NAME, 1}, A1Seq},
+            {{?FUNCTION_NAME, 2}, A2Seq},
+            {{?FUNCTION_NAME, 3}, A3Seq},
+            {{?FUNCTION_NAME, gc}, A4Seq}
+        ],
+        infinity
+    ),
+    %% `t/client/+/+` is listed twice: once per route ID (id1 and id2).
+    ?assertEqual(
+        [<<"global/#">>, <<"t/client/+/+">>, <<"t/client/+/+">>],
+        topics_sorted()
+    ).
+
+%% 'init' / 'end' clauses: start a cluster of three core nodes, create the
+%% extrouter tables on each of them, and stop the cluster afterwards.
+t_consistent_routing_view_concurrent_cluster_updates('init', Config) ->
+    Specs = [
+        {emqx_external_router1, #{role => core}},
+        {emqx_external_router2, #{role => core}},
+        {emqx_external_router3, #{role => core}}
+    ],
+    Cluster = emqx_cth_cluster:start(
+        Specs,
+        #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)}
+    ),
+    ok = lists:foreach(
+        fun(Node) ->
+            ok = erpc:call(Node, ?MODULE, init_db, [])
+        end,
+        Cluster
+    ),
+    [{cluster, Cluster} | Config];
+t_consistent_routing_view_concurrent_cluster_updates('end', Config) ->
+    ok = emqx_cth_cluster:stop(?config(cluster, Config)).
+
+%% Same scenario as the single-node concurrent test, but each actor runs on a
+%% different cluster node (the GC loop shares N3); the view is checked on N1.
+t_consistent_routing_view_concurrent_cluster_updates(Config) ->
+    [N1, N2, N3] = ?config(cluster, Config),
+    A1Seq = repeat(10, [
+        reincarnate,
+        {add, <<"t/client/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id1},
+        {del, <<"t/client/#">>, id}
+    ]),
+    A2Seq = repeat(10, [
+        {add, <<"global/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id2},
+        {del, <<"t/client/+/+">>, id1},
+        heartbeat
+    ]),
+    A3Seq = repeat(10, [
+        {add, <<"global/#">>, id},
+        {del, <<"global/#">>, id},
+        {add, <<"t/client/+/+">>, id1},
+        {del, <<"t/client/+/+">>, id1},
+        {add, <<"t/client/+/+">>, id2},
+        {del, <<"t/client/+/+">>, id2},
+        reincarnate
+    ]),
+    A4Seq = repeat(10, [
+        gc,
+        {sleep, 1}
+    ]),
+    Runners = lists:map(
+        fun run_remote_actor/1,
+        [
+            {N1, {{?FUNCTION_NAME, 1}, A1Seq}},
+            {N2, {{?FUNCTION_NAME, 2}, A2Seq}},
+            {N3, {{?FUNCTION_NAME, 3}, A3Seq}},
+            {N3, {{?FUNCTION_NAME, gc}, A4Seq}}
+        ]
+    ),
+    %% Every runner must terminate normally.
+    [?assertReceive({'DOWN', MRef, _, Pid, normal}) || {Pid, MRef} <- Runners],
+    ?assertEqual(
+        [<<"global/#">>, <<"t/client/+/+">>, <<"t/client/+/+">>],
+        erpc:call(N1, ?MODULE, topics_sorted, [])
+    ).
+
+%% Spawns run_actor/1 on `Node` under a monitor; returns `{Pid, MonitorRef}`.
+run_remote_actor({Node, Run}) ->
+    erlang:spawn_monitor(Node, ?MODULE, run_actor, [Run]).
+
+%% Initializes `Actor` (incarnation 0, timestamp 0) and replays `Seq` in order.
+%% The 1-based position of each step (from `lists:enumerate/1`) is used as the
+%% timestamp of the step and, for `reincarnate`, also as the new incarnation.
+run_actor({Actor, Seq}) ->
+    {ok, AS0} = emqx_cluster_link_extrouter:actor_init(Actor, 0, env(0)),
+    lists:foldl(
+        fun
+            ({TS, {add, _, _} = Op}, AS) ->
+                apply_operation(Op, AS, TS);
+            ({TS, {del, _, _} = Op}, AS) ->
+                apply_operation(Op, AS, TS);
+            ({TS, heartbeat}, AS) ->
+                apply_operation(heartbeat, AS, TS);
+            ({TS, gc}, AS) ->
+                ok = emqx_cluster_link_extrouter:actor_gc(env(TS)),
+                AS;
+            ({_TS, {sleep, MS}}, AS) ->
+                ok = timer:sleep(MS),
+                AS;
+            ({TS, reincarnate}, _AS) ->
+                {ok, AS} = emqx_cluster_link_extrouter:actor_init(Actor, TS, env(TS)),
+                AS
+        end,
+        AS0,
+        lists:enumerate(Seq)
+    ).
+
+%%
+
+%% apply_operation/2: applies `Op` at the default test timestamp 42.
+apply_operation(Op, AS) ->
+    apply_operation(Op, AS, _TS = 42).
+ +apply_operation(Op, AS, TS) -> + emqx_cluster_link_extrouter:actor_apply_operation(Op, AS, env(TS)). + +env() -> + env(42). + +env(TS) -> + #{timestamp => TS}. + +topics_sorted() -> + lists:sort(emqx_cluster_link_extrouter:topics()). + +%% + +repeat(N, L) -> + lists:flatten(lists:duplicate(N, L)). From a53524c82602f8390a3fd251623477ec622858ae Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 6 May 2024 18:32:02 +0200 Subject: [PATCH 03/46] fix(cth-cluster): fix occasional case clauses during cluster bootup --- apps/emqx/test/emqx_cth_cluster.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index f3c5d97f9..981b2e5eb 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -158,7 +158,7 @@ wait_clustered([Node | Nodes] = All, Check, Deadline) -> nodes_not_running => NodesNotRunnging }} ); - {false, Nodes} -> + {false, _Nodes} -> timer:sleep(100), wait_clustered(All, Check, Deadline) end. From 4097585f5dae66b35ae290f4152c758a7c381ce4 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 6 May 2024 18:33:45 +0200 Subject: [PATCH 04/46] fix(clusterlink): ensure extrouter works on replicants This is sort of a quick fix to make things safe, but it will likely be a subject to the same drawbacks as the regular router in high-latency deployments: reduced throughput. --- .../src/emqx_cluster_link_extrouter.erl | 33 ++++++++++++++--- .../emqx_cluster_link_extrouter_SUITE.erl | 37 ++++++++++++++----- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index ec25461a7..76999f4cf 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -20,6 +20,14 @@ actor_gc/1 ]). 
+%% Internal API +-export([ + mnesia_actor_init/3, + mnesia_actor_heartbeat/3, + mnesia_clean_incarnation/1, + apply_actor_operation/5 +]). + %% Strictly monotonically increasing integer. -type smint() :: integer(). @@ -127,8 +135,8 @@ match_to_route(M) -> -spec actor_init(actor(), incarnation(), env()) -> {ok, state()}. actor_init(Actor, Incarnation, Env = #{timestamp := Now}) -> - %% FIXME: Sane transactions. - case transaction(fun mnesia_actor_init/3, [Actor, Incarnation, Now]) of + %% TODO: Rolling upgrade safety? + case transaction(fun ?MODULE:mnesia_actor_init/3, [Actor, Incarnation, Now]) of {ok, State} -> {ok, State}; {reincarnate, Rec} -> @@ -173,17 +181,30 @@ actor_apply_operation( State = #state{actor = Actor, incarnation = Incarnation, lane = Lane}, _Env ) -> - _ = assert_current_incarnation(Actor, Incarnation), - _ = apply_operation(emqx_topic_index:make_key(TopicFilter, ID), OpName, Lane), + Entry = emqx_topic_index:make_key(TopicFilter, ID), + case mria_config:whoami() of + Role when Role /= replicant -> + apply_actor_operation(Actor, Incarnation, Entry, OpName, Lane); + replicant -> + mria:async_dirty( + ?EXTROUTE_SHARD, + fun ?MODULE:apply_actor_operation/5, + [Actor, Incarnation, Entry, OpName, Lane] + ) + end, State; actor_apply_operation( heartbeat, State = #state{actor = Actor, incarnation = Incarnation}, _Env = #{timestamp := Now} ) -> - ok = transaction(fun mnesia_actor_heartbeat/3, [Actor, Incarnation, Now]), + ok = transaction(fun ?MODULE:mnesia_actor_heartbeat/3, [Actor, Incarnation, Now]), State. +apply_actor_operation(Actor, Incarnation, Entry, OpName, Lane) -> + _ = assert_current_incarnation(Actor, Incarnation), + apply_operation(Entry, OpName, Lane). + apply_operation(Entry, OpName, Lane) -> %% NOTE %% This is safe sequence of operations only on core nodes. On replicants, @@ -259,7 +280,7 @@ mnesia_actor_heartbeat(Actor, Incarnation, TS) -> end. clean_incarnation(Rec) -> - transaction(fun mnesia_clean_incarnation/1, [Rec]). 
+ transaction(fun ?MODULE:mnesia_clean_incarnation/1, [Rec]). mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = Lane}) -> case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl index e83698895..fffca47c7 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl @@ -33,6 +33,12 @@ end_per_testcase(TC, Config) -> init_db() -> mria:wait_for_tables(emqx_cluster_link_extrouter:create_tables()). +init_db_nodes(Nodes) -> + ok = lists:foreach( + fun(Node) -> ok = erpc:call(Node, ?MODULE, init_db, []) end, + Nodes + ). + %% t_consistent_routing_view(_Config) -> @@ -174,20 +180,15 @@ t_consistent_routing_view_concurrent_updates(_Config) -> t_consistent_routing_view_concurrent_cluster_updates('init', Config) -> Specs = [ - {emqx_external_router1, #{role => core}}, - {emqx_external_router2, #{role => core}}, - {emqx_external_router3, #{role => core}} + {emqx_cluster_link_extrouter1, #{role => core}}, + {emqx_cluster_link_extrouter2, #{role => core}}, + {emqx_cluster_link_extrouter3, #{role => core}} ], Cluster = emqx_cth_cluster:start( Specs, #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} ), - ok = lists:foreach( - fun(Node) -> - ok = erpc:call(Node, ?MODULE, init_db, []) - end, - Cluster - ), + ok = init_db_nodes(Cluster), [{cluster, Cluster} | Config]; t_consistent_routing_view_concurrent_cluster_updates('end', Config) -> ok = emqx_cth_cluster:stop(?config(cluster, Config)). @@ -236,6 +237,24 @@ t_consistent_routing_view_concurrent_cluster_updates(Config) -> erpc:call(N1, ?MODULE, topics_sorted, []) ). 
+t_consistent_routing_view_concurrent_cluster_replicant_updates('init', Config) -> + Specs = [ + {emqx_cluster_link_extrouter_repl1, #{role => core}}, + {emqx_cluster_link_extrouter_repl2, #{role => core}}, + {emqx_cluster_link_extrouter_repl3, #{role => replicant}} + ], + Cluster = emqx_cth_cluster:start( + Specs, + #{work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)} + ), + ok = init_db_nodes(Cluster), + [{cluster, Cluster} | Config]; +t_consistent_routing_view_concurrent_cluster_replicant_updates('end', Config) -> + ok = emqx_cth_cluster:stop(?config(cluster, Config)). + +t_consistent_routing_view_concurrent_cluster_replicant_updates(Config) -> + t_consistent_routing_view_concurrent_cluster_updates(Config). + run_remote_actor({Node, Run}) -> erlang:spawn_monitor(Node, ?MODULE, run_actor, [Run]). From 5bd9ee5c7f1923a935514189aa6f5c279d4ade57 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 10 May 2024 15:19:58 +0200 Subject: [PATCH 05/46] feat(utils): add `emqx_utils_ets:keyfoldl/3` function Designed to be used with `bag` / `duplicate_bag` tables. --- apps/emqx_utils/src/emqx_utils_ets.erl | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/apps/emqx_utils/src/emqx_utils_ets.erl b/apps/emqx_utils/src/emqx_utils_ets.erl index e737c8934..c2819dbeb 100644 --- a/apps/emqx_utils/src/emqx_utils_ets.erl +++ b/apps/emqx_utils/src/emqx_utils_ets.erl @@ -26,6 +26,8 @@ lookup_value/3 ]). +-export([keyfoldl/3]). + -export([delete/1]). %% Create an ets table. @@ -57,6 +59,24 @@ lookup_value(Tab, Key, Def) -> error:badarg -> Def end. +-spec keyfoldl(fun((_Key :: term(), Acc) -> Acc), Acc, ets:tab()) -> Acc. +keyfoldl(F, Acc, Tab) -> + true = ets:safe_fixtable(Tab, true), + First = ets:first(Tab), + try + keyfoldl(F, Acc, First, Tab) + after + ets:safe_fixtable(Tab, false) + end. + +keyfoldl(F, Acc, Key, Tab) -> + case Key of + '$end_of_table' -> + Acc; + _ -> + keyfoldl(F, F(Key, Acc), ets:next(Tab, Key), Tab) + end. 
+ %% Delete the ets table. -spec delete(ets:tab()) -> ok. delete(Tab) -> From 7b95273218d23795ceef9fb08990fd1cafebd82b Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 10 May 2024 15:21:58 +0200 Subject: [PATCH 06/46] feat(routesync): make syncer a bit more generic and reusable --- apps/emqx/src/emqx_broker_sup.erl | 2 +- apps/emqx/src/emqx_router_syncer.erl | 124 +++++++++++++++++++++------ 2 files changed, 98 insertions(+), 28 deletions(-) diff --git a/apps/emqx/src/emqx_broker_sup.erl b/apps/emqx/src/emqx_broker_sup.erl index 2abc43ceb..cda7e2167 100644 --- a/apps/emqx/src/emqx_broker_sup.erl +++ b/apps/emqx/src/emqx_broker_sup.erl @@ -47,7 +47,7 @@ init([]) -> router_syncer_pool, hash, PoolSize, - {emqx_router_syncer, start_link, []} + {emqx_router_syncer, start_link_pooled, []} ]), %% Shared subscription diff --git a/apps/emqx/src/emqx_router_syncer.erl b/apps/emqx/src/emqx_router_syncer.erl index d050d9d18..4756d0a37 100644 --- a/apps/emqx/src/emqx_router_syncer.erl +++ b/apps/emqx/src/emqx_router_syncer.erl @@ -21,11 +21,17 @@ -behaviour(gen_server). +-export([start_link/1]). -export([start_link/2]). +-export([start_link_pooled/2]). -export([push/4]). +-export([push/5]). -export([wait/1]). +-export([close/1]). +-export([open/1]). + -export([stats/0]). -export([ @@ -38,6 +44,15 @@ -type action() :: add | delete. +-type options() :: #{ + max_batch_size => pos_integer(), + min_sync_interval => non_neg_integer(), + error_delay => non_neg_integer(), + error_retry_interval => non_neg_integer(), + initial_state => open | closed, + batch_handler => {module(), _Function :: atom(), _Args :: list()} +}. + -define(POOL, router_syncer_pool). -define(MAX_BATCH_SIZE, 1000). @@ -77,13 +92,23 @@ %% --spec start_link(atom(), pos_integer()) -> +-spec start_link(options()) -> + {ok, pid()} | {error, _Reason}. +start_link(Options) -> + gen_server:start_link(?MODULE, mk_state(Options), []). + +-spec start_link(_Name, options()) -> + {ok, pid()} | {error, _Reason}. 
+start_link(Name, Options) -> + gen_server:start_link(Name, ?MODULE, mk_state(Options), []). + +-spec start_link_pooled(atom(), pos_integer()) -> {ok, pid()}. -start_link(Pool, Id) -> +start_link_pooled(Pool, Id) -> gen_server:start_link( {local, emqx_utils:proc_name(?MODULE, Id)}, ?MODULE, - [Pool, Id], + {Pool, Id, mk_state(#{})}, [] ). @@ -93,9 +118,16 @@ when Opts :: #{reply => pid()}. push(Action, Topic, Dest, Opts) -> Worker = gproc_pool:pick_worker(?POOL, Topic), + push(Worker, Action, Topic, Dest, Opts). + +-spec push(_Ref, action(), emqx_types:topic(), emqx_router:dest(), Opts) -> + ok | _WaitRef :: reference() +when + Opts :: #{reply => pid()}. +push(Ref, Action, Topic, Dest, Opts) -> Prio = designate_prio(Action, Opts), Context = mk_push_context(Opts), - _ = erlang:send(Worker, ?PUSH(Prio, {Action, Topic, Dest, Context})), + _ = gproc:send(Ref, ?PUSH(Prio, {Action, Topic, Dest, Context})), case Context of [{MRef, _}] -> MRef; @@ -134,6 +166,14 @@ mk_push_context(_) -> %% +close(Ref) -> + gen_server:call(Ref, close, infinity). + +open(Ref) -> + gen_server:call(Ref, open, infinity). + +%% + -type stats() :: #{ size := non_neg_integer(), n_add := non_neg_integer(), @@ -149,10 +189,34 @@ stats() -> %% -init([Pool, Id]) -> - true = gproc_pool:connect_worker(Pool, {Pool, Id}), - {ok, #{stash => stash_new()}}. +mk_state(Options) -> + #{ + state => maps:get(initial_state, Options, open), + stash => stash_new(), + retry_timer => undefined, + max_batch_size => maps:get(max_batch_size, Options, ?MAX_BATCH_SIZE), + min_sync_interval => maps:get(min_sync_interval, Options, ?MIN_SYNC_INTERVAL), + error_delay => maps:get(error_delay, Options, ?ERROR_DELAY), + error_retry_interval => maps:get(error_retry_interval, Options, ?ERROR_RETRY_INTERVAL), + batch_handler => maps:get(batch_handler, Options, default) + }. +%% + +init({Pool, Id, State}) -> + true = gproc_pool:connect_worker(Pool, {Pool, Id}), + {ok, State}; +init(State) -> + {ok, State}. 
+ +handle_call(close, _From, State) -> + NState = State#{state := closed}, + {reply, ok, NState}; +handle_call(open, _From, State = #{state := closed}) -> + NState = run_batch_loop([], State#{state := open}), + {reply, ok, NState}; +handle_call(open, _From, State) -> + {reply, ok, State}; handle_call(stats, _From, State = #{stash := Stash}) -> {reply, stash_stats(Stash), State}; handle_call(_Call, _From, State) -> @@ -162,11 +226,11 @@ handle_cast(_Msg, State) -> {noreply, State}. handle_info({timeout, _TRef, retry}, State) -> - NState = run_batch_loop([], maps:remove(retry_timer, State)), + NState = run_batch_loop([], State#{retry_timer := undefined}), {noreply, NState}; -handle_info(Push = ?PUSH(_, _), State) -> +handle_info(Push = ?PUSH(_, _), State = #{min_sync_interval := MSI}) -> %% NOTE: Wait a bit to collect potentially overlapping operations. - ok = timer:sleep(?MIN_SYNC_INTERVAL), + ok = timer:sleep(MSI), NState = run_batch_loop([Push], State), {noreply, NState}. @@ -175,12 +239,16 @@ terminate(_Reason, _State) -> %% -run_batch_loop(Incoming, State = #{stash := Stash0}) -> +run_batch_loop(Incoming, State = #{stash := Stash0, state := closed}) -> Stash1 = stash_add(Incoming, Stash0), Stash2 = stash_drain(Stash1), - {Batch, Stash3} = mk_batch(Stash2), + State#{stash := Stash2}; +run_batch_loop(Incoming, State = #{stash := Stash0, max_batch_size := MBS}) -> + Stash1 = stash_add(Incoming, Stash0), + Stash2 = stash_drain(Stash1), + {Batch, Stash3} = mk_batch(Stash2, MBS), ?tp_ignore_side_effects_in_prod(router_syncer_new_batch, batch_stats(Batch, Stash3)), - case run_batch(Batch) of + case run_batch(Batch, State) of Status = #{} -> ok = send_replies(Status, Batch), NState = cancel_retry_timer(State#{stash := Stash3}), @@ -203,37 +271,37 @@ run_batch_loop(Incoming, State = #{stash := Stash0}) -> batch => batch_stats(Batch, Stash3) }), NState = State#{stash := Stash2}, - ok = timer:sleep(?ERROR_DELAY), + ok = error_cooldown(NState), ensure_retry_timer(NState) 
end. +error_cooldown(#{error_delay := ED}) -> + timer:sleep(ED). + +ensure_retry_timer(State = #{retry_timer := undefined, error_retry_interval := ERI}) -> + TRef = emqx_utils:start_timer(ERI, retry), + State#{retry_timer := TRef}; ensure_retry_timer(State = #{retry_timer := _TRef}) -> - State; -ensure_retry_timer(State) -> - TRef = emqx_utils:start_timer(?ERROR_RETRY_INTERVAL, retry), - State#{retry_timer => TRef}. + State. cancel_retry_timer(State = #{retry_timer := TRef}) -> ok = emqx_utils:cancel_timer(TRef), - maps:remove(retry_timer, State); + State#{retry_timer := undefined}; cancel_retry_timer(State) -> State. %% -mk_batch(Stash) when map_size(Stash) =< ?MAX_BATCH_SIZE -> +mk_batch(Stash, BatchSize) when map_size(Stash) =< BatchSize -> %% This is perfect situation, we just use stash as batch w/o extra reallocations. {Stash, stash_new()}; -mk_batch(Stash) -> +mk_batch(Stash, BatchSize) -> %% Take a subset of stashed operations to form a batch. %% Note that stash is an unordered map, it's not a queue. The order of operations is %% not preserved strictly, only loosely, because of how we start from high priority %% operations and go down to low priority ones. This might cause some operations to %% stay in stash for unfairly long time, when there are many high priority operations. %% However, it's unclear how likely this is to happen in practice. - mk_batch(Stash, ?MAX_BATCH_SIZE). - -mk_batch(Stash, BatchSize) -> mk_batch(?PRIO_HI, #{}, BatchSize, Stash). mk_batch(Prio, Batch, SizeLeft, Stash) -> @@ -278,10 +346,12 @@ replyctx_send(Result, RefsPids) -> %% -run_batch(Batch) when map_size(Batch) > 0 -> +run_batch(Empty, _State) when Empty =:= #{} -> + #{}; +run_batch(Batch, #{batch_handler := default}) -> catch emqx_router:do_batch(Batch); -run_batch(_Empty) -> - #{}. +run_batch(Batch, #{batch_handler := {Module, Function, Args}}) -> + erlang:apply(Module, Function, [Batch | Args]). 
%% From cbd01ae8182824138510fd0edebf6a6cccff1922 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 10 May 2024 15:30:01 +0200 Subject: [PATCH 07/46] feat(clusterlink): add node-local route sync actor implementation --- .../src/emqx_cluster_link_config.erl | 59 ++++ .../src/emqx_cluster_link_mqtt.erl | 30 ++ .../emqx_cluster_link_router_bootstrap.erl | 83 +++++ .../src/emqx_cluster_link_router_syncer.erl | 321 ++++++++++++++++++ apps/emqx_utils/src/emqx_utils.erl | 4 + 5 files changed, 497 insertions(+) create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index ade3a8c97..bdbb702ca 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -11,6 +11,18 @@ -define(LINKS_PATH, [cluster, links]). -define(CERTS_PATH(LinkName), filename:join(["cluster", "links", LinkName])). +-define(MQTT_HOST_OPTS, #{default_port => 1883}). + +-export([ + %% General + cluster/0, + links/0, + link/1, + topic_filters/1, + %% Connections + emqtt_options/1 +]). + -export([ add_handler/0, remove_handler/0 @@ -21,6 +33,53 @@ post_config_update/5 ]). +%% + +cluster() -> + atom_to_binary(emqx_config:get([cluster, name])). + +links() -> + emqx:get_config(?LINKS_PATH, []). + +link(Name) -> + case lists:dropwhile(fun(L) -> Name =/= upstream_name(L) end, links()) of + [LinkConf | _] -> LinkConf; + [] -> undefined + end. + +emqtt_options(LinkName) -> + emqx_maybe:apply(fun mk_emqtt_options/1, ?MODULE:link(LinkName)). + +topic_filters(LinkName) -> + maps:get(filters, ?MODULE:link(LinkName), []). 
+ +%% + +mk_emqtt_options(#{server := Server, ssl := #{enable := EnableSsl} = Ssl} = LinkConf) -> + ClientId = maps:get(client_id, LinkConf, cluster()), + #{hostname := Host, port := Port} = emqx_schema:parse_server(Server, ?MQTT_HOST_OPTS), + Opts = #{ + host => Host, + port => Port, + clientid => ClientId, + proto_ver => v5, + ssl => EnableSsl, + ssl_opts => maps:to_list(maps:remove(enable, Ssl)) + }, + with_password(with_user(Opts, LinkConf), LinkConf). + +with_user(Opts, #{username := U} = _LinkConf) -> + Opts#{username => U}; +with_user(Opts, _LinkConf) -> + Opts. + +with_password(Opts, #{password := P} = _LinkConf) -> + Opts#{password => emqx_secret:unwrap(P)}; +with_password(Opts, _LinkConf) -> + Opts. + +%% + add_handler() -> ok = emqx_config_handler:add_handler(?LINKS_PATH, ?MODULE). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 1e9310aca..b111be954 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -44,6 +44,11 @@ decode_forwarded_msg/1 ]). +-export([ + publish_route_sync/4, + encode_field/2 +]). + -export([ forward/2 ]). @@ -79,9 +84,19 @@ %% It's worth optimizing non-batch op payload size, %% thus it's encoded as a plain binary -define(TOPIC_WITH_OP(Op, Topic), <>). + -define(DECODE(Payload), erlang:binary_to_term(Payload, [safe])). -define(ENCODE(Payload), erlang:term_to_binary(Payload)). +-define(F_OPERATION, '$op'). +-define(OP_ROUTE, <<"route">>). + +-define(F_ACTOR, 10). +-define(F_INCARNATION, 11). +-define(F_ROUTES, 12). + +-define(ROUTE_DELETE, 100). + -define(PUB_TIMEOUT, 10_000). ensure_msg_fwd_resource(#{upstream := Name, pool_size := PoolSize} = ClusterConf) -> @@ -388,6 +403,16 @@ publish_result(Caller, Ref, Result) -> Caller ! {pub_result, Ref, Err} end. 
+publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> + PubTopic = ?ROUTE_TOPIC, + Payload = #{ + ?F_OPERATION => ?OP_ROUTE, + ?F_ACTOR => Actor, + ?F_INCARNATION => Incarnation, + ?F_ROUTES => Updates + }, + emqtt:publish(ClientPid, PubTopic, ?ENCODE(Payload), ?QOS_1). + %%-------------------------------------------------------------------- %% Protocol %%-------------------------------------------------------------------- @@ -498,6 +523,11 @@ decode_proto_ver1(ProtoVer) -> %% Let it fail (for now), we don't expect invalid data to pass through the linking protocol.. {emqx_utils_conv:int(Major), emqx_utils_conv:int(Minor)}. +encode_field(route, {add, Route = {_Topic, _ID}}) -> + Route; +encode_field(route, {delete, {Topic, ID}}) -> + {?ROUTE_DELETE, Topic, ID}. + %%-------------------------------------------------------------------- %% emqx_external_broker %%-------------------------------------------------------------------- diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl new file mode 100644 index 000000000..8c0e609dc --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl @@ -0,0 +1,83 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- +-module(emqx_cluster_link_router_bootstrap). + +-include_lib("emqx/include/emqx_router.hrl"). + +-export([ + init/2, + next_batch/1 +]). + +-define(MAX_BATCH_SIZE, 4000). + +-record(bootstrap, { + target :: _ClusterName :: binary(), + wildcards :: [emqx_types:topic()], + topics :: [emqx_types:topic()], + stash :: [{emqx_types:topic(), _RouteID}], + max_batch_size :: non_neg_integer() +}). 
+ +%% + +init(TargetCluster, Options) -> + LinkFilters = emqx_cluster_link_config:topic_filters(TargetCluster), + {Wildcards, Topics} = lists:partition(fun emqx_topic:wildcard/1, LinkFilters), + #bootstrap{ + target = TargetCluster, + wildcards = Wildcards, + topics = Topics, + stash = [], + max_batch_size = maps:get(max_batch_size, Options, ?MAX_BATCH_SIZE) + }. + +next_batch(B = #bootstrap{stash = S0 = [_ | _], max_batch_size = MBS}) -> + {Batch, Stash} = mk_batch(S0, MBS), + {Batch, B#bootstrap{stash = Stash}}; +next_batch(B = #bootstrap{topics = Topics = [_ | _], stash = []}) -> + Routes = select_routes_by_topics(Topics), + next_batch(B#bootstrap{topics = [], stash = Routes}); +next_batch(B0 = #bootstrap{wildcards = Wildcards = [_ | _], stash = []}) -> + Routes = select_routes_by_wildcards(Wildcards), + next_batch(B0#bootstrap{wildcards = [], stash = Routes}); +next_batch(#bootstrap{topics = [], wildcards = [], stash = []}) -> + done. + +mk_batch(Stash, MaxBatchSize) when length(Stash) =< MaxBatchSize -> + {Stash, []}; +mk_batch(Stash, MaxBatchSize) -> + {Batch, Rest} = lists:split(MaxBatchSize, Stash), + {Batch, Rest}. + +%% + +select_routes_by_topics(Topics) -> + [encode_route(Topic, Topic) || Topic <- Topics, emqx_broker:subscribers(Topic) =/= []]. + +select_routes_by_wildcards(Wildcards) -> + emqx_utils_ets:keyfoldl( + fun(Topic, Acc) -> intersecting_route(Topic, Wildcards) ++ Acc end, + [], + ?SUBSCRIBER + ). + +intersecting_route(Topic, Wildcards) -> + %% TODO: probably nice to validate cluster link topic filters + %% to have no intersections between each other? + case topic_intersect_any(Topic, Wildcards) of + false -> []; + Intersection -> [encode_route(Intersection, Topic)] + end. + +topic_intersect_any(Topic, [LinkFilter | T]) -> + case emqx_topic:intersection(Topic, LinkFilter) of + false -> topic_intersect_any(Topic, T); + TopicOrFilter -> TopicOrFilter + end; +topic_intersect_any(_Topic, []) -> + false. 
+ +encode_route(Topic, RouteID) -> + emqx_cluster_link_mqtt:encode_field(route, {add, {Topic, RouteID}}). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl new file mode 100644 index 000000000..48dda2e2d --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -0,0 +1,321 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- +-module(emqx_cluster_link_router_syncer). + +-include_lib("emqtt/include/emqtt.hrl"). + +%% API +-export([start_link/1]). +-export([push/4]). + +-export([ + start_link_actor/1, + start_link_syncer/1 +]). + +%% Internal API / Syncer +-export([ + process_syncer_batch/4 +]). + +-behaviour(supervisor). +-export([init/1]). + +-behaviour(gen_server). +-export([ + handle_continue/2, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2 +]). + +-define(NAME(Cluster), {n, l, {?MODULE, Cluster}}). +-define(REF(Cluster), {via, gproc, ?NAME(Cluster)}). + +-define(NAME(Cluster, What), {n, l, {?MODULE, Cluster, What}}). +-define(CLIENT_NAME(Cluster), ?NAME(Cluster, client)). +-define(SYNCER_NAME(Cluster), ?NAME(Cluster, syncer)). +-define(SYNCER_REF(Cluster), {via, gproc, ?SYNCER_NAME(Cluster)}). +-define(ACTOR_REF(Cluster), {via, gproc, ?NAME(Cluster, actor)}). + +-define(MAX_BATCH_SIZE, 4000). +-define(MIN_SYNC_INTERVAL, 10). +-define(ERROR_DELAY, 200). + +-define(RECONNECT_TIMEOUT, 5_000). + +%% + +push(TargetCluster, OpName, Topic, ID) -> + case gproc:where(?SYNCER_NAME(TargetCluster)) of + SyncerPid when is_pid(SyncerPid) -> + emqx_router_syncer:push(SyncerPid, OpName, Topic, ID, #{}); + undefined -> + dropped + end. + +%% Supervisor: +%% 1. Actor + MQTT Client +%% 2. 
Syncer + +start_link(TargetCluster) -> + supervisor:start_link(?REF(TargetCluster), ?MODULE, {sup, TargetCluster}). + +%% Actor + +start_link_actor(TargetCluster) -> + Actor = get_actor_id(), + Incarnation = ensure_actor_incarnation(), + gen_server:start_link( + ?ACTOR_REF(TargetCluster), + ?MODULE, + {actor, mk_state(TargetCluster, Actor, Incarnation)}, + [] + ). + +get_actor_id() -> + atom_to_binary(node()). + +get_actor_incarnation() -> + persistent_term:get({?MODULE, incarnation}). + +set_actor_incarnation(Incarnation) -> + ok = persistent_term:put({?MODULE, incarnation}, Incarnation), + Incarnation. + +ensure_actor_incarnation() -> + try + get_actor_incarnation() + catch + error:badarg -> + %% TODO: Subject to clock skew, need something more robust. + Incarnation = erlang:system_time(millisecond), + set_actor_incarnation(Incarnation) + end. + +%% MQTT Client + +start_link_client(TargetCluster) -> + Options = emqx_cluster_link_config:emqtt_options(TargetCluster), + emqtt:start_link(refine_client_options(Options)). + +refine_client_options(Options = #{clientid := ClientID}) -> + %% TODO: Reconnect should help, but it looks broken right now. + Options#{ + clientid => emqx_utils:format("~s:~s:routesync", [ClientID, node()]), + clean_start => false, + properties => #{'Session-Expiry-Interval' => 60}, + retry_interval => 0 + }. + +client_session_present(ClientPid) -> + Info = emqtt:info(ClientPid), + proplists:get_value(session_present, Info, false). + +announce_client(TargetCluster, Pid) -> + true = gproc:reg_other(?CLIENT_NAME(TargetCluster), Pid), + ok. + +publish_routes(ClientPid, Actor, Incarnation, Updates) -> + try emqx_cluster_link_mqtt:publish_route_sync(ClientPid, Actor, Incarnation, Updates) of + {ok, #{reason_code := RC}} when RC < ?RC_UNSPECIFIED_ERROR -> + #{}; + {ok, #{reason_code_name := RCN}} -> + {error, {mqtt, RCN}}; + {error, Reason} -> + {error, Reason} + catch + exit:Reason -> + {error, {client, ClientPid, Reason}} + end. 
+ +%% Route syncer + +start_syncer(TargetCluster) -> + case supervisor:start_child(?REF(TargetCluster), child_spec(syncer, TargetCluster)) of + {ok, _} -> + ok; + {error, {already_started, _}} -> + ok + end. + +start_link_syncer(TargetCluster) -> + Actor = get_actor_id(), + Incarnation = get_actor_incarnation(), + ClientName = ?CLIENT_NAME(TargetCluster), + emqx_router_syncer:start_link(?SYNCER_REF(TargetCluster), #{ + max_batch_size => ?MAX_BATCH_SIZE, + min_sync_interval => ?MIN_SYNC_INTERVAL, + error_delay => ?ERROR_DELAY, + initial_state => closed, + batch_handler => {?MODULE, process_syncer_batch, [ClientName, Actor, Incarnation]} + %% TODO: enable_replies => false + }). + +close_syncer(TargetCluster) -> + emqx_router_syncer:close(?SYNCER_REF(TargetCluster)). + +open_syncer(TargetCluster) -> + emqx_router_syncer:open(?SYNCER_REF(TargetCluster)). + +process_syncer_batch(Batch, ClientName, Actor, Incarnation) -> + Updates = maps:fold( + fun(Route, Op, Acc) -> + OpName = batch_get_opname(Op), + Entry = emqx_cluster_link_mqtt:encode_field(route, {OpName, Route}), + [Entry | Acc] + end, + [], + Batch + ), + publish_routes(gproc:where(ClientName), Actor, Incarnation, Updates). + +batch_get_opname(Op) -> + element(1, Op). + +%% + +init({sup, TargetCluster}) -> + %% FIXME: Intensity. + SupFlags = #{ + strategy => all_for_one, + intensity => 10, + period => 60 + }, + Children = [ + child_spec(actor, TargetCluster) + ], + {ok, {SupFlags, Children}}; +init({actor, State}) -> + init_actor(State). + +child_spec(actor, TargetCluster) -> + %% Actor process. + %% Wraps MQTT Client process. + %% ClientID: `mycluster:emqx1@emqx.local:routesync` + %% Occasional TCP/MQTT-level disconnects are expected, and should be handled + %% gracefully. + #{ + id => actor, + start => {?MODULE, start_link_actor, [TargetCluster]}, + restart => permanent, + type => worker + }; +child_spec(syncer, TargetCluster) -> + %% Route syncer process. + %% Initially starts in a "closed" state. 
Actor decides when to open it, i.e. + %% when bootstrapping is done. Syncer crash means re-bootstrap is needed, so + %% we just restart the actor in this case. + #{ + id => syncer, + start => {?MODULE, start_link_syncer, [TargetCluster]}, + restart => permanent, + type => worker + }. + +%% + +-record(st, { + target :: binary(), + actor :: binary(), + incarnation :: non_neg_integer(), + client :: {pid(), reference()}, + bootstrapped :: boolean(), + reconnect_timer :: reference() +}). + +mk_state(TargetCluster, Actor, Incarnation) -> + #st{ + target = TargetCluster, + actor = Actor, + incarnation = Incarnation, + bootstrapped = false + }. + +init_actor(State = #st{}) -> + _ = erlang:process_flag(trap_exit, true), + {ok, State, {continue, connect}}. + +handle_continue(connect, State) -> + process_connect(State). + +handle_call(_Request, _From, State) -> + {reply, ignored, State}. + +handle_cast(_Request, State) -> + {noreply, State}. + +handle_info({'EXIT', ClientPid, Reason}, St = #st{client = ClientPid}) -> + handle_client_down(Reason, St); +handle_info({timeout, TRef, _Reconnect}, St = #st{reconnect_timer = TRef}) -> + process_connect(St#st{reconnect_timer = undefined}); +handle_info(_Info, St) -> + %% TODO: log? + {noreply, St}. + +terminate(_Reason, _State) -> + ok. + +process_connect(St = #st{actor = TargetCluster}) -> + case start_link_client(TargetCluster) of + {ok, ClientPid} -> + ok = start_syncer(TargetCluster), + ok = announce_client(TargetCluster, ClientPid), + process_bootstrap(St#st{client = ClientPid}); + {error, Reason} -> + handle_connect_error(Reason, St) + end. + +handle_connect_error(Reason, St) -> + %% TODO: logs + TRef = erlang:start_timer(?RECONNECT_TIMEOUT, self(), reconnect), + St#st{reconnect_timer = TRef}. + +handle_client_down(Reason, St = #st{target = TargetCluster}) -> + %% TODO: logs + ok = close_syncer(TargetCluster), + process_connect(St#st{client = undefined}). 
+ +process_bootstrap(St = #st{bootstrapped = false}) -> + run_bootstrap(St); +process_bootstrap(St = #st{client = ClientPid, bootstrapped = true}) -> + case client_session_present(ClientPid) of + true -> + process_bootstrapped(St); + false -> + run_bootstrap(St) + end. + +%% Bootstrapping. +%% Responsible for transferring local routing table snapshot to the target +%% cluster. Does so either during the initial startup or when MQTT connection +%% is re-established with a clean session. Once bootstrapping is done, it +%% opens the syncer. + +run_bootstrap(St = #st{target = TargetCluster}) -> + Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, #{}), + run_bootstrap(Bootstrap, St). + +run_bootstrap(Bootstrap, St) -> + case emqx_cluster_link_router_bootstrap:next_batch(Bootstrap) of + done -> + process_bootstrapped(St); + {Batch, NBootstrap} -> + %% TODO: Better error handling. + case process_bootstrap_batch(Batch, St) of + #{} -> + run_bootstrap(NBootstrap, St); + {error, {client, _, _}} -> + %% Client has exited, let `reconnect` codepath handle it. + St + end + end. + +process_bootstrapped(St = #st{target = TargetCluster}) -> + ok = open_syncer(TargetCluster), + St#st{bootstrapped = true}. + +process_bootstrap_batch(Batch, #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> + publish_routes(ClientPid, Actor, Incarnation, Batch). diff --git a/apps/emqx_utils/src/emqx_utils.erl b/apps/emqx_utils/src/emqx_utils.erl index 644ed7ae8..8f41a4919 100644 --- a/apps/emqx_utils/src/emqx_utils.erl +++ b/apps/emqx_utils/src/emqx_utils.erl @@ -65,6 +65,7 @@ flattermap/2, tcp_keepalive_opts/4, format/1, + format/2, format_mfal/2, call_first_defined/1, ntoa/1, @@ -566,6 +567,9 @@ tcp_keepalive_opts(OS, _Idle, _Interval, _Probes) -> format(Term) -> iolist_to_binary(io_lib:format("~0p", [Term])). +format(Fmt, Args) -> + iolist_to_binary(io_lib:format(Fmt, Args)). + %% @doc Helper function for log formatters. 
-spec format_mfal(map(), map()) -> undefined | binary(). format_mfal(Data, #{with_mfa := true}) -> From 7df91d852c7f8c790a99e34579c92b8715295d06 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 15 May 2024 19:18:57 +0300 Subject: [PATCH 08/46] feat(clusterlink): integrate node local syncer/actor implementation - support multiple cluster links in extrouter - apply extrouter ops on 'message.publish' hook - fix several minor bugs --- .../src/emqx_cluster_link.erl | 36 +++++--- .../src/emqx_cluster_link_app.erl | 5 +- .../src/emqx_cluster_link_extrouter.erl | 92 ++++++++++++------- .../src/emqx_cluster_link_mqtt.erl | 51 ++++++++-- .../src/emqx_cluster_link_router_syncer.erl | 32 +++++-- .../src/emqx_cluster_link_sup.erl | 10 +- 6 files changed, 154 insertions(+), 72 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index f0b0c95ba..ae3647d4a 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -62,15 +62,10 @@ should_route_to_external_dests(_Msg) -> on_message_publish(#message{topic = <>, payload = Payload}) -> _ = case emqx_cluster_link_mqtt:decode_route_op(Payload) of - {add, Topics} when is_list(Topics) -> - add_routes(Topics, ClusterName); - {add, Topic} -> - emqx_router_syncer:push(add, Topic, ?DEST(ClusterName), #{}); - {delete, _} -> - %% Not implemented yet - ok; - cleanup_routes -> - cleanup_routes(ClusterName) + {actor_init, #{actor := Actor, incarnation := Incr}} -> + actor_init(ClusterName, Actor, Incr); + {route_updates, #{actor := Actor, incarnation := Incr}, RouteOps} -> + update_routes(ClusterName, Actor, Incr, RouteOps) end, {stop, []}; on_message_publish(#message{topic = <>, payload = Payload}) -> @@ -110,6 +105,19 @@ delete_hook() -> %% Internal functions %%-------------------------------------------------------------------- +actor_init(ClusterName, Actor, Incarnation) -> + Env = #{timestamp => 
erlang:system_time(millisecond)}, + {ok, _} = emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incarnation, Env). + +update_routes(ClusterName, Actor, Incarnation, RouteOps) -> + ActorState = emqx_cluster_link_extrouter:actor_state(ClusterName, Actor, Incarnation), + lists:foreach( + fun(RouteOp) -> + emqx_cluster_link_extrouter:actor_apply_operation(RouteOp, ActorState) + end, + RouteOps + ). + cleanup_routes(ClusterName) -> emqx_router:cleanup_routes(?DEST(ClusterName)). @@ -142,11 +150,11 @@ on_init_ack(Res, ClusterName, Msg) -> #{'Correlation-Data' := ReqId} = emqx_message:get_header(properties, Msg), emqx_cluster_link_coordinator:on_link_ack(ClusterName, ReqId, Res). -add_routes(Topics, ClusterName) -> - lists:foreach( - fun(T) -> emqx_router_syncer:push(add, T, ?DEST(ClusterName), #{}) end, - Topics - ). +%% add_routes(Topics, ClusterName) -> +%% lists:foreach( +%% fun(T) -> emqx_router_syncer:push(add, T, ?DEST(ClusterName), #{}) end, +%% Topics +%% ). %% let it crash if extra is not a map, %% we don't expect the message to be forwarded from an older EMQX release, diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl index 68dc07f48..f05c5c1a0 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl @@ -11,6 +11,7 @@ -define(BROKER_MOD, emqx_cluster_link). start(_StartType, _StartArgs) -> + ok = mria:wait_for_tables(emqx_cluster_link_extrouter:create_tables()), emqx_cluster_link_config:add_handler(), LinksConf = enabled_links(), _ = @@ -31,7 +32,7 @@ prep_stop(State) -> stop(_State) -> _ = emqx_cluster_link:delete_hook(), _ = emqx_cluster_link:unregister_external_broker(), - _ = stop_msg_fwd_resources(emqx:get_config([cluster, links], [])), + _ = stop_msg_fwd_resources(emqx_cluster_link_config:links()), ok. 
%%-------------------------------------------------------------------- @@ -41,7 +42,7 @@ stop(_State) -> enabled_links() -> lists:filter( fun(#{enable := IsEnabled}) -> IsEnabled =:= true end, - emqx:get_config([cluster, links], []) + emqx_cluster_link_config:links() ). start_msg_fwd_resources(LinksConf) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index 76999f4cf..a09d4d8de 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -15,14 +15,16 @@ %% Actor API -export([ - actor_init/3, + actor_init/4, + actor_state/3, + actor_apply_operation/2, actor_apply_operation/3, actor_gc/1 ]). %% Internal API -export([ - mnesia_actor_init/3, + mnesia_actor_init/4, mnesia_actor_heartbeat/3, mnesia_clean_incarnation/1, apply_actor_operation/5 @@ -31,6 +33,9 @@ %% Strictly monotonically increasing integer. -type smint() :: integer(). +%% Remote cluster name +-type cluster() :: binary(). + %% Actor. %% Identifies an independent route replication actor on the remote broker. %% Usually something like `node()` or `{node(), _Shard}`. @@ -45,17 +50,18 @@ %% Operation. %% RouteID should come in handy when two or more different routes on the actor side %% are "intersected" to the same topic filter that needs to be replicated here. --type op() :: {add | del, _TopicFilter :: binary(), _RouteID} | heartbeat. +-type op() :: {add | delete, {_TopicFilter :: binary(), _RouteID}} | heartbeat. %% Basically a bit offset. %% Each actor + incarnation pair occupies a separate lane in the multi-counter. 
%% Example: %% Actors | n1@ds n2@ds n3@ds %% Lanes | 0 1 2 -%% Op1 | n3@ds add client/42/# → MCounter += 1 bsl 2 = 4 -%% Op2 | n2@ds add client/42/# → MCounter += 1 bsl 1 = 6 -%% Op3 | n3@ds del client/42/# → MCounter -= 1 bsl 2 = 2 -%% Op4 | n2@ds del client/42/# → MCounter -= 1 bsl 1 = 0 → route deleted +%% --------------------------- +%% Op1 | n3@ds add client/42/# → MCounter += 1 bsl 2 = 4 +%% Op2 | n2@ds add client/42/# → MCounter += 1 bsl 1 = 6 +%% Op3 | n3@ds delete client/42/# → MCounter -= 1 bsl 2 = 2 +%% Op4 | n2@ds delete client/42/# → MCounter -= 1 bsl 1 = 0 → route deleted -type lane() :: non_neg_integer(). -define(DEFAULT_ACTOR_TTL_MS, 30_000). @@ -64,13 +70,16 @@ -define(EXTROUTE_TAB, emqx_external_router_route). -define(EXTROUTE_ACTOR_TAB, emqx_external_router_actor). +-define(ACTOR_ID(Cluster, Actor), {Cluster, Actor}). +-define(ROUTE_ID(Cluster, RouteID), {Cluster, RouteID}). + -record(extroute, { entry :: emqx_topic_index:key(_RouteID), mcounter = 0 :: non_neg_integer() }). -record(actor, { - id :: actor(), + id :: {cluster(), actor()}, incarnation :: incarnation(), lane :: lane(), until :: _Timestamp @@ -102,7 +111,7 @@ create_tables() -> ]} ]} ]), - [?EXTROUTE_TAB]. + [?EXTROUTE_ACTOR_TAB, ?EXTROUTE_TAB]. %% @@ -124,6 +133,7 @@ match_to_route(M) -> %% -record(state, { + cluster :: cluster(), actor :: actor(), incarnation :: incarnation(), lane :: lane() | undefined @@ -133,19 +143,19 @@ match_to_route(M) -> -type env() :: #{timestamp := _Milliseconds}. --spec actor_init(actor(), incarnation(), env()) -> {ok, state()}. -actor_init(Actor, Incarnation, Env = #{timestamp := Now}) -> +-spec actor_init(cluster(), actor(), incarnation(), env()) -> {ok, state()}. +actor_init(Cluster, Actor, Incarnation, Env = #{timestamp := Now}) -> %% TODO: Rolling upgrade safety? 
- case transaction(fun ?MODULE:mnesia_actor_init/3, [Actor, Incarnation, Now]) of + case transaction(fun ?MODULE:mnesia_actor_init/4, [Cluster, Actor, Incarnation, Now]) of {ok, State} -> {ok, State}; {reincarnate, Rec} -> %% TODO: Do this asynchronously. ok = clean_incarnation(Rec), - actor_init(Actor, Incarnation, Env) + actor_init(Cluster, Actor, Incarnation, Env) end. -mnesia_actor_init(Actor, Incarnation, TS) -> +mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> %% NOTE %% We perform this heavy-weight transaction only in the case of a new route %% replication connection. The implicit assumption is that each replication @@ -154,15 +164,15 @@ mnesia_actor_init(Actor, Incarnation, TS) -> %% ClientID. There's always a chance of having stray process severely lagging %% that applies some update out of the blue, but it seems impossible to prevent %% it completely w/o transactions. - State = #state{actor = Actor, incarnation = Incarnation}, + State = #state{cluster = Cluster, actor = Actor, incarnation = Incarnation}, case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of [#actor{incarnation = Incarnation, lane = Lane} = Rec] -> ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write), {ok, State#state{lane = Lane}}; [] -> - Lane = mnesia_assign_lane(), + Lane = mnesia_assign_lane(Cluster), Rec = #actor{ - id = Actor, + id = ?ACTOR_ID(Cluster, Actor), incarnation = Incarnation, lane = Lane, until = bump_actor_ttl(TS) @@ -175,21 +185,32 @@ mnesia_actor_init(Actor, Incarnation, TS) -> mnesia:abort({outdated_incarnation_actor, Actor, Incarnation, Newer}) end. +-spec actor_state(cluster(), actor(), incarnation()) -> state(). +actor_state(Cluster, Actor, Incarnation) -> + ActorID = ?ACTOR_ID(Cluster, Actor), + [#actor{lane = Lane}] = mnesia:dirty_read(?EXTROUTE_ACTOR_TAB, ActorID), + #state{cluster = Cluster, actor = Actor, incarnation = Incarnation, lane = Lane}. + +-spec actor_apply_operation(op(), state()) -> state(). 
+actor_apply_operation(Op, State) -> + actor_apply_operation(Op, State, #{}). + -spec actor_apply_operation(op(), state(), env()) -> state(). actor_apply_operation( - {OpName, TopicFilter, ID}, - State = #state{actor = Actor, incarnation = Incarnation, lane = Lane}, + {OpName, {TopicFilter, ID}}, + State = #state{cluster = Cluster, actor = Actor, incarnation = Incarnation, lane = Lane}, _Env ) -> - Entry = emqx_topic_index:make_key(TopicFilter, ID), + ActorID = ?ACTOR_ID(Cluster, Actor), + Entry = emqx_topic_index:make_key(TopicFilter, ?ROUTE_ID(Cluster, ID)), case mria_config:whoami() of Role when Role /= replicant -> - apply_actor_operation(Actor, Incarnation, Entry, OpName, Lane); + apply_actor_operation(ActorID, Incarnation, Entry, OpName, Lane); replicant -> mria:async_dirty( ?EXTROUTE_SHARD, fun ?MODULE:apply_actor_operation/5, - [Actor, Incarnation, Entry, OpName, Lane] + [ActorID, Incarnation, Entry, OpName, Lane] ) end, State; @@ -201,8 +222,8 @@ actor_apply_operation( ok = transaction(fun ?MODULE:mnesia_actor_heartbeat/3, [Actor, Incarnation, Now]), State. -apply_actor_operation(Actor, Incarnation, Entry, OpName, Lane) -> - _ = assert_current_incarnation(Actor, Incarnation), +apply_actor_operation(ActorID, Incarnation, Entry, OpName, Lane) -> + _ = assert_current_incarnation(ActorID, Incarnation), apply_operation(Entry, OpName, Lane). apply_operation(Entry, OpName, Lane) -> @@ -232,7 +253,7 @@ apply_operation(Entry, MCounter, OpName, Lane) -> Marker when OpName =:= add -> %% Already added. MCounter; - Marker when OpName =:= del -> + Marker when OpName =:= delete -> case mria:dirty_update_counter(?EXTROUTE_TAB, Entry, -Marker) of 0 -> Record = #extroute{entry = Entry, mcounter = 0}, @@ -241,7 +262,7 @@ apply_operation(Entry, MCounter, OpName, Lane) -> C -> C end; - 0 when OpName =:= del -> + 0 when OpName =:= delete -> %% Already deleted. MCounter end. @@ -257,18 +278,19 @@ actor_gc(#{timestamp := Now}) -> ok end. 
-mnesia_assign_lane() -> - Assignment = mnesia:foldl( - fun(#actor{lane = Lane}, Acc) -> - Acc bor (1 bsl Lane) - end, +mnesia_assign_lane(Cluster) -> + Assignment = lists:foldl( + fun(Lane, Acc) -> Acc bor (1 bsl Lane) end, 0, - ?EXTROUTE_ACTOR_TAB, - write + select_cluster_lanes(Cluster) ), Lane = first_zero_bit(Assignment), Lane. +select_cluster_lanes(Cluster) -> + MS = [{#actor{id = {Cluster, '_'}, lane = '$1', _ = '_'}, [], ['$1']}], + mnesia:select(?EXTROUTE_ACTOR_TAB, MS, write). + mnesia_actor_heartbeat(Actor, Incarnation, TS) -> case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of [#actor{incarnation = Incarnation} = Rec] -> @@ -300,13 +322,13 @@ clean_lane(Lane) -> ?EXTROUTE_TAB ). -assert_current_incarnation(Actor, Incarnation) -> +assert_current_incarnation(ActorID, Incarnation) -> %% NOTE %% Ugly, but should not really happen anyway. This is a safety net for the case %% when this process tries to apply some outdated operation for whatever reason %% (e.g. heavy CPU starvation). Still, w/o transactions, it's just a best-effort %% attempt. - [#actor{incarnation = Incarnation}] = mnesia:dirty_read(?EXTROUTE_ACTOR_TAB, Actor), + [#actor{incarnation = Incarnation}] = mnesia:dirty_read(?EXTROUTE_ACTOR_TAB, ActorID), ok. %% diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index b111be954..3b16642ac 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -45,6 +45,7 @@ ]). -export([ + publish_actor_init_sync/3, publish_route_sync/4, encode_field/2 ]). @@ -58,7 +59,7 @@ -define(CLIENTID(Base, Suffix), emqx_bridge_mqtt_lib:clientid_base([Base, Suffix])). -define(MQTT_HOST_OPTS, #{default_port => 1883}). --define(MY_CLUSTER_NAME, atom_to_binary(emqx_config:get([cluster, name]))). +-define(MY_CLUSTER_NAME, emqx_cluster_link_config:cluster()). -define(ROUTE_TOPIC, <>). -define(MSG_FWD_TOPIC, <>). 
@@ -90,6 +91,7 @@ -define(F_OPERATION, '$op'). -define(OP_ROUTE, <<"route">>). +-define(OP_ACTOR_INIT, <<"actor_init">>). -define(F_ACTOR, 10). -define(F_INCARNATION, 11). @@ -403,6 +405,18 @@ publish_result(Caller, Ref, Result) -> Caller ! {pub_result, Ref, Err} end. +%%% New leader-less Syncer/Actor implementation + +publish_actor_init_sync(ClientPid, Actor, Incarnation) -> + %% TODO: handshake (request / response) to make sure the link is established + PubTopic = ?ROUTE_TOPIC, + Payload = #{ + ?F_OPERATION => ?OP_ACTOR_INIT, + ?F_ACTOR => Actor, + ?F_INCARNATION => Incarnation + }, + emqtt:publish(ClientPid, PubTopic, ?ENCODE(Payload), ?QOS_1). + publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> PubTopic = ?ROUTE_TOPIC, Payload = #{ @@ -473,14 +487,28 @@ decode_ctrl_msg1(#{<<"op">> := ?UNLINK_OP}, _ClusterName) -> decode_route_op(Payload) -> decode_route_op1(?DECODE(Payload)). -decode_route_op1(<<"add_", Topic/binary>>) -> - {add, Topic}; -decode_route_op1(<<"delete_", Topic/binary>>) -> - {delete, Topic}; -decode_route_op1(#{<<"op">> := ?BATCH_ROUTES_OP, <<"topics">> := Topics}) when is_list(Topics) -> - {add, Topics}; -decode_route_op1(#{<<"op">> := ?CLEANUP_ROUTES_OP}) -> - cleanup_routes; +decode_route_op1(#{ + ?F_OPERATION := ?OP_ACTOR_INIT, + ?F_ACTOR := Actor, + ?F_INCARNATION := Incr +}) -> + {actor_init, #{actor => Actor, incarnation => Incr}}; +decode_route_op1(#{ + ?F_OPERATION := ?OP_ROUTE, + ?F_ACTOR := Actor, + ?F_INCARNATION := Incr, + ?F_ROUTES := RouteOps +}) -> + RouteOps1 = lists:map(fun(Op) -> decode_field(route, Op) end, RouteOps), + {route_updates, #{actor => Actor, incarnation => Incr}, RouteOps1}; +%%decode_route_op1(<<"add_", Topic/binary>>) -> +%% {add, Topic}; +%%decode_route_op1(<<"delete_", Topic/binary>>) -> +%% {delete, Topic}; +%%decode_route_op1(#{<<"op">> := ?BATCH_ROUTES_OP, <<"topics">> := Topics}) when is_list(Topics) -> +%% {add, Topics}; +%%decode_route_op1(#{<<"op">> := ?CLEANUP_ROUTES_OP}) -> +%% 
cleanup_routes; decode_route_op1(Payload) -> ?SLOG(warning, #{ msg => "unexpected_cluster_link_route_op_payload", @@ -528,6 +556,11 @@ encode_field(route, {add, Route = {_Topic, _ID}}) -> encode_field(route, {delete, {Topic, ID}}) -> {?ROUTE_DELETE, Topic, ID}. +decode_field(route, {?ROUTE_DELETE, Route = {_Topic, _ID}}) -> + {delete, Route}; +decode_field(route, Route = {_Topic, _ID}) -> + {add, Route}. + %%-------------------------------------------------------------------- %% emqx_external_broker %%-------------------------------------------------------------------- diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 48dda2e2d..d432cd7d8 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -19,7 +19,8 @@ process_syncer_batch/4 ]). --behaviour(supervisor). +%% silence warning +%% -behaviour(supervisor). -export([init/1]). -behaviour(gen_server). @@ -99,7 +100,17 @@ ensure_actor_incarnation() -> start_link_client(TargetCluster) -> Options = emqx_cluster_link_config:emqtt_options(TargetCluster), - emqtt:start_link(refine_client_options(Options)). + case emqtt:start_link(refine_client_options(Options)) of + {ok, Pid} -> + case emqtt:connect(Pid) of + {ok, _Props} -> + {ok, Pid}; + Error -> + Error + end; + Error -> + Error + end. refine_client_options(Options = #{clientid := ClientID}) -> %% TODO: Reconnect should help, but it looks broken right now. @@ -180,7 +191,7 @@ batch_get_opname(Op) -> init({sup, TargetCluster}) -> %% FIXME: Intensity. SupFlags = #{ - strategy => all_for_one, + strategy => one_for_all, intensity => 10, period => 60 }, @@ -239,7 +250,7 @@ init_actor(State = #st{}) -> {ok, State, {continue, connect}}. handle_continue(connect, State) -> - process_connect(State). + {noreply, process_connect(State)}. 
handle_call(_Request, _From, State) -> {reply, ignored, State}. @@ -248,9 +259,9 @@ handle_cast(_Request, State) -> {noreply, State}. handle_info({'EXIT', ClientPid, Reason}, St = #st{client = ClientPid}) -> - handle_client_down(Reason, St); + {noreply, handle_client_down(Reason, St)}; handle_info({timeout, TRef, _Reconnect}, St = #st{reconnect_timer = TRef}) -> - process_connect(St#st{reconnect_timer = undefined}); + {noreply, process_connect(St#st{reconnect_timer = undefined})}; handle_info(_Info, St) -> %% TODO: log? {noreply, St}. @@ -258,22 +269,25 @@ handle_info(_Info, St) -> terminate(_Reason, _State) -> ok. -process_connect(St = #st{actor = TargetCluster}) -> +process_connect(St = #st{target = TargetCluster, actor = Actor, incarnation = Incr}) -> case start_link_client(TargetCluster) of {ok, ClientPid} -> ok = start_syncer(TargetCluster), ok = announce_client(TargetCluster, ClientPid), + %% TODO: error handling, handshake + + {ok, _} = emqx_cluster_link_mqtt:publish_actor_init_sync(ClientPid, Actor, Incr), process_bootstrap(St#st{client = ClientPid}); {error, Reason} -> handle_connect_error(Reason, St) end. -handle_connect_error(Reason, St) -> +handle_connect_error(_Reason, St) -> %% TODO: logs TRef = erlang:start_timer(?RECONNECT_TIMEOUT, self(), reconnect), St#st{reconnect_timer = TRef}. -handle_client_down(Reason, St = #st{target = TargetCluster}) -> +handle_client_down(_Reason, St = #st{target = TargetCluster}) -> %% TODO: logs ok = close_syncer(TargetCluster), process_connect(St#st{client = undefined}). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl index c98b9f4c5..beb641a92 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl @@ -22,13 +22,17 @@ init(LinksConf) -> intensity => 10, period => 5 }, - Children = [sup_spec(?COORD_SUP, ?COORD_SUP, LinksConf)], + %% Children = [sup_spec(?COORD_SUP, ?COORD_SUP, LinksConf)], + Children = [ + sup_spec(Name, emqx_cluster_link_router_syncer, [Name]) + || #{upstream := Name} <- LinksConf + ], {ok, {SupFlags, Children}}. -sup_spec(Id, Mod, Conf) -> +sup_spec(Id, Mod, Args) -> #{ id => Id, - start => {Mod, start_link, [Conf]}, + start => {Mod, start_link, Args}, restart => permanent, shutdown => infinity, type => supervisor, From f036b641eb07121d3f369e1d2ea0d452c51491db Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 16 May 2024 20:19:55 +0300 Subject: [PATCH 09/46] feat(clusterlink): integrate node local routes replication and message forwarding --- apps/emqx/include/emqx.hrl | 2 +- apps/emqx/src/emqx_broker.erl | 52 ++++++++++--------- apps/emqx/src/emqx_external_broker.erl | 12 ++++- apps/emqx/src/emqx_router.erl | 3 +- apps/emqx/src/emqx_types.erl | 2 +- .../src/emqx_cluster_link.erl | 38 +++++++++++--- .../src/emqx_cluster_link_config.erl | 4 ++ .../src/emqx_cluster_link_extrouter.erl | 8 ++- .../src/emqx_cluster_link_mqtt.erl | 6 +-- .../src/emqx_cluster_link_router_syncer.erl | 11 ++-- 10 files changed, 93 insertions(+), 45 deletions(-) diff --git a/apps/emqx/include/emqx.hrl b/apps/emqx/include/emqx.hrl index 73e1cd144..0ad01c0ef 100644 --- a/apps/emqx/include/emqx.hrl +++ b/apps/emqx/include/emqx.hrl @@ -67,7 +67,7 @@ -record(route, { topic :: binary(), - dest :: node() | {binary(), node()} | emqx_session:session_id() + dest :: node() | {binary(), node()} | emqx_session:session_id() | emqx_external_broker:dest() }). 
%%-------------------------------------------------------------------- diff --git a/apps/emqx/src/emqx_broker.erl b/apps/emqx/src/emqx_broker.erl index d42258611..5744f2e74 100644 --- a/apps/emqx/src/emqx_broker.erl +++ b/apps/emqx/src/emqx_broker.erl @@ -256,9 +256,10 @@ do_publish_many([Msg | T]) -> do_publish(#message{topic = Topic} = Msg) -> PersistRes = persist_publish(Msg), - {Routes, ExtRoutes} = aggre(emqx_router:match_routes(Topic)), - Routes1 = maybe_add_ext_routes(ExtRoutes, Routes, Msg), - route(Routes1, delivery(Msg), PersistRes). + Routes = aggre(emqx_router:match_routes(Topic)), + Delivery = delivery(Msg), + RouteRes = route(Routes, Delivery, PersistRes), + ext_route(ext_routes(Topic, Msg), Delivery, RouteRes). persist_publish(Msg) -> case emqx_persistent_message:persist(Msg) of @@ -322,41 +323,44 @@ do_route({To, Node}, Delivery) when Node =:= node() -> {Node, To, dispatch(To, Delivery)}; do_route({To, Node}, Delivery) when is_atom(Node) -> {Node, To, forward(Node, To, Delivery, emqx:get_config([rpc, mode]))}; -do_route({To, {external, _} = ExtDest}, Delivery) -> - {ExtDest, To, emqx_external_broker:forward(ExtDest, Delivery)}; do_route({To, Group}, Delivery) when is_tuple(Group); is_binary(Group) -> {share, To, emqx_shared_sub:dispatch(Group, To, Delivery)}. aggre([]) -> - {[], []}; + []; aggre([#route{topic = To, dest = Node}]) when is_atom(Node) -> - {[{To, Node}], []}; -aggre([#route{topic = To, dest = {external, _} = ExtDest}]) -> - {[], [{To, ExtDest}]}; + [{To, Node}]; aggre([#route{topic = To, dest = {Group, _Node}}]) -> - {[{To, Group}], []}; + [{To, Group}]; aggre(Routes) -> - aggre(Routes, false, {[], []}). + aggre(Routes, false, []). 
-aggre([#route{topic = To, dest = Node} | Rest], Dedup, {Acc, ExtAcc}) when is_atom(Node) -> - aggre(Rest, Dedup, {[{To, Node} | Acc], ExtAcc}); -aggre([#route{topic = To, dest = {external, _} = ExtDest} | Rest], Dedup, {Acc, ExtAcc}) -> - aggre(Rest, Dedup, {Acc, [{To, ExtDest} | ExtAcc]}); -aggre([#route{topic = To, dest = {Group, _Node}} | Rest], _Dedup, {Acc, ExtAcc}) -> - aggre(Rest, true, {[{To, Group} | Acc], ExtAcc}); +aggre([#route{topic = To, dest = Node} | Rest], Dedup, Acc) when is_atom(Node) -> + aggre(Rest, Dedup, [{To, Node} | Acc]); +aggre([#route{topic = To, dest = {Group, _Node}} | Rest], _Dedup, Acc) -> + aggre(Rest, true, [{To, Group} | Acc]); aggre([], false, Acc) -> Acc; -aggre([], true, {Acc, ExtAcc}) -> - {lists:usort(Acc), lists:usort(ExtAcc)}. +aggre([], true, Acc) -> + lists:usort(Acc). -maybe_add_ext_routes([] = _ExtRoutes, Routes, _Msg) -> - Routes; -maybe_add_ext_routes(ExtRoutes, Routes, Msg) -> +ext_routes(Topic, Msg) -> case emqx_external_broker:should_route_to_external_dests(Msg) of - true -> Routes ++ ExtRoutes; - false -> Routes + true -> emqx_external_broker:match_routes(Topic); + false -> [] end. +ext_route([], _Delivery, RouteRes) -> + RouteRes; +ext_route(ExtRoutes, Delivery, RouteRes) -> + lists:foldl( + fun(#route{topic = To, dest = ExtDest}, Acc) -> + [{ExtDest, To, emqx_external_broker:forward(ExtDest, Delivery)} | Acc] + end, + RouteRes, + ExtRoutes + ). + %% @doc Forward message to another node. -spec forward( node(), emqx_types:topic() | emqx_types:share(), emqx_types:delivery(), RpcMode :: sync | async diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index a9af9ddc9..3b3ff83c6 100644 --- a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -24,6 +24,10 @@ -callback maybe_add_route(emqx_types:topic()) -> ok. -callback maybe_delete_route(emqx_types:topic()) -> ok. +-callback match_routes(emqx_types:topic()) -> [emqx_types:route()]. 
+ +-type dest() :: term(). + -export([ provider/0, register_provider/1, @@ -31,9 +35,12 @@ forward/2, should_route_to_external_dests/1, maybe_add_route/1, - maybe_delete_route/1 + maybe_delete_route/1, + match_routes/1 ]). +-export_type([dest/0]). + -include("logger.hrl"). -define(PROVIDER, {?MODULE, external_broker}). @@ -106,6 +113,9 @@ maybe_add_route(Topic) -> maybe_delete_route(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). +match_routes(Topic) -> + ?safe_with_provider(?FUNCTION_NAME(Topic), ok). + %%-------------------------------------------------------------------- %% Internal functions %%-------------------------------------------------------------------- diff --git a/apps/emqx/src/emqx_router.erl b/apps/emqx/src/emqx_router.erl index 55b9ab079..c60bd2383 100644 --- a/apps/emqx/src/emqx_router.erl +++ b/apps/emqx/src/emqx_router.erl @@ -95,8 +95,7 @@ -export_type([schemavsn/0]). -type group() :: binary(). --type external_dest() :: {external, term()}. --type dest() :: node() | {group(), node()} | external_dest(). +-type dest() :: node() | {group(), node()}. -type schemavsn() :: v1 | v2. %% Operation :: {add, ...} | {delete, ...}. diff --git a/apps/emqx/src/emqx_types.erl b/apps/emqx/src/emqx_types.erl index 03a3c8a0f..d8dd1cff4 100644 --- a/apps/emqx/src/emqx_types.erl +++ b/apps/emqx/src/emqx_types.erl @@ -267,7 +267,7 @@ [ {node(), topic(), deliver_result()} | {share, topic(), deliver_result()} - | {emqx_router:external_dest(), topic(), deliver_result()} + | {emqx_external_broker:dest(), topic(), deliver_result()} | persisted ] | disconnect. 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index ae3647d4a..579f58dce 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -11,6 +11,7 @@ unregister_external_broker/0, maybe_add_route/1, maybe_delete_route/1, + match_routes/1, forward/2, should_route_to_external_dests/1 ]). @@ -38,15 +39,16 @@ unregister_external_broker() -> emqx_external_broker:unregister_provider(?MODULE). maybe_add_route(Topic) -> - emqx_cluster_link_coordinator:route_op(<<"add">>, Topic). + maybe_push_route_op(add, Topic). -maybe_delete_route(_Topic) -> - %% Not implemented yet - %% emqx_cluster_link_coordinator:route_op(<<"delete">>, Topic). - ok. +maybe_delete_route(Topic) -> + maybe_push_route_op(delete, Topic). -forward(ExternalDest, Delivery) -> - emqx_cluster_link_mqtt:forward(ExternalDest, Delivery). +forward(DestCluster, Delivery) -> + emqx_cluster_link_mqtt:forward(DestCluster, Delivery). + +match_routes(Topic) -> + emqx_cluster_link_extrouter:match_routes(Topic). %% Do not forward any external messages to other links. %% Only forward locally originated messages to all the relevant links, i.e. no gossip message forwarding. @@ -105,6 +107,28 @@ delete_hook() -> %% Internal functions %%-------------------------------------------------------------------- +maybe_push_route_op(Op, Topic) -> + lists:foreach( + fun(#{upstream := Cluster, topics := LinkFilters}) -> + case topic_intersect_any(Topic, LinkFilters) of + false -> + ok; + TopicIntersection -> + ID = Topic, + emqx_cluster_link_router_syncer:push(Cluster, Op, TopicIntersection, ID) + end + end, + emqx_cluster_link_config:enabled_links() + ). + +topic_intersect_any(Topic, [LinkFilter | T]) -> + case emqx_topic:intersection(Topic, LinkFilter) of + false -> topic_intersect_any(Topic, T); + TopicOrFilter -> TopicOrFilter + end; +topic_intersect_any(_Topic, []) -> + false. 
+ actor_init(ClusterName, Actor, Incarnation) -> Env = #{timestamp => erlang:system_time(millisecond)}, {ok, _} = emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incarnation, Env). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index bdbb702ca..be2769fd5 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -16,6 +16,7 @@ -export([ %% General cluster/0, + enabled_links/0, links/0, link/1, topic_filters/1, @@ -41,6 +42,9 @@ cluster() -> links() -> emqx:get_config(?LINKS_PATH, []). +enabled_links() -> + [L || L = #{enable := true} <- links()]. + link(Name) -> case lists:dropwhile(fun(L) -> Name =/= upstream_name(L) end, links()) of [LinkConf | _] -> LinkConf; diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index a09d4d8de..504e59c74 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -64,6 +64,8 @@ %% Op4 | n2@ds delete client/42/# → MCounter -= 1 bsl 1 = 0 → route deleted -type lane() :: non_neg_integer(). +-include_lib("emqx/include/emqx.hrl"). + -define(DEFAULT_ACTOR_TTL_MS, 30_000). -define(EXTROUTE_SHARD, ?MODULE). @@ -117,7 +119,8 @@ create_tables() -> match_routes(Topic) -> Matches = emqx_topic_index:matches(Topic, ?EXTROUTE_TAB, [unique]), - [match_to_route(M) || M <- Matches]. + %% `unique` opt is not enough, since we keep the original Topic as a part of RouteID + lists:usort([match_to_route(M) || M <- Matches]). lookup_routes(Topic) -> Pat = #extroute{entry = emqx_topic_index:make_key(Topic, '$1'), _ = '_'}, @@ -128,7 +131,8 @@ topics() -> [emqx_topic_index:get_topic(K) || [K] <- ets:match(?EXTROUTE_TAB, Pat)]. match_to_route(M) -> - emqx_topic_index:get_topic(M). 
+ ?ROUTE_ID(Cluster, _) = emqx_topic_index:get_id(M), + #route{topic = emqx_topic_index:get_topic(M), dest = Cluster}. %% diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 3b16642ac..a3e3ce2fb 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -556,8 +556,8 @@ encode_field(route, {add, Route = {_Topic, _ID}}) -> encode_field(route, {delete, {Topic, ID}}) -> {?ROUTE_DELETE, Topic, ID}. -decode_field(route, {?ROUTE_DELETE, Route = {_Topic, _ID}}) -> - {delete, Route}; +decode_field(route, {?ROUTE_DELETE, Topic, ID}) -> + {delete, {Topic, ID}}; decode_field(route, Route = {_Topic, _ID}) -> {add, Route}. @@ -565,7 +565,7 @@ decode_field(route, Route = {_Topic, _ID}) -> %% emqx_external_broker %%-------------------------------------------------------------------- -forward({external, {link, ClusterName}}, #delivery{message = #message{topic = Topic} = Msg}) -> +forward(ClusterName, #delivery{message = #message{topic = Topic} = Msg}) -> QueryOpts = #{pick_key => Topic}, emqx_resource:query(?MSG_RES_ID(ClusterName), Msg, QueryOpts). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index d432cd7d8..d41a41c5f 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -123,7 +123,11 @@ refine_client_options(Options = #{clientid := ClientID}) -> client_session_present(ClientPid) -> Info = emqtt:info(ClientPid), - proplists:get_value(session_present, Info, false). + %% FIXME: waitnig for emqtt release that fixes session_present type (must be a boolean) + case proplists:get_value(session_present, Info, 0) of + 0 -> false; + 1 -> true + end. 
announce_client(TargetCluster, Pid) -> true = gproc:reg_other(?CLIENT_NAME(TargetCluster), Pid), @@ -272,11 +276,10 @@ terminate(_Reason, _State) -> process_connect(St = #st{target = TargetCluster, actor = Actor, incarnation = Incr}) -> case start_link_client(TargetCluster) of {ok, ClientPid} -> + %% TODO: error handling, handshake + {ok, _} = emqx_cluster_link_mqtt:publish_actor_init_sync(ClientPid, Actor, Incr), ok = start_syncer(TargetCluster), ok = announce_client(TargetCluster, ClientPid), - %% TODO: error handling, handshake - - {ok, _} = emqx_cluster_link_mqtt:publish_actor_init_sync(ClientPid, Actor, Incr), process_bootstrap(St#st{client = ClientPid}); {error, Reason} -> handle_connect_error(Reason, St) From e7305c62ee31ab4f5cd59be3554e2e16887bbef4 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Tue, 21 May 2024 20:07:37 +0300 Subject: [PATCH 10/46] feat(clusterlink): replicate shared subscription and persistent session routes --- apps/emqx/src/emqx_external_broker.erl | 22 +++ .../emqx_persistent_session_ds_subs.erl | 2 + apps/emqx/src/emqx_router.erl | 4 +- apps/emqx/src/emqx_shared_sub.erl | 12 +- .../include/emqx_cluster_link.hrl | 4 + apps/emqx_cluster_link/rebar.config | 8 + .../src/emqx_cluster_link.erl | 34 ++++- .../src/emqx_cluster_link_config.erl | 2 +- .../src/emqx_cluster_link_extrouter.erl | 5 +- .../emqx_cluster_link_router_bootstrap.erl | 78 ++++++++-- .../src/emqx_cluster_link_router_syncer.erl | 138 ++++++++++++------ 11 files changed, 246 insertions(+), 63 deletions(-) create mode 100644 apps/emqx_cluster_link/rebar.config diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index 3b3ff83c6..acd4b8c3d 100644 --- a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -24,6 +24,12 @@ -callback maybe_add_route(emqx_types:topic()) -> ok. -callback maybe_delete_route(emqx_types:topic()) -> ok. 
+-callback maybe_add_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. +-callback maybe_delete_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. + +-callback maybe_add_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. +-callback maybe_delete_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. + -callback match_routes(emqx_types:topic()) -> [emqx_types:route()]. -type dest() :: term(). @@ -36,6 +42,10 @@ should_route_to_external_dests/1, maybe_add_route/1, maybe_delete_route/1, + maybe_add_shared_route/2, + maybe_delete_shared_route/2, + maybe_add_persistent_route/2, + maybe_delete_persistent_route/2, match_routes/1 ]). @@ -113,6 +123,18 @@ maybe_add_route(Topic) -> maybe_delete_route(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). +maybe_add_shared_route(Topic, Group) -> + ?safe_with_provider(?FUNCTION_NAME(Topic, Group), ok). + +maybe_delete_shared_route(Topic, Group) -> + ?safe_with_provider(?FUNCTION_NAME(Topic, Group), ok). + +maybe_add_persistent_route(Topic, ID) -> + ?safe_with_provider(?FUNCTION_NAME(Topic, ID), ok). + +maybe_delete_persistent_route(Topic, ID) -> + ?safe_with_provider(?FUNCTION_NAME(Topic, ID), ok). + match_routes(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). 
diff --git a/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl b/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl index e8422674b..fc86b67a6 100644 --- a/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl +++ b/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl @@ -92,6 +92,7 @@ on_subscribe(TopicFilter, SubOpts, #{id := SessionId, s := S0, props := Props}) case emqx_persistent_session_ds_state:n_subscriptions(S0) < MaxSubscriptions of true -> ok = emqx_persistent_session_ds_router:do_add_route(TopicFilter, SessionId), + _ = emqx_external_broker:maybe_add_persistent_route(TopicFilter, SessionId), {SubId, S1} = emqx_persistent_session_ds_state:new_id(S0), {SStateId, S2} = emqx_persistent_session_ds_state:new_id(S1), SState = #{ @@ -154,6 +155,7 @@ on_unsubscribe(SessionId, TopicFilter, S0) -> #{session_id => SessionId, topic_filter => TopicFilter}, ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilter, SessionId) ), + _ = emqx_external_broker:maybe_delete_persistent_route(TopicFilter, SessionId), {ok, emqx_persistent_session_ds_state:del_subscription(TopicFilter, S0), Subscription} end. diff --git a/apps/emqx/src/emqx_router.erl b/apps/emqx/src/emqx_router.erl index c60bd2383..0bcaf3c0e 100644 --- a/apps/emqx/src/emqx_router.erl +++ b/apps/emqx/src/emqx_router.erl @@ -91,7 +91,7 @@ deinit_schema/0 ]). --export_type([dest/0, external_dest/0]). +-export_type([dest/0]). -export_type([schemavsn/0]). -type group() :: binary(). @@ -313,7 +313,7 @@ print_routes(Topic) -> match_routes(Topic) ). --spec cleanup_routes(node() | external_dest()) -> ok. +-spec cleanup_routes(node()) -> ok. cleanup_routes(NodeOrExtDest) -> cleanup_routes(get_schema_vsn(), NodeOrExtDest). 
diff --git a/apps/emqx/src/emqx_shared_sub.erl b/apps/emqx/src/emqx_shared_sub.erl index 54c107111..4498523da 100644 --- a/apps/emqx/src/emqx_shared_sub.erl +++ b/apps/emqx/src/emqx_shared_sub.erl @@ -421,8 +421,12 @@ init_monitors() -> handle_call({subscribe, Group, Topic, SubPid}, _From, State = #state{pmon = PMon}) -> mria:dirty_write(?SHARED_SUBSCRIPTION, record(Group, Topic, SubPid)), case ets:member(?SHARED_SUBSCRIBER, {Group, Topic}) of - true -> ok; - false -> ok = emqx_router:do_add_route(Topic, {Group, node()}) + true -> + ok; + false -> + ok = emqx_router:do_add_route(Topic, {Group, node()}), + _ = emqx_external_broker:maybe_add_shared_route(Topic, Group), + ok end, ok = maybe_insert_alive_tab(SubPid), ok = maybe_insert_round_robin_count({Group, Topic}), @@ -545,7 +549,9 @@ is_alive_sub(Pid) -> delete_route_if_needed({Group, Topic} = GroupTopic) -> if_no_more_subscribers(GroupTopic, fun() -> - ok = emqx_router:do_delete_route(Topic, {Group, node()}) + ok = emqx_router:do_delete_route(Topic, {Group, node()}), + _ = emqx_external_broker:maybe_delete_shared_route(Topic, Group), + ok end). get_default_shared_subscription_strategy() -> diff --git a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl index 42eb7ca7b..8bf9dd7c2 100644 --- a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl +++ b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl @@ -8,3 +8,7 @@ -define(MSG_TOPIC_PREFIX, ?TOPIC_PREFIX "msg/"). -define(DEST(FromClusterName), {external, {link, FromClusterName}}). + +%% Fairly compact text encoding. +-define(SHARED_ROUTE_ID(Topic, Group), <<"$s/", Group/binary, "/", Topic/binary>>). +-define(PERSISTENT_ROUTE_ID(Topic, ID), <<"$p/", ID/binary, "/", Topic/binary>>). 
diff --git a/apps/emqx_cluster_link/rebar.config b/apps/emqx_cluster_link/rebar.config new file mode 100644 index 000000000..8835441e4 --- /dev/null +++ b/apps/emqx_cluster_link/rebar.config @@ -0,0 +1,8 @@ +%% -*- mode: erlang; -*- + +{erl_opts, [debug_info]}. + +{deps, [ + {emqx, {path, "../../apps/emqx"}}, + {emqx_resource, {path, "../../apps/emqx_resource"}} +]}. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 579f58dce..8d843edcc 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -11,6 +11,10 @@ unregister_external_broker/0, maybe_add_route/1, maybe_delete_route/1, + maybe_add_shared_route/2, + maybe_delete_shared_route/2, + maybe_add_persistent_route/2, + maybe_delete_persistent_route/2, match_routes/1, forward/2, should_route_to_external_dests/1 @@ -38,11 +42,29 @@ register_external_broker() -> unregister_external_broker() -> emqx_external_broker:unregister_provider(?MODULE). +%% Using original Topic as Route ID in the most common scenario: +%% (non-shared, non-persistent routes). +%% Original Topic is used to identify the route and be able +%% to delete it on a remote cluster. +%% There is no need to push Node name as this info can be derived from +%% agent state on the remote cluster. maybe_add_route(Topic) -> - maybe_push_route_op(add, Topic). + maybe_push_route_op(add, Topic, Topic). maybe_delete_route(Topic) -> - maybe_push_route_op(delete, Topic). + maybe_push_route_op(delete, Topic, Topic). + +maybe_add_shared_route(Topic, Group) -> + maybe_push_route_op(add, Topic, ?SHARED_ROUTE_ID(Topic, Group)). + +maybe_delete_shared_route(Topic, Group) -> + maybe_push_route_op(delete, Topic, ?SHARED_ROUTE_ID(Topic, Group)). + +maybe_add_persistent_route(Topic, ID) -> + maybe_push_route_op(add, Topic, ?PERSISTENT_ROUTE_ID(Topic, ID), push_persistent_route). 
+ +maybe_delete_persistent_route(Topic, ID) -> + maybe_push_route_op(delete, Topic, ?PERSISTENT_ROUTE_ID(Topic, ID), push_persistent_route). forward(DestCluster, Delivery) -> emqx_cluster_link_mqtt:forward(DestCluster, Delivery). @@ -107,15 +129,17 @@ delete_hook() -> %% Internal functions %%-------------------------------------------------------------------- -maybe_push_route_op(Op, Topic) -> +maybe_push_route_op(Op, Topic, RouteID) -> + maybe_push_route_op(Op, Topic, RouteID, push). + +maybe_push_route_op(Op, Topic, RouteID, PushFun) -> lists:foreach( fun(#{upstream := Cluster, topics := LinkFilters}) -> case topic_intersect_any(Topic, LinkFilters) of false -> ok; TopicIntersection -> - ID = Topic, - emqx_cluster_link_router_syncer:push(Cluster, Op, TopicIntersection, ID) + emqx_cluster_link_router_syncer:PushFun(Cluster, Op, TopicIntersection, RouteID) end end, emqx_cluster_link_config:enabled_links() diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index be2769fd5..ba17d22e8 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -55,7 +55,7 @@ emqtt_options(LinkName) -> emqx_maybe:apply(fun mk_emqtt_options/1, ?MODULE:link(LinkName)). topic_filters(LinkName) -> - maps:get(filters, ?MODULE:link(LinkName), []). + maps:get(topics, ?MODULE:link(LinkName), []). %% diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index 504e59c74..bbec844df 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -169,14 +169,15 @@ mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> %% that applies some update out of the blue, but it seems impossible to prevent %% it completely w/o transactions. 
State = #state{cluster = Cluster, actor = Actor, incarnation = Incarnation}, - case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of + ActorID = ?ACTOR_ID(Cluster, Actor), + case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of [#actor{incarnation = Incarnation, lane = Lane} = Rec] -> ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write), {ok, State#state{lane = Lane}}; [] -> Lane = mnesia_assign_lane(Cluster), Rec = #actor{ - id = ?ACTOR_ID(Cluster, Actor), + id = ActorID, incarnation = Incarnation, lane = Lane, until = bump_actor_ttl(TS) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl index 8c0e609dc..105b8d94c 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl @@ -4,6 +4,10 @@ -module(emqx_cluster_link_router_bootstrap). -include_lib("emqx/include/emqx_router.hrl"). +-include_lib("emqx/include/emqx_shared_sub.hrl"). +-include_lib("emqx/src/emqx_persistent_session_ds/emqx_ps_ds_int.hrl"). + +-include("emqx_cluster_link.hrl"). -export([ init/2, @@ -17,7 +21,8 @@ wildcards :: [emqx_types:topic()], topics :: [emqx_types:topic()], stash :: [{emqx_types:topic(), _RouteID}], - max_batch_size :: non_neg_integer() + max_batch_size :: non_neg_integer(), + is_persistent_route :: boolean() }). %% @@ -25,23 +30,25 @@ init(TargetCluster, Options) -> LinkFilters = emqx_cluster_link_config:topic_filters(TargetCluster), {Wildcards, Topics} = lists:partition(fun emqx_topic:wildcard/1, LinkFilters), + IsPersistentRoute = maps:get(is_persistent_route, Options, false), #bootstrap{ target = TargetCluster, wildcards = Wildcards, topics = Topics, stash = [], - max_batch_size = maps:get(max_batch_size, Options, ?MAX_BATCH_SIZE) + max_batch_size = maps:get(max_batch_size, Options, ?MAX_BATCH_SIZE), + is_persistent_route = IsPersistentRoute }. 
next_batch(B = #bootstrap{stash = S0 = [_ | _], max_batch_size = MBS}) -> {Batch, Stash} = mk_batch(S0, MBS), {Batch, B#bootstrap{stash = Stash}}; -next_batch(B = #bootstrap{topics = Topics = [_ | _], stash = []}) -> - Routes = select_routes_by_topics(Topics), - next_batch(B#bootstrap{topics = [], stash = Routes}); -next_batch(B0 = #bootstrap{wildcards = Wildcards = [_ | _], stash = []}) -> - Routes = select_routes_by_wildcards(Wildcards), - next_batch(B0#bootstrap{wildcards = [], stash = Routes}); +next_batch(B = #bootstrap{topics = Topics = [_ | _], stash = [], is_persistent_route = IsPs}) -> + next_batch(B#bootstrap{topics = [], stash = routes_by_topic(Topics, IsPs)}); +next_batch( + B0 = #bootstrap{wildcards = Wildcards = [_ | _], stash = [], is_persistent_route = IsPs} +) -> + next_batch(B0#bootstrap{wildcards = [], stash = routes_by_wildcards(Wildcards, IsPs)}); next_batch(#bootstrap{topics = [], wildcards = [], stash = []}) -> done. @@ -53,6 +60,37 @@ mk_batch(Stash, MaxBatchSize) -> %% +routes_by_topic(Topics, _IsPersistentRoute = false) -> + Routes = select_routes_by_topics(Topics), + SharedRoutes = select_shared_sub_routes_by_topics(Topics), + Routes ++ SharedRoutes; +routes_by_topic(Topics, _IsPersistentRoute = true) -> + lists:foldl( + fun(T, Acc) -> + Routes = emqx_persistent_session_ds_router:lookup_routes(T), + [encode_route(T, ?PERSISTENT_ROUTE_ID(T, D)) || #ps_route{dest = D} <- Routes] ++ Acc + end, + [], + Topics + ). + +routes_by_wildcards(Wildcards, _IsPersistentRoute = false) -> + Routes = select_routes_by_wildcards(Wildcards), + SharedRoutes = select_shared_sub_routes_by_wildcards(Wildcards), + Routes ++ SharedRoutes; +routes_by_wildcards(Wildcards, _IsPersistentRoute = true) -> + emqx_persistent_session_ds_router:foldl_routes( + fun(#ps_route{dest = D, topic = T}, Acc) -> + case topic_intersect_any(T, Wildcards) of + false -> + Acc; + Intersec -> + [encode_route(Intersec, ?PERSISTENT_ROUTE_ID(T, D)) | Acc] + end + end, + [] + ). 
+ select_routes_by_topics(Topics) -> [encode_route(Topic, Topic) || Topic <- Topics, emqx_broker:subscribers(Topic) =/= []]. @@ -63,12 +101,34 @@ select_routes_by_wildcards(Wildcards) -> ?SUBSCRIBER ). +select_shared_sub_routes_by_topics([T | Topics]) -> + select_shared_sub_routes(T) ++ select_shared_sub_routes_by_topics(Topics); +select_shared_sub_routes_by_topics([]) -> + []. + +select_shared_sub_routes_by_wildcards(Wildcards) -> + emqx_utils_ets:keyfoldl( + fun({Group, Topic}, Acc) -> + RouteID = ?SHARED_ROUTE_ID(Topic, Group), + intersecting_route(Topic, RouteID, Wildcards) ++ Acc + end, + [], + ?SHARED_SUBSCRIBER + ). + +select_shared_sub_routes(Topic) -> + LocalGroups = lists:usort(ets:select(?SHARED_SUBSCRIBER, [{{{'$1', Topic}, '_'}, [], ['$1']}])), + [encode_route(Topic, ?SHARED_ROUTE_ID(Topic, G)) || G <- LocalGroups]. + intersecting_route(Topic, Wildcards) -> + intersecting_route(Topic, Topic, Wildcards). + +intersecting_route(Topic, RouteID, Wildcards) -> %% TODO: probably nice to validate cluster link topic filters %% to have no intersections between each other? case topic_intersect_any(Topic, Wildcards) of false -> []; - Intersection -> [encode_route(Intersection, Topic)] + Intersection -> [encode_route(Intersection, RouteID)] end. topic_intersect_any(Topic, [LinkFilter | T]) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index d41a41c5f..62d434071 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -7,11 +7,15 @@ %% API -export([start_link/1]). --export([push/4]). -export([ - start_link_actor/1, - start_link_syncer/1 + push/4, + push_persistent_route/4 +]). + +-export([ + start_link_actor/4, + start_link_syncer/4 ]). %% Internal API / Syncer @@ -47,10 +51,25 @@ -define(RECONNECT_TIMEOUT, 5_000). 
-%% +%% Special actor for persistent routes that has the same actor name on all nodes. +%% Node actors with the same name nay race with each other (e.g. during bootstrap), +%% but it must be tolerable, since persistent route destination is a client ID, +%% which is unique cluster-wide. +-define(PS_ACTOR, <<"ps-routes-v1">>). +-define(PS_INCARNATION, 0). +-define(PS_ACTOR_REF(Cluster), {via, gproc, ?NAME(Cluster, ps_actor)}). +-define(PS_CLIENT_NAME(Cluster), ?NAME(Cluster, ps_client)). +-define(PS_SYNCER_REF(Cluster), {via, gproc, ?PS_SYNCER_NAME(Cluster)}). +-define(PS_SYNCER_NAME(Cluster), ?NAME(Cluster, ps_syncer)). push(TargetCluster, OpName, Topic, ID) -> - case gproc:where(?SYNCER_NAME(TargetCluster)) of + do_push(?SYNCER_NAME(TargetCluster), OpName, Topic, ID). + +push_persistent_route(TargetCluster, OpName, Topic, ID) -> + do_push(?PS_SYNCER_NAME(TargetCluster), OpName, Topic, ID). + +do_push(SyncerName, OpName, Topic, ID) -> + case gproc:where(SyncerName) of SyncerPid when is_pid(SyncerPid) -> emqx_router_syncer:push(SyncerPid, OpName, Topic, ID, #{}); undefined -> @@ -66,11 +85,9 @@ start_link(TargetCluster) -> %% Actor -start_link_actor(TargetCluster) -> - Actor = get_actor_id(), - Incarnation = ensure_actor_incarnation(), +start_link_actor(ActorRef, Actor, Incarnation, TargetCluster) -> gen_server:start_link( - ?ACTOR_REF(TargetCluster), + ActorRef, ?MODULE, {actor, mk_state(TargetCluster, Actor, Incarnation)}, [] @@ -98,9 +115,9 @@ ensure_actor_incarnation() -> %% MQTT Client -start_link_client(TargetCluster) -> +start_link_client(TargetCluster, Actor) -> Options = emqx_cluster_link_config:emqtt_options(TargetCluster), - case emqtt:start_link(refine_client_options(Options)) of + case emqtt:start_link(refine_client_options(Options, Actor)) of {ok, Pid} -> case emqtt:connect(Pid) of {ok, _Props} -> @@ -112,10 +129,15 @@ start_link_client(TargetCluster) -> Error end. 
-refine_client_options(Options = #{clientid := ClientID}) -> +refine_client_options(Options = #{clientid := ClientID}, Actor) -> + Suffix = + case Actor of + ?PS_ACTOR -> "-ps"; + _ -> "" + end, %% TODO: Reconnect should help, but it looks broken right now. Options#{ - clientid => emqx_utils:format("~s:~s:routesync", [ClientID, node()]), + clientid => emqx_utils:format("~s:~s:routesync~s", [ClientID, node(), Suffix]), clean_start => false, properties => #{'Session-Expiry-Interval' => 60}, retry_interval => 0 @@ -129,8 +151,13 @@ client_session_present(ClientPid) -> 1 -> true end. -announce_client(TargetCluster, Pid) -> - true = gproc:reg_other(?CLIENT_NAME(TargetCluster), Pid), +announce_client(Actor, TargetCluster, Pid) -> + Name = + case Actor of + ?PS_ACTOR -> ?PS_CLIENT_NAME(TargetCluster); + _ -> ?CLIENT_NAME(TargetCluster) + end, + true = gproc:reg_other(Name, Pid), ok. publish_routes(ClientPid, Actor, Incarnation, Updates) -> @@ -148,19 +175,17 @@ publish_routes(ClientPid, Actor, Incarnation, Updates) -> %% Route syncer -start_syncer(TargetCluster) -> - case supervisor:start_child(?REF(TargetCluster), child_spec(syncer, TargetCluster)) of +start_syncer(TargetCluster, Actor, Incr) -> + Spec = child_spec(syncer, Actor, Incr, TargetCluster), + case supervisor:start_child(?REF(TargetCluster), Spec) of {ok, _} -> ok; {error, {already_started, _}} -> ok end. -start_link_syncer(TargetCluster) -> - Actor = get_actor_id(), - Incarnation = get_actor_incarnation(), - ClientName = ?CLIENT_NAME(TargetCluster), - emqx_router_syncer:start_link(?SYNCER_REF(TargetCluster), #{ +start_link_syncer(Actor, Incarnation, SyncerRef, ClientName) -> + emqx_router_syncer:start_link(SyncerRef, #{ max_batch_size => ?MAX_BATCH_SIZE, min_sync_interval => ?MIN_SYNC_INTERVAL, error_delay => ?ERROR_DELAY, @@ -169,10 +194,14 @@ start_link_syncer(TargetCluster) -> %% TODO: enable_replies => false }). 
-close_syncer(TargetCluster) -> +close_syncer(TargetCluster, ?PS_ACTOR) -> + emqx_router_syncer:close(?PS_SYNCER_REF(TargetCluster)); +close_syncer(TargetCluster, _Actor) -> emqx_router_syncer:close(?SYNCER_REF(TargetCluster)). -open_syncer(TargetCluster) -> +open_syncer(TargetCluster, ?PS_ACTOR) -> + emqx_router_syncer:open(?PS_SYNCER_REF(TargetCluster)); +open_syncer(TargetCluster, _Actor) -> emqx_router_syncer:open(?SYNCER_REF(TargetCluster)). process_syncer_batch(Batch, ClientName, Actor, Incarnation) -> @@ -200,7 +229,8 @@ init({sup, TargetCluster}) -> period => 60 }, Children = [ - child_spec(actor, TargetCluster) + child_spec(actor, TargetCluster), + child_spec(ps_actor, TargetCluster) ], {ok, {SupFlags, Children}}; init({actor, State}) -> @@ -212,20 +242,37 @@ child_spec(actor, TargetCluster) -> %% ClientID: `mycluster:emqx1@emqx.local:routesync` %% Occasional TCP/MQTT-level disconnects are expected, and should be handled %% gracefully. - #{ - id => actor, - start => {?MODULE, start_link_actor, [TargetCluster]}, - restart => permanent, - type => worker - }; -child_spec(syncer, TargetCluster) -> + Actor = get_actor_id(), + Incarnation = ensure_actor_incarnation(), + actor_spec(actor, ?ACTOR_REF(TargetCluster), Actor, Incarnation, TargetCluster); +child_spec(ps_actor, TargetCluster) -> + actor_spec(ps_actor, ?PS_ACTOR_REF(TargetCluster), ?PS_ACTOR, ?PS_INCARNATION, TargetCluster). + +child_spec(syncer, ?PS_ACTOR, Incarnation, TargetCluster) -> + SyncerRef = ?PS_SYNCER_REF(TargetCluster), + ClientName = ?PS_CLIENT_NAME(TargetCluster), + syncer_spec(ps_syncer, ?PS_ACTOR, Incarnation, SyncerRef, ClientName); +child_spec(syncer, Actor, Incarnation, TargetCluster) -> %% Route syncer process. %% Initially starts in a "closed" state. Actor decides when to open it, i.e. %% when bootstrapping is done. Syncer crash means re-bootstrap is needed, so %% we just restart the actor in this case. 
+ SyncerRef = ?SYNCER_REF(TargetCluster), + ClientName = ?CLIENT_NAME(TargetCluster), + syncer_spec(syncer, Actor, Incarnation, SyncerRef, ClientName). + +actor_spec(ChildID, ActorRef, Actor, Incarnation, TargetCluster) -> #{ - id => syncer, - start => {?MODULE, start_link_syncer, [TargetCluster]}, + id => ChildID, + start => {?MODULE, start_link_actor, [ActorRef, Actor, Incarnation, TargetCluster]}, + restart => permanent, + type => worker + }. + +syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> + #{ + id => ChildID, + start => {?MODULE, start_link_syncer, [Actor, Incarnation, SyncerRef, ClientName]}, restart => permanent, type => worker }. @@ -274,12 +321,12 @@ terminate(_Reason, _State) -> ok. process_connect(St = #st{target = TargetCluster, actor = Actor, incarnation = Incr}) -> - case start_link_client(TargetCluster) of + case start_link_client(TargetCluster, Actor) of {ok, ClientPid} -> %% TODO: error handling, handshake {ok, _} = emqx_cluster_link_mqtt:publish_actor_init_sync(ClientPid, Actor, Incr), - ok = start_syncer(TargetCluster), - ok = announce_client(TargetCluster, ClientPid), + ok = start_syncer(TargetCluster, Actor, Incr), + ok = announce_client(Actor, TargetCluster, ClientPid), process_bootstrap(St#st{client = ClientPid}); {error, Reason} -> handle_connect_error(Reason, St) @@ -290,9 +337,9 @@ handle_connect_error(_Reason, St) -> TRef = erlang:start_timer(?RECONNECT_TIMEOUT, self(), reconnect), St#st{reconnect_timer = TRef}. -handle_client_down(_Reason, St = #st{target = TargetCluster}) -> +handle_client_down(_Reason, St = #st{target = TargetCluster, actor = Actor}) -> %% TODO: logs - ok = close_syncer(TargetCluster), + ok = close_syncer(TargetCluster, Actor), process_connect(St#st{client = undefined}). process_bootstrap(St = #st{bootstrapped = false}) -> @@ -311,6 +358,15 @@ process_bootstrap(St = #st{client = ClientPid, bootstrapped = true}) -> %% is re-established with a clean session. 
Once bootstrapping is done, it %% opens the syncer. +run_bootstrap(St = #st{target = TargetCluster, actor = ?PS_ACTOR}) -> + case mria_config:whoami() of + Role when Role /= replicant -> + Opts = #{is_persistent_route => true}, + Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, Opts), + run_bootstrap(Bootstrap, St); + _ -> + process_bootstrapped(St) + end; run_bootstrap(St = #st{target = TargetCluster}) -> Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, #{}), run_bootstrap(Bootstrap, St). @@ -330,8 +386,8 @@ run_bootstrap(Bootstrap, St) -> end end. -process_bootstrapped(St = #st{target = TargetCluster}) -> - ok = open_syncer(TargetCluster), +process_bootstrapped(St = #st{target = TargetCluster, actor = Actor}) -> + ok = open_syncer(TargetCluster, Actor), St#st{bootstrapped = true}. process_bootstrap_batch(Batch, #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> From e26e7acaa1f542ab79ad755e8bc9c6432cf2cee7 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 23 May 2024 12:32:46 +0300 Subject: [PATCH 11/46] refactor(clusterlink): use `emqx_bridge_mqtt_lib:clientid_base/1` to construct routesync client id --- .../src/emqx_cluster_link_router_syncer.erl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 62d434071..2a19a54b3 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -51,6 +51,9 @@ -define(RECONNECT_TIMEOUT, 5_000). +-define(CLIENT_SUFFIX, ":routesync"). +-define(PS_CLIENT_SUFFIX, ":routesync-ps"). + %% Special actor for persistent routes that has the same actor name on all nodes. %% Node actors with the same name nay race with each other (e.g. 
during bootstrap), %% but it must be tolerable, since persistent route destination is a client ID, @@ -132,12 +135,12 @@ start_link_client(TargetCluster, Actor) -> refine_client_options(Options = #{clientid := ClientID}, Actor) -> Suffix = case Actor of - ?PS_ACTOR -> "-ps"; - _ -> "" + ?PS_ACTOR -> ?PS_CLIENT_SUFFIX; + _ -> ?CLIENT_SUFFIX end, %% TODO: Reconnect should help, but it looks broken right now. Options#{ - clientid => emqx_utils:format("~s:~s:routesync~s", [ClientID, node(), Suffix]), + clientid => emqx_bridge_mqtt_lib:clientid_base([ClientID, Suffix]), clean_start => false, properties => #{'Session-Expiry-Interval' => 60}, retry_interval => 0 From ac19cf89df60577e33b6c801bc64f6283ff8b7ba Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 23 May 2024 12:47:48 +0300 Subject: [PATCH 12/46] chore(clusterlink): remove code related to the rejected coordinator-based implementation --- .../include/emqx_cluster_link.hrl | 3 - .../src/emqx_cluster_link.app.src | 1 - .../src/emqx_cluster_link.erl | 54 --- .../src/emqx_cluster_link_config.erl | 21 +- .../src/emqx_cluster_link_coord_sup.erl | 57 --- .../src/emqx_cluster_link_coordinator.erl | 454 ------------------ .../src/emqx_cluster_link_mqtt.erl | 270 +---------- 7 files changed, 15 insertions(+), 845 deletions(-) delete mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl delete mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl diff --git a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl index 8bf9dd7c2..dd2544114 100644 --- a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl +++ b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl @@ -3,12 +3,9 @@ %%-------------------------------------------------------------------- -define(TOPIC_PREFIX, "$LINK/cluster/"). --define(CTRL_TOPIC_PREFIX, ?TOPIC_PREFIX "ctrl/"). -define(ROUTE_TOPIC_PREFIX, ?TOPIC_PREFIX "route/"). 
-define(MSG_TOPIC_PREFIX, ?TOPIC_PREFIX "msg/"). --define(DEST(FromClusterName), {external, {link, FromClusterName}}). - %% Fairly compact text encoding. -define(SHARED_ROUTE_ID(Topic, Group), <<"$s/", Group/binary, "/", Topic/binary>>). -define(PERSISTENT_ROUTE_ID(Topic, ID), <<"$p/", ID/binary, "/", Topic/binary>>). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.app.src b/apps/emqx_cluster_link/src/emqx_cluster_link.app.src index d8da0c1ee..f7c5e102a 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.app.src +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.app.src @@ -9,7 +9,6 @@ kernel, stdlib, emqtt, - ecpool, emqx, emqx_resource ]}, diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 8d843edcc..846204066 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -100,22 +100,6 @@ on_message_publish(#message{topic = <>, p %% Just ignore it. It must be already logged by the decoder {stop, []} end; -on_message_publish( - #message{topic = <>, payload = Payload} = Msg -) -> - case emqx_cluster_link_mqtt:decode_ctrl_msg(Payload, ClusterName) of - {init_link, InitRes} -> - on_init(InitRes, ClusterName, Msg); - {ack_link, Res} -> - on_init_ack(Res, ClusterName, Msg); - unlink -> - %% Stop pushing messages to the cluster that requested unlink, - %% It brings the link to a half-closed (unidirectional) state, - %% as this cluster may still replicate routes and receive messages from ClusterName. - emqx_cluster_link_mqtt:stop_msg_fwd_resource(ClusterName), - cleanup_routes(ClusterName) - end, - {stop, []}; on_message_publish(_Msg) -> ok. @@ -166,44 +150,6 @@ update_routes(ClusterName, Actor, Incarnation, RouteOps) -> RouteOps ). -cleanup_routes(ClusterName) -> - emqx_router:cleanup_routes(?DEST(ClusterName)). 
- -lookup_link_conf(ClusterName) -> - lists:search( - fun(#{upstream := N}) -> N =:= ClusterName end, - emqx:get_config([cluster, links], []) - ). - -on_init(Res, ClusterName, Msg) -> - #{ - 'Correlation-Data' := ReqId, - 'Response-Topic' := RespTopic - } = emqx_message:get_header(properties, Msg), - case lookup_link_conf(ClusterName) of - {value, LinkConf} -> - _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), - emqx_cluster_link_mqtt:ack_link(ClusterName, Res, RespTopic, ReqId); - false -> - ?SLOG(error, #{ - msg => "init_link_request_from_unknown_cluster", - link_name => ClusterName - }), - %% Cannot ack/reply since we don't know how to reach the link cluster, - %% The cluster that tried to initiatw this link is expected to eventually fail with timeout. - ok - end. - -on_init_ack(Res, ClusterName, Msg) -> - #{'Correlation-Data' := ReqId} = emqx_message:get_header(properties, Msg), - emqx_cluster_link_coordinator:on_link_ack(ClusterName, ReqId, Res). - -%% add_routes(Topics, ClusterName) -> -%% lists:foreach( -%% fun(T) -> emqx_router_syncer:push(add, T, ?DEST(ClusterName), #{}) end, -%% Topics -%% ). - %% let it crash if extra is not a map, %% we don't expect the message to be forwarded from an older EMQX release, %% that doesn't set extra = #{} by default. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index ba17d22e8..4b93407b2 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -21,7 +21,8 @@ link/1, topic_filters/1, %% Connections - emqtt_options/1 + emqtt_options/1, + mk_emqtt_options/1 ]). -export([ @@ -152,16 +153,18 @@ add_links(LinksConf) -> add_link(#{enabled := true} = LinkConf) -> %% NOTE: this can be started later during init_link phase, but it looks not harmful to start it beforehand... 
MsgFwdRes = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), - CoordRes = ensure_coordinator(LinkConf), - combine_results(CoordRes, MsgFwdRes); + %% TODO + ActorRes = ok, + combine_results(ActorRes, MsgFwdRes); add_link(_DisabledLinkConf) -> ok. remove_links(LinksConf) -> [remove_link(Link) || Link <- LinksConf]. -remove_link(LinkConf) -> - emqx_cluster_link_coord_sup:stop_coordinator(LinkConf). +remove_link(_LinkConf) -> + %% TODO + ok. update_links(LinksConf) -> [update_link(Link) || Link <- LinksConf]. @@ -176,14 +179,6 @@ update_link(#{enabled := false} = LinkConf) -> Other -> Other end. -ensure_coordinator(LinkConf) -> - case emqx_cluster_link_coord_sup:start_coordinator(LinkConf) of - {error, {already_started, Pid}} -> - {ok, Pid}; - {error, already_present} -> - emqx_cluster_link_coord_sup:restart_coordinator(LinkConf) - end. - combine_results(ok, ok) -> ok; combine_results(CoordRes, MsgFwdRes) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl deleted file mode 100644 index 78fa030f2..000000000 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_coord_sup.erl +++ /dev/null @@ -1,57 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. -%%-------------------------------------------------------------------- - --module(emqx_cluster_link_coord_sup). - --behaviour(supervisor). - --export([start_link/1]). --export([init/1]). - --export([ - start_coordinator/1, - restart_coordinator/1, - stop_coordinator/1 -]). - --define(SERVER, ?MODULE). --define(COORDINATOR_MOD, emqx_cluster_link_coordinator). - -start_link(LinksConf) -> - supervisor:start_link({local, ?SERVER}, ?SERVER, LinksConf). - -init(LinksConf) -> - SupFlags = #{ - strategy => one_for_one, - intensity => 10, - period => 5 - }, - {ok, {SupFlags, children(LinksConf)}}. 
- -start_coordinator(#{upstream := Name} = LinkConf) -> - supervisor:start_child(?SERVER, worker_spec(Name, LinkConf)). - -restart_coordinator(#{upstream := Name} = _LinkConf) -> - supervisor:restart_child(?SERVER, Name). - -stop_coordinator(#{upstream := Name} = _LinkConf) -> - case supervisor:terminate_child(?SERVER, Name) of - ok -> - supervisor:delete_child(?SERVER, Name); - Err -> - Err - end. - -worker_spec(Id, LinkConf) -> - #{ - id => Id, - start => {?COORDINATOR_MOD, start_link, [LinkConf]}, - restart => permanent, - shutdown => 5000, - type => worker, - modules => [?COORDINATOR_MOD] - }. - -children(LinksConf) -> - [worker_spec(Name, Conf) || #{upstream := Name, enable := true} = Conf <- LinksConf]. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl deleted file mode 100644 index 4b8b9be8f..000000000 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_coordinator.erl +++ /dev/null @@ -1,454 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. -%%-------------------------------------------------------------------- - -%% @doc experimental prototype implementation. -%% The idea is to add a sync point for all cluster route operations, -%% so that, routes can be batched/shrunk (via using emqx_route_syncer) before pushing them to linked clusters. -%% The expected result is reduced communication between linked clusters: -%% each nodes communicates with other clusters through coordinator. -%% The drawbacks are numerous though: -%% - complexity/leader elections, -%% - routes removal seems hard to implement unless remote cluster routes as stored per node, -%% in that case global coordinator per cluster is not needed any more. - TBD --module(emqx_cluster_link_coordinator). - --behaviour(gen_statem). - -%% API --export([ - route_op/2, - on_link_ack/3 -]). - --export([start_link/1]). 
- -%% gen_statem --export([ - callback_mode/0, - init/1, - terminate/3 -]). - -%% gen_statem state functions --export([ - wait_for_coordinator/3, - connecting/3, - init_linking/3, - bootstrapping/3, - coordinating/3, - following/3 -]). - --export([select_routes/1]). - --include_lib("emqx/include/emqx.hrl"). --include_lib("emqx/include/emqx_router.hrl"). --include_lib("emqx/include/logger.hrl"). - --define(COORDINATOR(UpstreamName), {?MODULE, UpstreamName}). --define(SERVER, ?MODULE). --define(WAIT_COORD_RETRY_INTERVAL, 100). --define(CONN_RETRY_INTERVAL, 5000). --define(INIT_LINK_RESP_TIMEOUT, 15_000). --define(INIT_LINK_RETRIES, 5). --define(UPSTREAM_DEST, {external, {link, _}}). --define(IS_ROUTE_OP(Op), Op =:= <<"add">>; Op =:= <<"delete">>). - -start_link(Conf) -> - gen_statem:start_link(?MODULE, Conf, []). - -route_op(Op, Topic) -> - lists:foreach( - fun(#{upstream := UpstreamName, topics := LinkFilters}) -> - case topic_intersect_any(Topic, LinkFilters) of - false -> ok; - TopicOrFilter -> maybe_cast(UpstreamName, {Op, TopicOrFilter}) - end - end, - emqx:get_config([cluster, links]) - ). - -on_link_ack(ClusterName, ReqId, Res) -> - maybe_cast(ClusterName, {ack_link, ClusterName, ReqId, Res}). - -callback_mode() -> - [state_functions, state_enter]. - -init(LinkConf) -> - process_flag(trap_exit, true), - %% It helps to avoid unnecessary global name conflicts (and, as a result, coordinator re-election), - %% e.g. when a down nodes comes back - %% TODO: need to better understand `global` behaviour - _ = global:sync(), - Data = #{is_coordinator => false, link_conf => LinkConf}, - {ok, wait_for_coordinator, Data}. 
- -wait_for_coordinator(enter, _OldState, _Data) -> - {keep_state_and_data, [{state_timeout, 0, do_wait_for_coordinator}]}; -wait_for_coordinator(_, do_wait_for_coordinator, Data) -> - #{link_conf := #{upstream := Name}} = Data, - case global:whereis_name(?COORDINATOR(Name)) of - undefined -> - case register_coordinator(Name) of - yes -> - {next_state, connecting, Data#{is_coordinator => true}}; - no -> - %% TODO: this should not happen forever, if it does, we need to detect it - {keep_state_and_data, [ - {state_timeout, ?WAIT_COORD_RETRY_INTERVAL, do_wait_for_coordinator} - ]} - end; - %% Can be a prev stale pid? - %% Let it crash with case_clause if it happens... - Pid when is_pid(Pid) andalso Pid =/= self() -> - Data1 = Data#{coordinator_mon => erlang:monitor(process, Pid), coordinator_pid => Pid}, - {next_state, following, Data1} - end; -wait_for_coordinator(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) -> - %% Ignore any route op, until bootstrapping is started. - %% All ignored route ops are expected to be caught up during the bootstrap. - keep_state_and_data; -wait_for_coordinator(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). - -connecting(enter, _OldState, _Data) -> - {keep_state_and_data, [{state_timeout, 0, reconnect}]}; -connecting(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) -> - %% Ignore any route op, until bootstrapping is started. - %% All ignored route ops are expected to be caught up during the bootstrap. - keep_state_and_data; -connecting(_EventType, reconnect, Data) -> - ensure_conn_pool(init_linking, Data); -connecting(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). 
- -init_linking(enter, _OldState, Data) -> - {keep_state, Data#{link_retries => ?INIT_LINK_RETRIES}, [{state_timeout, 0, init_link}]}; -init_linking(cast, {ack_link, _ClusterName, ReqId, Res}, #{link_req_id := ReqId} = Data) -> - case Res of - %% This state machine is not suitable to bootstrap the upstream cluster conditionally, - %% since it ignores any route ops received before bootstrapping... - {ok, #{proto_ver := _, need_bootstrap := _}} -> - {next_state, bootstrapping, maps:without([link_req_id, link_retries], Data)}; - {error, <<"bad_upstream_name">>} -> - %% unrecoverable error that needs a user intervention, - %% TODO: maybe need to transition to some error state - {keep_state, maps:without([link_req_id, link_retries], Data), [{state_timeout, cancel}]} - end; -init_linking(_, init_link, #{link_conf := #{upstream := Name}, link_retries := Retries} = Data) -> - case Retries > 0 of - true -> - {ReqId, {ok, _}} = emqx_cluster_link_mqtt:init_link(Name), - Data1 = Data#{link_req_id => ReqId, link_retries => Retries - 1}, - {keep_state, Data1, [{state_timeout, ?INIT_LINK_RESP_TIMEOUT, init_link}]}; - false -> - ?SLOG(error, #{ - msg => "no_link_ack_response_received", - link_name => Name - }), - %% unrecoverable error that needs a user intervention, - %% TODO: maybe need to transition to some error state - keep_state_and_data - end; -init_linking(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) -> - %% Ignore any route op, until bootstrapping is started. - %% All ignored route ops are expected to be caught up during the bootstrap. - keep_state_and_data; -init_linking(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). - -bootstrapping(enter, _OldState, #{link_conf := LinkConf} = Data) -> - #{topics := LinkFilters, upstream := ClusterName} = LinkConf, - %% TODO add timeout? 
- {Pid, Ref} = erlang:spawn_monitor(fun() -> bootstrap(ClusterName, LinkFilters) end), - {keep_state, Data#{bootstrap_pid => Pid, bootstrap_ref => Ref}}; -bootstrapping(info, {'DOWN', Ref, process, _Pid, Reason}, #{bootstrap_ref := Ref} = Data) -> - %% TODO: think about the best way to proceed if bootstrapping failed, - %% perhaps just transition back to connecting state? - normal = Reason, - Data1 = maps:without([bootstrap_ref, bootstrap_pid], Data), - {next_state, coordinating, Data1}; -%% Accumulate new route ops, since there is no guarantee -%% they will be included in the bootstrapped data -bootstrapping(cast, {Op, _Topic}, _Data) when ?IS_ROUTE_OP(Op) -> - {keep_state_and_data, [postpone]}; -bootstrapping(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). - -coordinating(enter, _OldState, _Data) -> - keep_state_and_data; -coordinating(cast, {Op, Topic}, Data) when ?IS_ROUTE_OP(Op) -> - #{link_conf := #{upstream := ClusterName}} = Data, - %% TODO: batching - case emqx_cluster_link_mqtt:publish_route_op(async, ClusterName, Op, Topic) of - {error, _} -> - %% Conn pool error, reconnect. - {next_state, connecting, stop_conn_pool(Data)}; - _Ref -> - keep_state_and_data - end; -%% TODO: this can also be received in other states, move to generic handler? -coordinating(info, {global_name_conflict, CoordName}, Data) -> - LogData = #{ - msg => "emqx_cluster_link_coordinator_name_conflict", - coordinator_name => CoordName - }, - LogData1 = - %% TODO: this can be a previous (self) coordinator? - case global:whereis_name(CoordName) of - undefined -> LogData; - Pid -> LogData#{new_coordinator => Pid, coordinator_node => node(Pid)} - end, - ?SLOG(warning, LogData1), - Data1 = stop_conn_pool(Data), - {next_state, wait_for_coordinator, Data1#{is_coordinator => false}}; -%% only errors results are expected -%% TODO: a single error causes reconnection and re-bootstrapping, -%% it's worth considering some optimizations. 
-coordinating(info, {pub_result, _Ref, {error, Reason}}, #{link_conf := #{upstream := Name}} = Data) -> - ?SLOG(error, #{ - msg => "failed_to_replicate_route_op_to_linked_cluster", - link_name => Name, - reason => Reason - }), - %% TODO: check errors, some may be not possible to correct by re-connecting - Data1 = stop_conn_pool(Data), - {next_state, connecting, Data1}; -coordinating(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). - -following(enter, _OldState, _Data) -> - keep_state_and_data; -following(info, {'DOWN', MRef, process, _Pid, _Info}, #{coordinator_mon := MRef} = Data) -> - {next_state, wait_for_coordinator, maps:without([coordinator_mon, coordinator_pid], Data)}; -following(EventType, Event, Data) -> - handle_event_(?FUNCTION_NAME, EventType, Event, Data). - -handle_event_(_State, info, {'DOWN', Ref, process, _Pid, Reason}, Data) -> - case Data of - #{conn_pool_mons := #{Ref := WorkerName}, is_coordinator := true} -> - ?SLOG(warning, #{ - msg => "cluster_link_route_connection_is_down", - reason => Reason, - worker => WorkerName - }), - {next_state, connecting, stop_conn_pool(Data)}; - _ -> - %% Must be a stale 'DOWN' msg (e.g., from the next worker) which is already handled. - keep_state_and_data - end; -handle_event_(State, EventType, Event, Data) -> - ?SLOG(warning, #{ - msg => "unexpected_event", - event => Event, - event_type => EventType, - state => State, - data => Data - }), - keep_state_and_data. - -terminate(Reason, _State, #{link_conf := #{upstream := ClusterName}} = Data) -> - %% TODO unregister coordinator? - IsCoordinator = maps:get(is_coordinator, Data, false), - case Reason of - shutdown when IsCoordinator -> - %% must be sync, since we are going to stop the pool - %% NOTE: there is no guarantee that unlink op will arrive the last one - %% (since there may be other route op sent over another pool worker) - %% and clear everything, but it must be good enough to GC most of the routes. 
- _ = emqx_cluster_link_mqtt:remove_link(ClusterName); - _ -> - ok - end, - _ = stop_conn_pool(Data), - ok. - -%%-------------------------------------------------------------------- -%% Internal functions -%%-------------------------------------------------------------------- - -topic_intersect_any(Topic, [LinkFilter | T]) -> - case emqx_topic:intersection(Topic, LinkFilter) of - false -> topic_intersect_any(Topic, T); - TopicOrFilter -> TopicOrFilter - end; -topic_intersect_any(_Topic, []) -> - false. - -bootstrap(ClusterName, LinkFilters) -> - %% TODO: do this in chunks - Topics = select_routes(LinkFilters), - {ok, _} = emqx_cluster_link_mqtt:publish_routes(sync, ClusterName, Topics). - -%% TODO: if a local route matches link filter exactly, -%% it's enough to only select this matching filter itself and skip any other routes? -%% E.g., local routes: "t/global/#", "t/global/1/+", clsuter link topics = ["t/global/#"], -%% it's enough to replicate "t/global/#" only to the linked cluster. -%% What to do when "t/global/#" subscriber unsubscribers -%% and we start to get forwarded messages (e.g. "t/global/2/3") matching no subscribers? -%% How can we efficiently replace "t/global/#" route with "t/global/1/+" -%% (intersection of "t/global/#" and "t/global/#")? -%% So maybe better not to do it at all and replicate both "t/global/1/+" and "t/global/#" ? -select_routes(LinkFilters) -> - {Wildcards, Topics} = lists:partition(fun emqx_topic:wildcard/1, LinkFilters), - Routes = select_routes_by_topics(Topics), - Routes1 = intersecting_routes(Wildcards), - AllRoutes = Routes ++ Routes1, - case emqx_router:get_schema_vsn() of - v1 -> AllRoutes; - %% v2 stores filters (Wildcard subscriptions routes) in a separate index, - %% so WildcardRoutes contains only non-wildcard routes matching wildcard link filters. - %% Thus, we need to select wildcard routes additionally - v2 -> intersecting_routes_v2(Wildcards) ++ AllRoutes - end. 
- -select_routes_by_topics([]) -> - []; -select_routes_by_topics([Topic | T]) -> - case filter_out_upstream_routes(emqx_router:match_routes(Topic)) of - [_ | _] -> - %% These are non-wildcard link topics, so we don't care about actual - %% routes as long as they are matched, and just need to replicate - %% topic routes to the linked cluster - [Topic | select_routes_by_topics(T)]; - _ -> - select_routes_by_topics(T) - end. - -filter_out_upstream_routes(Routes) -> - lists:filter( - fun - (#route{dest = ?UPSTREAM_DEST}) -> false; - (_) -> true - end, - Routes - ). - -%% selects only non-wildcard routes that match wildcards (filters), -%% can only be done as a linear search over all routes -intersecting_routes([]) -> - []; -intersecting_routes(Wildcards) -> - Res = ets:foldl( - fun - (#route{dest = ?UPSTREAM_DEST}, Acc) -> - Acc; - (#route{topic = T}, Acc) -> - %% TODO: probably nice to validate cluster link topic filters - %% to have no intersections between each other? - case topic_intersect_any(T, Wildcards) of - false -> Acc; - Intersection -> Acc#{Intersection => undefined} - end - end, - #{}, - ?ROUTE_TAB - ), - maps:keys(Res). - -intersecting_routes_v2([]) -> - []; -intersecting_routes_v2(Wildcards) -> - lists:foldl( - fun(Wildcard, Acc) -> - MatchedFilters = matched_filters_v2(Wildcard), - all_intersections(Wildcard, MatchedFilters, Acc) - end, - [], - Wildcards - ). - -matched_filters_v2(Wildcard) -> - MatchesAcc = lists:foldl( - fun(M, Acc) -> - case emqx_topic_index:get_id(M) of - ?UPSTREAM_DEST -> - Acc; - _ -> - Acc#{emqx_topic_index:get_topic(M) => undefined} - end - end, - #{}, - emqx_topic_index:matches_filter(Wildcard, ?ROUTE_TAB_FILTERS, []) - ), - maps:keys(MatchesAcc). 
- -all_intersections(Wildcard, [W | Wildcards], Acc) -> - case emqx_topic:intersection(Wildcard, W) of - false -> all_intersections(Wildcard, Wildcards, Acc); - Intersection -> all_intersections(Wildcard, Wildcards, [Intersection | Acc]) - end; -all_intersections(_, [], Acc) -> - lists:usort(Acc). - -maybe_cast(UpstreamName, Msg) -> - case global:whereis_name(?COORDINATOR(UpstreamName)) of - Pid when is_pid(Pid) -> - gen_statem:cast(Pid, Msg); - undefined -> - %% Ignore and rely on coordinator bootstrapping once it's elected - ok - end. - -register_coordinator(UpstreamName) -> - case mria_config:role() of - core -> - global:register_name( - ?COORDINATOR(UpstreamName), self(), fun global:random_notify_name/3 - ); - _ -> - no - end. - -%% connecting state helper -ensure_conn_pool(NextState, #{link_conf := LinkConf} = Data) -> - Res = start_conn_pool(LinkConf), - Data1 = Data#{conn_pool => Res}, - case Res of - {ok, _} -> - Data2 = Data1#{conn_pool_mons => mon_pool_workers(LinkConf)}, - {next_state, NextState, Data2}; - _Err -> - {keep_state, Data1, [{state_timeout, ?CONN_RETRY_INTERVAL, reconnect}]} - end. - -start_conn_pool(LinkConf) -> - case emqx_cluster_link_mqtt:start_routing_pool(LinkConf) of - {ok, _Pid} = Ok -> - Ok; - {error, Reason} = Err -> - #{upstream := Name} = LinkConf, - ?SLOG(error, #{ - msg => "failed_to_connect_to_linked_cluster", - cluster_name => Name, - reason => Reason - }), - Err - end. - -stop_conn_pool(#{link_conf := #{upstream := Name}} = Data) -> - case Data of - #{conn_pool := {ok, _}} -> - Data1 = maybe_unmointor_workers(Data), - Data1#{conn_pool => {stopped, emqx_cluster_link_mqtt:stop_routing_pool(Name)}}; - _ -> - Data - end. - -maybe_unmointor_workers(#{conn_pool_mons := MonitorsMap} = Data) -> - _ = maps:foreach( - fun(Mref, _Name) -> - erlang:demonitor(Mref) - end, - MonitorsMap - ), - maps:remove(conn_pool_mons, Data); -maybe_unmointor_workers(Data) -> - Data. 
- -mon_pool_workers(LinkConf) -> - maps:from_list([ - {erlang:monitor(process, Pid), Name} - || {Name, Pid} <- emqx_cluster_link_mqtt:routing_pool_workers(LinkConf) - ]). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index a3e3ce2fb..d62965bb2 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -9,8 +9,6 @@ -include_lib("emqx/include/emqx_mqtt.hrl"). -include_lib("emqx/include/logger.hrl"). -%-include_lib("emqtt/include/emqtt.hrl"). - -behaviour(emqx_resource). -behaviour(ecpool_worker). @@ -30,16 +28,6 @@ -export([ ensure_msg_fwd_resource/1, stop_msg_fwd_resource/1, - start_routing_pool/1, - stop_routing_pool/1, - routing_pool_workers/1, - init_link/1, - ack_link/4, - remove_link/1, - publish_route_op/4, - publish_routes/3, - cleanup_routes/1, - decode_ctrl_msg/2, decode_route_op/1, decode_forwarded_msg/1 ]). @@ -54,37 +42,24 @@ forward/2 ]). --define(ROUTE_CLIENTID_SUFFIX, ":route:"). -define(MSG_CLIENTID_SUFFIX, ":msg:"). --define(CLIENTID(Base, Suffix), emqx_bridge_mqtt_lib:clientid_base([Base, Suffix])). -define(MQTT_HOST_OPTS, #{default_port => 1883}). -define(MY_CLUSTER_NAME, emqx_cluster_link_config:cluster()). -define(ROUTE_TOPIC, <>). -define(MSG_FWD_TOPIC, <>). --define(CTRL_TOPIC(ClusterName), <>). +%%-define(CTRL_TOPIC(ClusterName), <>). -%% ecpool and emqx_resource names --define(ROUTE_POOL_PREFIX, "emqx_cluster_link_mqtt:route:"). -define(MSG_POOL_PREFIX, "emqx_cluster_link_mqtt:msg:"). -define(RES_NAME(Prefix, ClusterName), <>). -define(ROUTE_POOL_NAME(ClusterName), ?RES_NAME(?ROUTE_POOL_PREFIX, ClusterName)). -define(MSG_RES_ID(ClusterName), ?RES_NAME(?MSG_POOL_PREFIX, ClusterName)). -define(HEALTH_CHECK_TIMEOUT, 1000). -define(RES_GROUP, <<"emqx_cluster_link">>). --define(DEFAULT_POOL_KEY, <<"default">>). %% Protocol --define(PROTO_VER, <<"1.0">>). --define(INIT_LINK_OP, <<"init_link">>). 
--define(ACK_LINK_OP, <<"ack_link">>). --define(UNLINK_OP, <<"unlink">>). --define(BATCH_ROUTES_OP, <<"add_routes">>). --define(CLEANUP_ROUTES_OP, <<"cleanup_routes">>). -%% It's worth optimizing non-batch op payload size, -%% thus it's encoded as a plain binary --define(TOPIC_WITH_OP(Op, Topic), <>). +%% -define(PROTO_VER, <<"1.0">>). -define(DECODE(Payload), erlang:binary_to_term(Payload, [safe])). -define(ENCODE(Payload), erlang:term_to_binary(Payload)). @@ -290,121 +265,9 @@ connect(Options) -> end. %%-------------------------------------------------------------------- -%% Routing +%% Protocol %%-------------------------------------------------------------------- -routing_pool_workers(#{upstream := ClusterName} = _ClusterConf) -> - ecpool:workers(?ROUTE_POOL_NAME(ClusterName)). - -start_routing_pool(#{upstream := ClusterName} = ClusterConf) -> - start_pool(?ROUTE_POOL_NAME(ClusterName), ?ROUTE_CLIENTID_SUFFIX, ClusterConf). - -stop_routing_pool(ClusterName) -> - ecpool:stop_sup_pool(?ROUTE_POOL_NAME(ClusterName)). - -init_link(ClusterName) -> - Payload = #{ - <<"op">> => ?INIT_LINK_OP, - <<"proto_ver">> => ?PROTO_VER, - <<"upstream">> => ClusterName, - %% TODO: may no need to reserve it as it is a map? - <<"extra">> => #{} - }, - ReqId = emqx_utils_conv:bin(emqx_utils:gen_id(16)), - Properties = #{ - 'Response-Topic' => ?CTRL_TOPIC(ClusterName), - 'Correlation-Data' => ReqId - }, - Topic = ?CTRL_TOPIC(?MY_CLUSTER_NAME), - {ReqId, publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, Properties, Topic, ?QOS_1)}. - -ack_link(ClusterName, Result, RespTopic, ReqId) -> - Payload = #{ - <<"op">> => ?ACK_LINK_OP, - %% The links may compare and downgrade/adjust protocol in future - <<"proto_ver">> => ?PROTO_VER, - %% may be used in future to avoud re-bootrstrapping all the routes, - %% for example, if the connection was abrupted for a while but the cluster was healthy - %% and didn't lost any routes. In that case, retrying lost route updates would be sufficient. 
- %% For now, it's always true for simplicitiy reasons. - <<"need_bootstrap">> => true, - <<"extra">> => #{} - }, - Payload1 = - case Result of - {ok, _} -> - Payload#{<<"result">> => <<"ok">>}; - {error, Reason} -> - Payload#{<<"result">> => <<"error">>, reason => Reason} - end, - Props = #{'Correlation-Data' => ReqId}, - Query = {RespTopic, Props, Payload1, ?QOS_1}, - %% Using msg forwading resource to send the response back. - %% TODO: maybe async query? - emqx_resource:query(?MSG_RES_ID(ClusterName), Query, #{ - query_mode => simple_sync, pick_key => RespTopic - }). - -remove_link(ClusterName) -> - Payload = #{<<"op">> => ?UNLINK_OP}, - Topic = ?CTRL_TOPIC(?MY_CLUSTER_NAME), - publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, #{}, Topic, ?QOS_0). - -publish_routes(QueryType, ClusterName, Topics) -> - %% Picks the same pool worker consistently. - %% Although, as writes are idompotent we can pick it randomly - TBD. - publish_routes(QueryType, ClusterName, ?DEFAULT_POOL_KEY, Topics). - -publish_routes(QueryType, ClusterName, PoolKey, Topics) -> - Payload = #{<<"op">> => ?BATCH_ROUTES_OP, <<"topics">> => Topics}, - publish(QueryType, ClusterName, PoolKey, Payload). - -cleanup_routes(ClusterName) -> - Payload = #{<<"op">> => ?CLEANUP_ROUTES_OP}, - publish(sync, ClusterName, ?DEFAULT_POOL_KEY, Payload, #{}, ?ROUTE_TOPIC, ?QOS_0). - -publish_route_op(QueryType, ClusterName, Op, Topic) when Op =:= <<"add">>; Op =:= <<"delete">> -> - Payload = ?TOPIC_WITH_OP(Op, Topic), - publish(QueryType, ClusterName, Topic, Payload). - -publish(QueryType, ClusterName, PoolKey, Payload) -> - publish(QueryType, ClusterName, PoolKey, Payload, #{}). - -publish(QueryType, ClusterName, PoolKey, Payload, Props) -> - %% Deletes are not implemented for now, writes are idempotent, so QOS_1 is fine. - publish(QueryType, ClusterName, PoolKey, Payload, Props, ?ROUTE_TOPIC, ?QOS_1). 
- -publish(async, ClusterName, PoolKey, Payload, Props, Topic, QoS) -> - ecpool:pick_and_do( - {?ROUTE_POOL_NAME(ClusterName), PoolKey}, - fun(ConnPid) -> - Ref = erlang:make_ref(), - Cb = {fun publish_result/3, [self(), Ref]}, - emqtt:publish_async( - ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}], ?PUB_TIMEOUT, Cb - ), - Ref - end, - no_handover - ); -publish(sync, ClusterName, PoolKey, Payload, Props, Topic, QoS) -> - ecpool:pick_and_do( - {?ROUTE_POOL_NAME(ClusterName), PoolKey}, - fun(ConnPid) -> - emqtt:publish(ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}]) - end, - no_handover - ). - -publish_result(Caller, Ref, Result) -> - case handle_send_result(Result) of - ok -> - %% avoid extra message passing, we only care about errors for now - ok; - Err -> - Caller ! {pub_result, Ref, Err} - end. - %%% New leader-less Syncer/Actor implementation publish_actor_init_sync(ClientPid, Actor, Incarnation) -> @@ -427,63 +290,6 @@ publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> }, emqtt:publish(ClientPid, PubTopic, ?ENCODE(Payload), ?QOS_1). -%%-------------------------------------------------------------------- -%% Protocol -%%-------------------------------------------------------------------- - -decode_ctrl_msg(Payload, ClusterName) -> - decode_ctrl_msg1(?DECODE(Payload), ClusterName). - -decode_ctrl_msg1( - #{ - <<"op">> := ?INIT_LINK_OP, - <<"proto_ver">> := ProtoVer, - <<"upstream">> := UpstreamName - }, - ClusterName -) -> - ProtoVer1 = decode_proto_ver(ProtoVer, ClusterName), - %% UpstreamName is the name the remote linked cluster refers to this cluster, - %% so it must equal to the local cluster name, more clear naming is desired... 
- MyClusterName = ?MY_CLUSTER_NAME, - case UpstreamName of - MyClusterName -> - {init_link, {ok, #{proto_ver => ProtoVer1}}}; - _ -> - ?SLOG(error, #{ - msg => "misconfigured_cluster_link_name", - %% How this cluster names itself - local_name => MyClusterName, - %% How the remote cluster names itself - link_name => ClusterName, - %% How the remote cluster names this local cluster - upstream_name => UpstreamName - }), - {init_link, {error, <<"bad_upstream_name">>}} - end; -decode_ctrl_msg1( - #{ - <<"op">> := ?ACK_LINK_OP, - <<"result">> := <<"ok">>, - <<"proto_ver">> := ProtoVer, - <<"need_bootstrap">> := IsBootstrapNeeded - }, - ClusterName -) -> - ProtoVer1 = decode_proto_ver(ProtoVer, ClusterName), - {ack_link, {ok, #{proto_ver => ProtoVer1, need_bootstrap => IsBootstrapNeeded}}}; -decode_ctrl_msg1( - #{ - <<"op">> := ?ACK_LINK_OP, - <<"result">> := <<"error">>, - <<"reason">> := Reason - }, - _ClusterName -) -> - {ack_link, {error, Reason}}; -decode_ctrl_msg1(#{<<"op">> := ?UNLINK_OP}, _ClusterName) -> - unlink. - decode_route_op(Payload) -> decode_route_op1(?DECODE(Payload)). @@ -501,14 +307,6 @@ decode_route_op1(#{ }) -> RouteOps1 = lists:map(fun(Op) -> decode_field(route, Op) end, RouteOps), {route_updates, #{actor => Actor, incarnation => Incr}, RouteOps1}; -%%decode_route_op1(<<"add_", Topic/binary>>) -> -%% {add, Topic}; -%%decode_route_op1(<<"delete_", Topic/binary>>) -> -%% {delete, Topic}; -%%decode_route_op1(#{<<"op">> := ?BATCH_ROUTES_OP, <<"topics">> := Topics}) when is_list(Topics) -> -%% {add, Topics}; -%%decode_route_op1(#{<<"op">> := ?CLEANUP_ROUTES_OP}) -> -%% cleanup_routes; decode_route_op1(Payload) -> ?SLOG(warning, #{ msg => "unexpected_cluster_link_route_op_payload", @@ -528,29 +326,6 @@ decode_forwarded_msg(Payload) -> {error, Payload} end. 
-decode_proto_ver(ProtoVer, ClusterName) -> - {MyMajor, MyMinor} = decode_proto_ver1(?PROTO_VER), - case decode_proto_ver1(ProtoVer) of - {Major, Minor} = Res when - Major > MyMajor; - Minor > MyMinor - -> - ?SLOG(notice, #{ - msg => "different_cluster_link_protocol_versions", - protocol_version => ?PROTO_VER, - link_protocol_version => ProtoVer, - link_name => ClusterName - }), - Res; - Res -> - Res - end. - -decode_proto_ver1(ProtoVer) -> - [Major, Minor] = binary:split(ProtoVer, <<".">>), - %% Let it fail (for now), we don't expect invalid data to pass through the linking protocol.. - {emqx_utils_conv:int(Major), emqx_utils_conv:int(Minor)}. - encode_field(route, {add, Route = {_Topic, _ID}}) -> Route; encode_field(route, {delete, {Topic, ID}}) -> @@ -573,38 +348,7 @@ forward(ClusterName, #delivery{message = #message{topic = Topic} = Msg}) -> %% Internal functions %%-------------------------------------------------------------------- -emqtt_client_opts( - ClientIdSuffix, #{server := Server, ssl := #{enable := EnableSsl} = Ssl} = ClusterConf -) -> - BaseClientId = maps:get(client_id, ClusterConf, ?MY_CLUSTER_NAME), - ClientId = ?CLIENTID(BaseClientId, ClientIdSuffix), - #{hostname := Host, port := Port} = emqx_schema:parse_server(Server, ?MQTT_HOST_OPTS), - Opts = #{ - host => Host, - port => Port, - clientid => ClientId, - proto_ver => v5, - ssl => EnableSsl, - ssl_opts => maps:to_list(maps:remove(enable, Ssl)) - }, - with_password(with_user(Opts, ClusterConf), ClusterConf). - -with_user(Opts, #{username := U} = _ClusterConf) -> - Opts#{username => U}; -with_user(Opts, _ClusterConf) -> - Opts. - -with_password(Opts, #{password := P} = _ClusterConf) -> - Opts#{password => emqx_secret:unwrap(P)}; -with_password(Opts, _ClusterConf) -> - Opts. 
- -start_pool(PoolName, ClientIdSuffix, #{pool_size := PoolSize} = ClusterConf) -> - ClientOpts = emqtt_client_opts(ClientIdSuffix, ClusterConf), - Opts = [ - {name, PoolName}, - {pool_size, PoolSize}, - {pool_type, hash}, - {client_opts, ClientOpts} - ], - ecpool:start_sup_pool(PoolName, ?MODULE, Opts). +emqtt_client_opts(ClientIdSuffix, ClusterConf) -> + #{clientid := BaseClientId} = Opts = emqx_cluster_link_config:mk_emqtt_options(ClusterConf), + ClientId = emqx_bridge_mqtt_lib:clientid_base([BaseClientId, ClientIdSuffix]), + Opts#{clientid => ClientId}. From b1aeb35370b7b546d6d22afa9704402f6d5bfcad Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 24 May 2024 18:49:37 +0300 Subject: [PATCH 13/46] feat(clusterlink): implement actor init handshake --- .../include/emqx_cluster_link.hrl | 6 + .../src/emqx_cluster_link.erl | 66 ++++++++- .../src/emqx_cluster_link_mqtt.erl | 75 +++++++---- .../src/emqx_cluster_link_router_syncer.erl | 125 +++++++++++++++--- 4 files changed, 221 insertions(+), 51 deletions(-) diff --git a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl index dd2544114..3ee7e9fdf 100644 --- a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl +++ b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl @@ -5,6 +5,12 @@ -define(TOPIC_PREFIX, "$LINK/cluster/"). -define(ROUTE_TOPIC_PREFIX, ?TOPIC_PREFIX "route/"). -define(MSG_TOPIC_PREFIX, ?TOPIC_PREFIX "msg/"). +-define(RESP_TOPIC_PREFIX, ?TOPIC_PREFIX "resp/"). + +-define(MY_CLUSTER_NAME, emqx_cluster_link_config:cluster()). +-define(ROUTE_TOPIC, <>). +-define(MSG_FWD_TOPIC, <>). +-define(RESP_TOPIC(Actor), <>). %% Fairly compact text encoding. -define(SHARED_ROUTE_ID(Topic, Group), <<"$s/", Group/binary, "/", Topic/binary>>). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 846204066..37456faea 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -83,11 +83,13 @@ should_route_to_external_dests(_Msg) -> %% EMQX Hooks %%-------------------------------------------------------------------- -on_message_publish(#message{topic = <>, payload = Payload}) -> +on_message_publish( + #message{topic = <>, payload = Payload} = Msg +) -> _ = case emqx_cluster_link_mqtt:decode_route_op(Payload) of - {actor_init, #{actor := Actor, incarnation := Incr}} -> - actor_init(ClusterName, Actor, Incr); + {actor_init, InitInfoMap} -> + actor_init(ClusterName, emqx_message:get_header(properties, Msg), InitInfoMap); {route_updates, #{actor := Actor, incarnation := Incr}, RouteOps} -> update_routes(ClusterName, Actor, Incr, RouteOps) end, @@ -137,9 +139,61 @@ topic_intersect_any(Topic, [LinkFilter | T]) -> topic_intersect_any(_Topic, []) -> false. -actor_init(ClusterName, Actor, Incarnation) -> - Env = #{timestamp => erlang:system_time(millisecond)}, - {ok, _} = emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incarnation, Env). +actor_init( + ClusterName, + #{'Correlation-Data' := ReqId, 'Response-Topic' := RespTopic}, + #{ + actor := Actor, + incarnation := Incr, + cluster := TargetCluster, + proto_ver := _ + } +) -> + Res = + case emqx_cluster_link_config:link(ClusterName) of + undefined -> + ?SLOG( + error, + #{ + msg => "init_link_request_from_unknown_cluster", + link_name => ClusterName + } + ), + %% Avoid atom error reasons, since they can be sent to the remote cluster, + %% which will use safe binary_to_term decoding + %% TODO: add error details? + {error, <<"unknown_cluster">>}; + LinkConf -> + %% TODO: may be worth checking resource health and communicate it? 
+ _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), + MyClusterName = emqx_cluster_link_config:cluster(), + case MyClusterName of + TargetCluster -> + Env = #{timestamp => erlang:system_time(millisecond)}, + {ok, _} = emqx_cluster_link_extrouter:actor_init( + ClusterName, Actor, Incr, Env + ), + ok; + _ -> + %% The remote cluster uses a different name to refer to this cluster + ?SLOG(error, #{ + msg => "misconfigured_cluster_link_name", + %% How this cluster names itself + local_name => MyClusterName, + %% How the remote cluster names this local cluster + remote_name => TargetCluster, + %% How the remote cluster names itself + received_from => ClusterName + }), + {error, <<"bad_remote_cluster_link_name">>} + end + end, + _ = actor_init_ack(Actor, Res, ReqId, RespTopic), + {stop, []}. + +actor_init_ack(Actor, Res, ReqId, RespTopic) -> + RespMsg = emqx_cluster_link_mqtt:actor_init_ack_resp_msg(Actor, Res, ReqId, RespTopic), + emqx_broker:publish(RespMsg). update_routes(ClusterName, Actor, Incarnation, RouteOps) -> ActorState = emqx_cluster_link_extrouter:actor_state(ClusterName, Actor, Incarnation), diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index d62965bb2..3ebfd6de6 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -29,11 +29,13 @@ ensure_msg_fwd_resource/1, stop_msg_fwd_resource/1, decode_route_op/1, - decode_forwarded_msg/1 + decode_forwarded_msg/1, + decode_resp/1 ]). -export([ - publish_actor_init_sync/3, + publish_actor_init_sync/6, + actor_init_ack_resp_msg/4, publish_route_sync/4, encode_field/2 ]). @@ -45,11 +47,6 @@ -define(MSG_CLIENTID_SUFFIX, ":msg:"). -define(MQTT_HOST_OPTS, #{default_port => 1883}). --define(MY_CLUSTER_NAME, emqx_cluster_link_config:cluster()). - --define(ROUTE_TOPIC, <>). --define(MSG_FWD_TOPIC, <>). -%%-define(CTRL_TOPIC(ClusterName), <>). 
-define(MSG_POOL_PREFIX, "emqx_cluster_link_mqtt:msg:"). -define(RES_NAME(Prefix, ClusterName), <>). @@ -58,8 +55,7 @@ -define(HEALTH_CHECK_TIMEOUT, 1000). -define(RES_GROUP, <<"emqx_cluster_link">>). -%% Protocol -%% -define(PROTO_VER, <<"1.0">>). +-define(PROTO_VER, 1). -define(DECODE(Payload), erlang:binary_to_term(Payload, [safe])). -define(ENCODE(Payload), erlang:term_to_binary(Payload)). @@ -67,10 +63,14 @@ -define(F_OPERATION, '$op'). -define(OP_ROUTE, <<"route">>). -define(OP_ACTOR_INIT, <<"actor_init">>). +-define(OP_ACTOR_INIT_ACK, <<"actor_init_ack">>). -define(F_ACTOR, 10). -define(F_INCARNATION, 11). -define(F_ROUTES, 12). +-define(F_TARGET_CLUSTER, 13). +-define(F_PROTO_VER, 14). +-define(F_RESULT, 15). -define(ROUTE_DELETE, 100). @@ -128,16 +128,6 @@ on_query(_ResourceId, FwdMsg, #{pool_name := PoolName, topic := LinkTopic} = _St end, no_handover ) - ); -on_query(_ResourceId, {Topic, Props, Payload, QoS}, #{pool_name := PoolName} = _State) -> - handle_send_result( - ecpool:pick_and_do( - {PoolName, Topic}, - fun(ConnPid) -> - emqtt:publish(ConnPid, Topic, Props, ?ENCODE(Payload), [{qos, QoS}]) - end, - no_handover - ) ). on_query_async( @@ -270,15 +260,36 @@ connect(Options) -> %%% New leader-less Syncer/Actor implementation -publish_actor_init_sync(ClientPid, Actor, Incarnation) -> - %% TODO: handshake (request / response) to make sure the link is established +publish_actor_init_sync(ClientPid, ReqId, RespTopic, TargetCluster, Actor, Incarnation) -> PubTopic = ?ROUTE_TOPIC, Payload = #{ ?F_OPERATION => ?OP_ACTOR_INIT, + ?F_PROTO_VER => ?PROTO_VER, + ?F_TARGET_CLUSTER => TargetCluster, ?F_ACTOR => Actor, ?F_INCARNATION => Incarnation }, - emqtt:publish(ClientPid, PubTopic, ?ENCODE(Payload), ?QOS_1). + Properties = #{ + 'Response-Topic' => RespTopic, + 'Correlation-Data' => ReqId + }, + emqtt:publish(ClientPid, PubTopic, Properties, ?ENCODE(Payload), [{qos, ?QOS_1}]). 
+ +actor_init_ack_resp_msg(Actor, InitRes, ReqId, RespTopic) -> + Payload = #{ + ?F_OPERATION => ?OP_ACTOR_INIT_ACK, + ?F_PROTO_VER => ?PROTO_VER, + ?F_ACTOR => Actor, + ?F_RESULT => InitRes + }, + emqx_message:make( + undefined, + ?QOS_1, + RespTopic, + ?ENCODE(Payload), + #{}, + #{properties => #{'Correlation-Data' => ReqId}} + ). publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> PubTopic = ?ROUTE_TOPIC, @@ -293,12 +304,22 @@ publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> decode_route_op(Payload) -> decode_route_op1(?DECODE(Payload)). +decode_resp(Payload) -> + decode_resp1(?DECODE(Payload)). + decode_route_op1(#{ ?F_OPERATION := ?OP_ACTOR_INIT, + ?F_PROTO_VER := ProtoVer, + ?F_TARGET_CLUSTER := TargetCluster, ?F_ACTOR := Actor, ?F_INCARNATION := Incr }) -> - {actor_init, #{actor => Actor, incarnation => Incr}}; + {actor_init, #{ + actor => Actor, + incarnation => Incr, + cluster => TargetCluster, + proto_ver => ProtoVer + }}; decode_route_op1(#{ ?F_OPERATION := ?OP_ROUTE, ?F_ACTOR := Actor, @@ -314,6 +335,14 @@ decode_route_op1(Payload) -> }), {error, Payload}. +decode_resp1(#{ + ?F_OPERATION := ?OP_ACTOR_INIT_ACK, + ?F_ACTOR := Actor, + ?F_PROTO_VER := ProtoVer, + ?F_RESULT := InitResult +}) -> + {actor_init_ack, #{actor => Actor, result => InitResult, proto_ver => ProtoVer}}. + decode_forwarded_msg(Payload) -> case ?DECODE(Payload) of #message{} = Msg -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 2a19a54b3..506eeb176 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -4,6 +4,8 @@ -module(emqx_cluster_link_router_syncer). -include_lib("emqtt/include/emqtt.hrl"). +-include_lib("emqx/include/logger.hrl"). +-include("emqx_cluster_link.hrl"). %% API -export([start_link/1]). @@ -50,9 +52,10 @@ -define(ERROR_DELAY, 200). 
-define(RECONNECT_TIMEOUT, 5_000). +-define(ACTOR_REINIT_TIMEOUT, 7000). --define(CLIENT_SUFFIX, ":routesync"). --define(PS_CLIENT_SUFFIX, ":routesync-ps"). +-define(CLIENT_SUFFIX, ":routesync:"). +-define(PS_CLIENT_SUFFIX, ":routesync-ps:"). %% Special actor for persistent routes that has the same actor name on all nodes. %% Node actors with the same name nay race with each other (e.g. during bootstrap), @@ -65,6 +68,21 @@ -define(PS_SYNCER_REF(Cluster), {via, gproc, ?PS_SYNCER_NAME(Cluster)}). -define(PS_SYNCER_NAME(Cluster), ?NAME(Cluster, ps_syncer)). +-define(SAFE_MQTT_PUB(Expr, ClientPid), ?SAFE_MQTT_PUB(Expr, ClientPid, ok)). +-define(SAFE_MQTT_PUB(Expr, ClientPid, OnSuccess), + try Expr of + {ok, #{reason_code := __RC}} when __RC < ?RC_UNSPECIFIED_ERROR -> + OnSuccess; + {ok, #{reason_code_name := __RCN}} -> + {error, {mqtt, __RCN}}; + {error, __Reason} -> + {error, __Reason} + catch + exit:__Reason -> + {error, {client, ClientPid, __Reason}} + end +). + push(TargetCluster, OpName, Topic, ID) -> do_push(?SYNCER_NAME(TargetCluster), OpName, Topic, ID). @@ -164,17 +182,11 @@ announce_client(Actor, TargetCluster, Pid) -> ok. publish_routes(ClientPid, Actor, Incarnation, Updates) -> - try emqx_cluster_link_mqtt:publish_route_sync(ClientPid, Actor, Incarnation, Updates) of - {ok, #{reason_code := RC}} when RC < ?RC_UNSPECIFIED_ERROR -> - #{}; - {ok, #{reason_code_name := RCN}} -> - {error, {mqtt, RCN}}; - {error, Reason} -> - {error, Reason} - catch - exit:Reason -> - {error, {client, ClientPid, Reason}} - end. + ?SAFE_MQTT_PUB( + emqx_cluster_link_mqtt:publish_route_sync(ClientPid, Actor, Incarnation, Updates), + ClientPid, + #{} + ). %% Route syncer @@ -227,6 +239,7 @@ batch_get_opname(Op) -> init({sup, TargetCluster}) -> %% FIXME: Intensity. SupFlags = #{ + %% TODO: one_for_one? 
strategy => one_for_all, intensity => 10, period => 60 @@ -288,7 +301,10 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> incarnation :: non_neg_integer(), client :: {pid(), reference()}, bootstrapped :: boolean(), - reconnect_timer :: reference() + reconnect_timer :: reference(), + actor_init_req_id :: binary(), + actor_init_timer :: reference(), + remote_actor_info :: undefined | map() }). mk_state(TargetCluster, Actor, Incarnation) -> @@ -314,27 +330,91 @@ handle_cast(_Request, State) -> handle_info({'EXIT', ClientPid, Reason}, St = #st{client = ClientPid}) -> {noreply, handle_client_down(Reason, St)}; -handle_info({timeout, TRef, _Reconnect}, St = #st{reconnect_timer = TRef}) -> +handle_info( + {publish, #{payload := Payload, properties := #{'Correlation-Data' := ReqId}}}, + St = #st{actor_init_req_id = ReqId} +) -> + {actor_init_ack, #{result := Res} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp(Payload), + St1 = St#st{ + actor_init_req_id = undefined, actor_init_timer = undefined, remote_actor_info = AckInfoMap + }, + case Res of + ok -> + {noreply, post_actor_init(St1)}; + Error -> + ?SLOG(error, #{ + msg => "failed_to_init_link", + reason => Error, + target_cluster => St#st.target, + actor => St#st.actor, + remote_link_proto_ver => maps:get(proto_ver, AckInfoMap, undefined) + }), + %% TODO: It doesn't fit permanent workers/one_for_all restart/strategy. + %% The actor may be kept alive with some error status instead (waiting for a user intervention to fix it)? + {stop, {shutdown, Error}, St1} + end; +handle_info({publish, #{}}, St) -> + {noreply, St}; +handle_info({timeout, TRef, reconnect}, St = #st{reconnect_timer = TRef}) -> {noreply, process_connect(St#st{reconnect_timer = undefined})}; -handle_info(_Info, St) -> - %% TODO: log? 
+handle_info({timeout, TRef, actor_reinit}, St = #st{reconnect_timer = TRef}) -> + ?SLOG(error, #{ + msg => "remote_actor_init_timeout", + target_cluster => St#st.target, + actor => St#st.actor + }), + {noreply, init_remote_actor(St#st{reconnect_timer = undefined})}; +%% Stale timeout. +handle_info({timeout, _, _}, St) -> + {noreply, St}; +handle_info(Info, St) -> + ?SLOG(warning, #{msg => "unexpected_info", info => Info}), {noreply, St}. terminate(_Reason, _State) -> ok. -process_connect(St = #st{target = TargetCluster, actor = Actor, incarnation = Incr}) -> +process_connect(St = #st{target = TargetCluster, actor = Actor}) -> case start_link_client(TargetCluster, Actor) of {ok, ClientPid} -> - %% TODO: error handling, handshake - {ok, _} = emqx_cluster_link_mqtt:publish_actor_init_sync(ClientPid, Actor, Incr), - ok = start_syncer(TargetCluster, Actor, Incr), ok = announce_client(Actor, TargetCluster, ClientPid), - process_bootstrap(St#st{client = ClientPid}); + %% TODO: handle subscribe errors + {ok, _, _} = emqtt:subscribe(ClientPid, ?RESP_TOPIC(Actor), ?QOS_1), + init_remote_actor(St#st{client = ClientPid}); {error, Reason} -> handle_connect_error(Reason, St) end. +init_remote_actor( + St = #st{target = TargetCluster, client = ClientPid, actor = Actor, incarnation = Incr} +) -> + ReqId = emqx_utils_conv:bin(emqx_utils:gen_id(16)), + Res = ?SAFE_MQTT_PUB( + emqx_cluster_link_mqtt:publish_actor_init_sync( + ClientPid, ReqId, ?RESP_TOPIC(Actor), TargetCluster, Actor, Incr + ), + ClientPid + ), + case Res of + ok -> + ok; + {error, Reason} -> + ?SLOG(error, #{ + msg => "failed_to_init_remote_actor", + reason => Reason, + target_cluster => TargetCluster, + actor => Actor + }) + end, + TRef = erlang:start_timer(?ACTOR_REINIT_TIMEOUT, self(), actor_reinit), + St#st{actor_init_req_id = ReqId, actor_init_timer = TRef}. 
+ +post_actor_init( + St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr} +) -> + ok = start_syncer(TargetCluster, Actor, Incr), + process_bootstrap(St#st{client = ClientPid}). + handle_connect_error(_Reason, St) -> %% TODO: logs TRef = erlang:start_timer(?RECONNECT_TIMEOUT, self(), reconnect), @@ -342,6 +422,7 @@ handle_connect_error(_Reason, St) -> handle_client_down(_Reason, St = #st{target = TargetCluster, actor = Actor}) -> %% TODO: logs + %% TODO: syncer may be already down due to one_for_all strategy ok = close_syncer(TargetCluster, Actor), process_connect(St#st{client = undefined}). From d578ac3f9ed7651208daa4865f699cb68b633be0 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Mon, 27 May 2024 19:52:50 +0300 Subject: [PATCH 14/46] fix(clusterlink): match correct timer ref in router actor --- apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 506eeb176..7d0a9db25 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -357,7 +357,7 @@ handle_info({publish, #{}}, St) -> {noreply, St}; handle_info({timeout, TRef, reconnect}, St = #st{reconnect_timer = TRef}) -> {noreply, process_connect(St#st{reconnect_timer = undefined})}; -handle_info({timeout, TRef, actor_reinit}, St = #st{reconnect_timer = TRef}) -> +handle_info({timeout, TRef, actor_reinit}, St = #st{actor_init_timer = TRef}) -> ?SLOG(error, #{ msg => "remote_actor_init_timeout", target_cluster => St#st.target, From d5e82cdfac45b811191e052b311522ec151ddd83 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Mon, 27 May 2024 19:54:53 +0300 Subject: [PATCH 15/46] refactor(clusterlink): avoid unnecessary `maybe_` external_broker CB names --- apps/emqx/src/emqx_broker.erl | 4 
+-- apps/emqx/src/emqx_external_broker.erl | 36 +++++++++---------- .../emqx_persistent_session_ds_subs.erl | 4 +-- apps/emqx/src/emqx_shared_sub.erl | 4 +-- .../src/emqx_cluster_link.erl | 24 ++++++------- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/apps/emqx/src/emqx_broker.erl b/apps/emqx/src/emqx_broker.erl index 5744f2e74..df6898470 100644 --- a/apps/emqx/src/emqx_broker.erl +++ b/apps/emqx/src/emqx_broker.erl @@ -690,9 +690,9 @@ sync_route(Action, Topic, ReplyTo) -> Res. external_sync_route(add, Topic) -> - emqx_external_broker:maybe_add_route(Topic); + emqx_external_broker:add_route(Topic); external_sync_route(delete, Topic) -> - emqx_external_broker:maybe_delete_route(Topic). + emqx_external_broker:delete_route(Topic). push_sync_route(Action, Topic, Opts) -> emqx_router_syncer:push(Action, Topic, node(), Opts). diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index acd4b8c3d..bf6448490 100644 --- a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -21,14 +21,14 @@ -callback should_route_to_external_dests(emqx_types:message()) -> boolean(). --callback maybe_add_route(emqx_types:topic()) -> ok. --callback maybe_delete_route(emqx_types:topic()) -> ok. +-callback add_route(emqx_types:topic()) -> ok. +-callback delete_route(emqx_types:topic()) -> ok. --callback maybe_add_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. --callback maybe_delete_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. +-callback add_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. +-callback delete_shared_route(emqx_types:topic(), emqx_types:group()) -> ok. --callback maybe_add_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. --callback maybe_delete_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. +-callback add_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. 
+-callback delete_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. -callback match_routes(emqx_types:topic()) -> [emqx_types:route()]. @@ -40,12 +40,12 @@ unregister_provider/1, forward/2, should_route_to_external_dests/1, - maybe_add_route/1, - maybe_delete_route/1, - maybe_add_shared_route/2, - maybe_delete_shared_route/2, - maybe_add_persistent_route/2, - maybe_delete_persistent_route/2, + add_route/1, + delete_route/1, + add_shared_route/2, + delete_shared_route/2, + add_persistent_route/2, + delete_persistent_route/2, match_routes/1 ]). @@ -117,22 +117,22 @@ forward(ExternalDest, Delivery) -> should_route_to_external_dests(Message) -> ?safe_with_provider(?FUNCTION_NAME(Message), false). -maybe_add_route(Topic) -> +add_route(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). -maybe_delete_route(Topic) -> +delete_route(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). -maybe_add_shared_route(Topic, Group) -> +add_shared_route(Topic, Group) -> ?safe_with_provider(?FUNCTION_NAME(Topic, Group), ok). -maybe_delete_shared_route(Topic, Group) -> +delete_shared_route(Topic, Group) -> ?safe_with_provider(?FUNCTION_NAME(Topic, Group), ok). -maybe_add_persistent_route(Topic, ID) -> +add_persistent_route(Topic, ID) -> ?safe_with_provider(?FUNCTION_NAME(Topic, ID), ok). -maybe_delete_persistent_route(Topic, ID) -> +delete_persistent_route(Topic, ID) -> ?safe_with_provider(?FUNCTION_NAME(Topic, ID), ok). 
match_routes(Topic) -> diff --git a/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl b/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl index fc86b67a6..0708a980c 100644 --- a/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl +++ b/apps/emqx/src/emqx_persistent_session_ds/emqx_persistent_session_ds_subs.erl @@ -92,7 +92,7 @@ on_subscribe(TopicFilter, SubOpts, #{id := SessionId, s := S0, props := Props}) case emqx_persistent_session_ds_state:n_subscriptions(S0) < MaxSubscriptions of true -> ok = emqx_persistent_session_ds_router:do_add_route(TopicFilter, SessionId), - _ = emqx_external_broker:maybe_add_persistent_route(TopicFilter, SessionId), + _ = emqx_external_broker:add_persistent_route(TopicFilter, SessionId), {SubId, S1} = emqx_persistent_session_ds_state:new_id(S0), {SStateId, S2} = emqx_persistent_session_ds_state:new_id(S1), SState = #{ @@ -155,7 +155,7 @@ on_unsubscribe(SessionId, TopicFilter, S0) -> #{session_id => SessionId, topic_filter => TopicFilter}, ok = emqx_persistent_session_ds_router:do_delete_route(TopicFilter, SessionId) ), - _ = emqx_external_broker:maybe_delete_persistent_route(TopicFilter, SessionId), + _ = emqx_external_broker:delete_persistent_route(TopicFilter, SessionId), {ok, emqx_persistent_session_ds_state:del_subscription(TopicFilter, S0), Subscription} end. 
diff --git a/apps/emqx/src/emqx_shared_sub.erl b/apps/emqx/src/emqx_shared_sub.erl index 4498523da..519ede132 100644 --- a/apps/emqx/src/emqx_shared_sub.erl +++ b/apps/emqx/src/emqx_shared_sub.erl @@ -425,7 +425,7 @@ handle_call({subscribe, Group, Topic, SubPid}, _From, State = #state{pmon = PMon ok; false -> ok = emqx_router:do_add_route(Topic, {Group, node()}), - _ = emqx_external_broker:maybe_add_shared_route(Topic, Group), + _ = emqx_external_broker:add_shared_route(Topic, Group), ok end, ok = maybe_insert_alive_tab(SubPid), @@ -550,7 +550,7 @@ is_alive_sub(Pid) -> delete_route_if_needed({Group, Topic} = GroupTopic) -> if_no_more_subscribers(GroupTopic, fun() -> ok = emqx_router:do_delete_route(Topic, {Group, node()}), - _ = emqx_external_broker:maybe_delete_shared_route(Topic, Group), + _ = emqx_external_broker:delete_shared_route(Topic, Group), ok end). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 37456faea..1d196aa91 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -9,12 +9,12 @@ -export([ register_external_broker/0, unregister_external_broker/0, - maybe_add_route/1, - maybe_delete_route/1, - maybe_add_shared_route/2, - maybe_delete_shared_route/2, - maybe_add_persistent_route/2, - maybe_delete_persistent_route/2, + add_route/1, + delete_route/1, + add_shared_route/2, + delete_shared_route/2, + add_persistent_route/2, + delete_persistent_route/2, match_routes/1, forward/2, should_route_to_external_dests/1 @@ -48,22 +48,22 @@ unregister_external_broker() -> %% to delete it on a remote cluster. %% There is no need to push Node name as this info can be derived from %% agent state on the remote cluster. -maybe_add_route(Topic) -> +add_route(Topic) -> maybe_push_route_op(add, Topic, Topic). -maybe_delete_route(Topic) -> +delete_route(Topic) -> maybe_push_route_op(delete, Topic, Topic). 
-maybe_add_shared_route(Topic, Group) -> +add_shared_route(Topic, Group) -> maybe_push_route_op(add, Topic, ?SHARED_ROUTE_ID(Topic, Group)). -maybe_delete_shared_route(Topic, Group) -> +delete_shared_route(Topic, Group) -> maybe_push_route_op(delete, Topic, ?SHARED_ROUTE_ID(Topic, Group)). -maybe_add_persistent_route(Topic, ID) -> +add_persistent_route(Topic, ID) -> maybe_push_route_op(add, Topic, ?PERSISTENT_ROUTE_ID(Topic, ID), push_persistent_route). -maybe_delete_persistent_route(Topic, ID) -> +delete_persistent_route(Topic, ID) -> maybe_push_route_op(delete, Topic, ?PERSISTENT_ROUTE_ID(Topic, ID), push_persistent_route). forward(DestCluster, Delivery) -> From 21711c6e0d0f0e68d9e6730c8452a6b3637fd080 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Tue, 28 May 2024 17:55:36 +0300 Subject: [PATCH 16/46] fix(clusterlink): communicate bootstrap requirements via actor handshake `session_present` flag is not reliable to decide whether bootstrap is needed if durable sessions is enabled. In this case, the client session may survive cluster restart while all the external routes are lost, as they are not persistent. 
--- .../src/emqx_cluster_link.erl | 5 +--- .../src/emqx_cluster_link_extrouter.erl | 18 ++++++++---- .../src/emqx_cluster_link_mqtt.erl | 25 ++++++++++++---- .../src/emqx_cluster_link_router_syncer.erl | 29 ++++++++----------- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 1d196aa91..7a64b0ff7 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -170,10 +170,7 @@ actor_init( case MyClusterName of TargetCluster -> Env = #{timestamp => erlang:system_time(millisecond)}, - {ok, _} = emqx_cluster_link_extrouter:actor_init( - ClusterName, Actor, Incr, Env - ), - ok; + emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incr, Env); _ -> %% The remote cluster uses a different name to refer to this cluster ?SLOG(error, #{ diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index bbec844df..9b6cf07e2 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -19,7 +19,8 @@ actor_state/3, actor_apply_operation/2, actor_apply_operation/3, - actor_gc/1 + actor_gc/1, + is_present_incarnation/1 ]). %% Internal API @@ -140,7 +141,8 @@ match_to_route(M) -> cluster :: cluster(), actor :: actor(), incarnation :: incarnation(), - lane :: lane() | undefined + lane :: lane() | undefined, + extra :: map() }). -type state() :: #state{}. @@ -159,6 +161,12 @@ actor_init(Cluster, Actor, Incarnation, Env = #{timestamp := Now}) -> actor_init(Cluster, Actor, Incarnation, Env) end. +-spec is_present_incarnation(state()) -> boolean(). +is_present_incarnation(#state{extra = #{is_present_incarnation := IsNew}}) -> + IsNew; +is_present_incarnation(_State) -> + false. 
+ mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> %% NOTE %% We perform this heavy-weight transaction only in the case of a new route @@ -173,7 +181,7 @@ mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of [#actor{incarnation = Incarnation, lane = Lane} = Rec] -> ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write), - {ok, State#state{lane = Lane}}; + {ok, State#state{lane = Lane, extra = #{is_present_incarnation => true}}}; [] -> Lane = mnesia_assign_lane(Cluster), Rec = #actor{ @@ -183,7 +191,7 @@ mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> until = bump_actor_ttl(TS) }, ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec, write), - {ok, State#state{lane = Lane}}; + {ok, State#state{lane = Lane, extra = #{is_present_incarnation => false}}}; [#actor{incarnation = Outdated} = Rec] when Incarnation > Outdated -> {reincarnate, Rec}; [#actor{incarnation = Newer}] -> @@ -321,7 +329,7 @@ mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = La clean_lane(Lane) -> ets:foldl( fun(#extroute{entry = Entry, mcounter = MCounter}, _) -> - apply_operation(Entry, MCounter, del, Lane) + apply_operation(Entry, MCounter, delete, Lane) end, 0, ?EXTROUTE_TAB diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 3ebfd6de6..db4e39224 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -71,6 +71,7 @@ -define(F_TARGET_CLUSTER, 13). -define(F_PROTO_VER, 14). -define(F_RESULT, 15). +-define(F_NEED_BOOTSTRAP, 16). -define(ROUTE_DELETE, 100). 
@@ -279,18 +280,29 @@ actor_init_ack_resp_msg(Actor, InitRes, ReqId, RespTopic) -> Payload = #{ ?F_OPERATION => ?OP_ACTOR_INIT_ACK, ?F_PROTO_VER => ?PROTO_VER, - ?F_ACTOR => Actor, - ?F_RESULT => InitRes + ?F_ACTOR => Actor }, + Payload1 = with_res_and_bootstrap(Payload, InitRes), emqx_message:make( undefined, ?QOS_1, RespTopic, - ?ENCODE(Payload), + ?ENCODE(Payload1), #{}, #{properties => #{'Correlation-Data' => ReqId}} ). +with_res_and_bootstrap(Payload, {ok, ActorState}) -> + Payload#{ + ?F_RESULT => ok, + ?F_NEED_BOOTSTRAP => not emqx_cluster_link_extrouter:is_present_incarnation(ActorState) + }; +with_res_and_bootstrap(Payload, Error) -> + Payload#{ + ?F_RESULT => Error, + ?F_NEED_BOOTSTRAP => false + }. + publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> PubTopic = ?ROUTE_TOPIC, Payload = #{ @@ -339,9 +351,12 @@ decode_resp1(#{ ?F_OPERATION := ?OP_ACTOR_INIT_ACK, ?F_ACTOR := Actor, ?F_PROTO_VER := ProtoVer, - ?F_RESULT := InitResult + ?F_RESULT := InitResult, + ?F_NEED_BOOTSTRAP := NeedBootstrap }) -> - {actor_init_ack, #{actor => Actor, result => InitResult, proto_ver => ProtoVer}}. + {actor_init_ack, #{ + actor => Actor, result => InitResult, proto_ver => ProtoVer, need_bootstrap => NeedBootstrap + }}. decode_forwarded_msg(Payload) -> case ?DECODE(Payload) of diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 7d0a9db25..0571ba099 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -164,14 +164,6 @@ refine_client_options(Options = #{clientid := ClientID}, Actor) -> retry_interval => 0 }. -client_session_present(ClientPid) -> - Info = emqtt:info(ClientPid), - %% FIXME: waitnig for emqtt release that fixes session_present type (must be a boolean) - case proplists:get_value(session_present, Info, 0) of - 0 -> false; - 1 -> true - end. 
- announce_client(Actor, TargetCluster, Pid) -> Name = case Actor of @@ -334,13 +326,15 @@ handle_info( {publish, #{payload := Payload, properties := #{'Correlation-Data' := ReqId}}}, St = #st{actor_init_req_id = ReqId} ) -> - {actor_init_ack, #{result := Res} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp(Payload), + {actor_init_ack, #{result := Res, need_bootstrap := NeedBootstrap} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp( + Payload + ), St1 = St#st{ actor_init_req_id = undefined, actor_init_timer = undefined, remote_actor_info = AckInfoMap }, case Res of ok -> - {noreply, post_actor_init(St1)}; + {noreply, post_actor_init(St1, NeedBootstrap)}; Error -> ?SLOG(error, #{ msg => "failed_to_init_link", @@ -410,10 +404,11 @@ init_remote_actor( St#st{actor_init_req_id = ReqId, actor_init_timer = TRef}. post_actor_init( - St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr} + St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr}, + NeedBootstrap ) -> ok = start_syncer(TargetCluster, Actor, Incr), - process_bootstrap(St#st{client = ClientPid}). + process_bootstrap(St#st{client = ClientPid}, NeedBootstrap). handle_connect_error(_Reason, St) -> %% TODO: logs @@ -426,14 +421,14 @@ handle_client_down(_Reason, St = #st{target = TargetCluster, actor = Actor}) -> ok = close_syncer(TargetCluster, Actor), process_connect(St#st{client = undefined}). -process_bootstrap(St = #st{bootstrapped = false}) -> +process_bootstrap(St = #st{bootstrapped = false}, _NeedBootstrap) -> run_bootstrap(St); -process_bootstrap(St = #st{client = ClientPid, bootstrapped = true}) -> - case client_session_present(ClientPid) of +process_bootstrap(St = #st{bootstrapped = true}, NeedBootstrap) -> + case NeedBootstrap of true -> - process_bootstrapped(St); + run_bootstrap(St); false -> - run_bootstrap(St) + process_bootstrapped(St) end. %% Bootstrapping. 
From faa4420e1f48991b6fa36c3582f89747d2014f31 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Tue, 28 May 2024 22:10:23 +0300 Subject: [PATCH 17/46] fix(clusterlink): improve actor error handling Add status and error reason to the actor state, report alarms. --- .../src/emqx_cluster_link_router_syncer.erl | 103 +++++++++++++----- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 0571ba099..3c87b9e37 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -296,7 +296,9 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> reconnect_timer :: reference(), actor_init_req_id :: binary(), actor_init_timer :: reference(), - remote_actor_info :: undefined | map() + remote_actor_info :: undefined | map(), + status :: connecting | connected | disconnected, + error :: undefined | term() }). mk_state(TargetCluster, Actor, Incarnation) -> @@ -304,7 +306,8 @@ mk_state(TargetCluster, Actor, Incarnation) -> target = TargetCluster, actor = Actor, incarnation = Incarnation, - bootstrapped = false + bootstrapped = false, + status = connecting }. init_actor(State = #st{}) -> @@ -334,18 +337,20 @@ handle_info( }, case Res of ok -> - {noreply, post_actor_init(St1, NeedBootstrap)}; + _ = maybe_deactivate_alarm(St), + {noreply, + post_actor_init(St1#st{error = undefined, status = connected}, NeedBootstrap)}; Error -> + Reason = error_reason(Error), ?SLOG(error, #{ msg => "failed_to_init_link", - reason => Error, + reason => Reason, target_cluster => St#st.target, actor => St#st.actor, remote_link_proto_ver => maps:get(proto_ver, AckInfoMap, undefined) }), - %% TODO: It doesn't fit permanent workers/one_for_all restart/strategy. - %% The actor may be kept alive with some error status instead (waiting for a user intervention to fix it)? 
- {stop, {shutdown, Error}, St1} + _ = maybe_alarm(Reason, St1), + {noreply, St1#st{error = Reason, status = disconnected}} end; handle_info({publish, #{}}, St) -> {noreply, St}; @@ -357,7 +362,10 @@ handle_info({timeout, TRef, actor_reinit}, St = #st{actor_init_timer = TRef}) -> target_cluster => St#st.target, actor => St#st.actor }), - {noreply, init_remote_actor(St#st{reconnect_timer = undefined})}; + Reason = init_timeout, + _ = maybe_alarm(Reason, St), + {noreply, + init_remote_actor(St#st{reconnect_timer = undefined, status = disconnected, error = Reason})}; %% Stale timeout. handle_info({timeout, _, _}, St) -> {noreply, St}; @@ -371,6 +379,7 @@ terminate(_Reason, _State) -> process_connect(St = #st{target = TargetCluster, actor = Actor}) -> case start_link_client(TargetCluster, Actor) of {ok, ClientPid} -> + _ = maybe_deactivate_alarm(St), ok = announce_client(Actor, TargetCluster, ClientPid), %% TODO: handle subscribe errors {ok, _, _} = emqtt:subscribe(ClientPid, ?RESP_TOPIC(Actor), ?QOS_1), @@ -389,19 +398,22 @@ init_remote_actor( ), ClientPid ), - case Res of - ok -> - ok; - {error, Reason} -> - ?SLOG(error, #{ - msg => "failed_to_init_remote_actor", - reason => Reason, - target_cluster => TargetCluster, - actor => Actor - }) - end, + St1 = + case Res of + ok -> + St#st{status = connecting}; + {error, Reason} -> + ?SLOG(error, #{ + msg => "cluster_link_init_failed", + reason => Reason, + target_cluster => TargetCluster, + actor => Actor + }), + _ = maybe_alarm(Reason, St), + St#st{error = Reason, status = disconnected} + end, TRef = erlang:start_timer(?ACTOR_REINIT_TIMEOUT, self(), actor_reinit), - St#st{actor_init_req_id = ReqId, actor_init_timer = TRef}. + St1#st{actor_init_req_id = ReqId, actor_init_timer = TRef}. 
post_actor_init( St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr}, @@ -410,16 +422,28 @@ post_actor_init( ok = start_syncer(TargetCluster, Actor, Incr), process_bootstrap(St#st{client = ClientPid}, NeedBootstrap). -handle_connect_error(_Reason, St) -> - %% TODO: logs +handle_connect_error(Reason, St) -> + ?SLOG(error, #{ + msg => "cluster_link_connection_failed", + reason => Reason, + target_cluster => St#st.target, + actor => St#st.actor + }), TRef = erlang:start_timer(?RECONNECT_TIMEOUT, self(), reconnect), - St#st{reconnect_timer = TRef}. + _ = maybe_alarm(Reason, St), + St#st{reconnect_timer = TRef, error = Reason, status = disconnected}. -handle_client_down(_Reason, St = #st{target = TargetCluster, actor = Actor}) -> - %% TODO: logs +handle_client_down(Reason, St = #st{target = TargetCluster, actor = Actor}) -> + ?SLOG(error, #{ + msg => "cluster_link_connection_failed", + reason => Reason, + target_cluster => St#st.target, + actor => St#st.actor + }), %% TODO: syncer may be already down due to one_for_all strategy ok = close_syncer(TargetCluster, Actor), - process_connect(St#st{client = undefined}). + _ = maybe_alarm(Reason, St), + process_connect(St#st{client = undefined, error = Reason, status = connecting}). process_bootstrap(St = #st{bootstrapped = false}, _NeedBootstrap) -> run_bootstrap(St); @@ -471,3 +495,30 @@ process_bootstrapped(St = #st{target = TargetCluster, actor = Actor}) -> process_bootstrap_batch(Batch, #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> publish_routes(ClientPid, Actor, Incarnation, Batch). + +error_reason({error, Reason}) -> + Reason; +error_reason(OtherErr) -> + OtherErr. 
+ +%% Assume that alarm is already active +maybe_alarm(Error, #st{error = Error}) -> + ok; +maybe_alarm(Error, St) -> + HrError = emqx_utils:readable_error_msg(error_reason(Error)), + Name = link_name(St), + emqx_alarm:safe_activate( + Name, + #{custer_link => Name, reason => cluster_link_down}, + <<"cluster link down: ", HrError/binary>> + ). + +maybe_deactivate_alarm(#st{error = undefined}) -> + ok; +maybe_deactivate_alarm(St) -> + emqx_alarm:safe_deactivate(link_name(St)). + +link_name(#st{actor = ?PS_ACTOR = Actor, target = Target}) -> + <<"cluster_link:", Target/binary, ":", (get_actor_id())/binary, ":", Actor/binary>>; +link_name(#st{actor = Actor, target = Target}) -> + <<"cluster_link:", Target/binary, ":", Actor/binary>>. From d4b449c6e1869976c19340fa6a5bd98c99d351c9 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 20 May 2024 11:38:07 +0200 Subject: [PATCH 18/46] feat(cluster-link): implement replication actor heartbeats --- .gitignore | 4 ++++ .../src/emqx_cluster_link.erl | 9 +++++++- .../src/emqx_cluster_link_extrouter.erl | 1 - .../src/emqx_cluster_link_mqtt.erl | 22 +++++++++++++++---- .../src/emqx_cluster_link_router_syncer.erl | 20 ++++++++++++++++- 5 files changed, 49 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 8d95669ac..e97571448 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,7 @@ rebar-git-cache.tar apps/emqx_utils/src/emqx_variform_parser.erl apps/emqx_utils/src/emqx_variform_scan.erl default-profile.mk +# local +/_compat +/scratch +SCRATCH diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 7a64b0ff7..515d8d125 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -91,7 +91,9 @@ on_message_publish( {actor_init, InitInfoMap} -> actor_init(ClusterName, emqx_message:get_header(properties, Msg), InitInfoMap); {route_updates, #{actor := Actor, incarnation := Incr}, 
RouteOps} -> - update_routes(ClusterName, Actor, Incr, RouteOps) + update_routes(ClusterName, Actor, Incr, RouteOps); + {heartbeat, #{actor := Actor, incarnation := Incr}} -> + actor_heartbeat(ClusterName, Actor, Incr) end, {stop, []}; on_message_publish(#message{topic = <>, payload = Payload}) -> @@ -201,6 +203,11 @@ update_routes(ClusterName, Actor, Incarnation, RouteOps) -> RouteOps ). +actor_heartbeat(ClusterName, Actor, Incarnation) -> + Env = #{timestamp => erlang:system_time(millisecond)}, + ActorState = emqx_cluster_link_extrouter:actor_state(ClusterName, Actor, Incarnation), + _State = emqx_cluster_link_extrouter:actor_apply_operation(heartbeat, ActorState, Env). + %% let it crash if extra is not a map, %% we don't expect the message to be forwarded from an older EMQX release, %% that doesn't set extra = #{} by default. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index 9b6cf07e2..6e4fff7c7 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -244,7 +244,6 @@ apply_operation(Entry, OpName, Lane) -> %% This is safe sequence of operations only on core nodes. On replicants, %% `mria:dirty_update_counter/3` will be replicated asynchronously, which %% means this read can be stale. - % MCounter = ets:lookup_element(Tab, Entry, 2, 0), case mnesia:dirty_read(?EXTROUTE_TAB, Entry) of [#extroute{mcounter = MCounter}] -> apply_operation(Entry, MCounter, OpName, Lane); diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index db4e39224..b126ce886 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -37,6 +37,7 @@ publish_actor_init_sync/6, actor_init_ack_resp_msg/4, publish_route_sync/4, + publish_heartbeat/3, encode_field/2 ]). 
@@ -62,6 +63,7 @@ -define(F_OPERATION, '$op'). -define(OP_ROUTE, <<"route">>). +-define(OP_HEARTBEAT, <<"heartbeat">>). -define(OP_ACTOR_INIT, <<"actor_init">>). -define(OP_ACTOR_INIT_ACK, <<"actor_init_ack">>). @@ -262,7 +264,6 @@ connect(Options) -> %%% New leader-less Syncer/Actor implementation publish_actor_init_sync(ClientPid, ReqId, RespTopic, TargetCluster, Actor, Incarnation) -> - PubTopic = ?ROUTE_TOPIC, Payload = #{ ?F_OPERATION => ?OP_ACTOR_INIT, ?F_PROTO_VER => ?PROTO_VER, @@ -274,7 +275,7 @@ publish_actor_init_sync(ClientPid, ReqId, RespTopic, TargetCluster, Actor, Incar 'Response-Topic' => RespTopic, 'Correlation-Data' => ReqId }, - emqtt:publish(ClientPid, PubTopic, Properties, ?ENCODE(Payload), [{qos, ?QOS_1}]). + emqtt:publish(ClientPid, ?ROUTE_TOPIC, Properties, ?ENCODE(Payload), [{qos, ?QOS_1}]). actor_init_ack_resp_msg(Actor, InitRes, ReqId, RespTopic) -> Payload = #{ @@ -304,14 +305,21 @@ with_res_and_bootstrap(Payload, Error) -> }. publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> - PubTopic = ?ROUTE_TOPIC, Payload = #{ ?F_OPERATION => ?OP_ROUTE, ?F_ACTOR => Actor, ?F_INCARNATION => Incarnation, ?F_ROUTES => Updates }, - emqtt:publish(ClientPid, PubTopic, ?ENCODE(Payload), ?QOS_1). + emqtt:publish(ClientPid, ?ROUTE_TOPIC, ?ENCODE(Payload), ?QOS_1). + +publish_heartbeat(ClientPid, Actor, Incarnation) -> + Payload = #{ + ?F_OPERATION => ?OP_HEARTBEAT, + ?F_ACTOR => Actor, + ?F_INCARNATION => Incarnation + }, + emqtt:publish_async(ClientPid, ?ROUTE_TOPIC, ?ENCODE(Payload), ?QOS_0, undefined). decode_route_op(Payload) -> decode_route_op1(?DECODE(Payload)). 
@@ -340,6 +348,12 @@ decode_route_op1(#{ }) -> RouteOps1 = lists:map(fun(Op) -> decode_field(route, Op) end, RouteOps), {route_updates, #{actor => Actor, incarnation => Incr}, RouteOps1}; +decode_route_op1(#{ + ?F_OPERATION := ?OP_HEARTBEAT, + ?F_ACTOR := Actor, + ?F_INCARNATION := Incr +}) -> + {heartbeat, #{actor => Actor, incarnation => Incr}}; decode_route_op1(Payload) -> ?SLOG(warning, #{ msg => "unexpected_cluster_link_route_op_payload", diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 3c87b9e37..dc3903f72 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -53,6 +53,7 @@ -define(RECONNECT_TIMEOUT, 5_000). -define(ACTOR_REINIT_TIMEOUT, 7000). +-define(HEARTBEAT_INTERVAL, 10_000). -define(CLIENT_SUFFIX, ":routesync:"). -define(PS_CLIENT_SUFFIX, ":routesync-ps:"). @@ -180,6 +181,10 @@ publish_routes(ClientPid, Actor, Incarnation, Updates) -> #{} ). +publish_heartbeat(ClientPid, Actor, Incarnation) -> + %% NOTE: Fully asynchronous, no need for error handling. + emqx_cluster_link_mqtt:publish_heartbeat(ClientPid, Actor, Incarnation). 
+ %% Route syncer start_syncer(TargetCluster, Actor, Incr) -> @@ -294,6 +299,7 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> client :: {pid(), reference()}, bootstrapped :: boolean(), reconnect_timer :: reference(), + heartbeat_timer :: reference(), actor_init_req_id :: binary(), actor_init_timer :: reference(), remote_actor_info :: undefined | map(), @@ -366,6 +372,8 @@ handle_info({timeout, TRef, actor_reinit}, St = #st{actor_init_timer = TRef}) -> _ = maybe_alarm(Reason, St), {noreply, init_remote_actor(St#st{reconnect_timer = undefined, status = disconnected, error = Reason})}; +handle_info({timeout, TRef, _Heartbeat}, St = #st{heartbeat_timer = TRef}) -> + {noreply, process_heartbeat(St#st{heartbeat_timer = undefined})}; %% Stale timeout. handle_info({timeout, _, _}, St) -> {noreply, St}; @@ -420,7 +428,9 @@ post_actor_init( NeedBootstrap ) -> ok = start_syncer(TargetCluster, Actor, Incr), - process_bootstrap(St#st{client = ClientPid}, NeedBootstrap). + %% TODO: Heartbeats are currently blocked by bootstrapping. + NSt = schedule_heartbeat(St#st{client = ClientPid}), + process_bootstrap(NSt, NeedBootstrap). handle_connect_error(Reason, St) -> ?SLOG(error, #{ @@ -455,6 +465,14 @@ process_bootstrap(St = #st{bootstrapped = true}, NeedBootstrap) -> process_bootstrapped(St) end. +process_heartbeat(St = #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> + ok = publish_heartbeat(ClientPid, Actor, Incarnation), + schedule_heartbeat(St). + +schedule_heartbeat(St = #st{heartbeat_timer = undefined}) -> + TRef = erlang:start_timer(?HEARTBEAT_INTERVAL, self(), heartbeat), + St#st{heartbeat_timer = TRef}. + %% Bootstrapping. %% Responsible for transferring local routing table snapshot to the target %% cluster. 
Does so either during the initial startup or when MQTT connection From 5771a41a32fd2b4458cbba051bf10b709d0d56b9 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 20 May 2024 11:44:20 +0200 Subject: [PATCH 19/46] fix(cluster-link): ensure replication actor bootstraps do heartbeats --- .../src/emqx_cluster_link_router_syncer.erl | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index dc3903f72..9d35bb812 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -389,8 +389,6 @@ process_connect(St = #st{target = TargetCluster, actor = Actor}) -> {ok, ClientPid} -> _ = maybe_deactivate_alarm(St), ok = announce_client(Actor, TargetCluster, ClientPid), - %% TODO: handle subscribe errors - {ok, _, _} = emqtt:subscribe(ClientPid, ?RESP_TOPIC(Actor), ?QOS_1), init_remote_actor(St#st{client = ClientPid}); {error, Reason} -> handle_connect_error(Reason, St) @@ -400,6 +398,8 @@ init_remote_actor( St = #st{target = TargetCluster, client = ClientPid, actor = Actor, incarnation = Incr} ) -> ReqId = emqx_utils_conv:bin(emqx_utils:gen_id(16)), + %% TODO: handle subscribe errors + {ok, _, _} = emqtt:subscribe(ClientPid, ?RESP_TOPIC(Actor), ?QOS_1), Res = ?SAFE_MQTT_PUB( emqx_cluster_link_mqtt:publish_actor_init_sync( ClientPid, ReqId, ?RESP_TOPIC(Actor), TargetCluster, Actor, Incr @@ -428,7 +428,6 @@ post_actor_init( NeedBootstrap ) -> ok = start_syncer(TargetCluster, Actor, Incr), - %% TODO: Heartbeats are currently blocked by bootstrapping. NSt = schedule_heartbeat(St#st{client = ClientPid}), process_bootstrap(NSt, NeedBootstrap). @@ -500,7 +499,8 @@ run_bootstrap(Bootstrap, St) -> %% TODO: Better error handling. 
case process_bootstrap_batch(Batch, St) of #{} -> - run_bootstrap(NBootstrap, St); + NSt = ensure_bootstrap_heartbeat(St), + run_bootstrap(NBootstrap, NSt); {error, {client, _, _}} -> %% Client has exited, let `reconnect` codepath handle it. St @@ -514,6 +514,17 @@ process_bootstrapped(St = #st{target = TargetCluster, actor = Actor}) -> process_bootstrap_batch(Batch, #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> publish_routes(ClientPid, Actor, Incarnation, Batch). +ensure_bootstrap_heartbeat(St = #st{heartbeat_timer = TRef}) -> + case erlang:read_timer(TRef) of + false -> + ok = emqx_utils:cancel_timer(TRef), + process_heartbeat(St); + _TimeLeft -> + St + end. + +%% + error_reason({error, Reason}) -> Reason; error_reason(OtherErr) -> From 43d114546c3c348b450d5f835b9c2269d5c909fe Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 20 May 2024 11:55:56 +0200 Subject: [PATCH 20/46] feat(cluster-link): preserve replication actor state in pdict --- .../src/emqx_cluster_link.erl | 131 ++++++++++-------- .../src/emqx_cluster_link_mqtt.erl | 17 ++- 2 files changed, 82 insertions(+), 66 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 515d8d125..2b469b114 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -86,15 +86,16 @@ should_route_to_external_dests(_Msg) -> on_message_publish( #message{topic = <>, payload = Payload} = Msg ) -> - _ = - case emqx_cluster_link_mqtt:decode_route_op(Payload) of - {actor_init, InitInfoMap} -> - actor_init(ClusterName, emqx_message:get_header(properties, Msg), InitInfoMap); - {route_updates, #{actor := Actor, incarnation := Incr}, RouteOps} -> - update_routes(ClusterName, Actor, Incr, RouteOps); - {heartbeat, #{actor := Actor, incarnation := Incr}} -> - actor_heartbeat(ClusterName, Actor, Incr) - end, + case emqx_cluster_link_mqtt:decode_route_op(Payload) of + 
{actor_init, Actor, InitInfo} -> + Result = actor_init(ClusterName, Actor, InitInfo), + _ = actor_init_ack(Actor, Result, Msg), + ok; + {route_updates, #{actor := Actor}, RouteOps} -> + ok = update_routes(ClusterName, Actor, RouteOps); + {heartbeat, #{actor := Actor}} -> + ok = actor_heartbeat(ClusterName, Actor) + end, {stop, []}; on_message_publish(#message{topic = <>, payload = Payload}) -> case emqx_cluster_link_mqtt:decode_forwarded_msg(Payload) of @@ -117,6 +118,9 @@ delete_hook() -> %% Internal functions %%-------------------------------------------------------------------- +-define(PD_EXTROUTER_ACTOR, '$clink_extrouter_actor'). +-define(PD_EXTROUTER_ACTOR_STATE, '$clink_extrouter_actor_state'). + maybe_push_route_op(Op, Topic, RouteID) -> maybe_push_route_op(Op, Topic, RouteID, push). @@ -143,70 +147,79 @@ topic_intersect_any(_Topic, []) -> actor_init( ClusterName, - #{'Correlation-Data' := ReqId, 'Response-Topic' := RespTopic}, + #{actor := Actor, incarnation := Incr}, #{ - actor := Actor, - incarnation := Incr, - cluster := TargetCluster, + target_cluster := TargetCluster, proto_ver := _ } ) -> - Res = - case emqx_cluster_link_config:link(ClusterName) of - undefined -> - ?SLOG( - error, - #{ - msg => "init_link_request_from_unknown_cluster", - link_name => ClusterName - } - ), - %% Avoid atom error reasons, since they can be sent to the remote cluster, - %% which will use safe binary_to_term decoding - %% TODO: add error details? - {error, <<"unknown_cluster">>}; - LinkConf -> - %% TODO: may be worth checking resource health and communicate it? 
- _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), - MyClusterName = emqx_cluster_link_config:cluster(), - case MyClusterName of - TargetCluster -> - Env = #{timestamp => erlang:system_time(millisecond)}, - emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incr, Env); - _ -> - %% The remote cluster uses a different name to refer to this cluster - ?SLOG(error, #{ - msg => "misconfigured_cluster_link_name", - %% How this cluster names itself - local_name => MyClusterName, - %% How the remote cluster names this local cluster - remote_name => TargetCluster, - %% How the remote cluster names itself - received_from => ClusterName - }), - {error, <<"bad_remote_cluster_link_name">>} - end - end, - _ = actor_init_ack(Actor, Res, ReqId, RespTopic), - {stop, []}. + case emqx_cluster_link_config:link(ClusterName) of + undefined -> + ?SLOG(error, #{ + msg => "init_link_request_from_unknown_cluster", + link_name => ClusterName + }), + %% Avoid atom error reasons, since they can be sent to the remote cluster, + %% which will use safe binary_to_term decoding + %% TODO: add error details? + {error, <<"unknown_cluster">>}; + LinkConf -> + %% TODO: may be worth checking resource health and communicate it? 
+ _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), + MyClusterName = emqx_cluster_link_config:cluster(), + case MyClusterName of + TargetCluster -> + Env = #{timestamp => erlang:system_time(millisecond)}, + {ok, ActorSt} = emqx_cluster_link_extrouter:actor_init( + ClusterName, Actor, Incr, Env + ), + undefined = set_actor_state(ClusterName, Actor, ActorSt), + ok; + _ -> + %% The remote cluster uses a different name to refer to this cluster + ?SLOG(error, #{ + msg => "misconfigured_cluster_link_name", + %% How this cluster names itself + local_name => MyClusterName, + %% How the remote cluster names this local cluster + remote_name => TargetCluster, + %% How the remote cluster names itself + received_from => ClusterName + }), + {error, <<"bad_remote_cluster_link_name">>} + end + end. -actor_init_ack(Actor, Res, ReqId, RespTopic) -> - RespMsg = emqx_cluster_link_mqtt:actor_init_ack_resp_msg(Actor, Res, ReqId, RespTopic), +actor_init_ack(#{actor := Actor}, Res, MsgIn) -> + RespMsg = emqx_cluster_link_mqtt:actor_init_ack_resp_msg(Actor, Res, MsgIn), emqx_broker:publish(RespMsg). -update_routes(ClusterName, Actor, Incarnation, RouteOps) -> - ActorState = emqx_cluster_link_extrouter:actor_state(ClusterName, Actor, Incarnation), +update_routes(ClusterName, Actor, RouteOps) -> + ActorSt = get_actor_state(ClusterName, Actor), lists:foreach( fun(RouteOp) -> - emqx_cluster_link_extrouter:actor_apply_operation(RouteOp, ActorState) + emqx_cluster_link_extrouter:actor_apply_operation(RouteOp, ActorSt) end, RouteOps ). -actor_heartbeat(ClusterName, Actor, Incarnation) -> +actor_heartbeat(ClusterName, Actor) -> Env = #{timestamp => erlang:system_time(millisecond)}, - ActorState = emqx_cluster_link_extrouter:actor_state(ClusterName, Actor, Incarnation), - _State = emqx_cluster_link_extrouter:actor_apply_operation(heartbeat, ActorState, Env). 
+ ActorSt0 = get_actor_state(ClusterName, Actor), + ActorSt = emqx_cluster_link_extrouter:actor_apply_operation(heartbeat, ActorSt0, Env), + _ = update_actor_state(ActorSt), + ok. + +get_actor_state(ClusterName, Actor) -> + {ClusterName, Actor} = erlang:get(?PD_EXTROUTER_ACTOR), + erlang:get(?PD_EXTROUTER_ACTOR_STATE). + +set_actor_state(ClusterName, Actor, ActorSt) -> + undefined = erlang:put(?PD_EXTROUTER_ACTOR, {ClusterName, Actor}), + update_actor_state(ActorSt). + +update_actor_state(ActorSt) -> + erlang:put(?PD_EXTROUTER_ACTOR_STATE, ActorSt). %% let it crash if extra is not a map, %% we don't expect the message to be forwarded from an older EMQX release, diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index b126ce886..6091b6ffc 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -35,7 +35,7 @@ -export([ publish_actor_init_sync/6, - actor_init_ack_resp_msg/4, + actor_init_ack_resp_msg/3, publish_route_sync/4, publish_heartbeat/3, encode_field/2 @@ -277,13 +277,17 @@ publish_actor_init_sync(ClientPid, ReqId, RespTopic, TargetCluster, Actor, Incar }, emqtt:publish(ClientPid, ?ROUTE_TOPIC, Properties, ?ENCODE(Payload), [{qos, ?QOS_1}]). 
-actor_init_ack_resp_msg(Actor, InitRes, ReqId, RespTopic) -> +actor_init_ack_resp_msg(Actor, InitRes, MsgIn) -> Payload = #{ ?F_OPERATION => ?OP_ACTOR_INIT_ACK, ?F_PROTO_VER => ?PROTO_VER, ?F_ACTOR => Actor }, Payload1 = with_res_and_bootstrap(Payload, InitRes), + #{ + 'Response-Topic' := RespTopic, + 'Correlation-Data' := ReqId + } = emqx_message:get_header(properties, MsgIn), emqx_message:make( undefined, ?QOS_1, @@ -334,12 +338,11 @@ decode_route_op1(#{ ?F_ACTOR := Actor, ?F_INCARNATION := Incr }) -> - {actor_init, #{ - actor => Actor, - incarnation => Incr, - cluster => TargetCluster, + Info = #{ + target_cluster => TargetCluster, proto_ver => ProtoVer - }}; + }, + {actor_init, #{actor => Actor, incarnation => Incr}, Info}; decode_route_op1(#{ ?F_OPERATION := ?OP_ROUTE, ?F_ACTOR := Actor, From 45eda4f3b956a3c0c12d1d4f592fdc8ecf7d4970 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 20 May 2024 16:20:02 +0200 Subject: [PATCH 21/46] fix(cluster-link): adapt heartbeat / reincarnation handling to new API --- .../src/emqx_cluster_link_extrouter.erl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index 6e4fff7c7..f060f4c56 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -229,10 +229,11 @@ actor_apply_operation( State; actor_apply_operation( heartbeat, - State = #state{actor = Actor, incarnation = Incarnation}, + State = #state{cluster = Cluster, actor = Actor, incarnation = Incarnation}, _Env = #{timestamp := Now} ) -> - ok = transaction(fun ?MODULE:mnesia_actor_heartbeat/3, [Actor, Incarnation, Now]), + ActorID = ?ACTOR_ID(Cluster, Actor), + ok = transaction(fun ?MODULE:mnesia_actor_heartbeat/3, [ActorID, Incarnation, Now]), State. 
apply_actor_operation(ActorID, Incarnation, Entry, OpName, Lane) -> @@ -303,14 +304,14 @@ select_cluster_lanes(Cluster) -> MS = [{#actor{id = {Cluster, '_'}, lane = '$1', _ = '_'}, [], ['$1']}], mnesia:select(?EXTROUTE_ACTOR_TAB, MS, write). -mnesia_actor_heartbeat(Actor, Incarnation, TS) -> - case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of +mnesia_actor_heartbeat(ActorID, Incarnation, TS) -> + case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of [#actor{incarnation = Incarnation} = Rec] -> ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write); [#actor{incarnation = Outdated}] -> - mnesia:abort({outdated_incarnation_actor, Actor, Incarnation, Outdated}); + mnesia:abort({outdated_incarnation_actor, ActorID, Incarnation, Outdated}); [] -> - mnesia:abort({nonexistent_actor, Actor}) + mnesia:abort({nonexistent_actor, ActorID}) end. clean_incarnation(Rec) -> From 036c7e8492eaaef6d3e0d40d253cc9a6d3137ea0 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 20 May 2024 16:21:03 +0200 Subject: [PATCH 22/46] test(cluster-link): adapt extrouter testsuite to new APIs --- .../emqx_cluster_link_extrouter_SUITE.erl | 138 ++++++++++-------- 1 file changed, 74 insertions(+), 64 deletions(-) diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl index fffca47c7..9f80109fd 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl @@ -8,9 +8,13 @@ -include_lib("eunit/include/eunit.hrl"). -include_lib("emqx/include/asserts.hrl"). +-include_lib("emqx/include/emqx.hrl"). + -compile(export_all). -compile(nowarn_export_all). +-define(CLUSTER, <<"link1">>). 
+ %% all() -> @@ -45,32 +49,32 @@ t_consistent_routing_view(_Config) -> Actor1 = {?FUNCTION_NAME, 1}, Actor2 = {?FUNCTION_NAME, 2}, Actor3 = {?FUNCTION_NAME, 3}, - {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()), - {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()), - {ok, AS30} = emqx_cluster_link_extrouter:actor_init(Actor3, 1, env()), + {ok, AS10} = actor_init(Actor1, 1), + {ok, AS20} = actor_init(Actor2, 1), + {ok, AS30} = actor_init(Actor3, 1), %% Add few routes originating from different actors. %% Also test that route operations are idempotent. - AS11 = apply_operation({add, <<"t/client/#">>, id}, AS10), - _AS11 = apply_operation({add, <<"t/client/#">>, id}, AS10), - AS21 = apply_operation({add, <<"t/client/#">>, id}, AS20), - AS31 = apply_operation({add, <<"t/client/+/+">>, id1}, AS30), - AS32 = apply_operation({add, <<"t/client/+/+">>, id2}, AS31), - _AS22 = apply_operation({del, <<"t/client/#">>, id}, AS21), - AS12 = apply_operation({add, <<"t/client/+/+">>, id1}, AS11), - AS33 = apply_operation({del, <<"t/client/+/+">>, id1}, AS32), - _AS34 = apply_operation({del, <<"t/client/+/+">>, id2}, AS33), + AS11 = apply_operation({add, {<<"t/client/#">>, id}}, AS10), + _AS11 = apply_operation({add, {<<"t/client/#">>, id}}, AS10), + AS21 = apply_operation({add, {<<"t/client/#">>, id}}, AS20), + AS31 = apply_operation({add, {<<"t/client/+/+">>, id1}}, AS30), + AS32 = apply_operation({add, {<<"t/client/+/+">>, id2}}, AS31), + _AS22 = apply_operation({delete, {<<"t/client/#">>, id}}, AS21), + AS12 = apply_operation({add, {<<"t/client/+/+">>, id1}}, AS11), + AS33 = apply_operation({delete, {<<"t/client/+/+">>, id1}}, AS32), + _AS34 = apply_operation({delete, {<<"t/client/+/+">>, id2}}, AS33), ?assertEqual( [<<"t/client/#">>, <<"t/client/+/+">>], topics_sorted() ), ?assertEqual( - [<<"t/client/#">>], - lists:sort(emqx_cluster_link_extrouter:match_routes(<<"t/client/42">>)) + [#route{topic = <<"t/client/#">>, dest = ?CLUSTER}], + 
emqx_cluster_link_extrouter:match_routes(<<"t/client/42">>) ), %% Remove all routes from the actors. - AS13 = apply_operation({del, <<"t/client/#">>, id}, AS12), - AS14 = apply_operation({del, <<"t/client/+/+">>, id1}, AS13), - AS14 = apply_operation({del, <<"t/client/+/+">>, id1}, AS13), + AS13 = apply_operation({delete, {<<"t/client/#">>, id}}, AS12), + AS14 = apply_operation({delete, {<<"t/client/+/+">>, id1}}, AS13), + AS14 = apply_operation({delete, {<<"t/client/+/+">>, id1}}, AS13), ?assertEqual( [], topics_sorted() @@ -79,28 +83,28 @@ t_consistent_routing_view(_Config) -> t_actor_reincarnation(_Config) -> Actor1 = {?FUNCTION_NAME, 1}, Actor2 = {?FUNCTION_NAME, 2}, - {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()), - {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()), - AS11 = apply_operation({add, <<"topic/#">>, id}, AS10), - AS12 = apply_operation({add, <<"topic/42/+">>, id}, AS11), - AS21 = apply_operation({add, <<"topic/#">>, id}, AS20), + {ok, AS10} = actor_init(Actor1, 1), + {ok, AS20} = actor_init(Actor2, 1), + AS11 = apply_operation({add, {<<"topic/#">>, id}}, AS10), + AS12 = apply_operation({add, {<<"topic/42/+">>, id}}, AS11), + AS21 = apply_operation({add, {<<"topic/#">>, id}}, AS20), ?assertEqual( [<<"topic/#">>, <<"topic/42/+">>], topics_sorted() ), - {ok, _AS3} = emqx_cluster_link_extrouter:actor_init(Actor1, 2, env()), + {ok, _AS3} = actor_init(Actor1, 2), ?assertError( _IncarnationMismatch, - apply_operation({add, <<"toolate/#">>, id}, AS12) + apply_operation({add, {<<"toolate/#">>, id}}, AS12) ), ?assertEqual( [<<"topic/#">>], topics_sorted() ), - {ok, _AS4} = emqx_cluster_link_extrouter:actor_init(Actor2, 2, env()), + {ok, _AS4} = actor_init(Actor2, 2), ?assertError( _IncarnationMismatch, - apply_operation({add, <<"toolate/#">>, id}, AS21) + apply_operation({add, {<<"toolate/#">>, id}}, AS21) ), ?assertEqual( [], @@ -110,11 +114,11 @@ t_actor_reincarnation(_Config) -> t_actor_gc(_Config) -> Actor1 = 
{?FUNCTION_NAME, 1}, Actor2 = {?FUNCTION_NAME, 2}, - {ok, AS10} = emqx_cluster_link_extrouter:actor_init(Actor1, 1, env()), - {ok, AS20} = emqx_cluster_link_extrouter:actor_init(Actor2, 1, env()), - AS11 = apply_operation({add, <<"topic/#">>, id}, AS10), - AS12 = apply_operation({add, <<"topic/42/+">>, id}, AS11), - AS21 = apply_operation({add, <<"global/#">>, id}, AS20), + {ok, AS10} = actor_init(Actor1, 1), + {ok, AS20} = actor_init(Actor2, 1), + AS11 = apply_operation({add, {<<"topic/#">>, id}}, AS10), + AS12 = apply_operation({add, {<<"topic/42/+">>, id}}, AS11), + AS21 = apply_operation({add, {<<"global/#">>, id}}, AS20), ?assertEqual( [<<"global/#">>, <<"topic/#">>, <<"topic/42/+">>], topics_sorted() @@ -127,7 +131,7 @@ t_actor_gc(_Config) -> ), ?assertError( _IncarnationMismatch, - apply_operation({add, <<"toolate/#">>, id}, AS21) + apply_operation({add, {<<"toolate/#">>, id}}, AS21) ), ok = emqx_cluster_link_extrouter:actor_gc(env(120_000)), ?assertEqual( @@ -138,25 +142,25 @@ t_actor_gc(_Config) -> t_consistent_routing_view_concurrent_updates(_Config) -> A1Seq = repeat(10, [ reincarnate, - {add, <<"t/client/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id1}, - {del, <<"t/client/#">>, id} + {add, {<<"t/client/#">>, id}}, + {add, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id1}}, + {delete, {<<"t/client/#">>, id}} ]), A2Seq = repeat(10, [ - {add, <<"global/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id2}, - {del, <<"t/client/+/+">>, id1}, + {add, {<<"global/#">>, id}}, + {add, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id2}}, + {delete, {<<"t/client/+/+">>, id1}}, heartbeat ]), A3Seq = repeat(10, [ - {add, <<"global/#">>, id}, - {del, <<"global/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {del, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id2}, - {del, <<"t/client/+/+">>, id2}, + {add, {<<"global/#">>, id}}, + {delete, {<<"global/#">>, id}}, + {add, 
{<<"t/client/+/+">>, id1}}, + {delete, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id2}}, + {delete, {<<"t/client/+/+">>, id2}}, reincarnate ]), A4Seq = repeat(10, [ @@ -197,25 +201,25 @@ t_consistent_routing_view_concurrent_cluster_updates(Config) -> [N1, N2, N3] = ?config(cluster, Config), A1Seq = repeat(10, [ reincarnate, - {add, <<"t/client/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id1}, - {del, <<"t/client/#">>, id} + {add, {<<"t/client/#">>, id}}, + {add, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id1}}, + {delete, {<<"t/client/#">>, id}} ]), A2Seq = repeat(10, [ - {add, <<"global/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id2}, - {del, <<"t/client/+/+">>, id1}, + {add, {<<"global/#">>, id}}, + {add, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id2}}, + {delete, {<<"t/client/+/+">>, id1}}, heartbeat ]), A3Seq = repeat(10, [ - {add, <<"global/#">>, id}, - {del, <<"global/#">>, id}, - {add, <<"t/client/+/+">>, id1}, - {del, <<"t/client/+/+">>, id1}, - {add, <<"t/client/+/+">>, id2}, - {del, <<"t/client/+/+">>, id2}, + {add, {<<"global/#">>, id}}, + {delete, {<<"global/#">>, id}}, + {add, {<<"t/client/+/+">>, id1}}, + {delete, {<<"t/client/+/+">>, id1}}, + {add, {<<"t/client/+/+">>, id2}}, + {delete, {<<"t/client/+/+">>, id2}}, reincarnate ]), A4Seq = repeat(10, [ @@ -259,12 +263,12 @@ run_remote_actor({Node, Run}) -> erlang:spawn_monitor(Node, ?MODULE, run_actor, [Run]). 
run_actor({Actor, Seq}) -> - {ok, AS0} = emqx_cluster_link_extrouter:actor_init(Actor, 0, env(0)), + {ok, AS0} = actor_init(Actor, 0), lists:foldl( fun - ({TS, {add, _, _} = Op}, AS) -> + ({TS, {add, _} = Op}, AS) -> apply_operation(Op, AS, TS); - ({TS, {del, _, _} = Op}, AS) -> + ({TS, {delete, _} = Op}, AS) -> apply_operation(Op, AS, TS); ({TS, heartbeat}, AS) -> apply_operation(heartbeat, AS, TS); @@ -275,7 +279,7 @@ run_actor({Actor, Seq}) -> ok = timer:sleep(MS), AS; ({TS, reincarnate}, _AS) -> - {ok, AS} = emqx_cluster_link_extrouter:actor_init(Actor, TS, env(TS)), + {ok, AS} = actor_init(Actor, TS, TS), AS end, AS0, @@ -284,6 +288,12 @@ run_actor({Actor, Seq}) -> %% +actor_init(Actor, Incarnation) -> + actor_init(Actor, Incarnation, _TS = 0). + +actor_init(Actor, Incarnation, TS) -> + emqx_cluster_link_extrouter:actor_init(?CLUSTER, Actor, Incarnation, env(TS)). + apply_operation(Op, AS) -> apply_operation(Op, AS, _TS = 42). From e9c24090d493b6c43c7a331e91f239dd638e690d Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 28 May 2024 20:25:22 +0200 Subject: [PATCH 23/46] fix(cluster-link): avoid starting ps syncer if persistence disabled --- .../src/emqx_cluster_link_router_syncer.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 9d35bb812..0cc27c328 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -241,10 +241,10 @@ init({sup, TargetCluster}) -> intensity => 10, period => 60 }, - Children = [ - child_spec(actor, TargetCluster), - child_spec(ps_actor, TargetCluster) - ], + Children = lists:append([ + [child_spec(actor, TargetCluster)], + [child_spec(ps_actor, TargetCluster) || emqx_persistent_message:is_persistence_enabled()] + ]), {ok, {SupFlags, Children}}; init({actor, State}) -> 
init_actor(State). From 54d51d09820fc1a46e299c9fd3ef04c3b261dc9b Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 28 May 2024 20:27:13 +0200 Subject: [PATCH 24/46] test(cluster-link): draft basic integration test suite --- .../src/emqx_cluster_link_router_syncer.erl | 8 +- .../test/emqx_cluster_link_SUITE.erl | 158 ++++++++++++++++++ 2 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 0cc27c328..2e6f63834 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -5,6 +5,7 @@ -include_lib("emqtt/include/emqtt.hrl"). -include_lib("emqx/include/logger.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). -include("emqx_cluster_link.hrl"). %% API @@ -226,7 +227,12 @@ process_syncer_batch(Batch, ClientName, Actor, Incarnation) -> [], Batch ), - publish_routes(gproc:where(ClientName), Actor, Incarnation, Updates). + Result = publish_routes(gproc:where(ClientName), Actor, Incarnation, Updates), + ?tp(debug, clink_route_sync_complete, #{ + actor => {Actor, Incarnation}, + batch => Batch + }), + Result. batch_get_opname(Op) -> element(1, Op). diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl new file mode 100644 index 000000000..11911fc00 --- /dev/null +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -0,0 +1,158 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). 
+-include_lib("emqx/include/asserts.hrl"). + +-compile(export_all). +-compile(nowarn_export_all). + +%% + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + SourceCluster = start_source_cluster(Config), + TargetCluster = start_target_cluster(Config), + [ + {source_cluster, SourceCluster}, + {target_cluster, TargetCluster} + | Config + ]. + +end_per_suite(Config) -> + ok = emqx_cth_cluster:stop(?config(source_cluster, Config)), + ok = emqx_cth_cluster:stop(?config(target_cluster, Config)). + +init_per_testcase(TCName, Config) -> + emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config). + +end_per_testcase(TCName, Config) -> + emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config). + +%% + +start_source_cluster(Config) -> + SourceConf = + "cluster {" + "\n name = cl.source" + "\n links = [" + "\n { enable = true" + "\n upstream = cl.target" + "\n server = \"localhost:31883\"" + "\n clientid = client.source" + "\n topics = []" + "\n }" + "\n ]}", + SourceApps1 = [{emqx_conf, combine([conf_log(), SourceConf])}], + SourceApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(41883), SourceConf])}], + emqx_cth_cluster:start( + [ + {emqx_clink_msgfwd_source1, #{apps => SourceApps1 ++ [emqx]}}, + {emqx_clink_msgfwd_source2, #{apps => SourceApps2 ++ [emqx]}} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ). 
+ +start_target_cluster(Config) -> + TargetConf = + "cluster {" + "\n name = cl.target" + "\n links = [" + "\n { enable = true" + "\n upstream = cl.source" + "\n server = \"localhost:41883\"" + "\n clientid = client.target" + "\n topics = [\"#\"]" + "\n }" + "\n ]}", + TargetApps1 = [{emqx_conf, combine([conf_log(), TargetConf])}], + TargetApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(31883), TargetConf])}], + emqx_cth_cluster:start( + [ + {emqx_clink_msgfwd_target1, #{apps => TargetApps1 ++ [emqx], base_port => 20100}}, + {emqx_clink_msgfwd_target2, #{apps => TargetApps2 ++ [emqx], base_port => 20200}} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ). + +conf_mqtt_listener(LPort) when is_integer(LPort) -> + fmt("listeners.tcp.clink { bind = ~p }", [LPort]); +conf_mqtt_listener(_) -> + "". + +conf_log() -> + "log.file { enable = true, level = debug, path = node.log, supervisor_reports = progress }". + +combine([Entry | Rest]) -> + lists:foldl(fun emqx_cth_suite:merge_config/2, Entry, Rest). + +start_cluster_link(Config) -> + Nodes = nodes_all(Config), + [{ok, Apps}] = lists:usort( + erpc:multicall(Nodes, emqx_cth_suite, start_apps, [ + [emqx_cluster_link], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ]) + ), + Apps. + +stop_cluster_link(Config) -> + Apps = ?config(tc_apps, Config), + Nodes = nodes_all(Config), + [{ok, ok}] = lists:usort( + erpc:multicall(Nodes, emqx_cth_suite, stop_apps, [Apps]) + ). + +%% + +nodes_all(Config) -> + nodes_source(Config) ++ nodes_target(Config). + +nodes_source(Config) -> + ?config(source_cluster, Config). + +nodes_target(Config) -> + ?config(target_cluster, Config). + +%% + +t_message_forwarding('init', Config) -> + Apps = start_cluster_link(Config), + ok = snabbkaffe:start_trace(), + [{tc_apps, Apps} | Config]; +t_message_forwarding('end', Config) -> + ok = snabbkaffe:stop(), + stop_cluster_link(Config). 
+ +t_message_forwarding(Config) -> + [SourceNode1 | _] = nodes_source(Config), + [TargetNode1 | _] = nodes_target(Config), + SourceC1 = start_client("t_message_forwarding", SourceNode1), + TargetC1 = start_client("t_message_forwarding", TargetNode1), + {ok, _, _} = emqtt:subscribe(TargetC1, <<"t/+">>, qos1), + {ok, _} = ?block_until(#{?snk_kind := clink_route_sync_complete}), + {ok, _} = emqtt:publish(SourceC1, <<"t/42">>, <<"hello">>, qos1), + ?assertReceive({publish, #{topic := <<"t/42">>, payload := <<"hello">>}}), + ok = emqtt:stop(SourceC1), + ok = emqtt:stop(TargetC1). + +%% + +start_client(ClientId, Node) -> + Port = tcp_port(Node), + {ok, Client} = emqtt:start_link([{proto_ver, v5}, {clientid, ClientId}, {port, Port}]), + {ok, _} = emqtt:connect(Client), + Client. + +tcp_port(Node) -> + {_Host, Port} = erpc:call(Node, emqx_config, get, [[listeners, tcp, default, bind]]), + Port. + +fmt(Fmt, Args) -> + io_lib:format(Fmt, Args). From 58eaf076278f88897bcdc32705c9abdb391eb3e3 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 29 May 2024 13:37:06 +0300 Subject: [PATCH 25/46] fix(clusterlink): validate config to disallow duplicated cluster links --- .../src/emqx_cluster_link_config.erl | 1 - .../src/emqx_cluster_link_schema.erl | 29 +++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 4b93407b2..c81c7e2e8 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -169,7 +169,6 @@ remove_link(_LinkConf) -> update_links(LinksConf) -> [update_link(Link) || Link <- LinksConf].
-%% TODO: do some updates without restart (at least without coordinator restart and re-election) update_link(#{enabled := true} = LinkConf) -> _ = remove_link(LinkConf), add_link(LinkConf); diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl index abdfff39f..695b29330 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl @@ -28,8 +28,8 @@ injected_fields() -> fields("cluster_linking") -> [ - %% TODO: validate and ensure upstream names are unique! - {links, ?HOCON(?ARRAY(?R_REF("link")), #{default => []})} + {links, + ?HOCON(?ARRAY(?R_REF("link")), #{default => [], validator => fun links_validator/1})} ]; fields("link") -> [ @@ -47,10 +47,33 @@ fields("link") -> }}, %% TODO: validate topics: %% - basic topic validation - %% - non-overlapping (not intersecting) filters ? + %% - non-overlapping (not intersecting) filters? + %% (this may be not required, depends on config update implementation) {topics, ?HOCON(?ARRAY(binary()), #{required => true})}, {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})} ]. desc(_) -> "todo". + +links_validator(Links) -> + {_, Dups} = lists:foldl( + fun(Link, {Acc, DupAcc}) -> + Name = link_name(Link), + case Acc of + #{Name := _} -> + {Acc, [Name | DupAcc]}; + _ -> + {Acc#{Name => undefined}, DupAcc} + end + end, + {#{}, []}, + Links + ), + case Dups of + [] -> ok; + _ -> {error, #{reason => duplicated_cluster_links, names => Dups}} + end. + +link_name(#{upstream := Name}) -> Name; +link_name(#{<<"upstream">> := Name}) -> Name. 
From de1ac131f704a3dbacc413421219caae7c8a6da0 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 28 May 2024 20:27:13 +0200 Subject: [PATCH 26/46] test(cluster-link): fix test suite setup --- apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl index 11911fc00..d957eb580 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -53,8 +53,8 @@ start_source_cluster(Config) -> SourceApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(41883), SourceConf])}], emqx_cth_cluster:start( [ - {emqx_clink_msgfwd_source1, #{apps => SourceApps1 ++ [emqx]}}, - {emqx_clink_msgfwd_source2, #{apps => SourceApps2 ++ [emqx]}} + {emqx_clink_msgfwd_source1, #{apps => SourceApps1}}, + {emqx_clink_msgfwd_source2, #{apps => SourceApps2}} ], #{work_dir => emqx_cth_suite:work_dir(Config)} ). @@ -75,8 +75,8 @@ start_target_cluster(Config) -> TargetApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(31883), TargetConf])}], emqx_cth_cluster:start( [ - {emqx_clink_msgfwd_target1, #{apps => TargetApps1 ++ [emqx], base_port => 20100}}, - {emqx_clink_msgfwd_target2, #{apps => TargetApps2 ++ [emqx], base_port => 20200}} + {emqx_clink_msgfwd_target1, #{apps => TargetApps1, base_port => 20100}}, + {emqx_clink_msgfwd_target2, #{apps => TargetApps2, base_port => 20200}} ], #{work_dir => emqx_cth_suite:work_dir(Config)} ). 
From 24be189728e28ee52be1a2b301597fc7b08d1db2 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 29 May 2024 12:49:53 +0200 Subject: [PATCH 27/46] fix(topic): respect special topic rules when intersecting --- apps/emqx/src/emqx_topic.erl | 9 ++++++++- apps/emqx/test/emqx_topic_SUITE.erl | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/apps/emqx/src/emqx_topic.erl b/apps/emqx/src/emqx_topic.erl index 9cd631508..dd3d4294e 100644 --- a/apps/emqx/src/emqx_topic.erl +++ b/apps/emqx/src/emqx_topic.erl @@ -111,11 +111,18 @@ match(_, _) -> -spec intersection(TopicOrFilter, TopicOrFilter) -> TopicOrFilter | false when TopicOrFilter :: emqx_types:topic(). intersection(Topic1, Topic2) when is_binary(Topic1), is_binary(Topic2) -> - case intersection(words(Topic1), words(Topic2), []) of + case intersect_start(words(Topic1), words(Topic2)) of [] -> false; Intersection -> join(lists:reverse(Intersection)) end. +intersect_start([<<"$", _/bytes>> | _], [W | _]) when ?IS_WILDCARD(W) -> + []; +intersect_start([W | _], [<<"$", _/bytes>> | _]) when ?IS_WILDCARD(W) -> + []; +intersect_start(Words1, Words2) -> + intersection(Words1, Words2, []). + intersection(Words1, ['#'], Acc) -> lists:reverse(Words1, Acc); intersection(['#'], Words2, Acc) -> diff --git a/apps/emqx/test/emqx_topic_SUITE.erl b/apps/emqx/test/emqx_topic_SUITE.erl index 6e0451591..5993fbc40 100644 --- a/apps/emqx/test/emqx_topic_SUITE.erl +++ b/apps/emqx/test/emqx_topic_SUITE.erl @@ -28,6 +28,7 @@ [ wildcard/1, match/2, + intersection/2, validate/1, prepend/2, join/1, @@ -128,6 +129,21 @@ t_match_perf(_) -> true = match(Name, Filter), ok = bench('match/2', fun emqx_topic:match/2, [Name, Filter]). 
+t_intersect(_) -> + <<"t/global/1/+">> = intersection(<<"t/global/#">>, <<"t/+/1/+">>), + <<"t/global/#">> = intersection(<<"t/global/#">>, <<"#">>), + <<"t/global/#">> = intersection(<<"t/global/#">>, <<"t/global/#">>), + <<"1/2/3/4/5">> = intersection(<<"1/+/3/+/5/#">>, <<"+/2/+/4/+">>), + <<"t/local/1">> = intersection(<<"t/local/1/#">>, <<"t/local/+">>), + false = intersection(<<"t/global/#">>, <<"t/local/+">>), + false = intersection(<<"t/local/1/+">>, <<"t/local/+">>). + +t_sys_intersect(_) -> + <<"$SYS/broker/+">> = intersection(<<"$SYS/broker/#">>, <<"$SYS/+/+">>), + <<"$SYS/broker">> = intersection(<<"$SYS/broker">>, <<"$SYS/+">>), + false = intersection(<<"$SYS/broker">>, <<"+/+">>), + false = intersection(<<"$SYS/broker">>, <<"#">>). + t_validate(_) -> true = validate(<<"a/+/#">>), true = validate(<<"a/b/c/d">>), From 7b8f466adfac01cb9e4ed8e023dca5d3fedb106f Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 29 May 2024 13:00:29 +0200 Subject: [PATCH 28/46] feat(topic): avoid `lists:reverse` when intersecting --- apps/emqx/src/emqx_topic.erl | 53 +++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/apps/emqx/src/emqx_topic.erl b/apps/emqx/src/emqx_topic.erl index dd3d4294e..88338d963 100644 --- a/apps/emqx/src/emqx_topic.erl +++ b/apps/emqx/src/emqx_topic.erl @@ -112,37 +112,40 @@ match(_, _) -> TopicOrFilter :: emqx_types:topic(). intersection(Topic1, Topic2) when is_binary(Topic1), is_binary(Topic2) -> case intersect_start(words(Topic1), words(Topic2)) of - [] -> false; - Intersection -> join(lists:reverse(Intersection)) + false -> false; + Intersection -> join(Intersection) end. intersect_start([<<"$", _/bytes>> | _], [W | _]) when ?IS_WILDCARD(W) -> - []; + false; intersect_start([W | _], [<<"$", _/bytes>> | _]) when ?IS_WILDCARD(W) -> - []; + false; intersect_start(Words1, Words2) -> - intersection(Words1, Words2, []). + intersect(Words1, Words2). 
-intersection(Words1, ['#'], Acc) -> - lists:reverse(Words1, Acc); -intersection(['#'], Words2, Acc) -> - lists:reverse(Words2, Acc); -intersection([W1], ['+'], Acc) -> - [W1 | Acc]; -intersection(['+'], [W2], Acc) -> - [W2 | Acc]; -intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W1), ?IS_WILDCARD(W2) -> - intersection(T1, T2, [wildcard_intersection(W1, W2) | Acc]); -intersection([W | T1], [W | T2], Acc) -> - intersection(T1, T2, [W | Acc]); -intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W1) -> - intersection(T1, T2, [W2 | Acc]); -intersection([W1 | T1], [W2 | T2], Acc) when ?IS_WILDCARD(W2) -> - intersection(T1, T2, [W1 | Acc]); -intersection([], [], Acc) -> - Acc; -intersection(_, _, _) -> - []. +intersect(Words1, ['#']) -> + Words1; +intersect(['#'], Words2) -> + Words2; +intersect([W1], ['+']) -> + [W1]; +intersect(['+'], [W2]) -> + [W2]; +intersect([W1 | T1], [W2 | T2]) when ?IS_WILDCARD(W1), ?IS_WILDCARD(W2) -> + intersect_join(wildcard_intersection(W1, W2), intersect(T1, T2)); +intersect([W | T1], [W | T2]) -> + intersect_join(W, intersect(T1, T2)); +intersect([W1 | T1], [W2 | T2]) when ?IS_WILDCARD(W1) -> + intersect_join(W2, intersect(T1, T2)); +intersect([W1 | T1], [W2 | T2]) when ?IS_WILDCARD(W2) -> + intersect_join(W1, intersect(T1, T2)); +intersect([], []) -> + []; +intersect(_, _) -> + false. + +intersect_join(_, false) -> false; +intersect_join(W, Words) -> [W | Words]. wildcard_intersection(W, W) -> W; wildcard_intersection(_, _) -> '+'. 
From 7fccb5dbc99febaa0aaf590819a05133079e9e9e Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 3 Jun 2024 10:19:05 +0200 Subject: [PATCH 29/46] test(topic): add more `intersection/2` testcases --- apps/emqx/test/emqx_topic_SUITE.erl | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/apps/emqx/test/emqx_topic_SUITE.erl b/apps/emqx/test/emqx_topic_SUITE.erl index 5993fbc40..af61c497d 100644 --- a/apps/emqx/test/emqx_topic_SUITE.erl +++ b/apps/emqx/test/emqx_topic_SUITE.erl @@ -138,6 +138,48 @@ t_intersect(_) -> false = intersection(<<"t/global/#">>, <<"t/local/+">>), false = intersection(<<"t/local/1/+">>, <<"t/local/+">>). +t_intersect_topic_wildcard(_) -> + <<"t/test/1">> = intersection(<<"t/test/#">>, <<"t/test/1">>), + <<"t/test/1/1">> = intersection(<<"t/test/1/1">>, <<"t/test/#">>), + false = intersection(<<"t/test/1/1">>, <<"t/test/+">>), + <<"t/test/1/1">> = intersection(<<"t/test/1/1">>, <<"t/test/1/1">>), + false = intersection(<<"t/test/1">>, <<"t/test/2">>), + false = intersection(<<"t/test/1">>, <<"t/test/1/2">>). 
+ +t_intersect_commutes(_) -> + ?assertEqual( + intersection(<<"t/+/1/+">>, <<"t/global/#">>), + intersection(<<"t/global/#">>, <<"t/+/1/+">>) + ), + ?assertEqual( + intersection(<<"#">>, <<"t/global/#">>), + intersection(<<"t/global/#">>, <<"#">>) + ), + ?assertEqual( + intersection(<<"+/2/+/4/+">>, <<"1/+/3/+/5/#">>), + intersection(<<"1/+/3/+/5/#">>, <<"+/2/+/4/+">>) + ), + ?assertEqual( + intersection(<<"t/local/+">>, <<"t/local/1/#">>), + intersection(<<"t/local/1/#">>, <<"t/local/+">>) + ), + ?assertEqual( + intersection(<<"t/local/+">>, <<"t/global/#">>), + intersection(<<"t/global/#">>, <<"t/local/+">>) + ), + ?assertEqual( + intersection(<<"t/local/+">>, <<"t/local/1/+">>), + intersection(<<"t/local/1/+">>, <<"t/local/+">>) + ), + ?assertEqual( + intersection(<<"t/test/#">>, <<"t/test/1/1">>), + intersection(<<"t/test/1/1">>, <<"t/test/#">>) + ), + ?assertEqual( + intersection(<<"t/test/+">>, <<"t/test/1/1">>), + intersection(<<"t/test/1/1">>, <<"t/test/+">>) + ). + t_sys_intersect(_) -> <<"$SYS/broker/+">> = intersection(<<"$SYS/broker/#">>, <<"$SYS/+/+">>), <<"$SYS/broker">> = intersection(<<"$SYS/broker">>, <<"$SYS/+">>), From 0219b8bd4dbcbd79ce3b5e2afce1d0f3debf0eeb Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 29 May 2024 16:07:25 +0200 Subject: [PATCH 30/46] feat(cluster-link): add simple replication actor GC process --- .../src/emqx_cluster_link_config.erl | 20 +++- .../src/emqx_cluster_link_extrouter.erl | 31 +++--- .../src/emqx_cluster_link_extrouter_gc.erl | 95 +++++++++++++++++++ .../src/emqx_cluster_link_router_syncer.erl | 4 +- .../src/emqx_cluster_link_sup.erl | 16 +++- .../emqx_cluster_link_extrouter_SUITE.erl | 12 ++- 6 files changed, 153 insertions(+), 25 deletions(-) create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index c81c7e2e8..9d840256a 100644 --- 
a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -13,6 +13,8 @@ -define(MQTT_HOST_OPTS, #{default_port => 1883}). +-define(DEFAULT_ACTOR_TTL, 30_000). + -export([ %% General cluster/0, @@ -22,7 +24,11 @@ topic_filters/1, %% Connections emqtt_options/1, - mk_emqtt_options/1 + mk_emqtt_options/1, + %% Actor Lifecycle + actor_ttl/0, + actor_gc_interval/0, + actor_heartbeat_interval/0 ]). -export([ @@ -58,6 +64,18 @@ emqtt_options(LinkName) -> topic_filters(LinkName) -> maps:get(topics, ?MODULE:link(LinkName), []). +-spec actor_ttl() -> _Milliseconds :: pos_integer(). +actor_ttl() -> + ?DEFAULT_ACTOR_TTL. + +-spec actor_gc_interval() -> _Milliseconds :: pos_integer(). +actor_gc_interval() -> + actor_ttl(). + +-spec actor_heartbeat_interval() -> _Milliseconds :: pos_integer(). +actor_heartbeat_interval() -> + actor_ttl() div 3. + %% mk_emqtt_options(#{server := Server, ssl := #{enable := EnableSsl} = Ssl} = LinkConf) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index f060f4c56..e058cb816 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -67,8 +67,6 @@ -include_lib("emqx/include/emqx.hrl"). --define(DEFAULT_ACTOR_TTL_MS, 30_000). - -define(EXTROUTE_SHARD, ?MODULE). -define(EXTROUTE_TAB, emqx_external_router_route). -define(EXTROUTE_ACTOR_TAB, emqx_external_router_actor). @@ -280,16 +278,22 @@ apply_operation(Entry, MCounter, OpName, Lane) -> MCounter end. --spec actor_gc(env()) -> ok. +-spec actor_gc(env()) -> _NumCleaned :: non_neg_integer(). actor_gc(#{timestamp := Now}) -> MS = [{#actor{until = '$1', _ = '_'}, [{'<', '$1', Now}], ['$_']}], - case mnesia:dirty_select(?EXTROUTE_ACTOR_TAB, MS) of - [Rec | _Rest] -> - %% NOTE: One at a time. - clean_incarnation(Rec); - [] -> - ok - end. 
+ Dead = mnesia:dirty_select(?EXTROUTE_ACTOR_TAB, MS), + try_clean_incarnation(Dead). + +try_clean_incarnation([Rec | Rest]) -> + %% NOTE: One at a time. + case clean_incarnation(Rec) of + ok -> + 1; + stale -> + try_clean_incarnation(Rest) + end; +try_clean_incarnation([]) -> + 0. mnesia_assign_lane(Cluster) -> Assignment = lists:foldl( @@ -323,7 +327,7 @@ mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = La _ = clean_lane(Lane), mnesia:delete(?EXTROUTE_ACTOR_TAB, Actor, write); _Renewed -> - ok + stale end. clean_lane(Lane) -> @@ -368,7 +372,4 @@ first_zero_bit(N, I) -> %% bump_actor_ttl(TS) -> - TS + get_actor_ttl(). - -get_actor_ttl() -> - ?DEFAULT_ACTOR_TTL_MS. + TS + emqx_cluster_link_config:actor_ttl(). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl new file mode 100644 index 000000000..e185c5137 --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl @@ -0,0 +1,95 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_extrouter_gc). + +-include_lib("emqx/include/logger.hrl"). + +-export([start_link/0]). + +-export([run/0]). + +-behaviour(gen_server). +-export([ + init/1, + handle_call/3, + handle_cast/2, + handle_info/2 +]). + +-define(SERVER, ?MODULE). + +-define(REPEAT_GC_INTERVAL, 5_000). + +%% + +start_link() -> + gen_server:start_link({local, ?SERVER}, ?MODULE, [], []). + +run() -> + gen_server:call(?SERVER, run). + +%% + +-record(st, { + gc_timer :: reference() +}). + +init(_) -> + {ok, schedule_gc(#st{})}. + +handle_call(run, _From, St) -> + Result = run_gc(), + Timeout = choose_timeout(Result), + {reply, Result, reschedule_gc(Timeout, St)}; +handle_call(_Call, _From, St) -> + {reply, ignored, St}. 
+ +handle_cast(Cast, State) -> + ?SLOG(warning, #{msg => "unexpected_cast", cast => Cast}), + {noreply, State}. + +handle_info({timeout, TRef, _GC}, St = #st{gc_timer = TRef}) -> + Result = run_gc_exclusive(), + Timeout = choose_timeout(Result), + {noreply, schedule_gc(Timeout, St#st{gc_timer = undefined})}; +handle_info(Info, St) -> + ?SLOG(warning, #{msg => "unexpected_info", info => Info}), + {noreply, St}. + +%% + +run_gc_exclusive() -> + case is_responsible() of + true -> run_gc(); + false -> 0 + end. + +is_responsible() -> + Nodes = lists:sort(mria_membership:running_core_nodelist()), + Nodes =/= [] andalso hd(Nodes) == node(). + +-spec run_gc() -> _NumCleaned :: non_neg_integer(). +run_gc() -> + Env = #{timestamp => erlang:system_time(millisecond)}, + emqx_cluster_link_extrouter:actor_gc(Env). + +choose_timeout(_NumCleaned = 0) -> + emqx_cluster_link_config:actor_gc_interval(); +choose_timeout(_NumCleaned) -> + %% NOTE: There could likely be more outdated actors. + ?REPEAT_GC_INTERVAL. + +schedule_gc(St) -> + schedule_gc(emqx_cluster_link_config:actor_gc_interval(), St). + +schedule_gc(Timeout, St = #st{gc_timer = undefined}) -> + TRef = erlang:start_timer(Timeout, self(), gc), + St#st{gc_timer = TRef}. + +reschedule_gc(Timeout, St = #st{gc_timer = undefined}) -> + schedule_gc(Timeout, St); +reschedule_gc(Timeout, St = #st{gc_timer = TRef}) -> + ok = emqx_utils:cancel_timer(TRef), + schedule_gc(Timeout, St#st{gc_timer = undefined}). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 2e6f63834..ffce01812 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -54,7 +54,6 @@ -define(RECONNECT_TIMEOUT, 5_000). -define(ACTOR_REINIT_TIMEOUT, 7000). --define(HEARTBEAT_INTERVAL, 10_000). -define(CLIENT_SUFFIX, ":routesync:"). -define(PS_CLIENT_SUFFIX, ":routesync-ps:"). 
@@ -475,7 +474,8 @@ process_heartbeat(St = #st{client = ClientPid, actor = Actor, incarnation = Inca schedule_heartbeat(St). schedule_heartbeat(St = #st{heartbeat_timer = undefined}) -> - TRef = erlang:start_timer(?HEARTBEAT_INTERVAL, self(), heartbeat), + Timeout = emqx_cluster_link_config:actor_heartbeat_interval(), + TRef = erlang:start_timer(Timeout, self(), heartbeat), St#st{heartbeat_timer = TRef}. %% Bootstrapping. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl index beb641a92..872054fa0 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl @@ -10,7 +10,6 @@ -export([init/1]). --define(COORD_SUP, emqx_cluster_link_coord_sup). -define(SERVER, ?MODULE). start_link(LinksConf) -> @@ -22,12 +21,21 @@ init(LinksConf) -> intensity => 10, period => 5 }, - %% Children = [sup_spec(?COORD_SUP, ?COORD_SUP, LinksConf)], - Children = [ + ExtrouterGC = extrouter_gc_spec(), + RouteActors = [ sup_spec(Name, emqx_cluster_link_router_syncer, [Name]) || #{upstream := Name} <- LinksConf ], - {ok, {SupFlags, Children}}. + {ok, {SupFlags, [ExtrouterGC | RouteActors]}}. + +extrouter_gc_spec() -> + %% NOTE: This one is currently global, not per-link. + #{ + id => {extrouter, gc}, + start => {emqx_cluster_link_extrouter_gc, start_link, []}, + restart => permanent, + type => worker + }. 
sup_spec(Id, Mod, Args) -> #{ diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl index 9f80109fd..bb281ce4c 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl @@ -124,7 +124,10 @@ t_actor_gc(_Config) -> topics_sorted() ), _AS13 = apply_operation(heartbeat, AS12, 50_000), - ok = emqx_cluster_link_extrouter:actor_gc(env(60_000)), + ?assertEqual( + 1, + emqx_cluster_link_extrouter:actor_gc(env(60_000)) + ), ?assertEqual( [<<"topic/#">>, <<"topic/42/+">>], topics_sorted() @@ -133,7 +136,10 @@ t_actor_gc(_Config) -> _IncarnationMismatch, apply_operation({add, {<<"toolate/#">>, id}}, AS21) ), - ok = emqx_cluster_link_extrouter:actor_gc(env(120_000)), + ?assertEqual( + 1, + emqx_cluster_link_extrouter:actor_gc(env(120_000)) + ), ?assertEqual( [], topics_sorted() @@ -273,7 +279,7 @@ run_actor({Actor, Seq}) -> ({TS, heartbeat}, AS) -> apply_operation(heartbeat, AS, TS); ({TS, gc}, AS) -> - ok = emqx_cluster_link_extrouter:actor_gc(env(TS)), + _NC = emqx_cluster_link_extrouter:actor_gc(env(TS)), AS; ({_TS, {sleep, MS}}, AS) -> ok = timer:sleep(MS), From e0604e3af667763b0799332912edc58b84f0c6c2 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 29 May 2024 16:08:14 +0200 Subject: [PATCH 31/46] fix(cluster-link): anticipate clients may occasionally retry --- apps/emqx_cluster_link/src/emqx_cluster_link.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 2b469b114..a567fb6cc 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -215,7 +215,7 @@ get_actor_state(ClusterName, Actor) -> erlang:get(?PD_EXTROUTER_ACTOR_STATE). 
set_actor_state(ClusterName, Actor, ActorSt) -> - undefined = erlang:put(?PD_EXTROUTER_ACTOR, {ClusterName, Actor}), + _Undefined = erlang:put(?PD_EXTROUTER_ACTOR, {ClusterName, Actor}), update_actor_state(ActorSt). update_actor_state(ActorSt) -> From ede35df24af34cafd0043ed4247d2ff12d7a2e7e Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 3 Jun 2024 14:45:39 +0200 Subject: [PATCH 32/46] fix(cluster-link): cancel heartbeats on client down --- .../src/emqx_cluster_link_router_syncer.erl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index ffce01812..ea1803f6d 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -457,7 +457,8 @@ handle_client_down(Reason, St = #st{target = TargetCluster, actor = Actor}) -> %% TODO: syncer may be already down due to one_for_all strategy ok = close_syncer(TargetCluster, Actor), _ = maybe_alarm(Reason, St), - process_connect(St#st{client = undefined, error = Reason, status = connecting}). + NSt = cancel_heartbeat(St), + process_connect(NSt#st{client = undefined, error = Reason, status = connecting}). process_bootstrap(St = #st{bootstrapped = false}, _NeedBootstrap) -> run_bootstrap(St); @@ -478,6 +479,12 @@ schedule_heartbeat(St = #st{heartbeat_timer = undefined}) -> TRef = erlang:start_timer(Timeout, self(), heartbeat), St#st{heartbeat_timer = TRef}. +cancel_heartbeat(St = #st{heartbeat_timer = undefined}) -> + St; +cancel_heartbeat(St = #st{heartbeat_timer = TRef}) -> + ok = emqx_utils:cancel_timer(TRef), + St#st{heartbeat_timer = undefined}. + %% Bootstrapping. %% Responsible for transferring local routing table snapshot to the target %% cluster. 
Does so either during the initial startup or when MQTT connection From c4840b30d2098c637330543a2a82cfddf360bc52 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 3 Jun 2024 16:51:01 +0200 Subject: [PATCH 33/46] fix(cluster-link): deduplicate routes down to dest cluster --- .../src/emqx_cluster_link_extrouter.erl | 2 +- .../test/emqx_cluster_link_SUITE.erl | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index e058cb816..dc6233d25 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -119,7 +119,7 @@ create_tables() -> match_routes(Topic) -> Matches = emqx_topic_index:matches(Topic, ?EXTROUTE_TAB, [unique]), %% `unique` opt is not enough, since we keep the original Topic as a part of RouteID - lists:usort([match_to_route(M) || M <- Matches]). + lists:ukeysort(#route.dest, [match_to_route(M) || M <- Matches]). 
lookup_routes(Topic) -> Pat = #extroute{entry = emqx_topic_index:make_key(Topic, '$1'), _ = '_'}, diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl index d957eb580..26b951c00 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -132,13 +132,24 @@ t_message_forwarding('end', Config) -> t_message_forwarding(Config) -> [SourceNode1 | _] = nodes_source(Config), - [TargetNode1 | _] = nodes_target(Config), + [TargetNode1, TargetNode2 | _] = nodes_target(Config), SourceC1 = start_client("t_message_forwarding", SourceNode1), - TargetC1 = start_client("t_message_forwarding", TargetNode1), + TargetC1 = start_client("t_message_forwarding1", TargetNode1), + TargetC2 = start_client("t_message_forwarding2", TargetNode2), {ok, _, _} = emqtt:subscribe(TargetC1, <<"t/+">>, qos1), + {ok, _, _} = emqtt:subscribe(TargetC2, <<"t/#">>, qos1), {ok, _} = ?block_until(#{?snk_kind := clink_route_sync_complete}), {ok, _} = emqtt:publish(SourceC1, <<"t/42">>, <<"hello">>, qos1), - ?assertReceive({publish, #{topic := <<"t/42">>, payload := <<"hello">>}}), + ?assertReceive( + {publish, #{topic := <<"t/42">>, payload := <<"hello">>, client_pid := TargetC1}} + ), + ?assertReceive( + {publish, #{topic := <<"t/42">>, payload := <<"hello">>, client_pid := TargetC2}} + ), + ?assertNotReceive({publish, _Message = #{}}), + ok = emqtt:stop(SourceC1), + ok = emqtt:stop(TargetC1), + ok = emqtt:stop(TargetC2). ok = emqtt:stop(SourceC1), ok = emqtt:stop(TargetC1). 
From d0df4de2a36a752f638a93c890bd84f895e3cb3d Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 3 Jun 2024 16:53:38 +0200 Subject: [PATCH 34/46] test(cluster-link): add e2e replication actor GC testcase --- .../src/emqx_cluster_link_config.erl | 4 + .../src/emqx_cluster_link_extrouter.erl | 14 +- .../src/emqx_cluster_link_extrouter_gc.erl | 4 + .../src/emqx_cluster_link_mqtt.erl | 31 ++-- .../test/emqx_cluster_link_SUITE.erl | 136 ++++++++++++++---- 5 files changed, 150 insertions(+), 39 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 9d840256a..568d1a69d 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -13,7 +13,11 @@ -define(MQTT_HOST_OPTS, #{default_port => 1883}). +-ifndef(TEST). -define(DEFAULT_ACTOR_TTL, 30_000). +-else. +-define(DEFAULT_ACTOR_TTL, 3_000). +-endif. -export([ %% General diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index dc6233d25..1a69733ee 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -4,6 +4,8 @@ -module(emqx_cluster_link_extrouter). +-include_lib("snabbkaffe/include/trace.hrl"). + -export([create_tables/0]). %% Router API @@ -318,8 +320,16 @@ mnesia_actor_heartbeat(ActorID, Incarnation, TS) -> mnesia:abort({nonexistent_actor, ActorID}) end. -clean_incarnation(Rec) -> - transaction(fun ?MODULE:mnesia_clean_incarnation/1, [Rec]). +clean_incarnation(Rec = #actor{id = {Cluster, Actor}}) -> + case transaction(fun ?MODULE:mnesia_clean_incarnation/1, [Rec]) of + ok -> + ?tp(debug, clink_extrouter_actor_cleaned, #{ + cluster => Cluster, + actor => Actor + }); + Result -> + Result + end. 
mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = Lane}) -> case mnesia:read(?EXTROUTE_ACTOR_TAB, Actor, write) of diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl index e185c5137..89258b506 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl @@ -20,7 +20,11 @@ -define(SERVER, ?MODULE). +-ifndef(TEST). -define(REPEAT_GC_INTERVAL, 5_000). +-else. +-define(REPEAT_GC_INTERVAL, 1_000). +-endif. %% diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 6091b6ffc..62e021289 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -8,6 +8,7 @@ -include_lib("emqx/include/emqx.hrl"). -include_lib("emqx/include/emqx_mqtt.hrl"). -include_lib("emqx/include/logger.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). -behaviour(emqx_resource). -behaviour(ecpool_worker). @@ -123,15 +124,19 @@ on_query(_ResourceId, FwdMsg, #{pool_name := PoolName, topic := LinkTopic} = _St is_record(FwdMsg, message) -> #message{topic = Topic, qos = QoS} = FwdMsg, - handle_send_result( - ecpool:pick_and_do( - {PoolName, Topic}, - fun(ConnPid) -> - emqtt:publish(ConnPid, LinkTopic, ?ENCODE(FwdMsg), QoS) - end, - no_handover - ) - ). + PubResult = ecpool:pick_and_do( + {PoolName, Topic}, + fun(ConnPid) -> + emqtt:publish(ConnPid, LinkTopic, ?ENCODE(FwdMsg), QoS) + end, + no_handover + ), + ?tp_ignore_side_effects_in_prod(clink_message_forwarded, #{ + pool => PoolName, + message => FwdMsg, + pub_result => PubResult + }), + handle_send_result(PubResult). 
on_query_async( _ResourceId, FwdMsg, CallbackIn, #{pool_name := PoolName, topic := LinkTopic} = _State @@ -145,7 +150,13 @@ on_query_async( %% #delivery{} record has no valuable data for a remote link... Payload = ?ENCODE(FwdMsg), %% TODO: check override QOS requirements (if any) - emqtt:publish_async(ConnPid, LinkTopic, Payload, QoS, Callback) + PubResult = emqtt:publish_async(ConnPid, LinkTopic, Payload, QoS, Callback), + ?tp_ignore_side_effects_in_prod(clink_message_forwarded, #{ + pool => PoolName, + message => FwdMsg, + pub_result => PubResult + }), + PubResult end, no_handover ). diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl index 26b951c00..922af832f 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -7,6 +7,7 @@ -include_lib("common_test/include/ct.hrl"). -include_lib("eunit/include/eunit.hrl"). -include_lib("emqx/include/asserts.hrl"). +-include_lib("emqx_utils/include/emqx_message.hrl"). -compile(export_all). -compile(nowarn_export_all). @@ -17,17 +18,10 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - SourceCluster = start_source_cluster(Config), - TargetCluster = start_target_cluster(Config), - [ - {source_cluster, SourceCluster}, - {target_cluster, TargetCluster} - | Config - ]. + Config. -end_per_suite(Config) -> - ok = emqx_cth_cluster:stop(?config(source_cluster, Config)), - ok = emqx_cth_cluster:stop(?config(target_cluster, Config)). +end_per_suite(_Config) -> + ok. init_per_testcase(TCName, Config) -> emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config). 
@@ -37,7 +31,7 @@ end_per_testcase(TCName, Config) -> %% -start_source_cluster(Config) -> +mk_source_cluster(BaseName, Config) -> SourceConf = "cluster {" "\n name = cl.source" @@ -51,15 +45,15 @@ start_source_cluster(Config) -> "\n ]}", SourceApps1 = [{emqx_conf, combine([conf_log(), SourceConf])}], SourceApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(41883), SourceConf])}], - emqx_cth_cluster:start( + emqx_cth_cluster:mk_nodespecs( [ - {emqx_clink_msgfwd_source1, #{apps => SourceApps1}}, - {emqx_clink_msgfwd_source2, #{apps => SourceApps2}} + {mk_nodename(BaseName, s1), #{apps => SourceApps1}}, + {mk_nodename(BaseName, s2), #{apps => SourceApps2}} ], #{work_dir => emqx_cth_suite:work_dir(Config)} ). -start_target_cluster(Config) -> +mk_target_cluster(BaseName, Config) -> TargetConf = "cluster {" "\n name = cl.target" @@ -73,14 +67,17 @@ start_target_cluster(Config) -> "\n ]}", TargetApps1 = [{emqx_conf, combine([conf_log(), TargetConf])}], TargetApps2 = [{emqx_conf, combine([conf_log(), conf_mqtt_listener(31883), TargetConf])}], - emqx_cth_cluster:start( + emqx_cth_cluster:mk_nodespecs( [ - {emqx_clink_msgfwd_target1, #{apps => TargetApps1, base_port => 20100}}, - {emqx_clink_msgfwd_target2, #{apps => TargetApps2, base_port => 20200}} + {mk_nodename(BaseName, t1), #{apps => TargetApps1, base_port => 20100}}, + {mk_nodename(BaseName, t2), #{apps => TargetApps2, base_port => 20200}} ], #{work_dir => emqx_cth_suite:work_dir(Config)} ). +mk_nodename(BaseName, Suffix) -> + binary_to_atom(fmt("emqx_clink_~s_~s", [BaseName, Suffix])). + conf_mqtt_listener(LPort) when is_integer(LPort) -> fmt("listeners.tcp.clink { bind = ~p }", [LPort]); conf_mqtt_listener(_) -> @@ -92,8 +89,7 @@ conf_log() -> combine([Entry | Rest]) -> lists:foldl(fun emqx_cth_suite:merge_config/2, Entry, Rest). 
-start_cluster_link(Config) -> - Nodes = nodes_all(Config), +start_cluster_link(Nodes, Config) -> [{ok, Apps}] = lists:usort( erpc:multicall(Nodes, emqx_cth_suite, start_apps, [ [emqx_cluster_link], @@ -115,20 +111,27 @@ nodes_all(Config) -> nodes_source(Config) ++ nodes_target(Config). nodes_source(Config) -> - ?config(source_cluster, Config). + ?config(source_nodes, Config). nodes_target(Config) -> - ?config(target_cluster, Config). + ?config(target_nodes, Config). %% t_message_forwarding('init', Config) -> - Apps = start_cluster_link(Config), + SourceNodes = emqx_cth_cluster:start(mk_source_cluster(?FUNCTION_NAME, Config)), + TargetNodes = emqx_cth_cluster:start(mk_target_cluster(?FUNCTION_NAME, Config)), + _Apps = start_cluster_link(SourceNodes ++ TargetNodes, Config), ok = snabbkaffe:start_trace(), - [{tc_apps, Apps} | Config]; + [ + {source_nodes, SourceNodes}, + {target_nodes, TargetNodes} + | Config + ]; t_message_forwarding('end', Config) -> ok = snabbkaffe:stop(), - stop_cluster_link(Config). + ok = emqx_cth_cluster:stop(?config(source_nodes, Config)), + ok = emqx_cth_cluster:stop(?config(target_nodes, Config)). t_message_forwarding(Config) -> [SourceNode1 | _] = nodes_source(Config), @@ -150,11 +153,90 @@ t_message_forwarding(Config) -> ok = emqtt:stop(SourceC1), ok = emqtt:stop(TargetC1), ok = emqtt:stop(TargetC2). 
+ +t_target_extrouting_gc('init', Config) -> + SourceCluster = mk_source_cluster(?FUNCTION_NAME, Config), + SourceNodes = emqx_cth_cluster:start(SourceCluster), + TargetCluster = mk_target_cluster(?FUNCTION_NAME, Config), + TargetNodes = emqx_cth_cluster:start(TargetCluster), + _Apps = start_cluster_link(SourceNodes ++ TargetNodes, Config), + ok = snabbkaffe:start_trace(), + [ + {source_cluster, SourceCluster}, + {source_nodes, SourceNodes}, + {target_cluster, TargetCluster}, + {target_nodes, TargetNodes} + | Config + ]; +t_target_extrouting_gc('end', Config) -> + ok = snabbkaffe:stop(), + ok = emqx_cth_cluster:stop(?config(source_nodes, Config)). + +t_target_extrouting_gc(Config) -> + [SourceNode1 | _] = nodes_source(Config), + [TargetNode1, TargetNode2 | _] = nodes_target(Config), + SourceC1 = start_client("t_target_extrouting_gc", SourceNode1), + TargetC1 = start_client_unlink("t_target_extrouting_gc1", TargetNode1), + TargetC2 = start_client_unlink("t_target_extrouting_gc2", TargetNode2), + {ok, _, _} = emqtt:subscribe(TargetC1, <<"t/#">>, qos1), + {ok, _, _} = emqtt:subscribe(TargetC2, <<"t/+">>, qos1), + {ok, _} = ?block_until(#{?snk_kind := clink_route_sync_complete}), + {ok, _} = emqtt:publish(SourceC1, <<"t/1">>, <<"HELLO1">>, qos1), + {ok, _} = emqtt:publish(SourceC1, <<"t/2/ext">>, <<"HELLO2">>, qos1), + {ok, _} = emqtt:publish(SourceC1, <<"t/3/ext">>, <<"HELLO3">>, qos1), + Pubs1 = [M || {publish, M} <- ?drainMailbox(1_000)], + {ok, _} = ?wait_async_action( + emqx_cth_cluster:stop_node(TargetNode1), + #{?snk_kind := clink_extrouter_actor_cleaned, cluster := <<"cl.target">>} + ), + {ok, _} = emqtt:publish(SourceC1, <<"t/4/ext">>, <<"HELLO4">>, qos1), + {ok, _} = emqtt:publish(SourceC1, <<"t/5">>, <<"HELLO5">>, qos1), + Pubs2 = [M || {publish, M} <- ?drainMailbox(1_000)], + {ok, _} = ?wait_async_action( + emqx_cth_cluster:stop_node(TargetNode2), + #{?snk_kind := clink_extrouter_actor_cleaned, cluster := <<"cl.target">>} + ), ok = emqtt:stop(SourceC1), - 
ok = emqtt:stop(TargetC1). + %% Verify that extrouter table eventually becomes empty. + ?assertEqual( + [], + erpc:call(SourceNode1, emqx_cluster_link_extrouter, topics, []), + { + erpc:call(SourceNode1, ets, tab2list, [emqx_external_router_actor]), + erpc:call(SourceNode1, ets, tab2list, [emqx_external_router_route]) + } + ), + %% Verify all relevant messages were forwarded. + ?assertMatch( + [ + #{topic := <<"t/1">>, payload := <<"HELLO1">>, client_pid := _C1}, + #{topic := <<"t/1">>, payload := <<"HELLO1">>, client_pid := _C2}, + #{topic := <<"t/2/ext">>, payload := <<"HELLO2">>}, + #{topic := <<"t/3/ext">>, payload := <<"HELLO3">>}, + #{topic := <<"t/5">>, payload := <<"HELLO5">>} + ], + lists:sort(emqx_utils_maps:key_comparer(topic), Pubs1 ++ Pubs2) + ), + %% Verify there was no unnecessary forwarding. + Trace = snabbkaffe:collect_trace(), + ?assertMatch( + [ + #{message := #message{topic = <<"t/1">>, payload = <<"HELLO1">>}}, + #{message := #message{topic = <<"t/2/ext">>, payload = <<"HELLO2">>}}, + #{message := #message{topic = <<"t/3/ext">>, payload = <<"HELLO3">>}}, + #{message := #message{topic = <<"t/5">>, payload = <<"HELLO5">>}} + ], + ?of_kind(clink_message_forwarded, Trace), + Trace + ). %% +start_client_unlink(ClientId, Node) -> + Client = start_client(ClientId, Node), + _ = erlang:unlink(Client), + Client. + start_client(ClientId, Node) -> Port = tcp_port(Node), {ok, Client} = emqtt:start_link([{proto_ver, v5}, {clientid, ClientId}, {port, Port}]), @@ -166,4 +248,4 @@ tcp_port(Node) -> Port. fmt(Fmt, Args) -> - io_lib:format(Fmt, Args). + emqx_utils:format(Fmt, Args). 
From 780a0bf807e7d0dab7597eafc19799348ec25401 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 3 Jun 2024 17:04:29 +0200 Subject: [PATCH 35/46] fix(cluster-link): clear exit signal of failed-to-connect client --- .../src/emqx_cluster_link_router_syncer.erl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index ea1803f6d..5fc267bd9 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -145,12 +145,19 @@ start_link_client(TargetCluster, Actor) -> {ok, _Props} -> {ok, Pid}; Error -> + _ = flush_link_signal(Pid), Error end; Error -> Error end. +flush_link_signal(Pid) -> + receive + {'EXIT', Pid, _} -> ok + after 1 -> timeout + end. + refine_client_options(Options = #{clientid := ClientID}, Actor) -> Suffix = case Actor of From c871b3745320b34bffb9f7b456beb2b413a07123 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 30 May 2024 13:48:00 +0300 Subject: [PATCH 36/46] fix(clusterlink): add link topics schema validator --- .../include/emqx_cluster_link.hrl | 2 + .../src/emqx_cluster_link_schema.erl | 40 ++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl index 3ee7e9fdf..08dc7f4ad 100644 --- a/apps/emqx_cluster_link/include/emqx_cluster_link.hrl +++ b/apps/emqx_cluster_link/include/emqx_cluster_link.hrl @@ -3,6 +3,8 @@ %%-------------------------------------------------------------------- -define(TOPIC_PREFIX, "$LINK/cluster/"). +-define(TOPIC_PREFIX_WILDCARD, <>). + -define(ROUTE_TOPIC_PREFIX, ?TOPIC_PREFIX "route/"). -define(MSG_TOPIC_PREFIX, ?TOPIC_PREFIX "msg/"). -define(RESP_TOPIC_PREFIX, ?TOPIC_PREFIX "resp/"). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl index 695b29330..22c9e31ec 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl @@ -6,6 +6,7 @@ -behaviour(emqx_schema_hooks). +-include("emqx_cluster_link.hrl"). -include_lib("hocon/include/hoconsc.hrl"). -export([injected_fields/0]). @@ -49,13 +50,16 @@ fields("link") -> %% - basic topic validation %% - non-overlapping (not intersecting) filters? %% (this may be not required, depends on config update implementation) - {topics, ?HOCON(?ARRAY(binary()), #{required => true})}, + {topics, + ?HOCON(?ARRAY(binary()), #{required => true, validator => fun topics_validator/1})}, {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})} ]. desc(_) -> "todo". +%% TODO: check that no link name equals local cluster name, +%% but this may be tricky since the link config is injected into cluster config (emqx_conf_schema). links_validator(Links) -> {_, Dups} = lists:foldl( fun(Link, {Acc, DupAcc}) -> @@ -70,10 +74,36 @@ links_validator(Links) -> {#{}, []}, Links ), - case Dups of - [] -> ok; - _ -> {error, #{reason => duplicated_cluster_links, names => Dups}} - end. + check_errors(Dups, duplicated_cluster_links, names). link_name(#{upstream := Name}) -> Name; link_name(#{<<"upstream">> := Name}) -> Name. + +topics_validator(Topics) -> + Errors = lists:foldl( + fun(T, ErrAcc) -> + try + _ = emqx_topic:validate(T), + validate_sys_link_topic(T, ErrAcc) + catch + E:R -> + [{T, {E, R}} | ErrAcc] + end + end, + [], + Topics + ), + check_errors(Errors, invalid_topics, topics). + +validate_sys_link_topic(T, ErrAcc) -> + case emqx_topic:match(T, ?TOPIC_PREFIX_WILDCARD) of + true -> + [{T, {error, topic_not_allowed}} | ErrAcc]; + false -> + ErrAcc + end. 
+ +check_errors([] = _Errors, _Reason, _ValuesField) -> + ok; +check_errors(Errors, Reason, ValuesField) -> + {error, #{reason => Reason, ValuesField => Errors}}. From 94e81ba81269dc61a88eda8eef7468cd679e2a16 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Mon, 3 Jun 2024 21:17:46 +0300 Subject: [PATCH 37/46] feat(clusterlink): implement actor config handler --- .../src/emqx_cluster_link.erl | 35 ++++- .../src/emqx_cluster_link_api.erl | 116 +++++++++++++++ .../src/emqx_cluster_link_app.erl | 14 +- .../src/emqx_cluster_link_config.erl | 136 ++++++++++++------ .../src/emqx_cluster_link_extrouter.erl | 11 +- .../src/emqx_cluster_link_mqtt.erl | 26 ++-- .../emqx_cluster_link_router_bootstrap.erl | 5 +- .../src/emqx_cluster_link_router_syncer.erl | 95 ++++++------ .../src/emqx_cluster_link_schema.erl | 23 +-- .../src/emqx_cluster_link_sup.erl | 29 +++- 10 files changed, 354 insertions(+), 136 deletions(-) create mode 100644 apps/emqx_cluster_link/src/emqx_cluster_link_api.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index a567fb6cc..fd5280262 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -7,6 +7,7 @@ -behaviour(emqx_external_broker). -export([ + is_registered/0, register_external_broker/0, unregister_external_broker/0, add_route/1, @@ -36,8 +37,14 @@ %% emqx_external_broker API %%-------------------------------------------------------------------- +is_registered() -> + emqx_external_broker:provider() =:= ?MODULE. + register_external_broker() -> - emqx_external_broker:register_provider(?MODULE). + case is_registered() of + true -> ok; + false -> emqx_external_broker:register_provider(?MODULE) + end. unregister_external_broker() -> emqx_external_broker:unregister_provider(?MODULE). 
@@ -94,13 +101,18 @@ on_message_publish( {route_updates, #{actor := Actor}, RouteOps} -> ok = update_routes(ClusterName, Actor, RouteOps); {heartbeat, #{actor := Actor}} -> - ok = actor_heartbeat(ClusterName, Actor) + ok = actor_heartbeat(ClusterName, Actor); + {error, {unknown_payload, ParsedPayload}} -> + ?SLOG(warning, #{ + msg => "unexpected_cluster_link_route_op_payload", + payload => ParsedPayload + }) end, {stop, []}; on_message_publish(#message{topic = <>, payload = Payload}) -> case emqx_cluster_link_mqtt:decode_forwarded_msg(Payload) of #message{} = ForwardedMsg -> - {stop, with_sender_name(ForwardedMsg, ClusterName)}; + {stop, maybe_filter_incomming_msg(ForwardedMsg, ClusterName)}; _Err -> %% Just ignore it. It must be already logged by the decoder {stop, []} @@ -163,9 +175,7 @@ actor_init( %% which will use safe binary_to_term decoding %% TODO: add error details? {error, <<"unknown_cluster">>}; - LinkConf -> - %% TODO: may be worth checking resource health and communicate it? - _ = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), + #{enable := true} = _LinkConf -> MyClusterName = emqx_cluster_link_config:cluster(), case MyClusterName of TargetCluster -> @@ -187,7 +197,9 @@ actor_init( received_from => ClusterName }), {error, <<"bad_remote_cluster_link_name">>} - end + end; + #{enable := false} -> + {error, <<"clster_link_disabled">>} end. actor_init_ack(#{actor := Actor}, Res, MsgIn) -> @@ -226,3 +238,12 @@ update_actor_state(ActorSt) -> %% that doesn't set extra = #{} by default. with_sender_name(#message{extra = Extra} = Msg, ClusterName) when is_map(Extra) -> Msg#message{extra = Extra#{link_origin => ClusterName}}. + +maybe_filter_incomming_msg(#message{topic = T} = Msg, ClusterName) -> + %% Should prevent irrelevant messages from being dispatched in case + %% the remote routing state lags behind the local config changes. 
+ #{enable := Enable, topics := Topics} = emqx_cluster_link_config:link(ClusterName), + case Enable andalso emqx_topic:match_any(T, Topics) of + true -> with_sender_name(Msg, ClusterName); + false -> [] + end. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl new file mode 100644 index 000000000..c74d2d3f7 --- /dev/null +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl @@ -0,0 +1,116 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- +-module(emqx_cluster_link_api). + +-behaviour(minirest_api). + +-include_lib("hocon/include/hoconsc.hrl"). +-include_lib("emqx/include/http_api.hrl"). + +-export([ + api_spec/0, + paths/0, + schema/1 +]). + +-export([config/2]). + +-define(CONF_PATH, [cluster, links]). +-define(TAGS, [<<"Cluster">>]). + +api_spec() -> + emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}). + +paths() -> + [ + "/cluster/links" + ]. + +schema("/cluster/links") -> + #{ + 'operationId' => config, + get => + #{ + description => "Get cluster links configuration", + tags => ?TAGS, + responses => + #{200 => links_config_schema()} + }, + put => + #{ + description => "Update cluster links configuration", + tags => ?TAGS, + 'requestBody' => links_config_schema(), + responses => + #{ + 200 => links_config_schema(), + 400 => + emqx_dashboard_swagger:error_codes( + [?BAD_REQUEST], <<"Update Config Failed">> + ) + } + } + }. 
+ +%%-------------------------------------------------------------------- +%% API Handler funcs +%%-------------------------------------------------------------------- + +config(get, _Params) -> + {200, get_raw()}; +config(put, #{body := Body}) -> + case emqx_cluster_link_config:update(Body) of + {ok, NewConfig} -> + {200, NewConfig}; + {error, Reason} -> + Message = list_to_binary(io_lib:format("Update config failed ~p", [Reason])), + {400, ?BAD_REQUEST, Message} + end. + +%%-------------------------------------------------------------------- +%% Internal funcs +%%-------------------------------------------------------------------- + +get_raw() -> + #{<<"links">> := Conf} = + emqx_config:fill_defaults( + #{<<"links">> => emqx_conf:get_raw(?CONF_PATH)}, + #{obfuscate_sensitive_values => true} + ), + Conf. + +links_config_schema() -> + emqx_cluster_link_schema:links_schema( + #{ + examples => #{<<"example">> => links_config_example()} + } + ). + +links_config_example() -> + [ + #{ + <<"enable">> => true, + <<"pool_size">> => 10, + <<"server">> => <<"emqxcl_b.host:1883">>, + <<"ssl">> => #{<<"enable">> => false}, + <<"topics">> => + [ + <<"t/topic-example">>, + <<"t/topic-filter-example/1/#">> + ], + <<"upstream">> => <<"emqxcl_b">> + }, + #{ + <<"enable">> => true, + <<"pool_size">> => 10, + <<"server">> => <<"emqxcl_c.host:1883">>, + <<"ssl">> => #{<<"enable">> => false}, + <<"topics">> => + [ + <<"t/topic-example">>, + <<"t/topic-filter-example/1/#">> + ], + <<"upstream">> => <<"emqxcl_c">> + } + ]. 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl index f05c5c1a0..750387ca9 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl @@ -13,7 +13,7 @@ start(_StartType, _StartArgs) -> ok = mria:wait_for_tables(emqx_cluster_link_extrouter:create_tables()), emqx_cluster_link_config:add_handler(), - LinksConf = enabled_links(), + LinksConf = emqx_cluster_link_config:enabled_links(), _ = case LinksConf of [_ | _] -> @@ -32,19 +32,13 @@ prep_stop(State) -> stop(_State) -> _ = emqx_cluster_link:delete_hook(), _ = emqx_cluster_link:unregister_external_broker(), - _ = stop_msg_fwd_resources(emqx_cluster_link_config:links()), + _ = remove_msg_fwd_resources(emqx_cluster_link_config:links()), ok. %%-------------------------------------------------------------------- %% Internal functions %%-------------------------------------------------------------------- -enabled_links() -> - lists:filter( - fun(#{enable := IsEnabled}) -> IsEnabled =:= true end, - emqx_cluster_link_config:links() - ). - start_msg_fwd_resources(LinksConf) -> lists:foreach( fun(LinkConf) -> @@ -53,10 +47,10 @@ start_msg_fwd_resources(LinksConf) -> LinksConf ). -stop_msg_fwd_resources(LinksConf) -> +remove_msg_fwd_resources(LinksConf) -> lists:foreach( fun(#{upstream := Name}) -> - emqx_cluster_link_mqtt:stop_msg_fwd_resource(Name) + emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name) end, LinksConf ). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 568d1a69d..67dc267e6 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -21,6 +21,7 @@ -export([ %% General + update/1, cluster/0, enabled_links/0, links/0, @@ -47,6 +48,20 @@ %% +update(Config) -> + case + emqx_conf:update( + ?LINKS_PATH, + Config, + #{rawconf_with_defaults => true, override_to => cluster} + ) + of + {ok, #{raw_config := NewConfigRows}} -> + {ok, NewConfigRows}; + {error, Reason} -> + {error, Reason} + end. + cluster() -> atom_to_binary(emqx_config:get([cluster, name])). @@ -83,7 +98,7 @@ actor_heartbeat_interval() -> %% mk_emqtt_options(#{server := Server, ssl := #{enable := EnableSsl} = Ssl} = LinkConf) -> - ClientId = maps:get(client_id, LinkConf, cluster()), + ClientId = maps:get(clientid, LinkConf, cluster()), #{hostname := Host, port := Port} = emqx_schema:parse_server(Server, ?MQTT_HOST_OPTS), Opts = #{ host => Host, @@ -115,13 +130,13 @@ remove_handler() -> pre_config_update(?LINKS_PATH, RawConf, RawConf) -> {ok, RawConf}; -pre_config_update(?LINKS_PATH, NewRawConf, _RawConf) -> - {ok, convert_certs(NewRawConf)}. +pre_config_update(?LINKS_PATH, NewRawConf, OldRawConf) -> + {ok, convert_certs(maybe_increment_ps_actor_incr(NewRawConf, OldRawConf))}. 
post_config_update(?LINKS_PATH, _Req, Old, Old, _AppEnvs) -> ok; post_config_update(?LINKS_PATH, _Req, New, Old, _AppEnvs) -> - ok = maybe_toggle_hook_and_provider(New), + ok = toggle_hook_and_broker(enabled_links(New), enabled_links(Old)), #{ removed := Removed, added := Added, @@ -142,22 +157,17 @@ post_config_update(?LINKS_PATH, _Req, New, Old, _AppEnvs) -> %% Internal functions %%-------------------------------------------------------------------- -maybe_toggle_hook_and_provider(LinksConf) -> - case is_any_enabled(LinksConf) of - true -> - ok = emqx_cluster_link:register_external_broker(), - ok = emqx_cluster_link:put_hook(); - false -> - _ = emqx_cluster_link:delete_hook(), - _ = emqx_cluster_link:unregister_external_broker(), - ok - end. +toggle_hook_and_broker([_ | _] = _NewEnabledLinks, [] = _OldEnabledLinks) -> + ok = emqx_cluster_link:register_external_broker(), + ok = emqx_cluster_link:put_hook(); +toggle_hook_and_broker([] = _NewEnabledLinks, _OldLinks) -> + ok = emqx_cluster_link:unregister_external_broker(), + ok = emqx_cluster_link:delete_hook(); +toggle_hook_and_broker(_, _) -> + ok. -is_any_enabled(LinksConf) -> - lists:any( - fun(#{enable := IsEnabled}) -> IsEnabled =:= true end, - LinksConf - ). +enabled_links(LinksConf) -> + [L || #{enable := true} = L <- LinksConf]. all_ok(Results) -> lists:all( @@ -172,42 +182,86 @@ all_ok(Results) -> add_links(LinksConf) -> [add_link(Link) || Link <- LinksConf]. -add_link(#{enabled := true} = LinkConf) -> - %% NOTE: this can be started later during init_link phase, but it looks not harmful to start it beforehand... - MsgFwdRes = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), - %% TODO - ActorRes = ok, - combine_results(ActorRes, MsgFwdRes); +add_link(#{enable := true} = LinkConf) -> + {ok, _Pid} = emqx_cluster_link_sup:ensure_actor(LinkConf), + {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(LinkConf), + ok; add_link(_DisabledLinkConf) -> ok. 
remove_links(LinksConf) -> - [remove_link(Link) || Link <- LinksConf]. + [remove_link(Name) || #{upstream := Name} <- LinksConf]. -remove_link(_LinkConf) -> - %% TODO - ok. +remove_link(Name) -> + _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), + ensure_actor_stopped(Name). update_links(LinksConf) -> [update_link(Link) || Link <- LinksConf]. -update_link(#{enabled := true} = LinkConf) -> - _ = remove_link(LinkConf), - add_link(LinkConf); -update_link(#{enabled := false} = LinkConf) -> - case remove_link(LinkConf) of - {error, not_found} -> ok; - Other -> Other - end. - -combine_results(ok, ok) -> +update_link({OldLinkConf, #{enable := true, upstream := Name} = NewLinkConf}) -> + _ = ensure_actor_stopped(Name), + {ok, _Pid} = emqx_cluster_link_sup:ensure_actor(NewLinkConf), + %% TODO: if only msg_fwd resource related config is changed, + %% we can skip actor reincarnation/restart. + ok = update_msg_fwd_resource(OldLinkConf, NewLinkConf), ok; -combine_results(CoordRes, MsgFwdRes) -> - {error, #{coordinator => CoordRes, msg_fwd_resource => MsgFwdRes}}. +update_link({_OldLinkConf, #{enable := false, upstream := Name} = _NewLinkConf}) -> + _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), + ensure_actor_stopped(Name). + +update_msg_fwd_resource(#{pool_size := Old}, #{pool_size := Old} = NewConf) -> + {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(NewConf), + ok; +update_msg_fwd_resource(_, #{upstream := Name} = NewConf) -> + _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), + {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(NewConf), + ok. + +ensure_actor_stopped(ClusterName) -> + emqx_cluster_link_sup:ensure_actor_stopped(ClusterName). upstream_name(#{upstream := N}) -> N; upstream_name(#{<<"upstream">> := N}) -> N. +maybe_increment_ps_actor_incr(New, Old) -> + case emqx_persistent_message:is_persistence_enabled() of + true -> + %% TODO: what if a link was removed and then added again? 
+ %% Assume that incarnation was 0 when the link was removed + %% and now it's also 0 (a default value for new actor). + %% If persistent routing state changed during this link absence + %% and remote GC has not started before ps actor restart (with the same incarnation), + %% then some old (stale) external ps routes may never be cleaned on the remote side. + %% No (permanent) message loss is expected, as new actor incarnation will re-bootstrap. + %% Similarly, irrelevant messages will be filtered out at receiving end, so + %% the main risk is having some stale routes unreachable for GC... + #{changed := Changed} = emqx_utils:diff_lists(New, Old, fun upstream_name/1), + ChangedNames = [upstream_name(C) || {_, C} <- Changed], + lists:foldr( + fun(LConf, Acc) -> + case lists:member(upstream_name(LConf), ChangedNames) of + true -> [increment_ps_actor_incr(LConf) | Acc]; + false -> [LConf | Acc] + end + end, + [], + New + ); + false -> + New + end. + +increment_ps_actor_incr(#{ps_actor_incarnation := Incr} = Conf) -> + Conf#{ps_actor_incarnation => Incr + 1}; +increment_ps_actor_incr(#{<<"ps_actor_incarnation">> := Incr} = Conf) -> + Conf#{<<"ps_actor_incarnation">> => Incr + 1}; +%% Default value set in schema is 0, so need to set it to 1 during the first update. +increment_ps_actor_incr(#{<<"upstream">> := _} = Conf) -> + Conf#{<<"ps_actor_incarnation">> => 1}; +increment_ps_actor_incr(#{upstream := _} = Conf) -> + Conf#{ps_actor_incarnation => 1}. + convert_certs(LinksConf) -> lists:map( fun diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index 1a69733ee..a97aa7ece 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -22,7 +22,8 @@ actor_apply_operation/2, actor_apply_operation/3, actor_gc/1, - is_present_incarnation/1 + is_present_incarnation/1, + list_actors/1 ]).
%% Internal API @@ -167,6 +168,14 @@ is_present_incarnation(#state{extra = #{is_present_incarnation := IsNew}}) -> is_present_incarnation(_State) -> false. +-spec list_actors(cluster()) -> [#{actor := actor(), incarnation := incarnation()}]. +list_actors(Cluster) -> + Matches = ets:match( + emqx_external_router_actor, + #actor{id = {Cluster, '$1'}, incarnation = '$2', _ = '_'} + ), + [#{actor => Actor, incarnation => Incr} || [Actor, Incr] <- Matches]. + mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> %% NOTE %% We perform this heavy-weight transaction only in the case of a new route diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 62e021289..e4b398397 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -28,7 +28,7 @@ -export([ ensure_msg_fwd_resource/1, - stop_msg_fwd_resource/1, + remove_msg_fwd_resource/1, decode_route_op/1, decode_forwarded_msg/1, decode_resp/1 @@ -80,6 +80,15 @@ -define(PUB_TIMEOUT, 10_000). +-spec ensure_msg_fwd_resource(binary() | map()) -> + {ok, emqx_resource:resource_data() | already_started} | {error, Reason :: term()}. +ensure_msg_fwd_resource(ClusterName) when is_binary(ClusterName) -> + case emqx_cluster_link_config:link(ClusterName) of + #{} = Conf -> + ensure_msg_fwd_resource(Conf); + undefined -> + {error, link_config_not_found} + end; ensure_msg_fwd_resource(#{upstream := Name, pool_size := PoolSize} = ClusterConf) -> ResConf = #{ query_mode => async, @@ -91,8 +100,9 @@ ensure_msg_fwd_resource(#{upstream := Name, pool_size := PoolSize} = ClusterConf }, emqx_resource:create_local(?MSG_RES_ID(Name), ?RES_GROUP, ?MODULE, ClusterConf, ResConf). -stop_msg_fwd_resource(ClusterName) -> - emqx_resource:stop(?MSG_RES_ID(ClusterName)). +-spec remove_msg_fwd_resource(binary() | map()) -> ok | {error, Reason :: term()}. 
+remove_msg_fwd_resource(ClusterName) -> + emqx_resource:remove_local(?MSG_RES_ID(ClusterName)). %%-------------------------------------------------------------------- %% emqx_resource callbacks (message forwarding) @@ -247,9 +257,9 @@ combine_status(Statuses) -> %%-------------------------------------------------------------------- connect(Options) -> - WorkerId = proplists:get_value(ecpool_worker_id, Options), + WorkerIdBin = integer_to_binary(proplists:get_value(ecpool_worker_id, Options)), #{clientid := ClientId} = ClientOpts = proplists:get_value(client_opts, Options), - ClientId1 = emqx_bridge_mqtt_lib:bytes23([ClientId], WorkerId), + ClientId1 = <>, ClientOpts1 = ClientOpts#{clientid => ClientId1}, case emqtt:start_link(ClientOpts1) of {ok, Pid} -> @@ -369,11 +379,7 @@ decode_route_op1(#{ }) -> {heartbeat, #{actor => Actor, incarnation => Incr}}; decode_route_op1(Payload) -> - ?SLOG(warning, #{ - msg => "unexpected_cluster_link_route_op_payload", - payload => Payload - }), - {error, Payload}. + {error, {unknown_payload, Payload}}. decode_resp1(#{ ?F_OPERATION := ?OP_ACTOR_INIT_ACK, diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl index 105b8d94c..1670c2ab4 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_bootstrap.erl @@ -10,7 +10,7 @@ -include("emqx_cluster_link.hrl"). -export([ - init/2, + init/3, next_batch/1 ]). 
@@ -27,8 +27,7 @@ %% -init(TargetCluster, Options) -> - LinkFilters = emqx_cluster_link_config:topic_filters(TargetCluster), +init(TargetCluster, LinkFilters, Options) -> {Wildcards, Topics} = lists:partition(fun emqx_topic:wildcard/1, LinkFilters), IsPersistentRoute = maps:get(is_persistent_route, Options, false), #bootstrap{ diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index 5fc267bd9..bccb3e349 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -47,6 +47,7 @@ -define(SYNCER_NAME(Cluster), ?NAME(Cluster, syncer)). -define(SYNCER_REF(Cluster), {via, gproc, ?SYNCER_NAME(Cluster)}). -define(ACTOR_REF(Cluster), {via, gproc, ?NAME(Cluster, actor)}). +-define(ACTOR_NAME(Cluster), ?NAME(Cluster, actor)). -define(MAX_BATCH_SIZE, 4000). -define(MIN_SYNC_INTERVAL, 10). @@ -63,8 +64,8 @@ %% but it must be tolerable, since persistent route destination is a client ID, %% which is unique cluster-wide. -define(PS_ACTOR, <<"ps-routes-v1">>). --define(PS_INCARNATION, 0). -define(PS_ACTOR_REF(Cluster), {via, gproc, ?NAME(Cluster, ps_actor)}). +-define(PS_ACTOR_NAME(Cluster), ?NAME(Cluster, ps_actor)). -define(PS_CLIENT_NAME(Cluster), ?NAME(Cluster, ps_client)). -define(PS_SYNCER_REF(Cluster), {via, gproc, ?PS_SYNCER_NAME(Cluster)}). -define(PS_SYNCER_NAME(Cluster), ?NAME(Cluster, ps_syncer)). @@ -102,43 +103,30 @@ do_push(SyncerName, OpName, Topic, ID) -> %% 1. Actor + MQTT Client %% 2. Syncer -start_link(TargetCluster) -> - supervisor:start_link(?REF(TargetCluster), ?MODULE, {sup, TargetCluster}). +start_link(#{upstream := TargetCluster} = LinkConf) -> + supervisor:start_link(?REF(TargetCluster), ?MODULE, {sup, LinkConf}). %% Actor -start_link_actor(ActorRef, Actor, Incarnation, TargetCluster) -> +new_incarnation() -> + %% TODO: Subject to clock skew, need something more robust. 
+ erlang:system_time(millisecond). + +start_link_actor(ActorRef, Actor, Incarnation, LinkConf) -> gen_server:start_link( ActorRef, ?MODULE, - {actor, mk_state(TargetCluster, Actor, Incarnation)}, + {actor, mk_state(LinkConf, Actor, Incarnation)}, [] ). get_actor_id() -> atom_to_binary(node()). -get_actor_incarnation() -> - persistent_term:get({?MODULE, incarnation}). - -set_actor_incarnation(Incarnation) -> - ok = persistent_term:put({?MODULE, incarnation}, Incarnation), - Incarnation. - -ensure_actor_incarnation() -> - try - get_actor_incarnation() - catch - error:badarg -> - %% TODO: Subject to clock skew, need something more robust. - Incarnation = erlang:system_time(millisecond), - set_actor_incarnation(Incarnation) - end. - %% MQTT Client -start_link_client(TargetCluster, Actor) -> - Options = emqx_cluster_link_config:emqtt_options(TargetCluster), +start_link_client(Actor, LinkConf) -> + Options = emqx_cluster_link_config:mk_emqtt_options(LinkConf), case emqtt:start_link(refine_client_options(Options, Actor)) of {ok, Pid} -> case emqtt:connect(Pid) of @@ -245,7 +233,7 @@ batch_get_opname(Op) -> %% -init({sup, TargetCluster}) -> +init({sup, LinkConf}) -> %% FIXME: Intensity. SupFlags = #{ %% TODO: one_for_one? @@ -254,24 +242,24 @@ init({sup, TargetCluster}) -> period => 60 }, Children = lists:append([ - [child_spec(actor, TargetCluster)], - [child_spec(ps_actor, TargetCluster) || emqx_persistent_message:is_persistence_enabled()] + [child_spec(actor, LinkConf)], + [child_spec(ps_actor, LinkConf) || emqx_persistent_message:is_persistence_enabled()] ]), {ok, {SupFlags, Children}}; init({actor, State}) -> init_actor(State). -child_spec(actor, TargetCluster) -> +child_spec(actor, #{upstream := TargetCluster} = LinkConf) -> %% Actor process. %% Wraps MQTT Client process. %% ClientID: `mycluster:emqx1@emqx.local:routesync` %% Occasional TCP/MQTT-level disconnects are expected, and should be handled %% gracefully. 
Actor = get_actor_id(), - Incarnation = ensure_actor_incarnation(), - actor_spec(actor, ?ACTOR_REF(TargetCluster), Actor, Incarnation, TargetCluster); -child_spec(ps_actor, TargetCluster) -> - actor_spec(ps_actor, ?PS_ACTOR_REF(TargetCluster), ?PS_ACTOR, ?PS_INCARNATION, TargetCluster). + Incarnation = new_incarnation(), + actor_spec(actor, ?ACTOR_REF(TargetCluster), Actor, Incarnation, LinkConf); +child_spec(ps_actor, #{upstream := TargetCluster, ps_actor_incarnation := Incr} = LinkConf) -> + actor_spec(ps_actor, ?PS_ACTOR_REF(TargetCluster), ?PS_ACTOR, Incr, LinkConf). child_spec(syncer, ?PS_ACTOR, Incarnation, TargetCluster) -> SyncerRef = ?PS_SYNCER_REF(TargetCluster), @@ -286,10 +274,10 @@ child_spec(syncer, Actor, Incarnation, TargetCluster) -> ClientName = ?CLIENT_NAME(TargetCluster), syncer_spec(syncer, Actor, Incarnation, SyncerRef, ClientName). -actor_spec(ChildID, ActorRef, Actor, Incarnation, TargetCluster) -> +actor_spec(ChildID, ActorRef, Actor, Incarnation, LinkConf) -> #{ id => ChildID, - start => {?MODULE, start_link_actor, [ActorRef, Actor, Incarnation, TargetCluster]}, + start => {?MODULE, start_link_actor, [ActorRef, Actor, Incarnation, LinkConf]}, restart => permanent, type => worker }. @@ -308,7 +296,7 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> target :: binary(), actor :: binary(), incarnation :: non_neg_integer(), - client :: {pid(), reference()}, + client :: {pid(), reference()} | undefined, bootstrapped :: boolean(), reconnect_timer :: reference(), heartbeat_timer :: reference(), @@ -316,30 +304,31 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> actor_init_timer :: reference(), remote_actor_info :: undefined | map(), status :: connecting | connected | disconnected, - error :: undefined | term() + error :: undefined | term(), + link_conf :: map() }). 
-mk_state(TargetCluster, Actor, Incarnation) -> +mk_state(#{upstream := TargetCluster} = LinkConf, Actor, Incarnation) -> #st{ target = TargetCluster, actor = Actor, incarnation = Incarnation, bootstrapped = false, - status = connecting + status = connecting, + link_conf = LinkConf }. init_actor(State = #st{}) -> _ = erlang:process_flag(trap_exit, true), {ok, State, {continue, connect}}. -handle_continue(connect, State) -> - {noreply, process_connect(State)}. +handle_continue(connect, St) -> + {noreply, process_connect(St)}. +handle_call(_Request, _From, St) -> + {reply, ignored, St}. -handle_call(_Request, _From, State) -> - {reply, ignored, State}. - -handle_cast(_Request, State) -> - {noreply, State}. +handle_cast(_Request, St) -> + {noreply, St}. handle_info({'EXIT', ClientPid, Reason}, St = #st{client = ClientPid}) -> {noreply, handle_client_down(Reason, St)}; @@ -396,8 +385,8 @@ handle_info(Info, St) -> terminate(_Reason, _State) -> ok. -process_connect(St = #st{target = TargetCluster, actor = Actor}) -> - case start_link_client(TargetCluster, Actor) of +process_connect(St = #st{target = TargetCluster, actor = Actor, link_conf = Conf}) -> + case start_link_client(Actor, Conf) of {ok, ClientPid} -> _ = maybe_deactivate_alarm(St), ok = announce_client(Actor, TargetCluster, ClientPid), @@ -498,17 +487,17 @@ cancel_heartbeat(St = #st{heartbeat_timer = TRef}) -> %% is re-established with a clean session. Once bootstrapping is done, it %% opens the syncer. 
-run_bootstrap(St = #st{target = TargetCluster, actor = ?PS_ACTOR}) -> +run_bootstrap(St = #st{target = TargetCluster, actor = ?PS_ACTOR, link_conf = #{topics := Topics}}) -> case mria_config:whoami() of Role when Role /= replicant -> Opts = #{is_persistent_route => true}, - Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, Opts), + Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, Topics, Opts), run_bootstrap(Bootstrap, St); _ -> process_bootstrapped(St) end; -run_bootstrap(St = #st{target = TargetCluster}) -> - Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, #{}), +run_bootstrap(St = #st{target = TargetCluster, link_conf = #{topics := Topics}}) -> + Bootstrap = emqx_cluster_link_router_bootstrap:init(TargetCluster, Topics, #{}), run_bootstrap(Bootstrap, St). run_bootstrap(Bootstrap, St) -> @@ -527,7 +516,9 @@ run_bootstrap(Bootstrap, St) -> end end. -process_bootstrapped(St = #st{target = TargetCluster, actor = Actor}) -> +process_bootstrapped( + St = #st{target = TargetCluster, actor = Actor} +) -> ok = open_syncer(TargetCluster, Actor), St#st{bootstrapped = true}. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl index 22c9e31ec..03c8902df 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl @@ -11,6 +11,9 @@ -export([injected_fields/0]). +%% Used in emqx_cluster_link_api +-export([links_schema/1]). + -export([ roots/0, fields/1, @@ -27,14 +30,14 @@ roots() -> []. injected_fields() -> #{cluster => fields("cluster_linking")}. +links_schema(Meta) -> + ?HOCON(?ARRAY(?R_REF("link")), Meta#{default => [], validator => fun links_validator/1}). 
+ fields("cluster_linking") -> - [ - {links, - ?HOCON(?ARRAY(?R_REF("link")), #{default => [], validator => fun links_validator/1})} - ]; + [{links, links_schema(#{})}]; fields("link") -> [ - {enable, ?HOCON(boolean(), #{default => false})}, + {enable, ?HOCON(boolean(), #{default => true})}, {upstream, ?HOCON(binary(), #{required => true})}, {server, emqx_schema:servers_sc(#{required => true, desc => ?DESC("server")}, ?MQTT_HOST_OPTS)}, @@ -46,13 +49,13 @@ fields("link") -> default => #{<<"enable">> => false}, desc => ?DESC("ssl") }}, - %% TODO: validate topics: - %% - basic topic validation - %% - non-overlapping (not intersecting) filters? - %% (this may be not required, depends on config update implementation) {topics, ?HOCON(?ARRAY(binary()), #{required => true, validator => fun topics_validator/1})}, - {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})} + {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})}, + %% Must not be configured manually. The value is incremented by cluster link config handler + %% and is used as a globally synchronized sequence to ensure persistent routes actors have + %% the same next incarnation after each config change. + {ps_actor_incarnation, ?HOCON(integer(), #{default => 0, importance => ?IMPORTANCE_HIDDEN})} ]. desc(_) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl index 872054fa0..0991583e2 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl @@ -8,9 +8,15 @@ -export([start_link/1]). +-export([ + ensure_actor/1, + ensure_actor_stopped/1 +]). + -export([init/1]). -define(SERVER, ?MODULE). +-define(ACTOR_MODULE, emqx_cluster_link_router_syncer). start_link(LinksConf) -> supervisor:start_link({local, ?SERVER}, ?SERVER, LinksConf). 
@@ -23,8 +29,8 @@ init(LinksConf) -> }, ExtrouterGC = extrouter_gc_spec(), RouteActors = [ - sup_spec(Name, emqx_cluster_link_router_syncer, [Name]) - || #{upstream := Name} <- LinksConf + sup_spec(Name, ?ACTOR_MODULE, [LinkConf]) + || #{upstream := Name} = LinkConf <- LinksConf ], {ok, {SupFlags, [ExtrouterGC | RouteActors]}}. @@ -46,3 +52,22 @@ sup_spec(Id, Mod, Args) -> type => supervisor, modules => [Mod] }. + +ensure_actor(#{upstream := Name} = LinkConf) -> + case supervisor:start_child(?SERVER, sup_spec(Name, ?ACTOR_MODULE, [LinkConf])) of + {ok, Pid} -> + {ok, Pid}; + {error, {already_started, Pid}} -> + {ok, Pid}; + Err -> + Err + end. + +ensure_actor_stopped(ClusterName) -> + case supervisor:terminate_child(?MODULE, ClusterName) of + ok -> + _ = supervisor:delete_child(?MODULE, ClusterName), + ok; + {error, not_found} -> + ok + end. From ff16521d4fd3ca355199c1c79c405835c34921bb Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 7 Jun 2024 14:00:43 +0300 Subject: [PATCH 38/46] fix(clusterlink): add schema descriptions, fix dialyzer warnings, add resource_opts --- apps/emqx/src/emqx_external_broker.erl | 6 +- apps/emqx/src/emqx_router_syncer.erl | 30 ++++---- .../src/emqx_cluster_link.erl | 2 +- .../src/emqx_cluster_link_app.erl | 17 ++--- .../src/emqx_cluster_link_config.erl | 10 +-- .../src/emqx_cluster_link_extrouter.erl | 39 +++++++--- .../src/emqx_cluster_link_extrouter_gc.erl | 2 +- .../src/emqx_cluster_link_mqtt.erl | 14 ++-- .../src/emqx_cluster_link_router_syncer.erl | 40 +++++----- .../src/emqx_cluster_link_schema.erl | 74 +++++++++++++++---- apps/emqx_conf/src/emqx_conf_schema.erl | 3 +- apps/emqx_utils/src/emqx_utils.erl | 4 +- rel/i18n/emqx_cluster_link_schema.hocon | 53 +++++++++++++ 13 files changed, 203 insertions(+), 91 deletions(-) create mode 100644 rel/i18n/emqx_cluster_link_schema.hocon diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index bf6448490..ebcd48994 100644 --- 
a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -16,7 +16,7 @@ -module(emqx_external_broker). --callback forward(emqx_router:external_dest(), emqx_types:delivery()) -> +-callback forward(dest(), emqx_types:delivery()) -> emqx_types:deliver_result(). -callback should_route_to_external_dests(emqx_types:message()) -> boolean(). @@ -64,8 +64,8 @@ Provider:IfRegistered catch Err:Reason:St -> - ?SLOG(error, #{ - msg => "external_broker_crashed", + ?SLOG_THROTTLE(error, #{ + msg => external_broker_crashed, provider => Provider, callback => ?FUNCTION_NAME, stacktrace => St, diff --git a/apps/emqx/src/emqx_router_syncer.erl b/apps/emqx/src/emqx_router_syncer.erl index 4756d0a37..09bdd6129 100644 --- a/apps/emqx/src/emqx_router_syncer.erl +++ b/apps/emqx/src/emqx_router_syncer.erl @@ -29,8 +29,8 @@ -export([push/5]). -export([wait/1]). --export([close/1]). --export([open/1]). +-export([suspend/1]). +-export([activate/1]). -export([stats/0]). @@ -49,7 +49,7 @@ min_sync_interval => non_neg_integer(), error_delay => non_neg_integer(), error_retry_interval => non_neg_integer(), - initial_state => open | closed, + initial_state => activated | suspended, batch_handler => {module(), _Function :: atom(), _Args :: list()} }. @@ -166,11 +166,13 @@ mk_push_context(_) -> %% -close(Ref) -> - gen_server:call(Ref, close, infinity). +%% Suspended syncer receives and accumulates route ops but doesn't apply them +%% until it is activated. +suspend(Ref) -> + gen_server:call(Ref, suspend, infinity). -open(Ref) -> - gen_server:call(Ref, open, infinity). +activate(Ref) -> + gen_server:call(Ref, activate, infinity). 
%% @@ -191,7 +193,7 @@ stats() -> mk_state(Options) -> #{ - state => maps:get(initial_state, Options, open), + state => maps:get(initial_state, Options, active), stash => stash_new(), retry_timer => undefined, max_batch_size => maps:get(max_batch_size, Options, ?MAX_BATCH_SIZE), @@ -209,13 +211,13 @@ init({Pool, Id, State}) -> init(State) -> {ok, State}. -handle_call(close, _From, State) -> - NState = State#{state := closed}, +handle_call(suspend, _From, State) -> + NState = State#{state := suspended}, {reply, ok, NState}; -handle_call(open, _From, State = #{state := closed}) -> - NState = run_batch_loop([], State#{state := open}), +handle_call(activate, _From, State = #{state := suspended}) -> + NState = run_batch_loop([], State#{state := active}), {reply, ok, NState}; -handle_call(open, _From, State) -> +handle_call(activate, _From, State) -> {reply, ok, State}; handle_call(stats, _From, State = #{stash := Stash}) -> {reply, stash_stats(Stash), State}; @@ -239,7 +241,7 @@ terminate(_Reason, _State) -> %% -run_batch_loop(Incoming, State = #{stash := Stash0, state := closed}) -> +run_batch_loop(Incoming, State = #{stash := Stash0, state := suspended}) -> Stash1 = stash_add(Incoming, Stash0), Stash2 = stash_drain(Stash1), State#{stash := Stash2}; diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index fd5280262..cdfe22f3d 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -210,7 +210,7 @@ update_routes(ClusterName, Actor, RouteOps) -> ActorSt = get_actor_state(ClusterName, Actor), lists:foreach( fun(RouteOp) -> - emqx_cluster_link_extrouter:actor_apply_operation(RouteOp, ActorSt) + _ = emqx_cluster_link_extrouter:actor_apply_operation(RouteOp, ActorSt) end, RouteOps ). 
diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl index 750387ca9..ddf3028a2 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl @@ -14,15 +14,14 @@ start(_StartType, _StartArgs) -> ok = mria:wait_for_tables(emqx_cluster_link_extrouter:create_tables()), emqx_cluster_link_config:add_handler(), LinksConf = emqx_cluster_link_config:enabled_links(), - _ = - case LinksConf of - [_ | _] -> - ok = emqx_cluster_link:register_external_broker(), - ok = emqx_cluster_link:put_hook(), - ok = start_msg_fwd_resources(LinksConf); - _ -> - ok - end, + case LinksConf of + [_ | _] -> + ok = emqx_cluster_link:register_external_broker(), + ok = emqx_cluster_link:put_hook(), + ok = start_msg_fwd_resources(LinksConf); + _ -> + ok + end, emqx_cluster_link_sup:start_link(LinksConf). prep_stop(State) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 67dc267e6..28344cd7e 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -100,7 +100,8 @@ actor_heartbeat_interval() -> mk_emqtt_options(#{server := Server, ssl := #{enable := EnableSsl} = Ssl} = LinkConf) -> ClientId = maps:get(clientid, LinkConf, cluster()), #{hostname := Host, port := Port} = emqx_schema:parse_server(Server, ?MQTT_HOST_OPTS), - Opts = #{ + Opts = maps:with([username, retry_interval, max_inflight], LinkConf), + Opts1 = Opts#{ host => Host, port => Port, clientid => ClientId, @@ -108,12 +109,7 @@ mk_emqtt_options(#{server := Server, ssl := #{enable := EnableSsl} = Ssl} = Link ssl => EnableSsl, ssl_opts => maps:to_list(maps:remove(enable, Ssl)) }, - with_password(with_user(Opts, LinkConf), LinkConf). - -with_user(Opts, #{username := U} = _LinkConf) -> - Opts#{username => U}; -with_user(Opts, _LinkConf) -> - Opts. 
+ with_password(Opts1, LinkConf). with_password(Opts, #{password := P} = _LinkConf) -> Opts#{password => emqx_secret:unwrap(P)}; diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl index a97aa7ece..79d96e207 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter.erl @@ -119,23 +119,34 @@ create_tables() -> %% +-spec match_routes(emqx_types:topic()) -> [emqx_types:route()]. match_routes(Topic) -> Matches = emqx_topic_index:matches(Topic, ?EXTROUTE_TAB, [unique]), %% `unique` opt is not enough, since we keep the original Topic as a part of RouteID lists:ukeysort(#route.dest, [match_to_route(M) || M <- Matches]). +-spec lookup_routes(emqx_types:topic()) -> [emqx_types:route()]. lookup_routes(Topic) -> - Pat = #extroute{entry = emqx_topic_index:make_key(Topic, '$1'), _ = '_'}, + Pat = make_extroute_rec_pat(emqx_topic_index:make_key(Topic, '$1')), [match_to_route(R#extroute.entry) || Records <- ets:match(?EXTROUTE_TAB, Pat), R <- Records]. +-spec topics() -> [emqx_types:topic()]. topics() -> - Pat = #extroute{entry = '$1', _ = '_'}, + Pat = make_extroute_rec_pat('$1'), [emqx_topic_index:get_topic(K) || [K] <- ets:match(?EXTROUTE_TAB, Pat)]. match_to_route(M) -> ?ROUTE_ID(Cluster, _) = emqx_topic_index:get_id(M), #route{topic = emqx_topic_index:get_topic(M), dest = Cluster}. +%% Make Dialyzer happy +make_extroute_rec_pat(Entry) -> + erlang:make_tuple( + record_info(size, extroute), + '_', + [{1, extroute}, {#extroute.entry, Entry}] + ). + %% -record(state, { @@ -143,12 +154,12 @@ match_to_route(M) -> actor :: actor(), incarnation :: incarnation(), lane :: lane() | undefined, - extra :: map() + extra = #{} :: map() }). -type state() :: #state{}. --type env() :: #{timestamp := _Milliseconds}. +-type env() :: #{timestamp => _Milliseconds}. -spec actor_init(cluster(), actor(), incarnation(), env()) -> {ok, state()}. 
actor_init(Cluster, Actor, Incarnation, Env = #{timestamp := Now}) -> @@ -170,10 +181,8 @@ is_present_incarnation(_State) -> -spec list_actors(cluster()) -> [#{actor := actor(), incarnation := incarnation()}]. list_actors(Cluster) -> - Matches = ets:match( - emqx_external_router_actor, - #actor{id = {Cluster, '$1'}, incarnation = '$2', _ = '_'} - ), + Pat = make_actor_rec_pat([{#actor.id, {Cluster, '$1'}}, {#actor.incarnation, '$2'}]), + Matches = ets:match(emqx_external_router_actor, Pat), [#{actor => Actor, incarnation => Incr} || [Actor, Incr] <- Matches]. mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> @@ -291,7 +300,8 @@ apply_operation(Entry, MCounter, OpName, Lane) -> -spec actor_gc(env()) -> _NumCleaned :: non_neg_integer(). actor_gc(#{timestamp := Now}) -> - MS = [{#actor{until = '$1', _ = '_'}, [{'<', '$1', Now}], ['$_']}], + Pat = make_actor_rec_pat([{#actor.until, '$1'}]), + MS = [{Pat, [{'<', '$1', Now}], ['$_']}], Dead = mnesia:dirty_select(?EXTROUTE_ACTOR_TAB, MS), try_clean_incarnation(Dead). @@ -316,9 +326,18 @@ mnesia_assign_lane(Cluster) -> Lane. select_cluster_lanes(Cluster) -> - MS = [{#actor{id = {Cluster, '_'}, lane = '$1', _ = '_'}, [], ['$1']}], + Pat = make_actor_rec_pat([{#actor.id, {Cluster, '_'}}, {#actor.lane, '$1'}]), + MS = [{Pat, [], ['$1']}], mnesia:select(?EXTROUTE_ACTOR_TAB, MS, write). +%% Make Dialyzer happy +make_actor_rec_pat(PosValues) -> + erlang:make_tuple( + record_info(size, actor), + '_', + [{1, actor} | PosValues] + ). 
+ mnesia_actor_heartbeat(ActorID, Incarnation, TS) -> case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of [#actor{incarnation = Incarnation} = Rec] -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl index 89258b506..695273808 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_extrouter_gc.erl @@ -37,7 +37,7 @@ run() -> %% -record(st, { - gc_timer :: reference() + gc_timer :: undefined | reference() }). init(_) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index e4b398397..7a8bf1dff 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -89,16 +89,12 @@ ensure_msg_fwd_resource(ClusterName) when is_binary(ClusterName) -> undefined -> {error, link_config_not_found} end; -ensure_msg_fwd_resource(#{upstream := Name, pool_size := PoolSize} = ClusterConf) -> - ResConf = #{ +ensure_msg_fwd_resource(#{upstream := Name, resource_opts := ResOpts} = ClusterConf) -> + ResOpts1 = ResOpts#{ query_mode => async, - start_after_created => true, - start_timeout => 5000, - health_check_interval => 5000, - %% TODO: configure res_buf_worker pool separately? - worker_pool_size => PoolSize + start_after_created => true }, - emqx_resource:create_local(?MSG_RES_ID(Name), ?RES_GROUP, ?MODULE, ClusterConf, ResConf). + emqx_resource:create_local(?MSG_RES_ID(Name), ?RES_GROUP, ?MODULE, ClusterConf, ResOpts1). -spec remove_msg_fwd_resource(binary() | map()) -> ok | {error, Reason :: term()}. remove_msg_fwd_resource(ClusterName) -> @@ -344,7 +340,7 @@ publish_heartbeat(ClientPid, Actor, Incarnation) -> ?F_ACTOR => Actor, ?F_INCARNATION => Incarnation }, - emqtt:publish_async(ClientPid, ?ROUTE_TOPIC, ?ENCODE(Payload), ?QOS_0, undefined). 
+ emqtt:publish_async(ClientPid, ?ROUTE_TOPIC, ?ENCODE(Payload), ?QOS_0, {fun(_) -> ok end, []}). decode_route_op(Payload) -> decode_route_op1(?DECODE(Payload)). diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index bccb3e349..fdcbd91c7 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -196,20 +196,20 @@ start_link_syncer(Actor, Incarnation, SyncerRef, ClientName) -> max_batch_size => ?MAX_BATCH_SIZE, min_sync_interval => ?MIN_SYNC_INTERVAL, error_delay => ?ERROR_DELAY, - initial_state => closed, + initial_state => suspended, batch_handler => {?MODULE, process_syncer_batch, [ClientName, Actor, Incarnation]} %% TODO: enable_replies => false }). -close_syncer(TargetCluster, ?PS_ACTOR) -> - emqx_router_syncer:close(?PS_SYNCER_REF(TargetCluster)); -close_syncer(TargetCluster, _Actor) -> - emqx_router_syncer:close(?SYNCER_REF(TargetCluster)). +suspend_syncer(TargetCluster, ?PS_ACTOR) -> + emqx_router_syncer:suspend(?PS_SYNCER_REF(TargetCluster)); +suspend_syncer(TargetCluster, _Actor) -> + emqx_router_syncer:suspend(?SYNCER_REF(TargetCluster)). -open_syncer(TargetCluster, ?PS_ACTOR) -> - emqx_router_syncer:open(?PS_SYNCER_REF(TargetCluster)); -open_syncer(TargetCluster, _Actor) -> - emqx_router_syncer:open(?SYNCER_REF(TargetCluster)). +activate_syncer(TargetCluster, ?PS_ACTOR) -> + emqx_router_syncer:activate(?PS_SYNCER_REF(TargetCluster)); +activate_syncer(TargetCluster, _Actor) -> + emqx_router_syncer:activate(?SYNCER_REF(TargetCluster)). 
process_syncer_batch(Batch, ClientName, Actor, Incarnation) -> Updates = maps:fold( @@ -296,12 +296,12 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> target :: binary(), actor :: binary(), incarnation :: non_neg_integer(), - client :: {pid(), reference()} | undefined, + client :: undefined | pid(), bootstrapped :: boolean(), - reconnect_timer :: reference(), - heartbeat_timer :: reference(), - actor_init_req_id :: binary(), - actor_init_timer :: reference(), + reconnect_timer :: undefined | reference(), + heartbeat_timer :: undefined | reference(), + actor_init_req_id :: undefined | binary(), + actor_init_timer :: undefined | reference(), remote_actor_info :: undefined | map(), status :: connecting | connected | disconnected, error :: undefined | term(), @@ -336,7 +336,11 @@ handle_info( {publish, #{payload := Payload, properties := #{'Correlation-Data' := ReqId}}}, St = #st{actor_init_req_id = ReqId} ) -> - {actor_init_ack, #{result := Res, need_bootstrap := NeedBootstrap} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp( + {actor_init_ack, + #{ + result := Res, + need_bootstrap := NeedBootstrap + } = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp( Payload ), St1 = St#st{ @@ -451,7 +455,7 @@ handle_client_down(Reason, St = #st{target = TargetCluster, actor = Actor}) -> actor => St#st.actor }), %% TODO: syncer may be already down due to one_for_all strategy - ok = close_syncer(TargetCluster, Actor), + ok = suspend_syncer(TargetCluster, Actor), _ = maybe_alarm(Reason, St), NSt = cancel_heartbeat(St), process_connect(NSt#st{client = undefined, error = Reason, status = connecting}). @@ -519,7 +523,7 @@ run_bootstrap(Bootstrap, St) -> process_bootstrapped( St = #st{target = TargetCluster, actor = Actor} ) -> - ok = open_syncer(TargetCluster, Actor), + ok = activate_syncer(TargetCluster, Actor), St#st{bootstrapped = true}. 
process_bootstrap_batch(Batch, #st{client = ClientPid, actor = Actor, incarnation = Incarnation}) -> @@ -529,7 +533,7 @@ ensure_bootstrap_heartbeat(St = #st{heartbeat_timer = TRef}) -> case erlang:read_timer(TRef) of false -> ok = emqx_utils:cancel_timer(TRef), - process_heartbeat(St); + process_heartbeat(St#st{heartbeat_timer = undefined}); _TimeLeft -> St end. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl index 03c8902df..b6d0fbcda 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl @@ -21,45 +21,87 @@ desc/1 ]). +-import(emqx_schema, [mk_duration/2]). + -define(MQTT_HOST_OPTS, #{default_port => 1883}). -namespace() -> "cluster_linking". +namespace() -> "cluster". roots() -> []. injected_fields() -> - #{cluster => fields("cluster_linking")}. + #{cluster => [{links, links_schema(#{})}]}. links_schema(Meta) -> - ?HOCON(?ARRAY(?R_REF("link")), Meta#{default => [], validator => fun links_validator/1}). + ?HOCON(?ARRAY(?R_REF("link")), Meta#{ + default => [], validator => fun links_validator/1, desc => ?DESC("links") + }). 
-fields("cluster_linking") -> - [{links, links_schema(#{})}]; fields("link") -> [ - {enable, ?HOCON(boolean(), #{default => true})}, - {upstream, ?HOCON(binary(), #{required => true})}, + {enable, ?HOCON(boolean(), #{default => true, desc => ?DESC(enable)})}, + {upstream, ?HOCON(binary(), #{required => true, desc => ?DESC(upstream)})}, {server, - emqx_schema:servers_sc(#{required => true, desc => ?DESC("server")}, ?MQTT_HOST_OPTS)}, - {clientid, ?HOCON(binary(), #{desc => ?DESC("clientid")})}, - {username, ?HOCON(binary(), #{desc => ?DESC("username")})}, - {password, emqx_schema_secret:mk(#{desc => ?DESC("password")})}, + emqx_schema:servers_sc(#{required => true, desc => ?DESC(server)}, ?MQTT_HOST_OPTS)}, + {clientid, ?HOCON(binary(), #{desc => ?DESC(clientid)})}, + {username, ?HOCON(binary(), #{desc => ?DESC(username)})}, + {password, emqx_schema_secret:mk(#{desc => ?DESC(password)})}, {ssl, #{ type => ?R_REF(emqx_schema, "ssl_client_opts"), default => #{<<"enable">> => false}, - desc => ?DESC("ssl") + desc => ?DESC(ssl) }}, {topics, - ?HOCON(?ARRAY(binary()), #{required => true, validator => fun topics_validator/1})}, - {pool_size, ?HOCON(pos_integer(), #{default => emqx_vm:schedulers() * 2})}, + ?HOCON(?ARRAY(binary()), #{ + desc => ?DESC(topics), required => true, validator => fun topics_validator/1 + })}, + {pool_size, ?HOCON(pos_integer(), #{default => 8, desc => ?DESC(pool_size)})}, + {retry_interval, + mk_duration( + "MQTT Message retry interval. Delay for the link to retry sending the QoS1/QoS2 " + "messages in case of ACK not received.", + #{default => <<"15s">>} + )}, + {max_inflight, + ?HOCON( + non_neg_integer(), + #{ + default => 32, + desc => ?DESC("max_inflight") + } + )}, + {resource_opts, + ?HOCON( + ?R_REF(?MODULE, "creation_opts"), + #{ + required => false, + default => #{}, + desc => ?DESC(emqx_resource_schema, <<"resource_opts">>) + } + )}, %% Must not be configured manually. 
The value is incremented by cluster link config handler %% and is used as a globally synchronized sequence to ensure persistent routes actors have %% the same next incarnation after each config change. {ps_actor_incarnation, ?HOCON(integer(), #{default => 0, importance => ?IMPORTANCE_HIDDEN})} - ]. + ]; +fields("creation_opts") -> + Opts = emqx_resource_schema:fields("creation_opts"), + [O || {Field, _} = O <- Opts, not is_hidden_res_opt(Field)]. +desc("links") -> + ?DESC("links"); +desc("link") -> + ?DESC("link"); +desc("creation_opts" = Name) -> + emqx_resource_schema:desc(Name); desc(_) -> - "todo". + undefined. + +is_hidden_res_opt(Field) -> + lists:member( + Field, + [start_after_created, query_mode, enable_batch, batch_size, batch_time] + ). %% TODO: check that no link name equals local cluster name, %% but this may be tricky since the link config is injected into cluster config (emqx_conf_schema). diff --git a/apps/emqx_conf/src/emqx_conf_schema.erl b/apps/emqx_conf/src/emqx_conf_schema.erl index b4c59d291..64d341bce 100644 --- a/apps/emqx_conf/src/emqx_conf_schema.erl +++ b/apps/emqx_conf/src/emqx_conf_schema.erl @@ -82,7 +82,8 @@ connection_rejected_due_to_license_limit_reached, dropped_msg_due_to_mqueue_is_full, socket_receive_paused_by_rate_limit, - data_bridge_buffer_overflow + data_bridge_buffer_overflow, + external_broker_crashed ]). %% Callback to upgrade config after loaded from config file but before validation. diff --git a/apps/emqx_utils/src/emqx_utils.erl b/apps/emqx_utils/src/emqx_utils.erl index 8f41a4919..a6efcb443 100644 --- a/apps/emqx_utils/src/emqx_utils.erl +++ b/apps/emqx_utils/src/emqx_utils.erl @@ -565,10 +565,10 @@ tcp_keepalive_opts(OS, _Idle, _Interval, _Probes) -> {error, {unsupported_os, OS}}. format(Term) -> - iolist_to_binary(io_lib:format("~0p", [Term])). + unicode:characters_to_binary(io_lib:format("~0p", [Term])). format(Fmt, Args) -> - iolist_to_binary(io_lib:format(Fmt, Args)). 
+ unicode:characters_to_binary(io_lib:format(Fmt, Args)). %% @doc Helper function for log formatters. -spec format_mfal(map(), map()) -> undefined | binary(). diff --git a/rel/i18n/emqx_cluster_link_schema.hocon b/rel/i18n/emqx_cluster_link_schema.hocon new file mode 100644 index 000000000..77e4987f7 --- /dev/null +++ b/rel/i18n/emqx_cluster_link_schema.hocon @@ -0,0 +1,53 @@ +emqx_cluster_link_schema { + +links.desc: +"""The list of the linked EMQX clusters.""" +links.label: "Cluster Links" + +link.desc: +"""Cluster link configuration""" +link.label: "Cluster Link" + +enable.desc: +"""Enable or disable a cluster link. The link is enabled by default, disabling it allows stopping the link without removing its configuration. The link must be enabled on both sides to be operational. Disabling the link should also be done on both clusters in order to free up all associated resources.""" +enable.label: "Enable" + +upstream.desc: +"""Upstream cluster name. Must be exactly equal to the value of `cluster.name` configured at the remote cluster. Must not be equal to the local cluster.name. All configured cluster link upstream names must be unique.""" +upstream.label: "Upstream Name" + +server.desc: +"""MQTT host and port of the remote EMQX broker.""" +server.label: "MQTT Server" + +username.desc: +"""Optional MQTT username for connecting to the remote EMQX cluster.""" +username.label: "Username" + +password.desc: +"""Optional MQTT password for connecting to the remote EMQX cluster.""" +password.label: "Password" + +clientid.desc: +"""Optional Base MQTT client ID for connecting to the remote EMQX cluster. If omitted, local `cluster.name` is used. 
EMQX maintains several connections between linked clusters, so distinct suffixes are automatically appended to the base client ID.""" +clientid.label: "Base Client ID" + +ssl.desc: """SSL configuration for connecting to the remote EMQX cluster.""" +ssl.label: "SSL Options" + +topics.desc: """MQTT topics to be forwarded by the linked remote EMQX broker to the local broker. Messages are only forwarded if the local EMQX broker has matching subscriber(s). +Wildcards are supported. Setting empty topics list on one side of the link can be used to establish unidirectional links: the side with the empty topics won't receive remote messages, but it can forward relevant messages to its linked counterpart (according to the topics configured on that side of the link).""" +topics.label: "Topics" + +pool_size.desc: +"""Size of the pool of MQTT clients that will publish messages to the linked EMQX broker.""" + +pool_size.label: +"""Connection Pool Size""" + +max_inflight.desc: +"""Max inflight (sent, but un-acked) messages of the MQTT protocol""" + +max_inflight.label: +"""Max Inflight Message""" +} From 5304ca156385da9e3362cf95a8b7e1a72bdfe56d Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 7 Jun 2024 16:07:40 +0300 Subject: [PATCH 39/46] fix(clusterlink): add emqx_cluster_link app to mix.exs/rebar.config.erl --- mix.exs | 3 ++- rebar.config.erl | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index eb2bda31f..06399edfe 100644 --- a/mix.exs +++ b/mix.exs @@ -204,7 +204,8 @@ defmodule EMQXUmbrella.MixProject do :emqx_gateway_jt808, :emqx_bridge_syskeeper, :emqx_ds_shared_sub, - :emqx_auth_ext + :emqx_auth_ext, + :emqx_cluster_link ]) end diff --git a/rebar.config.erl b/rebar.config.erl index 1e507aa18..493955670 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -123,6 +123,7 @@ is_community_umbrella_app("apps/emqx_eviction_agent") -> false; is_community_umbrella_app("apps/emqx_node_rebalance") -> false; 
is_community_umbrella_app("apps/emqx_ds_shared_sub") -> false; is_community_umbrella_app("apps/emqx_auth_ext") -> false; +is_community_umbrella_app("apps/emqx_cluster_link") -> false; is_community_umbrella_app(_) -> true. %% BUILD_WITHOUT_JQ From d282c61120dd1bc9fd019713c3962c7be2aea667 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 7 Jun 2024 19:17:01 +0300 Subject: [PATCH 40/46] feat(clusterlink): update only necessary resources when a link config is changed --- .../src/emqx_cluster_link_config.erl | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 28344cd7e..c28755ac1 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -19,6 +19,13 @@ -define(DEFAULT_ACTOR_TTL, 3_000). -endif. +-define(COMMON_FIELDS, [username, password, clientid, server, ssl]). +%% NOTE: retry_interval, max_inflight may be used for router syncer client as well, +%% but for now they are not. +-define(MSG_RES_FIELDS, [resource_opts, pool_size, retry_interval, max_inflight]). +%% Excludes a special hidden `ps_actor_incarnation` field. +-define(ACTOR_FIELDS, [topics]). + -export([ %% General update/1, @@ -196,19 +203,43 @@ update_links(LinksConf) -> [update_link(Link) || Link <- LinksConf]. update_link({OldLinkConf, #{enable := true, upstream := Name} = NewLinkConf}) -> - _ = ensure_actor_stopped(Name), - {ok, _Pid} = emqx_cluster_link_sup:ensure_actor(NewLinkConf), - %% TODO: if only msg_fwd resource related config is changed, - %% we can skip actor reincarnation/restart. 
- ok = update_msg_fwd_resource(OldLinkConf, NewLinkConf), - ok; + case what_is_changed(OldLinkConf, NewLinkConf) of + both -> + _ = ensure_actor_stopped(Name), + {ok, _Pid} = emqx_cluster_link_sup:ensure_actor(NewLinkConf), + ok = update_msg_fwd_resource(OldLinkConf, NewLinkConf); + actor -> + _ = ensure_actor_stopped(Name), + {ok, _Pid} = emqx_cluster_link_sup:ensure_actor(NewLinkConf), + ok; + msg_resource -> + ok = update_msg_fwd_resource(OldLinkConf, NewLinkConf) + end; update_link({_OldLinkConf, #{enable := false, upstream := Name} = _NewLinkConf}) -> _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), ensure_actor_stopped(Name). -update_msg_fwd_resource(#{pool_size := Old}, #{pool_size := Old} = NewConf) -> - {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(NewConf), - ok; +what_is_changed(OldLink, NewLink) -> + CommonChanged = are_fields_changed(?COMMON_FIELDS, OldLink, NewLink), + ActorChanged = are_fields_changed(?ACTOR_FIELDS, OldLink, NewLink), + MsgResChanged = are_fields_changed(?MSG_RES_FIELDS, OldLink, NewLink), + AllChanged = ActorChanged andalso MsgResChanged, + case CommonChanged orelse AllChanged of + true -> + both; + false -> + %% This function is only applicable when it's certain that link conf is changed, + %% so if resource fields are the same, + %% then some other actor-related fields are definitely changed. + case MsgResChanged of + true -> msg_resource; + false -> actor + end + end. + +are_fields_changed(Fields, OldLink, NewLink) -> + maps:with(Fields, OldLink) =/= maps:with(Fields, NewLink). 
+ update_msg_fwd_resource(_, #{upstream := Name} = NewConf) -> _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(NewConf), From ed16ff07dfce00d3cc10a490b9e50f28514014fb Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 5 Jun 2024 18:21:22 +0200 Subject: [PATCH 41/46] refactor(broker): simplify external broker behaviour --- apps/emqx/src/emqx_broker.erl | 23 +++---------- apps/emqx/src/emqx_external_broker.erl | 25 +++++--------- .../src/emqx_cluster_link.erl | 34 +++++++++++-------- 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/apps/emqx/src/emqx_broker.erl b/apps/emqx/src/emqx_broker.erl index df6898470..8fa011429 100644 --- a/apps/emqx/src/emqx_broker.erl +++ b/apps/emqx/src/emqx_broker.erl @@ -246,7 +246,8 @@ publish(Msg) when is_record(Msg, message) -> []; Msg1 = #message{} -> do_publish(Msg1); - Msgs when is_list(Msgs) -> do_publish_many(Msgs) + Msgs when is_list(Msgs) -> + do_publish_many(Msgs) end. do_publish_many([]) -> @@ -259,7 +260,7 @@ do_publish(#message{topic = Topic} = Msg) -> Routes = aggre(emqx_router:match_routes(Topic)), Delivery = delivery(Msg), RouteRes = route(Routes, Delivery, PersistRes), - ext_route(ext_routes(Topic, Msg), Delivery, RouteRes). + do_forward_external(Delivery, RouteRes). persist_publish(Msg) -> case emqx_persistent_message:persist(Msg) of @@ -344,22 +345,8 @@ aggre([], false, Acc) -> aggre([], true, Acc) -> lists:usort(Acc). -ext_routes(Topic, Msg) -> - case emqx_external_broker:should_route_to_external_dests(Msg) of - true -> emqx_external_broker:match_routes(Topic); - false -> [] - end. - -ext_route([], _Delivery, RouteRes) -> - RouteRes; -ext_route(ExtRoutes, Delivery, RouteRes) -> - lists:foldl( - fun(#route{topic = To, dest = ExtDest}, Acc) -> - [{ExtDest, To, emqx_external_broker:forward(ExtDest, Delivery)} | Acc] - end, - RouteRes, - ExtRoutes - ). 
+do_forward_external(Delivery, RouteRes) -> + emqx_external_broker:forward(Delivery) ++ RouteRes. %% @doc Forward message to another node. -spec forward( diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index ebcd48994..253d73edd 100644 --- a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -16,11 +16,9 @@ -module(emqx_external_broker). --callback forward(dest(), emqx_types:delivery()) -> +-callback forward(emqx_types:delivery()) -> emqx_types:deliver_result(). --callback should_route_to_external_dests(emqx_types:message()) -> boolean(). - -callback add_route(emqx_types:topic()) -> ok. -callback delete_route(emqx_types:topic()) -> ok. @@ -30,23 +28,22 @@ -callback add_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. -callback delete_persistent_route(emqx_types:topic(), emqx_persistent_session_ds:id()) -> ok. --callback match_routes(emqx_types:topic()) -> [emqx_types:route()]. - -type dest() :: term(). -export([ + %% Registration provider/0, register_provider/1, unregister_provider/1, - forward/2, - should_route_to_external_dests/1, + %% Forwarding + forward/1, + %% Routing updates add_route/1, delete_route/1, add_shared_route/2, delete_shared_route/2, add_persistent_route/2, - delete_persistent_route/2, - match_routes/1 + delete_persistent_route/2 ]). -export_type([dest/0]). @@ -111,11 +108,8 @@ provider() -> %% Broker API %%-------------------------------------------------------------------- -forward(ExternalDest, Delivery) -> - ?safe_with_provider(?FUNCTION_NAME(ExternalDest, Delivery), {error, unknown_dest}). - -should_route_to_external_dests(Message) -> - ?safe_with_provider(?FUNCTION_NAME(Message), false). +forward(Delivery) -> + ?safe_with_provider(?FUNCTION_NAME(Delivery), []). add_route(Topic) -> ?safe_with_provider(?FUNCTION_NAME(Topic), ok). 
@@ -135,9 +129,6 @@ add_persistent_route(Topic, ID) -> delete_persistent_route(Topic, ID) -> ?safe_with_provider(?FUNCTION_NAME(Topic, ID), ok). -match_routes(Topic) -> - ?safe_with_provider(?FUNCTION_NAME(Topic), ok). - %%-------------------------------------------------------------------- %% Internal functions %%-------------------------------------------------------------------- diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index cdfe22f3d..19211cb56 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -16,9 +16,7 @@ delete_shared_route/2, add_persistent_route/2, delete_persistent_route/2, - match_routes/1, - forward/2, - should_route_to_external_dests/1 + forward/1 ]). %% emqx hooks @@ -73,18 +71,26 @@ add_persistent_route(Topic, ID) -> delete_persistent_route(Topic, ID) -> maybe_push_route_op(delete, Topic, ?PERSISTENT_ROUTE_ID(Topic, ID), push_persistent_route). -forward(DestCluster, Delivery) -> - emqx_cluster_link_mqtt:forward(DestCluster, Delivery). +forward(#delivery{message = #message{extra = #{link_origin := _}}}) -> + %% Do not forward any external messages to other links. + %% Only forward locally originated messages to all the relevant links, i.e. no gossip + %% message forwarding. + []; +forward(Delivery = #delivery{message = #message{topic = Topic}}) -> + Routes = emqx_cluster_link_extrouter:match_routes(Topic), + forward(Routes, Delivery). -match_routes(Topic) -> - emqx_cluster_link_extrouter:match_routes(Topic). - -%% Do not forward any external messages to other links. -%% Only forward locally originated messages to all the relevant links, i.e. no gossip message forwarding. -should_route_to_external_dests(#message{extra = #{link_origin := _}}) -> - false; -should_route_to_external_dests(_Msg) -> - true. 
+forward([], _Delivery) -> + []; +forward(Routes, Delivery) -> + lists:foldl( + fun(#route{topic = To, dest = Cluster}, Acc) -> + Result = emqx_cluster_link_mqtt:forward(Cluster, Delivery), + [{Cluster, To, Result} | Acc] + end, + [], + Routes + ). %%-------------------------------------------------------------------- %% EMQX Hooks From 179870c573f835df4c1245af9c68b7dd79dc637b Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Fri, 7 Jun 2024 18:55:16 +0200 Subject: [PATCH 42/46] chore: remove author-specific gitignore stuff Which was accidentally committed. --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index e97571448..8d95669ac 100644 --- a/.gitignore +++ b/.gitignore @@ -79,7 +79,3 @@ rebar-git-cache.tar apps/emqx_utils/src/emqx_variform_parser.erl apps/emqx_utils/src/emqx_variform_scan.erl default-profile.mk -# local -/_compat -/scratch -SCRATCH From 00f912928f0c0c2e3825d6cc0a6810097b0df987 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Mon, 10 Jun 2024 19:20:21 +0300 Subject: [PATCH 43/46] fix: fix emqx_external_broker:forward/1 spec --- apps/emqx/src/emqx_external_broker.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx/src/emqx_external_broker.erl b/apps/emqx/src/emqx_external_broker.erl index 253d73edd..fe360a5b8 100644 --- a/apps/emqx/src/emqx_external_broker.erl +++ b/apps/emqx/src/emqx_external_broker.erl @@ -17,7 +17,7 @@ -module(emqx_external_broker). -callback forward(emqx_types:delivery()) -> - emqx_types:deliver_result(). + emqx_types:publish_result(). -callback add_route(emqx_types:topic()) -> ok. -callback delete_route(emqx_types:topic()) -> ok. 
From 44c37571cc4b867a236830a7ce65554845f30416 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 12 Jun 2024 15:33:35 +0300 Subject: [PATCH 44/46] fix(clusterlink): ignore not_registered error --- apps/emqx_cluster_link/src/emqx_cluster_link_config.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index c28755ac1..2b5dea2e8 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -164,7 +164,7 @@ toggle_hook_and_broker([_ | _] = _NewEnabledLinks, [] = _OldEnabledLinks) -> ok = emqx_cluster_link:register_external_broker(), ok = emqx_cluster_link:put_hook(); toggle_hook_and_broker([] = _NewEnabledLinks, _OldLinks) -> - ok = emqx_cluster_link:unregister_external_broker(), + _ = emqx_cluster_link:unregister_external_broker(), ok = emqx_cluster_link:delete_hook(); toggle_hook_and_broker(_, _) -> ok. 
From a95a08efd3f22ec0273cbb1daf291477c13fc400 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 12 Jun 2024 15:34:56 +0300 Subject: [PATCH 45/46] test(clusterlink): add more test cases --- .../src/emqx_cluster_link.erl | 2 +- .../src/emqx_cluster_link_router_syncer.erl | 77 ++- .../test/emqx_cluster_link_SUITE.erl | 38 +- .../test/emqx_cluster_link_api_SUITE.erl | 132 ++++ .../test/emqx_cluster_link_config_SUITE.erl | 647 ++++++++++++++++++ .../emqx_cluster_link_extrouter_SUITE.erl | 3 +- 6 files changed, 871 insertions(+), 28 deletions(-) create mode 100644 apps/emqx_cluster_link/test/emqx_cluster_link_api_SUITE.erl create mode 100644 apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index 19211cb56..d91b33c3a 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -205,7 +205,7 @@ actor_init( {error, <<"bad_remote_cluster_link_name">>} end; #{enable := false} -> - {error, <<"clster_link_disabled">>} + {error, <<"cluster_link_disabled">>} end. actor_init_ack(#{actor := Actor}, Res, MsgIn) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index fdcbd91c7..b7d165419 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -16,6 +16,13 @@ push_persistent_route/4 ]). +%% debug/test helpers +-export([ + status/1, + where/1, + where/2 +]). + -export([ start_link_actor/4, start_link_syncer/4 @@ -46,8 +53,8 @@ -define(CLIENT_NAME(Cluster), ?NAME(Cluster, client)). -define(SYNCER_NAME(Cluster), ?NAME(Cluster, syncer)). -define(SYNCER_REF(Cluster), {via, gproc, ?SYNCER_NAME(Cluster)}). --define(ACTOR_REF(Cluster), {via, gproc, ?NAME(Cluster, actor)}). 
-define(ACTOR_NAME(Cluster), ?NAME(Cluster, actor)). +-define(ACTOR_REF(Cluster), {via, gproc, ?ACTOR_NAME(Cluster)}). -define(MAX_BATCH_SIZE, 4000). -define(MIN_SYNC_INTERVAL, 10). @@ -85,6 +92,22 @@ end ). +-record(st, { + target :: binary(), + actor :: binary(), + incarnation :: non_neg_integer(), + client :: undefined | pid(), + bootstrapped :: boolean(), + reconnect_timer :: undefined | reference(), + heartbeat_timer :: undefined | reference(), + actor_init_req_id :: undefined | binary(), + actor_init_timer :: undefined | reference(), + remote_actor_info :: undefined | map(), + status :: connecting | connected | disconnected, + error :: undefined | term(), + link_conf :: map() +}). + push(TargetCluster, OpName, Topic, ID) -> do_push(?SYNCER_NAME(TargetCluster), OpName, Topic, ID). @@ -99,6 +122,24 @@ do_push(SyncerName, OpName, Topic, ID) -> dropped end. +%% Debug/test helpers +where(Cluster) -> + where(actor, Cluster). + +where(actor, Cluster) -> + gproc:where(?ACTOR_NAME(Cluster)); +where(ps_actor, Cluster) -> + gproc:where(?PS_ACTOR_NAME(Cluster)). + +status(Cluster) -> + case where(actor, Cluster) of + Pid when is_pid(Pid) -> + #st{error = Err, status = Status} = sys:get_state(Pid), + #{error => Err, status => Status}; + undefined -> + undefined + end. + %% Supervisor: %% 1. Actor + MQTT Client %% 2. Syncer @@ -290,24 +331,6 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> type => worker }. -%% - --record(st, { - target :: binary(), - actor :: binary(), - incarnation :: non_neg_integer(), - client :: undefined | pid(), - bootstrapped :: boolean(), - reconnect_timer :: undefined | reference(), - heartbeat_timer :: undefined | reference(), - actor_init_req_id :: undefined | binary(), - actor_init_timer :: undefined | reference(), - remote_actor_info :: undefined | map(), - status :: connecting | connected | disconnected, - error :: undefined | term(), - link_conf :: map() -}). 
- mk_state(#{upstream := TargetCluster} = LinkConf, Actor, Incarnation) -> #st{ target = TargetCluster, @@ -361,6 +384,12 @@ handle_info( remote_link_proto_ver => maps:get(proto_ver, AckInfoMap, undefined) }), _ = maybe_alarm(Reason, St1), + ?tp( + debug, + clink_handshake_error, + #{actor => {St1#st.actor, St1#st.incarnation}, reason => Reason} + ), + %% TODO: retry after a timeout? {noreply, St1#st{error = Reason, status = disconnected}} end; handle_info({publish, #{}}, St) -> @@ -376,7 +405,7 @@ handle_info({timeout, TRef, actor_reinit}, St = #st{actor_init_timer = TRef}) -> Reason = init_timeout, _ = maybe_alarm(Reason, St), {noreply, - init_remote_actor(St#st{reconnect_timer = undefined, status = disconnected, error = Reason})}; + init_remote_actor(St#st{actor_init_timer = undefined, status = disconnected, error = Reason})}; handle_info({timeout, TRef, _Heartbeat}, St = #st{heartbeat_timer = TRef}) -> {noreply, process_heartbeat(St#st{heartbeat_timer = undefined})}; %% Stale timeout. @@ -386,7 +415,8 @@ handle_info(Info, St) -> ?SLOG(warning, #{msg => "unexpected_info", info => Info}), {noreply, St}. -terminate(_Reason, _State) -> +terminate(_Reason, State) -> + _ = maybe_deactivate_alarm(State), ok. process_connect(St = #st{target = TargetCluster, actor = Actor, link_conf = Conf}) -> @@ -507,6 +537,11 @@ run_bootstrap(St = #st{target = TargetCluster, link_conf = #{topics := Topics}}) run_bootstrap(Bootstrap, St) -> case emqx_cluster_link_router_bootstrap:next_batch(Bootstrap) of done -> + ?tp( + debug, + clink_route_bootstrap_complete, + #{actor => {St#st.actor, St#st.incarnation}, cluster => St#st.target} + ), process_bootstrapped(St); {Batch, NBootstrap} -> %% TODO: Better error handling. 
diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl index 922af832f..e38cd3999 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -15,7 +15,17 @@ %% all() -> - emqx_common_test_helpers:all(?MODULE). + [ + {group, shared_subs}, + {group, non_shared_subs} + ]. + +groups() -> + AllTCs = emqx_common_test_helpers:all(?MODULE), + [ + {shared_subs, AllTCs}, + {non_shared_subs, AllTCs} + ]. init_per_suite(Config) -> Config. @@ -23,6 +33,14 @@ init_per_suite(Config) -> end_per_suite(_Config) -> ok. +init_per_group(shared_subs, Config) -> + [{is_shared_sub, true} | Config]; +init_per_group(non_shared_subs, Config) -> + [{is_shared_sub, false} | Config]. + +end_per_group(_Group, _Config) -> + ok. + init_per_testcase(TCName, Config) -> emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config). @@ -136,11 +154,14 @@ t_message_forwarding('end', Config) -> t_message_forwarding(Config) -> [SourceNode1 | _] = nodes_source(Config), [TargetNode1, TargetNode2 | _] = nodes_target(Config), + SourceC1 = start_client("t_message_forwarding", SourceNode1), TargetC1 = start_client("t_message_forwarding1", TargetNode1), TargetC2 = start_client("t_message_forwarding2", TargetNode2), - {ok, _, _} = emqtt:subscribe(TargetC1, <<"t/+">>, qos1), - {ok, _, _} = emqtt:subscribe(TargetC2, <<"t/#">>, qos1), + IsShared = ?config(is_shared_sub, Config), + + {ok, _, _} = emqtt:subscribe(TargetC1, maybe_shared_topic(IsShared, <<"t/+">>), qos1), + {ok, _, _} = emqtt:subscribe(TargetC2, maybe_shared_topic(IsShared, <<"t/#">>), qos1), {ok, _} = ?block_until(#{?snk_kind := clink_route_sync_complete}), {ok, _} = emqtt:publish(SourceC1, <<"t/42">>, <<"hello">>, qos1), ?assertReceive( @@ -178,8 +199,10 @@ t_target_extrouting_gc(Config) -> SourceC1 = start_client("t_target_extrouting_gc", SourceNode1), TargetC1 = 
start_client_unlink("t_target_extrouting_gc1", TargetNode1), TargetC2 = start_client_unlink("t_target_extrouting_gc2", TargetNode2), - {ok, _, _} = emqtt:subscribe(TargetC1, <<"t/#">>, qos1), - {ok, _, _} = emqtt:subscribe(TargetC2, <<"t/+">>, qos1), + IsShared = ?config(is_shared_sub, Config), + + {ok, _, _} = emqtt:subscribe(TargetC1, maybe_shared_topic(IsShared, <<"t/#">>), qos1), + {ok, _, _} = emqtt:subscribe(TargetC2, maybe_shared_topic(IsShared, <<"t/+">>), qos1), {ok, _} = ?block_until(#{?snk_kind := clink_route_sync_complete}), {ok, _} = emqtt:publish(SourceC1, <<"t/1">>, <<"HELLO1">>, qos1), {ok, _} = emqtt:publish(SourceC1, <<"t/2/ext">>, <<"HELLO2">>, qos1), @@ -232,6 +255,11 @@ t_target_extrouting_gc(Config) -> %% +maybe_shared_topic(true = _IsShared, Topic) -> + <<"$share/test-group/", Topic/binary>>; +maybe_shared_topic(false = _IsShared, Topic) -> + Topic. + start_client_unlink(ClientId, Node) -> Client = start_client(ClientId, Node), _ = erlang:unlink(Client), diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_api_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_api_SUITE.erl new file mode 100644 index 000000000..c5ec8da6c --- /dev/null +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_api_SUITE.erl @@ -0,0 +1,132 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_api_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-define(API_PATH, emqx_mgmt_api_test_util:api_path(["cluster", "links"])). +-define(CONF_PATH, [cluster, links]). 
+ +-define(CACERT, << + "-----BEGIN CERTIFICATE-----\n" + "MIIDUTCCAjmgAwIBAgIJAPPYCjTmxdt/MA0GCSqGSIb3DQEBCwUAMD8xCzAJBgNV\n" + "BAYTAkNOMREwDwYDVQQIDAhoYW5nemhvdTEMMAoGA1UECgwDRU1RMQ8wDQYDVQQD\n" + "DAZSb290Q0EwHhcNMjAwNTA4MDgwNjUyWhcNMzAwNTA2MDgwNjUyWjA/MQswCQYD\n" + "VQQGEwJDTjERMA8GA1UECAwIaGFuZ3pob3UxDDAKBgNVBAoMA0VNUTEPMA0GA1UE\n" + "AwwGUm9vdENBMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAzcgVLex1\n" + "EZ9ON64EX8v+wcSjzOZpiEOsAOuSXOEN3wb8FKUxCdsGrsJYB7a5VM/Jot25Mod2\n" + "juS3OBMg6r85k2TWjdxUoUs+HiUB/pP/ARaaW6VntpAEokpij/przWMPgJnBF3Ur\n" + "MjtbLayH9hGmpQrI5c2vmHQ2reRZnSFbY+2b8SXZ+3lZZgz9+BaQYWdQWfaUWEHZ\n" + "uDaNiViVO0OT8DRjCuiDp3yYDj3iLWbTA/gDL6Tf5XuHuEwcOQUrd+h0hyIphO8D\n" + "tsrsHZ14j4AWYLk1CPA6pq1HIUvEl2rANx2lVUNv+nt64K/Mr3RnVQd9s8bK+TXQ\n" + "KGHd2Lv/PALYuwIDAQABo1AwTjAdBgNVHQ4EFgQUGBmW+iDzxctWAWxmhgdlE8Pj\n" + "EbQwHwYDVR0jBBgwFoAUGBmW+iDzxctWAWxmhgdlE8PjEbQwDAYDVR0TBAUwAwEB\n" + "/zANBgkqhkiG9w0BAQsFAAOCAQEAGbhRUjpIred4cFAFJ7bbYD9hKu/yzWPWkMRa\n" + "ErlCKHmuYsYk+5d16JQhJaFy6MGXfLgo3KV2itl0d+OWNH0U9ULXcglTxy6+njo5\n" + "CFqdUBPwN1jxhzo9yteDMKF4+AHIxbvCAJa17qcwUKR5MKNvv09C6pvQDJLzid7y\n" + "E2dkgSuggik3oa0427KvctFf8uhOV94RvEDyqvT5+pgNYZ2Yfga9pD/jjpoHEUlo\n" + "88IGU8/wJCx3Ds2yc8+oBg/ynxG8f/HmCC1ET6EHHoe2jlo8FpU/SgGtghS1YL30\n" + "IWxNsPrUP+XsZpBJy/mvOhE5QXo6Y35zDqqj8tI7AGmAWu22jg==\n" + "-----END CERTIFICATE-----" +>>). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + %% This is called by emqx_machine in EMQX release + emqx_otel_app:configure_otel_deps(), + Apps = emqx_cth_suite:start( + [ + emqx_conf, + emqx_management, + {emqx_dashboard, "dashboard.listeners.http { enable = true, bind = 18083 }"}, + emqx_cluster_link + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + Auth = auth_header(), + [{suite_apps, Apps}, {auth, Auth} | Config]. + +end_per_suite(Config) -> + emqx_cth_suite:stop(?config(suite_apps, Config)), + emqx_config:delete_override_conf_files(), + ok. 
+ +auth_header() -> + {ok, API} = emqx_common_test_http:create_default_app(), + emqx_common_test_http:auth_header(API). + +init_per_testcase(_TC, Config) -> + {ok, _} = emqx_cluster_link_config:update([]), + Config. + +end_per_testcase(_TC, _Config) -> + ok. + +t_put_get_valid(Config) -> + Auth = ?config(auth, Config), + Path = ?API_PATH, + {ok, Resp} = emqx_mgmt_api_test_util:request_api(get, Path, Auth), + ?assertMatch([], emqx_utils_json:decode(Resp)), + + Link1 = #{ + <<"pool_size">> => 1, + <<"server">> => <<"emqxcl_2.nohost:31883">>, + <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], + <<"name">> => <<"emqcl_1">> + }, + Link2 = #{ + <<"pool_size">> => 1, + <<"server">> => <<"emqxcl_2.nohost:41883">>, + <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], + <<"name">> => <<"emqcl_2">> + }, + ?assertMatch({ok, _}, emqx_mgmt_api_test_util:request_api(put, Path, "", Auth, [Link1, Link2])), + + {ok, Resp1} = emqx_mgmt_api_test_util:request_api(get, Path, Auth), + ?assertMatch([Link1, Link2], emqx_utils_json:decode(Resp1)), + + DisabledLink1 = Link1#{<<"enable">> => false}, + ?assertMatch( + {ok, _}, emqx_mgmt_api_test_util:request_api(put, Path, "", Auth, [DisabledLink1, Link2]) + ), + + {ok, Resp2} = emqx_mgmt_api_test_util:request_api(get, Path, Auth), + ?assertMatch([DisabledLink1, Link2], emqx_utils_json:decode(Resp2)), + + SSL = #{<<"enable">> => true, <<"cacertfile">> => ?CACERT}, + SSLLink1 = Link1#{<<"ssl">> => SSL}, + ?assertMatch( + {ok, _}, emqx_mgmt_api_test_util:request_api(put, Path, "", Auth, [Link2, SSLLink1]) + ), + {ok, Resp3} = emqx_mgmt_api_test_util:request_api(get, Path, Auth), + + ?assertMatch( + [Link2, #{<<"ssl">> := #{<<"enable">> := true, <<"cacertfile">> := _Path}}], + emqx_utils_json:decode(Resp3) + ). 
+ +t_put_invalid(Config) -> + Auth = ?config(auth, Config), + Path = ?API_PATH, + Link = #{ + <<"pool_size">> => 1, + <<"server">> => <<"emqxcl_2.nohost:31883">>, + <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], + <<"name">> => <<"emqcl_1">> + }, + ?assertMatch( + {error, {_, 400, _}}, emqx_mgmt_api_test_util:request_api(put, Path, "", Auth, [Link, Link]) + ), + ?assertMatch( + {error, {_, 400, _}}, + emqx_mgmt_api_test_util:request_api(put, Path, "", Auth, [maps:remove(<<"name">>, Link)]) + ). diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl new file mode 100644 index 000000000..e09f12ce4 --- /dev/null +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl @@ -0,0 +1,647 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_cluster_link_config_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("emqx/include/asserts.hrl"). +-include_lib("emqx_utils/include/emqx_message.hrl"). + +-compile(export_all). +-compile(nowarn_export_all). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + Config. + +end_per_suite(_Config) -> + ok. + +init_per_testcase(TCName, Config) -> + emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config). + +end_per_testcase(TCName, Config) -> + emqx_common_test_helpers:end_per_testcase(?MODULE, TCName, Config). 
+ +mk_clusters(NameA, NameB, PortA, PortB, ConfA, ConfB, Config) -> + AppsA = [{emqx_conf, ConfA}, emqx_cluster_link], + AppsA1 = [ + {emqx_conf, combine([ConfA, conf_mqtt_listener(PortA)])}, + emqx_cluster_link + ], + AppsB = [{emqx_conf, ConfB}, emqx_cluster_link], + AppsB1 = [ + {emqx_conf, combine([ConfB, conf_mqtt_listener(PortB)])}, + emqx_cluster_link + ], + + NodesA = emqx_cth_cluster:mk_nodespecs( + [ + {mk_nodename(NameA, 1), #{apps => AppsA}}, + {mk_nodename(NameA, 2), #{apps => AppsA}}, + {mk_nodename(NameA, 3), #{apps => AppsA1, role => replicant}} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + NodesB = emqx_cth_cluster:mk_nodespecs( + [ + {mk_nodename(NameB, 1), #{apps => AppsB, base_port => 20100}}, + {mk_nodename(NameB, 2), #{apps => AppsB1, base_port => 20200}} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + {NodesA, NodesB}. + +t_config_update('init', Config) -> + NameA = fmt("~s_~s", [?FUNCTION_NAME, "a"]), + NameB = fmt("~s_~s", [?FUNCTION_NAME, "b"]), + LPortA = 31883, + LPortB = 41883, + ConfA = combine([conf_cluster(NameA), conf_log()]), + ConfB = combine([conf_cluster(NameB), conf_log()]), + {NodesA, NodesB} = mk_clusters(NameA, NameB, LPortA, LPortB, ConfA, ConfB, Config), + ClusterA = emqx_cth_cluster:start(NodesA), + ClusterB = emqx_cth_cluster:start(NodesB), + ok = snabbkaffe:start_trace(), + [ + {cluster_a, ClusterA}, + {cluster_b, ClusterB}, + {lport_a, LPortA}, + {lport_b, LPortB}, + {name_a, NameA}, + {name_b, NameB} + | Config + ]; +t_config_update('end', Config) -> + ok = snabbkaffe:stop(), + ok = emqx_cth_cluster:stop(?config(cluster_a, Config)), + ok = emqx_cth_cluster:stop(?config(cluster_b, Config)). 
+ +t_config_update(Config) -> + [NodeA1, _, _] = ?config(cluster_a, Config), + [NodeB1, _] = ?config(cluster_b, Config), + LPortA = ?config(lport_a, Config), + LPortB = ?config(lport_b, Config), + NameA = ?config(name_a, Config), + NameB = ?config(name_b, Config), + + ClientA = start_client("t_config_a", NodeA1), + ClientB = start_client("t_config_b", NodeB1), + + {ok, _, _} = emqtt:subscribe(ClientA, <<"t/test/1/+">>, qos1), + {ok, _, _} = emqtt:subscribe(ClientB, <<"t/test-topic">>, qos1), + + %% add link + LinkConfA = #{ + <<"enable">> => true, + <<"pool_size">> => 1, + <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>, + <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], + <<"upstream">> => NameB + }, + LinkConfB = #{ + <<"enable">> => true, + <<"pool_size">> => 1, + <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>, + <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], + <<"upstream">> => NameA + }, + + {ok, SubRef} = snabbkaffe:subscribe( + ?match_event(#{?snk_kind := clink_route_bootstrap_complete}), + %% 5 nodes = 5 actors (durable storage is disabled) + 5, + 30_000 + ), + ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA]])), + ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[LinkConfB]])), + + ?assertMatch( + {ok, [ + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete} + ]}, + snabbkaffe:receive_events(SubRef) + ), + + {ok, _} = emqtt:publish(ClientA, <<"t/test-topic">>, <<"hello-from-a">>, qos1), + {ok, _} = emqtt:publish(ClientB, <<"t/test/1/1">>, <<"hello-from-b">>, qos1), + + ?assertReceive( + {publish, #{ + topic := <<"t/test-topic">>, payload := <<"hello-from-a">>, client_pid := ClientB + }}, + 7000 + ), + ?assertReceive( + {publish, #{ + topic := 
<<"t/test/1/1">>, payload := <<"hello-from-b">>, client_pid := ClientA + }}, + 7000 + ), + %% no more messages expected + ?assertNotReceive({publish, _Message = #{}}), + + {ok, SubRef1} = snabbkaffe:subscribe( + ?match_event(#{?snk_kind := clink_route_bootstrap_complete}), + %% 3 nodes in cluster a + 3, + 30_000 + ), + + %% update link + LinkConfA1 = LinkConfA#{<<"pool_size">> => 2, <<"topics">> => [<<"t/new/+">>]}, + ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA1]])), + + ?assertMatch( + {ok, [ + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete}, + #{?snk_kind := clink_route_bootstrap_complete} + ]}, + snabbkaffe:receive_events(SubRef1) + ), + + %% wait for route sync on ClientA node + {{ok, _, _}, {ok, _}} = ?wait_async_action( + emqtt:subscribe(ClientA, <<"t/new/1">>, qos1), + #{?snk_kind := clink_route_sync_complete, ?snk_meta := #{node := NodeA1}}, + 10_000 + ), + + %% not expected to be received anymore + {ok, _} = emqtt:publish(ClientB, <<"t/test/1/1">>, <<"not-expected-hello-from-b">>, qos1), + {ok, _} = emqtt:publish(ClientB, <<"t/new/1">>, <<"hello-from-b-1">>, qos1), + ?assertReceive( + {publish, #{topic := <<"t/new/1">>, payload := <<"hello-from-b-1">>, client_pid := ClientA}}, + 7000 + ), + ?assertNotReceive({publish, _Message = #{}}), + + %% disable link + LinkConfA2 = LinkConfA1#{<<"enable">> => false}, + ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA2]])), + %% must be already blocked by the receiving cluster even if externak routing state is not + %% updated yet + {ok, _} = emqtt:publish(ClientB, <<"t/new/1">>, <<"not-expected-hello-from-b-1">>, qos1), + + LinkConfB1 = LinkConfB#{<<"enable">> => false}, + ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[LinkConfB1]])), + {ok, _} = emqtt:publish(ClientA, <<"t/test-topic">>, <<"not-expected-hello-from-a">>, qos1), + + ?assertNotReceive({publish, 
_Message = #{}}, 3000), + + %% delete links + ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[]])), + ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[]])), + + ok = emqtt:stop(ClientA), + ok = emqtt:stop(ClientB). + +t_config_validations('init', Config) -> + NameA = fmt("~s_~s", [?FUNCTION_NAME, "a"]), + NameB = fmt("~s_~s", [?FUNCTION_NAME, "b"]), + LPortA = 31883, + LPortB = 41883, + ConfA = combine([conf_cluster(NameA), conf_log()]), + ConfB = combine([conf_cluster(NameB), conf_log()]), + %% Single node clusters are enough for a basic validation test + {[NodeA, _, _], [NodeB, _]} = mk_clusters(NameA, NameB, LPortA, LPortB, ConfA, ConfB, Config), + ClusterA = emqx_cth_cluster:start([NodeA]), + ClusterB = emqx_cth_cluster:start([NodeB]), + ok = snabbkaffe:start_trace(), + [ + {cluster_a, ClusterA}, + {cluster_b, ClusterB}, + {lport_a, LPortA}, + {lport_b, LPortB}, + {name_a, NameA}, + {name_b, NameB} + | Config + ]; +t_config_validations('end', Config) -> + ok = snabbkaffe:stop(), + ok = emqx_cth_cluster:stop(?config(cluster_a, Config)), + ok = emqx_cth_cluster:stop(?config(cluster_b, Config)). 
+
+%% Checks that invalid link configurations are rejected by
+%% `emqx_cluster_link_config:update/1` with the expected error reason, and
+%% that a series of valid updates is accepted.
+t_config_validations(Config) ->
+    [NodeA] = ?config(cluster_a, Config),
+    PortB = ?config(lport_b, Config),
+    RemoteName = ?config(name_b, Config),
+    %% All updates in this test are applied on the single node of cluster A.
+    UpdateLinks = fun(Links) ->
+        erpc:call(NodeA, emqx_cluster_link_config, update, [Links])
+    end,
+
+    BaseLink = #{
+        <<"enable">> => true,
+        <<"pool_size">> => 1,
+        <<"server">> => <<"localhost:", (integer_to_binary(PortB))/binary>>,
+        <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>],
+        <<"upstream">> => RemoteName
+    },
+
+    %% Two links with the same upstream name are rejected.
+    ?assertMatch(
+        {error, #{reason := #{reason := duplicated_cluster_links, names := _}}},
+        UpdateLinks([BaseLink, BaseLink#{<<"enable">> => false, <<"pool_size">> => 2}])
+    ),
+
+    %% Each of these topic lists must be reported as invalid.
+    lists:foreach(
+        fun(BadTopics) ->
+            ?assertMatch(
+                {error, #{reason := #{reason := invalid_topics, topics := _}}},
+                UpdateLinks([BaseLink#{<<"topics">> => BadTopics}])
+            )
+        end,
+        [
+            [<<"t/test/#">>, <<"$LINK/cluster/test/#">>],
+            [<<"t/+/#/+">>, <<>>]
+        ]
+    ),
+
+    %% Dropping any of these fields must yield a `required_field` error.
+    lists:foreach(
+        fun(Field) ->
+            ?assertMatch(
+                {error, #{reason := required_field}},
+                UpdateLinks([maps:remove(Field, BaseLink)])
+            )
+        end,
+        [<<"upstream">>, <<"server">>, <<"topics">>]
+    ),
+
+    %% Some valid changes to cover different update scenarios (msg resource changed, actor changed, both changed)
+    ?assertMatch({ok, _}, UpdateLinks([BaseLink])),
+    UnknownLink = BaseLink#{
+        <<"upstream">> => <<"no-cluster">>, <<"server">> => <<"no-cluster.emqx:31883">>
+    },
+    ?assertMatch({ok, _}, UpdateLinks([BaseLink#{<<"pool_size">> => 5}, UnknownLink])),
+    ?assertMatch({ok, _}, UpdateLinks([BaseLink, UnknownLink#{<<"topics">> => []}])),
+    Credentials = #{<<"clientid">> => <<"new-client">>, <<"username">> => <<"user">>},
+    ?assertMatch(
+        {ok, _},
+        UpdateLinks([maps:merge(BaseLink, Credentials), maps:merge(UnknownLink, Credentials)])
+    ).
+
+%% Per-testcase setup ('init') and teardown ('end') for t_config_update_ds/1.
+%% Same as the non-ds setup, except that both cluster configs also include
+%% the `conf_ds()` snippet.
+t_config_update_ds('init', Config) ->
+    [NameA, NameB] = [fmt("~s_~s", [?FUNCTION_NAME, Suffix]) || Suffix <- ["a", "b"]],
+    {PortA, PortB} = {31883, 41883},
+    [ConfA, ConfB] = [
+        combine([conf_cluster(Name), conf_log(), conf_ds()])
+     || Name <- [NameA, NameB]
+    ],
+    {SpecsA, SpecsB} = mk_clusters(NameA, NameB, PortA, PortB, ConfA, ConfB, Config),
+    StartedA = emqx_cth_cluster:start(SpecsA),
+    StartedB = emqx_cth_cluster:start(SpecsB),
+    ok = snabbkaffe:start_trace(),
+    TestProps = [
+        {cluster_a, StartedA},
+        {cluster_b, StartedB},
+        {lport_a, PortA},
+        {lport_b, PortB},
+        {name_a, NameA},
+        {name_b, NameB}
+    ],
+    TestProps ++ Config;
+t_config_update_ds('end', Config) ->
+    ok = snabbkaffe:stop(),
+    %% Stop cluster A first, then cluster B.
+    lists:foreach(
+        fun(ClusterKey) -> ok = emqx_cth_cluster:stop(?config(ClusterKey, Config)) end,
+        [cluster_a, cluster_b]
+    ).
+
+%% Same scenario as the link add/update part of t_config_update/1, but with
+%% durable sessions enabled (see the 'init' clause) and clients connecting
+%% with `clean_start = false`. Additionally verifies the `ps_actor_incarnation`
+%% value stored in the link config: it is 0 on both clusters after the links
+%% are added, becomes 1 on cluster A after A's link is updated, and stays 0 on
+%% cluster B, whose link is never updated.
+t_config_update_ds(Config) ->
+    [NodeA1, _, _] = ?config(cluster_a, Config),
+    [NodeB1, _] = ?config(cluster_b, Config),
+    LPortA = ?config(lport_a, Config),
+    LPortB = ?config(lport_b, Config),
+    NameA = ?config(name_a, Config),
+    NameB = ?config(name_b, Config),
+
+    ClientA = start_client("t_config_a", NodeA1, false),
+    ClientB = start_client("t_config_b", NodeB1, false),
+    {ok, _, _} = emqtt:subscribe(ClientA, <<"t/test/1/+">>, qos1),
+    {ok, _, _} = emqtt:subscribe(ClientB, <<"t/test-topic">>, qos1),
+
+    LinkConfA = #{
+        <<"enable">> => true,
+        <<"pool_size">> => 1,
+        <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>,
+        <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>],
+        <<"upstream">> => NameB
+    },
+    LinkConfB = #{
+        <<"enable">> => true,
+        <<"pool_size">> => 1,
+        <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>,
+        <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>],
+        <<"upstream">> => NameA
+    },
+
+    %% Subscribe to the trace events before applying the config, so that no
+    %% bootstrap event emitted by the update can be missed.
+    {ok, SubRef} = snabbkaffe:subscribe(
+        ?match_event(#{?snk_kind := clink_route_bootstrap_complete}),
+        %% 5 nodes = 9 actors (durable storage is enabled,
+        %% 1 replicant node is not doing ds bootstrap)
+        9,
+        30_000
+    ),
+    ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA]])),
+    ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[LinkConfB]])),
+
+    ?assertMatch(
+        [#{ps_actor_incarnation := 0}], erpc:call(NodeA1, emqx, get_config, [[cluster, links]])
+    ),
+    ?assertMatch(
+        [#{ps_actor_incarnation := 0}], erpc:call(NodeB1, emqx, get_config, [[cluster, links]])
+    ),
+
+    {ok, Events} = snabbkaffe:receive_events(SubRef),
+    ?assertEqual(9, length(Events)),
+
+    {ok, _} = emqtt:publish(ClientA, <<"t/test-topic">>, <<"hello-from-a">>, qos1),
+    {ok, _} = emqtt:publish(ClientB, <<"t/test/1/1">>, <<"hello-from-b">>, qos1),
+
+    ?assertReceive(
+        {publish, #{
+            topic := <<"t/test-topic">>, payload := <<"hello-from-a">>, client_pid := ClientB
+        }},
+        30_000
+    ),
+    ?assertReceive(
+        {publish, #{
+            topic := <<"t/test/1/1">>, payload := <<"hello-from-b">>, client_pid := ClientA
+        }},
+        30_000
+    ),
+    %% no more messages expected
+    ?assertNotReceive({publish, _Message = #{}}),
+    {ok, SubRef1} = snabbkaffe:subscribe(
+        ?match_event(#{?snk_kind := clink_route_bootstrap_complete}),
+        %% 3 nodes (1 replicant) in cluster a (5 actors including ds)
+        5,
+        30_000
+    ),
+
+    %% update link
+
+    LinkConfA1 = LinkConfA#{<<"pool_size">> => 2, <<"topics">> => [<<"t/new/+">>]},
+    ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA1]])),
+
+    {ok, Events1} = snabbkaffe:receive_events(SubRef1),
+    ?assertEqual(5, length(Events1)),
+
+    %% wait for route sync on ClientA node
+    %% (the awaited event is the one of the `ps-routes-v1` actor with incarnation 1)
+    {{ok, _, _}, {ok, _}} = ?wait_async_action(
+        emqtt:subscribe(ClientA, <<"t/new/1">>, qos1),
+        #{
+            ?snk_kind := clink_route_sync_complete,
+            ?snk_meta := #{node := NodeA1},
+            actor := {<<"ps-routes-v1">>, 1}
+        },
+        10_000
+    ),
+    %% not expected to be received anymore
+    {ok, _} = emqtt:publish(ClientB, <<"t/test/1/1">>, <<"not-expected-hello-from-b">>, qos1),
+    {ok, _} = emqtt:publish(ClientB, <<"t/new/1">>, <<"hello-from-b-1">>, qos1),
+    ?assertReceive(
+        {publish, #{topic := <<"t/new/1">>, payload := <<"hello-from-b-1">>, client_pid := ClientA}},
+        30_000
+    ),
+    ?assertNotReceive({publish, _Message = #{}}),
+
+    %% The updated link on cluster A got a new incarnation...
+    ?assertMatch(
+        [#{ps_actor_incarnation := 1}], erpc:call(NodeA1, emqx, get_config, [[cluster, links]])
+    ),
+    %% ...while the link on cluster B was never updated and must keep its
+    %% initial incarnation (this used to be a verbatim repetition of the
+    %% NodeA1 assertion above, which verified nothing new).
+    ?assertMatch(
+        [#{ps_actor_incarnation := 0}], erpc:call(NodeB1, emqx, get_config, [[cluster, links]])
+    ),
+
+    ok = emqtt:stop(ClientA),
+    ok = emqtt:stop(ClientB).
+
+%% Per-testcase setup ('init') and teardown ('end') for t_misconfigured_links/1.
+%% 'init' starts both clusters and the snabbkaffe trace, and extends the CT
+%% config with the started clusters, listener ports and cluster names.
+t_misconfigured_links('init', Config) ->
+    NameA = fmt("~s_~s", [?FUNCTION_NAME, "a"]),
+    NameB = fmt("~s_~s", [?FUNCTION_NAME, "b"]),
+    LPortA = 31883,
+    LPortB = 41883,
+    ConfA = combine([conf_cluster(NameA), conf_log()]),
+    ConfB = combine([conf_cluster(NameB), conf_log()]),
+    {NodesA, NodesB} = mk_clusters(NameA, NameB, LPortA, LPortB, ConfA, ConfB, Config),
+    ClusterA = emqx_cth_cluster:start(NodesA),
+    ClusterB = emqx_cth_cluster:start(NodesB),
+    ok = snabbkaffe:start_trace(),
+    [
+        {cluster_a, ClusterA},
+        {cluster_b, ClusterB},
+        {lport_a, LPortA},
+        {lport_b, LPortB},
+        {name_a, NameA},
+        {name_b, NameB}
+        | Config
+    ];
+t_misconfigured_links('end', Config) ->
+    ok = snabbkaffe:stop(),
+    ok = emqx_cth_cluster:stop(?config(cluster_a, Config)),
+    ok = emqx_cth_cluster:stop(?config(cluster_b, Config)).
+
+%% Verifies the handshake errors reported on cluster A when the links are
+%% misconfigured, one scenario after another:
+%%   1. A uses a wrong upstream name -> `bad_remote_cluster_link_name`;
+%%   2. A uses the right name        -> link connects, old status is gone;
+%%   3. B disables its link to A     -> `cluster_link_disabled`;
+%%   4. B has no link named after A  -> `unknown_cluster`.
+%% Each error is checked both as a trace event and via the router syncer status.
+t_misconfigured_links(Config) ->
+    [NodeA1, _, _] = ?config(cluster_a, Config),
+    [NodeB1, _] = ?config(cluster_b, Config),
+    LPortA = ?config(lport_a, Config),
+    LPortB = ?config(lport_b, Config),
+    NameA = ?config(name_a, Config),
+    NameB = ?config(name_b, Config),
+
+    ClientA = start_client("t_config_a", NodeA1),
+    ClientB = start_client("t_config_b", NodeB1),
+
+    {ok, _, _} = emqtt:subscribe(ClientA, <<"t/test/1/+">>, qos1),
+    {ok, _, _} = emqtt:subscribe(ClientB, <<"t/test-topic">>, qos1),
+
+    %% A's link points at B's listener port but uses an upstream name other than NameB.
+    LinkConfA = #{
+        <<"enable">> => true,
+        <<"pool_size">> => 1,
+        <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>,
+        <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>],
+        <<"upstream">> => <<"bad-b-name">>
+    },
+    LinkConfB = #{
+        <<"enable">> => true,
+        <<"pool_size">> => 1,
+        <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>,
+        <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>],
+        <<"upstream">> => NameA
+    },
+
+    ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[LinkConfB]])),
+
+    {{ok, _}, {ok, _}} = ?wait_async_action(
+        erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA]]),
+        #{
+            ?snk_kind := clink_handshake_error,
+            reason := <<"bad_remote_cluster_link_name">>,
+            ?snk_meta := #{node := NodeA1}
+        },
+        10_000
+    ),
+    %% NOTE(review): presumably gives the syncer time to store the error in its
+    %% status after the trace event is emitted -- confirm; a fixed sleep may be flaky.
+    timer:sleep(10),
+    ?assertMatch(
+        #{error := <<"bad_remote_cluster_link_name">>},
+        erpc:call(NodeA1, emqx_cluster_link_router_syncer, status, [<<"bad-b-name">>])
+    ),
+
+    %% Replace the misnamed link with a correctly named one.
+    {{ok, _}, {ok, _}} = ?wait_async_action(
+        erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]),
+        #{
+            ?snk_kind := clink_route_bootstrap_complete,
+            ?snk_meta := #{node := NodeA1}
+        },
+        10_000
+    ),
+    ?assertMatch(
+        #{status := connected, error := undefined},
+        erpc:call(NodeA1, emqx_cluster_link_router_syncer, status, [NameB])
+    ),
+    ?assertEqual(
+        undefined, erpc:call(NodeA1, emqx_cluster_link_router_syncer, status, [<<"bad-b-name">>])
+    ),
+
+    ?assertMatch(
+        {ok, _},
+        erpc:call(
+            NodeB1,
+            emqx_cluster_link_config,
+            update,
+            [
+                [
+                    LinkConfB#{<<"enable">> => false},
+                    %% An extra dummy link to keep B hook/external_broker registered and be able to
+                    %% respond with "link disabled error" for the first disabled link
+                    LinkConfB#{<<"upstream">> => <<"bad-a-name">>}
+                ]
+            ]
+        )
+    ),
+
+    %% Remove A's link and add it again, so that a new handshake with B is attempted.
+    ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[]])),
+    {{ok, _}, {ok, _}} = ?wait_async_action(
+        erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]),
+        #{
+            ?snk_kind := clink_handshake_error,
+            reason := <<"cluster_link_disabled">>,
+            ?snk_meta := #{node := NodeA1}
+        },
+        10_000
+    ),
+    timer:sleep(10),
+    ?assertMatch(
+        #{error := <<"cluster_link_disabled">>},
+        erpc:call(NodeA1, emqx_cluster_link_router_syncer, status, [NameB])
+    ),
+
+    %% Now B only has a link with an upstream name other than NameA.
+    ?assertMatch(
+        {ok, _},
+        erpc:call(NodeB1, emqx_cluster_link_config, update, [
+            [LinkConfB#{<<"upstream">> => <<"bad-a-name">>}]
+        ])
+    ),
+    ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[]])),
+
+    {{ok, _}, {ok, _}} = ?wait_async_action(
+        erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]),
+        #{
+            ?snk_kind := clink_handshake_error,
+            reason := <<"unknown_cluster">>,
+            ?snk_meta := #{node := NodeA1}
+        },
+        10_000
+    ),
+    timer:sleep(10),
+    ?assertMatch(
+        #{error := <<"unknown_cluster">>},
+        erpc:call(NodeA1, emqx_cluster_link_router_syncer, status, [NameB])
+    ),
+
+    ok = emqtt:stop(ClientA),
+    ok = emqtt:stop(ClientB).
+
+%% Start and connect an MQTT v5 client to Node with `clean_start = true`.
+start_client(ClientId, Node) ->
+    start_client(ClientId, Node, true).
+
+%% Start and connect an MQTT v5 client to the default TCP listener of Node.
+%% When CleanStart is `false`, the client additionally requests a
+%% 'Session-Expiry-Interval' of 300. Returns the client pid; crashes if the
+%% client cannot be started or connected.
+start_client(ClientId, Node, CleanStart) ->
+    Port = tcp_port(Node),
+    {ok, Client} = emqtt:start_link(
+        [
+            {proto_ver, v5},
+            {clientid, ClientId},
+            {port, Port},
+            {clean_start, CleanStart}
+            %% The comprehension yields the properties option only for CleanStart =:= false.
+            | [{properties, #{'Session-Expiry-Interval' => 300}} || CleanStart =:= false]
+        ]
+    ),
+    {ok, _} = emqtt:connect(Client),
+    Client.
+
+%% Port of the `listeners.tcp.default` bind address configured on Node.
+tcp_port(Node) ->
+    {_Host, Port} = erpc:call(Node, emqx_config, get, [[listeners, tcp, default, bind]]),
+    Port.
+
+%% Merge a non-empty list of config snippets, left to right, into one config.
+combine([Entry | Rest]) ->
+    lists:foldl(fun emqx_cth_suite:merge_config/2, Entry, Rest).
+
+%% Config snippet defining the `clink` TCP listener bound to LPort;
+%% an empty snippet if LPort is not an integer.
+conf_mqtt_listener(LPort) when is_integer(LPort) ->
+    fmt("listeners.tcp.clink { bind = ~p }", [LPort]);
+conf_mqtt_listener(_) ->
+    "".
+
+%% Config snippet setting the cluster name.
+conf_cluster(ClusterName) ->
+    fmt("cluster.name = ~s", [ClusterName]).
+
+%% Config snippet enabling debug-level file logging to `node.log`.
+conf_log() ->
+    "log.file { enable = true, level = debug, path = node.log, supervisor_reports = progress }".
+
+%% Config snippet enabling durable sessions.
+conf_ds() ->
+    "durable_sessions.enable = true".
+
+%% Thin wrapper around emqx_utils:format/2.
+fmt(Fmt, Args) ->
+    emqx_utils:format(Fmt, Args).
+
+%% Node name atom `emqx_clink_<BaseName>_<Idx>`.
+mk_nodename(BaseName, Idx) ->
+    binary_to_atom(fmt("emqx_clink_~s_~b", [BaseName, Idx])).
diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl index bb281ce4c..5bd63862a 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_extrouter_SUITE.erl @@ -123,7 +123,8 @@ t_actor_gc(_Config) -> [<<"global/#">>, <<"topic/#">>, <<"topic/42/+">>], topics_sorted() ), - _AS13 = apply_operation(heartbeat, AS12, 50_000), + _AS13 = apply_operation(heartbeat, AS12, 60_000), + ?assertEqual( 1, emqx_cluster_link_extrouter:actor_gc(env(60_000)) From a905a6048c7095922cd51da64ba195896e33ef38 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 14 Jun 2024 19:48:40 +0300 Subject: [PATCH 46/46] chore(clusterlink): rename link `upstream` field to `name` --- .../src/emqx_cluster_link.erl | 2 +- .../src/emqx_cluster_link_api.erl | 4 +-- .../src/emqx_cluster_link_app.erl | 2 +- .../src/emqx_cluster_link_config.erl | 16 +++++------ .../src/emqx_cluster_link_mqtt.erl | 11 ++------ .../src/emqx_cluster_link_router_syncer.erl | 8 +++--- .../src/emqx_cluster_link_schema.erl | 6 ++-- .../src/emqx_cluster_link_sup.erl | 4 +-- .../test/emqx_cluster_link_SUITE.erl | 4 +-- .../test/emqx_cluster_link_config_SUITE.erl | 28 +++++++++---------- rel/i18n/emqx_cluster_link_schema.hocon | 6 ++-- 11 files changed, 42 insertions(+), 49 deletions(-) diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link.erl b/apps/emqx_cluster_link/src/emqx_cluster_link.erl index d91b33c3a..76228c052 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link.erl @@ -144,7 +144,7 @@ maybe_push_route_op(Op, Topic, RouteID) -> maybe_push_route_op(Op, Topic, RouteID, PushFun) -> lists:foreach( - fun(#{upstream := Cluster, topics := LinkFilters}) -> + fun(#{name := Cluster, topics := LinkFilters}) -> case topic_intersect_any(Topic, LinkFilters) of false -> ok; diff --git 
a/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl index c74d2d3f7..33634607e 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_api.erl @@ -99,7 +99,7 @@ links_config_example() -> <<"t/topic-example">>, <<"t/topic-filter-example/1/#">> ], - <<"upstream">> => <<"emqxcl_b">> + <<"name">> => <<"emqxcl_b">> }, #{ <<"enable">> => true, @@ -111,6 +111,6 @@ links_config_example() -> <<"t/topic-example">>, <<"t/topic-filter-example/1/#">> ], - <<"upstream">> => <<"emqxcl_c">> + <<"name">> => <<"emqxcl_c">> } ]. diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl index ddf3028a2..41f1a0a77 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_app.erl @@ -48,7 +48,7 @@ start_msg_fwd_resources(LinksConf) -> remove_msg_fwd_resources(LinksConf) -> lists:foreach( - fun(#{upstream := Name}) -> + fun(#{name := Name}) -> emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name) end, LinksConf diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl index 2b5dea2e8..f27c7702e 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_config.erl @@ -193,7 +193,7 @@ add_link(_DisabledLinkConf) -> ok. remove_links(LinksConf) -> - [remove_link(Name) || #{upstream := Name} <- LinksConf]. + [remove_link(Name) || #{name := Name} <- LinksConf]. remove_link(Name) -> _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), @@ -202,7 +202,7 @@ remove_link(Name) -> update_links(LinksConf) -> [update_link(Link) || Link <- LinksConf]. 
-update_link({OldLinkConf, #{enable := true, upstream := Name} = NewLinkConf}) -> +update_link({OldLinkConf, #{enable := true, name := Name} = NewLinkConf}) -> case what_is_changed(OldLinkConf, NewLinkConf) of both -> _ = ensure_actor_stopped(Name), @@ -215,7 +215,7 @@ update_link({OldLinkConf, #{enable := true, upstream := Name} = NewLinkConf}) -> msg_resource -> ok = update_msg_fwd_resource(OldLinkConf, NewLinkConf) end; -update_link({_OldLinkConf, #{enable := false, upstream := Name} = _NewLinkConf}) -> +update_link({_OldLinkConf, #{enable := false, name := Name} = _NewLinkConf}) -> _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), ensure_actor_stopped(Name). @@ -240,7 +240,7 @@ what_is_changed(OldLink, NewLink) -> are_fields_changed(Fields, OldLink, NewLink) -> maps:with(Fields, OldLink) =/= maps:with(Fields, NewLink). -update_msg_fwd_resource(_, #{upstream := Name} = NewConf) -> +update_msg_fwd_resource(_, #{name := Name} = NewConf) -> _ = emqx_cluster_link_mqtt:remove_msg_fwd_resource(Name), {ok, _} = emqx_cluster_link_mqtt:ensure_msg_fwd_resource(NewConf), ok. @@ -248,8 +248,8 @@ update_msg_fwd_resource(_, #{upstream := Name} = NewConf) -> ensure_actor_stopped(ClusterName) -> emqx_cluster_link_sup:ensure_actor_stopped(ClusterName). -upstream_name(#{upstream := N}) -> N; -upstream_name(#{<<"upstream">> := N}) -> N. +upstream_name(#{name := N}) -> N; +upstream_name(#{<<"name">> := N}) -> N. maybe_increment_ps_actor_incr(New, Old) -> case emqx_persistent_message:is_persistence_enabled() of @@ -284,9 +284,9 @@ increment_ps_actor_incr(#{ps_actor_incarnation := Incr} = Conf) -> increment_ps_actor_incr(#{<<"ps_actor_incarnation">> := Incr} = Conf) -> Conf#{<<"ps_actor_incarnation">> => Incr + 1}; %% Default value set in schema is 0, so need to set it to 1 during the first update. 
-increment_ps_actor_incr(#{<<"upstream">> := _} = Conf) -> +increment_ps_actor_incr(#{<<"name">> := _} = Conf) -> Conf#{<<"ps_actor_incarnation">> => 1}; -increment_ps_actor_incr(#{upstream := _} = Conf) -> +increment_ps_actor_incr(#{name := _} = Conf) -> Conf#{ps_actor_incarnation => 1}. convert_certs(LinksConf) -> diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl index 7a8bf1dff..5185803b6 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_mqtt.erl @@ -80,16 +80,9 @@ -define(PUB_TIMEOUT, 10_000). --spec ensure_msg_fwd_resource(binary() | map()) -> +-spec ensure_msg_fwd_resource(map()) -> {ok, emqx_resource:resource_data() | already_started} | {error, Reason :: term()}. -ensure_msg_fwd_resource(ClusterName) when is_binary(ClusterName) -> - case emqx_cluster_link_config:link(ClusterName) of - #{} = Conf -> - ensure_msg_fwd_resource(Conf); - undefined -> - {error, link_config_not_found} - end; -ensure_msg_fwd_resource(#{upstream := Name, resource_opts := ResOpts} = ClusterConf) -> +ensure_msg_fwd_resource(#{name := Name, resource_opts := ResOpts} = ClusterConf) -> ResOpts1 = ResOpts#{ query_mode => async, start_after_created => true diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl index b7d165419..6808da0bd 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_router_syncer.erl @@ -144,7 +144,7 @@ status(Cluster) -> %% 1. Actor + MQTT Client %% 2. Syncer -start_link(#{upstream := TargetCluster} = LinkConf) -> +start_link(#{name := TargetCluster} = LinkConf) -> supervisor:start_link(?REF(TargetCluster), ?MODULE, {sup, LinkConf}). %% Actor @@ -290,7 +290,7 @@ init({sup, LinkConf}) -> init({actor, State}) -> init_actor(State). 
-child_spec(actor, #{upstream := TargetCluster} = LinkConf) -> +child_spec(actor, #{name := TargetCluster} = LinkConf) -> %% Actor process. %% Wraps MQTT Client process. %% ClientID: `mycluster:emqx1@emqx.local:routesync` @@ -299,7 +299,7 @@ child_spec(actor, #{upstream := TargetCluster} = LinkConf) -> Actor = get_actor_id(), Incarnation = new_incarnation(), actor_spec(actor, ?ACTOR_REF(TargetCluster), Actor, Incarnation, LinkConf); -child_spec(ps_actor, #{upstream := TargetCluster, ps_actor_incarnation := Incr} = LinkConf) -> +child_spec(ps_actor, #{name := TargetCluster, ps_actor_incarnation := Incr} = LinkConf) -> actor_spec(ps_actor, ?PS_ACTOR_REF(TargetCluster), ?PS_ACTOR, Incr, LinkConf). child_spec(syncer, ?PS_ACTOR, Incarnation, TargetCluster) -> @@ -331,7 +331,7 @@ syncer_spec(ChildID, Actor, Incarnation, SyncerRef, ClientName) -> type => worker }. -mk_state(#{upstream := TargetCluster} = LinkConf, Actor, Incarnation) -> +mk_state(#{name := TargetCluster} = LinkConf, Actor, Incarnation) -> #st{ target = TargetCluster, actor = Actor, diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl index b6d0fbcda..f46249a4f 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_schema.erl @@ -40,7 +40,7 @@ links_schema(Meta) -> fields("link") -> [ {enable, ?HOCON(boolean(), #{default => true, desc => ?DESC(enable)})}, - {upstream, ?HOCON(binary(), #{required => true, desc => ?DESC(upstream)})}, + {name, ?HOCON(binary(), #{required => true, desc => ?DESC(link_name)})}, {server, emqx_schema:servers_sc(#{required => true, desc => ?DESC(server)}, ?MQTT_HOST_OPTS)}, {clientid, ?HOCON(binary(), #{desc => ?DESC(clientid)})}, @@ -121,8 +121,8 @@ links_validator(Links) -> ), check_errors(Dups, duplicated_cluster_links, names). -link_name(#{upstream := Name}) -> Name; -link_name(#{<<"upstream">> := Name}) -> Name. 
+link_name(#{name := Name}) -> Name; +link_name(#{<<"name">> := Name}) -> Name. topics_validator(Topics) -> Errors = lists:foldl( diff --git a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl index 0991583e2..2025510fc 100644 --- a/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl +++ b/apps/emqx_cluster_link/src/emqx_cluster_link_sup.erl @@ -30,7 +30,7 @@ init(LinksConf) -> ExtrouterGC = extrouter_gc_spec(), RouteActors = [ sup_spec(Name, ?ACTOR_MODULE, [LinkConf]) - || #{upstream := Name} = LinkConf <- LinksConf + || #{name := Name} = LinkConf <- LinksConf ], {ok, {SupFlags, [ExtrouterGC | RouteActors]}}. @@ -53,7 +53,7 @@ sup_spec(Id, Mod, Args) -> modules => [Mod] }. -ensure_actor(#{upstream := Name} = LinkConf) -> +ensure_actor(#{name := Name} = LinkConf) -> case supervisor:start_child(?SERVER, sup_spec(Name, ?ACTOR_MODULE, [LinkConf])) of {ok, Pid} -> {ok, Pid}; diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl index e38cd3999..e023aacab 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl +++ b/apps/emqx_cluster_link/test/emqx_cluster_link_SUITE.erl @@ -55,7 +55,7 @@ mk_source_cluster(BaseName, Config) -> "\n name = cl.source" "\n links = [" "\n { enable = true" - "\n upstream = cl.target" + "\n name = cl.target" "\n server = \"localhost:31883\"" "\n clientid = client.source" "\n topics = []" @@ -77,7 +77,7 @@ mk_target_cluster(BaseName, Config) -> "\n name = cl.target" "\n links = [" "\n { enable = true" - "\n upstream = cl.source" + "\n name = cl.source" "\n server = \"localhost:41883\"" "\n clientid = client.target" "\n topics = [\"#\"]" diff --git a/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl b/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl index e09f12ce4..97e62402c 100644 --- a/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl +++ 
b/apps/emqx_cluster_link/test/emqx_cluster_link_config_SUITE.erl @@ -101,14 +101,14 @@ t_config_update(Config) -> <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameB + <<"name">> => NameB }, LinkConfB = #{ <<"enable">> => true, <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameA + <<"name">> => NameA }, {ok, SubRef} = snabbkaffe:subscribe( @@ -242,7 +242,7 @@ t_config_validations(Config) -> <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameB + <<"name">> => NameB }, DuplicatedLinks = [LinkConfA, LinkConfA#{<<"enable">> => false, <<"pool_size">> => 2}], ?assertMatch( @@ -267,7 +267,7 @@ t_config_validations(Config) -> ?assertMatch( {error, #{reason := required_field}}, erpc:call(NodeA, emqx_cluster_link_config, update, [ - [maps:remove(<<"upstream">>, LinkConfA)] + [maps:remove(<<"name">>, LinkConfA)] ]) ), ?assertMatch( @@ -285,7 +285,7 @@ t_config_validations(Config) -> erpc:call(NodeA, emqx_cluster_link_config, update, [[LinkConfA]]) ), LinkConfUnknown = LinkConfA#{ - <<"upstream">> => <<"no-cluster">>, <<"server">> => <<"no-cluster.emqx:31883">> + <<"name">> => <<"no-cluster">>, <<"server">> => <<"no-cluster.emqx:31883">> }, ?assertMatch( {ok, _}, @@ -365,14 +365,14 @@ t_config_update_ds(Config) -> <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameB + <<"name">> => NameB }, LinkConfB = #{ <<"enable">> => true, <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameA + <<"name">> 
=> NameA }, {ok, SubRef} = snabbkaffe:subscribe( @@ -500,14 +500,14 @@ t_misconfigured_links(Config) -> <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortB))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => <<"bad-b-name">> + <<"name">> => <<"bad-b-name">> }, LinkConfB = #{ <<"enable">> => true, <<"pool_size">> => 1, <<"server">> => <<"localhost:", (integer_to_binary(LPortA))/binary>>, <<"topics">> => [<<"t/test-topic">>, <<"t/test/#">>], - <<"upstream">> => NameA + <<"name">> => NameA }, ?assertMatch({ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [[LinkConfB]])), @@ -528,7 +528,7 @@ t_misconfigured_links(Config) -> ), {{ok, _}, {ok, _}} = ?wait_async_action( - erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]), + erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"name">> => NameB}]]), #{ ?snk_kind := clink_route_bootstrap_complete, ?snk_meta := #{node := NodeA1} @@ -554,7 +554,7 @@ t_misconfigured_links(Config) -> LinkConfB#{<<"enable">> => false}, %% An extra dummy link to keep B hook/external_broker registered and be able to %% respond with "link disabled error" for the first disabled link - LinkConfB#{<<"upstream">> => <<"bad-a-name">>} + LinkConfB#{<<"name">> => <<"bad-a-name">>} ] ] ) @@ -562,7 +562,7 @@ t_misconfigured_links(Config) -> ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[]])), {{ok, _}, {ok, _}} = ?wait_async_action( - erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]), + erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"name">> => NameB}]]), #{ ?snk_kind := clink_handshake_error, reason := <<"cluster_link_disabled">>, @@ -579,13 +579,13 @@ t_misconfigured_links(Config) -> ?assertMatch( {ok, _}, erpc:call(NodeB1, emqx_cluster_link_config, update, [ - [LinkConfB#{<<"upstream">> => <<"bad-a-name">>}] + [LinkConfB#{<<"name">> => 
<<"bad-a-name">>}] ]) ), ?assertMatch({ok, _}, erpc:call(NodeA1, emqx_cluster_link_config, update, [[]])), {{ok, _}, {ok, _}} = ?wait_async_action( - erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"upstream">> => NameB}]]), + erpc:call(NodeA1, emqx_cluster_link_config, update, [[LinkConfA#{<<"name">> => NameB}]]), #{ ?snk_kind := clink_handshake_error, reason := <<"unknown_cluster">>, diff --git a/rel/i18n/emqx_cluster_link_schema.hocon b/rel/i18n/emqx_cluster_link_schema.hocon index 77e4987f7..efd402569 100644 --- a/rel/i18n/emqx_cluster_link_schema.hocon +++ b/rel/i18n/emqx_cluster_link_schema.hocon @@ -12,9 +12,9 @@ enable.desc: """Enable or disable a cluster link. The link is enabled by default, disabling it allows stopping the link without removing its configuration. The link must be enabled on both sides to be operational. Disabling the link should also be done on both clusters in order to free up all associated resources.""" enable.label: "Enable" -upstream.desc: -"""Upstream cluster name. Must be exactly equal to the value of `cluster.name` configured at the remote cluster. Must not be equal to the local cluster.name. All configured cluster link upstream names must be unique.""" -upstream.label: "Upstream Name" +link_name.desc: +"""Linked (remote) cluster name. Must be exactly equal to the value of `cluster.name` configured at the remote cluster. Must not be equal to the local cluster.name. All configured cluster link names must be unique.""" +link_name.label: "Linked Cluster Name" server.desc: """MQTT host and port of the remote EMQX broker."""